# Comparing Sources: MediaCloud Geographic Online News Collections vs. Local News Initiative

In [18]:
import pandas as pd
import requests
import mcmetadata

In [19]:
def download_file_from_google_drive(file_id, output_filename, timeout=60):
    """Download a file from Google Drive and save it to disk.

    The file is requested from ``https://drive.google.com/uc?id=<file_id>``
    and streamed to ``output_filename`` in 8 KiB chunks. A success or failure
    message is printed; nothing is returned.

    Parameters
    ----------
    file_id : str
        Google Drive file identifier, interpolated into the download URL.
    output_filename : str
        Path of the local file to write (opened in binary write mode).
    timeout : float or tuple, optional
        Timeout in seconds passed to ``requests.get`` so a stalled server
        cannot hang the notebook indefinitely. Defaults to 60.

    Raises
    ------
    requests.exceptions.RequestException
        Propagated from ``requests`` on connection errors or timeouts.
    """
    url = f"https://drive.google.com/uc?id={file_id}"

    # With stream=True the connection stays open until the body is consumed or
    # the response is closed; the context manager releases it on every path,
    # including the non-200 branch where the body is never read.
    with requests.get(url, stream=True, timeout=timeout) as response:
        if response.status_code == 200:
            with open(output_filename, "wb") as file:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:  # skip empty keep-alive chunks
                        file.write(chunk)
            print(f"File downloaded successfully as {output_filename}")
        else:
            print(f"Failed to download file. Status code: {response.status_code}")

In [20]:
# Download the MediaCloud sources JSON into the working directory.
mediacloud_sources_file = "mediacloud_sources.json"
download_file_from_google_drive(
    file_id="1j53R2Zwx8al1YSFJ2VahTBrGVQbwxcJz",
    output_filename=mediacloud_sources_file,
)

File downloaded successfully as mediacloud_sources.json


In [21]:
# Download the Local News Initiative sources JSON into the working directory.
lni_sources_file = "lni_sources.json"
download_file_from_google_drive(
    file_id="1iHHZ6H9er6MICozbC1pCuA-6gTE5SIIm",
    output_filename=lni_sources_file,
)

File downloaded successfully as lni_sources.json


In [22]:
# Load the downloaded Local News Initiative records into a DataFrame.
lni_df = pd.read_json(path_or_buf=lni_sources_file)

In [23]:
# Load the full MediaCloud directory, then keep only rows published in the USA.
mediacloud_df = pd.read_json(path_or_buf=mediacloud_sources_file)
is_us_source = mediacloud_df['pub_country'] == 'USA'
mediacloud_local_sources_df = mediacloud_df[is_us_source]

In [24]:
# Persist the USA-only MediaCloud rows as CSV, leaving out the DataFrame index.
mediacloud_local_sources_df.to_csv(
    path_or_buf='mediacloud_records.csv',
    index=False,
)

In [25]:
# Show the USA-only MediaCloud sources (the rendered table is truncated by pandas).
mediacloud_local_sources_df

Unnamed: 0,id,name,url_search_string,label,homepage,notes,platform,stories_per_week,first_story,created_at,modified_at,pub_country,pub_state,primary_language,media_type,collection_count
501,1,nytimes.com,,New York Times,http://nytimes.com,,online_news,1245.0,,2022-12-23 17:43:28.547804+00:00,2024-11-11 16:45:16.975609+00:00,USA,US-NY,,print_native,45
560,20410,livescience.com,,livescience.com,http://www.livescience.com,,online_news,55.0,,2022-12-23 17:43:28.547804+00:00,2024-11-11 16:48:19.629226+00:00,USA,US-NY,en,digital_native,8
588,375824,francecourtoise.info,,|,http://www.francecourtoise.info/,,online_news,,,2022-12-23 17:43:28.547804+00:00,2024-06-12 20:36:06.483103+00:00,USA,US-TX,fr,video_broadcast,6
2467,375824,francecourtoise.info,,|,http://www.francecourtoise.info/,,online_news,,,2022-12-23 17:43:28.547804+00:00,2024-06-12 20:36:06.483103+00:00,USA,US-TX,fr,video_broadcast,6
4756,375824,francecourtoise.info,,|,http://www.francecourtoise.info/,,online_news,,,2022-12-23 17:43:28.547804+00:00,2024-06-12 20:36:06.483103+00:00,USA,US-TX,fr,video_broadcast,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49555,96117,codyenterprise.com,,codyenterprise.com,http://www.codyenterprise.com/#spider,,online_news,,,2022-12-23 17:43:28.547804+00:00,2024-06-12 20:34:58.955284+00:00,USA,US-WY,en,print_native,3
49556,99235,powelltribune.com,,powelltribune.com,http://www.powelltribune.com/#spider,,online_news,99.0,,2022-12-23 17:43:28.547804+00:00,2024-11-11 16:54:44.569688+00:00,USA,US-WY,en,print_native,3
49557,133329,bighornmountainradio.com,,bighornmountainradio.com,http://bighornmountainradio.com/,,online_news,11.0,,2022-12-23 17:43:28.547804+00:00,2024-11-11 16:47:44.499507+00:00,USA,US-WY,en,video_broadcast,3
49558,143940,planetjh.com,,planetjh.com,http://planetjh.com/,,online_news,,,2022-12-23 17:43:28.547804+00:00,2024-06-12 20:35:01.717035+00:00,USA,US-WY,en,print_native,3


### Find sources scraped from `The Local News Initiative` that are not present in the MediaCloud directory, matching on source name

In [26]:
# Every label in the MediaCloud directory (all countries, not only the USA subset).
mediacloud_source_names = set(mediacloud_df['label'])

# An LNI row is kept when its media_name exactly matches no MediaCloud label.
name_in_mediacloud = lni_df['media_name'].isin(mediacloud_source_names)
mask = ~name_in_mediacloud
filtered_df = lni_df.loc[mask].copy()
filtered_df

Unnamed: 0,id,state,county,media_name,media_type,year_loaded,fips,url
0,1,VA,Washington,ADJ Bristol Herald Courier,Newspaper,2024,51191,heraldcourier.com
1,2,VA,Augusta,ADJ The News Leader,Newspaper,2024,51015,www.newsleader.com
2,3,VA,Augusta,ADJ Augusta Free Press,Newspaper,2024,51015,augustafreepress.com/news/sheriff-20k-in-coins...
3,4,NC,Madison,News-Record and Sentinel,Newspaper,2024,37115,www.citizen-times.com/local/news-record-and-se...
4,5,WI,Wood,Marshfield News-Herald,Newspaper,2024,55141,www.marshfieldnewsherald.com
...,...,...,...,...,...,...,...,...
7977,7978,AK,Anchorage,KSKA 91.1,Public Broadcasting,2024,2020,radiostationusa.fm/online/kska
7978,7979,AK,Bethel,KYUK 90.3,Public Broadcasting,2024,2050,www.kyuk.org
7979,7980,PA,Allegheny,WESA 90.5,Public Broadcasting,2024,42003,www.wesa.fm
7980,7981,PA,Northampton,WLVR 91.3,Public Broadcasting,2024,42095,www.wlvr.org


In [27]:
# Show the full Local News Initiative table for comparison with the filtered result above.
lni_df

Unnamed: 0,id,state,county,media_name,media_type,year_loaded,fips,url
0,1,VA,Washington,ADJ Bristol Herald Courier,Newspaper,2024,51191,heraldcourier.com
1,2,VA,Augusta,ADJ The News Leader,Newspaper,2024,51015,www.newsleader.com
2,3,VA,Augusta,ADJ Augusta Free Press,Newspaper,2024,51015,augustafreepress.com/news/sheriff-20k-in-coins...
3,4,NC,Madison,News-Record and Sentinel,Newspaper,2024,37115,www.citizen-times.com/local/news-record-and-se...
4,5,WI,Wood,Marshfield News-Herald,Newspaper,2024,55141,www.marshfieldnewsherald.com
...,...,...,...,...,...,...,...,...
7977,7978,AK,Anchorage,KSKA 91.1,Public Broadcasting,2024,2020,radiostationusa.fm/online/kska
7978,7979,AK,Bethel,KYUK 90.3,Public Broadcasting,2024,2050,www.kyuk.org
7979,7980,PA,Allegheny,WESA 90.5,Public Broadcasting,2024,42003,www.wesa.fm
7980,7981,PA,Northampton,WLVR 91.3,Public Broadcasting,2024,42095,www.wlvr.org


### Find sources scraped from `The Local News Initiative` that are not present in the MediaCloud directory, matching on canonical domain

In [56]:
# Reduce each LNI url to its canonical domain with mcmetadata and store it as a new column.
lni_canonical_domains = lni_df["url"].apply(mcmetadata.urls.canonical_domain)
lni_df["canonical_domain"] = lni_canonical_domains

In [57]:
# MediaCloud's `name` column is compared directly against LNI canonical domains.
mediacloud_urls = set(mediacloud_df['name'])

# An LNI row is kept when its canonical domain is absent from MediaCloud.
domain_in_mediacloud = lni_df['canonical_domain'].isin(mediacloud_urls)
mask = ~domain_in_mediacloud
filtered_df = lni_df.loc[mask].copy()
filtered_df

Unnamed: 0,id,state,county,media_name,media_type,year_loaded,fips,url,canonical_domain
2,3,VA,Augusta,ADJ Augusta Free Press,Newspaper,2024,51015,augustafreepress.com/news/sheriff-20k-in-coins...,augustafreepress.com
5,6,IA,Buena Vista,Buena Vista County Journal,Newspaper,2024,19021,buenavistacounty.iowa.gov,iowa.gov
6,7,IA,Pocahontas,Laurens Sun,Newspaper,2024,19151,laurens.advantage-preservation.com,advantage-preservation.com
7,8,IA,Pocahontas,Pocahontas Record-Democrat,Newspaper,2024,19151,newspaperarchive.com/search/location/us/ia/poc...,newspaperarchive.com
10,11,IL,DeKalb,Sandwich Record,Newspaper,2024,17037,ldsgenealogy.com/IL/Sandwich.htm,ldsgenealogy.com
...,...,...,...,...,...,...,...,...,...
7975,7976,IL,McLean,WGLT 89.1,Public Broadcasting,2024,17113,www.wglt.org,wglt.org
7976,7977,LA,East Baton Rouge,WRKF 89.3,Public Broadcasting,2024,22033,www.wrkf.org,wrkf.org
7977,7978,AK,Anchorage,KSKA 91.1,Public Broadcasting,2024,2020,radiostationusa.fm/online/kska,radiostationusa.fm
7979,7980,PA,Allegheny,WESA 90.5,Public Broadcasting,2024,42003,www.wesa.fm,wesa.fm


### Domains that appear more than once after filtering

In [59]:
# Count rows per canonical domain, most frequent first.
canonical_domains = (
    filtered_df
    .groupby('canonical_domain')
    .size()
    .sort_values(ascending=False)
)

# Report only the domains shared by more than one LNI row.
repeated_domains = canonical_domains[canonical_domains > 1]
for domain, occurrences in repeated_domains.items():
    print(domain, occurrences)

loc.gov 86
newspapers.com 85
axios.com 30
newsbreak.com 29
ground.news 25
ontvtonight.com 21
mainstreetnewsgroup.com 20
onlineradiobox.com 15
youtube.com 15
thepaperboy.com 15
advantage-preservation.com 14
suburbantribune.com 13
legacy.com 12
radiostationusa.fm 12
mainstreetmediatn.com 11
zillow.com 11
yelp.com 10
weremember.com 10
ldsgenealogy.com 9
unt.edu 9
chalkbeat.org 8
apnews.com 8
nyshistoricnewspapers.org 8
southtexasnews.com 7
lvpnews.com 7
tvpassport.com 7
ctinsider.com 7
vtcng.com 6
history.com 6
southeastiowaunion.com 6
pmg-ky1.com 6
mapcarta.com 6
midfloridanewspapers.com 6
sdna.com 6
okhistory.org 6
hometownnewsvolusia.com 5
nordaknorth.com 5
technical.ly 5
uscourts.gov 5
examinerpublications.com 5
southwestmessengerpress.com 5
hometownnewsbrevard.com 5
secondwavemedia.com 5
ndna.com 5
streetsblog.org 5
pmg-sc.com 5
pmg-ky2.com 5
rfdnewsgroup.com 4
sunrise-sunset.org 4
niche.com 4
communityimpact.com 4
511pa.com 4
appenmedia.com 4
mnhs.org 4
radio.net 4
timeanddate.com 4

### Summary of sources per county

In [60]:
# Number of unmatched LNI sources per county name.
# NOTE(review): this groups on `county` alone; lni_df also carries `state` and `fips`,
# and same-named counties in different states would be merged here — confirm this is intended.
county_counts = filtered_df.groupby('county').size()
for county, count in zip(county_counts.index, county_counts.to_numpy()):
    print(f"{county} - [{count}]")

Abbeville - [1]
Acadia - [2]
Ada - [4]
Adair - [3]
Adams - [13]
Aitkin - [1]
Alachua - [1]
Alamance - [2]
Alameda - [19]
Alamosa - [1]
Albany - [1]
Albemarle - [1]
Aleutians West - [1]
Alger - [1]
Allamakee - [1]
Allegan - [3]
Allegany - [1]
Alleghany - [1]
Allegheny - [11]
Allen - [5]
Amador - [1]
Amite - [1]
Anchorage - [6]
Anderson - [3]
Andrew - [1]
Anne Arundel - [3]
Anoka - [1]
Antelope - [2]
Aransas - [1]
Arapahoe - [4]
Arlington - [1]
Armstrong - [2]
Aroostook - [2]
Arthur - [1]
Ashland - [3]
Ashley - [1]
Ashtabula - [3]
Assumption - [1]
Atchison - [2]
Athens - [2]
Atkinson - [1]
Atlantic - [3]
Augusta - [2]
Aurora - [3]
Austin - [3]
Avery - [1]
Baca - [1]
Baker - [1]
Baldwin - [3]
Baltimore - [4]
Baltimore City - [7]
Bamberg - [1]
Baraga - [1]
Barber - [1]
Barbour - [2]
Barnes - [1]
Barnstable - [5]
Barnwell - [1]
Barron - [2]
Barry - [2]
Bartholomew - [1]
Barton - [2]
Bastrop - [1]
Bates - [1]
Bath - [2]
Bay - [1]
Beaufort - [1]
Beauregard - [1]
Beaver - [1]
Becker - [1]
Beck