# Comparing Sources: MediaCloud Geographic Online News Collections vs. Local News Initiative

In [2]:
import pandas as pd
import requests
import mcmetadata

In [8]:
def download_file_from_google_drive(file_id, output_filename):
    url = f"https://drive.google.com/uc?id={file_id}"
    response = requests.get(url, stream=True)

    if response.status_code == 200:
        with open(output_filename, "wb") as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)
        print(f"File downloaded successfully as {output_filename}")
    else:
        print(f"Failed to download file. Status code: {response.status_code}")

In [16]:
mediacloud_sources_file="mediacloud_sources.json"
download_file_from_google_drive("1j53R2Zwx8al1YSFJ2VahTBrGVQbwxcJz", mediacloud_sources_file)

File downloaded successfully as mediacloud_sources.json


In [36]:
lni_sources_file="lni_sources.json"
download_file_from_google_drive("1iHHZ6H9er6MICozbC1pCuA-6gTE5SIIm", lni_sources_file)

File downloaded successfully as lni_sources.json


In [37]:
lni_df = pd.read_json(lni_sources_file)

In [38]:
mediacloud_df = pd.read_json(mediacloud_sources_file)
mediacloud_local_sources_df = mediacloud_df[mediacloud_df['pub_country'] == 'USA']

### Get sources that were scraped from `The Local News Initiative` but not present in MediaCloud directory by using Source Names

In [39]:
mediacloud_source_names = set(mediacloud_df['label'])
mask = ~lni_df['media_name'].isin(mediacloud_source_names)
filtered_df = lni_df[mask].copy()
filtered_df

Unnamed: 0,id,state,county,media_name,media_type,year_loaded,fips,url
0,1,VA,Washington,ADJ Bristol Herald Courier,Newspaper,2024,51191,heraldcourier.com
1,2,VA,Augusta,ADJ The News Leader,Newspaper,2024,51015,www.newsleader.com
2,3,VA,Augusta,ADJ Augusta Free Press,Newspaper,2024,51015,augustafreepress.com/news/sheriff-20k-in-coins...
3,4,NC,Madison,News-Record and Sentinel,Newspaper,2024,37115,www.citizen-times.com/local/news-record-and-se...
4,5,WI,Wood,Marshfield News-Herald,Newspaper,2024,55141,www.marshfieldnewsherald.com
...,...,...,...,...,...,...,...,...
7977,7978,AK,Anchorage,KSKA 91.1,Public Broadcasting,2024,2020,radiostationusa.fm/online/kska
7978,7979,AK,Bethel,KYUK 90.3,Public Broadcasting,2024,2050,www.kyuk.org
7979,7980,PA,Allegheny,WESA 90.5,Public Broadcasting,2024,42003,www.wesa.fm
7980,7981,PA,Northampton,WLVR 91.3,Public Broadcasting,2024,42095,www.wlvr.org


In [40]:
lni_df

Unnamed: 0,id,state,county,media_name,media_type,year_loaded,fips,url
0,1,VA,Washington,ADJ Bristol Herald Courier,Newspaper,2024,51191,heraldcourier.com
1,2,VA,Augusta,ADJ The News Leader,Newspaper,2024,51015,www.newsleader.com
2,3,VA,Augusta,ADJ Augusta Free Press,Newspaper,2024,51015,augustafreepress.com/news/sheriff-20k-in-coins...
3,4,NC,Madison,News-Record and Sentinel,Newspaper,2024,37115,www.citizen-times.com/local/news-record-and-se...
4,5,WI,Wood,Marshfield News-Herald,Newspaper,2024,55141,www.marshfieldnewsherald.com
...,...,...,...,...,...,...,...,...
7977,7978,AK,Anchorage,KSKA 91.1,Public Broadcasting,2024,2020,radiostationusa.fm/online/kska
7978,7979,AK,Bethel,KYUK 90.3,Public Broadcasting,2024,2050,www.kyuk.org
7979,7980,PA,Allegheny,WESA 90.5,Public Broadcasting,2024,42003,www.wesa.fm
7980,7981,PA,Northampton,WLVR 91.3,Public Broadcasting,2024,42095,www.wlvr.org


### Get sources that were scraped from `The Local News Initiative` but not present in MediaCloud directory by using Canonical Domain

In [48]:
lni_df["canonical_domain"] = lni_df["url"].apply(mcmetadata.urls.canonical_domain)

In [51]:
mediacloud_urls = set(mediacloud_df['name'])
mask = ~lni_df['canonical_domain'].isin(mediacloud_urls)
filtered_df = lni_df[mask].copy()
filtered_df

Unnamed: 0,id,state,county,media_name,media_type,year_loaded,fips,url,canonical_domain
2,3,VA,Augusta,ADJ Augusta Free Press,Newspaper,2024,51015,augustafreepress.com/news/sheriff-20k-in-coins...,augustafreepress.com
6,7,IA,Pocahontas,Laurens Sun,Newspaper,2024,19151,laurens.advantage-preservation.com,advantage-preservation.com
10,11,IL,DeKalb,Sandwich Record,Newspaper,2024,17037,ldsgenealogy.com/IL/Sandwich.htm,ldsgenealogy.com
22,23,MO,DeKalb,DeKalb County Record-Herald,Newspaper,2024,29063,www.newspapers.com/paper/the-dekalb-county-rec...,newspapers.com
29,30,WA,Columbia,Dayton Chronicle,Newspaper,2024,53013,www.daytonchronicle.com,daytonchronicle.com
...,...,...,...,...,...,...,...,...,...
7975,7976,IL,McLean,WGLT 89.1,Public Broadcasting,2024,17113,www.wglt.org,wglt.org
7976,7977,LA,East Baton Rouge,WRKF 89.3,Public Broadcasting,2024,22033,www.wrkf.org,wrkf.org
7977,7978,AK,Anchorage,KSKA 91.1,Public Broadcasting,2024,2020,radiostationusa.fm/online/kska,radiostationusa.fm
7979,7980,PA,Allegheny,WESA 90.5,Public Broadcasting,2024,42003,www.wesa.fm,wesa.fm


### Summary of Sources by county

In [52]:
county_counts = filtered_df.groupby('county').size()

In [53]:
for county, count in county_counts.items():
    print(f"{county} - [{count}]")

Abbeville - [1]
Acadia - [1]
Ada - [4]
Adair - [3]
Adams - [11]
Alachua - [1]
Alamance - [2]
Alameda - [17]
Alamosa - [1]
Albany - [1]
Aleutians West - [1]
Alger - [1]
Allamakee - [1]
Allegan - [2]
Alleghany - [1]
Allegheny - [9]
Allen - [4]
Amador - [1]
Amite - [1]
Anchorage - [6]
Anderson - [3]
Andrew - [1]
Anne Arundel - [2]
Anoka - [1]
Antelope - [2]
Aransas - [1]
Arapahoe - [2]
Arlington - [1]
Armstrong - [1]
Arthur - [1]
Ashland - [3]
Ashley - [1]
Ashtabula - [2]
Assumption - [1]
Atchison - [1]
Athens - [2]
Atkinson - [1]
Atlantic - [3]
Augusta - [2]
Aurora - [1]
Austin - [3]
Avery - [1]
Baker - [1]
Baldwin - [2]
Baltimore - [2]
Baltimore City - [7]
Baraga - [1]
Barber - [1]
Barbour - [1]
Barnes - [1]
Barnstable - [3]
Barnwell - [1]
Barron - [2]
Barry - [2]
Bartholomew - [1]
Barton - [1]
Bates - [1]
Bath - [1]
Bay - [1]
Beaufort - [1]
Beauregard - [1]
Beaver - [1]
Becker - [1]
Beckham - [1]
Bedford - [2]
Bee - [1]
Belknap - [3]
Bell - [2]
Beltrami - [2]
Ben Hill - [1]
Benton - [8