# Comparing Sources: MediaCloud Collections vs. Local News Initiative

In [241]:
import pandas as pd
import requests
import mcmetadata
import os

In [242]:
def clean_url(url):
    """Normalize a URL so that equivalent addresses compare equal.

    Strips surrounding whitespace, one leading ``http://`` or ``https://``
    scheme (matched case-insensitively), one leading ``www.`` and any
    trailing slashes. Only the *start* of the string is touched, so a URL
    embedded in a path or query string is left intact.

    Args:
        url: URL string to normalize. Non-string values (for example
            ``None`` or NaN coming from a DataFrame column) are returned
            unchanged instead of raising.

    Returns:
        The normalized URL string, or ``url`` itself when it is not a string.
    """
    if not isinstance(url, str):
        return url

    url = url.strip()

    # Remove the scheme only when it is a prefix; a blanket str.replace would
    # also rewrite URLs that appear later in the string.
    lowered = url.lower()
    for scheme in ('https://', 'http://'):
        if lowered.startswith(scheme):
            url = url[len(scheme):]
            break

    if url.lower().startswith('www.'):
        url = url[4:]

    return url.rstrip('/')

In [243]:
def download_file_from_google_drive(file_id, output_filename, timeout=60):
    """Download a publicly shared Google Drive file to a local path.

    Args:
        file_id: Google Drive file id, inserted into the ``uc?id=`` URL.
        output_filename: Local path to write to. Its parent directory is
            created when it does not exist yet.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot hang the notebook indefinitely.

    Returns:
        None. Success or failure is reported with ``print``; a non-200
        response does not raise.
    """
    url = f"https://drive.google.com/uc?id={file_id}"

    # The context manager releases the streamed connection even when writing
    # the file fails part-way through.
    with requests.get(url, stream=True, timeout=timeout) as response:
        if response.status_code != 200:
            print(f"Failed to download file. Status code: {response.status_code}")
            return

        # NOTE(review): any 200 response is treated as success; presumably
        # Drive can answer 200 with an HTML page for some files - verify the
        # downloaded content if a later read fails.

        # Make sure the destination directory (e.g. "tmp/") exists on a
        # fresh checkout before opening the file for writing.
        target_dir = os.path.dirname(output_filename)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)

        with open(output_filename, "wb") as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)

    print(f"File downloaded successfully as {output_filename}")

### Download files from Google Drive

In [244]:
# Local destination for the LNI sources JSON fetched from Google Drive.
lni_sources_file = "tmp/lni_sources.json"
download_file_from_google_drive(
    file_id="1iHHZ6H9er6MICozbC1pCuA-6gTE5SIIm",
    output_filename=lni_sources_file,
)

File downloaded successfully as tmp/lni_sources.json


In [245]:
# The MediaCloud sources export is split into several parts so that each one
# can be downloaded from Google Drive easily; every entry is a Drive file id.
mediacloud_sources_files = [
    "1ED1INjhodMV1jZeFKFMrphjE0K5wR5mE",
    "1ck1uGpr-J4NdwPAr-wnPiU6gGViI_Zfk",
    "1853yFlD-GupaxU6xuG0UJsMFn4w11f0N",
    "1C2BH2HSx30qsQTBZNIO2odEakwSgyCeq",
    "1oTJfWa1meZbEJoIC4EVGh935wv-OpVzb",
]
# Each part is saved under a name numbered by its position in the list.
for part_number, drive_file_id in enumerate(mediacloud_sources_files):
    part_path = f"tmp/mediacloud_sources_part_{part_number}.csv"
    download_file_from_google_drive(drive_file_id, part_path)

File downloaded successfully as tmp/mediacloud_sources_part_0.csv
File downloaded successfully as tmp/mediacloud_sources_part_1.csv
File downloaded successfully as tmp/mediacloud_sources_part_2.csv
File downloaded successfully as tmp/mediacloud_sources_part_3.csv
File downloaded successfully as tmp/mediacloud_sources_part_4.csv


### Read the downloaded files

In [246]:
# Parse the downloaded LNI sources JSON into a dataframe.
lni_df = pd.read_json(path_or_buf=lni_sources_file)

In [247]:
# Load every downloaded MediaCloud part and stack them into one dataframe.
# The number of parts follows the list of Drive ids used for the download
# rather than a hard-coded count, so the two cells cannot drift apart.
dataframes = []
for i in range(len(mediacloud_sources_files)):
    file = f"tmp/mediacloud_sources_part_{i}.csv"
    print("Reading "+file)
    if os.path.exists(file):
        dataframes.append(pd.read_csv(file))
    else:
        print(f"File not found: {file}")

# pd.concat fails with an unhelpful message on an empty list; fail early with
# a message that points at the actual cause instead.
if not dataframes:
    raise FileNotFoundError(
        "No MediaCloud source parts were found under tmp/; run the download cell first."
    )
mediacloud_df = pd.concat(dataframes, ignore_index=True)

Reading tmp/mediacloud_sources_part_0.csv
Reading tmp/mediacloud_sources_part_1.csv
Reading tmp/mediacloud_sources_part_2.csv
Reading tmp/mediacloud_sources_part_3.csv
Reading tmp/mediacloud_sources_part_4.csv


### Get sources that were scraped from `The Local News Initiative` but not present in MediaCloud directory by using Canonical Domain

#### Add canonical_domain and clean_url fields to the LNI dataset that will be used for comparison with MediaCloud sources

In [248]:
# Derive both comparison keys from the raw LNI "url" column.
lni_url_column = lni_df["url"]
lni_df["canonical_domain"] = lni_url_column.map(mcmetadata.urls.canonical_domain)
lni_df["clean_url"] = lni_url_column.map(clean_url)

In [249]:
# Preview the first rows to check the two new columns.
lni_df.head(n=5)

Unnamed: 0,id,state,county,media_name,media_type,year_loaded,fips,url,canonical_domain,clean_url
0,1,VA,Washington,ADJ Bristol Herald Courier,Newspaper,2024,51191,heraldcourier.com,heraldcourier.com,heraldcourier.com
1,2,VA,Augusta,ADJ The News Leader,Newspaper,2024,51015,www.newsleader.com,newsleader.com,newsleader.com
2,3,VA,Augusta,ADJ Augusta Free Press,Newspaper,2024,51015,augustafreepress.com/news/sheriff-20k-in-coins...,augustafreepress.com,augustafreepress.com/news/sheriff-20k-in-coins...
3,4,NC,Madison,News-Record and Sentinel,Newspaper,2024,37115,www.citizen-times.com/local/news-record-and-se...,citizen-times.com,citizen-times.com/local/news-record-and-sentinel
4,5,WI,Wood,Marshfield News-Herald,Newspaper,2024,55141,www.marshfieldnewsherald.com,marshfieldnewsherald.com,marshfieldnewsherald.com


#### Add clean_url field to MediaCloud sources dataset

In [250]:
# Normalize each MediaCloud homepage into the same form used for LNI URLs.
mediacloud_df["clean_url"] = mediacloud_df["homepage"].map(clean_url)

In [291]:
# Look at the first 1000 MediaCloud sources (pandas truncates the display).
mediacloud_df.head(n=1000)

Unnamed: 0,source_id,name,label,homepage,clean_url
0,1595389,lyvinthenet.wordpress.com,lyvinthenet.wordpress.com,http://lyvinthenet.wordpress.com/,lyvinthenet.wordpress.com
1,1595390,artificialintelligencemania.com,artificialintelligencemania.com,http://artificialintelligencemania.com/,artificialintelligencemania.com
2,1595391,jhrogue.blogspot.com,jhrogue.blogspot.com,http://jhrogue.blogspot.com/,jhrogue.blogspot.com
3,1595392,teach-ict.co,teach-ict.co,http://teach-ict.co/,teach-ict.co
4,1595393,bbb2022.com,bbb2022.com,http://bbb2022.com/,bbb2022.com
...,...,...,...,...,...
995,1596610,dieseltykes.com,dieseltykes.com,http://dieseltykes.com/,dieseltykes.com
996,1596611,measureaustin.org,measureaustin.org,http://measureaustin.org/,measureaustin.org
997,1596612,baylaurelfund.com,baylaurelfund.com,http://baylaurelfund.com/,baylaurelfund.com
998,1596613,bearracuda.com,bearracuda.com,http://bearracuda.com/,bearracuda.com


#### Make the comparison between the two dataframes using `clean_url` (in MediaCloud sources) and `canonical domain` (in LNI sources)
When we make this comparison we will lose all sources from patch.com, because they all share the single canonical domain `patch.com`. We therefore need a second comparison, restricted to the sources coming from patch.com, that uses `clean_url` only.

In [313]:
# Keep the LNI rows whose canonical domain does not appear among the
# MediaCloud clean URLs.
mediacloud_clean_urls = set(mediacloud_df['clean_url'])
is_in_mediacloud = lni_df['canonical_domain'].isin(mediacloud_clean_urls)
filtered_df = lni_df.loc[~is_in_mediacloud].copy()
filtered_df.shape

(2222, 10)

#### Get sources from "patch.com". These need special processing: if we use the canonical domain for comparison, we will lose a lot of data, because all of these sources share one domain, i.e. patch.com

In [314]:
# Select the LNI rows hosted on patch.com.
# Matching on the normalized `clean_url` prefix avoids two problems of a
# substring search on the raw url: `str.contains` treats its pattern as a
# regular expression by default (so "." matches any character), and an
# unanchored "patch.com/" also matches other hosts that merely end in
# "patch.com", such as "dispatch.com/".
lni_clean_urls = lni_df['clean_url']
is_patch_lni = (lni_clean_urls == 'patch.com') | lni_clean_urls.str.startswith('patch.com/', na=False)
patch_lni_df = lni_df[is_patch_lni]
patch_lni_df.shape

(545, 10)

In [315]:
# Select the MediaCloud sources hosted on patch.com.
# The normalized `clean_url` is matched as a literal prefix: the previous
# "://patch.com" pattern was interpreted as a regular expression, matched
# hosts that only start with "patch.com" (e.g. "patch.community"), and
# missed homepages written with "www." or without a scheme.
mc_clean_urls = mediacloud_df['clean_url']
is_patch_mediacloud = (mc_clean_urls == 'patch.com') | mc_clean_urls.str.startswith('patch.com/', na=False)
patch_mediacloud_df = mediacloud_df[is_patch_mediacloud]
patch_mediacloud_df.shape

(271, 5)

In [318]:
# Keep the patch.com LNI rows whose clean URL is not among the patch.com
# MediaCloud sources.
# The mask is built from `patch_lni_df` itself: the previous `_patch_lni_df`
# is not defined anywhere in the notebook and raises NameError on a fresh
# kernel. A dedicated name is used for the URL set so it is not confused
# with the full MediaCloud URL set used in the domain-level comparison.
patch_mediacloud_urls = set(patch_mediacloud_df['clean_url'])
patch_mask = ~patch_lni_df['clean_url'].isin(patch_mediacloud_urls)
_filtered_df = patch_lni_df[patch_mask].copy()
_filtered_df

Unnamed: 0,id,state,county,media_name,media_type,year_loaded,fips,url,canonical_domain,clean_url
320,321,MD,Baltimore,Arbutus Times,Newspaper,2024,24005,patch.com/maryland/arbutus,patch.com,patch.com/maryland/arbutus
359,360,IL,Lake,Highland Park News,Newspaper,2024,17097,patch.com/illinois/highlandpark,patch.com,patch.com/illinois/highlandpark
390,391,IL,Cook,Winnetka Talk,Newspaper,2024,17031,patch.com/illinois/winnetka,patch.com,patch.com/illinois/winnetka
416,417,PA,Montgomery,Phoenix Reporter and Item,Newspaper,2024,42091,patch.com/pennsylvania/lansdale/police-fire,patch.com,patch.com/pennsylvania/lansdale/police-fire
500,501,NY,Nassau,Port Washington News,Newspaper,2024,36059,patch.com/new-york/portwashington,patch.com,patch.com/new-york/portwashington
...,...,...,...,...,...,...,...,...,...,...
6981,6982,CT,Capitol,Patch Wethersfield,Network Sites,2024,9110,patch.com/connecticut/wethersfield?page=2,patch.com,patch.com/connecticut/wethersfield?page=2
6982,6983,CT,Capitol,Patch Windsor,Network Sites,2024,9110,patch.com/connecticut/windsor,patch.com,patch.com/connecticut/windsor
6983,6984,CT,Capitol,Patch Windsor Locks-East Windsor,Network Sites,2024,9110,patch.com/connecticut/windsorlocks,patch.com,patch.com/connecticut/windsorlocks
6984,6985,MD,Baltimore City,Patch Baltimore,Network Sites,2024,24510,patch.com/maryland/baltimore,patch.com,patch.com/maryland/baltimore


### Export the content to CSV

#### Exporting sources from LNI not present in the MediaCloud directory

In [349]:
# Stack the patch.com-specific result on top of the domain-level result and
# keep only the first occurrence of every clean URL.
df_combined = (
    pd.concat([_filtered_df, filtered_df], ignore_index=True)
    .drop_duplicates(subset='clean_url', keep='first')
)
# Write the de-duplicated list without the dataframe index.
df_combined.to_csv('tmp/sources_in_lni_not_in_mc_updated.csv', index=False)
df_combined.shape

(2462, 10)

#### Exporting all sources from LNI

In [351]:
# Preview the first rows of the full LNI dataframe.
lni_df.head(n=5)

Unnamed: 0,id,state,county,media_name,media_type,year_loaded,fips,url,canonical_domain,clean_url
0,1,VA,Washington,ADJ Bristol Herald Courier,Newspaper,2024,51191,heraldcourier.com,heraldcourier.com,heraldcourier.com
1,2,VA,Augusta,ADJ The News Leader,Newspaper,2024,51015,www.newsleader.com,newsleader.com,newsleader.com
2,3,VA,Augusta,ADJ Augusta Free Press,Newspaper,2024,51015,augustafreepress.com/news/sheriff-20k-in-coins...,augustafreepress.com,augustafreepress.com/news/sheriff-20k-in-coins...
3,4,NC,Madison,News-Record and Sentinel,Newspaper,2024,37115,www.citizen-times.com/local/news-record-and-se...,citizen-times.com,citizen-times.com/local/news-record-and-sentinel
4,5,WI,Wood,Marshfield News-Herald,Newspaper,2024,55141,www.marshfieldnewsherald.com,marshfieldnewsherald.com,marshfieldnewsherald.com
