# Comparing Sources: MediaCloud News Collections vs. thepaperboy.com

In [8]:
import pandas as pd
import requests
import mcmetadata
import json
import os

In [2]:
def download_file_from_google_drive(file_id, output_filename, timeout=60):
    """Download a Google Drive file to a local path and print the outcome.

    The file is fetched from ``https://drive.google.com/uc?id=<file_id>`` and
    streamed to disk in 8 KiB chunks so it is never held in memory as a whole.

    Args:
        file_id: Google Drive file ID inserted into the download URL.
        output_filename: Destination path. Missing parent directories are
            created before the file is opened.
        timeout: Seconds handed to ``requests.get`` so a stalled connection
            raises instead of hanging the notebook forever.

    Returns:
        None. Success or the failing HTTP status code is reported via print.
    """
    url = f"https://drive.google.com/uc?id={file_id}"

    # A streamed response keeps its connection checked out until it is fully
    # read or closed; the context manager guarantees release even on errors.
    with requests.get(url, stream=True, timeout=timeout) as response:
        if response.status_code != 200:
            print(f"Failed to download file. Status code: {response.status_code}")
            return

        # open() does not create directories, so make sure e.g. "tmp/" exists.
        parent_dir = os.path.dirname(output_filename)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        with open(output_filename, "wb") as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)

    print(f"File downloaded successfully as {output_filename}")

In [3]:
def extract_social_media_urls(social_media_json):
    """Collapse social-media entries into one ``"; "``-separated string.

    Args:
        social_media_json: Iterable of single-entry mappings such as
            ``[{'Facebook': 'https://...'}, ...]``. Only the first value of
            each mapping is used.

    Returns:
        The first value of every entry joined with ``"; "``. An empty string
        is returned when the input is empty/missing (``None``, ``[]``, NaN)
        or when any entry is malformed (not a mapping, an empty mapping, or a
        non-string value).
    """
    try:
        # Missing data is an expected case: handle it explicitly instead of
        # relying on an unbound local being swallowed by an exception handler.
        if not social_media_json:
            return ''
        urls = [list(item.values())[0] for item in social_media_json]
        return '; '.join(urls)
    except (AttributeError, IndexError, TypeError, ValueError):
        # Best-effort extraction: a malformed cell yields '' rather than
        # aborting the whole column. Only data-shape errors are caught, so
        # KeyboardInterrupt/SystemExit are no longer suppressed.
        return ''

In [4]:
# Local path the thepaperboy.com sources JSON is saved to; reused when loading it.
the_paperboy_sources_file = "tmp/the_paperboy_sources.json"
download_file_from_google_drive(
    file_id="1VViBh3TnpNqgh5zyYu58iuQbA1nuEGOZ",
    output_filename=the_paperboy_sources_file,
)

File downloaded successfully as tmp/the_paperboy_sources.json


In [5]:
# The MediaCloud sources are split into several Google Drive files so they are
# easy to download; each entry below is the Drive file ID of one part, and
# every part is saved locally as a numbered CSV.
mediacloud_sources_files = [
    "1ED1INjhodMV1jZeFKFMrphjE0K5wR5mE",
    "1ck1uGpr-J4NdwPAr-wnPiU6gGViI_Zfk",
    "1853yFlD-GupaxU6xuG0UJsMFn4w11f0N",
    "1C2BH2HSx30qsQTBZNIO2odEakwSgyCeq",
    "1oTJfWa1meZbEJoIC4EVGh935wv-OpVzb",
]
for part_number, drive_file_id in enumerate(mediacloud_sources_files):
    part_path = f"tmp/mediacloud_sources_part_{part_number}.csv"
    download_file_from_google_drive(drive_file_id, part_path)

File downloaded successfully as tmp/mediacloud_sources_part_0.csv
File downloaded successfully as tmp/mediacloud_sources_part_1.csv
File downloaded successfully as tmp/mediacloud_sources_part_2.csv
File downloaded successfully as tmp/mediacloud_sources_part_3.csv
File downloaded successfully as tmp/mediacloud_sources_part_4.csv


In [10]:
# Load every downloaded MediaCloud part and stack the parts into one DataFrame.
# The part count follows the list of Google Drive IDs rather than a hard-coded
# number, so adding or removing a part requires no change in this cell.
dataframes = []
for i in range(len(mediacloud_sources_files)):
    file = f"tmp/mediacloud_sources_part_{i}.csv"
    print("Reaading "+file)
    if os.path.exists(file):
        dataframes.append(pd.read_csv(file))
    else:
        print(f"File not found: {file}")

# pd.concat on an empty list fails with an opaque "No objects to concatenate";
# raise an error that points at the actual problem instead.
if not dataframes:
    raise FileNotFoundError(
        "No MediaCloud source parts were found under tmp/; run the download cell first."
    )
mediacloud_df = pd.concat(dataframes, ignore_index=True)

Reaading tmp/mediacloud_sources_part_0.csv
Reaading tmp/mediacloud_sources_part_1.csv
Reaading tmp/mediacloud_sources_part_2.csv
Reaading tmp/mediacloud_sources_part_3.csv
Reaading tmp/mediacloud_sources_part_4.csv


In [11]:
# Parse the downloaded thepaperboy.com sources JSON into a DataFrame.
the_paperboy_df = pd.read_json(path_or_buf=the_paperboy_sources_file)

In [12]:
# Derive a canonical domain from every source URL; this is the key used to
# match thepaperboy.com entries against the MediaCloud sources.
the_paperboy_df["canonical_domain"] = the_paperboy_df["url"].map(mcmetadata.urls.canonical_domain)

In [15]:
# Flatten each row's `social_media` entry into a single "; "-separated string of URLs.
the_paperboy_df['social_media_urls'] = the_paperboy_df['social_media'].map(extract_social_media_urls)

### Get sources that were scraped from `thepaperboy.com` but are not present in the MediaCloud directory

In [16]:
# Values of MediaCloud's `name` column are compared directly against the
# canonical domain computed for each thepaperboy.com source.
mediacloud_names = set(mediacloud_df['name'])
is_in_mediacloud = the_paperboy_df['canonical_domain'].isin(mediacloud_names)

# Keep only the sources MediaCloud does not have; .copy() makes the subset an
# independent frame rather than a view of the_paperboy_df.
filtered_df = the_paperboy_df.loc[~is_in_mediacloud].copy()
filtered_df

Unnamed: 0,state,country,url,city,language,name,description,social_media,canonical_domain,social_media_urls
1,Kabul,Afghanistan,http://afghan-sports.com/,Kabul,English,Afghan Sports,,,afghan-sports.com,
17,Tirane,Albania,http://www.albania-sport.com/,Tirana,Albanian,Albania Sport,,,albania-sport.com,
30,Oran,Algeria,http://www.quotidien-oran.com/,Oran Rp,French,Le Quotidien d'Oran,,,quotidien-oran.com,
37,Saint John,Antigua and Barbuda,http://www.antiguasunonline.com/,St. John's,English,Antigua Sun,,,antiguasunonline.com,
44,Chubut,Argentina,http://www.diarioeloeste.com.ar/,Comodoro Rivadavia,Spanish,Diario El Oeste,,,diarioeloeste.com.ar,
...,...,...,...,...,...,...,...,...,...,...
11617,Wisconsin,United States,http://www.freemanol.com/,Waukesha,English,Waukesha Freeman,The Waukesha Freeman is a prominent newspaper ...,[{'Facebook': 'https://www.facebook.com/Waukes...,freemanol.com,https://www.facebook.com/WaukeshaFreeman; http...
11625,Wyoming,United States,http://www.campbellcountyobserver.net/,Gillette,English,Campbell County Observer,The Campbell County Observer newspaper is a pr...,[{'Facebook': 'https://www.facebook.com/pages/...,campbellcountyobserver.net,https://www.facebook.com/pages/Campbell-County...
11628,Wyoming,United States,http://www.cheyennenetwork.com/,Cheyenne,English,Cheyenne Network,The Cheyenne Network newspaper is a prominent ...,[{'Facebook': 'https://www.facebook.com/pages/...,cheyennenetwork.com,https://www.facebook.com/pages/Wyoming-Network...
11631,Wyoming,United States,http://www.theglenrockbird.com/,Glenrock,English,Glenrock Bird,The Glenrock Bird newspaper serves as the voic...,[{'Facebook': 'https://www.facebook.com/TheGle...,theglenrockbird.com,https://www.facebook.com/TheGlenrockBird


### Export the content to CSV

#### Exporting all thepaperboy.com sources

In [17]:
# The raw `social_media` column is left out of the export; its URLs are already
# available in flattened form in `social_media_urls`.
# errors='ignore' keeps this cell re-runnable: on a second run the column is
# already gone and a plain drop() would raise KeyError.
the_paperboy_df = the_paperboy_df.drop(columns=['social_media'], errors='ignore')
the_paperboy_df.to_csv('tmp/all_sources_from_thepaperboy.csv', index=False)

#### Exporting all sources from thepaperboy.com not present in the MediaCloud directory

In [18]:
# Write the thepaperboy.com sources that are missing from MediaCloud to CSV.
# (A bare `filtered_df.head()` used to precede this line; since it was not the
# last expression of the cell it displayed nothing, so it was removed.)
# NOTE(review): filtered_df was copied before `social_media` was dropped from
# the_paperboy_df, so this CSV still contains the raw `social_media` column in
# addition to `social_media_urls` — confirm that this is intended.
filtered_df.to_csv('tmp/sources_in_thepaperboy_not_in_mc.csv', index=False)