# Comparing Sources: MediaCloud Geographic Online News Collections vs. thepaperboy.com

In [1]:
import pandas as pd
import requests
import mcmetadata

In [2]:
def download_file_from_google_drive(file_id, output_filename, timeout=30):
    """Download a file from Google Drive by its file ID and save it locally.

    The file is requested from ``https://drive.google.com/uc?id=<file_id>``
    and streamed to disk in 8 KiB chunks so the whole payload is never held
    in memory at once.

    Args:
        file_id: Google Drive file identifier interpolated into the URL.
        output_filename: Local path the response body is written to
            (opened in binary write mode, so an existing file is overwritten).
        timeout: Seconds passed to ``requests.get`` as the connect/read
            timeout, so a stalled server cannot hang the notebook forever.

    Returns:
        None. A success or failure message is printed instead.

    Raises:
        requests.exceptions.RequestException: propagated from ``requests``
            on connection problems or when the timeout elapses.
    """
    url = f"https://drive.google.com/uc?id={file_id}"

    # With stream=True the connection stays open until the body is consumed
    # or the response is closed; the context manager releases it on every
    # path, including the non-200 branch where the body is never read.
    with requests.get(url, stream=True, timeout=timeout) as response:
        if response.status_code == 200:
            with open(output_filename, "wb") as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)
            print(f"File downloaded successfully as {output_filename}")
        else:
            print(f"Failed to download file. Status code: {response.status_code}")
        

In [3]:
# Local filename for the thepaperboy.com source list fetched from Google Drive.
the_paperboy_sources_file = "the_paperboy_sources.json"
download_file_from_google_drive(
    file_id="1VViBh3TnpNqgh5zyYu58iuQbA1nuEGOZ",
    output_filename=the_paperboy_sources_file,
)

File downloaded successfully as the_paperboy_sources.json


In [4]:
# Local filename for the MediaCloud source list fetched from Google Drive.
mediacloud_sources_file = "mediacloud_sources.json"
download_file_from_google_drive(
    file_id="1j53R2Zwx8al1YSFJ2VahTBrGVQbwxcJz",
    output_filename=mediacloud_sources_file,
)

File downloaded successfully as mediacloud_sources.json


In [12]:
# Parse the downloaded thepaperboy.com JSON file into a DataFrame.
the_paperboy_df = pd.read_json(path_or_buf=the_paperboy_sources_file)

In [13]:
# Parse the downloaded MediaCloud JSON file into a DataFrame.
mediacloud_df = pd.read_json(path_or_buf=mediacloud_sources_file)

In [14]:
# Derive a "canonical_domain" column by passing every value of the "url"
# column through mcmetadata.urls.canonical_domain; the column is added to
# the_paperboy_df in place and is used below to match against MediaCloud.
# NOTE(review): assumes every row has a usable string in "url" - confirm that
# missing/NaN URLs cannot occur or are handled by the helper.
the_paperboy_df["canonical_domain"] = the_paperboy_df["url"].apply(mcmetadata.urls.canonical_domain)

### Get sources that were scraped from `thepaperboy.com` but are not present in the MediaCloud directory

In [29]:
# Collect the MediaCloud "name" values into a set for fast membership tests.
# NOTE(review): presumably "name" holds each source's canonical domain, which
# is what makes it comparable to "canonical_domain" below - confirm.
mediacloud_urls = set(mediacloud_df["name"])

# True for thepaperboy.com rows whose canonical domain is absent from MediaCloud.
mask = ~the_paperboy_df["canonical_domain"].isin(mediacloud_urls)

# Keep only the unmatched rows; copy so later edits do not touch the original frame.
filtered_df = the_paperboy_df.loc[mask].copy()
filtered_df

Unnamed: 0,state,country,url,city,language,name,description,social_media,canonical_domain
1,Kabul,Afghanistan,http://afghan-sports.com/,Kabul,English,Afghan Sports,,,afghan-sports.com
10,Kabul,Afghanistan,http://www.sabawoon.com/,Kabul,English/Pashto,Sabawoon,,,sabawoon.com
17,Tirane,Albania,http://www.albania-sport.com/,Tirana,Albanian,Albania Sport,,,albania-sport.com
22,Tirane,Albania,http://www.tanmarket.com/php/,Tirana,Albanian,Tan Portal,,,tanmarket.com
28,Alger,Algeria,http://www.akhbarelyoum.dz/ar/index.php,Algiers,Arabic,El Youm,,,akhbarelyoum.dz
...,...,...,...,...,...,...,...,...,...
11626,Wyoming,United States,http://www.casperjournal.com/,Casper,English,Casper Journal,The Casper Journal newspaper has long been a t...,[{'Facebook': 'https://www.facebook.com/pages/...,casperjournal.com
11628,Wyoming,United States,http://www.cheyennenetwork.com/,Cheyenne,English,Cheyenne Network,The Cheyenne Network newspaper is a prominent ...,[{'Facebook': 'https://www.facebook.com/pages/...,cheyennenetwork.com
11631,Wyoming,United States,http://www.theglenrockbird.com/,Glenrock,English,Glenrock Bird,The Glenrock Bird newspaper serves as the voic...,[{'Facebook': 'https://www.facebook.com/TheGle...,theglenrockbird.com
11635,Wyoming,United States,http://www.jacksonholenet.com/,Jackson,English,Jackson Hole Net,The Jackson Hole Net is a prominent newspaper ...,[{'Facebook': 'https://www.facebook.com/Jackso...,jacksonholenet.com


### Summary of sources missing from MediaCloud, by country

In [43]:
# Number of unmatched thepaperboy.com sources per country (rows per group).
country_counts = filtered_df.groupby(by="country").size()

In [44]:
# Print one "<country> - [<count>]" line per country, in index order.
for country_name, n_sources in country_counts.items():
    print(f"{country_name} - [{n_sources}]")

Afghanistan - [2]
Albania - [2]
Algeria - [4]
Antigua and Barbuda - [1]
Argentina - [14]
Armenia - [3]
Australia - [124]
Austria - [19]
Azerbaijan - [8]
Bahrain - [5]
Bangladesh - [14]
Barbados - [1]
Belarus - [4]
Belgium - [4]
Belize - [3]
Bolivia - [10]
Bosnia and Herzegovina - [5]
Botswana - [2]
Brazil - [10]
Bulgaria - [13]
Burkina Faso - [1]
Cambodia - [5]
Canada - [230]
Cayman Islands - [1]
Chile - [15]
China - [11]
Colombia - [5]
Costa Rica - [5]
Croatia - [12]
Cyprus - [9]
Czech Republic - [15]
Democratic Republic of Congo - [1]
Denmark - [21]
Dominica - [1]
Dominican Republic - [15]
Dutch Antilles - [1]
Ecuador - [10]
Egypt - [19]
El Salvador - [3]
England - [380]
Estonia - [9]
Ethiopia - [1]
Finland - [52]
France - [35]
Gambia - [1]
Georgia - [3]
Germany - [135]
Ghana - [3]
Greece - [16]
Guam - [1]
Guatemala - [4]
Guinea - [4]
Haiti - [1]
Honduras - [6]
Hong Kong - [6]
Hungary - [9]
Iceland - [2]
India - [164]
Indonesia - [19]
Iran - [6]
Iraq - [4]
Ireland - [36]
Isle of Man 