# Extract articles that mention countries, states, or cities as categories 

In [None]:
%pip install pycountry geonamescache geopy requests_cache tqdm

In [349]:
import time
import pycountry
import pandas as pd
from geopy.geocoders import Nominatim
import requests_cache
from pandas.core.common import flatten
import ast
import re
from tqdm import tqdm

## Find all unique instances of places in NYT coverage

In [350]:
full_year = pd.read_csv("../../input-data/temp-data.json")

In [351]:
full_year.columns

Index(['Unnamed: 0', 'abstract', 'web_url', 'snippet', 'lead_paragraph',
       'print_section', 'print_page', 'source', 'multimedia', 'headline',
       'keywords', 'pub_date', 'document_type', 'news_desk', 'section_name',
       'byline', 'type_of_material', '_id', 'word_count', 'uri',
       'subsection_name'],
      dtype='object')

In [352]:
full_year_essential = full_year[["_id", "section_name", "keywords", "pub_date"]].copy()

In [353]:
full_year_essential

Unnamed: 0,_id,section_name,keywords,pub_date
0,nyt://article/da8532bd-f9bd-5ca3-9e7e-afef6e9f...,Business Day,"[{'name': 'subject', 'value': 'Careers and Pro...",2024-09-01T04:01:07+0000
1,nyt://article/aeabc262-aeb0-5423-a7ac-8bb664cb...,World,"[{'name': 'glocations', 'value': 'India', 'ran...",2024-09-01T04:01:25+0000
2,nyt://article/42c0d0f2-ea62-5d2b-8eba-baa04180...,World,"[{'name': 'glocations', 'value': 'Maracaibo (V...",2024-09-01T04:01:27+0000
3,nyt://article/6393c6c3-0e1f-5494-925d-165e7aaf...,World,"[{'name': 'subject', 'value': 'Israel-Gaza War...",2024-09-01T04:01:43+0000
4,nyt://article/fe046102-78e5-530d-89e0-59ff09c0...,World,"[{'name': 'glocations', 'value': 'Germany', 'r...",2024-09-01T04:01:43+0000
...,...,...,...,...
48691,nyt://article/15ef03c9-295b-50e0-a0f4-64f9a182...,U.S.,"[{'name': 'subject', 'value': 'Deaths (Obituar...",2024-08-31T14:57:12+0000
48692,nyt://article/3a3d339e-87ab-5650-b797-b7bb3cb0...,World,"[{'name': 'subject', 'value': 'Military Aircra...",2024-08-31T14:57:53+0000
48693,nyt://article/1192db0c-51bd-525b-abb4-e8607c11...,Opinion,"[{'name': 'subject', 'value': 'internal-sub-on...",2024-08-31T15:00:03+0000
48694,nyt://article/83b24708-09af-55b1-baac-7efca171...,Food,"[{'name': 'subject', 'value': 'Cooking and Coo...",2024-08-31T15:00:03+0000


In [354]:
full_year_essential['keywords'] = full_year_essential['keywords'].apply(ast.literal_eval)

In [355]:
# Converts the object-like string in keywords to a list
full_year_essential['keywords'] = full_year_essential['keywords'].apply(lambda x: [keyword['value'] for keyword in x])

In [356]:
full_year_essential

Unnamed: 0,_id,section_name,keywords,pub_date
0,nyt://article/da8532bd-f9bd-5ca3-9e7e-afef6e9f...,Business Day,"[Careers and Professions, Hiring and Promotion...",2024-09-01T04:01:07+0000
1,nyt://article/aeabc262-aeb0-5423-a7ac-8bb664cb...,World,"[India, Hospitals, Workplace Hazards and Viola...",2024-09-01T04:01:25+0000
2,nyt://article/42c0d0f2-ea62-5d2b-8eba-baa04180...,World,"[Maracaibo (Venezuela), Population, Immigratio...",2024-09-01T04:01:27+0000
3,nyt://article/6393c6c3-0e1f-5494-925d-165e7aaf...,World,"[Israel-Gaza War (2023- ), Vaccination and Imm...",2024-09-01T04:01:43+0000
4,nyt://article/fe046102-78e5-530d-89e0-59ff09c0...,World,"[Germany, Elections, State Legislature, SAXONY...",2024-09-01T04:01:43+0000
...,...,...,...,...
48691,nyt://article/15ef03c9-295b-50e0-a0f4-64f9a182...,U.S.,"[Deaths (Obituaries), Fort Lee (Va), United St...",2024-08-31T14:57:12+0000
48692,nyt://article/3a3d339e-87ab-5650-b797-b7bb3cb0...,World,"[Military Aircraft, Friendly Fire (Military an...",2024-08-31T14:57:53+0000
48693,nyt://article/1192db0c-51bd-525b-abb4-e8607c11...,Opinion,"[internal-sub-only-nl, Presidential Election o...",2024-08-31T15:00:03+0000
48694,nyt://article/83b24708-09af-55b1-baac-7efca171...,Food,"[Cooking and Cookbooks, Content Type: Service]",2024-08-31T15:00:03+0000


In [357]:
list_of_keywords = []

for article in full_year_essential['keywords']:
    for keywords in article:
        list_of_keywords.append(keywords)

In [358]:
unique_keywords = list(set(list_of_keywords))

### Coverage about foreign countries and cities

Creating a dictionary with all possible countries, then filtering the original dataset to keep only the rows where at least one keyword is fund.

In [359]:
country_dict = {country.name: country.alpha_2 for country in pycountry.countries}

### Coverage about US States

Define a list of US state codes and names, then filter the dataset to keep relevant articles.

In [360]:
country_codes = {country_dict[word] for word in unique_keywords if word in country_dict}

In [361]:
subdivision_dict = {subdivision.code for subdivision in pycountry.subdivisions if subdivision.country_code in country_codes}

In [362]:
us_state_codes = [s.removeprefix("US-") for s in subdivision_dict if 'US-' in s]

In [363]:
us_state_names = {state.name: state.code.removeprefix("US-") for state in pycountry.subdivisions if state.country_code == 'US'}

In [364]:
us_states_full_names = list(us_state_names.keys())

### Merge the three different datasets and remove possible duplicates

In [None]:
coverage_about_places = pd.concat([coverage_about_ny, coverage_about_abroad, coverage_about_US])
coverage_about_places

In [None]:
coverage_about_places = coverage_about_places.drop_duplicates(subset='_id')

In [None]:
coverage_about_places

In [None]:
coverage_about_places.to_json("../../data/nyt-coverage-places.json", orient="records")

## What countries are covered the most?

In [365]:
list_of_place_keywords = []

for article in full_year_essential['keywords']:
    for keywords in article:
        list_of_place_keywords.append(keywords)

In [366]:
country_dict

{'Aruba': 'AW',
 'Afghanistan': 'AF',
 'Angola': 'AO',
 'Anguilla': 'AI',
 'Åland Islands': 'AX',
 'Albania': 'AL',
 'Andorra': 'AD',
 'United Arab Emirates': 'AE',
 'Argentina': 'AR',
 'Armenia': 'AM',
 'American Samoa': 'AS',
 'Antarctica': 'AQ',
 'French Southern Territories': 'TF',
 'Antigua and Barbuda': 'AG',
 'Australia': 'AU',
 'Austria': 'AT',
 'Azerbaijan': 'AZ',
 'Burundi': 'BI',
 'Belgium': 'BE',
 'Benin': 'BJ',
 'Bonaire, Sint Eustatius and Saba': 'BQ',
 'Burkina Faso': 'BF',
 'Bangladesh': 'BD',
 'Bulgaria': 'BG',
 'Bahrain': 'BH',
 'Bahamas': 'BS',
 'Bosnia and Herzegovina': 'BA',
 'Saint Barthélemy': 'BL',
 'Belarus': 'BY',
 'Belize': 'BZ',
 'Bermuda': 'BM',
 'Bolivia, Plurinational State of': 'BO',
 'Brazil': 'BR',
 'Barbados': 'BB',
 'Brunei Darussalam': 'BN',
 'Bhutan': 'BT',
 'Bouvet Island': 'BV',
 'Botswana': 'BW',
 'Central African Republic': 'CF',
 'Canada': 'CA',
 'Cocos (Keeling) Islands': 'CC',
 'Switzerland': 'CH',
 'Chile': 'CL',
 'China': 'CN',
 "Côte d'Iv

In [367]:
countries_df = pd.DataFrame(country_dict.items(), columns=['geo_name', 'code'])
countries_df

Unnamed: 0,geo_name,code
0,Aruba,AW
1,Afghanistan,AF
2,Angola,AO
3,Anguilla,AI
4,Åland Islands,AX
...,...,...
244,Samoa,WS
245,Yemen,YE
246,South Africa,ZA
247,Zambia,ZM


In [368]:
countries_df["name"] = countries_df["geo_name"]

In [369]:
countries_df["name"] = countries_df["name"].str.replace(',.*', '', regex=True)

In [370]:
countries_df.loc[countries_df["name"]=="Russian Federation", 'name'] = "Russia"

In [371]:
countries_df.loc[countries_df["name"]=="Palestine", 'name'] = "Gaza Strip"

In [372]:
matches_for_places = []
for index, row in countries_df.iterrows():
    place = row["name"]
    if place != "Georgia":
        matches = [p for p in full_year_essential["keywords"] if place.lower() in map(str.lower, p)]
    else:
        matches = []
    
    matches_for_places.append(len(matches))


In [373]:
countries_df["count"] = matches_for_places

## What US states are covered the most?

In [374]:
us_states_df = pd.DataFrame(us_state_names.items(), columns=['geo_name', 'code'])
us_states_df

Unnamed: 0,geo_name,code
0,Alaska,AK
1,Alabama,AL
2,Arkansas,AR
3,American Samoa,AS
4,Arizona,AZ
5,California,CA
6,Colorado,CO
7,Connecticut,CT
8,District of Columbia,DC
9,Delaware,DE


In [375]:
us_states_df["name"] = us_states_df["geo_name"]

In [376]:
matches_for_states = []
for index, row in us_states_df.iterrows():
    place = row["name"]
    matches = [p for p in full_year_essential["keywords"] if place.lower() in map(str.lower, p)]

    matches_for_states.append(len(matches))


In [377]:
us_states_df["count"] = matches_for_states

In [378]:
us_states_df

Unnamed: 0,geo_name,code,name,count
0,Alaska,AK,Alaska,58
1,Alabama,AL,Alabama,146
2,Arkansas,AR,Arkansas,57
3,American Samoa,AS,American Samoa,3
4,Arizona,AZ,Arizona,308
5,California,CA,California,817
6,Colorado,CO,Colorado,155
7,Connecticut,CT,Connecticut,73
8,District of Columbia,DC,District of Columbia,0
9,Delaware,DE,Delaware,48


In [379]:
countries_df.loc[countries_df['name'] == 'United States', 'count'] = us_states_df["count"].sum()

In [380]:
sum_of_coverage_about_places = pd.concat([countries_df, us_states_df])
sum_of_coverage_about_places

Unnamed: 0,geo_name,code,name,count
0,Aruba,AW,Aruba,2
1,Afghanistan,AF,Afghanistan,87
2,Angola,AO,Angola,11
3,Anguilla,AI,Anguilla,2
4,Åland Islands,AX,Åland Islands,0
...,...,...,...,...
52,Vermont,VT,Vermont,68
53,Washington,WA,Washington,0
54,Wisconsin,WI,Wisconsin,194
55,West Virginia,WV,West Virginia,72


In [381]:
sum_of_coverage_about_places.to_json("../../data/nyt-sum-places.json", orient="records")

## What are other categories related to places? And how many articles have been published about a country/state over time?

In [402]:
def count_related_kws(df):
    kws_dictionaries = []
    dates_dictionaries = []
    for index, country in df.iterrows():
        country_name = country["name"]
        country_coverage = full_year_essential[full_year_essential["keywords"].apply(lambda x: any(keyword.lower() == country_name.lower() for keyword in x))]
        list_of_keywords = list(flatten(country_coverage["keywords"]))
        set_of_keywords = set(list_of_keywords)
        list_of_dates = list(flatten(country_coverage["pub_date"]))
        set_of_dates = set(list_of_dates)
        
        count_of_kw = []
        count_of_dates = []
        
        for kw in set_of_keywords:
            count_of_kw.append(list_of_keywords.count(kw))

        for date in set_of_dates:
            count_of_dates.append(list_of_dates.count(date))
        
        list_of_keywords = list(set(list_of_keywords))
        kws_dictionary = dict(zip(list_of_keywords, count_of_kw))

        list_of_dates = list(set(list_of_dates))
        dates_dictionary = dict(zip(list_of_dates, count_of_dates))
        
        kws_dictionaries.append([kws_dictionary])
        dates_dictionaries.append([dates_dictionary])


    print(kws_dictionary)

    df["related_keywords"] = kws_dictionaries
    df["timeline"] = dates_dictionaries

In [403]:
count_related_kws(us_states_df)
count_related_kws(countries_df)

{'Florida': 7, 'Wisconsin': 9, 'Spina Bifida': 1, 'Native Americans': 1, 'Animals': 1, 'Domestic Service': 1, 'Haaland, Deb': 1, 'Elections, Mayors': 2, 'Ur-Energy Incorporated': 1, 'Language and Languages': 1, 'House Freedom Caucus': 1, 'Minnesota': 8, 'United States': 5, 'Uranium': 1, 'Australia': 1, 'Elections, Senate': 7, 'Kansas': 8, 'Tiller, George R': 1, 'Hawaii': 7, 'Animal Behavior': 1, 'State Legislatures': 2, 'Primaries and Caucuses': 5, 'Kentucky': 6, 'Trump, Donald J': 1, 'Greenhouse Gas Emissions': 2, 'Parks and Other Recreation Areas': 1, 'Arkansas': 6, 'West Virginia': 7, 'Biodiversity': 2, 'Democratic Party': 1, 'Federal Lands': 1, 'Hunting and Trapping': 1, 'Gates, Bill': 2, 'Deaths (Fatalities)': 1, 'Kidnapping and Hostages': 1, 'TerraPower': 1, 'Cold Wind (Book)': 1, 'Wolves': 1, 'Nevada': 7, 'Bloomberg, Michael R': 1, 'Oregon': 6, 'Transgender': 1, 'Vermont': 7, 'Wells': 1, 'Utah': 7, 'Land Use Policies': 1, 'Maryland': 7, 'Water Pollution': 1, 'Global Warming': 6,

In [404]:
us_states_df

Unnamed: 0,geo_name,code,name,count,related_keywords,timeline
0,Alaska,AK,Alaska,58,"[{'Animals': 2, 'Avian Influenza': 1, 'Science...","[{'2024-10-15T07:00:13+0000': 1, '2024-12-12T1..."
1,Alabama,AL,Alabama,146,"[{'Biden, Hunter': 1, 'National Assn for the A...","[{'2024-03-28T09:05:52+0000': 1, '2024-04-08T1..."
2,Arkansas,AR,Arkansas,57,"[{'AFRICAN-AMERICAN MUSEUM': 1, 'Rain': 2, 'Fl...","[{'2024-07-05T17:01:32+0000': 1, '2024-11-05T2..."
3,American Samoa,AS,American Samoa,3,"[{'AMERICAN SAMOA': 3, 'Biden, Joseph R Jr': 1...","[{'2024-03-06T03:55:18+0000': 1, '2024-08-22T1..."
4,Arizona,AZ,Arizona,308,"[{'Palestinians': 2, 'Toma, Ben': 1, 'King, Ch...","[{'2024-11-07T01:25:34+0000': 1, '2024-02-18T1..."
5,California,CA,California,817,"[{'Tahoe City (Calif)': 1, 'Pistachio Nuts': 1...","[{'2024-11-25T10:03:37+0000': 1, '2024-06-02T1..."
6,Colorado,CO,Colorado,155,"[{'Sundance Institute': 1, 'Milwaukee (Wis)': ...","[{'2024-06-18T16:25:40+0000': 1, '2024-01-24T1..."
7,Connecticut,CT,Connecticut,73,"[{'Nonprofit Organizations': 1, 'Florida': 7, ...","[{'2024-04-20T16:58:16+0000': 1, '2024-08-19T0..."
8,District of Columbia,DC,District of Columbia,0,[{}],[{}]
9,Delaware,DE,Delaware,48,[{'Federal Criminal Case Against Trump (Docume...,"[{'2024-06-13T21:13:08+0000': 1, '2024-02-15T0..."


In [405]:
countries_df

Unnamed: 0,geo_name,code,name,count,related_keywords,timeline
0,Aruba,AW,Aruba,2,"[{'Vermont': 1, 'Computers and the Internet': ...","[{'2024-07-26T09:03:20+0000': 1, '2024-04-05T0..."
1,Afghanistan,AF,Afghanistan,87,"[{'Palestinians': 1, 'Scholz, Olaf (1958- )': ...","[{'2024-03-31T13:00:12+0000': 1, '2024-07-13T0..."
2,Angola,AO,Angola,11,"[{'Labor and Jobs': 1, 'Economic Conditions an...","[{'2024-12-04T18:54:27+0000': 1, '2024-11-28T1..."
3,Anguilla,AI,Anguilla,2,"[{'Economic Conditions and Trends': 1, 'Budget...","[{'2024-04-26T09:03:23+0000': 1, '2024-03-22T0..."
4,Åland Islands,AX,Åland Islands,0,[{}],[{}]
...,...,...,...,...,...,...
244,Samoa,WS,Samoa,1,"[{'Samoa': 1, 'Philippines': 1, 'Pacific Ocean...",[{'2024-04-24T15:21:45+0000': 1}]
245,Yemen,YE,Yemen,88,"[{'Raisi, Ebrahim': 1, 'Palestinians': 12, 'De...","[{'2024-11-07T01:25:34+0000': 1, '2024-01-10T2..."
246,South Africa,ZA,South Africa,102,"[{'Palestinians': 12, 'Israel-Gaza War (2023- ...","[{'2024-10-04T09:03:32+0000': 1, '2024-01-26T0..."
247,Zambia,ZM,Zambia,6,"[{'Palestinians': 1, 'Content Type: Personal P...","[{'2024-07-11T09:01:32+0000': 1, '2024-07-27T0..."


## Geographic coordinates for countries and states

In [406]:
geolocator = Nominatim(user_agent="geo_lookup")
session = requests_cache.CachedSession("geopy_cache", expire_after=86400)  # Cache for 1 day
tqdm.pandas()


In [407]:
def get_coordinates(location_name, country_code=None):
    query = f"{location_name}, {country_code}" if location_name else country_code
    try:
        location = geolocator.geocode(query, timeout=10)
        if location:
            return location.latitude, location.longitude
    except Exception as e:
        print(f"Error for {query}: {e}")
    return None, None

In [408]:
countries_df[["Latitude", "Longitude"]] = countries_df.progress_apply(
    lambda row: get_coordinates(row["geo_name"], row["code"]), axis=1, result_type="expand"
)


100%|██████████| 249/249 [04:36<00:00,  1.11s/it]


In [409]:
us_states_df[["Latitude", "Longitude"]] = us_states_df.progress_apply(
    lambda row: get_coordinates(row["geo_name"], row["code"]), axis=1, result_type="expand"
)

100%|██████████| 57/57 [01:06<00:00,  1.17s/it]


In [410]:
countries_df['Context']='Global'

In [411]:
us_states_df['Context']='Local'

In [412]:
places_with_categories = pd.concat([countries_df, us_states_df])

In [413]:
places_with_categories

Unnamed: 0,geo_name,code,name,count,related_keywords,timeline,Latitude,Longitude,Context
0,Aruba,AW,Aruba,2,"[{'Vermont': 1, 'Computers and the Internet': ...","[{'2024-07-26T09:03:20+0000': 1, '2024-04-05T0...",12.517566,-69.981864,Global
1,Afghanistan,AF,Afghanistan,87,"[{'Palestinians': 1, 'Scholz, Olaf (1958- )': ...","[{'2024-03-31T13:00:12+0000': 1, '2024-07-13T0...",33.768006,66.238514,Global
2,Angola,AO,Angola,11,"[{'Labor and Jobs': 1, 'Economic Conditions an...","[{'2024-12-04T18:54:27+0000': 1, '2024-11-28T1...",-11.877577,17.569124,Global
3,Anguilla,AI,Anguilla,2,"[{'Economic Conditions and Trends': 1, 'Budget...","[{'2024-04-26T09:03:23+0000': 1, '2024-03-22T0...",18.195495,-63.075023,Global
4,Åland Islands,AX,Åland Islands,0,[{}],[{}],60.117825,19.936017,Global
...,...,...,...,...,...,...,...,...,...
52,Vermont,VT,Vermont,68,"[{'Palestinians': 1, 'Rain': 5, 'Wisconsin': 8...","[{'2024-11-05T19:48:46+0000': 1, '2024-11-29T1...",44.473737,-73.194147,Local
53,Washington,WA,Washington,0,[{}],[{}],45.578765,-122.351047,Local
54,Wisconsin,WI,Wisconsin,194,"[{'Milwaukee (Wis)': 11, 'Baraboo (Wis)': 1, '...","[{'2024-12-02T10:00:49+0000': 1, '2024-08-13T2...",43.074698,-89.384169,Local
55,West Virginia,WV,West Virginia,72,"[{'Florida': 8, 'Rain': 1, 'Wisconsin': 8, 'In...","[{'2024-04-03T06:25:54+0000': 1, '2024-02-27T2...",39.634840,-79.954210,Local


In [414]:
places_with_categories.to_json("../../data/places/nyt-sum-places.json", orient="records")