# Extract articles that mention countries, states, or cities as categories 

In [None]:
%pip install pycountry geonamescache geopy requests_cache tqdm

In [None]:
import ast
import re
import time
from collections import Counter
from itertools import chain

import pandas as pd
import pycountry
import requests_cache
from geopy.geocoders import Nominatim
from pandas.core.common import flatten
from tqdm import tqdm

## Find all unique instances of places in NYT coverage

In [197]:
# Load the raw article export. The file is parsed as CSV despite its .json
# extension. NOTE(review): the 'Unnamed: 0' column in the result suggests the
# file was written together with its index — confirm.
full_year = pd.read_csv("../../input-data/temp-data.json")

In [198]:
# List the columns available in the raw export.
full_year.columns

Index(['Unnamed: 0', 'abstract', 'web_url', 'snippet', 'lead_paragraph',
       'print_section', 'print_page', 'source', 'multimedia', 'headline',
       'keywords', 'pub_date', 'document_type', 'news_desk', 'section_name',
       'byline', 'type_of_material', '_id', 'word_count', 'uri',
       'subsection_name'],
      dtype='object')

In [199]:
# Keep only the identifier, section, keyword and date columns. The copy makes
# later column assignments independent of the raw frame.
full_year_essential = full_year.loc[
    :, ["_id", "section_name", "keywords", "pub_date"]
].copy()

In [200]:
# Preview the reduced frame; 'keywords' is still a string at this point.
full_year_essential

Unnamed: 0,_id,section_name,keywords,pub_date
0,nyt://article/da8532bd-f9bd-5ca3-9e7e-afef6e9f...,Business Day,"[{'name': 'subject', 'value': 'Careers and Pro...",2024-09-01T04:01:07+0000
1,nyt://article/aeabc262-aeb0-5423-a7ac-8bb664cb...,World,"[{'name': 'glocations', 'value': 'India', 'ran...",2024-09-01T04:01:25+0000
2,nyt://article/42c0d0f2-ea62-5d2b-8eba-baa04180...,World,"[{'name': 'glocations', 'value': 'Maracaibo (V...",2024-09-01T04:01:27+0000
3,nyt://article/6393c6c3-0e1f-5494-925d-165e7aaf...,World,"[{'name': 'subject', 'value': 'Israel-Gaza War...",2024-09-01T04:01:43+0000
4,nyt://article/fe046102-78e5-530d-89e0-59ff09c0...,World,"[{'name': 'glocations', 'value': 'Germany', 'r...",2024-09-01T04:01:43+0000
...,...,...,...,...
48691,nyt://article/15ef03c9-295b-50e0-a0f4-64f9a182...,U.S.,"[{'name': 'subject', 'value': 'Deaths (Obituar...",2024-08-31T14:57:12+0000
48692,nyt://article/3a3d339e-87ab-5650-b797-b7bb3cb0...,World,"[{'name': 'subject', 'value': 'Military Aircra...",2024-08-31T14:57:53+0000
48693,nyt://article/1192db0c-51bd-525b-abb4-e8607c11...,Opinion,"[{'name': 'subject', 'value': 'internal-sub-on...",2024-08-31T15:00:03+0000
48694,nyt://article/83b24708-09af-55b1-baac-7efca171...,Food,"[{'name': 'subject', 'value': 'Cooking and Coo...",2024-08-31T15:00:03+0000


In [201]:
# Parse the stringified keyword records into Python lists of dicts.
# Values that are no longer strings (i.e. already parsed) are left untouched,
# so re-running this cell does not raise.
full_year_essential['keywords'] = full_year_essential['keywords'].apply(
    lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw
)

In [202]:
# Reduce every keyword record (a dict) to its 'value' string, so each article
# ends up with a plain list of keyword strings. Entries that are already plain
# strings are kept as they are, so re-running this cell does not raise.
full_year_essential['keywords'] = full_year_essential['keywords'].apply(
    lambda x: [keyword['value'] if isinstance(keyword, dict) else keyword for keyword in x]
)

In [203]:
# Preview the frame again; 'keywords' now holds lists of keyword strings.
full_year_essential

Unnamed: 0,_id,section_name,keywords,pub_date
0,nyt://article/da8532bd-f9bd-5ca3-9e7e-afef6e9f...,Business Day,"[Careers and Professions, Hiring and Promotion...",2024-09-01T04:01:07+0000
1,nyt://article/aeabc262-aeb0-5423-a7ac-8bb664cb...,World,"[India, Hospitals, Workplace Hazards and Viola...",2024-09-01T04:01:25+0000
2,nyt://article/42c0d0f2-ea62-5d2b-8eba-baa04180...,World,"[Maracaibo (Venezuela), Population, Immigratio...",2024-09-01T04:01:27+0000
3,nyt://article/6393c6c3-0e1f-5494-925d-165e7aaf...,World,"[Israel-Gaza War (2023- ), Vaccination and Imm...",2024-09-01T04:01:43+0000
4,nyt://article/fe046102-78e5-530d-89e0-59ff09c0...,World,"[Germany, Elections, State Legislature, SAXONY...",2024-09-01T04:01:43+0000
...,...,...,...,...
48691,nyt://article/15ef03c9-295b-50e0-a0f4-64f9a182...,U.S.,"[Deaths (Obituaries), Fort Lee (Va), United St...",2024-08-31T14:57:12+0000
48692,nyt://article/3a3d339e-87ab-5650-b797-b7bb3cb0...,World,"[Military Aircraft, Friendly Fire (Military an...",2024-08-31T14:57:53+0000
48693,nyt://article/1192db0c-51bd-525b-abb4-e8607c11...,Opinion,"[internal-sub-only-nl, Presidential Election o...",2024-08-31T15:00:03+0000
48694,nyt://article/83b24708-09af-55b1-baac-7efca171...,Food,"[Cooking and Cookbooks, Content Type: Service]",2024-08-31T15:00:03+0000


In [204]:
# Flatten the per-article keyword lists into a single list (duplicates kept).
list_of_keywords = [
    keyword
    for article_keywords in full_year_essential['keywords']
    for keyword in article_keywords
]

In [205]:
# De-duplicate the keywords. Going through a set means the resulting order is
# arbitrary; the list is only used for iteration and membership tests below.
unique_keywords = list(set(list_of_keywords))

### Coverage about New York

Assuming that everything in the New York section is related to NYC, I create a dataset that includes only this section.

In [206]:
# Select the articles published in the "New York" section.
coverage_about_ny = full_year_essential.loc[
    full_year_essential['section_name'].eq('New York')
]

In [207]:
# Preview the New York section subset.
coverage_about_ny

Unnamed: 0,_id,section_name,keywords,pub_date
11,nyt://article/fbd74938-d198-5111-a322-256674ee...,New York,[New York City],2024-09-01T07:00:23+0000
13,nyt://article/cfeb776c-7468-5f9c-96b1-cfee972d...,New York,"[Strazzullo, Sal, Bars and Nightclubs, Robberi...",2024-09-01T07:00:35+0000
55,nyt://article/78b260c3-4d2a-5200-bfc1-176274a5...,New York,"[United States Politics and Government, Presid...",2024-09-01T19:38:41+0000
77,nyt://article/6f4a73e6-8886-5682-8468-09a76de6...,New York,"[United States Open (Tennis), Tennis, Stadiums...",2024-09-02T07:00:27+0000
78,nyt://article/06c1a953-425e-53c3-bbad-8860df0e...,New York,"[Pedestrian Malls, Education (K-12), Roads and...",2024-09-02T07:00:31+0000
...,...,...,...,...
48626,nyt://article/06e74b83-0482-53ed-8dfc-2077d0a3...,New York,"[Pro-Palestinian Campus Protests (2023- ), Isr...",2024-08-30T21:05:54+0000
48648,nyt://article/bcae809a-57ae-5b4a-a574-168659e5...,New York,"[Writing and Writers, Running, Books and Liter...",2024-08-31T07:00:24+0000
48650,nyt://article/c30f17c2-0e48-564d-a518-b4388527...,New York,"[Urban Areas, Homosexuality and Bisexuality, P...",2024-08-31T07:00:46+0000
48651,nyt://article/3c477e41-f076-5db6-8c9a-fa5cbaa9...,New York,"[Roller Derby, Law and Legislation, Civil Righ...",2024-08-31T07:00:47+0000


### Coverage about foreign countries and cities

Create a dictionary of all country names and their codes, then filter the original dataset to keep only the rows where at least one keyword is found in that dictionary.

In [208]:
# Map every pycountry country name to its two-letter (alpha-2) code.
country_dict = dict(
    (entry.name, entry.alpha_2) for entry in pycountry.countries
)

In [209]:
# Names to match keywords against. Besides the exact pycountry name, also
# accept the short form with the trailing ", ..." qualifier removed
# (e.g. "Bolivia, Plurinational State of" -> "Bolivia"), compared
# case-insensitively. This is the same normalisation the per-country counts
# further down apply, so both steps agree on what counts as a country keyword.
country_name_aliases = {
    alias.lower()
    for name in country_dict
    for alias in (name, re.sub(r",.*", "", name))
}

# Keep the articles with at least one keyword that is a country name.
coverage_about_abroad = full_year_essential[
    full_year_essential['keywords'].apply(
        lambda x: any(keyword.lower() in country_name_aliases for keyword in x)
    )
]

In [210]:
# Preview the articles tagged with at least one country keyword.
coverage_about_abroad

Unnamed: 0,_id,section_name,keywords,pub_date
1,nyt://article/aeabc262-aeb0-5423-a7ac-8bb664cb...,World,"[India, Hospitals, Workplace Hazards and Viola...",2024-09-01T04:01:25+0000
3,nyt://article/6393c6c3-0e1f-5494-925d-165e7aaf...,World,"[Israel-Gaza War (2023- ), Vaccination and Imm...",2024-09-01T04:01:43+0000
4,nyt://article/fe046102-78e5-530d-89e0-59ff09c0...,World,"[Germany, Elections, State Legislature, SAXONY...",2024-09-01T04:01:43+0000
7,nyt://article/8c93a8fe-604e-5c56-b025-18fe292e...,World,"[Goldberg-Polin, Hersh, Israel-Gaza War (2023-...",2024-09-01T04:17:40+0000
17,nyt://article/39a6950a-15db-59d2-baaf-f0d9509b...,Magazine,"[Currency, Numismatics, Lobbying and Lobbyists...",2024-09-01T09:00:57+0000
...,...,...,...,...
48680,nyt://article/e375e0ea-81ae-54af-ae3c-d964bb6b...,World,"[Israel, West Bank, Israel-Gaza War (2023- ), ...",2024-08-31T10:27:33+0000
48681,nyt://article/24ccc50f-f6c6-56c0-a9ff-400a72d4...,Opinion,"[Capital Punishment, Justice Department, Biden...",2024-08-31T11:00:06+0000
48682,nyt://article/c88482cb-671c-578e-b9e7-f0410f4e...,Opinion,"[Weather, Global Warming, Heat and Heat Waves,...",2024-08-31T11:00:06+0000
48692,nyt://article/3a3d339e-87ab-5650-b797-b7bb3cb0...,World,"[Military Aircraft, Friendly Fire (Military an...",2024-08-31T14:57:53+0000


### Coverage about US States

Define a list of US state codes and names, then filter the dataset to keep relevant articles.

In [211]:
# Alpha-2 codes of the countries that actually occur among the keywords.
country_codes = {
    country_dict[name] for name in country_dict.keys() & set(unique_keywords)
}

In [212]:
# Codes of all subdivisions that belong to one of the countries found above.
# Despite its name this is a set of code strings, not a dict.
subdivision_dict = set(
    entry.code
    for entry in pycountry.subdivisions
    if entry.country_code in country_codes
)

In [213]:
# Bare US state codes: subdivision codes that start with "US-", with that
# prefix removed. A prefix test (rather than a substring test) mirrors exactly
# what removeprefix() strips.
us_state_codes = [s.removeprefix("US-") for s in subdivision_dict if s.startswith("US-")]

In [214]:
# Map each US subdivision name to its code without the "US-" prefix.
us_state_names = dict(
    (entry.name, entry.code.removeprefix("US-"))
    for entry in pycountry.subdivisions
    if entry.country_code == 'US'
)

In [215]:
# Full US subdivision names, in the insertion order of us_state_names.
us_states_full_names = list(us_state_names)

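This step finds the keywords that refer to places in the US. Keywords ending in a parenthesised two-letter state code are matched regardless of the code's capitalization, and keywords that are exactly a full state name are included as well.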

In [216]:
# Match keywords of the form "Place (St)" or "Place (City, St)" and capture the
# trailing two-letter state code. The "City, " part inside the parentheses is
# optional so that keywords such as "Fort Lee (Va)" are recognised as well.
pattern = re.compile(r".+ \((?:.+?,? )?([A-Za-z]{2})\)$")
places_in_US = []
for item in unique_keywords:
    # A keyword that is exactly a full state name counts as a US place.
    if item in us_states_full_names:
        places_in_US.append(item)
        continue  # never append the same keyword twice
    match = pattern.match(item)
    # Upper-case the captured code so that mixed capitalisation still matches
    # the upper-case state codes.
    if match and match.group(1).upper() in us_state_codes:
        places_in_US.append(item)

In [217]:
# Keep the articles with at least one keyword that is a US place.
# The place list is turned into a set once, so each membership test is a hash
# lookup instead of a scan of the whole list for every keyword of every article.
us_place_lookup = set(places_in_US)
coverage_about_US = full_year_essential[
    full_year_essential['keywords'].apply(
        lambda x: any(keyword in us_place_lookup for keyword in x)
    )
]

In [218]:
# Preview the articles tagged with at least one US place keyword.
coverage_about_US

Unnamed: 0,_id,section_name,keywords,pub_date
12,nyt://article/6b7f2395-9c71-5f1f-bbd0-4a0aebf6...,Times Insider,"[New York Times, Times Square and 42nd Street ...",2024-09-01T07:00:35+0000
13,nyt://article/cfeb776c-7468-5f9c-96b1-cfee972d...,New York,"[Strazzullo, Sal, Bars and Nightclubs, Robberi...",2024-09-01T07:00:35+0000
14,nyt://article/d26bb35c-3a54-52d4-aa69-57940450...,Books,"[Books and Literature, Writing and Writers, Co...",2024-09-01T09:00:18+0000
41,nyt://article/4abde6a3-37f6-5f6e-8799-525b9509...,U.S.,"[Presidential Election of 2024, Vice President...",2024-09-01T13:01:15+0000
51,nyt://article/5610726f-00ed-507c-b539-4dbd4501...,Business Day,"[Vandalism, Radio, Decisions and Verdicts, Pub...",2024-09-01T18:50:10+0000
...,...,...,...,...
48658,nyt://article/a7d4a7da-45e0-5c61-8932-1c022415...,Climate,"[Global Warming, Greenhouse Gas Emissions, Haz...",2024-08-31T09:01:08+0000
48681,nyt://article/24ccc50f-f6c6-56c0-a9ff-400a72d4...,Opinion,"[Capital Punishment, Justice Department, Biden...",2024-08-31T11:00:06+0000
48682,nyt://article/c88482cb-671c-578e-b9e7-f0410f4e...,Opinion,"[Weather, Global Warming, Heat and Heat Waves,...",2024-08-31T11:00:06+0000
48687,nyt://article/dec0b620-0ca8-5ad1-8a2e-1103cdd0...,Opinion,"[Floods, Deaths (Fatalities), Deserts, Rain, G...",2024-08-31T11:00:39+0000


### Merge the three different datasets and remove possible duplicates

In [219]:
# Stack the three subsets (New York section, countries, US places). An article
# selected by more than one filter appears more than once at this point.
coverage_about_places = pd.concat([coverage_about_ny, coverage_about_abroad, coverage_about_US])
coverage_about_places

Unnamed: 0,_id,section_name,keywords,pub_date
11,nyt://article/fbd74938-d198-5111-a322-256674ee...,New York,[New York City],2024-09-01T07:00:23+0000
13,nyt://article/cfeb776c-7468-5f9c-96b1-cfee972d...,New York,"[Strazzullo, Sal, Bars and Nightclubs, Robberi...",2024-09-01T07:00:35+0000
55,nyt://article/78b260c3-4d2a-5200-bfc1-176274a5...,New York,"[United States Politics and Government, Presid...",2024-09-01T19:38:41+0000
77,nyt://article/6f4a73e6-8886-5682-8468-09a76de6...,New York,"[United States Open (Tennis), Tennis, Stadiums...",2024-09-02T07:00:27+0000
78,nyt://article/06c1a953-425e-53c3-bbad-8860df0e...,New York,"[Pedestrian Malls, Education (K-12), Roads and...",2024-09-02T07:00:31+0000
...,...,...,...,...
48658,nyt://article/a7d4a7da-45e0-5c61-8932-1c022415...,Climate,"[Global Warming, Greenhouse Gas Emissions, Haz...",2024-08-31T09:01:08+0000
48681,nyt://article/24ccc50f-f6c6-56c0-a9ff-400a72d4...,Opinion,"[Capital Punishment, Justice Department, Biden...",2024-08-31T11:00:06+0000
48682,nyt://article/c88482cb-671c-578e-b9e7-f0410f4e...,Opinion,"[Weather, Global Warming, Heat and Heat Waves,...",2024-08-31T11:00:06+0000
48687,nyt://article/dec0b620-0ca8-5ad1-8a2e-1103cdd0...,Opinion,"[Floods, Deaths (Fatalities), Deserts, Rain, G...",2024-08-31T11:00:39+0000


In [220]:
# Keep one row per article id. With the default settings the first occurrence
# is kept, i.e. the row from the earliest subset in the concat order above.
coverage_about_places = coverage_about_places.drop_duplicates(subset='_id')

In [221]:
# Preview the de-duplicated set of place-related articles.
coverage_about_places

Unnamed: 0,_id,section_name,keywords,pub_date
11,nyt://article/fbd74938-d198-5111-a322-256674ee...,New York,[New York City],2024-09-01T07:00:23+0000
13,nyt://article/cfeb776c-7468-5f9c-96b1-cfee972d...,New York,"[Strazzullo, Sal, Bars and Nightclubs, Robberi...",2024-09-01T07:00:35+0000
55,nyt://article/78b260c3-4d2a-5200-bfc1-176274a5...,New York,"[United States Politics and Government, Presid...",2024-09-01T19:38:41+0000
77,nyt://article/6f4a73e6-8886-5682-8468-09a76de6...,New York,"[United States Open (Tennis), Tennis, Stadiums...",2024-09-02T07:00:27+0000
78,nyt://article/06c1a953-425e-53c3-bbad-8860df0e...,New York,"[Pedestrian Malls, Education (K-12), Roads and...",2024-09-02T07:00:31+0000
...,...,...,...,...
48640,nyt://article/0670790e-18d3-5dd0-92a9-c26a6df1...,U.S.,"[Presidential Election of 2024, Abortion, In V...",2024-08-31T00:24:11+0000
48643,nyt://article/a746e516-976d-543c-8347-cd2b6122...,Business Day,"[Shopping Centers and Malls, Real Estate (Comm...",2024-08-31T04:00:27+0000
48656,nyt://article/e5ff274e-6bd0-5589-be49-7517e1f0...,U.S.,"[State Legislatures, Vetoes (US), Politics and...",2024-08-31T09:00:48+0000
48658,nyt://article/a7d4a7da-45e0-5c61-8932-1c022415...,Climate,"[Global Warming, Greenhouse Gas Emissions, Haz...",2024-08-31T09:01:08+0000


In [222]:
# Export the place-related articles as a JSON array with one object per article.
coverage_about_places.to_json("../../data/nyt-coverage-places.json", orient="records")

## What countries are covered the most?

In [223]:
# Flatten the keyword lists of the place-related articles (duplicates kept).
list_of_place_keywords = [
    keyword
    for article_keywords in coverage_about_places['keywords']
    for keyword in article_keywords
]

In [224]:
# Display the full name -> alpha-2 mapping (this prints every entry).
country_dict

{'Aruba': 'AW',
 'Afghanistan': 'AF',
 'Angola': 'AO',
 'Anguilla': 'AI',
 'Åland Islands': 'AX',
 'Albania': 'AL',
 'Andorra': 'AD',
 'United Arab Emirates': 'AE',
 'Argentina': 'AR',
 'Armenia': 'AM',
 'American Samoa': 'AS',
 'Antarctica': 'AQ',
 'French Southern Territories': 'TF',
 'Antigua and Barbuda': 'AG',
 'Australia': 'AU',
 'Austria': 'AT',
 'Azerbaijan': 'AZ',
 'Burundi': 'BI',
 'Belgium': 'BE',
 'Benin': 'BJ',
 'Bonaire, Sint Eustatius and Saba': 'BQ',
 'Burkina Faso': 'BF',
 'Bangladesh': 'BD',
 'Bulgaria': 'BG',
 'Bahrain': 'BH',
 'Bahamas': 'BS',
 'Bosnia and Herzegovina': 'BA',
 'Saint Barthélemy': 'BL',
 'Belarus': 'BY',
 'Belize': 'BZ',
 'Bermuda': 'BM',
 'Bolivia, Plurinational State of': 'BO',
 'Brazil': 'BR',
 'Barbados': 'BB',
 'Brunei Darussalam': 'BN',
 'Bhutan': 'BT',
 'Bouvet Island': 'BV',
 'Botswana': 'BW',
 'Central African Republic': 'CF',
 'Canada': 'CA',
 'Cocos (Keeling) Islands': 'CC',
 'Switzerland': 'CH',
 'Chile': 'CL',
 'China': 'CN',
 "Côte d'Iv

In [None]:
# One row per country: its pycountry name and its alpha-2 code.
countries_df = pd.DataFrame({
    'name': list(country_dict.keys()),
    'code': list(country_dict.values()),
})
countries_df

In [225]:
# Drop everything from the first comma onwards so that qualified names are
# reduced to their short form, e.g. "Bolivia, Plurinational State of" -> "Bolivia".
countries_df["name"] = countries_df["name"].str.replace(',.*', '', regex=True)

In [None]:
# Lower-case every article's keywords once, up front, instead of once per
# country inside the loop.
article_keywords_lower = [
    {keyword.lower() for keyword in keywords}
    for keywords in coverage_about_places["keywords"]
]

# Number of articles carrying each country's name as a keyword
# (case-insensitive, whole-keyword match). "Georgia" is skipped and gets a
# count of 0 because the keyword is also the name of a US state.
matches_for_places = [
    0 if place == "Georgia"
    else sum(place.lower() in keywords for keywords in article_keywords_lower)
    for place in countries_df["name"]
]


In [None]:
# Attach the per-country article counts (same row order as countries_df).
countries_df["count"] = matches_for_places

In [None]:
# Override the United States count with the number of articles in
# coverage_about_US rather than the number of "United States" keyword matches.
countries_df.loc[countries_df['name'] == 'United States', 'count'] = len(coverage_about_US)

## What US states are covered the most?

In [None]:
# One row per US subdivision: its full name and its code without "US-".
us_states_df = pd.DataFrame({
    'name': list(us_state_names.keys()),
    'code': list(us_state_names.values()),
})
us_states_df

In [None]:
# Lower-case every article's keywords once, up front, instead of once per
# state inside the loop.
article_keywords_lower = [
    {keyword.lower() for keyword in keywords}
    for keywords in coverage_about_places["keywords"]
]

# Number of articles carrying each state's full name as a keyword
# (case-insensitive, whole-keyword match).
# NOTE(review): a bare "Georgia" keyword is counted here although it may refer
# to the country — confirm this is intended.
matches_for_states = [
    sum(place.lower() in keywords for keywords in article_keywords_lower)
    for place in us_states_df["name"]
]


In [None]:
# Attach the per-state article counts (same row order as us_states_df).
us_states_df["count"] = matches_for_states

In [None]:
# Preview the states together with their article counts.
us_states_df

In [None]:
# Stack the country rows and the state rows into one table of counts.
sum_of_coverage_about_places = pd.concat([countries_df, us_states_df])
sum_of_coverage_about_places

In [None]:
# Export the combined counts as a JSON array with one object per place.
sum_of_coverage_about_places.to_json("../../data/nyt-sum-places.json", orient="records")

## What are other categories related to places?

In [None]:
def count_related_kws(df, coverage=None):
    """Attach per-place keyword co-occurrence counts to ``df`` in place.

    For every row of ``df``, the articles in ``coverage`` whose keyword list
    contains the row's ``name`` (case-insensitive, whole-keyword match) are
    selected, and every keyword occurring in those articles is tallied. The
    place's own keyword is part of the tally.

    Args:
        df: DataFrame with a ``name`` column of strings. A ``related_keywords``
            column holding one ``{keyword: count}`` dict per row is added
            (or overwritten).
        coverage: DataFrame with a ``keywords`` column of lists of strings.
            Defaults to the notebook-level ``coverage_about_places``.

    Returns:
        None. ``df`` is modified in place.
    """
    if coverage is None:
        coverage = coverage_about_places

    keyword_lists = list(coverage["keywords"])
    # Lower-case each article's keywords once instead of once per place.
    lowered_sets = [{keyword.lower() for keyword in keywords} for keywords in keyword_lists]

    related = []
    for name in df["name"]:
        target = name.lower()
        matching = (
            keywords
            for keywords, lowered in zip(keyword_lists, lowered_sets)
            if target in lowered
        )
        # Counter tallies every keyword in a single pass over the matching articles.
        related.append(dict(Counter(chain.from_iterable(matching))))

    df["related_keywords"] = related

In [None]:
# Add a "related_keywords" column to both frames (they are modified in place).
count_related_kws(us_states_df)
count_related_kws(countries_df)

In [None]:
# Preview the states with their related-keyword tallies.
us_states_df

In [None]:
# Preview the countries with their related-keyword tallies.
countries_df

## Geographic coordinates for countries and states

In [None]:
# Geocoder used by get_coordinates below.
geolocator = Nominatim(user_agent="geo_lookup")
# NOTE(review): this cached session is not passed to the geolocator anywhere in
# the visible code, so geocoding requests do not appear to use it — confirm.
session = requests_cache.CachedSession("geopy_cache", expire_after=86400)  # Cache for 1 day
# Registers DataFrame.progress_apply, which the geocoding cells below rely on.
tqdm.pandas()


In [None]:
def get_coordinates(location_name, country_code=None, min_delay_seconds=1.0):
    """Look up the latitude/longitude of a place with the notebook's geolocator.

    Args:
        location_name: Name of the place to geocode.
        country_code: Optional code appended to the query as
            ``"<location_name>, <country_code>"`` to narrow the lookup.
        min_delay_seconds: Pause before each request. The public Nominatim
            service asks clients not to exceed one request per second, and
            this function is called once per row. Pass 0 to disable.

    Returns:
        A ``(latitude, longitude)`` tuple, or ``(None, None)`` when nothing is
        found or the request fails. Failures are printed, not raised, so one
        bad lookup does not abort the whole batch.
    """
    query = f"{location_name}, {country_code}" if country_code else location_name
    try:
        if min_delay_seconds > 0:
            time.sleep(min_delay_seconds)  # throttle consecutive requests
        location = geolocator.geocode(query, timeout=10)
        if location:
            return location.latitude, location.longitude
    except Exception as e:
        # Best effort: report the failed query and fall through to (None, None).
        print(f"Error for {query}: {e}")
    return None, None

In [None]:
# Geocode every country, showing a progress bar. result_type="expand" splits
# the returned (latitude, longitude) tuple into the two new columns.
countries_df[["Latitude", "Longitude"]] = countries_df.progress_apply(
    lambda row: get_coordinates(row["name"], row["code"]), axis=1, result_type="expand"
)


In [None]:
# Geocode every US state in the same way.
# NOTE(review): row["code"] is the state code here, yet get_coordinates appends
# it to the query in the country-code position — confirm that the resulting
# queries resolve to the intended states.
us_states_df[["Latitude", "Longitude"]] = us_states_df.progress_apply(
    lambda row: get_coordinates(row["name"], row["code"]), axis=1, result_type="expand"
)

In [None]:
# Tag the country rows so they can be told apart after the frames are combined.
countries_df['Context']='Global'

In [None]:
# Tag the US state rows so they can be told apart after the frames are combined.
us_states_df['Context']='Local'

In [None]:
# Combine countries and states, now carrying counts, related keywords,
# coordinates and the Context tag.
places_with_categories = pd.concat([countries_df, us_states_df])

In [None]:
# Export the enriched table as a JSON array with one object per place. The file
# name matches the earlier counts-only export but lives under data/places/.
places_with_categories.to_json("../../data/places/nyt-sum-places.json", orient="records")