# Extract articles that mention countries, states, or cities as categories 

In [None]:
%pip install pycountry geonamescache geopy requests_cache tqdm

In [None]:
import time
import pycountry
import pandas as pd
from geopy.geocoders import Nominatim
import requests_cache
from pandas.core.common import flatten
import ast
import re
from tqdm import tqdm

## Find all unique instances of places in NYT coverage

In [None]:
# Load the raw article table.
# NOTE(review): the path ends in ".json" but the file is parsed with read_csv —
# confirm the file really is CSV-formatted.
full_year = pd.read_csv("../../input-data/temp-data.json")

In [None]:
full_year.columns

In [None]:
# Work on an independent copy restricted to the columns used below.
full_year_essential = full_year.loc[
    :, ["_id", "section_name", "keywords", "pub_date"]
].copy()

In [None]:
full_year_essential

In [None]:
# Parse each cell of the keywords column from its string form into Python
# objects; ast.literal_eval only evaluates literals, never arbitrary code.
full_year_essential['keywords'] = full_year_essential['keywords'].apply(ast.literal_eval)

In [None]:
# Replace each article's list of keyword records with just the records'
# 'value' entries, leaving one flat list of keyword values per article.
full_year_essential['keywords'] = full_year_essential['keywords'].apply(
    lambda records: [record['value'] for record in records]
)

In [None]:
full_year_essential

In [None]:
# Flatten the per-article keyword lists into a single list (duplicates kept).
list_of_keywords = [
    keyword
    for article_keywords in full_year_essential['keywords']
    for keyword in article_keywords
]

In [None]:
# Distinct keywords; the order comes from set iteration and carries no meaning.
unique_keywords = [*set(list_of_keywords)]

### Coverage about New York

Assuming that everything in the New York section is related to NYC, I create a dataset including only this section.

In [None]:
# Keep only the articles whose section_name is exactly 'New York'.
coverage_about_ny = full_year_essential.loc[
    full_year_essential['section_name'].eq('New York')
]

In [None]:
coverage_about_ny

### Coverage about foreign countries and cities

Creating a dictionary with all possible countries, then filtering the original dataset to keep only the rows where at least one keyword is found among the country names.

In [None]:
# Map every pycountry country name to its alpha-2 code.
country_dict = {
    entry.name: entry.alpha_2
    for entry in pycountry.countries
}

In [None]:
# Keep the articles that carry at least one keyword equal to a country name.
coverage_about_abroad = full_year_essential[
    full_year_essential['keywords'].apply(
        lambda article_keywords: not country_dict.keys().isdisjoint(article_keywords)
    )
]

In [None]:
coverage_about_abroad

### Coverage about US States

Define a list of US state codes and names, then filter the dataset to keep relevant articles.

In [None]:
# Alpha-2 codes of the countries whose name appears verbatim among the keywords.
country_codes = {
    country_dict[name]
    for name in country_dict.keys() & set(unique_keywords)
}

In [None]:
# Despite its name this is a set: the codes of every subdivision that belongs
# to one of the countries found among the keywords.
subdivision_dict = {
    region.code
    for region in pycountry.subdivisions
    if region.country_code in country_codes
}

In [None]:
# Codes of the US subdivisions with the "US-" country prefix stripped.
# A prefix test (rather than a substring test) selects exactly the codes that
# removeprefix will actually shorten.
# The list is empty unless some keyword matched the US entry of country_dict,
# because subdivision_dict only covers countries found among the keywords.
us_state_codes = [
    code.removeprefix("US-")
    for code in subdivision_dict
    if code.startswith("US-")
]

In [None]:
# Map each US subdivision's name to its code without the "US-" prefix.
us_state_names = {
    region.name: region.code.removeprefix("US-")
    for region in pycountry.subdivisions
    if region.country_code == 'US'
}

In [None]:
# Full names of the US subdivisions (the keys of us_state_names).
us_states_full_names = list(us_state_names)

This step collects the keywords that refer to places in the US: keywords ending in a parenthetical whose last token is a two-letter state code (matched regardless of capitalization), plus keywords that are the full name of a state.

In [None]:
# Collect every keyword that refers to a place in the US. A keyword qualifies
# when it is a state's full name, or when it ends in a parenthetical whose last
# token is a two-letter state code (compared case-insensitively).
pattern = re.compile(r".+ \(.+?,? ([A-Za-z]{2})\)$")

# Build the lookup sets once so each membership test is O(1).
state_name_lookup = set(us_states_full_names)
state_code_lookup = set(us_state_codes)

places_in_US = []
for item in unique_keywords:
    if item in state_name_lookup:
        places_in_US.append(item)
        continue  # already recorded; never append the same keyword twice

    match = pattern.match(item)
    # The captured group is always exactly two letters, so upper-casing it is
    # the only normalization needed before comparing with the state codes.
    if match and match.group(1).upper() in state_code_lookup:
        places_in_US.append(item)

In [None]:
# Keep the articles that carry at least one US-place keyword.
# Membership is tested against a set built once, instead of scanning the
# places_in_US list for every keyword of every article.
us_place_lookup = set(places_in_US)
coverage_about_US = full_year_essential[
    full_year_essential['keywords'].apply(
        lambda article_keywords: any(
            keyword in us_place_lookup for keyword in article_keywords
        )
    )
]

In [None]:
coverage_about_US

### Merge the three different datasets and remove possible duplicates

In [None]:
# Stack the New York, foreign and US selections into one frame; articles picked
# up by more than one selection still appear several times at this point.
place_frames = [coverage_about_ny, coverage_about_abroad, coverage_about_US]
coverage_about_places = pd.concat(place_frames)
coverage_about_places

In [None]:
# Keep only the first row seen for each article id.
coverage_about_places = coverage_about_places[
    ~coverage_about_places.duplicated(subset='_id')
]

In [None]:
coverage_about_places

In [None]:
coverage_about_places.to_json("../../data/nyt-coverage-places.json", orient="records")

## What countries are covered the most?

In [None]:
# Flatten the keyword lists of the place-related articles (duplicates kept).
list_of_place_keywords = [
    keyword
    for article_keywords in coverage_about_places['keywords']
    for keyword in article_keywords
]

In [None]:
# One row per country: its name and its alpha-2 code.
countries_df = pd.DataFrame(
    {
        'name': list(country_dict.keys()),
        'code': list(country_dict.values()),
    }
)
countries_df

In [None]:
# For every country, count the articles whose keywords include the country's
# name, compared case-insensitively.
# Lower-case each article's keywords once up front, rather than once per country.
lowercase_keyword_sets = [
    {keyword.lower() for keyword in article_keywords}
    for article_keywords in coverage_about_places["keywords"]
]

# "Georgia" is forced to 0 — presumably because the name collides with the
# US state of the same name; confirm.
matches_for_places = [
    0 if place == "Georgia"
    else sum(place.lower() in keyword_set for keyword_set in lowercase_keyword_sets)
    for place in countries_df["name"]
]


In [None]:
countries_df["count"] = matches_for_places

In [None]:
countries_df.loc[countries_df['name'] == 'United States', 'count'] = len(coverage_about_US)

## What US states are covered the most?

In [None]:
# One row per US subdivision: its name and its code without the "US-" prefix.
us_states_df = pd.DataFrame(
    {
        'name': list(us_state_names.keys()),
        'code': list(us_state_names.values()),
    }
)
us_states_df

In [None]:
# For every US state, count the articles whose keywords include the state's
# full name, compared case-insensitively.
# Lower-case each article's keywords once up front, rather than once per state.
lowercase_keyword_sets = [
    {keyword.lower() for keyword in article_keywords}
    for article_keywords in coverage_about_places["keywords"]
]

matches_for_states = [
    sum(place.lower() in keyword_set for keyword_set in lowercase_keyword_sets)
    for place in us_states_df["name"]
]


In [None]:
us_states_df["count"] = matches_for_states

In [None]:
us_states_df

In [None]:
sum_of_coverage_about_places = pd.concat([countries_df, us_states_df])
sum_of_coverage_about_places

In [None]:
sum_of_coverage_about_places.to_json("../../data/nyt-sum-places.json", orient="records")

## What are other categories related to places?

In [None]:
def count_related_kws(df, coverage=None):
    """Attach keyword co-occurrence counts to every place in ``df``.

    For each row, the articles whose keyword list contains the row's ``name``
    (compared case-insensitively) are selected, and every keyword appearing in
    those articles is tallied, the place's own keyword included. The tallies
    are stored, one dict per row, in a new ``related_keywords`` column.
    ``df`` is modified in place.

    Args:
        df: DataFrame with a ``name`` column holding place names.
        coverage: DataFrame with a ``keywords`` column holding one list of
            keyword strings per article. Defaults to the notebook-level
            ``coverage_about_places``.

    Returns:
        None.
    """
    from collections import Counter
    from itertools import chain

    if coverage is None:
        coverage = coverage_about_places

    # Lower-case every article's keywords once instead of once per place.
    keyword_lists = list(coverage["keywords"])
    lowered_sets = [{kw.lower() for kw in kws} for kws in keyword_lists]

    related = []
    for name in df["name"]:
        target = name.lower()
        matching = (
            kws for kws, lowered in zip(keyword_lists, lowered_sets)
            if target in lowered
        )
        # Counter tallies in one pass and keeps each keyword paired with its
        # own count, with no reliance on set iteration order.
        related.append(dict(Counter(chain.from_iterable(matching))))

    df["related_keywords"] = related

In [None]:
count_related_kws(us_states_df)
count_related_kws(countries_df)

In [None]:
us_states_df

In [None]:
countries_df

## Geographic coordinates for countries and states

In [None]:
# Nominatim geocoder client used by get_coordinates.
geolocator = Nominatim(user_agent="geo_lookup")
# NOTE(review): `session` is not referenced again in the visible code and
# `geolocator` is not configured to use it — confirm whether geocoding
# responses are actually being cached.
session = requests_cache.CachedSession("geopy_cache", expire_after=86400)  # Cache for 1 day
# Register tqdm with pandas so that progress_apply is available below.
tqdm.pandas()


In [None]:
def get_coordinates(location_name, country_code=None):
    """Look up the latitude and longitude of a place via ``geolocator``.

    The query is ``"<location_name>, <country_code>"`` when a country code is
    given, otherwise just the location name. Any exception raised during the
    lookup is printed and treated as "not found".

    Returns:
        A ``(latitude, longitude)`` tuple, or ``(None, None)`` when nothing
        was found or the lookup failed.
    """
    if country_code:
        query = f"{location_name}, {country_code}"
    else:
        query = location_name

    try:
        hit = geolocator.geocode(query, timeout=10)
        coords = (hit.latitude, hit.longitude) if hit else (None, None)
    except Exception as e:
        print(f"Error for {query}: {e}")
        coords = (None, None)
    return coords

In [None]:
# Geocode every country (one get_coordinates call per row, with a progress
# bar) and spread the returned pair over the Latitude and Longitude columns.
countries_df[["Latitude", "Longitude"]] = countries_df.progress_apply(
    lambda row: get_coordinates(row["name"], row["code"]), axis=1, result_type="expand"
)


In [None]:
# Geocode every US state and spread the returned pair over the Latitude and
# Longitude columns.
# NOTE(review): row["code"] is the state code with "US-" stripped, yet it is
# passed as get_coordinates' country_code, producing queries of the form
# "<state name>, <state code>" — confirm the geocoder resolves these to the
# state and not to a country that shares the same two-letter code.
us_states_df[["Latitude", "Longitude"]] = us_states_df.progress_apply(
    lambda row: get_coordinates(row["name"], row["code"]), axis=1, result_type="expand"
)

In [None]:
countries_df['Context']='Global'

In [None]:
us_states_df['Context']='Local'

In [None]:
places_with_categories = pd.concat([countries_df, us_states_df])

In [None]:
places_with_categories.to_json("../../data/places/nyt-sum-places.json", orient="records")