In [None]:
%pip install pandas geonamescache geopy tqdm 

# Find and merge together all instances of places across NYT and Zeit coverage

The notebook pre-processes the coverage of both newspapers and outputs a single CSV file in which each row is a country. For each country, the total number of articles in the NYT and Zeit coverage is included, as well as the unique identifiers of those articles.

The pre-processing differs slightly depending on the news outlet. NYT data already provides information about the nature of the keywords, allowing for an initial grouping of all keywords about geolocations. Conversely, Zeit data only comes with an array of keywords, with no additional information. However, in both cases the approach is similar: once the keywords that relate to a geolocation have been identified, I extract the country for each one of them, then iterate over the original data to find the articles related to each location. Finally, I group all locations by country, creating a single array of article ids per country, removing duplicates, and then counting the ids within the array.

As the last step, the two datasets are merged together based on the country.

In [None]:
import pandas as pd
import ast

from tqdm import tqdm

import requests_cache
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut,GeocoderUnavailable

import geonamescache


# NYT

In [None]:
full_year = pd.read_csv("../../input-data/temp-data.csv")

In [None]:
full_year

In [None]:
full_year_essential = full_year[["_id", "section_name", "keywords", "pub_date"]].copy()

In [None]:
# Parse the stringified keyword lists into Python objects. Values that are not
# strings (missing keywords, or lists already parsed by an earlier run of this
# cell) are passed through unchanged, so the cell does not raise on NaN and is
# safe to re-run. This mirrors the guard used for the Zeit keywords further down.
full_year_essential['keywords'] = full_year_essential['keywords'].apply(
    lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw
)

In [None]:
# Explode keywords in separate rows
full_year_essential = full_year_essential.explode("keywords")

In [None]:
# Split every keyword dict into two columns: its type (the "name" entry) and its
# text (the "value" entry). Rows whose keyword is not a dict get None in both.
def _keyword_field(entry, field):
    """Return entry[field] for dict entries (None if the key is absent), else None."""
    if isinstance(entry, dict):
        return entry.get(field)
    return None

full_year_essential["keyword_type"] = full_year_essential["keywords"].apply(
    lambda entry: _keyword_field(entry, "name")
)
full_year_essential["keyword"] = full_year_essential["keywords"].apply(
    lambda entry: _keyword_field(entry, "value")
)

In [None]:
full_year_essential

In [None]:
# Isolate the keywords associated with geolocations
locations = full_year_essential[full_year_essential["keyword_type"] == "glocations"]

In [None]:
# Create a unique list of keywords
locations = locations["keyword"].unique()
locations

In [None]:
# New df with only one column, the keywords
places_df = pd.DataFrame(locations, columns=["location"])
places_df

In [None]:
# Many of these locations are written as "Place (Country)". The bracketed part is
# copied to a new column as a first rough indication of the country; locations
# without both brackets are copied over unchanged.
def _bracketed_part(location):
    """Return the text between the first "(" and the first ")", or the input itself if either bracket is missing."""
    if "(" not in location or ")" not in location:
        return location
    start = location.find("(") + 1
    end = location.find(")")
    return location[start:end]

places_df['country'] = places_df['location'].apply(_bracketed_part)

## Geolocate individual locations

In [None]:
geolocator = Nominatim(user_agent="geo_locator")

In [None]:
# Function to get country
def get_country(location):
    """Look up the country for a free-text place name via the global ``geolocator``.

    Parameters
    ----------
    location : str
        Place name to geocode. Missing (None/NaN) or blank values are not sent
        to the geocoder.

    Returns
    -------
    str
        The last comma-separated component of the geocoder's address string
        (presumably the country name in English — confirm against Nominatim's
        address format), or one of the sentinel strings ``"Not found"``,
        ``"Timeout"`` or ``"Unavailable"``.

    Notes
    -----
    No rate limiting or retry is applied here; each call issues one request.
    """
    # A missing or blank name cannot be geocoded: skip the network round-trip
    # instead of passing None/NaN/"" to the geocoder.
    if not isinstance(location, str) or not location.strip():
        return "Not found"
    try:
        geo = geolocator.geocode(location, exactly_one=True, language='en', addressdetails=False)
        if geo:
            return geo.address.split(",")[-1].strip()
        else:
            return "Not found"
    except GeocoderTimedOut:
        return "Timeout"
    except GeocoderUnavailable:
        return "Unavailable"

In [None]:
# Geocode every country hint, one request per row, and store the result next to it.
places_df['retrieved_country'] = [get_country(hint) for hint in places_df["country"]]

In [None]:
places_df.to_csv("../../input-data/nyt_retrived_countries.csv")

## Clean edge cases

We start by loading the saved dataset again, so that further polishing of the data does not require running the `get_country` function again.

In [None]:
retrieved_countries = pd.read_csv("../../input-data/nyt_retrived_countries.csv")

In [None]:
# Drop the index column that to_csv saved as "Unnamed: 0". errors="ignore" keeps
# the cell re-runnable: a second run no longer raises KeyError.
retrieved_countries = retrieved_countries.drop(columns="Unnamed: 0", errors="ignore")

Masking all US states with an ambiguous 2-letter code (not a real ISO code; these default to "United States" as the country).

In [None]:
# Flag rows to be treated as US locations: every country hint that is 2 or 3
# characters long, plus two explicit city hints.
# NOTE(review): the length test matches ANY 2-3 character hint, not only US state
# abbreviations — confirm against the data that no other place is caught.
country_hint = retrieved_countries['country']
us_states_mask = (
    country_hint.str.len().isin([2, 3])
    | country_hint.isin(["Los Angeles, Calif", "Miami, Fla"])
)

In [None]:
retrieved_countries.loc[us_states_mask, "retrieved_country"] = "United States"

In [None]:
retrieved_countries

Masking all rows where geopy was not successful at finding a match.

In [None]:
# get_country() reports failure with three sentinel strings. "Timeout" must be
# included as well, otherwise timed-out lookups stay in the data and are later
# grouped as if "Timeout" were a country.
undefined_mask = retrieved_countries['retrieved_country'].isin(["Not found", "Unavailable", "Timeout"])

Some of them do not have a match because the US state name is not recognized

In [None]:
list_of_states = ['ALABAMA', 'ALASKA', 'ARIZONA', 'ARKANSAS', 'CALIFORNIA', 'COLORADO', 'CONNECTICUT', 'DELAWARE', 'FLORIDA', 'GEORGIA', 'HAWAII', 'IDAHO', 'ILLINOIS', 'INDIANA', 'IOWA', 'KANSAS', 'KENTUCKY', 'LOUISIANA', 'MAINE', 'MARYLAND', 'MASSACHUSETTS', 'MICHIGAN', 'MINNESOTA', 'MISSISSIPPI', 'MISSOURI', 'MONTANA', 'NEBRASKA', 'NEVADA', 'NEW HAMPSHIRE', 'NEW JERSEY', 'NEW MEXICO', 'NEW YORK', 'NORTH CAROLINA', 'NORTH DAKOTA', 'OHIO', 'OKLAHOMA', 'OREGON', 'PENNSYLVANIA', 'RHODE ISLAND', 'SOUTH CAROLINA', 'SOUTH DAKOTA', 'TENNESSEE', 'TEXAS', 'UTAH', 'VERMONT', 'VIRGINIA', 'WASHINGTON', 'WEST VIRGINIA', 'WISCONSIN', 'WYOMING']

In [None]:
undefined_countries = retrieved_countries.loc[undefined_mask]

In [None]:
undefined_countries

In [None]:
undefined_us_states = [country for country in undefined_countries["country"] if country.upper() in list_of_states]

In [None]:
undefined_us_states

In [None]:
mask_undefined_us = retrieved_countries["country"].isin(undefined_us_states)
retrieved_countries.loc[mask_undefined_us, "retrieved_country"] = "United States"

In [None]:
# Recompute the failures after the US-state fix. All three get_country() failure
# sentinels are included; "Timeout" rows are failed lookups too.
undefined_mask = retrieved_countries['retrieved_country'].isin(["Not found", "Unavailable", "Timeout"])
undefined_countries = retrieved_countries.loc[undefined_mask]
undefined_countries

In [None]:
missing_countries = ["Armenia", "St Vincent", "Lebanon", "Mali"]

In [None]:
mask_undefined_countries = retrieved_countries["country"].isin(missing_countries)
retrieved_countries.loc[mask_undefined_countries, "retrieved_country"] = retrieved_countries.loc[mask_undefined_countries, "country"] 

In [None]:
siberia_mask = retrieved_countries["country"] == "Siberia"
retrieved_countries.loc[siberia_mask, "retrieved_country"] = "Russia"
retrieved_countries

In [None]:
west_bank_mask = retrieved_countries["country"] == "West Bank"
retrieved_countries.loc[west_bank_mask, "retrieved_country"] = "Palestinian Territory"

In [None]:
# Recompute the failure mask from the current values instead of reusing
# `undefined_mask`: that mask was built before the manual fixes in the cells
# above (missing countries, Siberia, West Bank), so reusing it would also drop
# the rows that were just repaired. "Timeout" is a get_country() failure sentinel
# as well and is dropped with the others.
still_unresolved = retrieved_countries["retrieved_country"].isin(["Not found", "Unavailable", "Timeout"])
clean_countries = retrieved_countries.loc[~still_unresolved]

In [None]:
clean_countries

## Retrieve article ids from original dataset

In [None]:
# For every cleaned place keyword, collect the ids of the NYT rows tagged with it.
# Each record is (keyword, country, number of ids, list of ids).
data = []
for _, place in clean_countries.iterrows():
    keyword = place["location"]
    article_ids = full_year_essential.loc[full_year_essential["keyword"] == keyword, "_id"].to_list()
    data.append((keyword, place["retrieved_country"], len(article_ids), article_ids))

In [None]:
places_and_ids_df = pd.DataFrame(data, columns=["place_keyword", "country", "count_of_articles", "ids_of_articles"])

In [None]:
places_and_ids_df

In [None]:
places_and_ids_df

## Group by country and chain ids of articles together

In [None]:
countries_and_unique_ids = places_and_ids_df.copy()

In [None]:
from itertools import chain
# Concatenate the id lists of all place keywords that map to the same country.
general_countries = (
    countries_and_unique_ids
    .groupby('country', as_index=False)['ids_of_articles']
    .agg(lambda id_lists: [article_id for id_list in id_lists for article_id in id_list])
)

## Remove duplicates from id list and count number of articles for each country

In [None]:
# De-duplicate the ids while keeping first-seen order. dict.fromkeys is used
# instead of set() because set iteration order for strings can change between
# interpreter runs (hash randomisation), which would reshuffle the exported id
# lists on every run even when the data is unchanged.
general_countries["ids_of_articles"] = general_countries["ids_of_articles"].apply(
    lambda ids: list(dict.fromkeys(ids))
)

In [None]:
# Number of (already de-duplicated) article ids per country.
general_countries["count_of_articles"] = general_countries["ids_of_articles"].apply(len)

## Retrieve coordinates for each country

In [None]:
# Geocoder client for the country -> coordinates lookup below. Rebinding the
# global `geolocator` also changes the client get_country() uses in later cells,
# because that function reads the global name at call time.
geolocator = Nominatim(user_agent="geo_lookup")
# NOTE(review): `session` is not passed to the geolocator or referenced again in
# the visible cells, so this cache presumably has no effect on the geocoding
# requests — confirm, and either wire it in or remove it.
session = requests_cache.CachedSession("geopy_cache", expire_after=86400)  # Cache for 1 day
# Registers the progress_apply methods on pandas objects (used in the next cells).
tqdm.pandas()

In [None]:
def get_coordinates(location_name, country_code=None):
    """Geocode a place with the global ``geolocator`` and return ``(latitude, longitude)``.

    The query is ``"<location_name>, <country_code>"`` when a truthy
    ``country_code`` is given, otherwise just ``location_name``. Any exception
    raised during the lookup is printed and swallowed (best effort); in that
    case, and when the geocoder finds nothing, ``(None, None)`` is returned.
    """
    if country_code:
        query = f"{location_name}, {country_code}"
    else:
        query = location_name

    try:
        match = geolocator.geocode(query, timeout=10)
        coordinates = (match.latitude, match.longitude) if match else (None, None)
    except Exception as e:
        print(f"Error for {query}: {e}")
        coordinates = (None, None)
    return coordinates

In [None]:
general_countries[["Latitude", "Longitude"]] = general_countries.progress_apply(
    lambda row: get_coordinates(row["country"]), axis=1, result_type="expand"
)

In [None]:
general_countries

# Zeit

In [None]:
zeit_full_year = pd.read_csv("../../input-data/zeit-temp-data.csv")

In [None]:
zeit_full_year

## Create list with all unique keywords

In [None]:
zeit_full_year["keywords"] = zeit_full_year["keywords"].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

In [None]:
zeit_full_year = zeit_full_year[zeit_full_year['keywords'].notna()]



In [None]:
all_keywords= list(zeit_full_year["keywords"])

In [None]:
# Flatten the per-article keyword lists into one flat list of keywords.
# The list is rebuilt from the DataFrame column rather than from `all_keywords`
# itself, which keeps the cell idempotent: flattening the already-flat list a
# second time would split every keyword string into single characters.
all_keywords = [
    keyword
    for article_keywords in zeit_full_year["keywords"]
    for keyword in article_keywords
]

In [None]:
all_keywords

## Load custom dataset of countries

The dataset for Germany is a custom list with countries' names in German. The English names for countries and names for cities are pulled from geonamescache. We do not need specific names for cities because the articles tend to include both the German and English name of famous cities (e.g. Rom, Rome) or they always include the country within the list (e.g. Rom, Italien).

In [None]:
# Dataset: https://www.drupal.org/node/1136336
de_countries = pd.read_csv('../../input-data/countries_de.csv', sep='|')

In [None]:
de_countries

In [None]:
countries_de = de_countries["Country"].values

In [None]:
# Get a list of country and city names from GeoNamesCache
gc = geonamescache.GeonamesCache()
countries_en = {v['name'] for v in gc.get_countries().values()}
cities = {v['name'] for v in gc.get_cities().values()}
countries_de = de_countries["Country"].values

# One combined set gives a single hash lookup per keyword. Testing
# `word in countries_de` against the NumPy array compared the keyword with every
# element of the array for each keyword in the coverage.
known_places = countries_en | cities | set(countries_de)

# Find keywords that match countries (ENG and DE) or cities
place_keywords = [word for word in all_keywords if word in known_places]

print(place_keywords)

In [None]:
locations_in_coverage = list(set(place_keywords))

In [None]:
df_locations_in_coverage = pd.DataFrame(locations_in_coverage, columns=["location"])

In [None]:
df_locations_in_coverage

In [None]:
# Geocode every matched Zeit keyword, one request per row.
df_locations_in_coverage["country"] = [
    get_country(place_name) for place_name in df_locations_in_coverage["location"]
]

In [None]:
df_locations_in_coverage = df_locations_in_coverage.sort_values(by="country")

In [None]:
df_locations_in_coverage.to_csv("../../input-data/zeit-retrieved-countries.csv", index=False)

## Refine list of retrieved countries

The list of names requires a bit of manual refinement, hence I load a new file before adding the count. Unfortunately, it is difficult to come up with a way to automate the task, because certain keywords are ambiguous (e.g. Gardena, both a city in the US and a company in Germany) and the data coming from the API does not provide a clear indication of the nature of the keywords. Since the list of names is not very long, it is still possible to consider the special cases one by one. If the dataset grows, it could be worth considering an entirely different approach to place extraction.

In [None]:
zeit_countries_refined = pd.read_csv("../../input-data/zeit-retrieved-countries-refined.csv")

In [None]:
zeit_countries_refined

## Match locations with article ID

In [None]:
zeit_categories = zeit_full_year.explode("keywords")

In [None]:
zeit_categories = zeit_categories[["uri", "keywords", "title", "date"]]

In [None]:
# Align the column names with the NYT frame ("keyword", "_id"). The result is
# rebound instead of using inplace=True: `zeit_categories` was derived by column
# selection, and mutating such a frame in place can trigger pandas'
# SettingWithCopyWarning.
zeit_categories = zeit_categories.rename(columns={"keywords": "keyword", "uri": "_id"})

In [None]:
zeit_categories

In [None]:
# TODO: add an exception so that Ukraine-related keywords such as "Krieg in der Ukraine" and "Kiew" count towards Ukraine
# For every refined Zeit location, collect the ids of the rows tagged with that keyword.
# Each record is (keyword, country, number of ids, list of ids).
data = []
for _, place in zeit_countries_refined.iterrows():
    keyword = place["location"]
    article_ids = zeit_categories.loc[zeit_categories["keyword"] == keyword, "_id"].to_list()
    data.append((keyword, place["country"], len(article_ids), article_ids))

In [None]:
zeit_places_and_ids_df = pd.DataFrame(data, columns=["place_keyword", "country", "count_of_articles", "ids_of_articles"])

In [None]:
zeit_places_and_ids_df

## Group by country and chain ids of articles together

In [None]:
zeit_countries_and_unique_ids = zeit_places_and_ids_df.copy()

In [None]:
from itertools import chain
# Concatenate the id lists of all Zeit keywords that map to the same country.
zeit_general_countries = (
    zeit_countries_and_unique_ids
    .groupby('country', as_index=False)['ids_of_articles']
    .agg(lambda id_lists: [article_id for id_list in id_lists for article_id in id_list])
)

In [None]:
# De-duplicate the ids while keeping first-seen order. dict.fromkeys is used
# instead of set() because set iteration order for strings can change between
# interpreter runs (hash randomisation), which would reshuffle the exported id
# lists on every run even when the data is unchanged.
zeit_general_countries["ids_of_articles"] = zeit_general_countries["ids_of_articles"].apply(
    lambda ids: list(dict.fromkeys(ids))
)

In [None]:
# Number of (already de-duplicated) article ids per country.
zeit_general_countries["count_of_articles"] = zeit_general_countries["ids_of_articles"].apply(len)

In [None]:
zeit_general_countries

# Join data 

In [None]:
# Outer join on the country name so that countries covered by only one outlet are
# kept. Columns present in both frames get the "_nyt" / "_zeit" suffix.
# Latitude/Longitude exist only in the NYT frame, so countries that appear only
# in the Zeit data end up with NaN coordinates.
joined_locations_coverages = pd.merge(
    general_countries,
    zeit_general_countries,
    how='outer',
    on="country",
    suffixes=("_nyt", "_zeit"),
)

## Reorder columns for better readability

In [None]:
# Select the columns in presentation order. .copy() makes the result an
# independent frame: the following cells assign columns on it, which on a bare
# column selection triggers pandas' SettingWithCopyWarning.
joined_locations_coverages_reordered = joined_locations_coverages[
    ['country', "Latitude", "Longitude", "count_of_articles_nyt", "count_of_articles_zeit", "ids_of_articles_nyt", "ids_of_articles_zeit"]
].copy()

In [None]:
joined_locations_coverages_reordered

## Clean NaN values

In [None]:
# Countries absent from the Zeit data get NaN from the outer merge, which also
# turns the whole column into floats. Fill with 0 and restore integer counts so
# the export contains e.g. 12 rather than 12.0.
joined_locations_coverages_reordered["count_of_articles_zeit"] = (
    joined_locations_coverages_reordered["count_of_articles_zeit"].fillna(0).astype(int)
)

In [None]:
# Countries absent from the NYT data get NaN from the outer merge, which also
# turns the whole column into floats. Fill with 0 and restore integer counts so
# the export contains e.g. 12 rather than 12.0.
joined_locations_coverages_reordered["count_of_articles_nyt"] = (
    joined_locations_coverages_reordered["count_of_articles_nyt"].fillna(0).astype(int)
)


In [None]:
# Replace anything that is not a list (the NaN left by the outer merge) with an empty list.
def _list_or_empty(ids):
    """Return ``ids`` unchanged when it is a list, otherwise a new empty list."""
    if isinstance(ids, list):
        return ids
    return []

joined_locations_coverages_reordered['ids_of_articles_nyt'] = (
    joined_locations_coverages_reordered['ids_of_articles_nyt'].apply(_list_or_empty)
)

In [None]:
# Replace anything that is not a list (the NaN left by the outer merge) with an empty list.
def _id_list_or_empty(ids):
    """Return ``ids`` unchanged when it is a list, otherwise a new empty list."""
    if isinstance(ids, list):
        return ids
    return []

joined_locations_coverages_reordered['ids_of_articles_zeit'] = (
    joined_locations_coverages_reordered['ids_of_articles_zeit'].apply(_id_list_or_empty)
)

## Get coordinates

## Export

In [None]:
# Write the merged per-country table to disk. With the default to_csv settings
# the DataFrame index is written as an unnamed first column.
# NOTE(review): the Zeit export above passes index=False — confirm whether
# consumers of this file expect the index column.
joined_locations_coverages_reordered.to_csv("../../data/places/coverage_by_country.csv")