In [None]:
import pandas as pd
import plotly.express as px
import ast

In [None]:
country_coverage = pd.read_csv("../../data/places/coverage_by_country.csv")

In [None]:
country_coverage

In [None]:
# Drop "Unavailable" with no articles
country_coverage.drop(index=228, inplace=True)

## Standardization of names

In [None]:
# Replace selected country labels with the alternative spellings below.
# One mapping applied in a single pass replaces the former one-cell-per-country
# assignments; labels that are not listed are left untouched.
COUNTRY_NAME_FIXES = {
    "Congo-Brazzaville": "Republic of the Congo",
    "Palestine, State of": "Palestine",
    "Côte d'Ivoire": "Ivory Coast",
    "The Gambia": "Gambia",
    "Czechia": "Czech Republic",
    "Congo, The Democratic Republic of the": "The Democratic Republic of the Congo",
    "Türkiye": "Turkey",
}
country_coverage["country"] = country_coverage["country"].replace(COUNTRY_NAME_FIXES)

# The Republic of the Congo row additionally gets its ISO alpha-3 code set explicitly.
country_coverage.loc[country_coverage["country"] == "Republic of the Congo", "iso_alpha3"] = "COG"

In [None]:
nyt = country_coverage[(country_coverage["country"] != "United States")]

In [None]:
fig = px.choropleth(nyt, 
    locations='iso_alpha3', 
    color='count_of_articles_nyt',
    color_continuous_scale="Reds",
    range_color=(0, 1000),
    locationmode="ISO-3"
)
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})


fig.show()

In [None]:
zeit = country_coverage[(country_coverage["country"] != "Germany")]

In [None]:
fig = px.choropleth(zeit, 
    locations='iso_alpha3', 
    color='count_of_articles_zeit',
    color_continuous_scale="Reds",
    range_color=(1, 1000),
    locationmode="ISO-3"
)

fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()

## Calculate each outlet's coverage percentages and their difference (`middle_point`)

In [None]:
country_coverage.loc[country_coverage["country"] == "Germany", "count_of_articles_zeit"] = 0

In [None]:
country_coverage.loc[country_coverage["country"] == "United States", "count_of_articles_nyt"] = 0

In [None]:
country_coverage

In [None]:
country_coverage["perc_of_coverage_nyt"] =  country_coverage["count_of_articles_nyt"].apply(lambda x: x / country_coverage["count_of_articles_nyt"].sum() * 100)

In [None]:
country_coverage["perc_of_coverage_zeit"] =  country_coverage["count_of_articles_zeit"].apply(lambda x: x / country_coverage["count_of_articles_zeit"].sum() * 100)

In [None]:
country_coverage["middle_point"] = country_coverage["perc_of_coverage_nyt"] - country_coverage["perc_of_coverage_zeit"]

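Negative `middle_point` values mean the country takes up a larger share of Zeit's coverage; positive values mean it takes up a larger share of the NYT's.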

In [None]:
# Inspect the frame with the added percentage and middle_point columns.
country_coverage

## Assign countries to one or the other outlet

In [None]:
covered_by_zeit = country_coverage[country_coverage["middle_point"] < 0]

In [None]:
covered_by_zeit = covered_by_zeit[["country", "iso_alpha3", "middle_point", "count_of_articles_zeit", "count_of_articles_nyt"]]

In [None]:
covered_by_zeit

In [None]:
covered_by_nyt = country_coverage[country_coverage["middle_point"] > 0]

In [None]:
covered_by_nyt = covered_by_nyt[["country", "iso_alpha3", "middle_point", "count_of_articles_nyt", "count_of_articles_zeit"]]

### Load continental outline to identify regions

In [None]:
european_entities = pd.read_csv('../../input-data/utilities/europe.txt', names=['Country'])
asia_entities = pd.read_csv('../../input-data/utilities/asia.txt', names=['Country'])
africa_entities = pd.read_csv('../../input-data/utilities/africa.txt', names=['Country'])
n_america_entities = pd.read_csv('../../input-data/utilities/north-america.txt', names=['Country'])
s_america_entities = pd.read_csv('../../input-data/utilities/south-america.txt', names=['Country'])
oceania_entities = pd.read_csv('../../input-data/utilities/oceania.txt', names=['Country'])
ant_entities = pd.read_csv('../../input-data/utilities/antarctica.txt', names=['Country'])
other_entities = pd.read_csv('../../input-data/utilities/other.txt', names=['Country'])


In [None]:
european_entities["Region"] = "Europe"
asia_entities["Region"] = "Asia"
africa_entities["Region"] = "Africa"
n_america_entities["Region"] = "North America"
s_america_entities["Region"] = "South America"
oceania_entities["Region"] = "Oceania"
ant_entities["Region"] = "Antarctica"
other_entities["Region"] = "Other"

In [None]:
regions_and_countries = pd.concat([european_entities, asia_entities, africa_entities, n_america_entities, s_america_entities, oceania_entities, other_entities,ant_entities])

In [None]:
regions_and_countries = regions_and_countries.rename(columns={"Country": "country"})

In [None]:
regions_and_countries = regions_and_countries.map(lambda x: x.strip() if isinstance(x, str) else x)

In [None]:
regions_and_countries

In [None]:
# Attach a Region to every Zeit-assigned country. The left join keeps countries
# that have no match in the region table; their Region is then missing.
regions_zeit = covered_by_zeit.merge(regions_and_countries, how="left", left_on="country", right_on="country")

In [None]:
# NOTE(review): rows 83 and 40 are removed by their index label in the merged
# frame, with no stated reason — presumably duplicates produced when a country
# is listed in more than one region file; confirm. The labels depend on the
# row order of the inputs, so they are fragile if the data changes.
regions_zeit.drop(index=83, inplace=True)

In [None]:
regions_zeit.drop(index=40, inplace=True)

In [None]:
# Mark every remaining row as belonging to Zeit.
regions_zeit["outlet"] = "Zeit"

In [None]:
regions_nyt = covered_by_nyt.merge(regions_and_countries, how="left", left_on="country", right_on="country")

In [None]:
regions_nyt["outlet"] = "NYT"

In [None]:
regions_nyt = regions_nyt[['country', 'iso_alpha3', 'middle_point', 'count_of_articles_zeit', 'count_of_articles_nyt', 'Region', 'outlet']]

In [None]:
regions_nyt.columns

In [None]:
regions_zeit.columns

In [None]:
coverage_by_outlet_and_region = pd.concat([regions_zeit, regions_nyt])

### Create a nested JSON for Voronoi

In [None]:
out = coverage_by_outlet_and_region.groupby(["outlet","Region"])[['country', 'iso_alpha3', 'middle_point', 'count_of_articles_zeit', 'count_of_articles_nyt']].apply(lambda x: x.to_dict('records')).reset_index(name='countries')

In [None]:
out = out.groupby(["outlet"])[['Region', 'countries']].apply(lambda x: x.to_dict('records')).reset_index(name='regions')

In [None]:
out

In [None]:
out.to_json("../../data/places/coverage_by_region.json", orient="records")

In [None]:
fig = px.choropleth(country_coverage, 
    locations='iso_alpha3', 
    color='middle_point',
    color_continuous_scale="PiYG",
    range_color=(-0.01, 0.01),
    locationmode="ISO-3"
)

fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})

fig.show()

In [None]:
country_coverage.to_csv("../../data/places/coverage_by_country.csv", index=False)

## Retrieve related categories for each location

### NYT

In [None]:
# Article-level table for the NYT section; later cells read its `_id` and
# `keywords` columns.
full_year = pd.read_csv("../../input-data/temp-data.csv")

In [None]:
country_coverage

In [None]:
# Article-level table for Zeit; later cells read its `uri`, `date`, `url` and
# `keywords` columns.
zeit_full_year = pd.read_csv("../../input-data/zeit-temp-data.csv")

In [None]:
zeit_full_year

In [None]:
full_year['keywords'] = full_year['keywords'].apply(ast.literal_eval)

In [None]:
zeit_full_year = zeit_full_year[zeit_full_year['keywords'].notna()]

In [None]:
zeit_full_year['keywords'] = zeit_full_year['keywords'].apply(ast.literal_eval)

In [None]:
zeit_full_year_exploded = zeit_full_year.explode("keywords")

In [None]:
zeit_full_year_exploded = zeit_full_year_exploded.rename(columns={"uri": "_id"})

In [None]:
zeit_full_year_exploded

In [None]:
zeit_full_year_exploded = zeit_full_year_exploded[["_id", "date", "url", "keywords"]]

In [None]:
zeit_full_year_exploded.rename(columns={"uri": "_id"}, inplace=True)

In [None]:
zeit_full_year_exploded

In [None]:
full_year_exploded = full_year.explode("keywords")

In [None]:
country_coverage["ids_of_articles_nyt"] = country_coverage["ids_of_articles_nyt"].apply(ast.literal_eval)

In [None]:
country_coverage["ids_of_articles_zeit"] = country_coverage["ids_of_articles_zeit"].apply(ast.literal_eval)

In [None]:
countries = country_coverage["country"].unique()

In [None]:
def _count_nyt_keywords(article_ids, keywords_by_article):
    """Count how often each keyword value occurs across the given articles.

    Parameters
    ----------
    article_ids : iterable
        Ids of the articles to look up; an id that occurs twice is counted twice.
    keywords_by_article : dict
        Maps an article id to the list of that article's keyword entries
        (dicts that carry the keyword under the "value" key).

    Returns
    -------
    dict
        Keyword value -> number of occurrences as a plain int, with keys in
        first-seen order.
    """
    counts = {}
    for article_id in article_ids:
        for keyword in keywords_by_article.get(article_id, ()):
            # An article with an empty keyword list explodes to a NaN
            # placeholder; there is no value to count for it.
            if not isinstance(keyword, dict):
                continue
            value = keyword.get("value")
            counts[value] = counts.get(value, 0) + 1
    return counts


# Group the exploded keyword rows by article once, instead of scanning the
# whole frame again for every single article id.
nyt_keywords_by_article = full_year_exploded.groupby("_id")["keywords"].agg(list).to_dict()

# One dict per row of country_coverage, in row order, so the list can be
# assigned positionally as a new column. The counts are stored as ints (not
# wrapped in one-element sets), so they can be sorted numerically later.
list_of_keywords_dicts = [
    _count_nyt_keywords(article_ids, nyt_keywords_by_article)
    for article_ids in country_coverage["ids_of_articles_nyt"]
]

In [None]:
# Attach the NYT keyword counts. Assigning a list is positional, so it must
# hold exactly one dict per row of country_coverage, in row order.
country_coverage["keywords_nyt"] = list_of_keywords_dicts

In [None]:
country_coverage

In [None]:
country_coverage

In [None]:
# Inspect the exploded Zeit keyword table before counting its keywords.
zeit_full_year_exploded

In [None]:
# Group the exploded keyword rows by article once, instead of scanning the
# whole frame again for every single article id.
zeit_keywords_by_article = zeit_full_year_exploded.groupby("_id")["keywords"].agg(list).to_dict()

# One keyword -> count dict per row of country_coverage, in row order, so the
# list can be assigned positionally as a new column.
list_of_zeit_keywords_dicts = []
for article_ids in country_coverage["ids_of_articles_zeit"]:
    keyword_counts = {}
    for article_id in article_ids:
        # Ids without any keyword rows simply contribute nothing.
        for keyword in zeit_keywords_by_article.get(article_id, ()):
            # Single-pass tally; keys stay in first-seen order.
            keyword_counts[keyword] = keyword_counts.get(keyword, 0) + 1
    list_of_zeit_keywords_dicts.append(keyword_counts)

In [None]:
country_coverage["keywords_zeit"] = list_of_zeit_keywords_dicts
country_coverage

In [None]:
country_coverage["keywords_zeit"] = country_coverage["keywords_zeit"].apply(lambda x:{key: value for key, value in sorted(x.items(), key=lambda item: item[1], reverse=True)})

In [None]:
country_coverage["keywords_nyt"] = country_coverage["keywords_nyt"].apply(lambda x:{key: value for key, value in sorted(x.items(), key=lambda item: item[1], reverse=True)})

In [None]:
country_coverage.to_csv("../../data/places/coverage_by_country.csv", index=False)