In [None]:
import ast
from collections import Counter

import pandas as pd
import plotly.express as px

In [None]:
# Load the per-country coverage table that the rest of the notebook works on.
coverage_csv_path = "../../data/places/coverage_by_country.csv"
country_coverage = pd.read_csv(coverage_csv_path)

In [None]:
# Show the freshly loaded coverage table for a quick visual check.
country_coverage

In [None]:
# Coverage rows for every country except the United States.
is_not_us = country_coverage["country"].ne("United States")
nyt = country_coverage.loc[is_not_us]

In [None]:
# World map of NYT article counts per country.
# Plot the `nyt` frame (United States removed) instead of the full table: that
# filtered frame was built for this map but was previously never used.
fig = px.choropleth(
    nyt,
    locations="iso_alpha3",
    locationmode="ISO-3",
    color="count_of_articles_nyt",
    color_continuous_scale="Reds",
    range_color=(0, 1000),  # counts above 1000 all get the darkest colour
)
# Remove the outer margins so the map fills the output area.
fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
fig.show()

In [None]:
# Coverage rows for every country except Germany.
is_not_germany = country_coverage["country"].ne("Germany")
zeit = country_coverage.loc[is_not_germany]

In [None]:
# World map of Zeit article counts per country.
# Plot the `zeit` frame (Germany removed) instead of the full table: that
# filtered frame was built for this map but was previously never used.
fig = px.choropleth(
    zeit,
    locations="iso_alpha3",
    locationmode="ISO-3",
    color="count_of_articles_zeit",
    color_continuous_scale="Reds",
    # Same 0-1000 colour range as the NYT map so the two maps are comparable;
    # counts above 1000 all get the darkest colour.
    range_color=(0, 1000),
)
# Remove the outer margins so the map fills the output area.
fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
fig.show()

## Calculate each outlet's share of coverage per country and the difference between the two shares (`middle_point`)

In [None]:
# Share of all NYT articles that each country accounts for.
# The value is a fraction of the column total (it is not multiplied by 100).
# The total is computed once and divided in a single vectorized step instead of
# being re-summed for every row inside an `apply`.
nyt_article_counts = country_coverage["count_of_articles_nyt"]
country_coverage["perc_of_coverage_nyt"] = nyt_article_counts / nyt_article_counts.sum()

In [None]:
# Share of all Zeit articles that each country accounts for.
# The value is a fraction of the column total (it is not multiplied by 100).
# The total is computed once and divided in a single vectorized step instead of
# being re-summed for every row inside an `apply`.
zeit_article_counts = country_coverage["count_of_articles_zeit"]
country_coverage["perc_of_coverage_zeit"] = zeit_article_counts / zeit_article_counts.sum()

In [None]:
# Difference between the two coverage shares: NYT share minus Zeit share.
country_coverage["middle_point"] = country_coverage["perc_of_coverage_nyt"].sub(
    country_coverage["perc_of_coverage_zeit"]
)

Negative values mean a country receives a larger share of coverage in Zeit; positive values mean it receives a larger share in the NYT.

In [None]:
# World map of the NYT-minus-Zeit share difference on a diverging colour scale.
fig = px.choropleth(
    country_coverage,
    color="middle_point",
    locations="iso_alpha3",
    locationmode="ISO-3",
    range_color=(-0.01, 0.01),
    color_continuous_scale="PiYG",
)
# Remove the outer margins so the map fills the output area.
fig.update_layout(margin=dict(r=0, t=0, l=0, b=0))
fig.show()

In [None]:
# Write the table, now including the share and difference columns, back to the
# same CSV it was loaded from (row index omitted).
coverage_csv_path = "../../data/places/coverage_by_country.csv"
country_coverage.to_csv(coverage_csv_path, index=False)

## Retrieve related categories for each location

### NYT

In [None]:
# Load the article table that the NYT keyword lookup below reads from.
nyt_articles_path = "../../input-data/temp-data.csv"
full_year = pd.read_csv(nyt_articles_path)

In [None]:
# Show the coverage table again, now with the share and difference columns.
country_coverage

In [None]:
# Load the Zeit article table used for the Zeit keyword lookup below.
zeit_articles_path = "../../input-data/zeit-temp-data.csv"
zeit_full_year = pd.read_csv(zeit_articles_path)

In [None]:
# Show the loaded Zeit article table.
zeit_full_year

In [None]:
# Turn the `keywords` column from its CSV string form into Python objects.
# Only strings are parsed: values that are already parsed (cell re-run) or that
# are missing are passed through unchanged instead of raising.
full_year['keywords'] = full_year['keywords'].apply(
    lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw
)

In [None]:
# Keep only the Zeit articles that have a value in `keywords`.
zeit_full_year = zeit_full_year.dropna(subset=["keywords"])

In [None]:
# Turn the Zeit `keywords` column from its CSV string form into Python objects.
# `assign` builds a new frame, so nothing is written into the filtered slice
# produced by the previous cell (which triggers SettingWithCopyWarning).
# Only strings are parsed, so re-running the cell on already-parsed values is safe.
zeit_full_year = zeit_full_year.assign(
    keywords=zeit_full_year['keywords'].apply(
        lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw
    )
)

In [None]:
# One row per (article, keyword): expand each article's keyword list into rows.
zeit_full_year_exploded = zeit_full_year.explode(column="keywords")

In [None]:
# Show the exploded Zeit table (one row per article keyword).
zeit_full_year_exploded

In [None]:
# Keep only the columns needed for the keyword lookup.
zeit_columns = ["_id", "date", "url", "keywords"]
zeit_full_year_exploded = zeit_full_year_exploded.loc[:, zeit_columns]

In [None]:
# Rename a `uri` column to `_id`, rebinding the name rather than mutating in place.
# NOTE(review): when the cells run in order, the preceding selection keeps only
# `_id`, `date`, `url` and `keywords`, so no `uri` column is present and this
# rename changes nothing (labels that are absent are ignored by default).
# Confirm whether this cell is still needed.
zeit_full_year_exploded = zeit_full_year_exploded.rename(columns={"uri": "_id"})

In [None]:
# Show the trimmed Zeit keyword table.
zeit_full_year_exploded

In [None]:
# One row per (article, keyword): expand each NYT article's keyword list into rows.
full_year_exploded = full_year.explode(column="keywords")

In [None]:
# Parse the NYT article-id lists from their CSV string form into Python lists.
# Only strings are parsed, so re-running this cell after the column has already
# been converted no longer raises.
country_coverage["ids_of_articles_nyt"] = country_coverage["ids_of_articles_nyt"].apply(
    lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw
)

In [None]:
# Parse the Zeit article-id lists from their CSV string form into Python lists.
# Only strings are parsed, so re-running this cell after the column has already
# been converted no longer raises.
country_coverage["ids_of_articles_zeit"] = country_coverage["ids_of_articles_zeit"].apply(
    lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw
)

In [None]:
# Distinct country names, in order of first appearance in the coverage table.
countries = pd.unique(country_coverage["country"])

In [None]:
# Build, for every country, a {keyword value: number of occurrences} dict from
# the NYT articles listed in that country's `ids_of_articles_nyt`.
#
# Changes from the previous version:
# * counts are plain ints (they used to be wrapped in one-element sets, which
#   made the later sort-by-count meaningless);
# * counting uses a Counter instead of calling list.count for every element;
# * keyword rows are grouped by article once, instead of scanning the whole
#   exploded frame for every article id;
# * entries that are not dicts (e.g. NaN from an empty keyword list) are skipped
#   instead of raising AttributeError on `.get`.

# article id -> list of that article's keyword entries, in frame order
nyt_keywords_by_article = (
    full_year_exploded.groupby("_id")["keywords"].apply(list).to_dict()
)

list_of_keywords_dicts = []
for country in countries:
    country_selection = country_coverage[country_coverage["country"] == country]
    # Take the id list from the first row matching this country.
    nyt_ids = country_selection["ids_of_articles_nyt"].values[0]
    if isinstance(nyt_ids, str):
        # Column has not been parsed yet (the literal_eval cell was not run).
        nyt_ids = ast.literal_eval(nyt_ids)

    keyword_counts = Counter(
        keyword.get("value")
        for article_id in nyt_ids
        for keyword in nyt_keywords_by_article.get(article_id, [])
        if isinstance(keyword, dict)
    )
    # Store a plain dict (keys in first-seen order) rather than a Counter.
    list_of_keywords_dicts.append(dict(keyword_counts))
    print(len(list_of_keywords_dicts))

In [None]:
# Attach the per-country NYT keyword counts as a new column.
# NOTE(review): the list has one entry per unique country, so this presumably
# relies on the table having exactly one row per country, in the same order.
# Confirm.
country_coverage = country_coverage.assign(keywords_nyt=list_of_keywords_dicts)

In [None]:
# Show the coverage table with the new `keywords_nyt` column.
country_coverage

In [None]:
# Shows the coverage table again; this repeats the display in the cell above.
country_coverage

In [None]:
# Show the exploded Zeit keyword table before it is used for the per-country counts.
zeit_full_year_exploded

In [None]:
# Build, for every country, a {keyword: number of occurrences} dict from the
# Zeit articles listed in that country's `ids_of_articles_zeit`.
#
# Changes from the previous version:
# * no `eval`: when the cells run in order the id column already holds parsed
#   lists, and eval() on a list raises TypeError; a still-unparsed string is
#   handled with ast.literal_eval instead;
# * counting uses a Counter instead of calling list.count for every element;
# * keyword rows are grouped by article once, instead of scanning the whole
#   exploded frame for every article id.

# article id -> list of that article's keywords, in frame order
zeit_keywords_by_article = (
    zeit_full_year_exploded.groupby("_id")["keywords"].apply(list).to_dict()
)

list_of_zeit_keywords_dicts = []
for country in countries:
    print(country)
    country_selection = country_coverage[country_coverage["country"] == country]
    # Take the id list from the first row matching this country.
    ids = country_selection["ids_of_articles_zeit"].values[0]
    if isinstance(ids, str):
        # Column has not been parsed yet (the literal_eval cell was not run).
        ids = ast.literal_eval(ids)

    keyword_counts = Counter(
        keyword
        for article_id in ids
        for keyword in zeit_keywords_by_article.get(article_id, [])
    )
    # Store a plain dict (keys in first-seen order) rather than a Counter.
    my_dict = dict(keyword_counts)
    print("length of current dict", len(my_dict))
    list_of_zeit_keywords_dicts.append(my_dict)
    print("all dicts", len(list_of_zeit_keywords_dicts))

In [None]:
# Attach the per-country Zeit keyword counts as a new column, then show the table.
# NOTE(review): the list has one entry per unique country, so this presumably
# relies on the table having exactly one row per country, in the same order.
# Confirm.
country_coverage = country_coverage.assign(keywords_zeit=list_of_zeit_keywords_dicts)
country_coverage

In [None]:
# Re-order every Zeit keyword dict so its entries run from highest to lowest value.
country_coverage["keywords_zeit"] = country_coverage["keywords_zeit"].apply(
    lambda counts: dict(sorted(counts.items(), key=lambda pair: pair[1], reverse=True))
)

In [None]:
# Re-order every NYT keyword dict so its entries run from highest to lowest value.
country_coverage["keywords_nyt"] = country_coverage["keywords_nyt"].apply(
    lambda counts: dict(sorted(counts.items(), key=lambda pair: pair[1], reverse=True))
)

In [None]:
# Write the table, now including the keyword columns, back to the same CSV it
# was loaded from (row index omitted).
coverage_csv_path = "../../data/places/coverage_by_country.csv"
country_coverage.to_csv(coverage_csv_path, index=False)