In [411]:
import ast
import json
from collections import Counter

import pandas as pd
import plotly.express as px

In [None]:
# Combined per-country coverage table. This same file is overwritten further
# down in the notebook when country_coverage is saved.
country_coverage = pd.read_csv(filepath_or_buffer="../../data/places/coverage_by_country.csv")

In [412]:
# NYT coverage per place: one row per place with its article ids and article count.
nyt_country_coverage = pd.read_csv(filepath_or_buffer="./local/voronoi/nyt_24_25_coverage_by_country.csv")

In [None]:
country_coverage

In [413]:
nyt_country_coverage

Unnamed: 0,country,ids_of_articles,count_of_articles
0,Adriatic Sea,['nyt://article/6c5919fe-452a-532d-8234-f698a3...,1
1,Aegean Sea,['nyt://interactive/6addaf57-b7fa-5d0f-af24-ec...,3
2,Afghanistan,['nyt://article/1bf3d7c8-419e-52d7-a720-c4e054...,112
3,Africa,['nyt://article/34e514ce-6ebc-58f1-bf4b-d750ea...,201
4,Albania,['nyt://article/59eb4293-7d0a-558f-9c26-d85c3f...,23
...,...,...,...
231,Vietnam,['nyt://article/670e1dfd-7217-5337-a5b8-0b7a7d...,52
232,Yellow Sea,['nyt://article/593fcdd3-0e07-58b1-9a5d-eec8cd...,1
233,Yemen,['nyt://article/083f2303-65dd-5db5-b970-cf1755...,73
234,Zambia,['nyt://article/ce946de8-b1a7-58a9-9235-321a9b...,13


In [None]:
# Drop the "Unavailable" placeholder row, which has no articles.
# The row is selected by name rather than by the hard-coded label 228: this
# notebook saves country_coverage back to the CSV it was loaded from (without
# the index), so on a later run label 228 can belong to a different row, and
# re-running the cell after the drop would raise KeyError.
# NOTE(review): assumes the placeholder's "country" value is exactly
# "Unavailable" — confirm against the data.
unavailable_rows = country_coverage.index[country_coverage["country"] == "Unavailable"]
country_coverage.drop(index=unavailable_rows, inplace=True)

## Standardization of names

In [415]:
# Standardise NYT place names: old spelling -> spelling used in this notebook.
nyt_name_fixes = {
    "Congo-Brazzaville": "Republic of the Congo",
    "Palestinian Territory": "Palestine",
    "Côte d'Ivoire": "Ivory Coast",
    "The Gambia": "Gambia",
    "Czechia": "Czech Republic",
    "Congo, The Democratic Republic of the": "The Democratic Republic of the Congo",
    "Türkiye": "Turkey",
}
for old_name, new_name in nyt_name_fixes.items():
    nyt_country_coverage.loc[nyt_country_coverage["country"] == old_name, "country"] = new_name

# Set the iso_alpha3 code for the Republic of the Congo explicitly
# (this also creates the iso_alpha3 column if it does not exist yet).
is_congo_brazzaville = nyt_country_coverage["country"] == "Republic of the Congo"
nyt_country_coverage.loc[is_congo_brazzaville, "iso_alpha3"] = "COG"

In [425]:
# NYT coverage with the "United States" rows left out.
nyt = nyt_country_coverage[nyt_country_coverage["country"].ne("United States")]

In [None]:
# World map of NYT article counts per country (`nyt` excludes the United States).
# `nyt` is derived from nyt_country_coverage, whose count column is named
# "count_of_articles"; that frame has no "count_of_articles_nyt" column.
# NOTE(review): in this notebook iso_alpha3 is only set explicitly for the
# Republic of the Congo — confirm the other codes are populated, otherwise
# most countries will not be drawn.
fig = px.choropleth(
    nyt,
    locations="iso_alpha3",
    color="count_of_articles",
    color_continuous_scale="Reds",
    range_color=(0, 1000),
    locationmode="ISO-3",
)
fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})

fig.show()

In [None]:
# Combined coverage table with the "Germany" rows left out.
zeit = country_coverage[country_coverage["country"].ne("Germany")]

In [None]:
# World map of Zeit article counts per country (`zeit` excludes Germany).
zeit_map_options = dict(
    locations='iso_alpha3',
    color='count_of_articles_zeit',
    color_continuous_scale="Reds",
    range_color=(1, 1000),
    locationmode="ISO-3",
)
fig = px.choropleth(zeit, **zeit_map_options)

# Remove the figure margins so the map fills the output area.
fig.update_layout(margin=dict(r=0, t=0, l=0, b=0))
fig.show()

## Calculate each outlet's percentage of coverage and the difference between them (middle point)

In [None]:
# Zero out Germany's Zeit article count before the percentages are computed.
is_germany = country_coverage["country"] == "Germany"
country_coverage.loc[is_germany, "count_of_articles_zeit"] = 0

In [426]:
# Zero out the United States' NYT article count before the percentages are computed.
is_united_states = nyt_country_coverage["country"] == "United States"
nyt_country_coverage.loc[is_united_states, "count_of_articles"] = 0

In [427]:
nyt_country_coverage

Unnamed: 0,country,ids_of_articles,count_of_articles,iso_alpha3
0,Adriatic Sea,['nyt://article/6c5919fe-452a-532d-8234-f698a3...,1,
1,Aegean Sea,['nyt://interactive/6addaf57-b7fa-5d0f-af24-ec...,3,
2,Afghanistan,['nyt://article/1bf3d7c8-419e-52d7-a720-c4e054...,112,
3,Africa,['nyt://article/34e514ce-6ebc-58f1-bf4b-d750ea...,201,
4,Albania,['nyt://article/59eb4293-7d0a-558f-9c26-d85c3f...,23,
...,...,...,...,...
231,Vietnam,['nyt://article/670e1dfd-7217-5337-a5b8-0b7a7d...,52,
232,Yellow Sea,['nyt://article/593fcdd3-0e07-58b1-9a5d-eec8cd...,1,
233,Yemen,['nyt://article/083f2303-65dd-5db5-b970-cf1755...,73,
234,Zambia,['nyt://article/ce946de8-b1a7-58a9-9235-321a9b...,13,


In [428]:
# Share of all NYT articles that each place receives, in percent.
# The column total is computed once and the division is vectorised; a per-row
# lambda that calls .sum() re-adds the whole column for every row.
nyt_total_articles = nyt_country_coverage["count_of_articles"].sum()
nyt_country_coverage["perc_of_coverage"] = nyt_country_coverage["count_of_articles"] / nyt_total_articles * 100

In [None]:
# Share of all Zeit articles that each country receives, in percent.
# The column total is computed once and the division is vectorised; a per-row
# lambda that calls .sum() re-adds the whole column for every row.
zeit_total_articles = country_coverage["count_of_articles_zeit"].sum()
country_coverage["perc_of_coverage_zeit"] = country_coverage["count_of_articles_zeit"] / zeit_total_articles * 100

In [None]:
# Difference between the two coverage shares: NYT share minus Zeit share.
country_coverage["middle_point"] = country_coverage["perc_of_coverage_nyt"].sub(
    country_coverage["perc_of_coverage_zeit"]
)

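`middle_point` is the NYT share minus the Zeit share: negative values mean a country receives a larger share of coverage in Zeit, positive values mean a larger share in the NYT.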

In [None]:
country_coverage

## Assign countries to one or the other outlet

In [None]:
# Countries with a negative middle_point (larger coverage share in Zeit),
# reduced to the columns used downstream.
zeit_leads = country_coverage["middle_point"] < 0
covered_by_zeit = country_coverage.loc[
    zeit_leads,
    ["country", "iso_alpha3", "middle_point", "count_of_articles_zeit", "count_of_articles_nyt"],
]

In [None]:
covered_by_zeit

In [None]:
# Countries with a positive middle_point (larger coverage share in the NYT),
# reduced to the columns used downstream.
nyt_leads = country_coverage["middle_point"] > 0
covered_by_nyt = country_coverage.loc[
    nyt_leads,
    ["country", "iso_alpha3", "middle_point", "count_of_articles_nyt", "count_of_articles_zeit"],
]

### Load continental outline to identify regions

In [429]:
# One text file per region; each is read with the single column name "Country".
entity_files = (
    '../../input-data/utilities/europe.txt',
    '../../input-data/utilities/asia.txt',
    '../../input-data/utilities/africa.txt',
    '../../input-data/utilities/north-america.txt',
    '../../input-data/utilities/south-america.txt',
    '../../input-data/utilities/oceania.txt',
    '../../input-data/utilities/antarctica.txt',
    '../../input-data/utilities/other.txt',
)
(
    european_entities,
    asia_entities,
    africa_entities,
    n_america_entities,
    s_america_entities,
    oceania_entities,
    ant_entities,
    other_entities,
) = (pd.read_csv(entity_path, names=['Country']) for entity_path in entity_files)


In [430]:
# Tag every entity list with the name of the region it represents.
region_labels = (
    (european_entities, "Europe"),
    (asia_entities, "Asia"),
    (africa_entities, "Africa"),
    (n_america_entities, "North America"),
    (s_america_entities, "South America"),
    (oceania_entities, "Oceania"),
    (ant_entities, "Antarctica"),
    (other_entities, "Other"),
)
for entities, region_name in region_labels:
    entities["Region"] = region_name

In [431]:
# Stack all region lists into one lookup table, rename the key column to match
# the coverage frames, and trim surrounding whitespace from every string cell.
# The index is not reset, so row labels repeat across the stacked lists.
regions_and_countries = (
    pd.concat(
        [
            european_entities,
            asia_entities,
            africa_entities,
            n_america_entities,
            s_america_entities,
            oceania_entities,
            other_entities,
            ant_entities,
        ]
    )
    .rename(columns={"Country": "country"})
    .map(lambda cell: cell.strip() if isinstance(cell, str) else cell)
)

In [434]:
regions_and_countries

Unnamed: 0,country,Region
0,Albania,Europe
1,Andorra,Europe
2,Austria,Europe
3,Belarus,Europe
4,Belgium,Europe
...,...,...
4,Caspian Sea,Other
5,Pacific Ocean,Other
6,Red Sea,Other
0,Antarctica,Antarctica


In [None]:
# Attach a region to every country in covered_by_zeit.
regions_zeit = covered_by_zeit.merge(regions_and_countries, how="left", left_on="country", right_on="country")

# NOTE(review): rows 83 and 40 are removed by hard-coded index label and the
# reason is not recorded here (possibly duplicate matches from the merge) —
# confirm these labels are still the intended rows whenever the inputs change.
regions_zeit = regions_zeit.drop(index=[83, 40])

# Tag the outlet and expose the Zeit article count under the generic "count" name.
regions_zeit = regions_zeit.assign(
    outlet="Zeit",
    count=lambda frame: frame["count_of_articles_zeit"],
)

regions_zeit = regions_zeit[
    ['country', 'iso_alpha3', 'middle_point', "count", 'count_of_articles_zeit', 'count_of_articles_nyt', 'Region', 'outlet']
]

In [459]:
# Attach a region to every NYT place, tag the outlet, and expose the article
# count under the generic "count" name.
regions_nyt = (
    nyt_country_coverage
    .merge(regions_and_countries, how="left", left_on="country", right_on="country")
    .assign(
        outlet="NYT",
        count=lambda frame: frame["count_of_articles"],
    )
)

In [462]:
regions_nyt

Unnamed: 0,country,ids_of_articles,count_of_articles,iso_alpha3,perc_of_coverage,Region,outlet,count
0,Adriatic Sea,['nyt://article/6c5919fe-452a-532d-8234-f698a3...,1,,0.005381,Europe,NYT,1
1,Aegean Sea,['nyt://interactive/6addaf57-b7fa-5d0f-af24-ec...,3,,0.016142,Europe,NYT,3
2,Afghanistan,['nyt://article/1bf3d7c8-419e-52d7-a720-c4e054...,112,,0.602637,Asia,NYT,112
3,Africa,['nyt://article/34e514ce-6ebc-58f1-bf4b-d750ea...,201,,1.081517,Africa,NYT,201
4,Albania,['nyt://article/59eb4293-7d0a-558f-9c26-d85c3f...,23,,0.123756,Europe,NYT,23
...,...,...,...,...,...,...,...,...
240,Vietnam,['nyt://article/670e1dfd-7217-5337-a5b8-0b7a7d...,52,,0.279796,Asia,NYT,52
241,Yellow Sea,['nyt://article/593fcdd3-0e07-58b1-9a5d-eec8cd...,1,,0.005381,Asia,NYT,1
242,Yemen,['nyt://article/083f2303-65dd-5db5-b970-cf1755...,73,,0.392790,Asia,NYT,73
243,Zambia,['nyt://article/ce946de8-b1a7-58a9-9235-321a9b...,13,,0.069949,Africa,NYT,13


In [463]:
# Keep only the first row for each country; the region merge produced more rows
# than the NYT table had (presumably names listed under more than one region).
is_repeat = regions_nyt.duplicated(subset=["country"])
regions_nyt = regions_nyt[~is_repeat]

In [464]:
regions_nyt

Unnamed: 0,country,ids_of_articles,count_of_articles,iso_alpha3,perc_of_coverage,Region,outlet,count
0,Adriatic Sea,['nyt://article/6c5919fe-452a-532d-8234-f698a3...,1,,0.005381,Europe,NYT,1
1,Aegean Sea,['nyt://interactive/6addaf57-b7fa-5d0f-af24-ec...,3,,0.016142,Europe,NYT,3
2,Afghanistan,['nyt://article/1bf3d7c8-419e-52d7-a720-c4e054...,112,,0.602637,Asia,NYT,112
3,Africa,['nyt://article/34e514ce-6ebc-58f1-bf4b-d750ea...,201,,1.081517,Africa,NYT,201
4,Albania,['nyt://article/59eb4293-7d0a-558f-9c26-d85c3f...,23,,0.123756,Europe,NYT,23
...,...,...,...,...,...,...,...,...
240,Vietnam,['nyt://article/670e1dfd-7217-5337-a5b8-0b7a7d...,52,,0.279796,Asia,NYT,52
241,Yellow Sea,['nyt://article/593fcdd3-0e07-58b1-9a5d-eec8cd...,1,,0.005381,Asia,NYT,1
242,Yemen,['nyt://article/083f2303-65dd-5db5-b970-cf1755...,73,,0.392790,Asia,NYT,73
243,Zambia,['nyt://article/ce946de8-b1a7-58a9-9235-321a9b...,13,,0.069949,Africa,NYT,13


In [475]:
regions_nyt

Unnamed: 0,country,count,Region,outlet
0,Adriatic Sea,1,Europe,NYT
1,Aegean Sea,3,Europe,NYT
2,Afghanistan,112,Asia,NYT
3,Africa,201,Africa,NYT
4,Albania,23,Europe,NYT
...,...,...,...,...
240,Vietnam,52,Asia,NYT
241,Yellow Sea,1,Asia,NYT
242,Yemen,73,Asia,NYT
243,Zambia,13,Africa,NYT


In [491]:
# Total NYT article count per region and each region's share of the total, in percent.
region_summary = regions_nyt.groupby('Region', as_index=False)['count'].sum()
all_regions_total = region_summary['count'].sum()
region_summary['percentage'] = (region_summary['count'] / all_regions_total) * 100
region_summary

Unnamed: 0,Region,count,percentage
0,Africa,962,5.192702
1,Antarctica,4,0.021591
2,Asia,8647,46.674943
3,Europe,6328,34.1574
4,North America,1609,8.685091
5,Oceania,293,1.581561
6,Other,90,0.485804
7,South America,593,3.200907


In [476]:
# Keep only the columns needed for the nested region/country structure.
regions_nyt = regions_nyt.loc[:, ['country', "count", 'Region', 'outlet']]

In [477]:
regions_nyt.columns

Index(['country', 'count', 'Region', 'outlet'], dtype='object')

In [479]:
len(regions_nyt)

236

In [478]:
regions_zeit.columns

Index(['country', 'iso_alpha3', 'middle_point', 'count',
       'count_of_articles_zeit', 'count_of_articles_nyt', 'Region', 'outlet'],
      dtype='object')

In [None]:
# Stack the Zeit and NYT region tables. regions_zeit carries extra columns
# (iso_alpha3, middle_point, per-outlet counts), which are NaN for the NYT rows.
coverage_by_outlet_and_region = pd.concat(objs=[regions_zeit, regions_nyt])

### Create a nested JSON for Voronoi

In [480]:
def _as_records(group):
    """Return the rows of ``group`` as a list of ``{column: value}`` dicts."""
    return group.to_dict('records')


# Level 1: for each (outlet, Region) pair, collect its countries and counts
# as a list of records stored in a "children" column.
out = (
    regions_nyt.groupby(["outlet", "Region"])[['country', 'count']]
    .apply(_as_records)
    .reset_index(name='children')
)

# Level 2: for each outlet, collect its regions (each carrying its own
# "children" list) as a list of records, again stored under "children".
out = (
    out.groupby(["outlet"])[['Region', 'children']]
    .apply(_as_records)
    .reset_index(name='children')
)

In [482]:
# Round-trip through JSON to turn the nested frame into plain lists and dicts.
# The text produced by to_json is JSON, not a Python literal, so it is parsed
# with json.loads: ast.literal_eval raises on JSON's null/true/false and does
# not decode JSON-only escapes such as "\/" correctly.
out = json.loads(out.to_json(orient="records"))

In [484]:
out

[{'outlet': 'NYT',
  'children': [{'Region': 'Africa',
    'children': [{'country': 'Africa', 'count': 201},
     {'country': 'Algeria', 'count': 12},
     {'country': 'Angola', 'count': 10},
     {'country': 'Benin', 'count': 3},
     {'country': 'Botswana', 'count': 6},
     {'country': 'Burkina Faso', 'count': 8},
     {'country': 'Burundi', 'count': 2},
     {'country': 'Cameroon', 'count': 6},
     {'country': 'Cape Verde', 'count': 2},
     {'country': 'Central African Republic', 'count': 1},
     {'country': 'Chad', 'count': 23},
     {'country': 'Comoros', 'count': 9},
     {'country': 'Republic of the Congo', 'count': 1},
     {'country': 'Ivory Coast', 'count': 8},
     {'country': 'Djibouti', 'count': 1},
     {'country': 'Equatorial Guinea', 'count': 1},
     {'country': 'Eritrea', 'count': 3},
     {'country': 'Eswatini', 'count': 1},
     {'country': 'Ethiopia', 'count': 20},
     {'country': 'Gabon', 'count': 2},
     {'country': 'Ghana', 'count': 19},
     {'country': '

In [485]:
# Wrap the per-outlet hierarchy in a single root node named "coverage".
final_shape = dict(name="coverage", children=out)

In [486]:
final_shape

{'name': 'coverage',
 'children': [{'outlet': 'NYT',
   'children': [{'Region': 'Africa',
     'children': [{'country': 'Africa', 'count': 201},
      {'country': 'Algeria', 'count': 12},
      {'country': 'Angola', 'count': 10},
      {'country': 'Benin', 'count': 3},
      {'country': 'Botswana', 'count': 6},
      {'country': 'Burkina Faso', 'count': 8},
      {'country': 'Burundi', 'count': 2},
      {'country': 'Cameroon', 'count': 6},
      {'country': 'Cape Verde', 'count': 2},
      {'country': 'Central African Republic', 'count': 1},
      {'country': 'Chad', 'count': 23},
      {'country': 'Comoros', 'count': 9},
      {'country': 'Republic of the Congo', 'count': 1},
      {'country': 'Ivory Coast', 'count': 8},
      {'country': 'Djibouti', 'count': 1},
      {'country': 'Equatorial Guinea', 'count': 1},
      {'country': 'Eritrea', 'count': 3},
      {'country': 'Eswatini', 'count': 1},
      {'country': 'Ethiopia', 'count': 20},
      {'country': 'Gabon', 'count': 2},
   

In [488]:
import json

# Write the nested outlet -> region -> country structure to disk.
# NOTE(review): this serialises `out` (the bare list), not `final_shape`
# (the same list wrapped in the "coverage" root) — confirm which one the
# consumer of this file expects.
serialised_regions = json.dumps(out)
with open("./local/voronoi/nyt_coverage_by_region.json", "w") as region_file:
    region_file.write(serialised_regions)

In [None]:
# World map of middle_point on a diverging colour scale, clipped to +/-0.01.
delta_map_options = dict(
    locations='iso_alpha3',
    color='middle_point',
    color_continuous_scale="PiYG",
    range_color=(-0.01, 0.01),
    locationmode="ISO-3",
)
fig = px.choropleth(country_coverage, **delta_map_options)

# Remove the figure margins so the map fills the output area.
fig.update_layout(margin=dict(r=0, t=0, l=0, b=0))

fig.show()

In [None]:
# Save the enriched table. This overwrites the CSV that country_coverage was
# loaded from, and the row index is not written.
country_coverage.to_csv(path_or_buf="../../data/places/coverage_by_country.csv", index=False)

## Retrieve related categories for each location

### NYT

In [None]:
# Article-level data used for the NYT keyword lookup below (read for its "_id" and "keywords" columns).
full_year = pd.read_csv(filepath_or_buffer="../../input-data/temp-data.csv")

In [None]:
country_coverage

In [None]:
# Article-level data used for the Zeit keyword lookup below (read for its "uri", "date", "url" and "keywords" columns).
zeit_full_year = pd.read_csv(filepath_or_buffer="../../input-data/zeit-temp-data.csv")

In [None]:
zeit_full_year

In [None]:
# The keywords column is read from CSV as text; parse each cell back into a Python object.
full_year['keywords'] = full_year['keywords'].map(ast.literal_eval)

In [None]:
# Keep only the articles that have keywords. .copy() makes the filtered frame
# independent, so the column assignment below does not write into a slice of
# the original frame (which triggers SettingWithCopyWarning).
zeit_full_year = zeit_full_year[zeit_full_year['keywords'].notna()].copy()

# Parse the keyword cells read from CSV as text. Values that are no longer
# strings are left alone, so re-running this cell does not raise.
zeit_full_year['keywords'] = zeit_full_year['keywords'].apply(
    lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw
)

In [None]:
# One row per (article, keyword); the id column is renamed from "uri" to "_id".
zeit_full_year_exploded = (
    zeit_full_year
    .explode("keywords")
    .rename(columns={"uri": "_id"})
)

In [None]:
zeit_full_year_exploded

In [None]:
# Keep only the columns needed to look up keywords per article.
# A second in-place rename of "uri" to "_id" used to follow this selection; it
# is removed because the selected columns contain no "uri", so it changed
# nothing while mutating a column subset in place. .copy() keeps the result
# independent of the wider frame.
zeit_full_year_exploded = zeit_full_year_exploded[["_id", "date", "url", "keywords"]].copy()

In [None]:
zeit_full_year_exploded

In [None]:
# One row per (article, keyword) for the NYT data.
full_year_exploded = full_year.explode("keywords")

# The article-id lists are read from CSV as text; parse them into Python lists.
for ids_column in ("ids_of_articles_nyt", "ids_of_articles_zeit"):
    country_coverage[ids_column] = country_coverage[ids_column].apply(ast.literal_eval)

# Distinct country names, in order of first appearance.
countries = country_coverage["country"].unique()

In [None]:
# Build one {keyword value: occurrence count} dict per entry of `countries`
# (same order), counting across all NYT articles listed for that country.
#
# Changes from the earlier loop:
# - counts are plain ints; they used to be wrapped in one-element sets
#   ({count}), which breaks the later sort by count;
# - Counter replaces list.count() per keyword, which was quadratic;
# - keyword rows are grouped by article id once instead of scanning the whole
#   frame for every id;
# - non-dict keyword cells are skipped instead of failing on .get
#   (explode() yields NaN for an article whose keyword list is empty).

# article id -> list of its keyword entries, in frame order
nyt_keywords_by_article = full_year_exploded.groupby("_id")["keywords"].agg(list).to_dict()

list_of_keywords_dicts = []
for country in countries:
    # First matching row's list of NYT article ids for this country.
    nyt_ids = country_coverage.loc[country_coverage["country"] == country, "ids_of_articles_nyt"].iloc[0]

    keyword_counts = Counter(
        keyword.get("value")
        for article_id in nyt_ids
        for keyword in nyt_keywords_by_article.get(article_id, [])
        if isinstance(keyword, dict)
    )

    # Store a plain dict (keys in first-occurrence order).
    list_of_keywords_dicts.append(dict(keyword_counts))
    print(len(list_of_keywords_dicts))

In [None]:
# Store the per-country NYT keyword counts as a new column. The list is assigned
# positionally, so it must hold exactly one dict per row, in row order
# (it was built by iterating `countries`).
country_coverage["keywords_nyt"] = list_of_keywords_dicts

In [None]:
country_coverage

In [None]:
country_coverage

In [None]:
zeit_full_year_exploded

In [None]:
# Build one {keyword: occurrence count} dict per entry of `countries`
# (same order), counting across all Zeit articles listed for that country.
#
# Changes from the earlier loop:
# - Counter replaces list.count() per keyword, which was quadratic;
# - keyword rows are grouped by article id once instead of scanning the whole
#   frame for every id;
# - the loop variables no longer shadow the builtin `id`;
# - the debug print of each country's raw id array is removed, leaving one
#   compact progress line per country.

# article id -> list of its keywords, in frame order
zeit_keywords_by_article = zeit_full_year_exploded.groupby("_id")["keywords"].agg(list).to_dict()

list_of_zeit_keywords_dicts = []
for country in countries:
    # First matching row's list of Zeit article ids for this country.
    zeit_ids = country_coverage.loc[country_coverage["country"] == country, "ids_of_articles_zeit"].iloc[0]

    keyword_counts = Counter(
        keyword
        for article_id in zeit_ids
        for keyword in zeit_keywords_by_article.get(article_id, [])
    )

    # Store a plain dict (keys in first-occurrence order).
    list_of_zeit_keywords_dicts.append(dict(keyword_counts))
    print(country, "- distinct keywords:", len(keyword_counts), "- countries done:", len(list_of_zeit_keywords_dicts))

In [None]:
# Store the per-country Zeit keyword counts as a new column. The list is assigned
# positionally, so it must hold exactly one dict per row, in row order
# (it was built by iterating `countries`).
country_coverage["keywords_zeit"] = list_of_zeit_keywords_dicts
# Display the updated frame.
country_coverage

In [None]:
def _by_count_desc(keyword_counts):
    """Return a copy of ``keyword_counts`` ordered by value, largest first."""
    ranked_items = sorted(keyword_counts.items(), key=lambda item: item[1], reverse=True)
    return dict(ranked_items)


# Order each country's keyword dict so the most frequent keywords come first.
for keywords_column in ("keywords_zeit", "keywords_nyt"):
    country_coverage[keywords_column] = country_coverage[keywords_column].apply(_by_count_desc)

In [None]:
# Save the table including the keyword columns. This overwrites the CSV that
# country_coverage was loaded from, and the row index is not written.
country_coverage.to_csv(path_or_buf="../../data/places/coverage_by_country.csv", index=False)