# Monkeypox

Mit diesem Skript lassen sich die Daten für den Artikel zu den Affenpocken manuell aufbereiten. Das automatisierte Skript ist auf GitHub unter st-methods/monkeypox abgelegt. <br>
LD-Artikel: https://edit.nzz.ch/p/nzz/articles/1685302/edit/canvas

Quelle Github: https://github.com/globaldothealth/monkeypox


In [None]:
import pandas as pd
import webbrowser
import pycountry
import gettext
from datetime import datetime
import os
import numpy as np

In [None]:
# Load pycountry's German catalogue for ISO 3166 country names and install
# it, so that `_()` is available for translating names further down.
german = gettext.translation(
    'iso3166',
    localedir=pycountry.LOCALES_DIR,
    languages=['de'],
)
german.install()

## DF einlesen

In [None]:
# Download the Google Sheet as CSV and load it as the raw case list.
sheet_id = '1CEBhao3rMe-qtCbAgJTn5ZKQMRFWeAeaiXFpBY3gbHE'
url = "https://docs.google.com/spreadsheets/d/{}/gviz/tq?tqx=out:csv".format(sheet_id)
df_roh = pd.read_csv(url)

In [None]:
# Pivot wider: one row per country, one column per case status.
df = df_roh.groupby(['Country', 'Status'])['ID'].count().unstack().reset_index()
df = df.rename(columns={'confirmed': 'Bestätigt', 'suspected': 'Verdacht'})

# Discarded cases are not reported. errors='ignore' keeps the cell working
# when the sheet currently contains no discarded case at all.
df = df.drop(columns='discarded', errors='ignore')

# Add a totals column over all remaining status columns (NaN counts as 0).
df['Total'] = df.iloc[:, 1:].sum(axis=1)

# Keep only countries that have at least one remaining case.
df = df[df['Total'] != 0].reset_index(drop=True)

In [None]:

# Format counts as integers: a missing country/status combination is 0.
df = df.fillna(0)
# Cast via astype on the frame itself. Assigning through df.iloc[:, 1:] sets
# the values in place and can leave the columns as float.
df = df.astype({col: int for col in df.columns[1:]})
df['Country'] = df['Country'].str.strip()

In [None]:
# sum up UK and France
def sum_up_countries(df, subcountries, new_name):
    """Collapse several country rows into one summed row.

    Args:
        df: DataFrame with a 'Country' column; the other columns are summed.
        subcountries: Country names whose rows are merged.
        new_name: Value of 'Country' for the merged row.

    Returns:
        A new DataFrame with a fresh 0..n-1 index in which the matching rows
        are removed and one summed row is appended at the end. If no row
        matches, the rows are returned unchanged (no all-zero row is added).
        The input frame is not modified.
    """
    is_part = df['Country'].isin(subcountries)
    if not is_part.any():
        return df.reset_index(drop=True)

    # Give all matching rows the same name and let groupby add them up; this
    # keeps the dtype of every count column and does not depend on the index.
    merged = (
        df.loc[is_part]
        .assign(Country=new_name)
        .groupby('Country', as_index=False, sort=False)
        .sum()
        .reindex(columns=df.columns)
    )
    return pd.concat([df.loc[~is_part], merged], ignore_index=True)
    
# Fold the listed entries into one row per parent country (UK first, then
# France).
merge_groups = {
    'United Kingdom': ['England', 'Scotland', 'Wales', 'Northern Ireland', 'United Kingdom'],
    'France': ['France', 'French Guiana'],
}
for parent, members in merge_groups.items():
    df = sum_up_countries(df, members, parent)

In [None]:
df = df.sort_values('Total', ascending=False)

# Replace country names with the spellings used downstream. Whole values are
# matched (no substring or regex replacement), so running the cell twice
# cannot turn 'Iran, Islamic Republic of' into a doubled name.
df['Country'] = df['Country'].replace({
    'Czech Republic': 'Czechia',
    'Iran': 'Iran, Islamic Republic of',
})


In [None]:
# Temporarily exclude Kosovo from the table.
is_kosovo = df['Country'].eq('Kosovo')
df = df[~is_kosovo]

In [None]:
ignore_lst_pycountry = ['Kosovo']

# Collect every country name that pycountry knows neither as `name` nor as
# `common_name`, so that all problems are reported in one go.
unknown_countries = [
    name
    for name in df['Country'].to_list()
    if name not in ignore_lst_pycountry
    and pycountry.countries.get(name=name) is None
    and pycountry.countries.get(common_name=name) is None
]
if unknown_countries:
    raise ValueError(
        'Countries not recognised by pycountry: '
        + ', '.join(str(name) for name in unknown_countries)
    )

In [None]:
#df

### Deutsche Ländernamen, Flaggen

In [None]:
# get country object, using common name if there is one
def get_common_name(name):
    """Look up the pycountry country object for an English country name.

    The lookup by ``common_name`` is tried first. If it finds nothing, the
    result of the lookup by ``name`` is returned instead, whatever that is
    (presumably ``None`` for names pycountry does not know).
    """
    country = pycountry.countries.get(common_name=name)
    if country is None:
        country = pycountry.countries.get(name=name)
    return country

# return german name
def get_german_name(name):
    """Return the German translation of an English country name.

    The English string handed to the translator is the field that matched
    the input: ``common_name`` if the input is a known common name, otherwise
    ``name``. ``_`` is the translation function installed via gettext.
    """
    country = get_common_name(name)
    if pycountry.countries.get(common_name=name) is None:
        english_name = country.name
    else:
        english_name = country.common_name
    return _(english_name)

# return flag
def get_flag(name):
    """Return the ``flag`` attribute of the country matching ``name``."""
    return get_common_name(name).flag


In [None]:
df['Land'] = df['Country'].apply(get_german_name)
df['Flagge'] = df['Country'].apply(get_flag)

# Turn Iran's long name back into the short form (whole-value match; one
# statement is enough).
df['Country'] = df['Country'].replace({'Iran, Islamic Republic of': 'Iran'})

df = df[['Country', 'Land', 'Flagge', 'Bestätigt', 'Verdacht', 'Total']]

In [None]:
#df

## Vergleichen, Version abspeichern

In [None]:
# List the saved snapshots. Only .csv files are kept so that other directory
# entries cannot end up as the "latest version". The snapshots are written
# with a %Y%m%d-%Hh%M timestamp as file name, so sorting by name is
# chronological.
past_csvs = sorted(
    entry for entry in os.listdir('data') if entry.endswith('.csv')
)
print(past_csvs)

In [None]:
# Load the most recent snapshot (last file name in sort order).
latest_csv = past_csvs[-1]
df_last_version = pd.read_csv(f"data/{latest_csv}")

In [None]:
# Compare the overall case total with the previously saved snapshot.
new_total = df['Total'].sum()
old_total = df_last_version['Total'].sum()
print('old == new:', str(new_total == old_total))
print('new total:', str(new_total))
print('old total:', str(old_total))

In [None]:
# Save the current table as a new snapshot named after the current time.
time_now = datetime.now().strftime("%Y%m%d-%Hh%M")
df.to_csv(f"data/{time_now}.csv", index=False)

## Export für Weltkarte
https://q.st.nzz.ch/editor/choropleth/4acf1a0fd4dd89aef4abaeefd0b6f4dc

In [None]:
ids = pd.read_csv('country_ids.csv')

df_worldmap = df.copy()
#df_worldmap = df_worldmap[df_worldmap['Country']!='Malta']

# Map country names to the spellings used in the ID list. Whole values are
# matched, so names that merely contain one of the keys stay untouched.
df_worldmap['Country'] = df_worldmap['Country'].replace({
    'United States': 'United States of America',
    'Bahamas': 'The Bahamas',
})
df_worldmap = df_worldmap.rename(columns={'Country': 'ID', 'Total': 'Wert'})

# Left-join the values onto the ID list (joined on the shared column names);
# IDs without a matching country get NaN as value.
df_worldmap = ids.merge(df_worldmap[['ID', 'Wert']], how='left').sort_values('Wert', ascending=False)

In [None]:
# Check whether every country was matched to a world map ID: the last line
# prints True if the sums agree.
ignore_lst = ['Cayman Islands', 'Gibraltar']
is_counted = ~df['Country'].isin(ignore_lst)
total_df = df.loc[is_counted, 'Total'].sum()
sum_worldmap = df_worldmap['Wert'].sum()

print('Sum Worldmap:', str(sum_worldmap))
# ok if it's not Malta
print('Sum DF:', str(total_df))
print('Corresponds with sum df:', str(sum_worldmap == total_df))

In [None]:
# Sort case-insensitively by ID and copy the result to the clipboard for
# pasting into the Q element.
df_worldmap = df_worldmap.sort_values(
    by='ID',
    key=lambda id_col: id_col.str.lower(),
)
df_worldmap.to_clipboard(index=False)
#webbrowser.open('https://q.st.nzz.ch/editor/choropleth/4acf1a0fd4dd89aef4abaeefd0b6f4dc', new=2)

### Differenzen finden

In [None]:
# Countries with exactly one case in the table, sorted by name.
df.loc[df['Total'].eq(1)].sort_values(by='Country')


In [None]:
# World map entries with exactly one case, sorted by ID.
df_worldmap.loc[df_worldmap['Wert'].eq(1)].sort_values(by='ID')

## Export für Tabelle

https://q.st.nzz.ch/editor/table/4acf1a0fd4dd89aef4abaeefd0da5ac6

In [None]:
# Table export: German name and flag come first and get empty headers.
table_columns = ['Land', 'Flagge', 'Bestätigt', 'Verdacht', 'Total']
blank_headers = {'Land': '', 'Flagge': ''}
df_q_table = df[table_columns].rename(columns=blank_headers)
#df_q_table

In [None]:
# Copy the table (without the index) to the clipboard for pasting into the
# Q table element.
df_q_table.to_clipboard(index=False)

#webbrowser.open('https://q.st.nzz.ch/editor/table/4acf1a0fd4dd89aef4abaeefd0da5ac6')

## Zahlen für LD-Artikel

In [None]:
# Total number of cases (including unofficial countries)
df['Total'].sum()

## Export Geschlechtergrafik [inaktiv]
https://q.st.nzz.ch/editor/chart/4913f749b598fb2ecc9721cb17e708e9

In [None]:
# Map the raw gender values to the German chart labels. Both capitalisations
# are covered for each gender; missing values become 'Keine Daten'.
gender_dict = {
    'Male': 'Männer',
    'male': 'Männer',
    'Female': 'Frauen',
    'female': 'Frauen',
    np.nan: 'Keine Daten',
}

In [None]:
df_gender = df_roh.copy()
df_gender['Gender'] = df_gender['Gender'].replace(gender_dict)

# Count confirmed cases per gender label; dropna=False keeps a group for
# values that are still missing after the mapping.
df_q_gender = df_gender[df_gender['Status'] == 'confirmed'].groupby(['Gender'], dropna=False, as_index=False).size()

# Transpose into a single 'size' row with one column per gender label. The
# columns keep their original names here because the 'Gender' row is
# addressed by that label below.
df_q_gender = df_q_gender.T
df_q_gender.columns = df_q_gender.iloc[0]
df_q_gender.drop('Gender', inplace=True)

# Fixed column order for the chart. A category without any confirmed case
# is filled with 0 instead of raising a KeyError.
df_q_gender = df_q_gender.reindex(columns=['Männer', 'Keine Daten', 'Frauen'], fill_value=0)

In [None]:
# Copy the gender table (including its index) to the clipboard for pasting
# into the Q chart element.
df_q_gender.to_clipboard()
#webbrowser.open('https://q.st.nzz.ch/editor/chart/4913f749b598fb2ecc9721cb17e708e9')

In [None]:
# Display the gender table for a visual check.
df_q_gender

In [None]:
# Sum across the gender columns (axis=1) to get the number of counted cases.
df_q_gender.sum(axis=1)

## Test: Zeitreihe

In [None]:

# Parse all date columns of the raw case list into datetimes.
for date_col in ('Date_onset', 'Date_confirmation', 'Date_entry', 'Date_last_modified'):
    df_roh[date_col] = pd.to_datetime(df_roh[date_col])

In [None]:
import altair as alt

### Rollender Durchschnitt

1. Summe pro Tag
2. Rollender Schnitt pro Woche

In [None]:
# Number of entries per entry date, as a two-column frame.
entry_day = df_roh['Date_entry'].dt.date
df_cases_per_day = df_roh.groupby(entry_day).size().reset_index()
df_cases_per_day.columns = ['Date_entry', 'Cases']
# Grouping by .dt.date yields plain dates; convert them back to datetimes.
df_cases_per_day['Date_entry'] = pd.to_datetime(
    df_cases_per_day['Date_entry'],
    format='%Y-%m-%d',
)

In [None]:
# Rolling mean over 7 rows (at least 3 values required).
# NOTE(review): the window counts rows, not calendar days. Dates without any
# entry have no row, so the window may span more than a week - confirm.
seven_row_window = df_cases_per_day['Cases'].rolling(7, min_periods=3)
df_cases_per_day['Rolling_mean'] = seven_row_window.mean()
df_cases_per_day

In [None]:
# Shared chart size in 16:9 format.
width = 500
height = width * (9/16)

# Line chart of the rolling mean over the entry date.
line = (
    alt.Chart(df_cases_per_day)
    .mark_line()
    .encode(
        x='Date_entry:T',
        y='Rolling_mean',
        color=alt.value('#86341b'),
    )
    .properties(
        width=width,
        height=height,
        title='Anzahl Fälle nach Meldedatum',
    )
)

In [None]:
# Stacked bars of confirmed and suspected cases per entry date.
reported_cases = df_roh[df_roh['Status'].isin(['confirmed', 'suspected'])]
status_colors = alt.Scale(
    domain=['confirmed', 'suspected'],
    range=['#e66e4a', '#e7a18d'],
)
# Sort the segments of the bars by status.
segment_order = alt.Order('Status', sort='ascending')

bars = (
    alt.Chart(reported_cases)
    .mark_bar(size=10)
    .encode(
        x=alt.X('Date_entry:T'),
        y=alt.Y('count()'),
        color=alt.Color('Status', scale=status_colors),
        order=segment_order,
    )
    .properties(
        width=width,
        height=height,
        title='Anzahl Fälle nach Meldedatum',
    )
)

In [None]:
# Combine the bar chart and the rolling-mean line into one chart.
bars + line

## Debugging

* Replace `ISO3` with the code of the country that is not recognised.
* Then copy the name and use it to replace the current country name at line 35 in the Python script.

### Länder finden für Pycountry

In [None]:

# ISO 3166 alpha-3 code of the country to look up in pycountry.
ISO3 = 'XXK' 

pycountry.countries.get(alpha_3=ISO3)

In [None]:
# Example: show the common name pycountry stores for the alpha-3 code 'BOL'.
pycountry.countries.get(alpha_3='BOL').common_name

In [None]:
# Example: look up a country by its `name` field.
pycountry.countries.get(name='Mexico')

### Länder finden für Worldmap

In [None]:
# Search the ID list for entries whose ID contains the given text.
ids[ids['ID'].str.contains('Kosovo')]

In [None]:
# Show the world map rows whose ID contains the given text.
df_worldmap[df_worldmap['ID'].str.contains('Malta')]

In [None]:
# Show the world map rows for a hand-picked list of countries.
# NOTE(review): the name suggests these are countries with a single case -
# confirm.
one_value = ['Mexico', 'Ecuador', 'Pakistan', 'Bolivia','Malta', 'Sudan', 'Austria', 'Finland']
df_worldmap[df_worldmap['ID'].isin(one_value)]