In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import textwrap
import json
import os
import plotly.express as px
import plotly.graph_objects as go
import folium as fl
import statsmodels.api as sm
from scipy.stats import pearsonr, spearmanr

# Exercise 1

In [None]:
# Load the raw inputs; the weather and corona tables are tab-separated.
data_weather_1 = pd.read_csv('../data/raw/weather/weather.csv', sep = "\t")
data_weather_2 = pd.read_csv('../data/raw/weather/weather2.csv', sep = "\t")
data_corona_de = pd.read_csv("../data/raw/corona/de_corona.csv", sep = "\t")
# Decode the JSON files explicitly as UTF-8. Without `encoding`, open() falls
# back to the platform's locale encoding, which garbles non-ASCII characters
# (e.g. "ü" becomes "Ã¼") on systems whose locale is not UTF-8.
with open("../data/raw/metadata/de_metadata.json", encoding="utf-8") as f:
    data_meta_de = json.load(f)
with open("../data/raw/shapefiles/de.geojson", encoding="utf-8") as f:
    data_geo = json.load(f)

In [None]:
# Quick look at the first weather file: its (rows, columns) shape, then the first rows.
print(data_weather_1.shape)
data_weather_1.head()

In [None]:
# Column names of the first weather file, then the distinct values found in
# its "iso3166-2" column.
print(list(data_weather_1.columns))
np.unique(data_weather_1["iso3166-2"])

In [None]:
# Stack both weather files into one table. ignore_index=True gives the result
# a fresh 0..n-1 row index; without it each file keeps its own labels, so the
# combined frame would contain duplicate row labels.
frames = [data_weather_1, data_weather_2]
data_weather = pd.concat(frames, ignore_index=True)
data_weather.shape

Sanity check

In [None]:
# Number of columns in the combined weather table that contain at least one
# missing value (0 means the table is complete).
columns_with_nulls = data_weather.isnull().any()
count = int(columns_with_nulls.sum())
print(count)

Filtering for Germany

In [None]:
# Region codes used to select the German rows (16 entries).
de_codes = ['DE-BB', 'DE-BE', 'DE-BW', 'DE-BY', 'DE-HB', 'DE-HE', 'DE-HH', 'DE-MV', 'DE-NI', 'DE-NW', 
            'DE-RP', 'DE-SH', 'DE-SL', 'DE-SN', 'DE-ST', 'DE-TH']
# Build a one-dimensional row mask from the region column only. Testing the
# whole frame (np.isin(data_weather, ...)) yields a 2-D mask over every cell,
# which selects rows only by accident and would repeat a row if a code ever
# matched in more than one column.
mask = data_weather["iso3166-2"].isin(de_codes)
data_weather_de = data_weather[mask]

print(data_weather_de.shape)
np.unique(data_weather_de["iso3166-2"])

In [None]:
# Helper that stores a filtered dataset (e.g. the one for Germany) as CSV.
def saving_csv(file, name):
    """Write ``file`` to ``../data/interim/weather_<name>.csv``.

    Parameters
    ----------
    file : object with a ``to_csv`` method (a DataFrame at the call sites here)
    name : str inserted into the output file name
    """
    target_path = f'../data/interim/weather_{name}.csv'
    file.to_csv(target_path)

In [None]:
# Persist the Germany-only weather rows as ../data/interim/weather_germany.csv.
saving_csv(data_weather_de, "germany")

In [None]:
# Quick look at the corona table: its (rows, columns) shape, then the first rows.
print(data_corona_de.shape)
data_corona_de.head()

# min, mean, median, and max of regions

In [None]:
# Write one CSV per region: ../data/interim/weather_<code>.csv.
# NOTE(review): region_names (underscore spelling) is not used in the code
# visible here; the files are named after the hyphenated codes in de_codes.
region_names = ['DE_BB', 'DE_BE', 'DE_BW', 'DE_BY', 'DE_HB', 'DE_HE', 'DE_HH', 'DE_MV', 'DE_NI', 'DE_NW', 
            'DE_RP', 'DE_SH', 'DE_SL', 'DE_SN', 'DE_ST', 'DE_TH']
for code in de_codes:
    # Compare the region column only; a mask over the whole frame would be
    # two-dimensional and would match the code in any column.
    mask = data_weather_de["iso3166-2"] == code
    region_data = data_weather_de[mask]
    saving_csv(region_data, code)

In [None]:
# Select the numerical weather columns by position: the slice 2:9 keeps the
# columns at positions 2 through 8 (at most seven columns).
weather_columns = list(data_weather.columns)
print(weather_columns)
weather_numerical_columns = weather_columns[2:9]
weather_numerical_columns

In [None]:
# Per-region summary statistics, read back from the per-region CSV files:
#   dict_sum_numeric[region][column] == [min, mean, median, max]
# Every column of data_weather from position 2 onwards is summarised.
value_columns = list(data_weather.columns)[2:]  # computed once, not per lookup
dict_sum_numeric = {}
for region in de_codes:
    data = pd.read_csv(f'../data/interim/weather_{region}.csv')
    # Series.min()/max() skip missing values; the builtin min()/max() give
    # order-dependent results as soon as a NaN is present.
    dict_sum_numeric[region] = {
        column: [
            data[column].min(),
            data[column].mean(),
            data[column].median(),
            data[column].max(),
        ]
        for column in value_columns
    }
print(dict_sum_numeric["DE-BB"])

In [None]:
# Bar chart of the four summary values stored for "WindSpeed" in region "DE-BB";
# the x labels follow the order in which the values are stored.
fig = go.FigureWidget(data=go.Bar(y=dict_sum_numeric["DE-BB"]["WindSpeed"], x = ["min", "mean", "median", "max"]))
fig

# Exercise 2

In [None]:
# Lookup from the English region name in the metadata to its ISO 3166-2 code.
# "Ã¼" is replaced by "ü" to repair mis-decoded umlauts in the names.
meta_entries = data_meta_de["country_metadata"]
dic_convert = {
    entry["iso3166-2_name_en"].replace("Ã¼", "ü"): entry["iso3166-2_code"]
    for entry in meta_entries
}
# Translate the "region_code" column through that lookup and store the result
# as "region" (presumably "region_code" holds region names — verify against the data).
data_corona_de["region"] = data_corona_de["region_code"].map(dic_convert)

# Total confirmed additions per region.
data_corona_region = (
    data_corona_de
    .groupby(by = "region")["confirmed_addition"]
    .sum()
    .reset_index()
)

# Population per ISO 3166-2 code, taken from the metadata, attached to each region.
population_map = {entry["iso3166-2_code"]: entry["population"] for entry in meta_entries}
data_corona_region["population"] = data_corona_region["region"].map(population_map)

# Confirmed additions as a percentage of the region's population.
data_corona_region["relation"] = (
    data_corona_region["confirmed_addition"] / data_corona_region["population"] * 100
)

data_corona_region

In [None]:
# Choropleth of the "relation" column (confirmed cases as a percentage of the
# population) per region, matched to the GeoJSON features via their
# "iso_3166_2" property.
m = fl.Map(
    location = [51.3, 10.3],
    zoom_start = 6,
)

fl.Choropleth(
    geo_data = data_geo,
    name = "cases",
    data = data_corona_region,
    columns = ["region", "relation"],
    key_on = "properties.iso_3166_2",
    fill_color = "OrRd",
    fill_opacity = .6,
    line_opacity = .5,
    # The plotted value is cases / population * 100, not an absolute count,
    # so the legend must say so.
    legend_name = "Confirmed cases (% of population)",
).add_to(m)

m

# Exercise 3

In [None]:
# Join every corona row with the weather row of the same date and region,
# then drop the now-redundant weather key column.
df = (
    data_corona_de
    .merge(data_weather, left_on = ["date", "region"], right_on = ["date", "iso3166-2"])
    .drop(columns = ["iso3166-2"])
)
df.head()

Pearson correlation with log

In [None]:
# Weather variables that are correlated against the case counts.
Xs = ['RelativeHumiditySurface', 'SolarRadiation','Surfacepressure', 'TemperatureAboveGround', 'Totalprecipitation',
       'UVIndex', 'WindSpeed']
# Threshold of 0.001 divided by (number of variables * 3).
significance_threshold = 0.001 / (len(Xs)*3)
corrs = []
pvalues = []
# The log-transformed case counts are the same for every variable, so compute them once.
log_cases = np.log(df["confirmed_addition"])
for var in Xs:
    corr, pvalue = pearsonr(log_cases, df[var])
    print(f"{var}\n{corr:.3f}\t{pvalue}\t{pvalue < significance_threshold}\n")

Spearman correlation with log

In [None]:
# Spearman rank correlation between the log-transformed case counts and each
# weather variable; prints coefficient, p-value and whether the p-value is
# below significance_threshold.
log_cases = np.log(df["confirmed_addition"])
for var in Xs:
    corr, pvalue = spearmanr(log_cases, df[var])
    print(f"{var}\n{corr:.3f}\t{pvalue}\t{pvalue < significance_threshold}\n")

It could be interesting to see whether summing the weather variables over the preceding x days
yields a different correlation. Since it can take up to 14 days before symptoms of COVID-19 appear,
it makes sense to look at what happened during those 14 days.

In [None]:
def rolling_sum(num_days, corona=None, weather=None, columns=None):
    """Merge corona and weather data and replace weather columns by trailing sums.

    Each selected weather column is replaced by the sum of its last
    ``num_days`` values up to and including the current row, computed
    separately for every region. The first rows of a region, where fewer than
    ``num_days`` values exist, use the values available so far.

    Differences from the previous hand-written loop:
    * the window always includes the current row (before, rows from index
      ``num_days`` onwards summed the ``num_days`` rows *before* the current
      one, so two consecutive rows could receive the same sum);
    * windows no longer run across region boundaries;
    * the column is not converted to a list again for every row.

    Assumes the rows of each region appear in chronological order after the
    merge — TODO confirm against the input files.

    Parameters
    ----------
    num_days : int
        Window length in rows; must be at least 1.
    corona : DataFrame, optional
        Table with "date" and "region" columns. Defaults to the notebook-level
        ``data_corona_de``.
    weather : DataFrame, optional
        Table with "date" and "iso3166-2" columns plus the weather columns.
        Defaults to the notebook-level ``data_weather``.
    columns : list of str, optional
        Weather columns to aggregate. Defaults to the seven weather variables
        used throughout the notebook.

    Returns
    -------
    DataFrame
        The merged table (without "iso3166-2") with the selected columns
        replaced by their trailing sums.

    Raises
    ------
    ValueError
        If ``num_days`` is smaller than 1.
    """
    if num_days < 1:
        raise ValueError("num_days must be at least 1")
    if corona is None:
        corona = data_corona_de
    if weather is None:
        weather = data_weather
    if columns is None:
        columns = ['RelativeHumiditySurface', 'SolarRadiation','Surfacepressure', 'TemperatureAboveGround',
                   'Totalprecipitation', 'UVIndex', 'WindSpeed']

    merged = corona.merge(weather, left_on = ["date", "region"], right_on = ["date", "iso3166-2"])
    merged = merged.drop(["iso3166-2"], axis = 1)

    # sort=False keeps the groups in order of appearance; transform() returns
    # values aligned with the original rows, so the row order is unchanged.
    by_region = merged.groupby("region", sort=False)
    trailing_sums = {
        column: by_region[column].transform(
            lambda values: values.rolling(window=num_days, min_periods=1).sum()
        )
        for column in columns
    }
    # assign() overwrites the existing columns in place, keeping the column order.
    df_rolling_x = merged.assign(**trailing_sums)
    return df_rolling_x

In [None]:
# Merged tables whose weather columns hold rolling sums computed with
# num_days = 7 and num_days = 14, respectively.
df_rolling_7 = rolling_sum(7)
df_rolling_14 =  rolling_sum(14)

Summary of three Spearman correlations (same-day values, 7-day sums and 14-day sums) against the log of the case counts

In [None]:
# For every weather variable, compare the Spearman correlation with the
# log-transformed case counts across the plain merge ("standard") and the
# 7- and 14-day rolling-sum tables. The log series do not depend on the
# variable, so they are computed once up front.
log_cases_standard = np.log(df["confirmed_addition"])
log_cases_7 = np.log(df_rolling_7["confirmed_addition"])
log_cases_14 = np.log(df_rolling_14["confirmed_addition"])
for var in Xs:
    corr, pvalue = spearmanr(log_cases_standard, df[var])
    corr7, pvalue7 = spearmanr(log_cases_7, df_rolling_7[var])
    corr14, pvalue14 = spearmanr(log_cases_14, df_rolling_14[var])
    print(f"{var}:\nCorrelation coefficient: (standard: {corr:.3f}, 7 days: {corr7:.3f}, 14 days: {corr14:.3f})\nPvalue: (standard: {pvalue}, 7 days: {pvalue7}, 14 days: {pvalue14})\nHolds the significance threshold: (standard: {pvalue < significance_threshold}, 7 days: {pvalue7 < significance_threshold}, 14 days: {pvalue14 < significance_threshold})\n")

Multivariable correlation

In [None]:
 Xs.append("const")

In [None]:
def multivariable_correlation(data):
    """Fit an OLS regression of log(confirmed_addition) on the weather variables.

    A constant column is added to ``data`` and used as the intercept. The
    predictors are the entries of the notebook-level ``Xs`` list followed by
    "const"; "const" is added here if it is missing from ``Xs`` and never
    used twice, so the function does not depend on another cell having
    modified ``Xs`` first.

    Parameters
    ----------
    data : DataFrame
        Must contain "confirmed_addition" and every weather column named in ``Xs``.

    Returns
    -------
    The statsmodels summary object of the fitted model.
    """
    data = sm.add_constant(data)
    predictors = [name for name in Xs if name != "const"] + ["const"]
    # The keyword is `hasconst`; the misspelt `hsconst` was not recognised as
    # a model option, so the intended setting never took effect.
    est = sm.OLS(np.log(data["confirmed_addition"]), data[predictors], hasconst = True).fit()
    return est.summary()

In [None]:
# Print the regression summary for the table built with rolling_sum(7).
print(multivariable_correlation(df_rolling_7))