In [74]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import textwrap
import json
import os
import plotly.express as px
import plotly.graph_objects as go
import folium as fl

In [2]:
data_weather = pd.read_csv('../data/raw/weather/weather.csv', sep = "\t")
data_corona_de = pd.read_csv("../data/raw/corona/de_corona.csv", sep = "\t")

In [3]:
print(data_weather.shape)
data_weather.head()

(14904, 9)


Unnamed: 0,date,iso3166-2,RelativeHumiditySurface,SolarRadiation,Surfacepressure,TemperatureAboveGround,Totalprecipitation,UVIndex,WindSpeed
0,2020-02-13,DE-BB,76.337444,1824290.0,2403341.0,276.551573,0.003355,2.777806,4.542822
1,2020-02-13,DE-BE,76.065297,1786373.0,2408182.0,276.844633,0.003523,4.671329,4.761509
2,2020-02-13,DE-BW,80.113988,1505760.0,2290158.0,276.227143,0.008013,4.268546,4.467024
3,2020-02-13,DE-BY,81.554346,2363013.0,2275361.0,275.583053,0.005227,4.417797,3.677414
4,2020-02-13,DE-HB,87.167414,8389.756,2406940.0,276.237452,0.007715,1.794872,4.699573


In [4]:
print(list(data_weather.columns))
np.unique(data_weather["iso3166-2"])

['date', 'iso3166-2', 'RelativeHumiditySurface', 'SolarRadiation', 'Surfacepressure', 'TemperatureAboveGround', 'Totalprecipitation', 'UVIndex', 'WindSpeed']


array(['DE-BB', 'DE-BE', 'DE-BW', 'DE-BY', 'DE-HB', 'DE-HE', 'DE-HH',
       'DE-MV', 'DE-NI', 'DE-NW', 'DE-RP', 'DE-SH', 'DE-SL', 'DE-SN',
       'DE-ST', 'DE-TH', 'DK-81', 'DK-82', 'DK-83', 'DK-84', 'DK-85',
       'NL-DR', 'NL-FL', 'NL-FR', 'NL-GE', 'NL-GR', 'NL-LI', 'NL-NB',
       'NL-NH', 'NL-OV', 'NL-UT', 'NL-ZE', 'NL-ZH', 'SE-AB', 'SE-AC',
       'SE-BD', 'SE-C', 'SE-D', 'SE-E', 'SE-F', 'SE-G', 'SE-H', 'SE-I',
       'SE-K', 'SE-M', 'SE-N', 'SE-O', 'SE-S', 'SE-T', 'SE-U', 'SE-W',
       'SE-X', 'SE-Y', 'SE-Z'], dtype=object)

Sanity check

In [5]:
count = 0
for i in data_weather:
    #print(sum(pd.isnull(data_weather[i])))
    if sum(pd.isnull(data_weather[i])) != 0:
        count += 1
print(count)

0


Filtering for germany

In [6]:
de_codes = ['DE-BB', 'DE-BE', 'DE-BW', 'DE-BY', 'DE-HB', 'DE-HE', 'DE-HH', 'DE-MV', 'DE-NI', 'DE-NW', 
            'DE-RP', 'DE-SH', 'DE-SL', 'DE-SN', 'DE-ST', 'DE-TH']
mask = np.isin(data_weather, de_codes)
data_weather_de = data_weather[mask]

print(data_weather_de.shape)
np.unique(data_weather_de["iso3166-2"])

(4416, 9)


array(['DE-BB', 'DE-BE', 'DE-BW', 'DE-BY', 'DE-HB', 'DE-HE', 'DE-HH',
       'DE-MV', 'DE-NI', 'DE-NW', 'DE-RP', 'DE-SH', 'DE-SL', 'DE-SN',
       'DE-ST', 'DE-TH'], dtype=object)

In [7]:
# saving the filtered dataset for germany
def saving_csv(file, name):
    file.to_csv(f'../data/interim/weather_{name}.csv')

In [8]:
saving_csv(data_weather_de, "germany")

In [9]:
print(data_corona_de.shape)
data_corona_de.head()

(5602, 4)


Unnamed: 0,date,region_code,confirmed_addition,deceased_addition
0,2020-01-02,Nordrhein-Westfalen,1,0
1,2020-01-07,Nordrhein-Westfalen,1,0
2,2020-01-09,Nordrhein-Westfalen,1,1
3,2020-01-12,Nordrhein-Westfalen,1,0
4,2020-01-14,Nordrhein-Westfalen,1,0


# min, mean, median, and max of regions

In [32]:
# masking for different regions
region_names = ['DE_BB', 'DE_BE', 'DE_BW', 'DE_BY', 'DE_HB', 'DE_HE', 'DE_HH', 'DE_MV', 'DE_NI', 'DE_NW', 
            'DE_RP', 'DE_SH', 'DE_SL', 'DE_SN', 'DE_ST', 'DE_TH']
for codes, names in zip(de_codes, de_codes):
    mask = np.isin(data_weather_de, codes)
    region_data = data_weather_de[mask]
    saving_csv(region_data, names)

In [25]:
# filter for numerical columns, they are from index 2 and onwards
weather_columns = list(data_weather.columns)
print(weather_columns)
weather_numerical_columns = weather_columns[2:9]
weather_numerical_columns

['date', 'iso3166-2', 'RelativeHumiditySurface', 'SolarRadiation', 'Surfacepressure', 'TemperatureAboveGround', 'Totalprecipitation', 'UVIndex', 'WindSpeed']


['RelativeHumiditySurface',
 'SolarRadiation',
 'Surfacepressure',
 'TemperatureAboveGround',
 'Totalprecipitation',
 'UVIndex',
 'WindSpeed']

In [53]:
dict_sum_numeric = {}
for region in de_codes:
    data = pd.read_csv(f'../data/interim/weather_{region}.csv')
    dict_sum_numeric.update({region:{}})
    for col in range(2,len(list(data_weather.columns))):
        dict_sum_numeric[region].update({list(data_weather.columns)[col]:[min(data[list(data_weather.columns)[col]]), 
                                                             data[list(data_weather.columns)[col]].mean(), 
                                                             data[list(data_weather.columns)[col]].median(), 
                                                            max(data[list(data_weather.columns)[col]])]})
        #min, mean, median, and max
        #print(list(data_weather.columns)[col], region)
        #print("min:", min(data[list(data_weather.columns)[col]]), "max", max(data[list(data_weather.columns)[col]]), "mean:", data[list(data_weather.columns)[col]].mean(), "median:", data[list(data_weather.columns)[col]].median())
print(dict_sum_numeric["DE-BB"])

{'RelativeHumiditySurface': [40.944275, 68.99345724637683, 68.0675535, 95.670552], 'SolarRadiation': [17.55932, 8306464.1144477995, 7952489.525829, 22172374.595029], 'Surfacepressure': [2366659.966859, 2419063.357124196, 2419660.6139845, 2474949.5719310003], 'TemperatureAboveGround': [273.295669, 286.418136576087, 286.50438199999996, 300.353165], 'Totalprecipitation': [0.0, 0.0017139999999999992, 0.00044350000000000005, 0.024186000000000003], 'UVIndex': [0.088878, 20.03117598913044, 20.0102935, 45.355762], 'WindSpeed': [1.253825, 3.1552094275362332, 2.901368, 7.414942]}


In [73]:
fig = go.Figure(data=go.Bar(y=dict_sum_numeric["DE-BB"]["WindSpeed"], x = ["min", "mean", "median", "max"]))
fig.show()

In [76]:
geo = pd.read_file("../data/raw/shapefiles/de.csv")
geo

AttributeError: module 'pandas' has no attribute 'read_file'