In [1]:
import numpy as np
import pandas as pd
import matplotlib
import datetime
import time
from itertools import islice

In [2]:
def parse_timestamp(el, date_format=None):
    """Parse a date string into a ``datetime.datetime``.

    Parameters
    ----------
    el : str
        Date string to parse, e.g. ``'16.03.2020'``.
    date_format : str, optional
        ``strptime`` format to use. When omitted, the notebook-level ``fmt``
        variable is used if it exists; otherwise the day.month.year layout
        ``'%d.%m.%Y'`` is used.

    Returns
    -------
    datetime.datetime
        The parsed timestamp.

    Raises
    ------
    ValueError
        If ``el`` does not match the format.
    """
    if date_format is None:
        # `fmt` is assigned in a later cell; fall back to the same layout so the
        # function no longer raises NameError when that cell has not run yet.
        date_format = globals().get('fmt', '%d.%m.%Y')
    return datetime.datetime.strptime(el, date_format)

In [3]:
# Day.month.year layout (e.g. '16.03.2020'); read as a global by parse_timestamp.
fmt = '%d.%m.%Y'

In [4]:
def days_between(d1, d2):
    """Return the absolute number of whole days separating ``d1`` and ``d2``.

    The order of the arguments does not matter; the result is never negative.
    """
    delta = d2 - d1
    return abs(delta.days)

In [5]:
# Importing data on countermeasures.
# NOTE: despite its name, `url` holds a relative path to a semicolon-separated
# CSV file, not a web address.
url = "Corona Dataset -newest version.csv"
df = pd.read_csv(url, sep=';', low_memory=False)

In [6]:
# Show every column when a frame is displayed. Use the exact option key:
# 'display.max_columns()' only worked because pandas treats option names as
# regular-expression patterns, which is fragile and hard to read.
pd.set_option('display.max_columns', None)

In [7]:
df.columns

Index(['entry_into_force', 'amusment_venues', 'cultural_venues', 'gastronomy',
       'essential_retail', 'non-essential_retail', 'major_events',
       'events_and_gatherings', 'public_gatherings', 'traceability', 'tourism',
       'traveling', 'administrative_fine', 'school_facilities',
       'university_facilities', 'sport_facilities', 'medical_facilities',
       'shopping_malls', 'work', 'mouth_and_nose_covering', 'curfew'],
      dtype='object')

In [8]:
# Remove four regulation columns that the visible remainder of the notebook does not use.
unused_columns = ['amusment_venues', 'administrative_fine', 'shopping_malls', 'work']
df = df.drop(columns=unused_columns)

In [9]:
df

Unnamed: 0,entry_into_force,cultural_venues,gastronomy,essential_retail,non-essential_retail,major_events,events_and_gatherings,public_gatherings,traceability,tourism,traveling,school_facilities,university_facilities,sport_facilities,medical_facilities,mouth_and_nose_covering,curfew
0,16.03.2020,closed,closedw/dp,openw/meas,closedw/delivery,prohibited,prohibited,prohibited_over_2,no,prohibited,ra_14d_limitations,closed,closedw/exlib,closed,limited_visits,not_mandatory,no
1,23.03.2020,closed,closedw/dp,openw/meas,closedw/delivery,prohibited,prohibited,prohibited_over_2,no,prohibited,ra_14d_limitations,closed,closedw/exlib,closed,limited_visits,not_mandatory,no
2,10.04.2020,closed,closedw/dp,openw/meas,closedw/delivery,prohibited,prohibited,prohibited_over_2,no,prohibited,ra_14d_quarantine,closed,closedw/exlib,closed,limited_visits,not_mandatory,no
3,20.04.2020,closed,closedw/dp,openw/meas,closedw/delivery,prohibited,prohibited,prohibited_over_2,no,prohibited,ra_14d_nomedfac,closed,closedw/exlib,closed,limited_visits,not_mandatory,no
4,01.05.2020,closed,closedw/dp,openw/meas,closedw/delivery,prohibited,prohibited,prohibited_over_2,no,prohibited,ra_14d_nomedfac,closed,closedw/exlib,closed,limited_visits,not_mandatory,no
5,04.05.2020,closedw/exmz,closedw/dp,openw/meas,closedw/delivery,prohibited,prohibited,prohibited_over_2,no,prohibited,ra_14d_nomedfac,closed,openw/meas,closed,limited_visits,not_mandatory,no
6,07.05.2020,closedw/exmz,closedw/dp,openw/meas,closedw/delivery,prohibited,prohibited,prohibited_over_2,no,prohibited,ra_14d_quarantine,closed,openw/meas,closedw/expo,limited_visits,not_mandatory,no
7,11.05.2020,openw/lim_100,openw/meas,openw/meas,openw/meas,prohibited,pw/ex_concert_100,allowed_2h,no,prohibited,ra_14d_quarantine,openw/meas,openw/meas,openw/meas,allowed_visitsw/meas,not_mandatory,no
8,17.05.2020,openw/lim_100,openw/meas,openw/meas,openw/meas,prohibited,pw/ex_concert_100,allowed_2h,no,limited,o4_14d_quarantine,openw/meas,openw/meas,openw/meas,allowed_visitsw/meas,not_mandatory,no
9,30.05.2020,openw/lim_100,openw/meas,openw/meas,openw/meas,prohibited,pw/ex_concert_100,allowed_2h,yes,allowed_only4,o4_14d_quarantine,openw/meas,openw/meas,openw/meas,allowed_visitsw/meas,not_mandatory,no


In [10]:
# Adding a row at the beginning of the data frame so the data starts on
# 1 January 2020 (everything open / allowed, no restrictions yet).
baseline_row = pd.DataFrame(
    [['01.01.2020', 'open', 'open', 'open', 'open', 'allowed',
      'allowed', 'allowed', 'no', 'allowed', 'allowed', 'open',
      'open', 'open', 'open', 'not_mandatory', 'no']],
    columns=df.columns,
)
# Put the new row first and renumber the index from 0.
df = pd.concat([baseline_row, df], ignore_index=True)

In [11]:
# Adding a row at the end of the data frame so the data ends on 31 July 2021.
# The new label is derived from the current index instead of the hard-coded 42,
# which would silently overwrite a real row if the CSV had more rows than assumed.
df.loc[df.index.max() + 1] = ['31.07.2021', 'openw/meas', 'openw/meas', 'openw/tst_vacc', 'openw/lim_ppl', 'sport_1000',
              'allowed_1000max', 'allowed_100pplw/tst_imm', 'yes', 'allowed', 'ra_testM_vac_10dqua', 'open', 
              'open', 'openw/limit', 'limited_visitsw/mask_test', 'mandatory_medical', 'no']

In [12]:
# Convert the day.month.year strings to timestamps in one vectorized call.
# Unlike mapping strptime over the column, this is safe to re-run: values that
# are already timestamps pass through unchanged.
df['entry_into_force'] = pd.to_datetime(df['entry_into_force'], format=fmt)

In [13]:
# Sanity check: show day, month and year of the first parsed date, one per line.
first_entry = df['entry_into_force'][0]
for part in (first_entry.day, first_entry.month, first_entry.year):
    print(part)

1
1
2020


In [14]:
# Empty frame that will receive one row of regulations for every date
# between January 2020 and July 2021.
daily_columns = [
    'entry_into_force',
    'cultural_venues',
    'gastronomy',
    'essential_retail',
    'non-essential_retail',
    'major_events',
    'events_and_gatherings',
    'public_gatherings',
    'traceability',
    'tourism',
    'traveling',
    'school_facilities',
    'university_facilities',
    'sport_facilities',
    'medical_facilities',
    'mouth_and_nose_covering',
    'curfew',
]
new_df = pd.DataFrame(columns=daily_columns)

In [15]:
# Seed the daily frame with the first regulation row. `row_a` is the
# "previous row" that the fill loop in the next cell starts from.
row_a = df.iloc[0]
# DataFrame.append was removed in pandas 2.0; pd.concat on a one-row frame is
# the supported equivalent (infer_objects restores the datetime dtype, as
# append did internally).
new_df = pd.concat([new_df, row_a.to_frame().T.infer_objects()])

In [16]:
# Expand the list of regulation changes into one row per calendar day.
# Rows are collected in a plain list and the frame is built once at the end:
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies it on every iteration (quadratic cost).
daily_rows = new_df.to_dict('records')  # keep the row(s) already in new_df
for _, row in islice(df.iterrows(), 1, None):
    n = days_between(row_a['entry_into_force'], row['entry_into_force'])
    # Days strictly between two changes inherit the regulations of the earlier change.
    for i in range(1, n):
        filler = row_a.to_dict()
        filler['entry_into_force'] = row_a['entry_into_force'] + datetime.timedelta(days=i)
        daily_rows.append(filler)
    # The day the next set of regulations enters into force.
    daily_rows.append(row.to_dict())
    row_a = row
new_df = pd.DataFrame(daily_rows)

In [17]:
# Renumber the rows 0..n-1 and discard the old labels.
new_df = new_df.reset_index(drop=True)

In [18]:
new_df.loc[70:120]

Unnamed: 0,entry_into_force,cultural_venues,gastronomy,essential_retail,non-essential_retail,major_events,events_and_gatherings,public_gatherings,traceability,tourism,traveling,school_facilities,university_facilities,sport_facilities,medical_facilities,mouth_and_nose_covering,curfew
70,2020-03-11,open,open,open,open,allowed,allowed,allowed,no,allowed,allowed,open,open,open,open,not_mandatory,no
71,2020-03-12,open,open,open,open,allowed,allowed,allowed,no,allowed,allowed,open,open,open,open,not_mandatory,no
72,2020-03-13,open,open,open,open,allowed,allowed,allowed,no,allowed,allowed,open,open,open,open,not_mandatory,no
73,2020-03-14,open,open,open,open,allowed,allowed,allowed,no,allowed,allowed,open,open,open,open,not_mandatory,no
74,2020-03-15,open,open,open,open,allowed,allowed,allowed,no,allowed,allowed,open,open,open,open,not_mandatory,no
75,2020-03-16,closed,closedw/dp,openw/meas,closedw/delivery,prohibited,prohibited,prohibited_over_2,no,prohibited,ra_14d_limitations,closed,closedw/exlib,closed,limited_visits,not_mandatory,no
76,2020-03-17,closed,closedw/dp,openw/meas,closedw/delivery,prohibited,prohibited,prohibited_over_2,no,prohibited,ra_14d_limitations,closed,closedw/exlib,closed,limited_visits,not_mandatory,no
77,2020-03-18,closed,closedw/dp,openw/meas,closedw/delivery,prohibited,prohibited,prohibited_over_2,no,prohibited,ra_14d_limitations,closed,closedw/exlib,closed,limited_visits,not_mandatory,no
78,2020-03-19,closed,closedw/dp,openw/meas,closedw/delivery,prohibited,prohibited,prohibited_over_2,no,prohibited,ra_14d_limitations,closed,closedw/exlib,closed,limited_visits,not_mandatory,no
79,2020-03-20,closed,closedw/dp,openw/meas,closedw/delivery,prohibited,prohibited,prohibited_over_2,no,prohibited,ra_14d_limitations,closed,closedw/exlib,closed,limited_visits,not_mandatory,no


In [63]:
# Rename the date column to 'date', the key used for the merge with the infection data.
new_df = new_df.rename(columns={'entry_into_force': 'date'})

In [64]:
new_df

Unnamed: 0,date,cultural_venues,gastronomy,essential_retail,non-essential_retail,major_events,events_and_gatherings,public_gatherings,traceability,tourism,traveling,school_facilities,university_facilities,sport_facilities,medical_facilities,mouth_and_nose_covering,curfew
0,2020-01-01,open,open,open,open,allowed,allowed,allowed,no,allowed,allowed,open,open,open,open,not_mandatory,no
1,2020-01-02,open,open,open,open,allowed,allowed,allowed,no,allowed,allowed,open,open,open,open,not_mandatory,no
2,2020-01-03,open,open,open,open,allowed,allowed,allowed,no,allowed,allowed,open,open,open,open,not_mandatory,no
3,2020-01-04,open,open,open,open,allowed,allowed,allowed,no,allowed,allowed,open,open,open,open,not_mandatory,no
4,2020-01-05,open,open,open,open,allowed,allowed,allowed,no,allowed,allowed,open,open,open,open,not_mandatory,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
573,2021-07-27,openw/meas,openw/meas,openw/tst_vacc,openw/lim_ppl,sport_1000,allowed_1000max,allowed_100pplw/tst_imm,,allowed,ra_testM_vac_10dqua,open,open,openw/limit,limited_visitsw/mask_test,mandatory_medical,no
574,2021-07-28,openw/meas,openw/meas,openw/tst_vacc,openw/lim_ppl,sport_1000,allowed_1000max,allowed_100pplw/tst_imm,,allowed,ra_testM_vac_10dqua,open,open,openw/limit,limited_visitsw/mask_test,mandatory_medical,no
575,2021-07-29,openw/meas,openw/meas,openw/tst_vacc,openw/lim_ppl,sport_1000,allowed_1000max,allowed_100pplw/tst_imm,,allowed,ra_testM_vac_10dqua,open,open,openw/limit,limited_visitsw/mask_test,mandatory_medical,no
576,2021-07-30,openw/meas,openw/meas,openw/tst_vacc,openw/lim_ppl,sport_1000,allowed_1000max,allowed_100pplw/tst_imm,,allowed,ra_testM_vac_10dqua,open,open,openw/limit,limited_visitsw/mask_test,mandatory_medical,no


In [43]:
# Importing data on infection numbers (daily new infections format).
# `url2` is a relative path to a comma-separated CSV file.
url2 = "Cologne numbers.csv"
df_num = pd.read_csv(url2, sep=',', low_memory=False)

In [44]:
# Reverse the row order and renumber the index from 0.
df_num = df_num.iloc[::-1].reset_index(drop=True)

In [45]:
# Adding a zero-infection row at the beginning of the data frame so the data
# starts on 1 January 2020.
first_day = pd.DataFrame([['01.01.2020', '0']], columns=df_num.columns)
# Put the new row first and renumber the index from 0.
df_num = pd.concat([first_day, df_num], ignore_index=True)

In [50]:
df_num

Unnamed: 0,date,new_infections
0,01.01.2020,0
1,21.02.2020,0
2,22.02.2020,0
3,23.02.2020,0
4,24.02.2020,0
...,...,...
588,30.09.2021,101
589,01.10.2021,78
590,02.10.2021,105
591,03.10.2021,61


In [47]:
df_num['new_infections'][2]

'-'

In [48]:
# '-' entries are replaced by 0 so the column can later be converted to integers.
df_num['new_infections'] = df_num['new_infections'].replace('-', 0)

In [49]:
df_num['new_infections'].value_counts(dropna=False)

0      14
6      14
18     13
3      11
2      10
       ..
110     1
230     1
313     1
221     1
160     1
Name: new_infections, Length: 254, dtype: int64

In [51]:
def to_integer(s):
    """Convert ``s`` to ``int`` using the built-in constructor."""
    return int(s)

In [52]:
# Turn every infection count into an integer, element by element.
df_num['new_infections'] = df_num['new_infections'].apply(to_integer)

In [53]:
# Convert the day.month.year strings to timestamps in one vectorized call.
# Unlike mapping strptime over the column, this is safe to re-run: values that
# are already timestamps pass through unchanged.
df_num['date'] = pd.to_datetime(df_num['date'], format=fmt)

In [54]:
type(df_num['date'][0])

pandas._libs.tslibs.timestamps.Timestamp

In [55]:
# Keep only observations up to and including 31 July 2021.
# A boolean mask replaces the row-by-row in-place drop, which mutated the frame
# while iterating over it. A single cutoff date also removes any rows from years
# after 2021, which the former `year == 2021 and month > 7` test would have kept.
cutoff = pd.Timestamp(2021, 8, 1)
df_num = df_num[df_num['date'] < cutoff]

In [56]:
# Empty frame that the following cells fill with one infection count per calendar day.
new_num = pd.DataFrame(columns= ['date', 'new_infections'])

In [57]:
# Seed the daily frame with the first infection row. `row_b` is the
# "previous row" that the fill loop in the next cell starts from.
row_b = df_num.iloc[0]
# DataFrame.append was removed in pandas 2.0; pd.concat on a one-row frame is
# the supported equivalent (infer_objects restores the column dtypes, as
# append did internally).
new_num = pd.concat([new_num, row_b.to_frame().T.infer_objects()])

In [58]:
# Expand the infection series into one row per calendar day.
# Rows are collected in a plain list and the frame is built once at the end:
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies it on every iteration (quadratic cost).
daily_counts = new_num.to_dict('records')  # keep the row(s) already in new_num
for _, row in islice(df_num.iterrows(), 1, None):
    n = days_between(row_b['date'], row['date'])
    # Dates missing between two observations repeat the earlier observation's count.
    for i in range(1, n):
        daily_counts.append({'date': row_b['date'] + datetime.timedelta(days=i),
                             'new_infections': row_b['new_infections']})
    daily_counts.append(row.to_dict())
    row_b = row
new_num = pd.DataFrame(daily_counts)

In [38]:
df_num[:20]

Unnamed: 0,date,new_infections
0,2020-01-01,0
1,2020-02-21,0
2,2020-02-22,0
3,2020-02-23,0
4,2020-02-24,0
5,2020-02-25,0
6,2020-02-26,0
7,2020-02-27,1
8,2020-02-28,0
9,2020-02-29,2


In [60]:
# Renumber the rows 0..n-1 and discard the old labels.
new_num = new_num.reset_index(drop=True)

In [61]:
new_num

Unnamed: 0,date,new_infections
0,2020-01-01,0
1,2020-01-02,0
2,2020-01-03,0
3,2020-01-04,0
4,2020-01-05,0
...,...,...
573,2021-07-27,82
574,2021-07-28,65
575,2021-07-29,74
576,2021-07-30,50


In [66]:
# Combine regulations and infection counts per date; the outer join keeps dates
# that appear in only one of the two frames.
complete_df = new_df.merge(new_num, how='outer', on='date')

In [67]:
complete_df.columns

Index(['date', 'cultural_venues', 'gastronomy', 'essential_retail',
       'non-essential_retail', 'major_events', 'events_and_gatherings',
       'public_gatherings', 'traceability', 'tourism', 'traveling',
       'school_facilities', 'university_facilities', 'sport_facilities',
       'medical_facilities', 'mouth_and_nose_covering', 'curfew',
       'new_infections'],
      dtype='object')

In [None]:
com