In [1]:
import numpy as np
import pandas as pd
import matplotlib
import datetime
import time
from itertools import islice

In [2]:
def parse_timestamp(el, date_format=None):
    """Parse a date string into a ``datetime.datetime``.

    Parameters
    ----------
    el : str
        The date string to parse.
    date_format : str, optional
        ``strptime`` pattern to parse ``el`` with. When omitted, the
        notebook-level ``fmt`` variable is looked up at call time, which is
        the original behaviour of this helper.

    Returns
    -------
    datetime.datetime
        The parsed timestamp.

    Raises
    ------
    ValueError
        If ``el`` does not match the pattern.
    NameError
        If no ``date_format`` is given and ``fmt`` has not been defined yet.
    """
    if date_format is None:
        # Fall back to the notebook-level default, resolved lazily so the
        # helper can be defined before the cell that sets ``fmt`` has run.
        date_format = fmt
    return datetime.datetime.strptime(el, date_format)

In [3]:
# strptime pattern (day.month.year) that parse_timestamp reads at call time
fmt = '%d.%m.%Y'

In [4]:
# Load the unified countermeasures table from a semicolon-delimited CSV file.
url = "covid19_unified_set.csv"
complete_df = pd.read_csv(
    url,
    sep=';',
    low_memory=False,
)

In [5]:
# Show every column when a DataFrame is rendered. Use the exact option name:
# 'display.max_columns()' is not a registered key and only worked because
# pandas falls back to regex matching of option names.
pd.set_option('display.max_columns', None)

In [6]:
# Display the loaded frame (pandas truncates the rows in the rendered preview)
complete_df

Unnamed: 0,date,cultural_venues,gastronomy,essential_retail,non-essential_retail,major_events,events_and_gatherings,public_gatherings,traceability,tourism,traveling,school_facilities,university_facilities,sport_facilities,medical_facilities,mouth_and_nose_covering,curfew,new_infections
0,2020-01-01,open,open,open,open,allowed,allowed,allowed,no,allowed,allowed,open,open,open,open,not_mandatory,no,0
1,2020-01-02,open,open,open,open,allowed,allowed,allowed,no,allowed,allowed,open,open,open,open,not_mandatory,no,0
2,2020-01-03,open,open,open,open,allowed,allowed,allowed,no,allowed,allowed,open,open,open,open,not_mandatory,no,0
3,2020-01-04,open,open,open,open,allowed,allowed,allowed,no,allowed,allowed,open,open,open,open,not_mandatory,no,0
4,2020-01-05,open,open,open,open,allowed,allowed,allowed,no,allowed,allowed,open,open,open,open,not_mandatory,no,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
573,2021-07-27,openw/meas,openw/meas,openw/tst_vacc,openw/lim_ppl,sport_1000,allowed_1000max,allowed_100pplw/tst_imm,,allowed,ra_testM_vac_10dqua,open,open,openw/limit,limited_visitsw/mask_test,mandatory_medical,no,82
574,2021-07-28,openw/meas,openw/meas,openw/tst_vacc,openw/lim_ppl,sport_1000,allowed_1000max,allowed_100pplw/tst_imm,,allowed,ra_testM_vac_10dqua,open,open,openw/limit,limited_visitsw/mask_test,mandatory_medical,no,65
575,2021-07-29,openw/meas,openw/meas,openw/tst_vacc,openw/lim_ppl,sport_1000,allowed_1000max,allowed_100pplw/tst_imm,,allowed,ra_testM_vac_10dqua,open,open,openw/limit,limited_visitsw/mask_test,mandatory_medical,no,74
576,2021-07-30,openw/meas,openw/meas,openw/tst_vacc,openw/lim_ppl,sport_1000,allowed_1000max,allowed_100pplw/tst_imm,,allowed,ra_testM_vac_10dqua,open,open,openw/limit,limited_visitsw/mask_test,mandatory_medical,no,50


In [7]:
# List the column labels of the loaded frame
complete_df.columns

Index(['date', 'cultural_venues', 'gastronomy', 'essential_retail',
       'non-essential_retail', 'major_events', 'events_and_gatherings',
       'public_gatherings', 'traceability', 'tourism', 'traveling',
       'school_facilities', 'university_facilities', 'sport_facilities',
       'medical_facilities', 'mouth_and_nose_covering', 'curfew',
       'new_infections'],
      dtype='object')

## Transformation of features

In [8]:
# Feature 1: label distribution of cultural_venues (missing values counted too)
complete_df['cultural_venues'].value_counts(dropna=False)

closed               175
openw/meas           142
openw/lim_100         93
openw/appointment     86
open                  75
closedw/exmz           7
Name: cultural_venues, dtype: int64

In [9]:
# Merge the rare 'closedw/exmz' label (7 rows) into 'closed'
complete_df['cultural_venues'] = complete_df['cultural_venues'].str.replace('closedw/exmz','closed')

In [10]:
# Encode cultural_venues on a 5-point scale: 5 = closed ... 1 = open.
# str.replace substitutes substrings, so the bare 'open' label has to come
# last, after every longer 'openw/...' label has already been consumed.
# Note: the resulting codes are strings ('1'-'5'), not ints.
complete_df['cultural_venues'] = (
    complete_df['cultural_venues']
    .str.replace('closed', '5')
    .str.replace('openw/appointment', '4')
    .str.replace('openw/lim_100', '3')
    .str.replace('openw/meas', '2')
    .str.replace('open', '1')
)

In [11]:
# Feature 2: label distribution of gastronomy (missing values counted too)
complete_df['gastronomy'].value_counts(dropna=False)

closedw/dp         263
openw/meas         228
open                75
openw/tst_vacc       7
outside_imun/dp      5
Name: gastronomy, dtype: int64

In [12]:
# Merge the rare 'outside_imun/dp' label (5 rows) into 'openw/tst_vacc'
complete_df['gastronomy'] = complete_df['gastronomy'].str.replace('outside_imun/dp','openw/tst_vacc')

## TODO: It makes no sense for gastronomy to be open without proof while essential retail is not — fix this in the data

In [13]:
# Encode gastronomy on a 4-point scale: 4 = closedw/dp ... 1 = open.
# The bare 'open' label goes last because str.replace matches substrings.
# The resulting codes are strings, not ints.
complete_df['gastronomy'] = (
    complete_df['gastronomy']
    .str.replace('closedw/dp', '4')
    .str.replace('openw/tst_vacc', '3')
    .str.replace('openw/meas', '2')
    .str.replace('open', '1')
)

In [14]:
# Feature 3: label distribution of essential_retail (missing values counted too)
complete_df['essential_retail'].value_counts(dropna=False)

openw/meas            355
openw/lim_1p_10sqm     88
open                   75
openw/tst_vacc         60
Name: essential_retail, dtype: int64

In [15]:
# Encode essential_retail on a 4-point scale: 4 = openw/tst_vacc ... 1 = open.
# The bare 'open' label goes last because str.replace matches substrings.
# The resulting codes are strings, not ints.
complete_df['essential_retail'] = (
    complete_df['essential_retail']
    .str.replace('openw/tst_vacc', '4')
    .str.replace('openw/lim_1p_10sqm', '3')
    .str.replace('openw/meas', '2')
    .str.replace('open', '1')
)

In [16]:
# Feature 4: label distribution of non-essential_retail (missing values counted too)
complete_df['non-essential_retail'].value_counts(dropna=False)

openw/meas                  175
open                         75
closed                       74
openw/lim_ppl                65
closedw/delivery             56
openw/lim_1p_10sqm           44
openw/app_negtst             35
openw/appointment            28
openw/lim_1p_10sqm_esent     26
Name: non-essential_retail, dtype: int64

In [17]:
# Merge rare / near-duplicate labels of non-essential_retail.
# Series.replace with a dict matches whole cell values, so the result no longer
# depends on the order of substitutions. The previous substring-based
# str.replace chain turned 'openw/lim_1p_10sqm_esent' into
# 'openw/lim_ppl_esent' on its first step, which made the dedicated '_esent'
# replacement a no-op and required an extra clean-up replacement afterwards.
non_essential_retail_merges = {
    'openw/lim_1p_10sqm': 'openw/lim_ppl',
    'openw/lim_1p_10sqm_esent': 'openw/lim_ppl',
    # kept so frames already processed by the old chain are still normalised
    'openw/lim_ppl_esent': 'openw/lim_ppl',
    'openw/app_negtst': 'openw/appointment',
    'closedw/delivery': 'closed',
}
complete_df['non-essential_retail'] = complete_df['non-essential_retail'].replace(non_essential_retail_merges)

In [18]:
# Encode non-essential_retail on a 5-point scale: 5 = closed ... 1 = open.
# The bare 'open' label goes last because str.replace matches substrings.
# The resulting codes are strings, not ints.
complete_df['non-essential_retail'] = (
    complete_df['non-essential_retail']
    .str.replace('closed', '5')
    .str.replace('openw/appointment', '4')
    .str.replace('openw/lim_ppl', '3')
    .str.replace('openw/meas', '2')
    .str.replace('open', '1')
)

In [19]:
# Feature 5: label distribution of major_events (missing values counted too)
complete_df['major_events'].value_counts(dropna=False)

prohibited    443
allowed        75
sport_1000     60
Name: major_events, dtype: int64

In [20]:
# Encode major_events on a 3-point scale: 3 = prohibited, 2 = sport_1000,
# 1 = allowed. The resulting codes are strings, not ints.
complete_df['major_events'] = (
    complete_df['major_events']
    .str.replace('prohibited', '3')
    .str.replace('sport_1000', '2')
    .str.replace('allowed', '1')
)

In [25]:
# Feature 6: label distribution of events_and_gatherings (missing values counted too).
# NOTE: the saved output of this cell was produced with execution count 25, i.e.
# after the merge/encode cells for this column had run, so it lists the codes
# rather than the raw labels.
complete_df['events_and_gatherings'].value_counts(dropna=False)

5    200
4    156
2     82
1     75
3     65
Name: events_and_gatherings, dtype: int64

In [22]:
# Merge rare / near-duplicate labels of events_and_gatherings:
# the three '*max' attendance caps become 'allowedw/lim_ppl', the trial
# exception is folded into 'prohibited', and the remaining exceptions are
# folded into 'prohibitedw/ex'.
complete_df['events_and_gatherings'] = (
    complete_df['events_and_gatherings']
    .str.replace('allowed_1000max', 'allowedw/lim_ppl')
    .str.replace('allowed_500max', 'allowedw/lim_ppl')
    .str.replace('allowed_250max', 'allowedw/lim_ppl')
    .str.replace('prohibitedw/ex_trials', 'prohibited')
    .str.replace('prohibitedw/ex_outdoors', 'prohibitedw/ex')
    .str.replace('pw/ex_concert_100', 'prohibitedw/ex')
)

In [24]:
# Encode events_and_gatherings on a 5-point scale: 5 = prohibited ... 1 = allowed.
# str.replace matches substrings, so 'prohibitedw/ex' is substituted before
# 'prohibited', and the bare 'allowed' label goes last.
# The resulting codes are strings, not ints.
complete_df['events_and_gatherings'] = (
    complete_df['events_and_gatherings']
    .str.replace('prohibitedw/ex', '4')
    .str.replace('prohibited', '5')
    .str.replace('allowedw/lim_ppl', '3')
    .str.replace('allowedw/meas', '2')
    .str.replace('allowed', '1')
)

In [13]:
# Feature 7: label distribution of public_gatherings (missing values counted too)
complete_df['public_gatherings'].value_counts(dropna=False)

allowed_2h                 344
allowed                     75
prohibited_over_2           56
allowed_100pplw/tst_imm     53
allowed_10p                 29
allowed_10ppl                9
allowed_10ppl_3h+imm         7
allowed_2h+imun              5
Name: public_gatherings, dtype: int64

In [8]:
# Collapse the 'allowed_2h_5ppl' variant into the base 'allowed_2h' label
complete_df['public_gatherings'] = complete_df['public_gatherings'].str.replace('allowed_2h_5ppl','allowed_2h')

In [10]:
# Collapse the 'allowed_2h_1p' variant into the base 'allowed_2h' label
complete_df['public_gatherings'] = complete_df['public_gatherings'].str.replace('allowed_2h_1p','allowed_2h')

In [12]:
# Collapse the 'allowed_2h+imun' variant into the base 'allowed_2h' label.
# regex=False is required here: when the pattern is interpreted as a regular
# expression, 'h+' means "one or more h", so the literal '+' in the label never
# matches and the value is silently left unchanged.
complete_df['public_gatherings'] = complete_df['public_gatherings'].str.replace('allowed_2h+imun', 'allowed_2h', regex=False)