In [113]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import main
from patsy import dmatrices
import scipy.stats as st

### Data Preprocessing

Data from https://data.europa.eu/euodp/en/data/dataset/covid-19-coronavirus-data/resource/260bbbde-2316-40eb-aec3-7cd7bfc2f590

In [3]:
data = main.getdata('data/timeseries25May.csv')

In [4]:
data = data.loc[data['country'].isin(['Netherlands', 'Belgium', 'Italy', 'Sweden', 'Denmark', 'Norway', 'Spain', 'United_Kingdom', 'Germany', 'Romania'])]

In [5]:
len(data['country'].unique())

10

Check for missing data

In [6]:
datelist = [date for date in pd.date_range(start='2019-12-31', end='2020-05-25') if len(data.loc[data['date'] == date]) != 10]

In [7]:
missing = [[(country, date.ctime()) for country in ['Netherlands', 'Belgium', 'Italy', 'Sweden', 'Denmark', 'Norway', 'Spain', 'United_Kingdom', 'Germany', 'Romania'] if country not in list(data.loc[data['date'] == date]['country'].values)] for date in datelist]

In [8]:
missing

[[('Romania', 'Tue Mar  3 00:00:00 2020')],
 [('Romania', 'Thu Mar  5 00:00:00 2020')],
 [('Spain', 'Mon May 25 00:00:00 2020')]]

Check for negative values

In [9]:
negative = data.loc[data['cases'] < 0]
negative

Unnamed: 0,date,day,month,year,cases,deaths,country,geoId,countryCode,pop,continentExp
16493,2020-04-19,19,4,2020,-713,410,Spain,ES,ESP,46723749.0,Europe
18258,2020-05-21,21,5,2020,-525,363,United_Kingdom,UK,GBR,66488991.0,Europe


In [10]:
data = data.drop(negative.index.values)

In [11]:
data.loc[data['cases'] < 0]

Unnamed: 0,date,day,month,year,cases,deaths,country,geoId,countryCode,pop,continentExp


Separate countries, make sure first record is equal to the first case date and reindex dataframes.

In [12]:
nl=no=be=ge=de=se=ro=it=uk=sp= None
countries = [[nl, 'Netherlands'], [no, 'Norway'], [be, 'Belgium'], [ge, 'Germany'], [de, 'Denmark'], [se, 'Sweden'], [ro, 'Romania'], [it, 'Italy'], [uk, 'United_Kingdom'], [sp, 'Spain']]
for country in countries: 
    country[0] = data.loc[data['country'] == country[1]]
    country[0].drop(country[0].loc[country[0]['cases']==0].index, inplace=True)
    country[0].reset_index(drop=True, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [13]:
dataframes, _ = zip(*countries)

In [14]:
dataframes[0].head()

Unnamed: 0,date,day,month,year,cases,deaths,country,geoId,countryCode,pop,continentExp
0,2020-02-28,28,2,2020,1,0,Netherlands,NL,NLD,17231017.0,Europe
1,2020-02-29,29,2,2020,1,0,Netherlands,NL,NLD,17231017.0,Europe
2,2020-03-01,1,3,2020,5,0,Netherlands,NL,NLD,17231017.0,Europe
3,2020-03-02,2,3,2020,6,0,Netherlands,NL,NLD,17231017.0,Europe
4,2020-03-03,3,3,2020,5,0,Netherlands,NL,NLD,17231017.0,Europe


### Kolmogorov-Smirnov Test

Create two groups, as found in the distance measure procedure, to calculate the criterion D of the Kolmogorov-Smirnov test manually.

In [15]:
socdist = pd.concat((dataframes[0], dataframes[1], dataframes[2], dataframes[4], dataframes[5], dataframes[6]))
lock = pd.concat((dataframes[3], dataframes[7], dataframes[8], dataframes[9]))

In [77]:
meancasessd = [np.mean(socdist.loc[socdist['date'] == date]['cases']) for date in pd.date_range(start='2019-02-01', end='2020-05-25') if not np.isnan(np.mean(socdist.loc[socdist['date'] == date]['cases'])) == True]
meancaseslock = [np.mean(lock.loc[lock['date'] == date]['cases']) for date in pd.date_range(start='2019-01-28', end='2020-05-25') if not np.isnan(np.mean(lock.loc[lock['date'] == date]['cases'])) == True]

In [80]:
meancases = np.concatenate((meancasessd, meancaseslock))

In [88]:
max = meancases.max()
min = meancases.min()
n = len(meancases)

In [106]:
socdist_cdf = [np.round(st.percentileofscore(meancasessd, value)/100, 1) for value in meancases]
lock_cdf = [np.round(st.percentileofscore(meancaseslock, value)/100, 1) for value in meancases]

In [107]:
meancases_diff = np.abs(np.subtract(socdist_cdf, lock_cdf))

The value of D should be bigger than the critical value to find a significant difference between the groups. 

In [108]:
meancases_diff.max()

0.7

In [109]:
d_crit_005 = 1.36*np.sqrt(1/len(meancasessd) + 1/len(meancaseslock))

In [110]:
d_crit_005

0.19352312691562384

The automated procedure confirms the results.

In [139]:
st.ks_2samp(meancasessd, meancaseslock)

Ks_2sampResult(statistic=0.6851851851851852, pvalue=1.4515290418269015e-20)