In [18]:
# basics
import numpy as np
import pandas as pd
import datetime
import re

# statistics
import statsmodels.api as sm
import statsmodels.formula.api as smf


# plotting
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = [15,12]

# own data wrappers
from imp import reload
import measureclass as mc; reload(mc);
import coronadataclass as cdc; reload(cdc);

np.seterr(divide = 'ignore');


In [19]:
# load data into datawrapper classes
measure_data = mc.COVID19_measures(download_data = False, measure_level = 2, only_first_dates = True, expand_measure_names = True)
jhu_data     = cdc.CoronaData()

In [20]:
# helper functions

def date2vector(implementdate, start = '22/1/20', end = '31/3/20', shiftdays = 0):
    # generate vector of 0s and 1s when measure is implemented or not
    starttime     = datetime.datetime.strptime(start,         '%d/%m/%y')
    endtime       = datetime.datetime.strptime(end,           '%d/%m/%y')
    implementtime = datetime.datetime.strptime(implementdate, '%d/%m/%Y')
    
    totaldays   = (endtime       - starttime).days
    measuredays = (implementtime - starttime).days
    
    vec         = np.zeros(totaldays)
    vec[min(measuredays+shiftdays,len(vec)-1):] = 1
    
    return vec


def ConvertDateFormat(date):
    m,d,y = date.split('/')
    return '{:02d}/{:02d}/{:02d}'.format(int(d),int(m),int(y))



def LogCaseIncrease(df, key = 'Confirmed', mincases = None):
    cases             = np.array(df[key], dtype = np.int)
    if not mincases is None:
        restrict_date = np.argmin(cases >= mincases)
        cases         = cases[restrict_date:]
    else:
        restrict_date = 0
    ldcases           = np.log(np.diff(cases))

    return restrict_date, ldcases


def CleanUpMeasureName(measurename):
    # regression model formula can't contain special characters
    return 
    return ''.join([mn.capitalize() for mn in measurename.replace(',','').replace('-','').replace('/','').split(' ')])

In [21]:
def GetMeasureIDs(countrylist = None, measure_level = 2, mincount = None):
    if countrylist is None:
        countrylist = measure_data.countrylist # use ALL countries
    
    measurelist = {}
    
    # get all restrictions from countries
    for country in countrylist:
        country_measures = measure_data.CountryData(country, measure_level = 2)
        for measurename, initialdata in country_measures.items():
            if not measurename in measurelist.keys():
                measurelist[measurename] = 0
            measurelist[measurename] += 1
    
    if not mincount is None:
        # rebuild dict with restrictions
        measurelist = {k:v for k,v in measurelist.items() if v >= mincount}

    return measurelist


def GetCountryTrajectories(countrylist = None, data = 'Confirmed', startcases = None):
    if countrylist is None:
        countrylist = jhu_data.countrylist
    
    trajectories = {}
    for country in [c for c in countrylist if c in jhu_data.countrylist]:
        ctraj = np.array(jhu_data.CountryData(country)[data], dtype = np.float)
        starttraj = 0
        if not startcases is None:
            starttraj = np.argmax(ctraj >= startcases)
            ctraj = ctraj[starttraj:]
        trajectories[country] = {}
        trajectories[country]['traj'] = ctraj
        trajectories[country]['startdate'] = ConvertDateFormat(jhu_data.CountryData(country)['Date'][starttraj])
    
    return trajectories

        
def GetRegressionDF(countrylist = None, measure_level = 2, shiftdays = 0, verbose = False):
    
    # get trajectories and measure list for all countries in 'countrylist'
    trajectories         = GetCountryTrajectories(countrylist = countrylist, data = 'Confirmed', startcases = 20)
    measureIDs           = GetMeasureIDs(countrylist = countrylist, measure_level = 2)
    cleaned_measurelist  = [CleanUpMeasureName(mn) for mn in measureIDs.keys()]
    regressionDF         = None
    
    for country in trajectories.keys():
        if country in measure_data.countrylist:
            df_entry_dict               = {}
            df_entry_dict['Country']    = country
            # ********************************************
            # change regressor here:
            df_entry_dict['Observable'] = np.diff(np.log(trajectories[country]['traj']))
            # ********************************************
            if not (np.isnan(df_entry_dict['Observable']).any() or np.isinf(df_entry_dict['Observable']).any()):
                
                if verbose:
                    # print trajectories of individual countries for checking
                    print(country,df_entry_dict['Observable'])

                obslen                      = len(df_entry_dict['Observable'])
                df_entry_dict['Time']       = np.arange(obslen)

                for measurename in measureIDs.keys():
                    implementdate = measure_data.FindMeasure(country, measurename, measure_level)
                    if not implementdate is None:
                        vec = date2vector(implementdate = implementdate, start = trajectories[country]['startdate'], shiftdays = shiftdays)[:obslen]
                    else:
                        vec = np.zeros(obslen)
                    df_entry_dict[CleanUpMeasureName(measurename)] = vec

                if regressionDF is None:
                    regressionDF = pd.DataFrame(df_entry_dict)
                else:
                    regressionDF = regressionDF.append(pd.DataFrame(df_entry_dict))
    
    return regressionDF, cleaned_measurelist


# generate all data into pandas DF for regression
regrDF,ml = GetRegressionDF(countrylist = measure_data.countrylist)

# model formula
formula = 'Observable ~ Time + C(Country) + ' + ' + '.join(ml)
print(formula + '\n')

model   = smf.glm(formula = formula, data = regrDF)
results = model.fit()

print(results.summary())


Observable ~ Time + C(Country) + Border restriction + Cancellation of international flights + Closure of educational institutions + Curfew + Educate and actively communicate with the public + Encourage self-initiated quarantine + Enhance healthcare capacity + Enhance the system + Individual movement restrictions + Mass gathering cancellation + National lockdown + Scale up emergency response mechanisms + Specific health channel for travellers + Stop public transports + Activate communication to the public + Activate emergency response mechanisms + Actively communicate with healthcare professionals + Actively communicate with stakeholders + Airport health check + Airport restriction + Airports closing for domestic flights + Border health check + Call for return of nationals living abroad + Cordon sanitaire/mandatory quarantine + Crisis management plans + Enhance emergency response mechanisms + Enhance laboratory testing + Isolation of cases + Mandatory quarantine of the region arlberg in

SyntaxError: invalid syntax (<unknown>, line 1)

In [9]:
print(results.summary().as_csv())
#print(results.summary(results = 'csv'))

                      OLS Regression Results                     
Dep. Variable:   ,Observable      ,  R-squared:         ,   0.475
Model:           ,OLS             ,  Adj. R-squared:    ,   0.405
Method:          ,Least Squares   ,  F-statistic:       ,   6.789
Date:            ,Thu, 02 Apr 2020,  Prob (F-statistic):,1.16e-61
Time:            ,13:52:24        ,  Log-Likelihood:    ,  711.83
No. Observations:,   927          ,  AIC:               ,  -1204.
Df Residuals:    ,   817          ,  BIC:               ,  -672.2
Df Model:        ,   109          ,                     ,        
Covariance Type: ,nonrobust       ,                     ,        
                                                 ,   coef   , std err ,    t    ,P>|t| ,  [0.025 ,  0.975] 
Intercept                                        ,    0.2331,    0.071,    3.299, 0.001,    0.094,    0.372
C(Country)[T.Austria]                            ,   -0.0200,    0.159,   -0.126, 0.900,   -0.331,    0.291
C(Country)[T.Bel