In [1]:
import pandas as pd
import numpy as np

In [2]:
pd_JH_data = pd.read_csv('../data/processed/COVID_relational_confirmed.csv',sep=';', parse_dates=[0])
pd_JH_data = pd_JH_data.sort_values('date', ascending=True).reset_index(drop=True).copy()
pd_JH_data

Unnamed: 0,date,state,country,confirmed
0,2020-01-22,Alberta,Canada,0.0
1,2020-01-22,no,"Korea, South",1.0
2,2020-01-22,no,Kosovo,0.0
3,2020-01-22,no,Kuwait,0.0
4,2020-01-22,no,Kyrgyzstan,0.0
...,...,...,...,...
62239,2020-09-11,no,Barbados,180.0
62240,2020-09-11,no,Belarus,73784.0
62241,2020-09-11,no,Belgium,91537.0
62242,2020-09-11,no,Albania,11021.0


In [3]:
test_data = pd_JH_data[((pd_JH_data['country']=='US')|
                      (pd_JH_data['country']=='Germany'))&
                       (pd_JH_data['date']> '2020-03-20')]

In [4]:
test_data

Unnamed: 0,date,state,country,confirmed
15737,2020-03-21,no,Germany,22213.0
15776,2020-03-21,no,US,24508.0
16002,2020-03-22,no,Germany,24873.0
16041,2020-03-22,no,US,33152.0
16269,2020-03-23,no,Germany,29056.0
...,...,...,...,...
61527,2020-09-09,no,US,6360212.0
61755,2020-09-10,no,Germany,258149.0
61796,2020-09-10,no,US,6396100.0
62020,2020-09-11,no,Germany,259735.0


In [5]:
test_data.groupby(['country']).agg(np.max)

Unnamed: 0_level_0,date,state,confirmed
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Germany,2020-09-11,no,259735.0
US,2020-09-11,no,6443743.0


In [10]:
import numpy as np
from sklearn import linear_model
reg = linear_model.LinearRegression(fit_intercept=True)
import pandas as pd

from scipy import signal


def get_doubling_time_via_regression(in_array):

    y = np.array(in_array)
    X = np.arange(-1,2).reshape(-1, 1)

    assert len(in_array)==3
    reg.fit(X,y)
    intercept=reg.intercept_
    slope=reg.coef_

    return intercept/slope

def savgol_filter(df_input,column='confirmed',window=5):

    window=5,
    degree=1
    df_result=df_input

    filter_in=df_input[column].fillna(0)

    result=signal.savgol_filter(np.array(filter_in),
                           5,
                           1)
    df_result[column+'_filtered']=result
    return df_result

def rolling_reg(df_input,col='confirmed'):

    days_back=3
    result=df_input[col].rolling(
                window=days_back,
                min_periods=days_back).apply(get_doubling_time_via_regression,raw=False)
    return result

def calc_filtered_data(df_input,filter_on='confirmed'):

    must_contain=set(['state','country',filter_on])
    assert must_contain.issubset(set(df_input.columns)), ' Erro in calc_filtered_data not all columns in data frame'
    pd_filtered_result=df_input[['state','country',filter_on]].groupby(['state','country']).apply(savgol_filter).reset_index()

    df_output=pd.merge(df_input,pd_filtered_result[['index', filter_on+'_filtered']],on = ['index'], how='left')


    return df_output

def calc_doubling_rate(df_input,filter_on='confirmed'):
    must_contain=set(['state','country',filter_on])
    assert must_contain.issubset(set(df_input.columns)), ' Erro in calc_filtered_data not all columns in data frame'

    pd_DR_result= df_input.groupby(['state','country']).apply(rolling_reg,filter_on).reset_index()

    pd_DR_result=pd_DR_result.rename(columns={filter_on:filter_on+'_DR',
                             'level_2':'index'})

    df_output=pd.merge(df_input,pd_DR_result[['index', filter_on+'_DR']], on=['index'],how='left')
    return df_output

if __name__ == '__main__':
    test_data_reg=np.array([2,4,6])
    result=get_doubling_time_via_regression(test_data_reg)
    print('the test slope is: '+str(result))
    pd_JH_data=pd.read_csv('../data/processed/COVID_relational_confirmed.csv',sep=';',parse_dates=[0])
    pd_JH_data= pd_JH_data.sort_values('date', ascending=True).reset_index().copy()

    pd_result_larg=calc_filtered_data(pd_JH_data)
    pd_result_larg=calc_doubling_rate(pd_result_larg)
    pd_result_larg=calc_doubling_rate(pd_result_larg,'confirmed_filtered')
    print(pd_result_larg.head())


the test slope is: [2.]
   index       date    state       country  confirmed  confirmed_filtered  \
0      0 2020-01-22  Alberta        Canada        0.0                 0.0   
1    169 2020-01-22       no  Korea, South        1.0                -4.8   
2    170 2020-01-22       no        Kosovo        0.0                 0.0   
3    171 2020-01-22       no        Kuwait        0.0                 0.0   
4    172 2020-01-22       no    Kyrgyzstan        0.0                10.8   

   confirmed_DR  confirmed_filtered_DR  
0           NaN                    NaN  
1           NaN                    NaN  
2           NaN                    NaN  
3           NaN                    NaN  
4           NaN                    NaN  


In [12]:
test_data.groupby(['state','country']).agg(np.max)

Unnamed: 0_level_0,Unnamed: 1_level_0,date,confirmed
state,country,Unnamed: 2_level_1,Unnamed: 3_level_1
no,Germany,2020-09-11,259735.0
no,US,2020-09-11,6443743.0


In [13]:
def rolling_reg(df_input,col='confirmed'):
    days_back=3
    result=df_input[col].rolling(
                window=days_back,
                min_periods=days_back).apply(get_doubling_time_via_regression,raw=False)
    return result

In [14]:
test_data[['state','country','confirmed']].groupby(['state','country']).apply(rolling_reg,'confirmed')

state  country       
no     Germany  15737           NaN
                16002           NaN
                16269      7.417994
                16535      7.142035
                16800      8.012983
                            ...    
       US       60995    225.472064
                61264    248.848116
                61527    212.427622
                61796    184.137066
                62062    153.236962
Name: confirmed, Length: 350, dtype: float64

In [15]:
pd_DR_result=pd_JH_data[['state','country','confirmed']].groupby(['state','country']).apply(rolling_reg,'confirmed').reset_index()

In [16]:
pd_DR_result=pd_DR_result.rename(columns={'confirmed':'confirmed_DR',
                             'level_2':'index'})
pd_DR_result.head()

Unnamed: 0,state,country,index,confirmed_DR
0,Alberta,Canada,0,
1,Alberta,Canada,465,
2,Alberta,Canada,701,
3,Alberta,Canada,966,
4,Alberta,Canada,1263,


In [17]:
pd_JH_data=pd_JH_data.reset_index()
pd_JH_data.head()

Unnamed: 0,level_0,index,date,state,country,confirmed
0,0,0,2020-01-22,Alberta,Canada,0.0
1,1,169,2020-01-22,no,"Korea, South",1.0
2,2,170,2020-01-22,no,Kosovo,0.0
3,3,171,2020-01-22,no,Kuwait,0.0
4,4,172,2020-01-22,no,Kyrgyzstan,0.0


In [18]:
pd_result_larg=pd.merge(pd_JH_data,pd_DR_result[['index','confirmed_DR']],on=['index'],how='left')
pd_result_larg.head()

Unnamed: 0,level_0,index,date,state,country,confirmed,confirmed_DR
0,0,0,2020-01-22,Alberta,Canada,0.0,
1,1,169,2020-01-22,no,"Korea, South",1.0,
2,2,170,2020-01-22,no,Kosovo,0.0,
3,3,171,2020-01-22,no,Kuwait,0.0,
4,4,172,2020-01-22,no,Kyrgyzstan,0.0,


In [19]:
#pd_result_larg[pd_result_larg['country']=='Germany']

# Filtering the data with groupby apply

In [20]:
from scipy import signal

def savgol_filter(df_input,column='confirmed',window=5):
   
    window=5, 
    degree=1
    df_result=df_input
    
    filter_in=df_input[column].fillna(0)
    
    result=signal.savgol_filter(np.array(filter_in),
                           5, 
                           1)
    df_result[column+'_filtered']=result
    return df_result
        

In [21]:
pd_filtered_result=pd_JH_data[['state','country','confirmed']].groupby(['state','country']).apply(savgol_filter).reset_index()

In [22]:
pd_result_larg=pd.merge(pd_result_larg,pd_filtered_result[['index','confirmed_filtered']],on=['index'],how='left')
pd_result_larg.head()

Unnamed: 0,level_0,index,date,state,country,confirmed,confirmed_DR,confirmed_filtered
0,0,0,2020-01-22,Alberta,Canada,0.0,,0.0
1,1,169,2020-01-22,no,"Korea, South",1.0,,-4.8
2,2,170,2020-01-22,no,Kosovo,0.0,,0.0
3,3,171,2020-01-22,no,Kuwait,0.0,,0.0
4,4,172,2020-01-22,no,Kyrgyzstan,0.0,,10.8


# Filtered Doubling rate

In [24]:
pd_filtered_doubling=pd_result_larg[['state','country','confirmed_filtered']].groupby(['state','country']).apply(rolling_reg,'confirmed_filtered').reset_index()

pd_filtered_doubling=pd_filtered_doubling.rename(columns={'confirmed_filtered':'confirmed_filtered_DR',
                             'level_2':'index'})

pd_filtered_doubling.head()

Unnamed: 0,state,country,index,confirmed_filtered_DR
0,Alberta,Canada,0,
1,Alberta,Canada,465,
2,Alberta,Canada,701,
3,Alberta,Canada,966,
4,Alberta,Canada,1263,


In [25]:
pd_result_larg=pd.merge(pd_result_larg,pd_filtered_doubling[['index','confirmed_filtered_DR']],on=['index'],how='left')
pd_result_larg.tail()

Unnamed: 0,level_0,index,date,state,country,confirmed,confirmed_DR,confirmed_filtered,confirmed_filtered_DR
62239,62239,62072,2020-09-11,no,Barbados,180.0,inf,10.0,-1.688215
62240,62240,62073,2020-09-11,no,Belarus,73784.0,1664.0,4994.6,0.870128
62241,62241,62074,2020-09-11,no,Belgium,91537.0,22.815752,2836.4,1.438172
62242,62242,62060,2020-09-11,no,Albania,11021.0,13.901821,6166.8,0.68902
62243,62243,62243,2020-09-11,no,Zimbabwe,7479.0,298.146667,7494.8,6.649687


In [27]:
mask=pd_result_larg['confirmed']>100
pd_result_larg['confirmed_filtered_DR']=pd_result_larg['confirmed_filtered_DR'].where(mask, other=np.NaN) 

In [28]:
pd_result_larg[pd_result_larg['country']=='Germany'].tail()

Unnamed: 0,level_0,index,date,state,country,confirmed,confirmed_DR,confirmed_filtered,confirmed_filtered_DR
60955,60955,61058,2020-09-07,no,Germany,253626.0,inf,13.0,0.76524
61223,61223,61324,2020-09-08,no,Germany,254957.0,9.042357,866.6,130.656295
61487,61487,61590,2020-09-09,no,Germany,256433.0,inf,13.0,91.949493
61755,61755,61856,2020-09-10,no,Germany,258149.0,inf,939.0,111.433257
62020,62020,62122,2020-09-11,no,Germany,259735.0,inf,975.2,6.321746


In [29]:
pd_result_larg.head()

Unnamed: 0,level_0,index,date,state,country,confirmed,confirmed_DR,confirmed_filtered,confirmed_filtered_DR
0,0,0,2020-01-22,Alberta,Canada,0.0,,0.0,
1,1,169,2020-01-22,no,"Korea, South",1.0,,-4.8,
2,2,170,2020-01-22,no,Kosovo,0.0,,0.0,
3,3,171,2020-01-22,no,Kuwait,0.0,,0.0,
4,4,172,2020-01-22,no,Kyrgyzstan,0.0,,10.8,


In [31]:
pd_result_larg.to_csv('../data/processed/COVID_final_set.csv',sep=';',index=False)