In [1]:
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import datetime as dt
import csv
from linear_model import LeastSquaresBias


In [2]:
def cosine_distance(X1,X2):
    """Pairwise cosine distance between the rows of X1 and the rows of X2.

    Parameters
    ----------
    X1 : 2-D array with n1 rows (one data vector per row).
    X2 : 2-D array with n2 rows and the same number of columns as X1.

    Returns
    -------
    An n1-by-n2 array whose (i, j) entry is 1 - cos(X1[i], X2[j]).
    Whenever the similarity is undefined (NaN, e.g. a zero-norm row gives
    0/0) it is treated as 0, so the reported distance is 1.
    """
    norm1 = np.sqrt(np.sum(X1**2, axis=1))  # norm of each row of X1, length n1
    norm2 = np.sqrt(np.sum(X2**2, axis=1))  # norm of each row of X2, length n2
    dem = np.outer(norm1, norm2)  # n1*n2 matrix of norm products
    num = np.dot(X1, X2.T)  # n1*n2 matrix of dot products

    # A zero-norm row produces 0/0 here. That case is handled explicitly on
    # the next line, so silence NumPy's floating-point warning instead of
    # letting it leak out (or raise under strict warning filters).
    with np.errstate(divide='ignore', invalid='ignore'):
        tem = num / dem
    tem[np.isnan(tem)] = 0

    return 1 - tem

def euclidean_dist_squared(X, Xtest):
    """Squared Euclidean distance between every row of X and every row of Xtest.

    Uses the expansion ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b so the whole
    table is produced by a single matrix product.

    Returns an array of shape (rows of X, rows of Xtest).
    """
    row_sq_norms = np.sum(X**2, axis=1)[:, None]      # column vector
    test_sq_norms = np.sum(Xtest**2, axis=1)[None]    # row vector
    cross_terms = np.dot(X, Xtest.T)
    return row_sq_norms + test_sq_norms - 2 * cross_terms
    
def rmse(predictions, targets):
    """Root-mean-square error between predictions and targets."""
    residuals = predictions - targets
    mean_squared_error = (residuals ** 2).mean()
    return np.sqrt(mean_squared_error)

In [6]:
# Read the training CSV from ../data into the raw frame df0.
filename = "phase2_training_data.csv"
data_path = os.path.join("..", "data", filename)
with open(data_path, "rb") as f:
    df0 = pd.read_csv(f)

# Preview the first rows of the raw frame.
df0.head()

Unnamed: 0,country_id,date,cases,deaths,cases_14_100k,cases_100k
0,AD,12/31/2019,0,0,0.00,0.00
1,AD,1/1/2020,0,0,0.00,0.00
2,AD,1/2/2020,0,0,0.00,0.00
3,AD,1/3/2020,0,0,0.00,0.00
4,AD,1/4/2020,0,0,0.00,0.00
...,...,...,...,...,...,...
62997,ZW,10/23/2020,8242,236,1.69,56.28
62998,ZW,10/24/2020,8257,236,1.69,56.38
62999,ZW,10/25/2020,8269,236,1.76,56.46
63000,,,0,0,0.00,0.00


In [8]:
# Reshape to wide form: one row per date string, one column per
# (measure, country_id) pair.
df = df0.pivot_table(
    index="date",
    columns='country_id',
    values=['deaths', 'cases', 'cases_14_100k', 'cases_100k'],
)

# The index holds month/day/year strings, so parse them into real dates and
# reorder the rows by those dates.
dates = [
    dt.datetime.strptime(date_str, "%m/%d/%Y").date()
    for date_str in df.index.values
]
chronological_order = np.argsort(dates)
df = df.iloc[chronological_order, :]
df


Unnamed: 0_level_0,cases,cases,cases,cases,cases,cases,cases,cases,cases,cases,...,deaths,deaths,deaths,deaths,deaths,deaths,deaths,deaths,deaths,deaths
country_id,AD,AE,AF,AG,AI,AL,AM,AO,AR,AT,...,VC,VE,VG,VI,VN,XK,YE,ZA,ZM,ZW
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
12/31/2019,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1/1/2020,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1/2/2020,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1/3/2020,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1/4/2020,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10/21/2020,3623,117594,40375,119,3,17651,66694,7829,1018986,68414,...,0,747,1,21,35,657,599,18656,346,233
10/22/2020,3811,119132,40510,122,3,17948,68530,8049,1037312,70769,...,0,753,1,21,35,657,599,18741,346,236
10/23/2020,3811,120710,40626,122,3,18250,70836,8338,1053637,73339,...,0,759,1,21,35,657,599,18843,346,236
10/24/2020,4038,122273,40687,122,3,18556,73310,8582,1069355,76781,...,0,764,1,21,35,659,599,18891,346,236


In [9]:
# Pull the 'deaths' block out of the pivoted frame: rows are dates and
# columns are country_id codes.
df_deaths = df['deaths']
df_deaths.head()

country_id,AD,AE,AF,AG,AI,AL,AM,AO,AR,AT,...,VC,VE,VG,VI,VN,XK,YE,ZA,ZM,ZW
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12/31/2019,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1/1/2020,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1/2/2020,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1/3/2020,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1/4/2020,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# daily deaths: difference between each row and the row before it, taken
# down the date axis (axis=0). The first row has no predecessor, so it is NaN.
# NOTE(review): this presumes df['deaths'] holds cumulative totals — confirm.
df_diff0 = df_deaths.diff(axis=0)


In [12]:
# Keep rows 200..299 of the daily-death table (a 100-row window).
df_diff = df_diff0.iloc[200:300]
print(df_diff.shape)

# Squared Euclidean distance from Canada's series (a single row vector) to
# every country's series (one row per country after the transpose).
euclid_dis = euclidean_dist_squared(
    df_diff['CA'].values.reshape(1, -1),
    df_diff.values.T,
)

# sorted countries close to Canada in terms of daily deaths (ten nearest)
df_diff.columns.values[np.argsort(euclid_dis.ravel())[np.arange(10)]]

(100, 209)


array(['CA', 'PT', 'HU', 'JO', 'DE', 'NP', 'MD', 'BG', 'PY', 'AM'],
      dtype=object)

In [13]:
### Fitting linear regression model after feature selection:

In [14]:
# Build the autoregressive features for Canada: the daily-death series next
# to copies of itself shifted down by 1, 2 and 3 rows.
daily_death_ca = df_diff0['CA']
daily_death_ca_lag1, daily_death_ca_lag2, daily_death_ca_lag3 = (
    daily_death_ca.shift(periods=lag) for lag in (1, 2, 3)
)

feature_space = pd.concat(
    [daily_death_ca, daily_death_ca_lag1, daily_death_ca_lag2, daily_death_ca_lag3],
    axis=1,
)
feature_space.columns = [
    "daily_death_ca",
    "daily_death_ca_lag1",
    "daily_death_ca_lag2",
    "daily_death_ca_lag3",
]

# Preview slice covering rows 180..279.
# NOTE(review): this window differs from the 200:300 window used elsewhere in
# the notebook — confirm which one is intended.
fs_sub = feature_space.iloc[180:280, :]

print(fs_sub.head())


           daily_death_ca  daily_death_ca_lag1  daily_death_ca_lag2  \
date                                                                  
6/28/2020             8.0                  4.0                 20.0   
6/29/2020             6.0                  8.0                  4.0   
6/30/2020            44.0                  6.0                  8.0   
7/1/2020             25.0                 44.0                  6.0   
7/2/2020              0.0                 25.0                 44.0   

           daily_death_ca_lag3  
date                            
6/28/2020                 30.0  
6/29/2020                 20.0  
6/30/2020                  4.0  
7/1/2020                   8.0  
7/2/2020                   6.0  


In [15]:
# Fit the least-squares model on rows 200..299: the three lag columns are the
# inputs and the same-day value (column 0) is the target.
model = LeastSquaresBias()
X, y = feature_space.iloc[200:300, 1:4], feature_space.iloc[200:300, 0]
model.fit(X=X, y=y)

# Show the fitted weight vector.
print(model.w)

[ 3.08587644  0.57279425 -0.21613878  0.38393823]


In [16]:
# Recursive multi-step forecast: each prediction is fed back in as the newest
# lag for the following step.
#
# The lag window is carried in a small array and the forecast rows are
# collected in a list, then attached to the history with a single concat.
# Growing the DataFrame with pd.concat inside the loop re-copies the whole
# frame on every step.

# Most recent three observed values, newest first: the model's lag1..lag3
# inputs for the first forecast step.
lags = np.array([
    feature_space.iloc[-1, 0],
    feature_space.iloc[-2, 0],
    feature_space.iloc[-3, 0],
])

forecast_rows = []
for i in range(11):
    new_data = lags[None]  # shape (1, 3), the layout passed to model.predict
    print(new_data)
    y_pred = model.predict(X=new_data)

    # Row layout matches feature_space: [value, lag1, lag2, lag3].
    forecast_row = np.append(y_pred, lags)
    forecast_rows.append(forecast_row)

    # Slide the window: the new prediction becomes lag1, the oldest lag drops.
    lags = forecast_row[:3]

# Every forecast row is labelled 0, exactly as the one-row frames appended
# step by step were, so the resulting dat_pred is unchanged.
dat_pred = pd.concat(
    [
        feature_space,
        pd.DataFrame(
            forecast_rows,
            columns=feature_space.columns.values,
            index=[0] * len(forecast_rows),
        ),
    ],
    axis=0,
)

    


[[34. 26. 33.]]
[[29.61123411 34.         26.        ]]
[[22.68069641 29.61123411 34.        ]]
[[22.73101262 22.68069641 29.61123411]]
[[22.57277641 22.73101262 22.68069641]]
[[19.81036598 22.57277641 22.73101262]]
[[18.28159247 19.81036598 22.57277641]]
[[17.94223089 18.28159247 19.81036598]]
[[17.0176788  17.94223089 18.28159247]]
[[15.97449529 17.0176788  17.94223089]]
[[15.44650345 15.97449529 17.0176788 ]]
