In [471]:
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import datetime as dt
import csv
from linear_model import LeastSquaresBias


In [472]:
def cosine_distance(X1,X2):
    """Pairwise cosine distance between the rows of X1 and the rows of X2.

    Parameters
    ----------
    X1 : array-like, shape (n1, d)
        One data vector per row.
    X2 : array-like, shape (n2, d)
        One data vector per row.

    Returns
    -------
    ndarray, shape (n1, n2)
        Entry (i, j) is ``1 - cos(X1[i], X2[j])``. Where the cosine is
        undefined (for example when either row has zero norm) the similarity
        is treated as 0, so the distance is 1.
    """
    X1 = np.asarray(X1)
    X2 = np.asarray(X2)
    # Promote integer/bool inputs to float before squaring: small integer
    # dtypes overflow in X**2 and in the dot product, which silently
    # corrupts the norms and therefore the distances.
    if not np.issubdtype(X1.dtype, np.inexact):
        X1 = X1.astype(float)
    if not np.issubdtype(X2.dtype, np.inexact):
        X2 = X2.astype(float)

    norm1=np.sqrt(np.sum(X1**2,axis=1)) # norm of each row of X1, length n1
    norm2=np.sqrt(np.sum(X2**2,axis=1)) # norm of each row of X2, length n2
    dem=np.outer(norm1,norm2) # (n1, n2) matrix of norm products
    num=np.dot(X1,X2.T)       # (n1, n2) matrix of inner products

    # 0/0 is expected for zero-norm rows and is handled just below, so keep
    # numpy from emitting RuntimeWarnings for it.
    with np.errstate(divide="ignore", invalid="ignore"):
        tem=num/dem
    tem[np.isnan(tem)]=0

    return 1-tem

def euclidean_dist_squared(X, Xtest):
    """Squared Euclidean distance between every row of X and every row of Xtest.

    Uses the expansion ||x - z||^2 = ||x||^2 + ||z||^2 - 2 * x.z so that the
    whole (n, t) matrix is produced by a single matrix product.

    Parameters
    ----------
    X : ndarray, shape (n, d)
    Xtest : ndarray, shape (t, d)

    Returns
    -------
    ndarray, shape (n, t)
        Entry (i, j) is the squared distance between X[i] and Xtest[j].
    """
    row_sq_norms = np.sum(np.square(X), axis=1)
    test_sq_norms = np.sum(np.square(Xtest), axis=1)
    cross_terms = np.dot(X, Xtest.T)
    # Column vector + row vector broadcasts to the full (n, t) grid.
    return row_sq_norms[:, None] + test_sq_norms[None] - 2 * cross_terms
    
def rmse(predictions, targets):
    """Root-mean-square error between `predictions` and `targets`.

    Both arguments must support elementwise subtraction and the result must
    expose ``.mean()`` (e.g. numpy arrays or pandas Series).
    """
    residuals = predictions - targets
    mean_squared_error = (residuals ** 2).mean()
    return np.sqrt(mean_squared_error)

In [473]:
# Read the phase-1 training CSV from ../data into a long-format frame
# (one row per country_id / date pair).
filename = "phase1_training_data.csv"
data_path = os.path.join("..", "data", filename)
with open(data_path, "rb") as f:
    df0 = pd.read_csv(f)

df0.head()

Unnamed: 0,country_id,date,cases,deaths,cases_14_100k,cases_100k
0,AD,12/31/2019,0,0,0.0,0.0
1,AD,1/1/2020,0,0,0.0,0.0
2,AD,1/2/2020,0,0,0.0,0.0
3,AD,1/3/2020,0,0,0.0,0.0
4,AD,1/4/2020,0,0,0.0,0.0


In [474]:
# Reshape to wide format: one row per date, one column per (metric, country).
value_columns = ['deaths','cases','cases_14_100k','cases_100k']
df = df0.pivot_table(index="date", columns='country_id', values=value_columns)

# The index holds dates as "%m/%d/%Y" strings, which do not sort
# chronologically as text, so parse them and reorder the rows by real date.
dates = [dt.datetime.strptime(label, "%m/%d/%Y").date() for label in df.index.values]
chronological_order = np.argsort(dates)
df = df.iloc[chronological_order, :]
df.head()


Unnamed: 0_level_0,cases,cases,cases,cases,cases,cases,cases,cases,cases,cases,...,deaths,deaths,deaths,deaths,deaths,deaths,deaths,deaths,deaths,deaths
country_id,AD,AE,AF,AG,AI,AL,AM,AO,AR,AT,...,VC,VE,VG,VI,VN,XK,YE,ZA,ZM,ZW
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
12/31/2019,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1/1/2020,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1/2/2020,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1/3/2020,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1/4/2020,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [476]:
# Keep only the death counts: selecting the top-level 'deaths' key leaves a
# frame indexed by date with one column per country_id.
df_deaths = df.loc[:, 'deaths']
df_deaths.head()

country_id,AD,AE,AF,AG,AI,AL,AM,AO,AR,AT,...,VC,VE,VG,VI,VN,XK,YE,ZA,ZM,ZW
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12/31/2019,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1/1/2020,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1/2/2020,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1/3/2020,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1/4/2020,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [477]:
# Daily deaths: row-to-row change in each country's death count.
# The first row has no predecessor, so it comes out as NaN.
df_diff0 = df_deaths.diff(periods=1, axis=0)


In [486]:
# Restrict the daily-death table to rows 180..279 (a 100-row window).
df_diff = df_diff0.iloc[180:280]
print(df_diff.shape)

# Squared Euclidean distance between Canada's daily-death series and every
# country's series over that window (countries become rows after .T).
canada_series = df_diff['CA'].to_numpy()[None]
all_country_series = df_diff.to_numpy().T
euclid_dis = euclidean_dist_squared(canada_series, all_country_series)

# The ten countries closest to Canada in terms of daily deaths, nearest first.
nearest_first = np.argsort(euclid_dis.flatten())
df_diff.columns.values[nearest_first[range(10)]]

(100, 208)


array(['CA', 'DE', 'PL', 'DZ', 'BE', 'AM', 'MD', 'PT', 'MK', 'BY'],
      dtype=object)

In [None]:
### Fitting linear regression model after feature selection:

In [487]:
# Build an autoregressive feature table for Canada: the daily-death series
# plus copies of it shifted back by 1, 2 and 3 rows.
daily_death_ca = df_diff0['CA']
daily_death_ca_lag1, daily_death_ca_lag2, daily_death_ca_lag3 = (
    daily_death_ca.shift(periods=lag) for lag in (1, 2, 3)
)

lagged_series = [
    daily_death_ca,
    daily_death_ca_lag1,
    daily_death_ca_lag2,
    daily_death_ca_lag3,
]
feature_space = pd.concat(lagged_series, axis=1)
feature_space.columns = [
    "daily_death_ca",
    "daily_death_ca_lag1",
    "daily_death_ca_lag2",
    "daily_death_ca_lag3",
]

# Same 100-row window (rows 180..279) used for the country comparison.
fs_sub = feature_space.iloc[180:280]

print(fs_sub.head())


           daily_death_ca  daily_death_ca_lag1  daily_death_ca_lag2  \
date                                                                  
6/28/2020             8.0                  4.0                 20.0   
6/29/2020             6.0                  8.0                  4.0   
6/30/2020            44.0                  6.0                  8.0   
7/1/2020             25.0                 44.0                  6.0   
7/2/2020              0.0                 25.0                 44.0   

           daily_death_ca_lag3  
date                            
6/28/2020                 30.0  
6/29/2020                 20.0  
6/30/2020                  4.0  
7/1/2020                   8.0  
7/2/2020                   6.0  


In [488]:
# Fit a least-squares model (LeastSquaresBias from the local linear_model
# module) that predicts Canada's daily deaths from its three lagged values,
# using rows 180..279 of the feature table.
model=LeastSquaresBias()
# Features: the three lag columns (positions 1-3).
X=feature_space.iloc[180:280,1:4]
# Target: same-day daily deaths (position 0).
y=feature_space.iloc[180:280,0]
model.fit(X=X,y=y)
# NOTE(review): four weights are printed for three features; presumably the
# first one is the bias/intercept term -- confirm against linear_model.
print(model.w)

[ 3.08624767  0.46034715 -0.12275157  0.38948886]


In [489]:
# Roll the fitted model forward 11 steps. Each prediction is appended to the
# table so that it becomes a lag input for the following steps.
dat_pred = feature_space
for i in range(11):
    # The three most recent values of column 0 (newest first) play the role
    # of lag1, lag2 and lag3 for the row about to be predicted.
    lag1, lag2, lag3 = (dat_pred.iloc[-back, 0] for back in (1, 2, 3))
    new_data = np.array([[lag1, lag2, lag3]])
    print(new_data)
    y_pred = model.predict(X=new_data)
    # New row layout matches the table: [prediction, lag1, lag2, lag3].
    next_row = pd.DataFrame(
        np.append(y_pred, new_data[0])[None],
        columns=dat_pred.columns.values,
    )
    dat_pred = pd.concat([dat_pred, next_row], axis=0)

    


[[19. 53. 90.]]
[[40.38100793 19.         53.        ]]
[[39.98615935 40.38100793 19.        ]]
[[23.93721845 39.98615935 40.38100793]]
[[24.92526699 23.93721845 39.98615935]]
[[27.19635583 24.92526699 23.93721845]]
[[21.86967688 27.19635583 24.92526699]]
[[19.52360961 21.86967688 27.19635583]]
[[19.98202622 19.52360961 21.86967688]]
[[18.40635831 19.98202622 19.52360961]]
[[16.71096566 18.40635831 19.98202622]]


In [490]:
# Reference series: the daily deaths in rows 1..279 of the table followed by
# 11 hard-coded daily values for the forecast horizon, accumulated into a
# running total. Row 0 is skipped (it is the NaN produced by the diff).
holdout_daily_deaths = np.array([26,11,16,28,23,5,14,27,10,35,23])
history_daily_deaths = dat_pred.iloc[1:280,0]
true_deaths_CA = np.cumsum(np.append(history_daily_deaths, holdout_daily_deaths))

# Model series: the same history followed by the 11 predicted daily values,
# accumulated the same way.
pred_deaths_CA = np.cumsum(dat_pred.iloc[1:,0])

# Score only the 11 forecast days.
print(rmse(true_deaths_CA[-11:], pred_deaths_CA[-11:]))


57.66428703559427


In [492]:


# Second model: the same 3-lag least-squares setup, but fitted on Canada's
# death counts taken directly from df_deaths (not the day-over-day
# differences used by the first model).
# NOTE: this cell rebinds feature_space, model, X, y and dat_pred, so the
# earlier daily-death cells must be re-run before they are used again.
death_ca=df_deaths['CA']
print(death_ca)
death_ca_lag1=death_ca.shift(periods=1)
death_ca_lag2=death_ca.shift(periods=2)
death_ca_lag3=death_ca.shift(periods=3)

# Feature table: column 0 is the target, columns 1-3 are its lags.
feature_space=pd.concat([death_ca,death_ca_lag1,death_ca_lag2,death_ca_lag3],axis=1)
feature_space.columns=["death_ca","death_ca_lag1","death_ca_lag2","death_ca_lag3"]

# Fit on rows 180..279, the same window as the daily-death model.
model=LeastSquaresBias()
X=feature_space.iloc[180:280,1:4]
y=feature_space.iloc[180:280,0]
model.fit(X=X,y=y)

# Recursive 11-step forecast: each prediction is appended to the table and
# feeds the lag inputs of the next step.
dat_pred = feature_space
for i in range(11):
    # Newest three values of column 0, newest first -> lag1, lag2, lag3.
    new_data = np.array([dat_pred.iloc[-1,0], dat_pred.iloc[-2,0], dat_pred.iloc[-3,0]])[None]
    y_pred = model.predict(X=new_data)
    # Appended row layout: [prediction, lag1, lag2, lag3].
    dat_pred = pd.concat([dat_pred, pd.DataFrame(np.append(y_pred, new_data[0])[None], columns=dat_pred.columns.values)], axis=0)

# Column 0 now holds the observed counts followed by the 11 predictions.
pred_deaths_CA2 = dat_pred.iloc[:,0]

# RMSE over the 11 forecast days of the simple average of both models.
# NOTE(review): the two Series are added as pandas objects, so this relies on
# their last 11 index labels lining up -- confirm if either forecast loop changes.
print(rmse(true_deaths_CA[-11:],(pred_deaths_CA2[-11:]+pred_deaths_CA[-11:])/2))

date
12/31/2019       0
1/1/2020         0
1/2/2020         0
1/3/2020         0
1/4/2020         0
              ... 
10/1/2020     9297
10/2/2020     9319
10/3/2020     9409
10/4/2020     9462
10/5/2020     9481
Name: CA, Length: 280, dtype: int64
17.73958226350398


In [494]:
# Final forecast: the element-wise mean of the two models' last 11 values,
# written to ../data/prediction.csv without the index column.
ensemble_total = pred_deaths_CA2[-11:] + pred_deaths_CA[-11:]
prediction = ensemble_total / 2
output_path = "../data/prediction.csv"
prediction.to_csv(output_path, sep=",", index=False)