# Point Prediction

In [16]:
import ast
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeRegressor

In [2]:
# Load the per-state daily table, order it chronologically (then by state
# name) and renumber the rows 0..n-1 so positions follow that ordering.
df = (
    pd.read_csv("data/final_df_drop.csv", index_col=0)
    .sort_values(["Date", "State"])
    .reset_index(drop=True)
)
df

Unnamed: 0,State,Date,population,confirmed,deaths,incident_rate,mortality_rate,testing_rate,TestsReported
0,Alabama,2020-04-12,4903185.0,3563,93,75.99,2.61,460.30,2165
1,Alaska,2020-04-12,731545.0,272,8,45.50,2.94,1344.71,262
2,Arizona,2020-04-12,7278717.0,3542,115,48.66,3.25,578.52,1119
3,Arkansas,2020-04-12,3017804.0,1280,27,49.44,2.11,761.75,625
4,California,2020-04-12,39512223.0,22795,640,58.14,2.81,485.42,10816
...,...,...,...,...,...,...,...,...,...
7171,Virginia,2020-08-27,8535519.0,116569,2527,1365.69,2.17,17661.98,17503
7172,Washington,2020-08-27,7614893.0,72703,1890,954.75,2.60,18270.89,14511
7173,West Virginia,2020-08-27,1792147.0,9633,198,537.51,2.06,23090.13,5681
7174,Wisconsin,2020-08-27,5822434.0,73138,1111,1256.14,1.52,21065.04,28825


## Linear Regression (LR)

In [9]:
# Walk-forward evaluation of a linear regression: for every date after the
# first two, fit on all rows from strictly earlier dates and predict
# "TestsReported" for that date's rows.
all_dates = list(df.Date.unique())[2:]

# Predictions are written straight into a copy of `df` through a row mask, so
# the rows that receive a prediction are exactly those of `all_dates` (no
# separately hard-coded start date to keep in sync) and every other row keeps
# NaN in "Pres".
final_pred = df.copy()
final_pred["Pres"] = np.nan

for date in all_dates:
    # Dates are zero-padded "YYYY-MM-DD" strings, which sort like calendar
    # dates, so `< date` selects everything up to and including the day before.
    past_rows = df.Date < date
    current_rows = df.Date == date

    # Features: every column from position 2 up to (not including) the last
    # column, which holds the target.
    X_train = df[past_rows].iloc[:, 2:-1]
    y_train = df.loc[past_rows, "TestsReported"]
    X_test = df[current_rows].iloc[:, 2:-1]

    model = LinearRegression()
    model.fit(X_train, y_train)
    pred = model.predict(X_test)

    # `pred` follows the row order of `X_test`, i.e. of the masked rows.
    final_pred.loc[current_rows, "Pres"] = pred

final_pred

Unnamed: 0,State,Date,population,confirmed,deaths,incident_rate,mortality_rate,testing_rate,TestsReported,Pres
0,Alabama,2020-04-12,4903185.0,3563,93,75.99,2.61,460.30,2165,
1,Alaska,2020-04-12,731545.0,272,8,45.50,2.94,1344.71,262,
2,Arizona,2020-04-12,7278717.0,3542,115,48.66,3.25,578.52,1119,
3,Arkansas,2020-04-12,3017804.0,1280,27,49.44,2.11,761.75,625,
4,California,2020-04-12,39512223.0,22795,640,58.14,2.81,485.42,10816,
...,...,...,...,...,...,...,...,...,...,...
7171,Virginia,2020-08-27,8535519.0,116569,2527,1365.69,2.17,17661.98,17503,22682.852198
7172,Washington,2020-08-27,7614893.0,72703,1890,954.75,2.60,18270.89,14511,18987.467254
7173,West Virginia,2020-08-27,1792147.0,9633,198,537.51,2.06,23090.13,5681,10069.425643
7174,Wisconsin,2020-08-27,5822434.0,73138,1111,1256.14,1.52,21065.04,28825,17281.395183


In [10]:
# Persist the linear-regression predictions; the row index is written as the
# first (unnamed) CSV column, which is the pandas default made explicit here.
final_pred.to_csv("data/pred_lr.csv", index=True)

## Decision Tree

In [14]:
# Walk-forward evaluation of a grid-searched decision tree: for every date
# after the first two, tune and fit on all rows from strictly earlier dates,
# then predict "TestsReported" for that date's rows.
all_dates = list(df.Date.unique())[2:]
parameters = {'min_samples_split': [2, 3, 4], 'min_samples_leaf':[1, 2, 3, 5]}

# Predictions are written straight into a copy of `df` through a row mask, so
# the rows that receive a prediction are exactly those of `all_dates` (no
# separately hard-coded start date to keep in sync) and every other row keeps
# NaN in "Pres".
final_pred = df.copy()
final_pred["Pres"] = np.nan

for date in all_dates:
    print(date)
    # Dates are zero-padded "YYYY-MM-DD" strings, which sort like calendar
    # dates, so `< date` selects everything up to and including the day before.
    past_rows = df.Date < date
    current_rows = df.Date == date

    # Features: every column from position 2 up to (not including) the last
    # column, which holds the target.
    X_train = df[past_rows].iloc[:, 2:-1]
    y_train = df.loc[past_rows, "TestsReported"]
    X_test = df[current_rows].iloc[:, 2:-1]

    # Fixed seed: scikit-learn permutes candidate features at each split, so
    # an unseeded tree can differ between runs when splits tie.
    model = DecisionTreeRegressor(random_state=0)
    gs = GridSearchCV(model, parameters)
    gs.fit(X_train, y_train)
    pred = gs.predict(X_test)

    # `pred` follows the row order of `X_test`, i.e. of the masked rows.
    final_pred.loc[current_rows, "Pres"] = pred

final_pred

2020-04-14
2020-04-15
2020-04-16
2020-04-17
2020-04-18
2020-04-19
2020-04-20
2020-04-21
2020-04-22
2020-04-23
2020-04-24
2020-04-25
2020-04-26
2020-04-27
2020-04-28
2020-04-29
2020-04-30
2020-05-01
2020-05-02
2020-05-03
2020-05-04
2020-05-05
2020-05-06
2020-05-07
2020-05-08
2020-05-09
2020-05-10
2020-05-11
2020-05-12
2020-05-13
2020-05-14
2020-05-15
2020-05-16
2020-05-17
2020-05-18
2020-05-19
2020-05-20
2020-05-21
2020-05-22
2020-05-23
2020-05-24
2020-05-25
2020-05-26
2020-05-27
2020-05-28
2020-05-29
2020-05-30
2020-05-31
2020-06-01
2020-06-02
2020-06-03
2020-06-04
2020-06-05
2020-06-06
2020-06-07
2020-06-08
2020-06-09
2020-06-10
2020-06-11
2020-06-12
2020-06-13
2020-06-14
2020-06-15
2020-06-16
2020-06-17
2020-06-18
2020-06-19
2020-06-20
2020-06-21
2020-06-22
2020-06-23
2020-06-24
2020-06-25
2020-06-26
2020-06-27
2020-06-28
2020-06-29
2020-06-30
2020-07-01
2020-07-02
2020-07-03
2020-07-04
2020-07-05
2020-07-06
2020-07-07
2020-07-08
2020-07-09
2020-07-10
2020-07-11
2020-07-12
2020-07-13

Unnamed: 0,State,Date,population,confirmed,deaths,incident_rate,mortality_rate,testing_rate,TestsReported,Pres
0,Alabama,2020-04-12,4903185.0,3563,93,75.99,2.61,460.30,2165,
1,Alaska,2020-04-12,731545.0,272,8,45.50,2.94,1344.71,262,
2,Arizona,2020-04-12,7278717.0,3542,115,48.66,3.25,578.52,1119,
3,Arkansas,2020-04-12,3017804.0,1280,27,49.44,2.11,761.75,625,
4,California,2020-04-12,39512223.0,22795,640,58.14,2.81,485.42,10816,
...,...,...,...,...,...,...,...,...,...,...
7171,Virginia,2020-08-27,8535519.0,116569,2527,1365.69,2.17,17661.98,17503,18176.875000
7172,Washington,2020-08-27,7614893.0,72703,1890,954.75,2.60,18270.89,14511,13020.333333
7173,West Virginia,2020-08-27,1792147.0,9633,198,537.51,2.06,23090.13,5681,4497.000000
7174,Wisconsin,2020-08-27,5822434.0,73138,1111,1256.14,1.52,21065.04,28825,15867.875000


In [15]:
# Persist the decision-tree predictions; the row index is written as the
# first (unnamed) CSV column, which is the pandas default made explicit here.
final_pred.to_csv("data/pred_tree.csv", index=True)

## Random Forest (RF)

In [20]:
# Walk-forward evaluation of a grid-searched random forest: for every date
# after the first two, tune and fit on all rows from strictly earlier dates,
# then predict "TestsReported" for that date's rows.
all_dates = list(df.Date.unique())[2:]
parameters = {'n_estimators' : [20, 40], 'min_samples_split': [2, 3, 4], 'min_samples_leaf':[1, 2, 3, 5]}

# Predictions are written straight into a copy of `df` through a row mask, so
# the rows that receive a prediction are exactly those of `all_dates` (no
# separately hard-coded start date to keep in sync) and every other row keeps
# NaN in "Pres".
final_pred = df.copy()
final_pred["Pres"] = np.nan

for date in all_dates:
    print(date)
    # Dates are zero-padded "YYYY-MM-DD" strings, which sort like calendar
    # dates, so `< date` selects everything up to and including the day before.
    past_rows = df.Date < date
    current_rows = df.Date == date

    # Features: every column from position 2 up to (not including) the last
    # column, which holds the target.
    X_train = df[past_rows].iloc[:, 2:-1]
    y_train = df.loc[past_rows, "TestsReported"]
    X_test = df[current_rows].iloc[:, 2:-1]

    # Fixed seed: bootstrap sampling and feature sub-sampling are random, so
    # an unseeded forest gives different predictions on every run.
    model = RandomForestRegressor(random_state=0)
    gs = GridSearchCV(model, parameters)
    gs.fit(X_train, y_train)
    pred = gs.predict(X_test)

    # `pred` follows the row order of `X_test`, i.e. of the masked rows.
    final_pred.loc[current_rows, "Pres"] = pred

final_pred

2020-04-14
2020-04-15
2020-04-16
2020-04-17
2020-04-18
2020-04-19
2020-04-20
2020-04-21
2020-04-22
2020-04-23
2020-04-24
2020-04-25
2020-04-26
2020-04-27
2020-04-28
2020-04-29
2020-04-30
2020-05-01
2020-05-02
2020-05-03
2020-05-04
2020-05-05
2020-05-06
2020-05-07
2020-05-08
2020-05-09
2020-05-10
2020-05-11
2020-05-12
2020-05-13
2020-05-14
2020-05-15
2020-05-16
2020-05-17
2020-05-18
2020-05-19
2020-05-20
2020-05-21
2020-05-22
2020-05-23
2020-05-24
2020-05-25
2020-05-26
2020-05-27
2020-05-28
2020-05-29
2020-05-30
2020-05-31
2020-06-01
2020-06-02
2020-06-03
2020-06-04
2020-06-05
2020-06-06
2020-06-07
2020-06-08
2020-06-09
2020-06-10
2020-06-11
2020-06-12
2020-06-13
2020-06-14
2020-06-15
2020-06-16
2020-06-17
2020-06-18
2020-06-19
2020-06-20
2020-06-21
2020-06-22
2020-06-23
2020-06-24
2020-06-25
2020-06-26
2020-06-27
2020-06-28
2020-06-29
2020-06-30
2020-07-01
2020-07-02
2020-07-03
2020-07-04
2020-07-05
2020-07-06
2020-07-07
2020-07-08
2020-07-09
2020-07-10
2020-07-11
2020-07-12
2020-07-13

Unnamed: 0,State,Date,population,confirmed,deaths,incident_rate,mortality_rate,testing_rate,TestsReported,Pres
0,Alabama,2020-04-12,4903185.0,3563,93,75.99,2.61,460.30,2165,
1,Alaska,2020-04-12,731545.0,272,8,45.50,2.94,1344.71,262,
2,Arizona,2020-04-12,7278717.0,3542,115,48.66,3.25,578.52,1119,
3,Arkansas,2020-04-12,3017804.0,1280,27,49.44,2.11,761.75,625,
4,California,2020-04-12,39512223.0,22795,640,58.14,2.81,485.42,10816,
...,...,...,...,...,...,...,...,...,...,...
7171,Virginia,2020-08-27,8535519.0,116569,2527,1365.69,2.17,17661.98,17503,18015.494668
7172,Washington,2020-08-27,7614893.0,72703,1890,954.75,2.60,18270.89,14511,14883.548948
7173,West Virginia,2020-08-27,1792147.0,9633,198,537.51,2.06,23090.13,5681,5081.732487
7174,Wisconsin,2020-08-27,5822434.0,73138,1111,1256.14,1.52,21065.04,28825,20240.256667


In [21]:
# Persist the random-forest predictions; the row index is written as the
# first (unnamed) CSV column, which is the pandas default made explicit here.
final_pred.to_csv("data/pred_rf.csv", index=True)

# Temporal Prescriptive

In [31]:
# Load the temporal-neighbour table. NOTE(review): the list-like "Neighbors"
# and "Value" cells come back from read_csv as plain strings, not lists.
pred_temp = pd.read_csv(filepath_or_buffer="data/pred_temporal.csv")
pred_temp

Unnamed: 0.1,Unnamed: 0,State,Date,population,confirmed,deaths,incident_rate,mortality_rate,testing_rate,TestsReported,Neighbors,Value
0,0,Alabama,2020-04-12,0.111072,0.004762,0.002825,0.014638,0.267967,0.008417,2165,,
1,1,Alaska,2020-04-12,0.003924,0.000003,0.000243,0.004809,0.301848,0.027350,262,,
2,2,Arizona,2020-04-12,0.172087,0.004731,0.003493,0.005828,0.333676,0.010948,1119,,
3,3,Arkansas,2020-04-12,0.062646,0.001460,0.000820,0.006080,0.216632,0.014871,625,,
4,4,California,2020-04-12,1.000000,0.032572,0.019438,0.008884,0.288501,0.008955,10816,,
...,...,...,...,...,...,...,...,...,...,...,...,...
7171,7171,Virginia,2020-08-27,0.204368,0.168171,0.076748,0.430374,0.222793,0.376656,17503,"[[30, 21, 18, 38], [82, 73, 70, 90, 92], [134,...","[[222, 7221, 1914, 5495], [502, 5881, 4748, 33..."
7172,7172,Washington,2020-08-27,0.180722,0.104740,0.057401,0.297907,0.266940,0.389691,14511,"[[30, 21, 18, 40], [82, 73, 70, 92, 72], [134,...","[[222, 7221, 1914, 1059], [502, 5881, 4748, 46..."
7173,7173,West Virginia,2020-08-27,0.031166,0.013539,0.006013,0.163409,0.211499,0.492857,5681,"[[40, 8, 31, 34], [92, 60, 83, 59, 86], [144, ...","[[1059, 547, 1921, 218], [467, 391, 853, 767, ..."
7174,7174,Wisconsin,2020-08-27,0.134683,0.105369,0.033742,0.395060,0.156057,0.449505,28825,"[[30, 21, 18, 40], [82, 73, 92, 70, 60], [125,...","[[222, 7221, 1914, 1059], [502, 5881, 467, 474..."


In [32]:
def exponential_decay_weights(t, decay_rate):
    """Return normalised exponential-decay weights for the time points `t`.

    Each weight is proportional to ``exp(-decay_rate * x)`` for ``x`` in `t`,
    and the weights are scaled so that they sum to 1.

    Parameters
    ----------
    t : iterable of numbers
        Time points (any iterable, e.g. a list or ``range``).
    decay_rate : float
        Multiplier applied to each time point inside the exponential.

    Returns
    -------
    list
        One weight per element of `t`, in the same order; an empty list when
        `t` is empty.
    """
    exponents = -decay_rate * np.asarray(list(t), dtype=float)
    if exponents.size == 0:
        return []
    # Shift by the largest exponent before exponentiating. The shift cancels
    # in the normalisation, but it keeps at least one term equal to 1, so the
    # sum can neither underflow to 0 nor overflow to inf (which previously
    # produced NaN weights for large |decay_rate * x|).
    scaled = np.exp(exponents - exponents.max())
    return list(scaled / scaled.sum())

In [None]:
# TODO(review): the original call passed no arguments at all, so the intended
# decay rate is unknown; 0.1 is only a placeholder to make the cell runnable.
DECAY_RATE = 0.1

values = pred_temp.loc[pred_temp.Value.notna(), "Value"]
for x in values:
    # read_csv loads "Value" as text such as "[[222, 7221], [502, ...]]";
    # parse it so len() counts the outer list's entries, not characters.
    steps = ast.literal_eval(x) if isinstance(x, str) else x
    n = len(steps)
    # One weight per outer entry, using its position 0..n-1 as the time point.
    # TODO(review): confirm whether position 0 should get the largest weight.
    weights = exponential_decay_weights(range(n), DECAY_RATE)
    
    

In [None]:
pred =