# Train with multiple models then compare the results. Use the best scored model for each county.

* Tensorflow LSTM
* Tensorflow Dense
* Linear Regression
* XGBRegression
* Same value

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
import xgboost as xgb
from xgboost import XGBRegressor

import multiprocessing as mp
from functools import partial
from helper_functions import *

## Load data

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv').sort_values(by=['cfips', 'first_day_of_month']).reset_index().drop('index', axis=1)
results = pd.read_csv('lstm_sv_result_with_category.csv')
d = pd.read_csv('add_XGBR_results.csv')

In [3]:
test

Unnamed: 0,row_id,cfips,first_day_of_month
0,1001_2022-11-01,1001,2022-11-01
1,1001_2022-12-01,1001,2022-12-01
2,1001_2023-01-01,1001,2023-01-01
3,1001_2023-02-01,1001,2023-02-01
4,1001_2023-03-01,1001,2023-03-01
...,...,...,...
25075,56045_2023-02-01,56045,2023-02-01
25076,56045_2023-03-01,56045,2023-03-01
25077,56045_2023-04-01,56045,2023-04-01
25078,56045_2023-05-01,56045,2023-05-01


In [4]:
results

Unnamed: 0,Country,LSTM_W9_H8,SV_MAPE,Active,category
0,1001,0.732748,1.835232,1417,lstm
1,1003,0.747286,5.230218,13401,lstm
2,1005,1.273616,1.101416,239,SV
3,1007,9.315642,3.574082,220,SV
4,1009,0.766072,2.116541,789,lstm
...,...,...,...,...,...
3130,56037,2.913013,2.450784,927,SV
3131,56039,0.354971,1.586990,4875,lstm
3132,56041,4.173564,1.381605,574,SV
3133,56043,0.819441,3.608545,183,lstm


In [5]:
result_df = pd.merge(results, d, how='inner', on='Country')
result_df = result_df[['Country', 'LSTM_W9_H8', 'SV_MAPE_x', 'LR_MAPE', 'XGBR_MAPE']]
result_df

Unnamed: 0,Country,LSTM_W9_H8,SV_MAPE_x,LR_MAPE,XGBR_MAPE
0,1001,0.732748,1.835232,0.992271,2.257356
1,1003,0.747286,5.230218,4.317589,93.943110
2,1005,1.273616,1.101416,1.833943,0.912357
3,1007,9.315642,3.574082,4.259990,4.514549
4,1009,0.766072,2.116541,1.285277,2.398140
...,...,...,...,...,...
3130,56037,2.913013,2.450784,3.550418,3.110915
3131,56039,0.354971,1.586990,2.390681,98.065660
3132,56041,4.173564,1.381605,3.925458,87.435030
3133,56043,0.819441,3.608545,2.219479,3.603001


## Dense with multiple variable

In [6]:
tf_dense_results = d[['Country', 'tf_dense_MAPE']]
tf_dense_results.head()

Unnamed: 0,Country,tf_dense_MAPE
0,1001,1.504614
1,1003,4.143341
2,1005,1.395184
3,1007,5.228748
4,1009,1.442244


In [12]:
%%time
if __name__ == '__main__':
    HORIZON = 8
    WINDOW_SIZE = 9

    train = pd.read_csv('train.csv')
    IDS = train.cfips.unique()
    #random_ids = IDS[:10]
    
    window_size = WINDOW_SIZE
    horizon = HORIZON
    epoch = 500
    split_size = 6
    
    cfips_list = []
    mapes = []
    density_values = []
    active_size = []
    forecasts = []

    pool = mp.Pool(16)

    func = partial(train_get_results_multi_variables_dense, train, window_size, horizon, epoch, split_size)
    results = pool.map(func, IDS)
    pool.close()
    pool.join()
    
    # Create DataFrame
    for i in range(len(results)):
        c = results[i][0]
        cfips_list.append(c)
        last_density = results[i][1]
        density_values.append(last_density)
        last_active = results[i][2]
        active_size.append(last_active)
        mape = results[i][3]
        mapes.append(mape)
        preds = results[i][4]
        x = np.array(preds).reshape((-1))
        preds = x.astype(np.float64)
        forecasts.append(preds)
    
    dense_result_df = pd.DataFrame({'Country': cfips_list,
                                   'MAPE': mapes,
                                   "Density": density_values,
                                   'Active': active_size,
                                   'forecast': forecasts})

CPU times: total: 6.08 s
Wall time: 17min 41s


In [13]:
dense_result_df.to_csv("dense_W9_H8_results.csv", index=False)

### Compare with old tf_dense_result and new one with W9_H8 multivariable set
### Add Dense result to result_df.csv

so I will have the results on tf_LSTM, tf_Dense, LR, XGBR, and SV. I will categorize the model then create the function to use the model on each County. 

In [19]:
tf_dense_results['dense_W9_H8_MV'] = dense_result_df['MAPE']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tf_dense_results['dense_W9_H8_MV'] = dense_result_df['MAPE']


In [21]:
all_results = pd.merge(result_df, tf_dense_results, how='inner', on='Country')
all_results

Unnamed: 0,Country,LSTM_W9_H8,SV_MAPE_x,LR_MAPE,XGBR_MAPE,tf_dense_MAPE,dense_W9_H8_MV
0,1001,0.732748,1.835232,0.992271,2.257356,1.504614,0.724775
1,1003,0.747286,5.230218,4.317589,93.943110,4.143341,2.480568
2,1005,1.273616,1.101416,1.833943,0.912357,1.395184,4.606141
3,1007,9.315642,3.574082,4.259990,4.514549,5.228748,24.276476
4,1009,0.766072,2.116541,1.285277,2.398140,1.442244,1.445454
...,...,...,...,...,...,...,...
3130,56037,2.913013,2.450784,3.550418,3.110915,8.937141,9.501690
3131,56039,0.354971,1.586990,2.390681,98.065660,3.437833,3.563838
3132,56041,4.173564,1.381605,3.925458,87.435030,4.339747,9.976927
3133,56043,0.819441,3.608545,2.219479,3.603001,2.373714,6.841048


In [22]:
all_results.describe()

Unnamed: 0,Country,LSTM_W9_H8,SV_MAPE_x,LR_MAPE,XGBR_MAPE,tf_dense_MAPE,dense_W9_H8_MV
count,3135.0,3135.0,3135.0,3135.0,3135.0,3135.0,3135.0
mean,30376.03764,57.733185,3.909992,8927.551,116.271379,27.996707,15092.24
std,15145.862593,3113.792218,21.621562,499585.8,4497.496999,1269.835687,844604.5
min,1001.0,0.164285,0.0,0.2424637,0.020653,0.175896,0.221154
25%,18178.0,0.854867,1.394263,1.664201,2.300055,1.800255,2.73861
50%,29173.0,1.365231,2.353172,2.74895,5.577039,3.101383,4.995527
75%,45076.0,2.304901,3.993313,4.815689,89.062245,5.295601,8.907001
max,56045.0,174346.52,1183.2648,27972350.0,251844.53,71091.58,47289580.0


### Label categories

In [23]:
best_score = all_results.min(axis='columns')

cat = []
for i, m in enumerate(best_score):
    if m == all_results['LSTM_W9_H8'].iloc[i]:
        category = 'LSTM'
    elif m == all_results['SV_MAPE_x'].iloc[i]:
        category = 'SV'
    elif m == all_results['LR_MAPE'].iloc[i]:
        category = 'LR'
    elif m == all_results['XGBR_MAPE'].iloc[i]:
        category = 'XGBR'
    elif m == all_results['tf_dense_MAPE'].iloc[i]:
        category = 'Dense'
    elif m == all_results['dense_W9_H8_MV'].iloc[i]:
        category = 'Dense_MV'
    cat.append(category)
all_results['category'] = cat

In [25]:
all_results.to_csv('all_results_with_category.csv', index=False)

In [26]:
all_results['category'].value_counts()

LSTM        1878
SV           470
LR           293
Dense        242
XGBR         134
Dense_MV     118
Name: category, dtype: int64