In [1]:
import sys
sys.path.append('./geolytics/')
from geolytics_analysis.data_simulation import DataSimulation
from geolytics_analysis import paper_models

# Data simulation

#### This portion of the code generates data as described in the article. Choosing a high number of sections (`p`) and a high number of days (`n_days`) may cause performance issues. The default parameters `p=500` and `n_days=100` take several hours to finish.

In [2]:
# Simulate p sections over n_days days (smaller than the defaults to keep
# the run time manageable).
data_sim = DataSimulation(p=200, n_days=100, t_switch=8)
full_days_data = data_sim.generate_data()

# Unpack the four objects returned by split_center_data(): the train/test
# frames and their matching intercepts, consumed by every model cell below.
(
    sim_train_df,
    sim_train_intercept,
    sim_test_df,
    sim_test_intercept,
) = data_sim.split_center_data()

# OLS

In [3]:
# Build the OLS model from the simulated train/test frames and intercepts.
Ols = paper_models.Ols(
    sim_train_df,
    sim_test_df,
    train_intercept=sim_train_intercept,
    test_intercept=sim_test_intercept,
)
Ols.train()
# Kept as the cell's last expression so the returned (test MSE, test MAE)
# pair is displayed beneath the printed metrics.
Ols.predict()

Validation MSE: 12.028891306332804
Validation MAE: 2.4842002991591654
Test MSE: 12.49015823546652
Test MAE: 2.5056124230661503


(12.49015823546652, 2.5056124230661503)

# Lasso

In [4]:
# Build the Lasso model from the simulated train/test frames and intercepts.
Lasso = paper_models.Lasso(
    sim_train_df,
    sim_test_df,
    train_intercept=sim_train_intercept,
    test_intercept=sim_test_intercept,
)
Lasso.train()
# Kept as the cell's last expression so the returned (test MSE, test MAE)
# pair is displayed beneath the printed metrics.
Lasso.predict()

Validation MSE: 11.454699705588158
Validation MAE: 2.4206663358732046
Test MSE: 11.785718010919553
Test MAE: 2.441743844452976


(11.785718010919553, 2.441743844452976)

# Ridge

In [5]:
# Build the RidgeCV model from the simulated train/test frames and intercepts.
RidgeCV = paper_models.RidgeCV(
    sim_train_df,
    sim_test_df,
    train_intercept=sim_train_intercept,
    test_intercept=sim_test_intercept,
)
RidgeCV.train()
# Kept as the cell's last expression so any returned value is displayed.
RidgeCV.predict()

Validation MSE: 11.932471185749637
Validation MAE: 2.5032273584640587
Test MSE: 12.262323154662953
Test MAE: 2.5210037571309867


# ElasticNet

In [6]:
# Build the ElasticNet model from the simulated train/test frames and
# intercepts.
ElasticNet = paper_models.ElasticNet(
    sim_train_df,
    sim_test_df,
    train_intercept=sim_train_intercept,
    test_intercept=sim_test_intercept,
)
ElasticNet.train()
# Kept as the cell's last expression so any returned value is displayed.
ElasticNet.predict()

Validation MSE: 11.566268429501916
Validation MAE: 2.4405663029940357
Test MSE: 11.891430092833856
Test MAE: 2.457627728131975


# TSLasso

In [None]:
# Build the TSLasso model from the simulated train/test frames and intercepts.
TSLasso = paper_models.TSLasso(
    sim_train_df,
    sim_test_df,
    train_intercept=sim_train_intercept,
    test_intercept=sim_test_intercept,
)
# Training reports progress as "subset <k>" lines in the cell output.
TSLasso.train()
# Kept as the cell's last expression so any returned value is displayed.
TSLasso.predict()

subset 0
subset 1
subset 2
subset 3
subset 4
subset 5
subset 6
subset 7
subset 8
subset 9

# RSLasso


In [None]:
# Build the RSLasso model from the simulated train/test frames and intercepts.
RSLasso = paper_models.RSLasso(
    sim_train_df,
    sim_test_df,
    train_intercept=sim_train_intercept,
    test_intercept=sim_test_intercept,
)
RSLasso.train()
# Kept as the cell's last expression so any returned value is displayed.
RSLasso.predict()

# Visualisation

In [None]:
# Third-party plotting libraries.
import matplotlib.pyplot as plt
import seaborn as sns

# Project helpers used to rebuild the true values and compare the models.
from geolytics_analysis import model_comparator, models

# Apply seaborn's default theme to the matplotlib figures drawn below.
sns.set()

In [None]:
# Test-set predictions of each fitted model, keyed by the name used in the
# comparison. Insertion order is kept as-is.
models_predictions = {
    'Lasso': Lasso.test_prediction,
    'Ols': Ols.test_prediction,
    'TSLasso': TSLasso.test_prediction,
    'RSLasso': RSLasso.test_prediction,
    'ElasticNet': ElasticNet.test_prediction,
    'RidgeCV': RidgeCV.test_prediction,
}

input_lag = 5
output_lag = 1

# Run the test frame through DataModel to obtain the true values.
# NOTE(review): presumably valid_split=1 keeps every sample in the 'train'
# split so that the whole test frame is restored — confirm in models.DataModel.
# NOTE(review): the meaning of the positional argument 20 is not visible
# here — confirm against the DataModel signature.
true_model = models.DataModel(
    sim_test_df,
    input_lag,
    output_lag,
    20,
    valid_split=1,
)
true_model.preprocessData()
true_values = true_model.restorePredictionsAsDF(
    true_model.trainSplit()[1],
    'train',
)

# Hand the comparator a shallow copy of the predictions dict.
mc = model_comparator.ModelCompare(
    models_predictions.copy(),
    true_values=true_values,
)

In [None]:
# Comparison table rounded to two decimals, with rows ordered by index label.
res_table = mc.comparisonTable().round(2).sort_index()
res_table

In [None]:



plt.figure(figsize=(8, 4))

# Set the comparator's font size before asking it to draw.
mc.fontsize = 20
mc.futurError()

# Four-column legend whose lower-left corner sits just above the axes.
plt.legend(
    ncol=4,
    loc=(0.0, 1.01),
    fontsize=13,
)
plt.xticks(fontsize=15)
# Trailing semicolon hides the tick-object repr in the cell output.
plt.yticks(fontsize=15);


In [None]:
plt.figure(figsize=(8, 4))

mc.plotTimeError()

# Four-column legend whose lower-left corner sits just above the axes.
plt.legend(
    ncol=4,
    loc=(0.0, 1.01),
    fontsize=13,
)
plt.xticks(fontsize=15)
# Trailing semicolon hides the tick-object repr in the cell output.
plt.yticks(fontsize=15);