In [1]:

import numpy as np
import pandas as pd
import math
import statistics
import random
import importlib

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor

from utils import math_expressions as mexpr, methods

In [2]:
# Load the training and test CSVs into DataFrames.
# Both paths are relative, so they resolve against the kernel's working directory.
train_path, test_path = '../../data/gen_train.csv', '../../data/gen_test.csv'
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

In [3]:
# Preview the first five training rows to check the loaded columns.
train_df.head(n=5)

Unnamed: 0,alpha,beta,h,c,N,n,mean_n,std_n,alpha_hat,beta_hat,intervals_str,u,u_star,u_star_hat,z,optimal_cost,actual_cost
0,7,1.5,0.4,25,13,5,10.657763,3.154803,11.41268,0.933853,9.787753058341407_13.769005652634771_10.544099...,134.208964,119.134863,123.157898,0.967334,6.02964,4.420426
1,2,2.0,0.4,20,35,5,4.607003,4.979874,0.855855,5.382922,6.589942073822911_12.505012974866775_0.9746126...,153.974095,124.175503,147.740416,0.840498,11.919437,2.493471
2,7,2.0,0.1,25,20,5,13.827765,8.330743,2.755095,5.01898,28.421817353800066_8.08111914565828_9.01414832...,287.20138,239.382016,221.378437,1.081325,4.781936,6.582294
3,3,1.0,0.05,25,43,5,2.437155,1.831173,1.771363,1.375864,3.409437374082151_1.015279224032903_1.37928203...,158.165646,102.748549,77.607158,1.323957,2.770855,4.027924
4,2,2.0,0.4,20,34,5,4.09058,2.232379,3.357639,1.218291,4.475167957800123_7.740102799607751_3.43948284...,142.94377,120.199163,124.4973,0.965476,9.097843,7.378588


In [4]:
# Regression inputs and target, named once so that the train and test
# matrices are always built from the same columns in the same order.
feature_columns = ['N', 'n', 'h', 'c', 'mean_n', 'std_n', 'alpha_hat', 'beta_hat', 'u_star_hat']
target_column = 'z'

X_train, y_train = train_df[feature_columns], train_df[target_column]
X_test, y_test = test_df[feature_columns], test_df[target_column]


In [5]:
# Fit an ordinary least-squares model of z on the selected features.
model = LinearRegression().fit(X_train, y_train)

# score() is evaluated on the training data here, so this is the in-sample R^2,
# not a held-out estimate.
train_r2 = model.score(X_train, y_train)
display(train_r2)

# One fitted coefficient per feature column of X_train.
display(model.coef_)

0.12325839378876147

array([ 4.74898054e-04,  9.05376577e-17,  4.28868833e-02, -2.49584475e-04,
       -1.98516810e-02, -2.30465763e-02, -1.66147784e-03,  4.03766403e-03,
       -5.95451385e-05])

In [6]:
# Predict z for the test rows, turn it into a predicted u_star, and evaluate
# the cost of using that prediction.
test_output = pd.DataFrame(model.predict(X_test), index=X_test.index, columns=['predicted_z'])

# This cell rebinds test_df, so on a re-run test_df already carries the
# prediction columns. Drop them first: otherwise merge() would emit
# predicted_z_x / predicted_z_y and the lookup of 'predicted_z' below would
# raise KeyError. On the first run the drop is a no-op (errors='ignore').
test_df = test_output.merge(
    test_df.drop(columns=['predicted_z', 'predicted_u_star'], errors='ignore'),
    left_index=True,
    right_index=True,
)

# Scale the estimate u_star_hat by the predicted ratio z.
test_df['predicted_u_star'] = test_df['predicted_z'] * test_df['u_star_hat']

# NOTE: this overwrites the actual_cost column that was loaded from the CSV
# with the cost computed from the model's prediction.
# NOTE(review): methods.cal_cost is project-local; its exact cost formula is
# not visible here. It is called row by row with (c, h, u, predicted_u_star).
test_df['actual_cost'] = test_df.apply(
    lambda row: methods.cal_cost(row['c'], row['h'], row['u'], row['predicted_u_star']),
    axis=1,
)
test_df.head()

Unnamed: 0,predicted_z,alpha,beta,h,c,N,n,mean_n,std_n,alpha_hat,beta_hat,intervals_str,u,u_star,u_star_hat,z,optimal_cost,actual_cost,predicted_u_star
0,1.215884,2,1.0,0.4,25,31,5,1.608631,1.188836,1.830919,0.878592,0.003058540635843962_2.043043768618266_0.87786...,76.385318,49.639974,38.901501,1.276043,10.698138,11.634241,47.299716
1,1.124154,2,1.5,0.4,30,14,5,4.009268,2.903299,1.906982,2.102416,6.38314683676832_6.791394935951803_0.294959015...,35.315328,28.89717,39.815962,0.725768,2.567263,30.0,44.759282
2,1.039824,5,1.0,0.1,20,12,5,6.821328,3.360037,4.121449,1.65508,2.3952951439570325_10.96045323453524_4.9239916...,62.303999,43.872717,59.535043,0.736923,1.843128,0.039802,61.905979
3,1.213049,2,1.0,0.05,25,46,5,1.505001,0.868517,3.002733,0.50121,0.9910707518986788_0.8247997136843997_2.295715...,76.928575,69.443922,54.293023,1.279058,0.374233,0.553423,65.860121
4,1.15427,2,2.0,0.4,15,36,5,3.557228,2.238024,2.526352,1.408049,7.084869757006029_2.2238474479686117_1.1614816...,152.382992,132.364109,116.206029,1.139047,8.007553,7.299921,134.133189


In [7]:
# Plot the cost achieved with the model's prediction and the optimal cost,
# first separately and then together on one figure.
actual_cost = test_df['actual_cost']
optimal_cost = test_df['optimal_cost']

methods.plot_plotly(data=actual_cost, data_label='Actual Cost')
methods.plot_plotly(data=optimal_cost, data_label='Optimal Cost')
methods.multi_plot_plotly(
    data=[optimal_cost, actual_cost],
    data_label=['Optimal Cost', 'Actual Cost'],
)
