In [1]:


import numpy as np
import pandas as pd
import math
import statistics
import random
import importlib

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

from utils import math_expressions as mexpr, methods

In [2]:
# Read the pre-generated train and test tables from CSV into DataFrames.
train_csv = '../../data/gen_train_v3.csv'
test_csv = '../../data/gen_test_v3.csv'

train_df = pd.read_csv(train_csv)
test_df = pd.read_csv(test_csv)

In [3]:
# Show the first five training rows as the cell's rich output.
train_df.head(n=5)

Unnamed: 0,alpha,beta,h,c,N,n,mean_n,std_n,alpha_hat,beta_hat,intervals_str,u,u_star,u_star_hat,z,optimal_cost,actual_cost
0,2,3.0,0.066667,25,17,5,8.233253,2.069397,15.829059,0.520135,8.420891851905406_11.662053997433222_6.8728631...,91.612225,67.285668,120.014121,0.560648,1.62177,25.0
1,6,1.0,0.066667,25,11,5,7.474739,1.073942,48.442932,0.1543,7.60249029088337_9.23720132135181_6.7619471323...,67.68406,47.397612,72.714685,0.65183,1.35243,25.0
2,2,1.5,0.066667,25,21,5,4.116626,1.034699,15.829059,0.260068,4.210445925952703_5.831026998716611_3.43643159...,58.965697,41.634662,74.3326,0.560113,1.155402,25.0
3,2,2.0,0.066667,25,15,5,5.488835,1.379598,15.829059,0.346757,5.613927901270271_7.774702664955481_4.58190879...,55.366589,36.653414,68.969431,0.531444,1.247545,25.0
4,2,3.5,0.066667,25,20,5,9.605462,2.414297,15.829059,0.606825,9.824373827222974_13.605729663672092_8.0183403...,131.695899,97.847747,167.788954,0.58316,2.256543,25.0


## Average Model (baseline using mean_n × N)

In [4]:
# Baseline: call methods.cal_cost once per test row with the row's c, h and u,
# passing mean_n * N as the fourth argument.
# The result overwrites (or creates) the 'actual_cost' column on test_df.
# NOTE(review): cal_cost lives in utils.methods; its exact semantics are not visible here.
test_df['actual_cost'] = test_df.apply(
    lambda row: methods.cal_cost(row['c'], row['h'], row['u'], row['mean_n'] * row['N']),
    axis=1,
)

# Outer double quotes keep the nested ['...'] lookups valid on Python < 3.12, where
# reusing the enclosing quote character inside an f-string field is a SyntaxError.
print(f"Actual Mean cost: {test_df['actual_cost'].mean():.2f}, Actual Median cost: {test_df['actual_cost'].median():.2f}")
print(f"Optimal Mean cost: {test_df['optimal_cost'].mean():.2f}, Optimal Median cost: {test_df['optimal_cost'].median():.2f}")

Actual Mean cost: 14.24, Actual Median cost: 14.58
Optimal Mean cost: 4.36, Optimal Median cost: 3.17


## Estimated u* from alpha_hat and beta_hat

In [5]:
# Call methods.cal_cost once per test row with the row's c, h and u, passing the
# precomputed u_star_hat column as the fourth argument.
# The result overwrites the 'actual_cost' column on test_df.
# NOTE(review): cal_cost lives in utils.methods; its exact semantics are not visible here.
test_df['actual_cost'] = test_df.apply(
    lambda row: methods.cal_cost(row['c'], row['h'], row['u'], row['u_star_hat']),
    axis=1,
)

# Outer double quotes keep the nested ['...'] lookups valid on Python < 3.12, where
# reusing the enclosing quote character inside an f-string field is a SyntaxError.
print(f"Actual Mean cost: {test_df['actual_cost'].mean():.2f}, Actual Median cost: {test_df['actual_cost'].median():.2f}")
print(f"Optimal Mean cost: {test_df['optimal_cost'].mean():.2f}, Optimal Median cost: {test_df['optimal_cost'].median():.2f}")

Actual Mean cost: 9.64, Actual Median cost: 5.55
Optimal Mean cost: 4.36, Optimal Median cost: 3.17


## Train and Test for *u_star*

In [6]:
# Columns used as model inputs; the same list is applied to both splits so the
# train and test design matrices always have identical columns in identical order.
feature_cols = ['N', 'n', 'h', 'c', 'mean_n', 'std_n', 'alpha_hat', 'beta_hat', 'u_star_hat']

# Regression target for this section is the 'u_star' column.
X_train, y_train = train_df[feature_cols], train_df['u_star']
X_test, y_test = test_df[feature_cols], test_df['u_star']


## Linear Regression for u_star

In [7]:
# Fit a linear regression on the current X_train / y_train.
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

# For scikit-learn regressors .score() returns the coefficient of determination (R^2),
# so the "accuracy" values printed here are R^2 rendered as percentages.
print(f"linear_model train accuracy: {linear_model.score(X_train, y_train):.2%}")
print(f"linear_model test accuracy: {linear_model.score(X_test, y_test):.2%}")

# One row per input column with its fitted coefficient, rounded to 4 decimals.
coef_df = pd.DataFrame({'Feature': X_train.columns, 'Coefficient': np.round(linear_model.coef_, 4)})
display(coef_df)

# Store the test-set predictions, then call methods.cal_cost per row with the prediction
# as the fourth argument. Both columns on test_df are overwritten on every run.
test_df['predicted_u_star'] = linear_model.predict(X_test)
test_df['actual_cost'] = test_df.apply(
    lambda row: methods.cal_cost(row['c'], row['h'], row['u'], row['predicted_u_star']),
    axis=1,
)

# Outer double quotes keep the nested ['...'] lookups valid on Python < 3.12, where
# reusing the enclosing quote character inside an f-string field is a SyntaxError.
print(f"Actual Mean cost: {test_df['actual_cost'].mean():.2f}, Actual Median cost: {test_df['actual_cost'].median():.2f}")
print(f"Optimal Mean cost: {test_df['optimal_cost'].mean():.2f}, Optimal Median cost: {test_df['optimal_cost'].median():.2f}")

linear_model train accuracy: 93.21%
linear_model test accuracy: 89.46%


Unnamed: 0,Feature,Coefficient
0,N,1.0177
1,n,0.0
2,h,0.0
3,c,0.0
4,mean_n,-0.4723
5,std_n,3.6572
6,alpha_hat,-0.1958
7,beta_hat,-5.5416
8,u_star_hat,0.912


Actual Mean cost: 9.74, Actual Median cost: 5.78
Optimal Mean cost: 4.36, Optimal Median cost: 3.17


## Random Forest for u_star

In [8]:
# Fit a random forest on the current X_train / y_train; random_state is fixed so
# re-running the cell rebuilds the same forest.
random_forest_model = RandomForestRegressor(
    random_state=50,
    max_features='sqrt',
    n_estimators=200,
    min_samples_leaf=2,
)
random_forest_model.fit(X_train, y_train)

# For scikit-learn regressors .score() returns the coefficient of determination (R^2),
# so the "accuracy" values printed here are R^2 rendered as percentages.
print(f"random_forest_model train accuracy: {random_forest_model.score(X_train, y_train):.2%}")
print(f"random_forest_model test accuracy: {random_forest_model.score(X_test, y_test):.2%}")

# One row per input column with its feature importance, rounded to 4 decimals.
coef_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': np.round(random_forest_model.feature_importances_, 4)})
display(coef_df)

# Store the test-set predictions, then call methods.cal_cost per row with the prediction
# as the fourth argument. Both columns on test_df are overwritten on every run.
test_df['predicted_u_star'] = random_forest_model.predict(X_test)
test_df['actual_cost'] = test_df.apply(
    lambda row: methods.cal_cost(row['c'], row['h'], row['u'], row['predicted_u_star']),
    axis=1,
)

# Outer double quotes keep the nested ['...'] lookups valid on Python < 3.12, where
# reusing the enclosing quote character inside an f-string field is a SyntaxError.
print(f"Actual Mean cost: {test_df['actual_cost'].mean():.2f}, Actual Median cost: {test_df['actual_cost'].median():.2f}")
print(f"Optimal Mean cost: {test_df['optimal_cost'].mean():.2f}, Optimal Median cost: {test_df['optimal_cost'].median():.2f}")

random_forest_model train accuracy: 98.05%
random_forest_model test accuracy: 88.03%


Unnamed: 0,Feature,Importance
0,N,0.0879
1,n,0.0
2,h,0.0
3,c,0.0
4,mean_n,0.2383
5,std_n,0.1073
6,alpha_hat,0.0311
7,beta_hat,0.0251
8,u_star_hat,0.5103


Actual Mean cost: 9.98, Actual Median cost: 5.91
Optimal Mean cost: 4.36, Optimal Median cost: 3.17


## Gradient Boost for u_star

In [9]:
# Fit a gradient-boosting regressor on the current X_train / y_train; random_state is
# fixed so re-running the cell rebuilds the same ensemble.
gradient_boost_model = GradientBoostingRegressor(
    random_state=50,
    min_samples_split=6,
    min_samples_leaf=2,
    max_depth=5,
)
gradient_boost_model.fit(X_train, y_train)

# For scikit-learn regressors .score() returns the coefficient of determination (R^2),
# so the "accuracy" values printed here are R^2 rendered as percentages.
print(f"gradient_boost_model train accuracy: {gradient_boost_model.score(X_train, y_train):.2%}")
print(f"gradient_boost_model test accuracy: {gradient_boost_model.score(X_test, y_test):.2%}")

# One row per input column with its feature importance, rounded to 4 decimals.
coef_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': np.round(gradient_boost_model.feature_importances_, 4)})
display(coef_df)

# Store the test-set predictions, then call methods.cal_cost per row with the prediction
# as the fourth argument. Both columns on test_df are overwritten on every run.
test_df['predicted_u_star'] = gradient_boost_model.predict(X_test)
test_df['actual_cost'] = test_df.apply(
    lambda row: methods.cal_cost(row['c'], row['h'], row['u'], row['predicted_u_star']),
    axis=1,
)

# Outer double quotes keep the nested ['...'] lookups valid on Python < 3.12, where
# reusing the enclosing quote character inside an f-string field is a SyntaxError.
print(f"Actual Mean cost: {test_df['actual_cost'].mean():.2f}, Actual Median cost: {test_df['actual_cost'].median():.2f}")
print(f"Optimal Mean cost: {test_df['optimal_cost'].mean():.2f}, Optimal Median cost: {test_df['optimal_cost'].median():.2f}")

gradient_boost_model train accuracy: 94.53%
gradient_boost_model test accuracy: 88.78%


Unnamed: 0,Feature,Importance
0,N,0.0049
1,n,0.0
2,h,0.0
3,c,0.0
4,mean_n,0.0021
5,std_n,0.0011
6,alpha_hat,0.0027
7,beta_hat,0.0009
8,u_star_hat,0.9883


Actual Mean cost: 9.73, Actual Median cost: 5.70
Optimal Mean cost: 4.36, Optimal Median cost: 3.17


## Train and Test for *z*

In [10]:
# Columns used as model inputs; the same list is applied to both splits so the
# train and test design matrices always have identical columns in identical order.
feature_cols = ['N', 'n', 'h', 'c', 'mean_n', 'std_n', 'alpha_hat', 'beta_hat', 'u_star_hat']

# Regression target for this section is the 'z' column. This rebinds the
# X_train / y_train / X_test / y_test names used by the model cells.
X_train, y_train = train_df[feature_cols], train_df['z']
X_test, y_test = test_df[feature_cols], test_df['z']


## Linear Regression for z

In [11]:
# Fit a linear regression on the current X_train / y_train.
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

# For scikit-learn regressors .score() returns the coefficient of determination (R^2),
# so the "accuracy" values printed here are R^2 rendered as percentages.
print(f"linear_model train accuracy: {linear_model.score(X_train, y_train):.2%}")
print(f"linear_model test accuracy: {linear_model.score(X_test, y_test):.2%}")

# One row per input column with its fitted coefficient, rounded to 4 decimals.
coef_df = pd.DataFrame({'Feature': X_train.columns, 'Coefficient': np.round(linear_model.coef_, 4)})
display(coef_df)

# The model output is multiplied by u_star_hat to obtain the stored u_star prediction.
# NOTE(review): the displayed sample rows are consistent with z = u_star / u_star_hat;
# confirm against the data generator.
test_df['predicted_u_star'] = linear_model.predict(X_test) * test_df['u_star_hat']
# Call methods.cal_cost per row with the prediction as the fourth argument.
# Both columns on test_df are overwritten on every run.
test_df['actual_cost'] = test_df.apply(
    lambda row: methods.cal_cost(row['c'], row['h'], row['u'], row['predicted_u_star']),
    axis=1,
)

# Outer double quotes keep the nested ['...'] lookups valid on Python < 3.12, where
# reusing the enclosing quote character inside an f-string field is a SyntaxError.
print(f"Actual Mean cost: {test_df['actual_cost'].mean():.2f}, Actual Median cost: {test_df['actual_cost'].median():.2f}")
print(f"Optimal Mean cost: {test_df['optimal_cost'].mean():.2f}, Optimal Median cost: {test_df['optimal_cost'].median():.2f}")

linear_model train accuracy: 9.96%
linear_model test accuracy: 10.64%


Unnamed: 0,Feature,Coefficient
0,N,-0.0006
1,n,-0.0
2,h,-0.0
3,c,-0.0
4,mean_n,0.0166
5,std_n,-0.0874
6,alpha_hat,-0.0165
7,beta_hat,0.0543
8,u_star_hat,0.0


Actual Mean cost: 10.04, Actual Median cost: 5.84
Optimal Mean cost: 4.36, Optimal Median cost: 3.17


## Random Forest for z

In [12]:
# Fit a random forest on the current X_train / y_train; random_state is fixed so
# re-running the cell rebuilds the same forest.
random_forest_model = RandomForestRegressor(
    random_state=50,
    max_features='sqrt',
    n_estimators=200,
    min_samples_leaf=2,
)
random_forest_model.fit(X_train, y_train)

# For scikit-learn regressors .score() returns the coefficient of determination (R^2),
# so the "accuracy" values printed here are R^2 rendered as percentages.
print(f"random_forest_model train accuracy: {random_forest_model.score(X_train, y_train):.2%}")
print(f"random_forest_model test accuracy: {random_forest_model.score(X_test, y_test):.2%}")

# One row per input column with its feature importance, rounded to 4 decimals.
coef_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': np.round(random_forest_model.feature_importances_, 4)})
display(coef_df)

# The model output is multiplied by u_star_hat to obtain the stored u_star prediction.
# NOTE(review): the displayed sample rows are consistent with z = u_star / u_star_hat;
# confirm against the data generator.
test_df['predicted_u_star'] = random_forest_model.predict(X_test) * test_df['u_star_hat']
# Call methods.cal_cost per row with the prediction as the fourth argument.
# Both columns on test_df are overwritten on every run.
test_df['actual_cost'] = test_df.apply(
    lambda row: methods.cal_cost(row['c'], row['h'], row['u'], row['predicted_u_star']),
    axis=1,
)

# Outer double quotes keep the nested ['...'] lookups valid on Python < 3.12, where
# reusing the enclosing quote character inside an f-string field is a SyntaxError.
print(f"Actual Mean cost: {test_df['actual_cost'].mean():.2f}, Actual Median cost: {test_df['actual_cost'].median():.2f}")
print(f"Optimal Mean cost: {test_df['optimal_cost'].mean():.2f}, Optimal Median cost: {test_df['optimal_cost'].median():.2f}")

random_forest_model train accuracy: 77.89%
random_forest_model test accuracy: 3.18%


Unnamed: 0,Feature,Importance
0,N,0.0732
1,n,0.0
2,h,0.0
3,c,0.0
4,mean_n,0.2643
5,std_n,0.1518
6,alpha_hat,0.1872
7,beta_hat,0.1409
8,u_star_hat,0.1826


Actual Mean cost: 10.03, Actual Median cost: 5.95
Optimal Mean cost: 4.36, Optimal Median cost: 3.17


## Gradient Boost for z

In [13]:
# Fit a gradient-boosting regressor on the current X_train / y_train; random_state is
# fixed so re-running the cell rebuilds the same ensemble.
gradient_boost_model = GradientBoostingRegressor(
    random_state=50,
    min_samples_split=6,
    min_samples_leaf=2,
    max_depth=5,
)
gradient_boost_model.fit(X_train, y_train)

# For scikit-learn regressors .score() returns the coefficient of determination (R^2),
# so the "accuracy" values printed here are R^2 rendered as percentages.
print(f"gradient_boost_model train accuracy: {gradient_boost_model.score(X_train, y_train):.2%}")
print(f"gradient_boost_model test accuracy: {gradient_boost_model.score(X_test, y_test):.2%}")

# One row per input column with its feature importance, rounded to 4 decimals.
coef_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': np.round(gradient_boost_model.feature_importances_, 4)})
display(coef_df)

# The model output is multiplied by u_star_hat to obtain the stored u_star prediction.
# NOTE(review): the displayed sample rows are consistent with z = u_star / u_star_hat;
# confirm against the data generator.
test_df['predicted_u_star'] = gradient_boost_model.predict(X_test) * test_df['u_star_hat']
# Call methods.cal_cost per row with the prediction as the fourth argument.
# Both columns on test_df are overwritten on every run.
test_df['actual_cost'] = test_df.apply(
    lambda row: methods.cal_cost(row['c'], row['h'], row['u'], row['predicted_u_star']),
    axis=1,
)

# Outer double quotes keep the nested ['...'] lookups valid on Python < 3.12, where
# reusing the enclosing quote character inside an f-string field is a SyntaxError.
print(f"Actual Mean cost: {test_df['actual_cost'].mean():.2f}, Actual Median cost: {test_df['actual_cost'].median():.2f}")
print(f"Optimal Mean cost: {test_df['optimal_cost'].mean():.2f}, Optimal Median cost: {test_df['optimal_cost'].median():.2f}")

gradient_boost_model train accuracy: 35.20%
gradient_boost_model test accuracy: 10.62%


Unnamed: 0,Feature,Importance
0,N,0.0208
1,n,0.0
2,h,0.0
3,c,0.0
4,mean_n,0.5917
5,std_n,0.0325
6,alpha_hat,0.2639
7,beta_hat,0.0345
8,u_star_hat,0.0567


Actual Mean cost: 9.70, Actual Median cost: 5.68
Optimal Mean cost: 4.36, Optimal Median cost: 3.17
