In [63]:


import numpy as np
import pandas as pd
import math
import statistics
import random
import importlib

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

from utils import math_expressions as mexpr, methods

In [64]:
# Read the pre-split training and test tables; the paths are relative to the
# notebook's working directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../../data/6gen_train.csv', '../../data/6gen_test.csv')
)

In [65]:
# Preview the first five training rows to check the columns loaded as expected.
train_df.head(n=5)

Unnamed: 0,alpha,beta,h,c,N,n,mean_n,std_n,alpha_hat,beta_hat,intervals_str,u,u_star,u_star_hat,z,optimal_cost,actual_cost
0,2,2.0,0.4,20,11,6,4.685768,4.770839,0.964655,4.857455,2.729330124585387_3.381000767316402_4.51257152...,27.0635,31.161318,34.342779,0.907362,20.0,20.0
1,3,1.0,0.05,30,23,6,2.551518,1.5767,2.618781,0.974315,2.5256728972390525_2.306696258193459_1.3680899...,68.500607,48.775107,40.154071,1.214699,0.986275,1.417327
2,3,1.5,0.1,25,29,6,5.108306,1.836846,7.734077,0.660493,6.009730255289886_2.2924859370871244_6.3770653...,114.963266,103.21193,127.202314,0.8114,1.175134,25.0
3,2,1.5,0.05,20,14,6,1.919946,1.434849,1.790464,1.072318,1.1036590233223538_1.202884444882533_1.8953784...,48.206955,24.236227,14.388398,1.684428,1.198536,1.690928
4,2,1.5,0.4,30,48,6,3.331694,1.289347,6.677138,0.49897,2.3731029980070906_3.981747473831444_1.6576519...,138.079893,124.700221,145.662469,0.85609,5.351869,30.0


## Average Model

In [66]:
# Baseline: pass mean_n * N to methods.cal_cost as its fourth argument
# (assumed to be the chosen decision value -- confirm in utils.methods).
# This assigns test_df['actual_cost'], replacing any existing column of that name.
test_df['actual_cost'] = test_df.apply(
    lambda row: methods.cal_cost(row['c'], row['h'], row['u'], row['mean_n'] * row['N']),
    axis=1,
)

# The f-strings use double quotes outside so the single-quoted column keys inside
# the replacement fields also parse on Python < 3.12 (same-quote nesting is a
# SyntaxError there). The printed text is unchanged.
print(f"Actual Mean cost: {test_df['actual_cost'].mean():.2f}, Actual Median cost: {test_df['actual_cost'].median():.2f}")
print(f"Optimal Mean cost: {test_df['optimal_cost'].mean():.2f}, Optimal Median cost: {test_df['optimal_cost'].median():.2f}")

Actual Mean cost: 14.60, Actual Median cost: 15.00
Optimal Mean cost: 5.31, Optimal Median cost: 2.29


## Estimated u* from alpha_hat and beta_hat

In [67]:
# Plug-in estimate: pass the u_star_hat column to methods.cal_cost as its fourth
# argument (assumed to be the chosen decision value -- confirm in utils.methods).
# This assigns test_df['actual_cost'], replacing the values written by any earlier cell.
test_df['actual_cost'] = test_df.apply(
    lambda row: methods.cal_cost(row['c'], row['h'], row['u'], row['u_star_hat']),
    axis=1,
)

# Double-quoted f-strings: reusing the enclosing quote inside a replacement field
# is a SyntaxError before Python 3.12. The printed text is unchanged.
print(f"Actual Mean cost: {test_df['actual_cost'].mean():.2f}, Actual Median cost: {test_df['actual_cost'].median():.2f}")
print(f"Optimal Mean cost: {test_df['optimal_cost'].mean():.2f}, Optimal Median cost: {test_df['optimal_cost'].median():.2f}")

Actual Mean cost: 10.12, Actual Median cost: 5.03
Optimal Mean cost: 5.31, Optimal Median cost: 2.29


## Train and Test for *u_star*

In [68]:
# Build the feature matrices for both splits from one column list, so the train
# and test selections cannot drift apart.
X_train, X_test = (
    frame[['N', 'n', 'h', 'c', 'mean_n', 'std_n', 'alpha_hat', 'beta_hat', 'u_star_hat']]
    for frame in (train_df, test_df)
)

# Regression target for this section: the u_star column itself.
y_train, y_test = train_df['u_star'], test_df['u_star']


## Linear Regression for u_star

In [69]:
# Fit a linear regression that predicts u_star directly from the feature matrix.
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

# NOTE: for scikit-learn regressors .score() returns R^2 (coefficient of
# determination); that is the quantity the "accuracy" labels below report.
print(f'linear_model train accuracy: {linear_model.score(X_train, y_train):.2%}')
print(f'linear_model test accuracy: {linear_model.score(X_test, y_test):.2%}')

# One fitted coefficient per input column, rounded to 4 decimals for display.
coef_df = pd.DataFrame({'Feature': X_train.columns, 'Coefficient': np.round(linear_model.coef_, 4)})
display(coef_df)

# Store the test-set predictions and pass them to methods.cal_cost as its fourth
# argument (assumed to be the chosen decision value -- confirm in utils.methods).
# Both assignments replace any existing test_df columns of the same name.
test_df['predicted_u_star'] = linear_model.predict(X_test)
test_df['actual_cost'] = test_df.apply(
    lambda row: methods.cal_cost(row['c'], row['h'], row['u'], row['predicted_u_star']),
    axis=1,
)

# Double-quoted f-strings: reusing the enclosing quote inside a replacement field
# is a SyntaxError before Python 3.12. The printed text is unchanged.
print(f"Actual Mean cost: {test_df['actual_cost'].mean():.2f}, Actual Median cost: {test_df['actual_cost'].median():.2f}")
print(f"Optimal Mean cost: {test_df['optimal_cost'].mean():.2f}, Optimal Median cost: {test_df['optimal_cost'].median():.2f}")

linear_model train accuracy: 91.37%
linear_model test accuracy: 91.22%


Unnamed: 0,Feature,Coefficient
0,N,0.9627
1,n,0.0
2,h,13.5662
3,c,-0.0258
4,mean_n,0.1983
5,std_n,4.5544
6,alpha_hat,-0.0607
7,beta_hat,-7.0255
8,u_star_hat,0.8331


Actual Mean cost: 10.21, Actual Median cost: 5.10
Optimal Mean cost: 5.31, Optimal Median cost: 2.29


## Random Forest for u_star

In [70]:
# Random forest that predicts u_star directly; random_state is fixed so the fit
# is reproducible across runs.
random_forest_model = RandomForestRegressor(
    random_state=50,
    max_features='sqrt',
    n_estimators=200,
    min_samples_leaf=2,
)
random_forest_model.fit(X_train, y_train)

# NOTE: for scikit-learn regressors .score() returns R^2 (coefficient of
# determination); that is the quantity the "accuracy" labels below report.
print(f'random_forest_model train accuracy: {random_forest_model.score(X_train, y_train):.2%}')
print(f'random_forest_model test accuracy: {random_forest_model.score(X_test, y_test):.2%}')

# Feature importances from the fitted forest, one per input column (the frame
# keeps the name coef_df used by the other model cells).
coef_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': np.round(random_forest_model.feature_importances_, 4)})
display(coef_df)

# Store the test-set predictions and pass them to methods.cal_cost as its fourth
# argument (assumed to be the chosen decision value -- confirm in utils.methods).
# Both assignments replace any existing test_df columns of the same name.
test_df['predicted_u_star'] = random_forest_model.predict(X_test)
test_df['actual_cost'] = test_df.apply(
    lambda row: methods.cal_cost(row['c'], row['h'], row['u'], row['predicted_u_star']),
    axis=1,
)

# Double-quoted f-strings: reusing the enclosing quote inside a replacement field
# is a SyntaxError before Python 3.12. The printed text is unchanged.
print(f"Actual Mean cost: {test_df['actual_cost'].mean():.2f}, Actual Median cost: {test_df['actual_cost'].median():.2f}")
print(f"Optimal Mean cost: {test_df['optimal_cost'].mean():.2f}, Optimal Median cost: {test_df['optimal_cost'].median():.2f}")

random_forest_model train accuracy: 97.71%
random_forest_model test accuracy: 91.19%


Unnamed: 0,Feature,Importance
0,N,0.1442
1,n,0.0
2,h,0.0038
3,c,0.0041
4,mean_n,0.1942
5,std_n,0.0582
6,alpha_hat,0.0378
7,beta_hat,0.0146
8,u_star_hat,0.543


Actual Mean cost: 10.10, Actual Median cost: 4.96
Optimal Mean cost: 5.31, Optimal Median cost: 2.29


## Gradient Boosting for u_star

In [71]:
# Gradient-boosted trees that predict u_star directly; random_state is fixed so
# the fit is reproducible across runs.
gradient_boost_model = GradientBoostingRegressor(
    random_state=50,
    min_samples_split=6,
    min_samples_leaf=2,
    max_depth=5,
)
gradient_boost_model.fit(X_train, y_train)

# NOTE: for scikit-learn regressors .score() returns R^2 (coefficient of
# determination); that is the quantity the "accuracy" labels below report.
print(f'gradient_boost_model train accuracy: {gradient_boost_model.score(X_train, y_train):.2%}')
print(f'gradient_boost_model test accuracy: {gradient_boost_model.score(X_test, y_test):.2%}')

# Feature importances from the fitted ensemble, one per input column.
coef_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': np.round(gradient_boost_model.feature_importances_, 4)})
display(coef_df)

# Store the test-set predictions and pass them to methods.cal_cost as its fourth
# argument (assumed to be the chosen decision value -- confirm in utils.methods).
# Both assignments replace any existing test_df columns of the same name.
test_df['predicted_u_star'] = gradient_boost_model.predict(X_test)
test_df['actual_cost'] = test_df.apply(
    lambda row: methods.cal_cost(row['c'], row['h'], row['u'], row['predicted_u_star']),
    axis=1,
)

# Double-quoted f-strings: reusing the enclosing quote inside a replacement field
# is a SyntaxError before Python 3.12. The printed text is unchanged.
print(f"Actual Mean cost: {test_df['actual_cost'].mean():.2f}, Actual Median cost: {test_df['actual_cost'].median():.2f}")
print(f"Optimal Mean cost: {test_df['optimal_cost'].mean():.2f}, Optimal Median cost: {test_df['optimal_cost'].median():.2f}")

gradient_boost_model train accuracy: 95.30%
gradient_boost_model test accuracy: 91.20%


Unnamed: 0,Feature,Importance
0,N,0.0084
1,n,0.0
2,h,0.0007
3,c,0.0009
4,mean_n,0.0076
5,std_n,0.0041
6,alpha_hat,0.0059
7,beta_hat,0.0038
8,u_star_hat,0.9685


Actual Mean cost: 10.01, Actual Median cost: 4.87
Optimal Mean cost: 5.31, Optimal Median cost: 2.29


## Train and Test for *z*

In [72]:
# Same feature columns as the u_star experiments, selected once for both splits.
X_train, X_test = (
    frame[['N', 'n', 'h', 'c', 'mean_n', 'std_n', 'alpha_hat', 'beta_hat', 'u_star_hat']]
    for frame in (train_df, test_df)
)

# Regression target for this section: the z column. In the sample rows displayed
# from train_df.head(), z matches u_star / u_star_hat.
y_train, y_test = train_df['z'], test_df['z']


## Linear Regression for z

In [73]:
# Fit a linear regression that predicts z from the feature matrix.
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

# NOTE: for scikit-learn regressors .score() returns R^2 (coefficient of
# determination); that is the quantity the "accuracy" labels below report.
print(f'linear_model train accuracy: {linear_model.score(X_train, y_train):.2%}')
print(f'linear_model test accuracy: {linear_model.score(X_test, y_test):.2%}')

# One fitted coefficient per input column, rounded to 4 decimals for display.
coef_df = pd.DataFrame({'Feature': X_train.columns, 'Coefficient': np.round(linear_model.coef_, 4)})
display(coef_df)

# Turn the predicted z back into a u_star prediction by scaling u_star_hat, then
# pass it to methods.cal_cost as its fourth argument (assumed to be the chosen
# decision value -- confirm in utils.methods).
# Both assignments replace any existing test_df columns of the same name.
test_df['predicted_u_star'] = linear_model.predict(X_test) * test_df['u_star_hat']
test_df['actual_cost'] = test_df.apply(
    lambda row: methods.cal_cost(row['c'], row['h'], row['u'], row['predicted_u_star']),
    axis=1,
)

# Double-quoted f-strings: reusing the enclosing quote inside a replacement field
# is a SyntaxError before Python 3.12. The printed text is unchanged.
print(f"Actual Mean cost: {test_df['actual_cost'].mean():.2f}, Actual Median cost: {test_df['actual_cost'].median():.2f}")
print(f"Optimal Mean cost: {test_df['optimal_cost'].mean():.2f}, Optimal Median cost: {test_df['optimal_cost'].median():.2f}")

linear_model train accuracy: 11.74%
linear_model test accuracy: 11.54%


Unnamed: 0,Feature,Coefficient
0,N,0.0
1,n,0.0
2,h,0.0303
3,c,0.0006
4,mean_n,-0.0055
5,std_n,-0.0703
6,alpha_hat,-0.0044
7,beta_hat,0.0407
8,u_star_hat,0.0


Actual Mean cost: 10.42, Actual Median cost: 5.26
Optimal Mean cost: 5.31, Optimal Median cost: 2.29


## Random Forest for z

In [74]:
# Random forest that predicts z; random_state is fixed so the fit is
# reproducible across runs.
random_forest_model = RandomForestRegressor(
    random_state=50,
    max_features='sqrt',
    n_estimators=200,
    min_samples_leaf=2,
)
random_forest_model.fit(X_train, y_train)

# NOTE: for scikit-learn regressors .score() returns R^2 (coefficient of
# determination); that is the quantity the "accuracy" labels below report.
print(f'random_forest_model train accuracy: {random_forest_model.score(X_train, y_train):.2%}')
print(f'random_forest_model test accuracy: {random_forest_model.score(X_test, y_test):.2%}')

# Feature importances from the fitted forest, one per input column.
coef_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': np.round(random_forest_model.feature_importances_, 4)})
display(coef_df)

# Turn the predicted z back into a u_star prediction by scaling u_star_hat, then
# pass it to methods.cal_cost as its fourth argument (assumed to be the chosen
# decision value -- confirm in utils.methods).
# Both assignments replace any existing test_df columns of the same name.
test_df['predicted_u_star'] = random_forest_model.predict(X_test) * test_df['u_star_hat']
test_df['actual_cost'] = test_df.apply(
    lambda row: methods.cal_cost(row['c'], row['h'], row['u'], row['predicted_u_star']),
    axis=1,
)

# Double-quoted f-strings: reusing the enclosing quote inside a replacement field
# is a SyntaxError before Python 3.12. The printed text is unchanged.
print(f"Actual Mean cost: {test_df['actual_cost'].mean():.2f}, Actual Median cost: {test_df['actual_cost'].median():.2f}")
print(f"Optimal Mean cost: {test_df['optimal_cost'].mean():.2f}, Optimal Median cost: {test_df['optimal_cost'].median():.2f}")

random_forest_model train accuracy: 80.13%
random_forest_model test accuracy: 29.18%


Unnamed: 0,Feature,Importance
0,N,0.0854
1,n,0.0
2,h,0.0247
3,c,0.0317
4,mean_n,0.3049
5,std_n,0.137
6,alpha_hat,0.1415
7,beta_hat,0.1079
8,u_star_hat,0.1668


Actual Mean cost: 9.87, Actual Median cost: 4.78
Optimal Mean cost: 5.31, Optimal Median cost: 2.29


## Gradient Boosting for z

In [75]:
# Gradient-boosted trees that predict z; random_state is fixed so the fit is
# reproducible across runs.
gradient_boost_model = GradientBoostingRegressor(
    random_state=50,
    min_samples_split=6,
    min_samples_leaf=2,
    max_depth=5,
)
gradient_boost_model.fit(X_train, y_train)

# NOTE: for scikit-learn regressors .score() returns R^2 (coefficient of
# determination); that is the quantity the "accuracy" labels below report.
print(f'gradient_boost_model train accuracy: {gradient_boost_model.score(X_train, y_train):.2%}')
print(f'gradient_boost_model test accuracy: {gradient_boost_model.score(X_test, y_test):.2%}')

# Feature importances from the fitted ensemble, one per input column.
coef_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': np.round(gradient_boost_model.feature_importances_, 4)})
display(coef_df)

# Turn the predicted z back into a u_star prediction by scaling u_star_hat, then
# pass it to methods.cal_cost as its fourth argument (assumed to be the chosen
# decision value -- confirm in utils.methods).
# Both assignments replace any existing test_df columns of the same name.
test_df['predicted_u_star'] = gradient_boost_model.predict(X_test) * test_df['u_star_hat']
test_df['actual_cost'] = test_df.apply(
    lambda row: methods.cal_cost(row['c'], row['h'], row['u'], row['predicted_u_star']),
    axis=1,
)

# Double-quoted f-strings: reusing the enclosing quote inside a replacement field
# is a SyntaxError before Python 3.12. The printed text is unchanged.
print(f"Actual Mean cost: {test_df['actual_cost'].mean():.2f}, Actual Median cost: {test_df['actual_cost'].median():.2f}")
print(f"Optimal Mean cost: {test_df['optimal_cost'].mean():.2f}, Optimal Median cost: {test_df['optimal_cost'].median():.2f}")

gradient_boost_model train accuracy: 56.77%
gradient_boost_model test accuracy: 29.64%


Unnamed: 0,Feature,Importance
0,N,0.0257
1,n,0.0
2,h,0.0132
3,c,0.0123
4,mean_n,0.5995
5,std_n,0.0595
6,alpha_hat,0.1163
7,beta_hat,0.0601
8,u_star_hat,0.1133


Actual Mean cost: 9.86, Actual Median cost: 4.75
Optimal Mean cost: 5.31, Optimal Median cost: 2.29
