In [19]:


import numpy as np
import pandas as pd
import math
import statistics
import random
import importlib

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

from utils import math_expressions as mexpr, methods

In [20]:
# Read the generated training and test sets from CSV (paths are relative to
# the notebook's working directory).
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../../data/gen_train.csv', '../../data/gen_test.csv')
)

In [21]:
# Preview the first five training rows to sanity-check the loaded columns.
train_df.head(n=5)

Unnamed: 0,alpha,beta,h,c,N,n,mean_n,std_n,alpha_hat,beta_hat,intervals_str,u,u_star,u_star_hat,z,optimal_cost,actual_cost
0,7,1.5,0.4,25,13,5,10.657763,3.154803,11.41268,0.933853,9.787753058341407_13.769005652634771_10.544099...,134.208964,119.134863,123.157898,0.967334,6.02964,4.420426
1,2,2.0,0.4,20,35,5,4.607003,4.979874,0.855855,5.382922,6.589942073822911_12.505012974866775_0.9746126...,153.974095,124.175503,147.740416,0.840498,11.919437,2.493471
2,7,2.0,0.1,25,20,5,13.827765,8.330743,2.755095,5.01898,28.421817353800066_8.08111914565828_9.01414832...,287.20138,239.382016,221.378437,1.081325,4.781936,6.582294
3,3,1.0,0.05,25,43,5,2.437155,1.831173,1.771363,1.375864,3.409437374082151_1.015279224032903_1.37928203...,158.165646,102.748549,77.607158,1.323957,2.770855,4.027924
4,2,2.0,0.4,20,34,5,4.09058,2.232379,3.357639,1.218291,4.475167957800123_7.740102799607751_3.43948284...,142.94377,120.199163,124.4973,0.965476,9.097843,7.378588


In [22]:
# Predictor columns shared by every model below; declared once so the train
# and test matrices are guaranteed to use the same columns in the same order.
feature_cols = ['N', 'n', 'h', 'c', 'mean_n', 'std_n', 'alpha_hat', 'beta_hat', 'u_star_hat']

# Regression target. In the sample rows printed by head(), z matches
# u_star / u_star_hat, and the model cells multiply predictions by
# u_star_hat to recover u_star -- TODO confirm against the data generator.
target_col = 'z'

X_train, y_train = train_df[feature_cols], train_df[target_col]
X_test, y_test = test_df[feature_cols], test_df[target_col]


## Linear Model

In [23]:
# Baseline: ordinary least-squares regression of z on the selected features.
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

# NOTE: for scikit-learn regressors .score() returns the coefficient of
# determination (R^2), not an accuracy. The printed label is left unchanged
# so it keeps matching the recorded output of this cell.
print(f'linear_model train accuracy: {linear_model.score(X_train, y_train):.2%}')
print(f'linear_model test accuracy: {linear_model.score(X_test, y_test):.2%}')

# One fitted coefficient per input feature, rounded to 4 decimals for display.
coef_df = pd.DataFrame({'Feature': X_train.columns, 'Coefficient': np.round(linear_model.coef_, 4)})
display(coef_df)

# The model predicts a multiplier on u_star_hat; rescale to get the predicted u_star.
test_df['predicted_u_star'] = linear_model.predict(X_test) * test_df['u_star_hat']

# Cost of using the predicted u_star, evaluated row by row via the project helper.
# NOTE(review): this writes into test_df in place and replaces any existing
# 'actual_cost' / 'predicted_u_star' columns, so the values always reflect the
# most recently executed model cell.
test_df['actual_cost'] = test_df.apply(
    lambda row: methods.cal_cost(row['c'], row['h'], row['u'], row['predicted_u_star']),
    axis=1,
)

# Outer double quotes: reusing the single-quote delimiter inside the
# replacement fields is only valid on Python 3.12+ (PEP 701) and is a
# SyntaxError on earlier interpreters. The printed text is unchanged.
print(f"Actual Mean cost: {test_df['actual_cost'].mean():.2f}, Actual Median cost: {test_df['actual_cost'].median():.2f}")
print(f"Optimal Mean cost: {test_df['optimal_cost'].mean():.2f}, Optimal Median cost: {test_df['optimal_cost'].median():.2f}")

linear_model train accuracy: 12.33%
linear_model test accuracy: 11.71%


Unnamed: 0,Feature,Coefficient
0,N,0.0005
1,n,0.0
2,h,0.0429
3,c,-0.0002
4,mean_n,-0.0199
5,std_n,-0.023
6,alpha_hat,-0.0017
7,beta_hat,0.004
8,u_star_hat,-0.0001


Actual Mean cost: 10.68, Actual Median cost: 5.76
Optimal Mean cost: 5.00, Optimal Median cost: 2.67


## Random Forest

In [24]:
# Random forest regressor for z; random_state is fixed so refits are repeatable.
random_forest_model = RandomForestRegressor(random_state=50, max_features='sqrt', n_estimators=200, min_samples_leaf=2)
random_forest_model.fit(X_train, y_train)

# NOTE: for scikit-learn regressors .score() returns the coefficient of
# determination (R^2), not an accuracy. The printed label is left unchanged
# so it keeps matching the recorded output of this cell.
print(f'random_forest_model train accuracy: {random_forest_model.score(X_train, y_train):.2%}')
print(f'random_forest_model test accuracy: {random_forest_model.score(X_test, y_test):.2%}')

# NOTE: these are the forest's feature_importances_, not regression
# coefficients; the 'Coefficient' heading is kept to match the recorded table.
coef_df = pd.DataFrame({'Feature': X_train.columns, 'Coefficient': np.round(random_forest_model.feature_importances_, 4)})
display(coef_df)

# The model predicts a multiplier on u_star_hat; rescale to get the predicted u_star.
test_df['predicted_u_star'] = random_forest_model.predict(X_test) * test_df['u_star_hat']

# Cost of using the predicted u_star, evaluated row by row via the project helper.
# NOTE(review): this writes into test_df in place and replaces any existing
# 'actual_cost' / 'predicted_u_star' columns, so the values always reflect the
# most recently executed model cell.
test_df['actual_cost'] = test_df.apply(
    lambda row: methods.cal_cost(row['c'], row['h'], row['u'], row['predicted_u_star']),
    axis=1,
)

# Outer double quotes: reusing the single-quote delimiter inside the
# replacement fields is only valid on Python 3.12+ (PEP 701) and is a
# SyntaxError on earlier interpreters. The printed text is unchanged.
print(f"Actual Mean cost: {test_df['actual_cost'].mean():.2f}, Actual Median cost: {test_df['actual_cost'].median():.2f}")
print(f"Optimal Mean cost: {test_df['optimal_cost'].mean():.2f}, Optimal Median cost: {test_df['optimal_cost'].median():.2f}")

random_forest_model train accuracy: 80.46%
random_forest_model test accuracy: 33.24%


Unnamed: 0,Feature,Coefficient
0,N,0.0853
1,n,0.0
2,h,0.0274
3,c,0.0353
4,mean_n,0.307
5,std_n,0.1378
6,alpha_hat,0.1339
7,beta_hat,0.1067
8,u_star_hat,0.1666


Actual Mean cost: 10.17, Actual Median cost: 5.16
Optimal Mean cost: 5.00, Optimal Median cost: 2.67


## Gradient Boost

In [25]:
# Gradient-boosted trees for z; random_state is fixed so refits are repeatable.
gradient_boost_model = GradientBoostingRegressor(random_state=50, min_samples_split=6, min_samples_leaf=2, max_depth=5)
gradient_boost_model.fit(X_train, y_train)

# NOTE: for scikit-learn regressors .score() returns the coefficient of
# determination (R^2), not an accuracy. The printed label is left unchanged
# so it keeps matching the recorded output of this cell.
print(f'gradient_boost_model train accuracy: {gradient_boost_model.score(X_train, y_train):.2%}')
print(f'gradient_boost_model test accuracy: {gradient_boost_model.score(X_test, y_test):.2%}')

# NOTE: these are the ensemble's feature_importances_, not regression
# coefficients; the 'Coefficient' heading is kept to match the recorded table.
coef_df = pd.DataFrame({'Feature': X_train.columns, 'Coefficient': np.round(gradient_boost_model.feature_importances_, 4)})
display(coef_df)

# The model predicts a multiplier on u_star_hat; rescale to get the predicted u_star.
test_df['predicted_u_star'] = gradient_boost_model.predict(X_test) * test_df['u_star_hat']

# Cost of using the predicted u_star, evaluated row by row via the project helper.
# NOTE(review): this writes into test_df in place and replaces any existing
# 'actual_cost' / 'predicted_u_star' columns, so the values always reflect the
# most recently executed model cell.
test_df['actual_cost'] = test_df.apply(
    lambda row: methods.cal_cost(row['c'], row['h'], row['u'], row['predicted_u_star']),
    axis=1,
)

# Outer double quotes: reusing the single-quote delimiter inside the
# replacement fields is only valid on Python 3.12+ (PEP 701) and is a
# SyntaxError on earlier interpreters. The printed text is unchanged.
print(f"Actual Mean cost: {test_df['actual_cost'].mean():.2f}, Actual Median cost: {test_df['actual_cost'].median():.2f}")
print(f"Optimal Mean cost: {test_df['optimal_cost'].mean():.2f}, Optimal Median cost: {test_df['optimal_cost'].median():.2f}")

gradient_boost_model train accuracy: 55.56%
gradient_boost_model test accuracy: 36.35%


Unnamed: 0,Feature,Coefficient
0,N,0.0477
1,n,0.0
2,h,0.0211
3,c,0.017
4,mean_n,0.6118
5,std_n,0.0517
6,alpha_hat,0.0994
7,beta_hat,0.0507
8,u_star_hat,0.1006


Actual Mean cost: 10.04, Actual Median cost: 5.03
Optimal Mean cost: 5.00, Optimal Median cost: 2.67
