In [11]:
from catboost import CatBoostRegressor, Pool
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

from utils import methods, math_expressions as me

In [12]:
train_df = pd.read_csv('../../data/gen_test_v3.csv')
test_df = pd.read_csv('../../data/gen_test_v3.csv')

In [13]:
train_df.head()

h = 1 / 15
c = 25

In [14]:
from sympy import diff, symbols, lambdify
d = symbols('d')
expr = me.cus_cost(h, c, d)
expr_grad = diff(expr, d)
expr_hess = diff(expr_grad, d)

display(expr)
display(expr_grad)
display(expr_hess)


if True:
    f = lambdify(d, expr)
    f_grad = lambdify(d, expr_grad)
    f_hess = lambdify(d, expr_hess)

else:
    f = lambda x: expr.subs(d, x).evalf(5)
    f_grad = lambda x: expr_grad.subs(d, x).evalf(5)
    f_hess = lambda x: expr_hess.subs(d, x).evalf(5)

Piecewise((-0.0666666666666667*d, d <= -0.286980112508514), (25/(1 + exp(-25*d)), True))

Piecewise((-0.0666666666666667, d <= -0.286980112508514), (625*exp(-25*d)/(1 + exp(-25*d))**2, True))

Piecewise((0, d <= -0.286980112508514), (-15625*exp(-25*d)/(1 + exp(-25*d))**2 + 31250*exp(-50*d)/(1 + exp(-25*d))**3, True))

In [15]:
class CustomLoss:
    def calc_ders_range(self, approxes, targets, weights):
        assert len(approxes) == len(targets)
        if weights is not None:
            assert len(weights) == len(approxes)
        result = []
        for i in range(len(targets)):
            actual = targets[i]
            predicted = approxes[i]

            diff = predicted - actual
            
            der1 = f_grad(diff)
            der2 = f_hess(diff)

            if weights is not None:
                der1 *= weights[i]
                der2 *= weights[i]
            result.append((der1, der2))
        return result
    
class CustomMetric(object):
    def get_final_error(self, error, weight):
        return error / (weight + 1e-38)

    def is_max_optimal(self):
        return False

    def evaluate(self, approxes, target, weight):
        assert len(approxes) == 1
        assert len(target) == len(approxes[0])

        approx = approxes[0]

        error_sum = 0.0
        weight_sum = 0.0

        for i in range(len(approx)):

            actual = target[i]
            predicted = approx[i]

            diff = predicted - actual
            error = f(diff)
            

            w = 1.0 if weight is None else weight[i]
            weight_sum += w
            error_sum += w * error

        return error_sum, weight_sum


In [16]:
if True:
    X_train = train_df[['N', 'n', 'mean_n', 'std_n', 'alpha_hat', 'beta_hat', 'u_star_hat']]
    y_train = train_df['u']

    X_test = test_df[['N', 'n', 'mean_n', 'std_n', 'alpha_hat', 'beta_hat', 'u_star_hat']]
    y_test = test_df['u']

else:
    X = train_df[['N', 'n', 'mean_n', 'std_n', 'alpha_hat', 'beta_hat', 'u_star_hat']]
    y = train_df['u_star']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [17]:
custom_loss = CustomLoss()
custom_metric = CustomMetric()

model = CatBoostRegressor(
    iterations=1000,
    depth=7,
    learning_rate=0.01,
    eval_metric=custom_metric,
    loss_function=custom_loss,
    early_stopping_rounds=50
)

model.fit(X_train, y_train)

  _check_train_params(params)
  return select([less_equal(d, -0.286980112508514),True], [-0.0666666666666667,625*exp(-25*d)/(1 + exp(-25*d))**2], default=nan)
  return select([less_equal(d, -0.286980112508514),True], [-0.0666666666666667,625*exp(-25*d)/(1 + exp(-25*d))**2], default=nan)
  return select([less_equal(d, -0.286980112508514),True], [-0.0666666666666667,625*exp(-25*d)/(1 + exp(-25*d))**2], default=nan)
  return select([less_equal(d, -0.286980112508514),True], [-0.0666666666666667,625*exp(-25*d)/(1 + exp(-25*d))**2], default=nan)
  return select([less_equal(d, -0.286980112508514),True], [-0.0666666666666667*d,25/(1 + exp(-25*d))], default=nan)


0:	learn: 25.1030734	total: 1.25s	remaining: 20m 44s
1:	learn: 25.1030289	total: 2.48s	remaining: 20m 38s


Training has stopped (degenerate solution on iteration 2, probably too small l2-regularization, try to increase it)


<catboost.core.CatBoostRegressor at 0x16a770a40>

In [18]:
print(f'Best score: {model.best_score_}')
coef_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': np.round(model.get_feature_importance(), 4)})
display(coef_df)

Best score: {'learn': {'CustomMetric': 25.103028878325574}}


Unnamed: 0,Feature,Importance
0,N,11.2213
1,n,0.0
2,mean_n,17.2843
3,std_n,29.4491
4,alpha_hat,25.1702
5,beta_hat,16.8752
6,u_star_hat,0.0


In [19]:
test_df['predicted_u_star'] = model.predict(X_test)
test_df['actual_cost'] = test_df.apply(lambda row: methods.cal_cost(row['c'], row['h'], row['u'], row['predicted_u_star']), axis=1)
print(f'Actual Mean cost: {test_df['actual_cost'].mean():.2f}, Actual Median cost: {test_df['actual_cost'].median():.2f}')
print(f'Optimal Mean cost: {test_df['optimal_cost'].mean():.2f}, Optimal Median cost: {test_df['optimal_cost'].median():.2f}')

Actual Mean cost: 25.10, Actual Median cost: 21.25
Optimal Mean cost: 4.36, Optimal Median cost: 3.17


In [20]:
test_df.head(10)

Unnamed: 0,alpha,beta,h,c,N,n,mean_n,std_n,alpha_hat,beta_hat,intervals_str,u,u_star,u_star_hat,z,optimal_cost,actual_cost,predicted_u_star
0,5,2.5,0.066667,25,29,5,12.932681,4.299833,9.046363,1.4296,9.873611453932869_12.181558790848907_20.439509...,373.970646,308.401152,330.455982,0.933259,4.3713,24.931287,0.001334
1,7,1.5,0.066667,25,16,5,9.336215,3.195497,8.536206,1.09372,8.266907462181138_9.999238032738244_14.2029088...,161.389327,135.114076,121.730806,1.109942,1.751683,10.7592,0.001334
2,7,1.0,0.066667,25,21,5,9.326094,4.260114,4.792446,1.945999,9.216064224518064_8.939377602897661_6.74126765...,143.176863,120.497418,157.110477,0.76696,1.511963,9.545035,0.001334
3,3,2.0,0.066667,25,16,5,6.171769,2.741844,5.066802,1.21808,4.268162630400795_5.646163787763241_10.9732434...,108.552055,67.076424,74.632356,0.898758,2.765042,7.236715,0.001334
4,3,1.0,0.066667,25,30,5,3.408672,3.727784,0.836121,4.07677,2.7139503189213166_1.8650227067252927_9.985086...,82.809339,68.600863,63.23327,1.084886,0.947232,5.520533,0.001338
5,3,1.5,0.066667,25,20,5,3.769411,2.560751,2.166767,1.739648,3.11735382807551_1.9783134491147916_0.91413133...,87.63787,64.876472,50.806139,1.276942,1.517427,5.842435,0.00134
6,3,2.5,0.066667,25,15,5,9.931571,2.022021,24.124854,0.411674,10.139845582668807_13.268196488069144_8.596625...,106.97037,78.709432,130.343916,0.60386,1.884062,7.131269,0.001334
7,3,3.0,0.066667,25,36,5,7.128026,5.330372,1.788231,3.986078,3.956626898229583_1.8282626746323363_13.739765...,334.042238,268.574966,200.318648,1.340739,4.364485,22.269393,0.001336
8,5,3.0,0.066667,25,16,5,18.323201,6.923162,7.004767,2.615819,21.25558763838263_19.034362899181332_8.6877522...,233.670311,190.557184,242.398413,0.786132,2.874208,15.577932,0.001334
9,3,1.5,0.066667,25,15,5,5.778133,2.196956,6.917228,0.835325,8.134566061803032_6.967224733616182_2.94335748...,53.291864,45.443092,67.092193,0.677323,0.523251,3.552702,0.001334
