**Import Python Libraries**

In [1]:
import seaborn as sns

In [2]:
import matplotlib.pyplot as plt

In [3]:
import numpy as np

In [4]:
import pandas as pd

In [36]:
import catboost as ctb

**Import Python dependencies**

In [6]:
from numpy import mean
from numpy import std
from numpy import asarray

In [7]:
from sklearn.preprocessing import Normalizer

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold

In [33]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error, r2_score

In [10]:
# Fix the seed of NumPy's global random state so NumPy-driven randomness repeats.
np.random.seed(seed = 7)

In [11]:
from hyperopt import hp

**Setting Parameters of Regression**

In [12]:
# Search space handed to hyperopt for the CatBoost regressor.
# Each hp.choice picks one entry from its grid; the optimiser later reports the
# *index* of the pick, so the grids here are what those indices refer to.
# n_estimators and eval_metric are not searched.
ctb_params_reg = dict(
    learning_rate = hp.choice('learning_rate', np.arange(0.05, 0.31, 0.05)),
    max_depth = hp.choice('max_depth', np.arange(5, 16, 5, dtype = int)),
    colsample_bylevel = hp.choice('colsample_bylevel', np.arange(0.3, 0.8, 0.1)),
    n_estimators = 100,
    eval_metric = 'RMSE',
)

**Setting fit parameters**

In [13]:
# Keyword arguments that are unpacked into the regressor's fit() call on every trial.
ctb_params_fit = dict(early_stopping_rounds = 10, verbose = False)

In [14]:
# Everything a single trial needs, bundled under the keys CatOptimizer reads:
# the search space, the fit() kwargs, and the loss to minimise (RMSE).
ctb_para = {
    'params_reg' : ctb_params_reg,
    'params_fit' : ctb_params_fit,
    'func_loss' : lambda y, pred: np.sqrt(mean_squared_error(y, pred)),
}

**CatOptimizer class for tuning the CatBoost regressor: `__init__` stores the train/test data, `process` runs the hyperparameter search, and `cat_reg` / `train_reg` build and train the model for each trial**

In [37]:
from hyperopt import fmin, tpe, Trials, STATUS_OK, STATUS_FAIL

class CatOptimizer:
    """Hyperopt driver that tunes a CatBoost regressor on one fixed train/test split.

    The four data objects given to the constructor are stored on the instance
    and reused by every trial.
    """

    def __init__ (self, x_train, x_test, y_train, y_test):
        """Keep the training and test features/targets for later trials."""
        self.x_train, self.x_test, self.y_train, self.y_test = x_train, x_test, y_train, y_test

    def process (self, f_name, space, trials, algo, max_evals):
        """Run ``fmin`` using the method named ``f_name`` as the objective.

        Returns ``(best, trials)`` when the search completes. If ``fmin``
        raises, the exception is not propagated; a dict with the keys
        ``'status'`` (``STATUS_FAIL``) and ``'exception'`` (the message text)
        is returned instead, so callers must check which shape they received.
        An unknown ``f_name`` raises ``AttributeError`` before the search starts.
        """
        objective = getattr(self, f_name)
        try:
            best = fmin(fn = objective, space = space, algo = algo,
                        max_evals = max_evals, trials = trials)
        except Exception as err:
            return {'status' : STATUS_FAIL, 'exception' : str(err)}
        else:
            return best, trials

    def cat_reg (self, para):
        """Objective function: build a regressor from ``para['params_reg']`` and score it."""
        regressor = ctb.CatBoostRegressor(**para['params_reg'])
        return self.train_reg(regressor, para)

    def train_reg (self, reg, para):
        """Fit ``reg`` on the training data and return the trial's result dict.

        Both the training and the test split are passed as ``eval_set``, and
        ``para['params_fit']`` is unpacked into ``fit``. The loss is
        ``para['func_loss']`` evaluated on the test targets and predictions.
        """
        eval_sets = [(self.x_train, self.y_train), (self.x_test, self.y_test)]
        reg.fit(self.x_train, self.y_train, eval_set = eval_sets, **para['params_fit'])
        trial_loss = para['func_loss'](self.y_test, reg.predict(self.x_test))
        return dict(loss = trial_loss, status = STATUS_OK)

In [16]:
# Read the training split from disk and list its column names.
df_t_train = pd.read_csv(filepath_or_buffer = "2021-NIRF_train.csv")
train_data = pd.DataFrame(data = df_t_train)
print(train_data.columns.values)

['SCORE' 'SS/20' 'FSR/30' 'FQE/20' 'FRU/30' 'TLR/100' 'PU/35' 'QP/40'
 'IPR/15' 'FPPP/10' 'RP/100' 'GPH/40' 'GUE/15' 'MS/25' 'GPHD/20' 'GO/100'
 'RD/30' 'WD/30' 'ESCS/20' 'PCS/20' 'OI/100' 'PR/100' 'RANK']


In [17]:
# Read the held-out test split and list its column names.
test_data = pd.read_csv(filepath_or_buffer = "2021-NIRF_test.csv")
print(test_data.columns.values)

['SCORE' 'SS/20' 'FSR/30' 'FQE/20' 'FRU/30' 'TLR/100' 'PU/35' 'QP/40'
 'IPR/15' 'FPPP/10' 'RP/100' 'GPH/40' 'GUE/15' 'MS/25' 'GPHD/20' 'GO/100'
 'RD/30' 'WD/30' 'ESCS/20' 'PCS/20' 'OI/100' 'PR/100' 'RANK']


In [18]:
# 'RANK' is the regression target; every remaining column becomes a feature.
# NOTE(review): 'SCORE' stays among the features — presumably the rank is
# derived from it, which would be target leakage; confirm before trusting metrics.
train_features, train_labels = train_data.drop(columns = 'RANK'), train_data['RANK']
test_features, test_labels = test_data.drop(columns = 'RANK'), test_data['RANK']

In [19]:
# Sanity-check the sizes: train/test features first, then train/test labels.
for split in (train_features, test_features, train_labels, test_labels):
    print(split.shape)

(190, 22)
(20, 22)
(190,)
(20,)


In [20]:
# Short aliases used by the modelling cells below.
train_X, train_Y = train_features, train_labels
test_X, test_Y = test_features, test_labels

In [21]:
# Fit the Normalizer on the training features only, then apply that same fitted
# transformer to both splits. t_test keeps the un-transformed test features.
transformer_train = Normalizer()
transformer_train.fit(train_X)
t_test = test_X
train_X = transformer_train.transform(train_X)
test_X = transformer_train.transform(test_X)

**Result class: `__init__` stores the true test targets, `record` saves a model's predictions and computes its RMSE and R2, `get_metrics` returns the metrics table, and `get_results` returns the predictions table**

In [22]:
class Result():
    """Collects test-set predictions and the RMSE / R2 score of each recorded model.

    ``results`` holds the true targets in column ``'true_test'`` plus one
    ``'<model>_pred'`` column per recorded model; ``metrics`` holds one row per
    recorded model with the columns ``model``, ``rmse`` and ``r2``.
    """
    def __init__ (self, y_true):
        """Start from the ground-truth targets and an empty metrics table."""
        self.results = pd.DataFrame({'true_test' : y_true})
        self.metrics = pd.DataFrame(columns = ('model','rmse','r2'))
    def record (self, model, y_pred):
        """Store ``y_pred`` under ``model`` and append its RMSE and R2.

        Parameters
        ----------
        model : str
            Label for the model; the prediction column is named ``model + '_pred'``.
        y_pred : array-like
            Predictions in the same order as the targets given to ``__init__``.
        """
        y_true = self.results.true_test.values
        # Predictions are positional, so give them the row labels of the targets.
        # With a default 0..n-1 index, pd.concat(axis=1) would align on labels and
        # produce NaN-padded, misaligned rows whenever y_true has any other index.
        y_pred = pd.Series(np.asarray(y_pred), index = self.results.index, name = model+'_pred')
        self.results = pd.concat([self.results, y_pred], axis = 1)
        rmse = np.sqrt(mean_squared_error(y_true,y_pred))
        r_squared = r2_score(y_true, y_pred)
        # Metric rows are labelled 1, 2, 3, ... in the order models are recorded.
        row_loc = len(self.metrics) + 1
        self.metrics.loc[row_loc] = [model, rmse, r_squared]
    def get_metrics(self):
        """Return the metrics table (one row per recorded model)."""
        return self.metrics
    def get_results(self):
        """Return the table of true targets and recorded predictions."""
        return self.results
    
# Shared collector for the test-set predictions and metrics of the models below.
res_r = Result(y_true = test_Y)

In [23]:
# Search the CatBoost space with TPE for 100 trials; each trial is scored on the
# test split passed to the optimiser. On success ctb_opt is (best, trials).
opt_model = CatOptimizer(x_train = train_X, x_test = test_X, y_train = train_Y, y_test = test_Y)
ctb_opt = opt_model.process(f_name = 'cat_reg', space = ctb_para, algo = tpe.suggest,
                            max_evals = 100, trials = Trials())
print(ctb_opt)

100%|███████████████████████████████████████████████| 100/100 [21:43<00:00, 13.03s/trial, best loss: 7.836680825832662]
({'colsample_bylevel': 3, 'learning_rate': 3, 'max_depth': 0}, <hyperopt.base.Trials object at 0x000002344CF48610>)


In [27]:
# The optimiser reports each hp.choice pick as an *index* into its grid, so the
# indices must be decoded with exactly the grids used in the search space:
#   learning_rate     -> np.arange(0.05, 0.31, 0.05)
#   max_depth         -> np.arange(5, 16, 5)   (step 5, i.e. 5 / 10 / 15)
#   colsample_bylevel -> np.arange(0.3, 0.8, 0.1)
# Decoding max_depth against a step-1 grid would turn index 1 into depth 6
# instead of the depth 10 that was actually evaluated.
best_idx = ctb_opt[0]
best_p = {'learning_rate' : float(np.arange(0.05, 0.31, 0.05)[best_idx['learning_rate']]),
          'max_depth' : int(np.arange(5, 16, 5, dtype = int)[best_idx['max_depth']]),
          'colsample_bylevel' : float(np.arange(0.3, 0.8, 0.1)[best_idx['colsample_bylevel']])}

In [28]:
# Final regressor configured with the decoded best hyperparameters.
model = ctb.CatBoostRegressor(
    n_estimators = 100,
    learning_rate = best_p['learning_rate'],
    max_depth = best_p['max_depth'],
    colsample_bylevel = best_p['colsample_bylevel'],
    early_stopping_rounds = 10,
    verbose = 0,
)

In [29]:
# Train the final model on the full (normalised) training split.
model.fit(X = train_X, y = train_Y)

<catboost.core.CatBoostRegressor at 0x2344cf758d0>

In [31]:
# Predict the target for the (normalised) test features.
pred_y = model.predict(data = test_X)

In [34]:
# Store the predictions and their RMSE / R2 under the label 'Cat Boost'.
res_r.record(model = 'Cat Boost', y_pred = pred_y)

In [35]:
# Show the first rows of the metrics table (one row per recorded model).
res_r.get_metrics().head(n = 5)

Unnamed: 0,model,rmse,r2
1,Cat Boost,7.663461,0.98169


**Final R2 and RMSE score**