# RERF

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from joblib import dump
import numpy as np
import time

In [2]:
train =  pd.read_parquet('../../../data/model_input/train_sets/car_insurance.parquet')
validation =  pd.read_parquet('../../../data/model_input/validation_sets/car_insurance.parquet')

In [3]:
y_train = train.OUTCOME
X_train = train.drop(columns=['OUTCOME'])

In [4]:
y_validation = validation.OUTCOME
X_validation = validation.drop(columns=['OUTCOME'])

In [5]:
scaler = StandardScaler().fit(X_train)

In [6]:
X_train_scaled = scaler.transform(X_train)
X_validation_scaled = scaler.transform(X_validation)

In [13]:
max_depth = [1, 2, 3, 4, 5, 7, 10, 15, 20]
n_est = [i for i in range(5, 40, 5)]
lr = [LogisticRegression(penalty=None), LogisticRegression(), LogisticRegression(penalty='l1', solver='liblinear'),
     LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.25),
     LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.5),
     LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.75)]

In [14]:
names_list = []

for n in n_est:
    for depth in max_depth:
        for est in lr:
            if est.penalty == None:
                name = f'RERF_{depth}_{n}'
            elif est.penalty == 'elasticnet':
                name = f'RERF_{depth}_{n}_en_{est.l1_ratio}'
            else:
                name = f'RERF_{depth}_{n}_{est.penalty}'
            
            names_list.append(name)

In [15]:
metrics = {}

In [16]:
it = 0

for n in n_est:
    for i in max_depth:
        for reg in lr:
            name = names_list[it]
            start_time = time.time()
            
            reg.fit(X_train_scaled, y_train)
            reg_pred_train = reg.predict_proba(X_train_scaled)[:, 1]
            reg_pred_val = reg.predict_proba(X_validation_scaled)[:, 1]
            res_train = y_train - reg_pred_train
            res_valid = y_validation - reg_pred_val
            
            rf = RandomForestRegressor(n_estimators=n, max_depth=i)
            rf.fit(X_train_scaled, res_train)
            rf_pred_train = rf.predict(X_train_scaled)
            rf_pred_val = rf.predict(X_validation_scaled)
            
            train_pred = reg_pred_train + rf_pred_train
            val_pred = reg_pred_val + rf_pred_val

            metrics[name] = {
                'Train_AUC': roc_auc_score(y_train, train_pred),
                'Validation_AUC': roc_auc_score(y_validation, val_pred),
                'Run_Time': time.time() - start_time
            }
            
            it += 1

In [17]:
metrics = pd.DataFrame.from_dict(metrics, orient='index',columns=['Run_Time', 'Train_AUC', 'Validation_AUC'])
metrics['delta%'] = 100*(metrics.Validation_AUC - metrics.Train_AUC) / metrics.Train_AUC

In [18]:
metrics

Unnamed: 0,Run_Time,Train_AUC,Validation_AUC,delta%
RERF_1_5,0.069807,0.910654,0.907896,-0.302792
RERF_1_5_l2,0.047872,0.910546,0.907766,-0.305381
RERF_1_5_l1,0.194480,0.910659,0.907851,-0.308307
RERF_1_5_en_0.25,0.577984,0.910672,0.907898,-0.304580
RERF_1_5_en_0.5,0.206449,0.910547,0.907761,-0.305923
...,...,...,...,...
RERF_20_35_l2,1.650471,0.999786,0.911963,-8.784190
RERF_20_35_l1,1.859064,0.999749,0.910411,-8.936086
RERF_20_35_en_0.25,2.016250,0.999679,0.915029,-8.467761
RERF_20_35_en_0.5,1.799567,0.999541,0.914685,-8.489464


In [19]:
metrics.sort_values(by='Validation_AUC', ascending=False)

Unnamed: 0,Run_Time,Train_AUC,Validation_AUC,delta%
RERF_7_20_l2,0.336101,0.937293,0.922312,-1.598279
RERF_7_20,0.335037,0.936160,0.921875,-1.525923
RERF_7_30_en_0.75,0.646069,0.936975,0.921693,-1.630968
RERF_4_10_l1,0.253018,0.926377,0.921603,-0.515354
RERF_5_25_en_0.25,0.650511,0.928317,0.921544,-0.729594
...,...,...,...,...
RERF_20_10,0.583439,0.998234,0.891336,-10.708726
RERF_20_5,0.252704,0.993853,0.890067,-10.442746
RERF_20_5_en_0.25,0.672400,0.994094,0.888987,-10.573197
RERF_20_5_l1,0.514060,0.993840,0.884514,-11.000361


Let's choose **RERF_7_20_l2**

In [30]:
metrics.to_csv('../../../data/model_output/metrics/car_insurance/rerf.csv')

In [6]:
test = pd.read_parquet('../../../data/model_input/test_sets/car_insurance.parquet')

In [7]:
y_test = test.OUTCOME
X_test = test.drop(columns=['OUTCOME'])

In [8]:
retrain = pd.concat([train, validation])

In [9]:
y_retrain = retrain.OUTCOME
X_retrain = retrain.drop(columns=['OUTCOME'])

In [10]:
scaler_retrain = StandardScaler().fit(X_retrain)

In [11]:
X_retrain_scaled = scaler_retrain.transform(X_retrain)
X_test_scaled = scaler_retrain.transform(X_test)

In [26]:
# cell for the final best model

lr = LogisticRegression().fit(X_retrain_scaled, y_retrain)
reg_pred_retrain = lr.predict_proba(X_retrain_scaled)[:, 1]
res_retrain = y_retrain - reg_pred_retrain
            
rf = RandomForestRegressor(n_estimators=20, max_depth=7).fit(X_retrain_scaled, res_retrain)

reg_pred_test = lr.predict_proba(X_test_scaled)[:, 1]
res_test = y_test - reg_pred_test
rf_pred_test = rf.predict(X_test_scaled)
            
test_pred = reg_pred_test + rf_pred_test

In [27]:
auc = roc_auc_score(y_test, test_pred)
print('Test AUC:', round(auc, 4))

Test AUC: 0.9269


In [28]:
dump(lr, '../../../data/model_output/models/car_insurance/best_rerf_lr.joblib')
dump(rf, '../../../data/model_output/models/car_insurance/best_rerf_rf.joblib')

['best_rerf_rf.joblib']

In [29]:
with open('../../../data/model_output/metrics/car_insurance/final_AUC.txt', 'a') as f:
    f.write('\nRERF, '+ str(auc))

In [12]:
from joblib import load
load_mod1 = load('../../../data/model_output/models/car_insurance/best_rerf_lr.joblib')
load_mod2 = load('../../../data/model_output/models/car_insurance/best_rerf_rf.joblib')

In [13]:
train_pred = load_mod1.predict_proba(X_retrain_scaled)[:, 1] + load_mod2.predict(X_retrain_scaled)
auc_train = roc_auc_score(y_retrain, train_pred)

test_pred = load_mod1.predict_proba(X_test_scaled)[:, 1] + load_mod2.predict(X_test_scaled)
auc_test = roc_auc_score(y_test, test_pred)

In [14]:
import time 
time1 = time.time()
lr = LogisticRegression().fit(X_retrain_scaled, y_retrain)
reg_pred_retrain = lr.predict_proba(X_retrain_scaled)[:, 1]
res_retrain = y_retrain - reg_pred_retrain
rf = RandomForestRegressor(n_estimators=20, max_depth=7).fit(X_retrain_scaled, res_retrain)
time2 = time.time() - time1 

In [15]:
with open('../../../data/model_output/metrics/car_insurance/final_metrics.txt', 'a') as f:
     f.write(f'\nRERF; {time2}; {auc_train}; {auc_test}')