# RERF (Regression-Enhanced Random Forest)

In [19]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from joblib import dump
import numpy as np
import time

In [2]:
import sys
from pathlib import Path

# Make the project root importable so that `notebooks.utils...` can be imported.
# The data files in this notebook are addressed as '../../../data/...', so the
# directory three levels above the working directory is tried first; it is only
# used when it actually contains a `notebooks` package directory. Otherwise the
# original machine-specific location is kept as a fallback.
LEGACY_PROJECT_ROOT = 'F:\\Users\\Manuel García Plaza\\Desktop\\TFG\\'
relative_project_root = Path('../../..').resolve()

if (relative_project_root / 'notebooks').is_dir():
    PROJECT_ROOT = str(relative_project_root)
else:
    PROJECT_ROOT = LEGACY_PROJECT_ROOT

# Re-running the cell must not keep appending the same entry.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [3]:
from notebooks.utils.classification_metrics import classification

In [4]:
# Load the training and validation partitions of the breast cancer data set.
train_path = '../../../data/model_input/train_sets/breast_cancer.parquet'
validation_path = '../../../data/model_input/validation_sets/breast_cancer.parquet'

train = pd.read_parquet(train_path)
validation = pd.read_parquet(validation_path)

In [5]:
# Training partition: predictors are every column except the `diagnosis` target.
X_train = train.drop('diagnosis', axis=1)
y_train = train['diagnosis']

In [6]:
# Validation partition: predictors are every column except the `diagnosis` target.
X_validation = validation.drop('diagnosis', axis=1)
y_validation = validation['diagnosis']

In [7]:
# Standardisation statistics are learned from the training features only.
scaler = StandardScaler()
scaler = scaler.fit(X_train)

In [8]:
# Apply the training-fitted scaler to both partitions.
X_train_scaled, X_validation_scaled = map(scaler.transform, (X_train, X_validation))

In [48]:
# Random-forest grid: tree depths to try and forest sizes 5, 10, ..., 50.
max_depth = [1, 2, 3, 4, 5, 7, 10, 15, 20]
n_est = list(range(5, 55, 5))

In [50]:
# Candidate first-stage classifiers, in this order: no penalty, the default
# penalty, l1, and three elastic-net mixes. The order matters because run
# labels are derived from each estimator's `penalty` / `l1_ratio` attributes.
# liblinear and saga shuffle the data while fitting, so their seed is pinned to
# make repeated runs of the notebook produce the same fits.
LR_RANDOM_STATE = 42

lr = [
    LogisticRegression(penalty=None),
    LogisticRegression(),
    LogisticRegression(penalty='l1', solver='liblinear', random_state=LR_RANDOM_STATE),
    LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.25, max_iter=700,
                       random_state=LR_RANDOM_STATE),
    LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.5, max_iter=700,
                       random_state=LR_RANDOM_STATE),
    LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.75, max_iter=1400,
                       random_state=LR_RANDOM_STATE),
]

In [53]:
# Build one label per (n_estimators, max_depth, logistic model) combination.
# The nesting order (forest size, then depth, then logistic model) must match
# the grid-search loop, which looks these labels up by running position.
names_list = []

for n in n_est:
    for depth in max_depth:
        for est in lr:
            if est.penalty is None:
                # Unpenalised logistic regression gets no suffix.
                name = f'RERF_{depth}_{n}'
            elif est.penalty == 'elasticnet':
                # Elastic net is tagged with its l1/l2 mixing ratio.
                name = f'RERF_{depth}_{n}_en_{est.l1_ratio}'
            else:
                name = f'RERF_{depth}_{n}_{est.penalty}'

            names_list.append(name)

In [20]:
# Empty container for the per-run results, keyed by run label.
metrics = dict()

In [57]:
# Grid search over every (n_estimators, max_depth, logistic model) combination.
# Each run fits the logistic model, then fits a random-forest regressor on the
# logistic model's training residuals; the final score is the sum of the
# logistic probability and the forest's predicted residual.
RF_RANDOM_STATE = 42  # fixed seed so forest results are repeatable between runs

# Labels are looked up by position, so a stale or mismatched `names_list`
# would silently mislabel runs; fail loudly instead.
expected_runs = len(n_est) * len(max_depth) * len(lr)
if len(names_list) != expected_runs:
    raise ValueError(
        f'names_list has {len(names_list)} entries but the grid has {expected_runs} combinations'
    )

it = 0

for n in n_est:
    for depth in max_depth:
        for reg in lr:
            name = names_list[it]
            # perf_counter is monotonic, which suits measuring elapsed time.
            start_time = time.perf_counter()

            # Stage 1: logistic regression probabilities for the positive class.
            reg.fit(X_train_scaled, y_train)
            reg_pred_train = reg.predict_proba(X_train_scaled)[:, 1]
            reg_pred_val = reg.predict_proba(X_validation_scaled)[:, 1]
            res_train = y_train - reg_pred_train

            # Stage 2: the forest learns the stage-1 residuals on the training set.
            rf = RandomForestRegressor(n_estimators=n, max_depth=depth,
                                       random_state=RF_RANDOM_STATE)
            rf.fit(X_train_scaled, res_train)
            rf_pred_train = rf.predict(X_train_scaled)
            rf_pred_val = rf.predict(X_validation_scaled)

            train_pred = reg_pred_train + rf_pred_train
            val_pred = reg_pred_val + rf_pred_val

            metrics[name] = {
                'Train_AUC': roc_auc_score(y_train, train_pred),
                'Validation_AUC': roc_auc_score(y_validation, val_pred),
                'Run_Time': time.perf_counter() - start_time
            }

            it += 1

In [59]:
# Turn the per-run dict into a table (one row per run label) and add the
# relative change from training AUC to validation AUC, in percent.
metric_columns = ['Run_Time', 'Train_AUC', 'Validation_AUC']
metrics = pd.DataFrame.from_dict(metrics, orient='index', columns=metric_columns)

auc_gap = metrics['Validation_AUC'] - metrics['Train_AUC']
metrics['delta%'] = 100 * auc_gap / metrics['Train_AUC']

In [61]:
# Show the grid-search results table (one row per run label).
metrics

Unnamed: 0,Run_Time,Train_AUC,Validation_AUC,delta%
RERF_1_5,0.036982,1.000000,0.977130,-2.286967
RERF_1_5_l2,0.024905,0.999171,0.997494,-0.167860
RERF_1_5_l1,0.017277,0.999198,0.964912,-3.431294
RERF_1_5_en_0.25,0.136633,0.999091,0.994361,-0.473412
RERF_1_5_en_0.5,0.141595,0.999251,0.996241,-0.301283
...,...,...,...,...
RERF_20_50_l2,0.676348,1.000000,0.994361,-0.563910
RERF_20_50_l1,0.670611,1.000000,0.995614,-0.438596
RERF_20_50_en_0.25,0.777831,1.000000,0.995614,-0.438596
RERF_20_50_en_0.5,0.783906,1.000000,0.993734,-0.626566


In [62]:
# Persist the grid-search results table, run labels included as the index column.
rerf_metrics_path = '../../../data/metrics/breast_cancer/rerf.csv'
metrics.to_csv(rerf_metrics_path)

In [67]:
# Ten runs with the highest validation AUC.
ranked_by_validation = metrics.sort_values(by='Validation_AUC', ascending=False)
ranked_by_validation.head(10)

Unnamed: 0,Run_Time,Train_AUC,Validation_AUC,delta%
RERF_2_30_en_0.75,0.343082,0.999545,0.998747,-0.079885
RERF_4_15_en_0.25,0.193454,0.999947,0.998747,-0.119971
RERF_2_5_en_0.75,0.283216,0.999438,0.998747,-0.06919
RERF_3_10_en_0.5,0.16855,0.999626,0.998747,-0.087905
RERF_1_15_en_0.75,0.295835,0.999224,0.998747,-0.047794
RERF_2_50_en_0.75,0.403347,0.999492,0.998747,-0.074538
RERF_1_10_en_0.25,0.144641,0.999144,0.99812,-0.102478
RERF_4_30_en_0.5,0.270085,0.99992,0.99812,-0.179961
RERF_15_5_en_0.5,0.192459,1.0,0.99812,-0.18797
RERF_3_20_en_0.5,0.205423,0.999893,0.99812,-0.177291


In [68]:
# Load the held-out test partition.
test_path = '../../../data/model_input/test_sets/breast_cancer.parquet'
test = pd.read_parquet(test_path)

In [69]:
# Test partition: predictors are every column except the `diagnosis` target.
X_test = test.drop('diagnosis', axis=1)
y_test = test['diagnosis']

In [70]:
# Stack training and validation rows for the final refit (original indices are kept).
retrain = pd.concat((train, validation), axis=0)

In [71]:
# Combined train + validation data: predictors are every column except `diagnosis`.
X_retrain = retrain.drop('diagnosis', axis=1)
y_retrain = retrain['diagnosis']

In [72]:
# Standardisation statistics for the final model come from train + validation only.
scaler_retrain = StandardScaler()
scaler_retrain = scaler_retrain.fit(X_retrain)

In [73]:
# Apply the refit scaler to the refit data and to the held-out test features.
X_retrain_scaled, X_test_scaled = map(scaler_retrain.transform, (X_retrain, X_test))

In [85]:
# cell for the final best model
#
# Refits the configuration ranked first by validation AUC (RERF_2_30_en_0.75:
# elastic net with l1_ratio=0.75, forest of 30 trees with max_depth=2) on the
# combined train + validation data, then scores the held-out test set.
# NOTE: this rebinds `lr` and `rf`; `lr` previously held the list of candidate
# logistic models, so that list must be rebuilt before re-running the grid search.
FINAL_RANDOM_STATE = 42  # fixed seed so the saved model and test AUC are repeatable

# Stage 1: logistic regression probabilities for the positive class.
lr = LogisticRegression(penalty='elasticnet', solver='saga', max_iter=1400, l1_ratio=0.75,
                        random_state=FINAL_RANDOM_STATE).fit(X_retrain_scaled, y_retrain)
reg_pred_retrain = lr.predict_proba(X_retrain_scaled)[:, 1]
res_retrain = y_retrain - reg_pred_retrain

# Stage 2: the forest learns the stage-1 residuals.
rf = RandomForestRegressor(n_estimators=30, max_depth=2,
                           random_state=FINAL_RANDOM_STATE).fit(X_retrain_scaled, res_retrain)

# Test-set score = logistic probability + predicted residual. Test labels are
# deliberately not touched here; they are only needed for the AUC computation.
reg_pred_test = lr.predict_proba(X_test_scaled)[:, 1]
rf_pred_test = rf.predict(X_test_scaled)

test_pred = reg_pred_test + rf_pred_test

In [86]:
# Area under the ROC curve of the combined score on the held-out test set.
auc = roc_auc_score(y_true=y_test, y_score=test_pred)
auc_rounded = round(auc, 4)
print('Test AUC:', auc_rounded)

Test AUC: 0.9832


In [87]:
# Persist both stages of the final model together with the scaler they were
# fitted on: the models expect features transformed by `scaler_retrain`, so
# without it the saved estimators cannot be applied to raw data later.
dump(scaler_retrain, 'best_rerf_scaler.joblib')
dump(lr, 'best_rerf_lr.joblib')
dump(rf, 'best_rerf_rf.joblib')

['best_rerf_rf.joblib']

In [88]:
# Append this model's test AUC to the shared results file (mode 'a': every
# execution of this cell adds another line).
final_auc_line = '\nRERF, ' + str(auc)
with open('../../../data/metrics/breast_cancer/final_AUC.txt', 'a') as f:
    f.write(final_auc_line)