# RERF: logistic regression + random forest fitted on its residuals

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from joblib import dump
import numpy as np
import time

In [2]:
# Load the training and validation partitions of the software-defect data from parquet.
train, validation = (
    pd.read_parquet('../../../data/model_input/train_sets/software_defect.parquet'),
    pd.read_parquet('../../../data/model_input/validation_sets/software_defect.parquet'),
)

In [3]:
# Separate the `defects` target column from the remaining feature columns.
X_train = train.drop(columns='defects')
y_train = train['defects']

In [4]:
# Same target/feature separation for the validation partition.
X_validation = validation.drop(columns='defects')
y_validation = validation['defects']

In [5]:
# Fit the scaler on the training features only; it is then applied to both partitions.
scaler = StandardScaler().fit(X=X_train)

In [6]:
# Apply the train-fitted scaler to the training and validation features.
X_train_scaled, X_validation_scaled = (
    scaler.transform(features) for features in (X_train, X_validation)
)

In [22]:
# Search grid: forest depth, forest size, and the first-stage logistic regressions.
max_depth = [1, 2, 3, 4, 5, 7, 10, 15, 20, 30, 40]
n_est = [5, 10, 15, 20, 30, 40, 50, 75, 100, 150, 200]

lr = [
    # no penalty
    LogisticRegression(penalty=None),
    # library default penalty
    LogisticRegression(),
    # L1 penalty
    LogisticRegression(penalty='l1', solver='liblinear'),
    # elastic net at three L1/L2 mixing ratios
    *(
        LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=ratio, max_iter=500)
        for ratio in (0.25, 0.5, 0.75)
    ),
]

In [23]:
def _rerf_name(depth, n, est):
    """Build the metrics key for one (forest depth, forest size, logistic regression) combination.

    Parameters
    ----------
    depth : value used as the forest's ``max_depth``.
    n : value used as the forest's ``n_estimators``.
    est : estimator exposing ``penalty`` (and ``l1_ratio`` for elastic net).

    Returns
    -------
    str
        ``RERF_{depth}_{n}`` when the estimator has no penalty,
        ``RERF_{depth}_{n}_en_{l1_ratio}`` for elastic net, and
        ``RERF_{depth}_{n}_{penalty}`` for any other penalty.
    """
    # Identity check: `None` is a singleton and `==` can be overridden.
    if est.penalty is None:
        return f'RERF_{depth}_{n}'
    if est.penalty == 'elasticnet':
        return f'RERF_{depth}_{n}_en_{est.l1_ratio}'
    return f'RERF_{depth}_{n}_{est.penalty}'


# One name per grid point. The nesting order (n_est -> max_depth -> lr) matters:
# the grid-search loop consumes this list positionally in that same order.
names_list = [
    _rerf_name(depth, n, est)
    for n in n_est
    for depth in max_depth
    for est in lr
]

In [24]:
# Accumulator for the grid-search results, keyed by model name.
metrics = dict()

In [239]:
# RERF grid search: a logistic regression gives a base probability, a random forest
# regressor is fitted on the residual (label - probability), and the two predictions
# are summed into a score that is evaluated by ROC AUC on train and validation.

# Start from a clean accumulator so re-running this cell never writes into results
# left over from a previous run (or into the DataFrame built from them further down).
metrics = {}

# names_list is consumed positionally, so it must contain one name per grid point.
expected_runs = len(n_est) * len(max_depth) * len(lr)
assert len(names_list) == expected_runs, (
    f'names_list has {len(names_list)} entries but the grid has {expected_runs} points'
)

# Stage 1 -- the logistic regression does not depend on the forest hyperparameters,
# so fit each one once and reuse its predictions for every (n, depth) pair instead
# of refitting it inside the inner loop.
lr_stage = []
for reg in lr:
    lr_start = time.time()
    reg.fit(X_train_scaled, y_train)
    lr_stage.append({
        'train': reg.predict_proba(X_train_scaled)[:, 1],
        'val': reg.predict_proba(X_validation_scaled)[:, 1],
        'seconds': time.time() - lr_start,
    })

# Stage 2 -- same iteration order as names_list: n_est -> max_depth -> lr.
grid = (
    (n, depth, stage)
    for n in n_est
    for depth in max_depth
    for stage in lr_stage
)

for name, (n, depth, stage) in zip(names_list, grid):
    start_time = time.time()

    # The forest learns what the logistic regression left unexplained.
    res_train = y_train - stage['train']

    # Fixed seed: without it every run ranks the near-tied configurations differently.
    rf = RandomForestRegressor(n_estimators=n, max_depth=depth, random_state=42)
    rf.fit(X_train_scaled, res_train)

    # The summed score is not a probability (it can leave [0, 1]); that is fine
    # for ROC AUC, which only depends on the ranking of the scores.
    train_pred = stage['train'] + rf.predict(X_train_scaled)
    val_pred = stage['val'] + rf.predict(X_validation_scaled)

    metrics[name] = {
        'Train_AUC': roc_auc_score(y_train, train_pred),
        'Validation_AUC': roc_auc_score(y_validation, val_pred),
        # Include the one-off logistic regression time so Run_Time still reflects
        # the cost of training this configuration end to end.
        'Run_Time': stage['seconds'] + (time.time() - start_time)
    }

In [236]:
# Turn the {name: {metric: value}} dict into a table with one row per model.
metrics = pd.DataFrame.from_dict(
    metrics,
    orient='index',
    columns=['Run_Time', 'Train_AUC', 'Validation_AUC'],
)
# Relative change from train AUC to validation AUC, in percent (negative = validation is worse).
metrics['delta%'] = (
    (metrics['Validation_AUC'] - metrics['Train_AUC'])
    .mul(100)
    .div(metrics['Train_AUC'])
)

In [237]:
# Persist the grid-search table (model name is written as the index column).
metrics.to_csv(path_or_buf='../../../data/metrics/software_defect/rerf.csv')

In [238]:
# Display the metrics table; the rendered output elides the middle rows.
metrics

Unnamed: 0,Run_Time,Train_AUC,Validation_AUC,delta%
RERF_1_5,0.496654,0.779931,0.780364,0.055503
RERF_1_5_l2,0.498404,0.779931,0.780355,0.054300
RERF_1_5_l1,1.506976,0.779954,0.780430,0.060941
RERF_1_5_en_0.25,9.246708,0.779947,0.780390,0.056853
RERF_1_5_en_0.5,9.214333,0.779937,0.780495,0.071570
...,...,...,...,...
RERF_5_200_l2,44.967897,0.792263,0.790455,-0.228167
RERF_10_200,92.600796,0.822082,0.792084,-3.649061
RERF_10_200_l2,92.640307,0.822395,0.792234,-3.667456
RERF_15_200,140.326600,0.891342,0.790259,-11.340528


In [241]:
# Rank the configurations from best to worst validation AUC.
metrics.sort_values('Validation_AUC', ascending=False)

Unnamed: 0,Run_Time,Train_AUC,Validation_AUC,delta%
RERF_10_100,43.191579,0.821796,0.792358,-3.582195
RERF_10_150_l2,77.517040,0.822255,0.792336,-3.638654
RERF_10_40_en_0.25,33.055901,0.821273,0.792284,-3.529758
RERF_10_200_l2,92.640307,0.822395,0.792234,-3.667456
RERF_10_30,15.860525,0.820569,0.792161,-3.461960
...,...,...,...,...
RERF_40_5_en_0.75,13.933838,0.991430,0.713894,-27.993556
RERF_40_5,5.917124,0.991409,0.710926,-28.291358
RERF_40_5_l2,5.822602,0.991893,0.710332,-28.386252
RERF_40_5_en_0.25,14.352953,0.986857,0.709712,-28.083589


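Ranked by validation AUC, the best configuration is **RERF_10_100**: an unpenalized logistic regression followed by a random forest with `max_depth=10` and `n_estimators=100`. The top few configurations differ only in the fifth decimal place, so the choice among them is close to a tie.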

In [242]:
# Load the held-out test partition.
test = pd.read_parquet(path='../../../data/model_input/test_sets/software_defect.parquet')

In [243]:
# Separate the `defects` target from the test features.
X_test = test.drop(columns='defects')
y_test = test['defects']

In [244]:
# Stack train and validation rows into one frame for the final refit.
retrain = pd.concat(objs=[train, validation], axis=0)

In [245]:
# Separate the `defects` target from the combined train + validation features.
X_retrain = retrain.drop(columns='defects')
y_retrain = retrain['defects']

In [246]:
# Refit the scaler on the combined train + validation features (test data is not used here).
scaler_retrain = StandardScaler().fit(X=X_retrain)

In [247]:
# Apply the refitted scaler to the retraining features and to the test features.
X_retrain_scaled, X_test_scaled = (
    scaler_retrain.transform(features) for features in (X_retrain, X_test)
)

In [248]:
# Final model: the selected RERF configuration (unpenalized logistic regression +
# random forest with 100 trees of depth 10), refit on train + validation.

# Stage 1: the logistic regression provides the base defect probability.
lr = LogisticRegression(penalty=None).fit(X_retrain_scaled, y_retrain)
reg_pred_retrain = lr.predict_proba(X_retrain_scaled)[:, 1]

# Stage 2: the forest regresses the residual left by the logistic regression.
res_retrain = y_retrain - reg_pred_retrain
# Fixed seed so the saved forest and the reported test AUC can be reproduced.
rf = RandomForestRegressor(
    n_estimators=100, max_depth=10, random_state=42
).fit(X_retrain_scaled, res_retrain)

# Test score = base probability + predicted residual. Only features are used here;
# the test labels are needed solely for the AUC computed afterwards.
reg_pred_test = lr.predict_proba(X_test_scaled)[:, 1]
rf_pred_test = rf.predict(X_test_scaled)

test_pred = reg_pred_test + rf_pred_test

In [249]:
# ROC AUC of the combined score on the held-out test set, reported to four decimals.
auc = roc_auc_score(y_true=y_test, y_score=test_pred)
print('Test AUC:', round(auc, ndigits=4))

Test AUC: 0.7913


In [250]:
# Persist both stages; a prediction needs the logistic regression and the forest together.
dump(value=lr, filename='best_rerf_lr.joblib')
dump(value=rf, filename='best_rerf_rf.joblib')

['best_rerf_rf.joblib']

In [251]:
# Append this model's test AUC as a new line in the shared results file.
with open('../../../data/metrics/software_defect/final_AUC.txt', mode='a') as results_file:
    results_file.write(''.join(('\nRERF, ', str(auc))))