# Linear Random Forest

In [1]:
from sklearn.linear_model import LogisticRegression
from lineartree import LinearTreeClassifier
from sklearn.metrics import roc_auc_score
import pandas as pd
from joblib import dump
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler
import time

In [2]:
import sys
from pathlib import Path

# Make the project-local `notebooks` package importable without depending on one
# machine's directory layout: walk up from the working directory until a folder
# containing `notebooks/utils` is found. If none is found, fall back to the
# original author-specific location so existing setups keep working.
_cwd = Path.cwd()
PROJECT_ROOT = next(
    (p for p in (_cwd, *_cwd.parents) if (p / 'notebooks' / 'utils').is_dir()),
    Path('F:\\Users\\Manuel García Plaza\\Desktop\\TFG\\'),
)

# Guard against duplicate entries when the cell is re-run in the same kernel.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

In [3]:
from notebooks.utils.classification_metrics import classification
from notebooks.utils.lrf import LinearRandomForestClassifier

In [4]:
# Read the train and validation splits of the software-defect dataset (in that order).
train, validation = map(
    pd.read_parquet,
    (
        '../../../data/model_input/train_sets/software_defect.parquet',
        '../../../data/model_input/validation_sets/software_defect.parquet',
    ),
)

In [5]:
# Separate the training features from the `defects` target column.
X_train = train.drop(columns='defects')
y_train = train['defects']

In [6]:
# Separate the validation features from the `defects` target column.
X_validation = validation.drop(columns='defects')
y_validation = validation['defects']

In [7]:
# The scaler is fitted on the training features only; the same fitted object is
# later applied to the validation split.
scaler = StandardScaler().fit(
    X=X_train,
)

In [8]:
# Apply the train-fitted scaler to both splits (train first, then validation).
X_train_scaled, X_validation_scaled = map(scaler.transform, (X_train, X_validation))

In [14]:
# Ensemble sizes to try: 5, 10, ..., 40.
n_estimators = list(range(5, 45, 5))

In [11]:
# Tree depths to try: 1 through 5, plus 7.
max_depth = [*range(1, 6), 7]

# Leaf models to try: an unpenalized logistic regression and one with the
# library's default settings.
base_mod = [
    LogisticRegression(penalty=None),
    LogisticRegression(),
]

In [12]:
# One linear tree per (depth, leaf model) pair; depth varies slowest, so for each
# depth the unpenalized model comes before the default one.
base_est = [
    LinearTreeClassifier(base_estimator=leaf_model, max_depth=depth)
    for depth in max_depth
    for leaf_model in base_mod
]

In [15]:
# Build the full grid of candidate forests and a parallel list of display names.
# Names follow `LRF_<n_est>_<max_depth>` with a `_<penalty>` suffix appended only
# when the leaf logistic regression has a penalty set.
models_list = []
names_list = []
for n in n_estimators:
    for estim in base_est:
        models_list.append(
            LinearRandomForestClassifier(n_features=X_train.shape[1], est=estim, n_est=n)
        )
        penalty = estim.base_estimator.penalty
        # Identity test for the None singleton (PEP 8) instead of `== None`.
        suffix = '' if penalty is None else f'_{penalty}'
        names_list.append(f'LRF_{n}_{estim.max_depth}{suffix}')

In [43]:
# Evaluate every candidate with the project helper. The returned table (displayed
# below) has Run_Time, Train_AUC, Validation_AUC and delta% columns.
# NOTE(review): presumably the helper also writes the table to the CSV path given
# here — confirm in notebooks.utils.classification_metrics.
metrics = classification(
    models_list,
    names_list,
    '../../../data/model_output/metrics/software_defect/lrf.csv',
    X_train_scaled,
    y_train,
    X_validation_scaled,
    y_validation,
    metrics={},
)
metrics

Unnamed: 0,Run_Time,Train_AUC,Validation_AUC,delta%
LRF_5_1,54.184970,0.774885,0.774659,-0.029150
LRF_5_1_l2,34.630958,0.771246,0.772685,0.186667
LRF_5_2,50.014409,0.773966,0.774691,0.093603
LRF_5_2_l2,61.297451,0.775498,0.776636,0.146786
LRF_5_3,59.057609,0.780172,0.779813,-0.045944
...,...,...,...,...
LRF_35_5_l2,448.311662,0.779140,0.779530,0.050066
LRF_40_1,375.545070,0.776980,0.778183,0.154855
LRF_40_1_l2,350.585012,0.773790,0.775047,0.162505
LRF_40_2,480.940110,0.781472,0.782836,0.174536


In [44]:
# Rank the candidates from best to worst validation AUC.
metrics.sort_values('Validation_AUC', ascending=False)

Unnamed: 0,Run_Time,Train_AUC,Validation_AUC,delta%
LRF_5_7_l2,55.992711,0.784260,0.785755,0.190739
LRF_20_7,213.095556,0.783640,0.785262,0.206900
LRF_25_4,334.916088,0.783268,0.784014,0.095149
LRF_5_4_l2,64.244707,0.782860,0.783965,0.141137
LRF_5_5,52.449600,0.781384,0.783503,0.271196
...,...,...,...,...
LRF_5_7,67.600647,0.773402,0.773613,0.027230
LRF_25_1_l2,200.178724,0.771414,0.773412,0.258953
LRF_5_1_l2,34.630958,0.771246,0.772685,0.186667
LRF_15_1_l2,119.413055,0.768028,0.769622,0.207548


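We choose **LRF_5_7_l2** (5 estimators, maximum depth 7, L2-penalized logistic regression in the leaves), the configuration with the highest validation AUC (0.7858).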

In [45]:
# Read the held-out test split of the software-defect dataset.
test = pd.read_parquet(
    path='../../../data/model_input/test_sets/software_defect.parquet',
)

In [46]:
# Separate the test features from the `defects` target column.
X_test = test.drop(columns='defects')
y_test = test['defects']

In [47]:
# Stack train and validation rows into a single frame for the final refit.
# NOTE(review): row labels are kept as-is, so the combined index may contain
# duplicates if the two splits share labels — confirm this is harmless downstream.
retrain = pd.concat(objs=[train, validation])

In [48]:
# Separate the combined train+validation features from the `defects` target column.
X_retrain = retrain.drop(columns='defects')
y_retrain = retrain['defects']

In [49]:
# A fresh scaler fitted on the combined train+validation features; the test split
# is not used for fitting.
scaler_retrain = StandardScaler().fit(
    X=X_retrain,
)

In [50]:
# Apply the refit scaler to the combined training data and to the test split.
X_retrain_scaled, X_test_scaled = (
    scaler_retrain.transform(features) for features in (X_retrain, X_test)
)

In [51]:
# Refit the selected configuration (LRF_5_7_l2: n_est=5, max_depth=7, default
# LogisticRegression leaves) on train+validation and time the fit.
# perf_counter() is monotonic, so the elapsed time cannot be skewed by system
# clock adjustments the way a time.time() difference can.
time1 = time.perf_counter()
model = LinearRandomForestClassifier(
    n_features=X_retrain.shape[1],
    est=LinearTreeClassifier(base_estimator=LogisticRegression(), max_depth=7),
    n_est=5,
).fit(X_retrain_scaled, y_retrain)
time2 = time.perf_counter() - time1  # elapsed fit time in seconds

In [52]:
# Score the test split with column index 1 of predict_proba
# (presumably the positive / defect class — confirm against the model's class order).
test_pred = model.predict_proba(X_test_scaled)[:, 1]
auc = roc_auc_score(y_true=y_test, y_score=test_pred)
print('Test AUC:', round(auc, 4))

Test AUC: 0.786


In [53]:
# AUC on the data the final model was fitted on (train+validation), again using
# column index 1 of predict_proba.
train_pred = model.predict_proba(X_retrain_scaled)[:, 1]
auc_train = roc_auc_score(y_true=y_retrain, y_score=train_pred)

In [54]:
# Append one ';'-separated record (model tag, fit time, train AUC, test AUC) to the
# shared final-metrics file. The file is opened in append mode, so re-running this
# cell adds another record.
final_metrics_record = f'\nLRF; {time2}; {auc_train}; {auc}'
with open('../../../data/model_output/metrics/software_defect/final_metrics.txt', 'a') as metrics_file:
    metrics_file.write(final_metrics_record)

In [55]:
# Persist the fitted final model with joblib; the call's return value (shown as
# the cell output) lists the file written.
dump(
    value=model,
    filename='../../../data/model_output/models/software_defect/best_lrf.joblib',
)

['../../../data/model_output/models/software_defect/best_lrf.joblib']