# Linear Random Forest

In [33]:
from sklearn.linear_model import LogisticRegression
from lineartree import LinearTreeClassifier
from sklearn.metrics import roc_auc_score
import pandas as pd
from joblib import dump
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler
import time

In [2]:
import sys
# Extend the import search path; presumably this is the project root that holds
# the `notebooks` package imported in the next cell — confirm.
# NOTE(review): absolute, machine-specific Windows path. On any other machine or
# checkout this directory will not exist; consider deriving the root from the
# notebook's own location instead.
sys.path.append('F:\\Users\\Manuel García Plaza\\Desktop\\TFG\\')

In [3]:
from notebooks.utils.classification_metrics import classification
from notebooks.utils.lrf import LinearRandomForestClassifier

In [4]:
# Load the pre-split breast-cancer partitions used for model selection.
train = pd.read_parquet(
    '../../../data/model_input/train_sets/breast_cancer.parquet'
)
validation = pd.read_parquet(
    '../../../data/model_input/validation_sets/breast_cancer.parquet'
)

In [5]:
# Split the training frame into features and the `diagnosis` target column.
X_train = train.drop(columns=['diagnosis'])
y_train = train['diagnosis']

In [6]:
# Split the validation frame into features and the `diagnosis` target column.
X_validation = validation.drop(columns=['diagnosis'])
y_validation = validation['diagnosis']

In [41]:
# Candidate ensemble sizes; each value is later passed as `n_est` when the
# LinearRandomForestClassifier candidates are built.
n_estimators = [2, 5, 7, 10, 15, 20, 40, 50]

In [42]:
# Candidate tree depths: 1 through 10 inclusive.
max_depth = list(range(1, 11))

# Candidate leaf models: logistic regression with no penalty, with the
# estimator's default penalty, and with an L1 penalty (liblinear solver).
base_mod = [
    LogisticRegression(penalty=None),
    LogisticRegression(),
    LogisticRegression(penalty='l1', solver='liblinear'),
]

In [43]:
# One linear tree per (depth, leaf model) pair, iterating depths in the outer
# position so the ordering matches depth-major enumeration.
base_est = [
    LinearTreeClassifier(base_estimator=leaf_model, max_depth=depth)
    for depth in max_depth
    for leaf_model in base_mod
]

In [44]:
# Build every (ensemble size, linear tree) candidate together with a label of
# the form LRF_<n_est>_<max_depth>[_<penalty>]; the penalty suffix is omitted
# when the leaf model has no penalty.
# NOTE(review): the same `estim` object is handed to several forests (one per
# ensemble size). This is only safe if LinearRandomForestClassifier copies the
# estimator before fitting — confirm in notebooks/utils/lrf.
models_list = []
names_list = []
for n in n_estimators:
    for estim in base_est:
        models_list.append(
            LinearRandomForestClassifier(n_features=X_train.shape[1], est=estim, n_est=n)
        )
        penalty = estim.base_estimator.penalty
        label = f'LRF_{n}_{estim.max_depth}'
        # `None` is a singleton: test identity rather than equality.
        if penalty is not None:
            label += f'_{penalty}'
        names_list.append(label)

In [30]:
# Fit the standardiser on the training features only; the same fitted scaler is
# then applied to the validation features in the next cell.
scaler = StandardScaler().fit(X_train)

In [31]:
# Apply the already-fitted scaler to both partitions (training first).
X_train_scaled, X_validation_scaled = (
    scaler.transform(features) for features in (X_train, X_validation)
)

In [96]:
# Evaluate every candidate with the project's `classification` helper. The
# returned table (displayed below) has run time, train/validation AUC and their
# relative difference per model. Presumably the helper also writes the table to
# the CSV path passed as third argument — confirm in
# notebooks/utils/classification_metrics.
metrics = classification(
    models_list,
    names_list,
    '../../../data/model_output/metrics/breast_cancer/lrf.csv',
    X_train_scaled,
    y_train,
    X_validation_scaled,
    y_validation,
)
metrics

Unnamed: 0,Run_Time,Train_AUC,Validation_AUC,delta%
LRF_2_1,2.510258,0.987618,0.994987,0.746220
LRF_2_1_l2,2.191172,0.992244,0.991855,-0.039274
LRF_2_1_l1,0.935438,0.997433,0.983083,-1.438684
LRF_2_2,3.328074,0.997192,0.978697,-1.854725
LRF_2_2_l2,2.714741,0.990025,0.967419,-2.283383
...,...,...,...,...
LRF_50_9_l2,98.573160,0.998155,0.995614,-0.254535
LRF_50_9_l1,46.319586,0.998288,0.994361,-0.393424
LRF_50_10,118.985844,1.000000,0.996241,-0.375940
LRF_50_10_l2,96.707689,0.997887,0.996867,-0.102224


In [99]:
# Rank the candidates from best to worst validation AUC.
metrics.sort_values('Validation_AUC', ascending=False)

Unnamed: 0,Run_Time,Train_AUC,Validation_AUC,delta%
LRF_40_7,89.274482,1.000000,1.000000,0.000000
LRF_7_10_l2,11.153919,0.998663,1.000000,0.133897
LRF_5_5_l2,8.602019,0.997914,0.999373,0.146249
LRF_5_10_l1,4.168139,0.997941,0.999373,0.143565
LRF_20_10,37.350478,1.000000,0.999373,-0.062657
...,...,...,...,...
LRF_2_4_l2,3.041868,0.988233,0.959273,-2.930443
LRF_2_6,3.958415,0.993795,0.957393,-3.662925
LRF_2_8,3.936505,0.987925,0.948622,-3.978405
LRF_2_5_l1,1.558847,0.967801,0.934211,-3.470764


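We choose **LRF_5_5_l2** (5 trees, maximum depth 5, L2-penalised leaf models) as the best model: its validation AUC (0.9994) is within 0.001 of the top score, and it achieves this with few, relatively shallow trees, which also keeps training fast.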

In [100]:
# Load the test partition; it is only used for the final evaluation of the
# selected model below.
test = pd.read_parquet('../../../data/model_input/test_sets/breast_cancer.parquet')

In [101]:
# Split the test frame into features and the `diagnosis` target column.
X_test = test.drop(columns=['diagnosis'])
y_test = test['diagnosis']

In [102]:
# Stack training and validation rows so the selected configuration can be refit
# on both. Original row indices are kept (no ignore_index), so index labels may
# repeat across the two partitions.
retrain = pd.concat([train, validation])

In [103]:
# Split the combined frame into features and the `diagnosis` target column.
X_retrain = retrain.drop(columns=['diagnosis'])
y_retrain = retrain['diagnosis']

In [104]:
# Fit a fresh standardiser on the combined train + validation features; the
# test features are transformed with this scaler, never used to fit it.
scaler_retrain = StandardScaler().fit(X_retrain)

In [105]:
# Apply the refit scaler to the combined training data and to the test data.
X_retrain_scaled, X_test_scaled = (
    scaler_retrain.transform(features) for features in (X_retrain, X_test)
)

In [108]:
# Refit the selected configuration (n_est=5, max_depth=5, LogisticRegression
# with default settings as leaf model) on train + validation and time the fit.
# perf_counter() is a monotonic clock, so the measured duration cannot be
# distorted by system clock adjustments the way time.time() differences can.
time1 = time.perf_counter()
model = LinearRandomForestClassifier(
    n_features=X_retrain.shape[1],
    est=LinearTreeClassifier(base_estimator=LogisticRegression(), max_depth=5),
    n_est=5,
).fit(X_retrain_scaled, y_retrain)
time2 = time.perf_counter() - time1  # elapsed fit time in seconds

In [109]:
# Score the test set: take column 1 of predict_proba as the ranking score and
# report the resulting ROC AUC rounded to four decimals.
test_pred = model.predict_proba(X_test_scaled)[:, 1]
auc = roc_auc_score(y_true=y_test, y_score=test_pred)
print('Test AUC:', round(auc, 4))

Test AUC: 0.9896


In [115]:
# ROC AUC of the final model on the data it was fit on (train + validation),
# again using column 1 of predict_proba as the score.
train_pred = model.predict_proba(X_retrain_scaled)[:, 1]
auc_train = roc_auc_score(y_true=y_retrain, y_score=train_pred)

In [119]:
# Append one summary row (label; fit time; train AUC; test AUC) to the results
# file. The file is opened in append mode, so re-running this cell adds another
# row rather than replacing the previous one.
summary_row = f'\nLRF; {time2}; {auc_train}; {auc}'
with open('../../../data/model_output/metrics/breast_cancer/final_metrics.txt', 'a') as results_file:
    results_file.write(summary_row)

In [120]:
# Persist the fitted final model with joblib; the call's return value (the list
# of written file names) is what the cell displays.
dump(model, '../../../data/model_output/models/breast_cancer/best_lrf.joblib')

['../../../data/model_output/models/breast_cancer/best_lrf.joblib']