# Linear Random Forest

In [1]:
from sklearn.linear_model import LogisticRegression
from lineartree import LinearTreeClassifier
from sklearn.metrics import roc_auc_score
import pandas as pd
from joblib import dump
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler
import time

In [2]:
import sys
from pathlib import Path

# Make the project root importable so that the `notebooks.utils` package resolves.
# The root is located relative to the working directory, with the same
# '../../..' prefix the data paths of this notebook use, rather than through a
# machine-specific absolute path.
# NOTE(review): assumes the notebook is executed from its own folder, three
# levels below the project root -- confirm against the repository layout.
PROJECT_ROOT = Path('../../..').resolve()
if str(PROJECT_ROOT) not in sys.path:  # keep re-runs of this cell idempotent
    sys.path.append(str(PROJECT_ROOT))

In [3]:
from notebooks.utils.classification_metrics import classification
from notebooks.utils.lrf import LinearRandomForestClassifier

In [4]:
# Read the pre-built train and validation splits of the car-insurance data set.
train_path = '../../../data/model_input/train_sets/car_insurance.parquet'
validation_path = '../../../data/model_input/validation_sets/car_insurance.parquet'

train = pd.read_parquet(train_path)
validation = pd.read_parquet(validation_path)

In [5]:
# Separate the predictors from the OUTCOME target for the training split.
X_train = train.drop(columns=['OUTCOME'])
y_train = train['OUTCOME']

In [6]:
# Separate the predictors from the OUTCOME target for the validation split.
X_validation = validation.drop(columns=['OUTCOME'])
y_validation = validation['OUTCOME']

In [7]:
# Fit the scaler on the training predictors only; the validation split is
# later transformed with these same fitted statistics.
scaler = StandardScaler()
scaler.fit(X_train)

In [8]:
# Apply the train-fitted scaler to both splits.
X_train_scaled, X_validation_scaled = map(scaler.transform, (X_train, X_validation))

In [16]:
# Forest sizes explored in the grid (passed as `n_est` to every candidate model).
n_estimators = [
    2, 5, 7, 10, 15,       # small forests
    20, 40, 60, 80, 100,   # larger forests, in steps of 20
]

In [17]:
# Tree depths explored in the grid.
max_depth = [
    1, 2, 3, 4, 5,
    7, 10, 15,
]

# Models fitted inside each linear tree: one logistic regression without
# penalty and one with the library's default penalty (reported as `l2` in the
# model names of the results table).
base_mod = [
    LogisticRegression(penalty=None),
    LogisticRegression(),
]

In [18]:
# One linear-tree template per (depth, leaf model) pair, depth varying slowest.
base_est = [
    LinearTreeClassifier(base_estimator=leaf_model, max_depth=depth)
    for depth in max_depth
    for leaf_model in base_mod
]

In [19]:
# Build the full grid of candidate forests together with a readable name for
# each one: `LRF_<n_est>_<max_depth>`, followed by `_<penalty>` when the leaf
# logistic regression is penalised.
# The same `estim` template object is handed to every forest size.
# NOTE(review): presumably LinearRandomForestClassifier copies it before
# fitting -- confirm in notebooks/utils/lrf.py.
# NOTE(review): no seed is passed here; if the forest is stochastic the ranking
# of close candidates may change between runs -- confirm.
models_list = []
names_list = []
n_features = X_train.shape[1]
for n in n_estimators:
    for estim in base_est:
        models_list.append(LinearRandomForestClassifier(n_features=n_features, est=estim, n_est=n))
        penalty = estim.base_estimator.penalty
        name = f'LRF_{n}_{estim.max_depth}'
        # Identity check: `None` is a singleton and must not be compared with `==`.
        if penalty is not None:
            name += f'_{penalty}'
        names_list.append(name)

In [20]:
# Evaluate every candidate with the project's `classification` helper and show
# the resulting table (run time, train/validation AUC and delta% per model).
# NOTE(review): the CSV path is presumably where the helper stores the table --
# confirm in notebooks/utils/classification_metrics.py.
metrics_csv = '../../../data/model_output/metrics/car_insurance/lrf.csv'
metrics = classification(
    models_list,
    names_list,
    metrics_csv,
    X_train_scaled,
    y_train,
    X_validation_scaled,
    y_validation,
    metrics={},
)
metrics

Unnamed: 0,Run_Time,Train_AUC,Validation_AUC,delta%
LRF_2_1,0.141620,0.808532,0.815945,0.916775
LRF_2_1_l2,0.248335,0.852443,0.840746,-1.372228
LRF_2_2,0.845739,0.831918,0.826797,-0.615512
LRF_2_2_l2,0.447451,0.837718,0.815045,-2.706501
LRF_2_3,0.340658,0.842968,0.832223,-1.274580
...,...,...,...,...
LRF_100_7_l2,29.730645,0.880918,0.876254,-0.529385
LRF_100_10,25.462929,0.882427,0.877737,-0.531516
LRF_100_10_l2,29.163578,0.877122,0.871989,-0.585178
LRF_100_15,32.268150,0.881342,0.877502,-0.435714


In [21]:
# Rank the candidates by validation AUC, best first.
metrics.sort_values('Validation_AUC', ascending=False)

Unnamed: 0,Run_Time,Train_AUC,Validation_AUC,delta%
LRF_20_10_l2,6.295800,0.889585,0.887939,-0.185038
LRF_60_7,16.903778,0.887183,0.886565,-0.069705
LRF_60_10_l2,16.962084,0.889584,0.886319,-0.366962
LRF_40_2_l2,10.052341,0.887636,0.886044,-0.179347
LRF_40_15,11.151815,0.888797,0.885468,-0.374542
...,...,...,...,...
LRF_2_15,0.431627,0.827671,0.815375,-1.485685
LRF_2_2_l2,0.447451,0.837718,0.815045,-2.706501
LRF_2_3_l2,1.287235,0.812230,0.809873,-0.290137
LRF_2_4_l2,0.466720,0.823363,0.809834,-1.643211


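Let's retrain the model with the highest validation AUC, **LRF_20_10_l2** (20 estimators, maximum depth 10, L2-penalised logistic regression), on the combined train and validation sets and evaluate it on the test set.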

In [22]:
# Read the held-out test split, used only for the final evaluation below.
test_path = '../../../data/model_input/test_sets/car_insurance.parquet'
test = pd.read_parquet(test_path)

In [23]:
# Separate the predictors from the OUTCOME target for the test split.
X_test = test.drop(columns=['OUTCOME'])
y_test = test['OUTCOME']

In [24]:
# Stack train and validation row-wise to form the final training set.
retrain = pd.concat([train, validation], axis=0)

In [25]:
# Separate the predictors from the OUTCOME target for the combined set.
X_retrain = retrain.drop(columns=['OUTCOME'])
y_retrain = retrain['OUTCOME']

In [26]:
# Refit the scaler on the combined train + validation predictors. The target is
# not passed: StandardScaler ignores it, and this matches how the first scaler
# in this notebook is fitted.
scaler_retrain = StandardScaler().fit(X_retrain)

In [27]:
# Apply the refitted scaler to the final training set and to the test set.
X_retrain_scaled, X_test_scaled = (
    scaler_retrain.transform(split) for split in (X_retrain, X_test)
)

In [28]:
# Fit the selected configuration (20 estimators, depth-10 linear trees with the
# default logistic regression) on the combined set and time the whole step.
# perf_counter() is a monotonic, high-resolution clock, so the measured
# duration cannot be distorted by system clock adjustments as with time.time().
time1 = time.perf_counter()
model = LinearRandomForestClassifier(
    n_features=X_retrain.shape[1],
    est=LinearTreeClassifier(base_estimator=LogisticRegression(), max_depth=10),
    n_est=20,
).fit(X_retrain_scaled, y_retrain)
time2 = time.perf_counter() - time1  # elapsed seconds, written to the final metrics file

In [29]:
# Score the test set with the second column of predict_proba and report the AUC.
test_proba = model.predict_proba(X_test_scaled)
test_pred = test_proba[:, 1]
auc = roc_auc_score(y_test, test_pred)
print('Test AUC:', round(auc, 4))

Test AUC: 0.8807


In [30]:
# Same AUC computation on the data the final model was fitted on.
train_proba = model.predict_proba(X_retrain_scaled)
train_pred = train_proba[:, 1]
auc_train = roc_auc_score(y_retrain, train_pred)

In [31]:
# Append one line for this model (fit time; train AUC; test AUC) to the shared
# final-metrics file. The file is opened in append mode, so every run of this
# cell adds another line.
final_metrics_path = '../../../data/model_output/metrics/car_insurance/final_metrics.txt'
record = f'\nLRF; {time2}; {auc_train}; {auc}'
with open(final_metrics_path, 'a') as f:
    f.write(record)

In [32]:
# Persist the fitted model with joblib; the cell displays dump's return value.
best_model_path = '../../../data/model_output/models/car_insurance/best_lrf.joblib'
dump(model, best_model_path)

['../../../data/model_output/models/car_insurance/best_lrf.joblib']