# RANDOM FOREST CLASSIFIER ---------------------------------------------------------------

### Get Train, Valid, Test data

In [None]:
import pandas as pd
from my_libs import lib_tools as pt

# Run profile: passed to the data loader and embedded in the pickle / model file names
# used further down. Alternative value kept for convenience:
# run_type = 'dev'
run_type = 'prd'
# True: regenerate the resampled training set and cache it; False: reload the cached pickles.
gen_sample = True
# True: run the Optuna hyperparameter search and the cells guarded by this flag.
find_best_params = True

# Load the train / validation / test splits (features and targets) for the selected run profile.
X_train, y_train, X_valid, y_valid, X_test, y_test = pt.get_train_valid_test_data(run_type)
print("Train, valid and Test data loaded")

### Resample data with SMOTEN()

In [None]:
from pathlib import Path

# Cache files for the resampled training set, keyed by run type.
pickle_dir = Path('./pickles')
X_train_rs_path = pickle_dir / f'X_train_smote_{run_type}.pkl'
y_train_rs_path = pickle_dir / f'y_train_smote_{run_type}.pkl'

if gen_sample:
    # Create the cache directory *before* resampling: to_pickle() does not create
    # missing parent directories, and failing after the resampling step would
    # throw away its result.
    pickle_dir.mkdir(parents=True, exist_ok=True)

    # Resample the training split only (validation / test splits are left untouched).
    X_train_rs, y_train_rs = pt.get_data_resampled(X=X_train, y=y_train, verbose=1)

    # Save the generated data so later runs can skip the resampling step.
    X_train_rs.to_pickle(X_train_rs_path)
    y_train_rs.to_pickle(y_train_rs_path)
else:
    # Load data previously generated by a run with gen_sample = True.
    # NOTE: pickles must only be loaded from trusted sources (here: files written by this notebook).
    X_train_rs = pd.read_pickle(X_train_rs_path)
    y_train_rs = pd.read_pickle(y_train_rs_path)

### Encode categorical data (target and one hot encoding)

In [None]:
from my_libs.encoder_custom import  EncoderCustom

# 'dep' is target-encoded; every other feature column is one-hot encoded.
cols_target_encoded = ['dep']
cols_onehot_encoded = X_train.columns.drop(cols_target_encoded)

encoder = EncoderCustom(
    cols_target_encoded=cols_target_encoded,
    cols_onehot_encoded=cols_onehot_encoded,
)

# The resampled training set goes through the encoder first with datatype='Train';
# the validation and test splits follow with datatype='Test'.
# NOTE(review): presumably 'Train' fits the encodings and 'Test' re-applies them — confirm in EncoderCustom.
X_train_rs_enc, y_train_rs_enc = encoder.transform(
    X=X_train_rs,
    y=y_train_rs,
    datatype='Train',
)
X_valid_enc, y_valid_enc = encoder.transform(
    X=X_valid,
    y=y_valid,
    datatype='Test',
)
X_test_enc, y_test_enc = encoder.transform(
    X=X_test,
    y=y_test,
    datatype='Test',
)

### Find best hyperparameters for model with Optuna

In [None]:
import time
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

if find_best_params:

    n_trials = 10
    start_time = time.time()

    def objective(trial):
        """Return the mean 5-fold cross-validated F1 score of one sampled forest configuration."""
        # Hyperparameters are sampled in a fixed order:
        # n_estimators, criterion, max_depth, min_samples_split.
        rf_params = {
            'n_estimators': trial.suggest_int('n_estimators', 50, 150),
            'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy']),
            'max_depth': trial.suggest_int('max_depth', 2, 20, log=True),
            'min_samples_split': trial.suggest_int('min_samples_split', 2, 6),
        }
        candidate = RandomForestClassifier(**rf_params)

        fold_scores = cross_val_score(
            candidate,
            X_train_rs_enc,
            y_train_rs_enc,
            cv=5,
            scoring="f1",
            verbose=1,
        )
        return fold_scores.mean()

    # Maximise the objective (mean F1) over n_trials sampled configurations.
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=n_trials)

    print(f"\n--- Random Forest Classifier - Optimization with Optuna performed in %s seconds ---" % (time.time() - start_time))
    print(f"Best params : {study.best_params}")

In [None]:
# Only meaningful when the search ran: `study` is created by the Optuna cell,
# which is guarded by the same flag.
if find_best_params:
    from optuna.visualization import plot_optimization_history
    # Build the optimisation-history figure for the study and display it.
    fig = plot_optimization_history(study)
    fig.show()

### Confusion Matrix

In [None]:
from my_libs.model_evaluator import ModelEvaluator

if find_best_params:

    # Best hyperparameters found by the Optuna study.
    params = study.best_params

    # Evaluate a random forest built from those parameters: resampled + encoded
    # training data in, encoded test split as the hold-out set.
    evaluator = ModelEvaluator(
        model_type='RandomForestClassifier',
        params=params,
        X_train=X_train_rs_enc,
        y_train=y_train_rs_enc,
        X_test=X_test_enc,
        y_test=y_test_enc,
    )
    # NOTE(review): evaluate() presumably returns the fitted model — confirm in ModelEvaluator.
    model = evaluator.evaluate()

### Fit best model - Plot training and cross-validation learning curves

In [None]:
import numpy as np
from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt
%matplotlib inline

# Train the best model.
# NOTE: `study` only exists when the Optuna search cell has run (find_best_params = True).
params = study.best_params
model = RandomForestClassifier(**params)
# Fit on the resampled + encoded training set: this is the data the hyperparameters
# were tuned on and it shares its feature space with X_valid_enc / X_test_enc.
# (The raw X_train / y_train are neither encoded nor resampled.)
model.fit(X_train_rs_enc, y_train_rs_enc)

# Compute the learning curve (5-fold CV, ROC AUC). learning_curve() works on clones
# of the estimator, so the fitted `model` above is left untouched.
train_sizes, train_scores, valid_scores = learning_curve(
    model,
    X_train_rs_enc,
    y_train_rs_enc,
    cv=5,
    scoring='roc_auc',
    n_jobs=-1,
)

# Mean and standard deviation across the CV folds, one value per training-set size
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std  = np.std(train_scores, axis=1)
valid_scores_mean = np.mean(valid_scores, axis=1)
valid_scores_std  = np.std(valid_scores, axis=1)

# Plot the learning curve: mean score as a line, +/- one standard deviation as a band
plt.figure(figsize=(8, 6))
plt.title('Random Forest Classifier Learning Curve')
plt.xlabel('Training Set Size')
# The curve is scored with scoring='roc_auc', so label the axis accordingly.
plt.ylabel('ROC AUC Score')
plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.1, color='r')
plt.fill_between(train_sizes, valid_scores_mean - valid_scores_std, valid_scores_mean + valid_scores_std, alpha=0.1, color='g')
plt.plot(train_sizes, train_scores_mean, 'o-', color='r', label='Training Score')
plt.plot(train_sizes, valid_scores_mean, 'o-', color='g', label='Validation Score')
plt.legend(loc='best')
plt.show()

### Save model to h5

In [None]:
from joblib import dump
import os

# Persist the fitted model with joblib.
# NOTE: despite the ".h5" extension (kept so existing consumers still find the file),
# this is a joblib pickle, not an HDF5 file; reload it with joblib.load().
# joblib.dump() does not create missing directories, so make sure the target exists.
os.makedirs('h5_models', exist_ok=True)
dump(model, f'h5_models/model_rf_{run_type}.h5')