In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV

# Model
from sklearn.tree import DecisionTreeClassifier

# Fixed seed for reproducible results; it is passed as `random_state` to
# train_test_split further down so the train/test partition is the same on
# every run.
RANDOM_STATE_SEED = 732

In [None]:
# Load the preprocessed dataset (path is relative to the notebook's working
# directory) and show it.
dataset_csv = "processed_dataset.csv"
df_dataset = pd.read_csv(filepath_or_buffer=dataset_csv)
df_dataset


In [None]:
# Open question: is it really necessary to filter the data again, given that
# the processed dataset is not supposed to contain infinite values?

# Report whether the frame holds any NaN and any +/-inf entries.
# The second check has to use np.isinf: np.isfinite is True for every ordinary
# number, so np.any(np.isfinite(...)) is True for practically any dataset and
# tells us nothing about infinities.
print(np.any(np.isnan(df_dataset)))
print(np.any(np.isinf(df_dataset)))

# Count the infinite entries. Trying to locate them with a `where`-style lookup
# usually ends badly here (memory blow-up), hence the isin-based count.
df_dataset.isin([np.inf, -np.inf]).values.sum()

In [None]:
# Turn +/-inf into NaN, then discard every row that contains a NaN.
# Both steps modify df_dataset in place.
infinite_values = [np.inf, -np.inf]
df_dataset.replace(to_replace=infinite_values, value=np.nan, inplace=True)
df_dataset.dropna(axis=0, how='any', inplace=True)


In [None]:
# Open question: is it really necessary to filter the data again, given that
# the processed dataset is not supposed to contain infinite values?

# Same sanity checks as before the cleaning step, now on the cleaned frame.
has_nan = np.any(np.isnan(df_dataset))
has_inf = np.any(np.isinf(df_dataset))
print(has_nan)
print(has_inf)

# Count of +/-inf entries. A `where`-style lookup tends to blow up memory on
# this data, so the count goes through isin instead.
infinite_values = [np.inf, -np.inf]
df_dataset.isin(infinite_values).values.sum()

In [None]:
# Summary statistics of the cleaned frame. The 'Label' column is still part of
# df_dataset at this point; it is separated from the features further down.
df_dataset.describe()

In [None]:
# Print the frame summary (columns, dtypes, non-null counts) after cleaning.
df_dataset.info()

In [None]:
# Separate the target vector (y) from the feature matrix (X).
# The column is selected/dropped on a copy instead of being popped: `pop`
# removes 'Label' from df_dataset in place, so running this cell a second time
# would fail with KeyError and the frame shown by earlier cells would no longer
# match the one in memory.
y = np.array(df_dataset['Label'])
X = np.array(df_dataset.drop(columns=['Label']))

In [None]:
# Shapes of the feature matrix and of the target vector, one per line.
print(X.shape, y.shape, sep="\n")

In [None]:
# Wrap the feature array in a DataFrame so the notebook renders it as a table.
pd.DataFrame(X)

In [None]:
# Wrap the target array in a DataFrame so the notebook renders it as a table.
pd.DataFrame(y)

In [None]:
from sklearn.preprocessing import MinMaxScaler
X_scaler = MinMaxScaler().fit(X)
pd.DataFrame(X_scaler.transform(X))
X = np.array(X_scaler.transform(X))
X

In [None]:
# X, y = train_test_split(df_dataset, test_size=0.3, random_state=RANDOM_STATE_SEED)
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.33, random_state=RANDOM_STATE_SEED)

In [None]:
# Shapes of the four partitions, in the order: train features, train labels,
# test features, test labels.
for partition in (X_train, y_train, X_test, y_test):
    print(partition.shape)

In [None]:
from sklearn.utils import class_weight  # For balanced class weighted classification training

# Calculating class weights for balanced class weighted classifier training.
# The labels are taken from the training targets themselves so the weights line
# up with whatever classes are actually present.
class_labels = np.unique(y_train)
balanced_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=class_labels,
    y=y_train
)

print(balanced_weights)

# scikit-learn expects a {label: weight} dict. Build it by pairing each label
# with its weight instead of hard-coding the keys 0 and 1, which would raise
# IndexError with a single class and silently mislabel or truncate the weights
# for any other label set.
class_weights = dict(zip(class_labels.tolist(), balanced_weights.tolist()))

print(class_weights)

In [None]:
from sklearn.ensemble import RandomForestClassifier


In [None]:

# Base random-forest configuration, spelled out in full so the run does not
# depend on library defaults. Deliberate departures from the usual setup:
#   * max_features=10  (a fixed number of candidate features per split)
#   * bootstrap=False  (no bootstrap resampling of the training rows)
# NOTE(review): random_state is the literal 1 here, not RANDOM_STATE_SEED used
# for the train/test split - confirm that this is intended.
forest_settings = dict(
    n_estimators=100,
    criterion='gini',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features=10,
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    bootstrap=False,
    oob_score=False,
    n_jobs=None,
    random_state=1,
    verbose=0,
    warm_start=False,
    class_weight=class_weights,
    ccp_alpha=0.0,
    max_samples=None,
)
model = RandomForestClassifier(**forest_settings)

# Grid explored by the search: forest sizes 50, 75, 100, 125 and 150.
hyperparameters = {'n_estimators': list(range(50, 151, 25))}

In [None]:
# Exhaustive search over `hyperparameters` using 5-fold cross-validation.
# n_jobs=-1 uses all available CPU cores; verbose=1 prints progress.
search_settings = {
    'param_grid': hyperparameters,
    'cv': 5,
    'verbose': 1,
    'n_jobs': -1,
}
clf = GridSearchCV(model, **search_settings)

In [None]:
# Run the grid search on the training partition only; the test partition is
# not passed to fit and is used further down for the final evaluation.
clf.fit(X_train, y_train)

In [None]:
# Summarise the search: best cross-validated score, the parameter combination
# that achieved it, and the corresponding fitted estimator.
divider = "---------------"

print("Accuracy score on Validation set: \n")
print(clf.best_score_)
print(divider)
print("Best performing hyperparameters on Validation set: ")
print(clf.best_params_)
print(divider)
print(clf.best_estimator_)

In [None]:
# From here on `model` refers to the best estimator found by the grid search,
# replacing the unfitted base configuration bound to the same name earlier.
model = clf.best_estimator_
model

In [None]:
# Predict labels for the held-out test partition with the selected model.
predictions = model.predict(X_test)
predictions

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix #, plot_confusion_matrix


In [None]:
# Test-set accuracy followed by the confusion matrix of true vs. predicted
# labels.
test_accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print(test_accuracy)

cm = confusion_matrix(y_true=y_test, y_pred=predictions)
print(cm)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# Draw the test-set confusion matrix with matplotlib's "Greens" colormap.
ConfusionMatrixDisplay.from_predictions(
    y_true=y_test,
    y_pred=predictions,
    cmap="Greens",
)

In [None]:
# Per-class precision / recall / F1 on the test partition, to 5 decimals.
report_text = classification_report(y_true=y_test, y_pred=predictions, digits=5)
print(report_text)

In [None]:
!python -m pip install joblib


In [None]:
import joblib


In [None]:
from pathlib import Path

# Persist the selected model. The target directory is created first because
# joblib.dump does not create missing parent directories and would fail on a
# fresh checkout.
# NOTE(review): the model was trained on MinMax-scaled features, but the fitted
# scaler (X_scaler) is not saved here - anyone loading this file needs the same
# scaler to prepare inputs. Consider persisting it alongside the model.
model_path = "trained_models/random-forest-classifier.pkl"
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(model, model_path)

In [None]:
# Round-trip check: reload the model that was just written and display it.
# NOTE: joblib files are pickle-based, so only load files from a trusted
# source - loading can execute arbitrary code.
model = joblib.load("trained_models/random-forest-classifier.pkl")
model