In [None]:
import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt
import joblib

from sklearn.model_selection import train_test_split, GridSearchCV


# Seed handed to `random_state` arguments (e.g. train_test_split) so that
# reruns of the notebook produce the same results.
RANDOM_STATE_SEED = 732

In [None]:
# Load the preprocessed dataset; the path is relative to the notebook's
# working directory.
df_dataset = pd.read_csv("processed_dataset_in_3.csv")
# Bare last expression: the frame is rendered as the cell output.
df_dataset


In [None]:
# Open question: is it really necessary to filter the data again if the
# preprocessing step supposedly should not leave any infinite values?

has_nan = np.any(np.isnan(df_dataset))
print(has_nan)
has_inf = np.any(np.isinf(df_dataset))
print(has_inf)

# Count infinite entries with isin(): trying a "where infinite" approach
# usually goes badly here (e.g. memory overflow).
infinite_values = [np.inf, -np.inf]
df_dataset.isin(infinite_values).values.sum()

In [None]:
# Map +/-inf to NaN first, so that one dropna() removes every row holding a
# non-finite value. Both steps modify df_dataset in place.
non_finite = [np.inf, -np.inf]
df_dataset.replace(to_replace=non_finite, value=np.nan, inplace=True)
df_dataset.dropna(inplace=True)


In [None]:
# Same check again, now on the cleaned frame. Open question: is re-filtering
# really needed if preprocessing supposedly leaves no infinite values?

has_nan = np.any(np.isnan(df_dataset))
print(has_nan)
has_inf = np.any(np.isinf(df_dataset))
print(has_inf)

# Count infinite entries with isin(): trying a "where infinite" approach
# usually goes badly here (e.g. memory overflow).
infinite_values = [np.inf, -np.inf]
df_dataset.isin(infinite_values).values.sum()

In [None]:
# Descriptive statistics of the cleaned frame, rendered as the cell output.
df_dataset.describe()

In [None]:
# Print the frame summary (columns, dtypes, non-null counts).
df_dataset.info()

In [None]:
# Separate the target column from the features WITHOUT mutating df_dataset.
# The previous `df_dataset.pop('Label')` removed the column in place, so the
# cell raised KeyError when run a second time and silently changed the frame
# that earlier cells had displayed.
y = np.array(df_dataset['Label'])
X = np.array(df_dataset.drop(columns='Label'))

In [None]:
# Print the shape of the feature matrix, then of the label vector.
for array in (X, y):
    print(array.shape)

In [None]:
# Wrap the feature array in a DataFrame only to get a tabular cell output.
pd.DataFrame(X)

In [None]:
# Wrap the label array in a DataFrame only to get a tabular cell output.
pd.DataFrame(y)

In [None]:
from sklearn.preprocessing import MinMaxScaler
X_scaler = MinMaxScaler().fit(X)
pd.DataFrame(X_scaler.transform(X))
X = np.array(X_scaler.transform(X))
X

In [None]:
# X, y = train_test_split(df_dataset, test_size=0.3, random_state=RANDOM_STATE_SEED)
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=RANDOM_STATE_SEED)

In [None]:
# Print the shapes of the train split (features, labels) and then of the
# test split (features, labels).
for split in (X_train, y_train, X_test, y_test):
    print(split.shape)

In [None]:
from sklearn.utils import class_weight  # For balanced class weighted classification training

# Calculating class weights for balanced class weighted classifier training.
# The labels are taken from the training data itself so the weights line up
# with whatever classes are actually present.
class_labels = np.unique(y_train)
class_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=class_labels,
    y=y_train
)

print(class_weights)

# scikit-learn expects a {label: weight} dict. Build it from the observed
# labels instead of hard-coding 0 and 1, so it stays correct for other label
# values or a different number of classes. `.tolist()` yields plain Python
# scalars for keys and values.
# NOTE(review): no later cell visible here passes `class_weights` to an
# estimator -- confirm whether weighted training was intended.
class_weights = dict(zip(class_labels.tolist(), class_weights.tolist()))

print(class_weights)

In [None]:

# predictions
# joblib.dump(model, r".\trained_models\remote-random-forest-classifier.pkl")

In [None]:
# model = joblib.load(f".\trained_models\remote-random-forest-classifier")
# model = joblib.load(r".\trained_models\remote-random-forest-classifier.pkl")
# model

In [None]:
!pip install catboost

In [None]:
 # Step 7: Comparing Decision Tree, Random Forest, XGBoost, CatBoost, and LightGBM
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Initialize classifiers.
# Every estimator receives the shared seed: all of these models use randomness
# during training, and without a fixed seed the reported metrics change from
# run to run even though RANDOM_STATE_SEED is declared for reproducibility.
classifiers = {
    'Decision Tree': DecisionTreeClassifier(random_state=RANDOM_STATE_SEED),
    'Random Forest': RandomForestClassifier(random_state=RANDOM_STATE_SEED),
    'Bagging': BaggingClassifier(random_state=RANDOM_STATE_SEED),
    'XGBoost': XGBClassifier(random_state=RANDOM_STATE_SEED),
    'CatBoost': CatBoostClassifier(random_seed=RANDOM_STATE_SEED),
    'LightGBM': LGBMClassifier(random_state=RANDOM_STATE_SEED),
}

In [None]:
### TEMP (debug leftover): show the (name, estimator) pairs held in `classifiers`.
classifiers.items()
# for name, clf in classifiers.items()

In [None]:
# Fit every classifier on the training split and collect its test-split
# metrics in `results`, keyed by classifier name.
results = {}
for name, clf in classifiers.items():
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    # Column 1 of predict_proba is used as the score for the ROC curve.
    positive_scores = clf.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, positive_scores)

    results[name] = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Confusion Matrix': confusion_matrix(y_test, y_pred),
        'Classification Report': classification_report(y_test, y_pred),
        'ROC Curve': (fpr, tpr, auc(fpr, tpr)),
    }

In [None]:
# Bar plot for accuracy comparison.
# Names and values are both read from `results`, so they always pair up even
# if `classifiers` and `results` ever differ (e.g. training stopped early).
classifiers_names = list(results.keys())
accuracy_values = [results[name]['Accuracy'] for name in classifiers_names]

# One colour per bar. The original list had five colours for six classifiers,
# so the last bar reused the first colour; 'brown' is added and the list is
# cycled explicitly in case more classifiers are added later.
base_colors = ['blue', 'green', 'red', 'purple', 'orange', 'brown']
bar_colors = [base_colors[i % len(base_colors)] for i in range(len(classifiers_names))]

plt.figure(figsize=(7, 3))
plt.bar(classifiers_names, accuracy_values, color=bar_colors)
plt.xlabel('Classifiers')
plt.ylabel('Accuracy')
plt.title('Classifier Accuracy Comparison')
plt.ylim([0, 1])
plt.show()


In [None]:
# Per classifier: print the classification report, then draw the confusion
# matrix as a heatmap.
for name, result in results.items():
    print(f'\n{name}:\n')
    # print(f'Confusion Matrix:\n{result["Confusion Matrix"]}\n')
    print(f'Classification Report:\n{result["Classification Report"]}\n')

    # Confusion-matrix heatmap using the Greens colormap, without a colour bar.
    fig, ax = plt.subplots(figsize=(4, 2))
    sns.heatmap(
        result["Confusion Matrix"],
        annot=True,
        fmt='g',
        cmap=plt.cm.Greens,
        cbar=False,
        ax=ax,
    )
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    ax.set_title(f'Confusion Matrix - {name}')
    plt.show()

In [None]:
 # Plot ROC curves
# One ROC curve per classifier on shared axes, plus the diagonal that
# represents a random classifier.
fig, ax = plt.subplots(figsize=(20, 6))
for name, result in results.items():
    fpr, tpr, roc_auc = result['ROC Curve']
    ax.plot(fpr, tpr, label=f'{name} (AUC = {roc_auc:.9f})')

ax.plot([0, 1], [0, 1], 'k--', label='Random')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve Comparison')
ax.legend()
plt.show()