In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score
import os

In [None]:
# Project root: the parent of the directory the notebook is executed from.
BASE_DIR = os.path.abspath(os.pardir)
# Folder that holds the pickled train/validation folds.
TRAIN_SET = os.path.join(BASE_DIR, *("data", "train_set"))


In [None]:
from src.genral import restore_dataframes_from_pickle

# Restore the four pickled frames from TRAIN_SET; the unpacking order
# follows the order of `file_names`.
train_set1, validation_set1, train_set2, validation_set2 = restore_dataframes_from_pickle(
    file_names=[
        "fold_1_train.pkl",
        "fold_1_validation_or_test.pkl",
        "fold_2_train.pkl",
        "fold_2_validation_or_test.pkl",
    ],
    folder_path=TRAIN_SET,
)

# Show the first rows of each restored frame.
display(
    train_set1.head(),
    validation_set1.head(),
    train_set2.head(),
    validation_set2.head(),
)

In [None]:
# Fold 1: the target is the "is_active" column; the features are every
# column except "userId" and "is_active".
y_train_fold1 = train_set1["is_active"]
X_train_fold1 = train_set1.drop(["userId", "is_active"], axis=1)
y_val_fold1 = validation_set1["is_active"]
X_val_fold1 = validation_set1.drop(["userId", "is_active"], axis=1)

# Fold 2: same split as fold 1.
y_train_fold2 = train_set2["is_active"]
X_train_fold2 = train_set2.drop(["userId", "is_active"], axis=1)
y_val_fold2 = validation_set2["is_active"]
X_val_fold2 = validation_set2.drop(["userId", "is_active"], axis=1)

In [None]:
# Show the first rows of the four source frames (train/validation for folds 1 and 2).
display(
    train_set1.head(),
    validation_set1.head(),
    train_set2.head(),
    validation_set2.head(),
)

In [None]:
# Report the (rows, columns) shape of each feature matrix, one per line.
print(
    X_train_fold1.shape,
    X_val_fold1.shape,
    X_train_fold2.shape,
    X_val_fold2.shape,
    sep="\n",
)

# Replace the column labels with integer positions 0..n-1. Every frame is
# given the width of X_train_fold1, so pandas rejects the assignment if a
# frame has a different number of columns.
X_train_fold1.columns = range(len(X_train_fold1.columns))
X_val_fold1.columns = range(len(X_train_fold1.columns))
X_train_fold2.columns = range(len(X_train_fold1.columns))
X_val_fold2.columns = range(len(X_train_fold1.columns))

In [None]:
# Preview the first rows of the fold 1 training features.
X_train_fold1.head()

# List of models to train

In [None]:
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
# from xgboost import XGBClassifier
# from lightgbm import LGBMClassifier
# Candidate classifiers for the search. Each entry maps a display name to
# the estimator instance ("model") and the hyperparameter grid ("params")
# that is handed to train_and_evaluate.
models = {
    "Logistic Regression": dict(
        model=LogisticRegression(class_weight="balanced", random_state=42),
        params=dict(
            penalty=["l2"],
            C=[0.01, 0.1, 1, 10],
            solver=["liblinear", "lbfgs"],
        ),
    ),
    "Random Forest": dict(
        model=RandomForestClassifier(class_weight="balanced", random_state=42),
        params=dict(
            n_estimators=[50, 100, 200],
            max_depth=[5, 10, None],
            min_samples_split=[2, 5, 10],
        ),
    ),
    "XGBoost": dict(
        model=XGBClassifier(random_state=42, use_label_encoder=False, eval_metric="logloss"),
        params=dict(
            n_estimators=[50, 100],
            max_depth=[3, 5, 10],
            learning_rate=[0.01, 0.1, 0.2],
            scale_pos_weight=[1, 2, 5],
        ),
    ),
    "LightGBM": dict(
        model=LGBMClassifier(class_weight="balanced", random_state=42),
        params=dict(
            n_estimators=[50, 100],
            max_depth=[5, 10, -1],
            learning_rate=[0.01, 0.1, 0.2],
        ),
    ),
}




In [None]:
# Preview the fold 1 inputs before training. A notebook cell only renders
# its last bare expression, so the first two previews are shown explicitly
# with display(); the last one stays a bare expression so it remains the
# cell's output value.
display(X_train_fold1.head())
display(y_train_fold1.head())
X_val_fold1.head()

In [None]:
from src.Models import train_and_evaluate

# Run the search over `models` on fold 1 and keep the four values returned
# by train_and_evaluate.
(
    best_model_fold1,
    best_model_name_fold1,
    best_params_fold1,
    best_score_fold1,
) = train_and_evaluate(X_train_fold1, y_train_fold1, X_val_fold1, y_val_fold1, models)

In [None]:
# Run the same search over `models` on fold 2 and keep the four values
# returned by train_and_evaluate.
(
    best_model_fold2,
    best_model_name_fold2,
    best_params_fold2,
    best_score_fold2,
) = train_and_evaluate(X_train_fold2, y_train_fold2, X_val_fold2, y_val_fold2, models)

In [None]:
# Final run: load the fold 3 split from data/final_data and fit a single,
# fixed LightGBM configuration on it.
Final_DATA = os.path.join(BASE_DIR, "data", "final_data")
[train_set1, validation_set1] = restore_dataframes_from_pickle(
    file_names=["fold_3_train.pkl", "fold_3_validation.pkl"],
    folder_path=Final_DATA
)

# NOTE: the fold 1 variable names are reused below; from this cell onward
# they hold the fold 3 data.
# Target is "is_active"; features are every column except "userId" and "is_active".
X_train_fold1 = train_set1.drop(columns=["userId", "is_active"])
y_train_fold1 = train_set1["is_active"]
X_val_fold1 = validation_set1.drop(columns=["userId", "is_active"])
y_val_fold1 = validation_set1["is_active"]

# Replace the column labels with integer positions; the validation frame is
# given the training frame's width.
X_train_fold1.columns=range(X_train_fold1.shape[1])
X_val_fold1.columns=range(X_train_fold1.shape[1])

# One-point grid: a single LightGBM configuration.
models = {
    "LightGBM": {
        "model": LGBMClassifier(class_weight="balanced", random_state=42),
        "params": {
            "n_estimators": [100],
            "max_depth": [-1],
            "learning_rate": [0.01]
        }
    }
}

# Train and evaluate on the fold 3 data prepared in this cell, not on the
# fold 2 frames left over from the earlier cells.
train_and_evaluate(
    X_train_fold1, y_train_fold1, X_val_fold1, y_val_fold1, models
)
