In [None]:
import statistics

import pandas as pd
import numpy as np
import pydot as pydot
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import rcParams
import plotly.express as px
plt.style.use("ggplot")
rcParams['figure.figsize'] = (12,6)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn import preprocessing

In [None]:
# Read the feature tables for both splits from their pickle files.
X_train = pd.read_pickle('x_train_lvl1.pkl')
X_test = pd.read_pickle('x_test_lvl1.pkl')

# Read the label pickles and flatten each one to a 1-D array.
y_train_raw = pd.read_pickle('y_train_lvl1.pkl')
y_test_raw = pd.read_pickle('y_test_lvl1.pkl')
y_train = np.ravel(y_train_raw)
y_test = np.ravel(y_test_raw)

# Sanity check: shapes of train/test features followed by train/test labels.
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

In [None]:
# Summarise the test features (columns, dtypes, non-null counts).
# NOTE(review): this relies on X_test still being the object returned by
# pd.read_pickle; the data-preparation cell below rebinds X_test to a NumPy
# array, so this cell must run before that one.
X_test.info()

In [None]:
# --- Data preparation ---

# Cross-validation splitter reused by the model-tuning cells:
# 5 shuffled, stratified folds with a fixed seed.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=5)

# Re-encode the labels. For integer arrays ~y equals -y - 1, so ~y + 2 is
# 1 - y, i.e. 0 and 1 swap places.
# NOTE(review): assumes integer 0/1 labels (a boolean array would behave
# differently) -- confirm. Re-running this cell flips the labels back.
y_train = ~y_train + 2
y_test = ~y_test + 2

# Work with plain NumPy arrays from here on.
X_train = np.array(X_train)
X_test = np.array(X_test)

# Min-max scale every feature to [0, 1] for logistic regression; the scaler
# is fitted on the training split only and then applied to the test split.
scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
X_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Randomly oversample the scaled training data.
ros = RandomOverSampler(random_state=5)
X_resampled, y_resampled = ros.fit_resample(X_scaled, y_train)

print('Training labels shape:', y_resampled.shape)
print('Training features shape:', X_resampled.shape)

# Logistic regression

In [None]:
# Hyperparameter search for logistic regression.
#
# Every solver supports only some penalties, so the search space is a list of
# grids that pairs each penalty with a solver able to optimise it:
#   - newton-cg / lbfgs / newton-cholesky: 'l2' or no penalty
#   - saga: 'l1' and 'elasticnet' (elasticnet additionally needs l1_ratio)
# Searching the plain cross product would only produce failed fits for the
# unsupported combinations.
c_values = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100]
l2_solvers = ['newton-cg', 'lbfgs', 'newton-cholesky']
param_grid = [
    {'solver': l2_solvers, 'penalty': ['l2'], 'C': c_values},
    # C has no effect without a penalty, so it is not searched here.
    {'solver': l2_solvers, 'penalty': [None]},
    {'solver': ['saga'], 'penalty': ['l1'], 'C': c_values},
    {
        'solver': ['saga'],
        'penalty': ['elasticnet'],
        'C': c_values,
        'l1_ratio': [0.25, 0.5, 0.75],
    },
]

# Create logistic regression classifier. max_iter is raised so the solvers
# (saga in particular) have room to converge; random_state makes saga
# reproducible.
lr_model = LogisticRegression(max_iter=1000, random_state=5)

# Tune hyperparameters using GridSearchCV, ranking candidates by ROC AUC
# (the scorer name is 'roc_auc').
# NOTE(review): 'roc_auc' assumes a binary target -- confirm.
grid_search = GridSearchCV(
    estimator=lr_model,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=skf,
    n_jobs=-1,
)

# Fit on the scaled + oversampled training data built in the preparation cell;
# logistic regression is sensitive to feature scale.
# NOTE(review): the oversampling happened before the CV split, so duplicated
# minority rows can land in both the train and validation part of a fold and
# make the CV score optimistic. Consider resampling inside each fold instead.
grid_search.fit(X_resampled, y_resampled)

# Output best hyperparameters
print('Best parameters:', grid_search.best_params_)
print('Best CV ROC AUC:', grid_search.best_score_)
lr_model = grid_search.best_estimator_

In [None]:
# Show the fitted coefficients of the tuned logistic regression
# (array of shape (n_classes or 1, n_features)).
lr_model.coef_

# Random forest

In [None]:
# Define parameter grid for hyperparameter tuning.
# min_samples_split must be an int >= 2 (or a float fraction), so 1 is not a
# valid candidate and is not part of the grid.
param_grid = {
    'n_estimators': [100, 250, 500, 1000, 1250, 1500],
    'max_depth': [5, 10, 15, None],
    'min_samples_leaf': [1, 2, 5, 10],
    'min_samples_split': [2, 5, 10],
    'max_features': ['sqrt', 'log2']
}

# Create random forest classifier; the seed makes the fitted forests (and
# therefore the selected hyperparameters) reproducible across runs.
rf_model = RandomForestClassifier(criterion="log_loss", random_state=5)

# Tune hyperparameters using GridSearchCV, ranking candidates by ROC AUC
# (the scorer name is 'roc_auc'). The grid is large, so the fits are spread
# over all available cores.
# NOTE(review): 'roc_auc' assumes a binary target -- confirm.
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=skf,
    n_jobs=-1,
)

# NOTE(review): this fits on the unscaled, non-oversampled training data;
# X_resampled / y_resampled from the preparation cell are not used here --
# confirm that is intended.
grid_search.fit(X_train, y_train)

# Output best hyperparameters
print('Best parameters:', grid_search.best_params_)
print('Best CV ROC AUC:', grid_search.best_score_)
rf_model = grid_search.best_estimator_


In [None]:
# Dump the first decision tree of the tuned forest as plain-text rules.
first_tree = rf_model.estimators_[0]
text_representation = tree.export_text(first_tree)
print(text_representation)

In [None]:
# Impurity-based feature importances of the tuned random forest.
print(rf_model.feature_importances_)

# Plot one bar per feature, in column order of the training matrix, with a
# title and axis labels so the figure can be read on its own.
fig, ax = plt.subplots()
ax.bar(range(len(rf_model.feature_importances_)), rf_model.feature_importances_)
ax.set_title("Random forest feature importances")
ax.set_xlabel("Feature index")
ax.set_ylabel("Importance")
plt.show()