In [7]:
import pandas as pd

df = pd.read_csv('../data/preprocessed/data.csv', index_col=0, header=0)

In [8]:
from sklearn.model_selection import train_test_split

# Split the data into X and y
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Print the shape of the training and testing sets
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

X_train shape: (22793, 141)
y_train shape: (22793,)
X_test shape: (5699, 141)
y_test shape: (5699,)


In [9]:
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LogisticRegression

# Creating a logistic regression model
lr_model = LogisticRegression(solver="sag")

# Performing 5-fold cross-validation
cv = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(lr_model, X_train, y_train, cv=cv)

# Printing the mean score and standard deviation
print(f"Cross-Validation Scores: {scores}")
print(f"Mean Score: {scores.mean()}")
print(f"Standard Deviation: {scores.std()}")

Cross-Validation Scores: [0.69116034 0.69159903 0.6999342  0.67902589 0.7064502 ]
Mean Score: 0.6936339318309279
Standard Deviation: 0.009247749718676859


## Grid Search

In [10]:
from sklearn.model_selection import GridSearchCV

# Creating a parameter grid
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2', 'elasticnet']
}

# Creating a GridSearchCV object
grid_search = GridSearchCV(
    estimator=lr_model,
    param_grid=param_grid,
    cv=KFold(n_splits=5, shuffle=True, random_state=42),
)

# Fitting the GridSearchCV object to the training data
grid_search.fit(X_train, y_train)

# Printing the best hyperparameters and cross-validation score
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Cross-Validation Score: {grid_search.best_score_}")


50 fits failed out of a total of 75.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Python310\lib\site-packages\sklearn\linear_model\_logistic.py", line 1091, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "c:\Python310\lib\site-packages\sklearn\linear_model\_logistic.py", line 61, in _check_solver
    raise ValueError(
ValueError: Solver sag supports only 'l2' or 'none' penalties, got l1 penalty.

-------------------------------------------------------------------

Best Parameters: {'C': 10, 'penalty': 'l2'}
Cross-Validation Score: 0.6945991327590161




## Random forest Classifier

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Creating a parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_features': ['sqrt', 'log2']
}

# Creating a GridSearchCV object
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=KFold(n_splits=5, shuffle=True, random_state=42)
)

# Fitting the GridSearchCV object to the training data
grid_search.fit(X_train, y_train)

# Printing the best hyperparameters and cross-validation score
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Cross-Validation Score: {grid_search.best_score_}")


Best Parameters: {'max_features': 'sqrt', 'n_estimators': 200}
Cross-Validation Score: 0.9960514673731691


## Export random forest model

In [12]:
import joblib
# Save the model as a .pkl file using joblib
joblib.dump(grid_search.best_estimator_, '../models/random_forest.pkl')

['../models/random_forest.pkl']