In [7]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from src import utils
import numpy as np

In [2]:
# Load the training features and labels from the processed-data directory.
# NOTE(review): the "_ros" suffix presumably marks a random-over-sampled
# training split — confirm against the preprocessing step.
# NOTE(review): if utils.deserialize_data unpickles these files, only load
# artifacts produced by this project (unpickling untrusted data is unsafe).
X_train, y_train = (
    utils.deserialize_data("data/processed/X_train_ros.pkl"),
    utils.deserialize_data("data/processed/y_train_ros.pkl"),
)


In [16]:
# Search space for KNeighborsClassifier: 3 x 2 = 6 candidates.
param_grid_knn = dict(
    # Number of neighbours consulted for each prediction.
    n_neighbors=[3, 5, 7],
    # "uniform": every neighbour counts equally.
    # "distance": neighbours are weighted by inverse distance, so points
    # that are farther away have less influence on the vote.
    weights=["uniform", "distance"],
)

# Search space for RandomForestClassifier: 3 x 3 x 3 x 3 = 81 candidates.
param_grid_rf = dict(
    # Maximum depth allowed for each tree.
    max_depth=[10, 20, 30],
    # Minimum number of samples a node must hold before it may be split.
    min_samples_split=[10, 20, 100],
    # Minimum number of samples required in every leaf.
    min_samples_leaf=[5, 10, 20],
    # Number of trees in the forest.
    n_estimators=[50, 100, 200],
)

# Search space for LogisticRegression: 2 x 2 x 1 = 4 candidates.
param_grid_lr = dict(
    # "balanced": class weights inversely proportional to class frequencies.
    # None: every class gets weight one.
    class_weight=["balanced", None],
    # Regularization penalty applied to the coefficients.
    penalty=["l1", "l2"],
    # Single solver; liblinear accepts both the l1 and the l2 penalty.
    solver=["liblinear"],
)

In [10]:
# Fixed seed so the stochastic estimators (bootstrap/feature sampling in the
# forest, data shuffling in liblinear) give the same result on every run.
RANDOM_STATE = 42

# Base estimators; their hyper-parameters are filled in by the grid search.
knn = KNeighborsClassifier(n_jobs=-1)
random_forest = RandomForestClassifier(n_jobs=-1, random_state=RANDOM_STATE)
# n_jobs is intentionally not set here: the search grid only ever uses
# solver="liblinear", for which scikit-learn ignores n_jobs and emits a
# warning on every fit.
logistic_regression = LogisticRegression(random_state=RANDOM_STATE)

In [11]:
def train_model(model, param_grid, X_train, y_train, *, scoring=None, cv=None, verbose=3):
    """Run an exhaustive grid search over ``param_grid`` and return the fitted search.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn estimator to tune.
    param_grid : dict or list of dict
        Hyper-parameter names mapped to the values to try.
    X_train, y_train : array-like
        Training features and labels, forwarded unchanged to ``fit``.
    scoring : str, callable or None, keyword-only
        Metric used to rank candidates. ``None`` (default) keeps
        GridSearchCV's default, i.e. the estimator's own ``score`` method.
    cv : int, splitter or None, keyword-only
        Cross-validation strategy. ``None`` (default) keeps GridSearchCV's
        default of 5 folds.
    verbose : int, keyword-only
        Log verbosity. The default of 3 prints the time and score of each
        fold; values above 3 additionally print fold/candidate indexes and
        the start time of each computation.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring=scoring,
        cv=cv,
        n_jobs=-1,  # evaluate candidates/folds on all available cores
        verbose=verbose,
    )
    grid_search.fit(X_train, y_train)

    return grid_search

In [13]:
# Tune the k-nearest-neighbours classifier.
# NOTE(review): the recorded output below reports 3 candidates, while
# param_grid_knn as written defines 3 x 2 = 6 — the output looks stale;
# re-run after Restart Kernel -> Run All.
class_knn = train_model(knn, param_grid_knn, X_train, y_train)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


In [14]:
# Tune the random forest classifier.
# NOTE(review): the recorded output below reports 108 candidates, while
# param_grid_rf as written defines 3 x 3 x 3 x 3 = 81 — the output looks
# stale; re-run after Restart Kernel -> Run All.
class_random_forest = train_model(random_forest, param_grid_rf, X_train, y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


In [17]:
# Tune the logistic regression classifier (2 x 2 x 1 = 4 candidates).
class_logistic_regression = train_model(
    logistic_regression, param_grid_lr, X_train, y_train
)

Fitting 5 folds for each of 4 candidates, totalling 20 fits




In [18]:
# Persist each fitted search object (the whole grid-search result, not only
# its best estimator) under models/.
for fitted_search, target_path in (
    (class_knn, "models/knn_classifier.pkl"),
    (class_random_forest, "models/random_forest_classifier.pkl"),
    (class_logistic_regression, "models/logistic_regression_classifier.pkl"),
):
    utils.serialize_data(fitted_search, target_path)