In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, StratifiedKFold, RandomizedSearchCV, RepeatedStratifiedKFold, train_test_split
from scipy import stats
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

In [None]:
df = pd.read_csv('TrainOnMe_encoded.csv')

In [3]:
def remove_outliers(df: pd.DataFrame, ZSCORE_THRESHOLD: float=3.0):
    z_scores = stats.zscore(df.select_dtypes(include=[np.number]))
    filtered_entries = (np.abs(z_scores) < ZSCORE_THRESHOLD).all(axis=1)
    df = df[filtered_entries]
    return df

In [6]:
encoder_y_dict = dict(zip(df['y'].unique(), range(len(df['y'].unique()))))
decoder_y_dict = {value: key for key, value in encoder_y_dict.items()}

encoder_x7_dict = dict(zip(df['x7'].unique(), range(len(df['x7'].unique()))))
decoder_x7_dict = {value: key for key, value in encoder_x7_dict.items()}

def encoder(df: pd.DataFrame):
    df['y'] = df['y'].map(encoder_y_dict)
    df['x7'] = df['x7'].map(encoder_x7_dict)
    return df

def decoder(df: pd.DataFrame):
    df['y'] = df['y'].map(decoder_y_dict)
    return df

In [8]:
def pipe(X: pd.DataFrame, y: pd.DataFrame, pipeline: Pipeline, param_grid: dict, n_splits: int = 10, scoring: str='accuracy', 
         n_iter: bool = None, shuffle: bool = True, verbose: int = 1, repeated: bool = False, n_repeats: int = 10):

    if repeated:
        cv = RepeatedStratifiedKFold(n_splits=n_splits, shuffle=shuffle, n_repeats=n_repeats)
    else:
        cv = StratifiedKFold(n_splits=n_splits, shuffle=shuffle)

    if n_iter:
        search = RandomizedSearchCV(pipeline, param_grid, n_iter=n_iter, scoring=scoring, cv=cv, verbose=verbose, n_jobs=-1)
    else:
        search = GridSearchCV(pipeline, param_grid, scoring=scoring, cv=cv, verbose=verbose, n_jobs=-1)

    search.fit(X, y)

    return search

pre_processing = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components='mle'))
])

In [10]:
pre_processing = Pipeline([
    ('scaler', StandardScaler(), df.select_dtypes(include=['float64']).columns)
    ('encoder', encoder(df=df)),
    ('remove_outliers', remove_outliers(df=df)),
    ('pca', PCA(n_components='mle'))
])

In [14]:
# Logistic Classifier Pipeline and Hyperparameters
lr = Pipeline([
    ('scaler', StandardScaler()),
    ('logistic', LogisticRegression())
])

lr_param_grid = {
    'logistic__penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'logistic__C': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0],
    'logistic__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
}

lr_model = {'model': lr, 'param_grid': lr_param_grid}

In [15]:
# Random Forest Classifier Pipeline and Parameters
rf_pipeline = Pipeline([
    ('pre_processing', pre_processing),
    ('rf', RandomForestClassifier())
])

rf_param_grid = {
    'rf__n_estimators': [100, 200, 300, 400, 500],
    'rf__max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
    'rf__min_samples_split': [2, 5, 10],
    'rf__min_samples_leaf': [1, 2, 4],
    'rf__bootstrap': [True, False]
}

rf_model = {'model': rf_pipeline, 'param_grid': rf_param_grid}

In [16]:
# XGBoost Classifier Pipeline and Parameters
xgb_pipeline = Pipeline([
    ('pre_processing', pre_processing),
    ('xgb', XGBClassifier())
])

xgb_param_grid = {
    'xgb__n_estimators': [100, 200, 300, 400, 500],
    'xgb__max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
    'xgb__learning_rate': [0.1, 0.01, 0.001],
    'xgb__subsample': [0.5, 0.7, 1.0],
    'xgb__colsample_bytree': [0.5, 0.7, 1.0],
    'xgb__gamma': [0, 1, 5]
}

xgb_model = {'model': xgb_pipeline, 'param_grid': xgb_param_grid}

In [17]:
main_pipeline = Pipeline(steps=[
    ('pre_processing', pre_processing),
    ('ensemble',    VotingClassifier(estimators=[
                        ('lr', LogisticRegression()),
                        ('rf', RandomForestClassifier()),
                        ('xgb', XGBClassifier())
                    ]))
])

ensemble_param_grid = {
    'ensemble__voting': ['hard', 'soft'],
    'ensemble__lr__C': [0.1, 1.0, 10.0],
    'ensemble__lr__solver': ['liblinear', 'lbfgs'],
    'ensemble__rf__n_estimators': [10, 50, 100],
    'ensemble__rf__max_depth': [None, 10, 20],
    'ensemble__xgb__learning_rate': [0.01, 0.1, 0.3],
    'ensemble__xgb__max_depth': [3, 6, 9]
}

ensamble_model = {'model': main_pipeline, 'param_grid': ensemble_param_grid}

In [18]:
all_models = {  'lr': lr_model, 
                'rf': rf_model, 
                'xgb': xgb_model, 
                'ensemble': ensamble_model
            }

In [20]:
X, y = df.drop('y', axis=1), df['y']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

for model_name, model in all_models.items():
    print(f"Training {model_name} model...")
    
    search = pipe(X=X_train, y=y_train, pipeline=model['model'], param_grid=model['param_grid'], n_iter=10)

    print(f"Best {model_name} model: {search.best_params_}")
    print(f"Best {model_name} model score: {search.best_score_}")

    model['model'].set_params(**search.best_params_)
    model['model'].fit(X_train, y_train)
    y_pred = model['model'].predict(X_test)
  

Training lr model...
Fitting 10 folds for each of 10 candidates, totalling 100 fits


30 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\marwi\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\marwi\anaconda3\Lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\marwi\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 420, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "c:\Users\marwi\anaconda3\Lib\site-packages\sklearn\base.py", line

Best lr model: {'logistic__solver': 'newton-cg', 'logistic__penalty': 'none', 'logistic__C': 0.01}
Best lr model score: 0.7699999999999999
Training rf model...
Fitting 10 folds for each of 10 candidates, totalling 100 fits




Best rf model: {'rf__n_estimators': 400, 'rf__min_samples_split': 2, 'rf__min_samples_leaf': 2, 'rf__max_depth': 10, 'rf__bootstrap': False}
Best rf model score: 0.8387499999999999
Training xgb model...
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Best xgb model: {'xgb__subsample': 0.7, 'xgb__n_estimators': 500, 'xgb__max_depth': 20, 'xgb__learning_rate': 0.01, 'xgb__gamma': 1, 'xgb__colsample_bytree': 1.0}
Best xgb model score: 0.8574999999999999
Training ensemble model...
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Best ensemble model: {'ensemble__xgb__max_depth': 6, 'ensemble__xgb__learning_rate': 0.1, 'ensemble__voting': 'soft', 'ensemble__rf__n_estimators': 100, 'ensemble__rf__max_depth': 10, 'ensemble__lr__solver': 'liblinear', 'ensemble__lr__C': 1.0}
Best ensemble model score: 0.8637499999999999


In [21]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.86      1.00      0.92        55
           1       0.90      0.93      0.91        80
           2       0.91      0.75      0.82        65

    accuracy                           0.89       200
   macro avg       0.89      0.89      0.89       200
weighted avg       0.89      0.89      0.89       200

