In [1]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, StratifiedKFold, RandomizedSearchCV, train_test_split
from scipy import stats
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score

In [2]:
def remove_outliers(data: pd.DataFrame, ZSCORE_THREASHOLD: float = 4) -> pd.DataFrame:
    """Return the rows of ``data`` that have no extreme value in any numeric column.

    A per-column z-score is computed over the float/int columns; a row is
    dropped when the absolute z-score of at least one of its numeric cells
    exceeds ``ZSCORE_THREASHOLD``. Non-numeric columns are ignored for the
    test but are kept in the returned frame.

    Args:
        data: Input frame. It is not modified.
        ZSCORE_THREASHOLD: Absolute z-score above which a cell counts as an
            outlier (the spelling is kept so keyword callers keep working).

    Returns:
        The subset of ``data`` without outlier rows, original index preserved.
    """
    numeric = data.select_dtypes(include=["float", "int"])

    # nan_policy="omit": with the default ("propagate") one missing value makes
    # every z-score of that column NaN, so no outlier in it is ever detected.
    # Missing cells still yield NaN scores, which compare False below, so rows
    # with missing values are kept rather than dropped.
    zscore = np.abs(
        stats.zscore(numeric.to_numpy(dtype=float), axis=0, nan_policy="omit")
    )

    is_outlier = (zscore > ZSCORE_THREASHOLD).any(axis=1)
    return data[~is_outlier]

In [3]:
def pipe(X, y, n_splits: int, scoring: str, n_iter, random_state=None):
    """Tune a scale -> PCA -> k-NN pipeline with cross-validated search.

    Args:
        X: Feature matrix passed to the search's ``fit``.
        y: Class labels; also used to stratify the folds.
        n_splits: Number of stratified, shuffled CV folds.
        scoring: scikit-learn scoring name used to rank candidates.
        n_iter: ``None`` runs an exhaustive ``GridSearchCV``; any other value
            runs a ``RandomizedSearchCV`` with that many sampled candidates.
        random_state: Optional seed forwarded to the fold shuffling and, for
            the randomized search, to the candidate sampling. The default
            ``None`` keeps the previous unseeded behaviour.

    Returns:
        The fitted search object (``GridSearchCV`` or ``RandomizedSearchCV``).
    """
    pre_processing = Pipeline([
        ('scaler', StandardScaler()),
        ('pca', PCA()),
    ])

    main = Pipeline(steps=[
        ('pre_processing', pre_processing),
        ('knn', KNeighborsClassifier()),
    ])

    param_grid = {
        'knn__n_neighbors': list(range(3, 32, 2)),  # odd k from 3 to 31
        'knn__weights': ['uniform', 'distance'],
        'knn__metric': ['minkowski'],
        'knn__algorithm': ['auto'],
        'knn__p': [1, 3, 4, 5],
        # 'mle' asks PCA to pick the number of components itself.
        'pre_processing__pca__n_components': ['mle'],
    }

    # Seeding the shuffle makes the folds reproducible between runs.
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # Arguments shared by both search flavours.
    search_kwargs = dict(cv=cv, scoring=scoring, n_jobs=-1, verbose=1)

    if n_iter is None:
        grid = GridSearchCV(main, param_grid, **search_kwargs)
    else:
        grid = RandomizedSearchCV(
            main,
            param_grid,
            n_iter=n_iter,
            random_state=random_state,
            **search_kwargs,
        )

    grid.fit(X, y)
    return grid

In [4]:
def get_grids(X, y, n_iter = None, n_splits_options = range(10, 11), scorings = ('accuracy',)) -> dict:
    """Run one hyper-parameter search per (fold count, scoring) combination.

    Args:
        X: Feature matrix forwarded to ``pipe``.
        y: Labels forwarded to ``pipe``.
        n_iter: Forwarded to ``pipe`` unchanged.
        n_splits_options: Fold counts to try. A single int is accepted.
            Defaults to just 10 folds, as before.
        scorings: Scoring names to try (e.g. 'accuracy', 'precision',
            'recall', 'f1'). A single string is accepted. Defaults to
            accuracy only, as before.

    Returns:
        Dict mapping ``(n_splits, scoring)`` to the fitted search object
        returned by ``pipe``.
    """
    # Accept scalars so a bare string is not iterated character by character.
    if isinstance(n_splits_options, int):
        n_splits_options = (n_splits_options,)
    if isinstance(scorings, str):
        scorings = (scorings,)
    # Materialise so a one-shot iterator is reusable for every fold count.
    scorings = tuple(scorings)

    return {
        (splits, scoring): pipe(X, y, splits, scoring, n_iter)
        for splits in n_splits_options
        for scoring in scorings
    }

In [5]:
def find_best_model(grids, scoring: str) -> tuple[int, str]:
    """Return the key of the highest-scoring search for a given scoring name.

    Args:
        grids: Mapping from key tuples (as built by ``get_grids``:
            ``(n_splits, scoring)``) to fitted search objects exposing
            ``best_score_``.
        scoring: Only keys that contain this value are considered.

    Returns:
        The key whose search has the largest ``best_score_``; on ties the
        first one encountered wins. If no key matches ``scoring`` the
        placeholder ``(0, 'foo')`` is returned.
    """
    # Start below every real score: scorers such as 'neg_log_loss' are never
    # positive, so a starting value of 0 would reject every candidate.
    best_score = float('-inf')
    best_key = (0, 'foo')   # placeholder returned when nothing matches
    for key, search in grids.items():
        if scoring not in key:
            continue
        if search.best_score_ > best_score:
            best_score = search.best_score_
            best_key = key
    return best_key

In [6]:
# Load the training set, drop exact duplicate rows, then filter out rows
# flagged by the z-score outlier check.
data = (
    pd.read_csv('project_train.csv')
    .drop_duplicates()
    .pipe(remove_outliers)
)

In [7]:
# Separate the target column from the feature columns.
y = data["Label"]
X = data.drop(columns="Label")

In [13]:
# Hold out a stratified 20% test set. The split is seeded so that the test
# metrics printed below refer to a partition that can be recreated.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, shuffle=True, random_state=42
)

# Tune on the training part only, then pick the best search by CV accuracy.
grids = get_grids(X_train, y_train)
best_key = find_best_model(grids, 'accuracy')
best_model = grids[best_key]

print('The best model in terms of highest accuracy score is:')
print()

for param, value in best_model.best_params_.items():
    print(f'{param}: {value}')

# Number of components the fitted PCA step actually kept.
fitted_pca = best_model.best_estimator_.named_steps["pre_processing"].named_steps["pca"]
print(f'PCA components {fitted_pca.n_components_}')

print(f'Number of folds: {best_key[0]}')
print(f'Highest accuracy: {best_model.best_score_}')

# Evaluate the selected model on the held-out rows.
y_pred = best_model.predict(X_test)
print(classification_report(y_test, y_pred))

Fitting 10 folds for each of 120 candidates, totalling 1200 fits
The best model in terms of highest accuracy score is:

knn__algorithm: auto
knn__metric: minkowski
knn__n_neighbors: 5
knn__p: 1
knn__weights: distance
pre_processing__pca__n_components: mle
PCA components 10
Number of folds: 10
Highest accuracy: 0.8161538461538462
              precision    recall  f1-score   support

           0       0.76      0.88      0.81        50
           1       0.86      0.72      0.78        50

    accuracy                           0.80       100
   macro avg       0.81      0.80      0.80       100
weighted avg       0.81      0.80      0.80       100



In [14]:
# Repeat the search on the complete data set (no hold-out) and report the
# configuration with the best cross-validated accuracy.
grids = get_grids(X, y)
best_key = find_best_model(grids, 'accuracy')
best_model = grids[best_key]

print('The best model in terms of highest accuracy score is:')
print()

for param, value in best_model.best_params_.items():
    print(f'{param}: {value}')

# Number of components the fitted PCA step actually kept.
fitted_pca = best_model.best_estimator_.named_steps["pre_processing"].named_steps["pca"]
print(f'PCA components {fitted_pca.n_components_}')

print(f'Number of folds: {best_key[0]}')
print(f'Highest accuracy: {best_model.best_score_}')

Fitting 10 folds for each of 120 candidates, totalling 1200 fits
The best model in terms of highest accuracy score is:

knn__algorithm: auto
knn__metric: minkowski
knn__n_neighbors: 15
knn__p: 1
knn__weights: distance
pre_processing__pca__n_components: mle
PCA components 10
Number of folds: 10
Highest accuracy: 0.826530612244898
