In [1]:
import pandas as pd
import numpy as np
from pandas_profiling import ProfileReport

#from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE 

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

np.random.seed(42)

### Spliting data into known and unknown labels

In [2]:
data = pd.read_csv('./data/raw_data/data.csv', header=None)
labels = pd.read_csv('./data/raw_data/labels.csv', header=None)

data_labelled = data[:len(labels.index)]
data_unlabelled = data[len(labels.index):]

data_labelled.to_csv('./data/processed_data/known_labels.csv')
data_unlabelled.to_csv('./data/processed_data/unknown_labels.csv')

### Merge known labels dataframe with the respective labels

In [3]:
labels = labels.rename(columns={0: 'y'})
df = pd.concat([data_labelled, labels], axis=1)

df['y'] = df['y'].map({1: False, 2: True})

df.to_csv('./data/processed_data/data_with_labels.csv')

### Class ratio and null values

In [4]:
print('Class ratio:\n', labels['y'].value_counts())
print('Columns with null values:\n', df.columns[df.isna().any()].tolist())

Class ratio:
 1    156
2     23
Name: y, dtype: int64
Columns with null values:
 []


It seems like theres a high class imbalance however no columns have null values

### Create a vizualization html page for our dataset

In [5]:
# profile = ProfileReport(df, minimal=True)
# profile.to_file("visualization/output.html")

## Decision Trees

From our exploratory analysis it seems like ANOVA and Relief work the best. Lets start with using ANOVA classification to get the top features and apply SMOTE to try and combat the class imbalance.

Build a pipeline with the ANOVA feature selector > SMOTE > Decision Tree Classifier

In [23]:
tree_pipeline = Pipeline(
    [
     ('selector',SelectKBest(f_classif)),
     ('smote',SMOTE(random_state=42)),
     ('model',DecisionTreeClassifier(random_state=42))
    ]
)

### Grid Search

Define the parameters that we'll want to test during our 5 fold cross validation. Lets use F1 score for measuring the performance of our model given the class imbalance.

In [37]:
tree_search = GridSearchCV(
    estimator = tree_pipeline,
    param_grid = {
        'selector__k':[5,10,15,20,30,40,50],
        'model__criterion':['gini','entropy'],
        'model__max_depth':range(1,20),
    },
    cv=5,
    n_jobs=-1,
    scoring='f1',
    verbose=1
)

Restricting max_depth should help with preventing overfitting along with a 5 fold cross validation

### Results

In [39]:
tree_search.fit(data_labelled,labels.values.ravel())

#y_pred = search.best_estimator_.predict(data_labelled)

print('Best parameters:\n', tree_search.best_params_)
print('Best score f1:\n',tree_search.best_score_)

Fitting 5 folds for each of 266 candidates, totalling 1330 fits
Best parameters:
 {'model__criterion': 'entropy', 'model__max_depth': 5, 'selector__k': 5}
Best score f1:
 0.9811492673992674


Seems like the decision tree performs best with a max_depth set to 5 while choosing the top 5 features from the ANOVA analysis. Limiting the depth of the tree helps with having better performance since we're running CV and it punishes trees that overfit with a higher max depth.

## K-Nearest Neighbors 

Given that KNN relies on majority voting based on class membership of k nearest neighbors, we should use a feature scaller as the first step on our pipeline.

In [45]:
knn_pipeline = Pipeline(
    [('scaller',StandardScaler()),
     ('selector',SelectKBest(f_classif)),
     ('smote',SMOTE(random_state=42)),
     ('model',KNeighborsClassifier())
    ]
)

In [46]:
knn_search = GridSearchCV(
    estimator = knn_pipeline,
    param_grid = {'selector__k':[5,10,15,20,30,40,50],
    'model__weights':['uniform','distance'],
    'model__n_neighbors':[1,3,5,7,9],
    'model__metric':['euclidean','manhattan','chebyshev','minkowski']
    },
    cv=5,
    n_jobs=-1,
    scoring='f1',
    verbose=1
)

In [55]:
knn_search.fit(data_labelled,labels.values.ravel())

print('Best parameters:\n', knn_search.best_params_)
print('Best score f1:\n',knn_search.best_score_)

Fitting 5 folds for each of 280 candidates, totalling 1400 fits
Best parameters:
 {'model__metric': 'chebyshev', 'model__n_neighbors': 7, 'model__weights': 'uniform', 'selector__k': 40}
Best score f1:
 0.9937484737484737


KNN reveals to have better performance than Decision Trees measuring by the f1 score. K equal to 7 is the most optimal number of neighbors to consider while using the chebyshev distance with an uniform weight. Interesting to note that here KNN tends to perform better with more features, given that the best results come from using the top 40 in comparison to the top 5 in the decision tree.