In [16]:
import pandas as pd
import numpy as np
from pandas_profiling import ProfileReport

#from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE 

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.ensemble import VotingClassifier

np.random.seed(42)

### Spliting data into known and unknown labels

In [2]:
data = pd.read_csv('./data/raw_data/data.csv', header=None)
labels = pd.read_csv('./data/raw_data/labels.csv', header=None)

data_labelled = data[:len(labels.index)]
data_unlabelled = data[len(labels.index):]

data_labelled.to_csv('./data/processed_data/known_labels.csv')
data_unlabelled.to_csv('./data/processed_data/unknown_labels.csv')

### Merge known labels dataframe with the respective labels

In [3]:
labels = labels.rename(columns={0: 'y'})
df = pd.concat([data_labelled, labels], axis=1)

df['y'] = df['y'].map({1: False, 2: True})

df.to_csv('./data/processed_data/data_with_labels.csv')

### Class ratio and null values

In [4]:
print('Class ratio:\n', labels['y'].value_counts())
print('Columns with null values:\n', df.columns[df.isna().any()].tolist())

Class ratio:
 1    156
2     23
Name: y, dtype: int64
Columns with null values:
 []


It seems like theres a high class imbalance however no columns have null values

### Create a vizualization html page to vizualize feature destribution

In [5]:
# profile = ProfileReport(df, minimal=True)
# profile.to_file("visualization/output.html")

# Decision Trees

## Baseline Model

Lets just try out a simple decision tree without much pre-processing to have a baseline performance metric.

In [6]:
#Defining which metrics to monitor
scoring = ['f1','accuracy','precision','recall']

Max depth should be one of the main parameters to tune, given that the deeper the tree is, the more likely it is to overfit and perform badly on unseen data. We'll use a **5 fold cross validation** for are all experiments.

In [7]:
baseline_tree_pipeline = Pipeline([('model',DecisionTreeClassifier(random_state=42))])

baseline_tree_search = GridSearchCV(
    estimator = baseline_tree_pipeline,
    param_grid = {
        'model__criterion':['gini','entropy'],
        'model__max_depth':range(1,20),
    },
    scoring=scoring,
    refit='f1',
    cv=5,
    n_jobs=-1,
    verbose=1
)
baseline_tree_search.fit(data_labelled,labels.values.ravel())

print('Best parameters:\n', baseline_tree_search.best_params_)
print('Best f1 score:\n',baseline_tree_search.best_score_)

Fitting 5 folds for each of 38 candidates, totalling 190 fits
Best parameters:
 {'model__criterion': 'gini', 'model__max_depth': 2}
Best f1 score:
 0.9713164068729908


### ANOVA + SMOTE Model

From our exploratory analysis it seems like ANOVA and Relief work the best. Lets start with using ANOVA classification to get the top n features and apply SMOTE to try and combat the class imbalance but creating new instances of the rarest class.

In [8]:
improved_tree_pipeline = Pipeline(
    [
     ('selector',SelectKBest(f_classif)),
     ('smote',SMOTE(random_state=42)),
     ('model',DecisionTreeClassifier(random_state=42))
    ]
)

In [9]:
improved_tree_search = GridSearchCV(
    estimator = improved_tree_pipeline,
    param_grid = {
        'selector__k':[5,10,15,20,30,40,50],
        'model__criterion':['gini','entropy'],
        'model__max_depth':range(1,20),
    },
    cv=5,
    n_jobs=-1,
    scoring='f1',
    verbose=1
)

In [10]:
improved_tree_search.fit(data_labelled,labels.values.ravel())

decision_tree_best = improved_tree_search.best_estimator_

print('Best parameters:\n', improved_tree_search.best_params_)
print('Best f1 score:\n',improved_tree_search.best_score_)

Fitting 5 folds for each of 266 candidates, totalling 1330 fits
Best parameters:
 {'model__criterion': 'entropy', 'model__max_depth': 5, 'selector__k': 5}
Best f1 score:
 0.9811492673992674


Seems like the decision tree performs best with a max_depth set to 5 while choosing the top 5 features from the ANOVA analysis. Limiting the depth of the tree helps with having better performance since we're running CV and it punishes trees that overfit with a higher max depth.

# K-Nearest Neighbors 

## Baseline Model

Just as we did with Decision Trees we'll first create a baseline KNN model.

In [11]:
baseline_knn_pipeline = Pipeline([('model',KNeighborsClassifier())])

baseline_knn_search = GridSearchCV(
    estimator = baseline_knn_pipeline,
    param_grid = {
    'model__weights':['uniform','distance'],
    'model__n_neighbors':[1,3,5,7,9],
    'model__metric':['euclidean','manhattan','chebyshev','minkowski']
    },
    cv=5,
    n_jobs=-1,
    scoring='f1',
    verbose=1
)

baseline_knn_search.fit(data_labelled,labels.values.ravel())

print('Best parameters:\n', baseline_knn_search.best_params_)
print('Best f1 score:\n',baseline_knn_search.best_score_)

Fitting 5 folds for each of 40 candidates, totalling 200 fits
Best parameters:
 {'model__metric': 'euclidean', 'model__n_neighbors': 1, 'model__weights': 'uniform'}
Best f1 score:
 0.9595192307692308


We got worse results that our previous tree perhaps because KNN relies on using a distance function between features and we didn't use a feauture scaler. Let's use a feature scaller as the first step on our pipeline followed by SMOTE.

In [12]:
improved_knn_pipeline = Pipeline(
    [('scaller',StandardScaler()),
     ('selector',SelectKBest(f_classif)),
     ('smote',SMOTE(random_state=42)),
     ('model',KNeighborsClassifier())
    ]
)

In [13]:
improved_knn_search = GridSearchCV(
    estimator = improved_knn_pipeline,
    param_grid = {'selector__k':[5,10,15,20,30,40,50],
    'model__weights':['uniform','distance'],
    'model__n_neighbors':[1,3,5,7,9],
    'model__metric':['euclidean','manhattan','chebyshev','minkowski']
    },
    cv=5,
    n_jobs=-1,
    scoring='f1',
    verbose=1
)

In [14]:
improved_knn_search.fit(data_labelled,labels.values.ravel())

knn_best = improved_knn_search.best_estimator_

print('Best parameters:\n', improved_knn_search.best_params_)
print('Best f1 score:\n',improved_knn_search.best_score_)

Fitting 5 folds for each of 280 candidates, totalling 1400 fits
Best parameters:
 {'model__metric': 'chebyshev', 'model__n_neighbors': 7, 'model__weights': 'uniform', 'selector__k': 40}
Best f1 score:
 0.9937484737484737


Applying a standart scaller and using feature upsampling made a big difference in the performance of the model.

KNN reveals to have better performance than Decision Trees measuring by the f1 score. Interesting to note that here KNN tends to perform better with more features, given that the best results come from using the top 40 in comparison to the top 5 in the decision tree.

# Ensemble Models

## Our own ensemble classifier

To create our own ensemble model we'll use the best performing model of each categafory we've tested so far: Decision Trees and KNN.

In [15]:
estimators=[('decision_tree', decision_tree_best), ('knn', knn_best)]
# voting set to hard so that majority wins
ensemble = VotingClassifier(estimators, voting='hard')

## Random Forest

In [18]:
random_forest_pipeline = Pipeline(
    [('selector',SelectKBest(f_classif)),
     ('smote',SMOTE(random_state=42)),
     ('model',RandomForestClassifier(random_state=42))]
)

random_forest_search = GridSearchCV(
    estimator = random_forest_pipeline,
    param_grid = {
        'selector__k':[5,10,15,20,50],
        'model__criterion':['gini','entropy'],
        'model__max_depth':[2,5,10,20],
        'model__min_samples_split':range(2,10),
        'model__min_samples_leaf': [1, 2, 4],
        'model__min_samples_split': [2, 5, 10],
        'model__n_estimators':[50,100,200,500]
    },
    cv=5,
    n_jobs=-1,
    scoring='f1',
    verbose=1
)

random_forest_search.fit(data_labelled,labels.values.ravel())

random_forest_best = random_forest_search.best_estimator_

print('Best parameters:\n', random_forest_search.best_params_)
print('Best f1 score:\n',random_forest_search.best_score_)

Fitting 5 folds for each of 4410 candidates, totalling 22050 fits
Best parameters:
 {'model__criterion': 'gini', 'model__max_depth': 5, 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 500, 'selector__k': 50}
Best f1 score:
 0.9842261904761905
