In [1]:
import pandas as pd
import numpy as np

from pandas_profiling import ProfileReport

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix,f1_score

from imblearn.pipeline import Pipeline, make_pipeline
from imblearn.over_sampling import SMOTE

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC

from sklearn.naive_bayes import GaussianNB

np.random.seed(42)

### Spliting data into known and unknown labels

In [2]:
data = pd.read_csv('./data/raw_data/data.csv', header=None)
labels = pd.read_csv('./data/raw_data/labels.csv', header=None)

data_labelled = data[:len(labels.index)]
data_unlabelled = data[len(labels.index):]

data_labelled.to_csv('./data/processed_data/known_labels.csv')
data_unlabelled.to_csv('./data/processed_data/unknown_labels.csv')

### Transforming target variable into booleans

In [3]:
labels = labels.rename(columns={0: 'target'})
labels['target'] = labels['target'].map({1: False, 2: True})
labels.head()

Unnamed: 0,target
0,False
1,False
2,False
3,False
4,False


### Class ratio and null values

In [4]:
print('Class ratio:\n', labels['target'].value_counts())
print('Columns with null values:\n', data_labelled.columns[data_labelled.isna().any()].tolist())

Class ratio:
 False    156
True      23
Name: target, dtype: int64
Columns with null values:
 []


It seems like theres a high class imbalance however no columns have null values

In [5]:
X = data_labelled.copy()
y = labels.copy()

print('X shape: ', X.shape)
print('y shape: ', y.shape)

X shape:  (179, 186)
y shape:  (179, 1)


### Split train and test data

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

### Create a html page to vizualize feature destribution

In [7]:
# ONLY HAS TO RUN ONCE TO GENERATE THE HMTL FILE
# profile = ProfileReport(df, minimal=True)
# profile.to_file("visualization/output.html")

From looking at all the feature histograms and the exploratory analysis that was done, there are quite a lot of features with wide ranges of variance. Scalling features will be important specially for K-nearest neighbours and other distance sensitive models

## Performance Metric

Given that we have an imbalanced dataset recall or precision wouldn't be good enough measures of performance. We decided to go with f1-score given how well it tends to represent the performance of a model, even if it has an umbalanced target.

# Decision Trees

## Baseline Model

Lets just try out a simple decision tree without much pre-processing to have a baseline performance metric.

Max depth should be one of the main parameters to tune, given that the deeper the tree is, the more likely it is to overfit and perform badly on unseen data. We'll use a **stratified 5 fold cross validation** for all of our experiments.

In [8]:
baseline_tree_pipeline = Pipeline(
    [   
        ('smote',SMOTE(random_state=42)),
        ('model',DecisionTreeClassifier(random_state=42))
    ]
)

baseline_tree_search = GridSearchCV(
    estimator = baseline_tree_pipeline,
    param_grid = {
        'model__criterion':['gini','entropy'],
    },
    scoring='f1',
    cv=5,
    n_jobs=-1,
    verbose=1
)

baseline_tree_search.fit(X_train,y_train.values.ravel())

print('Best parameters:\n', baseline_tree_search.best_params_)
print('Best score in cross-validation:\n', round(baseline_tree_search.best_score_, 3))
print('Score in test dataset:\n', round(baseline_tree_search.score(X_test, y_test),3))

y_pred = baseline_tree_search.best_estimator_.predict(X_test)
print('Confusion marix:\n', confusion_matrix(y_true=y_test, y_pred=y_pred))

Fitting 5 folds for each of 2 candidates, totalling 10 fits
Best parameters:
 {'model__criterion': 'entropy'}
Best score in cross-validation:
 0.73
Score in test dataset:
 0.8
Confusion marix:
 [[30  1]
 [ 1  4]]


### ANOVA + SMOTE Model

From our exploratory analysis it seems like ANOVA and Relief work the best. Lets start with using ANOVA classification to get the top n features and apply SMOTE to try and combat the class imbalance but creating new instances of the rarest class.

In [9]:
improved_tree_pipeline = Pipeline(
    [
     ('selector',SelectKBest(f_classif)),
     ('smote',SMOTE(random_state=42)),
     ('model',DecisionTreeClassifier(random_state=42))
    ]
)

improved_tree_search = GridSearchCV(
    estimator = improved_tree_pipeline,
    param_grid = {
        'selector__k':[5,10,20,50,70,100],
        'model__criterion':['gini','entropy'],
        'model__max_depth':[1,2,3,5,None],
        'model__min_samples_split':[2,3,4],
        'model__min_samples_leaf':[1,2,4],
        'model__max_features':['auto','log2','sqrt',None],
    },
    cv=5,
    n_jobs=-1,
    scoring='f1',
    verbose=1
)
improved_tree_search.fit(X_train, y_train.values.ravel())

decision_tree_best = improved_tree_search.best_estimator_

print('Best parameters:\n', improved_tree_search.best_params_)
print('Best score in cross-validation:\n', round(improved_tree_search.best_score_,3))
print('Score in test dataset:\n', round(improved_tree_search.score(X_test, y_test),3))

y_pred = decision_tree_best.predict(X_test)
print('Confusion marix:\n', confusion_matrix(y_true=y_test, y_pred=y_pred))

Fitting 5 folds for each of 2160 candidates, totalling 10800 fits
Best parameters:
 {'model__criterion': 'gini', 'model__max_depth': 3, 'model__max_features': None, 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'selector__k': 50}
Best score in cross-validation:
 0.849
Score in test dataset:
 0.6
Confusion marix:
 [[29  2]
 [ 2  3]]


Limiting the depth of the tree helps with having better performance since we're running CV and it punishes trees that overfit with a higher max depth. However, the more tunned tree still seems to overfitting since it performed quite worse on the test dataset.

# K-Nearest Neighbors

## Baseline Model

Just as we did with Decision Trees we'll first create a baseline KNN model.

In [10]:
baseline_knn_pipeline = Pipeline(
    [   
        ('smote',SMOTE(random_state=42)),
        ('model',KNeighborsClassifier())
    ]
)

baseline_knn_search = GridSearchCV(
    estimator = baseline_knn_pipeline,
    param_grid = {
    'model__weights':['uniform','distance'],
    'model__n_neighbors':range(1,21,2),
    'model__metric':['euclidean','manhattan','chebyshev','minkowski']
    },
    cv=5,
    n_jobs=-1,
    scoring='f1',
    verbose=1
)
baseline_knn_search.fit(X_train,y_train.values.ravel())

print('Best parameters:\n', baseline_knn_search.best_params_)
print('Best score in cross-validation:\n', round(baseline_knn_search.best_score_,3))
print('Score in test dataset:\n', round(baseline_knn_search.score(X_test, y_test),3))

y_pred = baseline_knn_search.best_estimator_.predict(X_test)
print('Confusion marix:\n', confusion_matrix(y_true=y_test, y_pred=y_pred))

Fitting 5 folds for each of 80 candidates, totalling 400 fits
Best parameters:
 {'model__metric': 'euclidean', 'model__n_neighbors': 17, 'model__weights': 'uniform'}
Best score in cross-validation:
 0.835
Score in test dataset:
 0.833
Confusion marix:
 [[29  2]
 [ 0  5]]


We didnt get great results perhaps because KNN relies on using a distance function between features and we didn't use a feauture scaler. We've seen that some our features have quite different ranges in which they can vary. Let's use a feature scaller as the first step on our pipeline followed by SMOTE.

In [11]:
improved_knn_pipeline = Pipeline(
    [
    ('scaller',StandardScaler()),
    ('selector',SelectKBest(f_classif)),
    ('smote',SMOTE(random_state=42)),
    ('model',KNeighborsClassifier())
    ]
)

improved_knn_search = GridSearchCV(
    estimator = improved_knn_pipeline,
    param_grid = {
        'selector__k':[5,10,20,50,70,100],
        'model__weights':['uniform','distance'],
        'model__n_neighbors':range(1,21,2),
        'model__metric':['euclidean','manhattan','chebyshev','minkowski']
    },
    cv=5,
    n_jobs=-1,
    scoring='f1',
    verbose=1
)
improved_knn_search.fit(X_train, y_train.values.ravel())

knn_best = improved_knn_search.best_estimator_

print('Best parameters:\n', improved_knn_search.best_params_)
print('Best score in cross-validation:\n', round(improved_knn_search.best_score_,3))
print('Score in test dataset:\n', round(improved_knn_search.score(X_test, y_test),3))

y_pred = knn_best.predict(X_test)
print('Confusion marix:\n', confusion_matrix(y_true=y_test, y_pred=y_pred))

Fitting 5 folds for each of 480 candidates, totalling 2400 fits
Best parameters:
 {'model__metric': 'euclidean', 'model__n_neighbors': 1, 'model__weights': 'uniform', 'selector__k': 50}
Best score in cross-validation:
 0.971
Score in test dataset:
 0.889
Confusion marix:
 [[31  0]
 [ 1  4]]


Applying a standart scaller and using feature upsampling made a big difference in the performance of the model.

KNN reveals to have better performance than Decision Trees measuring by the f1 score. Interesting to note that here KNN tends to perform better with more features, given that the best results come from using the top 40 in comparison to the top 5 in the decision tree.

# Support Vector Classifier

In [12]:
svc_pipeline = Pipeline(
    [
    ('scaller',StandardScaler()),
    ('selector',SelectKBest(f_classif)),
    ('smote',SMOTE(random_state=42)),
    ('model',SVC(random_state=42))
    ]
)

svc_search = GridSearchCV(
    estimator = svc_pipeline,
    param_grid = {
        'selector__k':[5,10,20,50,70,80,90,100],
        'model__kernel':['linear', 'poly',],
        'model__degree':[2,3,4,5]
    },
    cv=5,
    n_jobs=-1,
    scoring='f1',
    verbose=1
)
svc_search.fit(X_train, y_train.values.ravel())

svc_best = svc_search.best_estimator_

print('Best parameters:\n', svc_search.best_params_)
print('Best score in cross-validation:\n', round(svc_search.best_score_,3))
print('Score in test dataset:\n', round(svc_search.score(X_test, y_test),3))

y_pred = svc_best.predict(X_test)
print('Confusion marix:\n', confusion_matrix(y_true=y_test, y_pred=y_pred))

Fitting 5 folds for each of 64 candidates, totalling 320 fits
Best parameters:
 {'model__degree': 2, 'model__kernel': 'poly', 'selector__k': 50}
Best score in cross-validation:
 0.914
Score in test dataset:
 1.0
Confusion marix:
 [[31  0]
 [ 0  5]]


# Gaussian Naive Baynes

In [13]:
baynes_pipeline = Pipeline(
    [
    ('scaller',StandardScaler()),
    ('selector',SelectKBest(f_classif)),
    ('smote',SMOTE(random_state=42)),
    ('model',GaussianNB())
    ]
)

baynes_search = GridSearchCV(
    estimator = baynes_pipeline,
    param_grid = {
        'selector__k':[5,10,20,50,70,80,90,100],
        'model__var_smoothing': [1e-11, 1e-10, 1e-9]
    },
    cv=5,
    n_jobs=-1,
    scoring='f1',
    verbose=1
)
baynes_search.fit(X_train, y_train.values.ravel())

baynes_best = baynes_search.best_estimator_

print('Best parameters:\n', baynes_search.best_params_)
print('Best score in cross-validation:\n', round(baynes_search.best_score_,3))
print('Score in test dataset:\n', round(baynes_search.score(X_test, y_test),3))

y_pred = baynes_best.predict(X_test)
print('Confusion marix:\n', confusion_matrix(y_true=y_test, y_pred=y_pred))

Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best parameters:
 {'model__var_smoothing': 1e-11, 'selector__k': 70}
Best score in cross-validation:
 0.871
Score in test dataset:
 1.0
Confusion marix:
 [[31  0]
 [ 0  5]]


# Ensemble Models

## Our own ensemble classifier

To create our own ensemble model we'll use the best performing models we've tested so far: k-NN, SVC and Baynes

In [14]:
estimators=[
    # ('decision_tree', make_pipeline(SelectKBest(f_classif, k=50), SMOTE(random_state=42), decision_tree_best)),
    ('knn',make_pipeline(StandardScaler(),SelectKBest(f_classif, k=50), SMOTE(random_state=42), knn_best)),
    ('svc',make_pipeline(StandardScaler(),SelectKBest(f_classif, k=100), SMOTE(random_state=42), svc_best)),
    ('baynes',make_pipeline(StandardScaler(),SelectKBest(f_classif, k=70), SMOTE(random_state=42), baynes_best)),
]

# voting set to hard so that majority wins
ensemble = VotingClassifier(estimators, voting='hard')

ensemble.fit(X_train, y_train.values.ravel())

y_pred = ensemble.predict(X_test)
f1_score = f1_score(y_test, y_pred)
print('Score in test dataset:\n', round(f1_score,3))
print('Confusion marix:\n', confusion_matrix(y_true=y_test, y_pred=y_pred))

Score in test dataset:
 0.889
Confusion marix:
 [[31  0]
 [ 1  4]]


## Random Forest

In [15]:
random_forest_pipeline = Pipeline(
    [('selector',SelectKBest(f_classif)),
     ('smote',SMOTE(random_state=42)),
     ('model',RandomForestClassifier(random_state=42))]
)

random_forest_search = GridSearchCV(
    estimator = random_forest_pipeline,
    param_grid = {
        'selector__k':[70],
        'model__criterion':['gini','entropy'],
        'model__max_depth':[2,3,4,5,6],
        'model__min_samples_split':[2,3,4],
        'model__min_samples_leaf': [1,2,3,4],
        'model__n_estimators':[100,200,500]
    },
    cv=5,
    n_jobs=-1,
    scoring='f1',
    verbose=1
)
random_forest_search.fit(X_train,y_train.values.ravel())
random_forest_best = random_forest_search.best_estimator_

print('Best parameters:\n', random_forest_search.best_params_)
print('Best score in cross-validation:\n', round(random_forest_search.best_score_,3))
print('Score in test dataset:\n', round(random_forest_search.score(X_test, y_test),3))

y_pred = random_forest_best.predict(X_test)
print('Confusion marix:\n', confusion_matrix(y_true=y_test, y_pred=y_pred))

Fitting 5 folds for each of 360 candidates, totalling 1800 fits
Best parameters:
 {'model__criterion': 'entropy', 'model__max_depth': 5, 'model__min_samples_leaf': 1, 'model__min_samples_split': 4, 'model__n_estimators': 100, 'selector__k': 70}
Best score in cross-validation:
 0.886
Score in test dataset:
 1.0
Confusion marix:
 [[31  0]
 [ 0  5]]


As expected with random forests being another ensemble model, it performs very well, specially in the test dataset where it predicted 100% of the targets correctly

# Predict on the unlaballed dataset

**Decision Trees**

In [16]:
y_pred_validation = decision_tree_best.predict(data_unlabelled)
pd.DataFrame(y_pred_validation).value_counts()

False    155
True      25
dtype: int64

**K-Nearest Neighbors**

In [17]:
y_pred_validation = knn_best.predict(data_unlabelled)
pd.DataFrame(y_pred_validation).value_counts()

False    159
True      21
dtype: int64

**Gaussian Naive Baynes**

In [18]:
y_pred_validation = baynes_best.predict(data_unlabelled)
pd.DataFrame(y_pred_validation).value_counts()

False    155
True      25
dtype: int64

**Support Vector Classifier**

In [19]:
y_pred_validation = svc_best.predict(data_unlabelled)
pd.DataFrame(y_pred_validation).value_counts()

False    163
True      17
dtype: int64

**Ensemble**

In [20]:
y_pred_validation = ensemble.predict(data_unlabelled)
pd.DataFrame(y_pred_validation).value_counts()

False    161
True      19
dtype: int64

**Random Forest**

In [21]:
y_pred_validation = random_forest_best.predict(data_unlabelled)
pd.DataFrame(y_pred_validation).value_counts()

False    159
True      21
dtype: int64

## **The search for the best model continues in the next notebook**...