In [1]:
import pandas as pd
import numpy as np
from pandas_profiling import ProfileReport

#from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE 

from sklearn.tree import DecisionTreeClassifier

seed = np.random.seed(42)

### Spliting data into known and unknown labels

In [2]:
data = pd.read_csv('./data/raw_data/data.csv', header=None)
labels = pd.read_csv('./data/raw_data/labels.csv', header=None)

data_labelled = data[:len(labels.index)]
data_unlabelled = data[len(labels.index):]

data_labelled.to_csv('./data/processed_data/known_labels.csv')
data_unlabelled.to_csv('./data/processed_data/unknown_labels.csv')

### Merge known labels dataframe with the respective labels

In [3]:
labels = labels.rename(columns={0: 'y'})
df = pd.concat([data_labelled, labels], axis=1)

df['y'] = df['y'].map({1: False, 2: True})

df.to_csv('./data/processed_data/data_with_labels.csv')

### Class ratio and null values

In [4]:
print('Class ratio:\n', labels['y'].value_counts())
print('Columns with null values:\n', df.columns[df.isna().any()].tolist())

Class ratio:
 1    156
2     23
Name: y, dtype: int64
Columns with null values:
 []


It seems like theres a high class imbalance however no columns have null values

### Create a vizualization html page for our dataset

In [5]:
profile = ProfileReport(df, minimal=True)
profile.to_file("visualization/output.html")

  return df.reset_index(level=level, drop=drop)
Summarize dataset: 100%|██████████| 34796/34796 [1:17:27<00:00,  7.49it/s, Completed]
Generate report structure: 100%|██████████| 1/1 [00:28<00:00, 28.32s/it]
Render HTML: 100%|██████████| 1/1 [03:37<00:00, 217.89s/it]
Export report to file: 100%|██████████| 1/1 [00:01<00:00,  1.39s/it]


## Decision Trees

Build a pipeline with the ANOVA feature selector > SMOTE > Decision Tree Classifier

In [6]:
pipeline = Pipeline(
    [
     ('selector',SelectKBest(f_classif)),
     ('smote',SMOTE(random_state=seed)),
     ('model',DecisionTreeClassifier(random_state=seed))
    ]
)

### Grid Search

In [7]:
search = GridSearchCV(
    estimator = pipeline,
    param_grid = {'selector__k':range(3,30),
    'model__criterion':['gini','entropy'],
    'model__max_depth':range(1,20),
    'model__min_samples_split':range(2,10),
    'model__min_samples_leaf':range(1,10),
    'model__max_features':['auto', 'sqrt', 'log2', None],
    },
    n_jobs=-1,
    scoring='f1',
    cv=5,
    verbose=1
)

### Check results 

In [8]:
search.fit(data_labelled,labels.values.ravel())

#y_pred = search.best_estimator_.predict(data_labelled)

print('Best parameters:\n', search.best_params_)
print('Best score f1:\n',search.best_score_)

Fitting 5 folds for each of 295488 candidates, totalling 1477440 fits
Best parameters:
 {'model__criterion': 'gini', 'model__max_depth': 18, 'model__max_features': 'log2', 'model__min_samples_leaf': 1, 'model__min_samples_split': 5, 'selector__k': 15}
Best score f1:
 0.9906730769230769
