In [58]:
from sklearn.pipeline import Pipeline,make_pipeline

## Preprocessing
# from sklearn import feature_selection
from sklearn.model_selection import train_test_split, StratifiedKFold 
from sklearn.impute import KNNImputer
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.decomposition import PCA


In [59]:
## Import
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn import svm
from sklearn import decomposition
from load_data import load_data

# Classifiers and kernels
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

#
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.compose import ColumnTransformer, make_column_selector as selector


In [7]:
## Loading data
data = load_data()
# The 'label' column is the target; every other column is used as a feature.
Y = data['label']
# Infinite values of either sign are converted to NaN so that they are treated
# as missing values by the later imputation step instead of being passed on
# as (non-finite) numbers. `drop` returns a new frame, so `data` is untouched.
X = data.drop(columns='label').replace([np.inf, -np.inf], np.nan)

(224, 7)
(112, 7)


In [None]:
## Split: hold out 20% of the rows as a test set, stratified on the labels
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=4,
)

In [None]:
## Remove features with too many missing values
# Fraction of the training rows that must be non-missing for a column to be kept.
acceptabele_ratio = 0.5
train_size = len(X_train.index)
# Passed to dropna as `thresh`: the minimum number of non-NA values a column
# needs in order to survive.
removal_rate = round(train_size*acceptabele_ratio)

# The decision is based on the training rows only; the test set just follows it.
X_train = X_train.dropna(axis=1, thresh=removal_rate)
# Keep the columns in the training order. Going through a set would return
# them in an arbitrary order, so train and test features would not line up.
common_cols = [col for col in X_train.columns if col in X_test.columns]
X_test = X_test[common_cols]

In [60]:
# Feature preprocessing: impute missing values, scale, then reduce dimensionality
X_transformer = Pipeline([
    ('imputer', KNNImputer()),
    ('scaler', RobustScaler()),
    # A fractional n_components together with svd_solver='full' keeps as many
    # components as are needed to explain that share of the variance.
    ('PCA', PCA(svd_solver='full', n_components=0.99)),
])

# Label preprocessing: impute, then encode
# NOTE(review): LabelEncoder is documented as an encoder for target values (y)
# and its fit/transform accept a single argument, so it will probably not work
# as a step of a Pipeline that is fitted on feature columns - confirm.
Y_transformer = Pipeline([
    ('imputer', KNNImputer()),
    ('encoder', LabelEncoder()),
])


In [56]:
# TODO: the dtype used by the column selector still needs to be adjusted.
# Only the float64 columns of X are sent through X_transformer; with
# ColumnTransformer's default remainder='drop' all other columns are discarded.
# The target is deliberately not handled here: 'label' is removed from X when
# the data is loaded, and Y_transformer ends in a LabelEncoder, which encodes
# targets (y) and cannot be fitted as a transformer on feature columns.
preprocessor = ColumnTransformer(transformers=[
        ('X', X_transformer, selector(dtype_include='float64'))])

In [None]:
# Full estimator: preprocessing followed by a classifier. The final step is
# named 'classifier' and starts out as a default LogisticRegression.
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression()),
])

In [48]:
# Candidate classifiers, keyed by the short names that `parameters` also uses.
# Estimators with internal randomness get a fixed seed (the same value as the
# train/test split) so that repeated runs produce the same results.
models = {
        'KNN': KNeighborsClassifier(),
        'DecisionTree': DecisionTreeClassifier(random_state=4),
        'RF': RandomForestClassifier(random_state=4),
        'LR': LogisticRegression(),
        'GaussianNB': GaussianNB(),
        'MLPClassifier': MLPClassifier(random_state=4),
        'SVC': SVC()
}

In [50]:
# Hyper-parameter search space per model, keyed by the same names as `models`.
# Every entry maps a parameter name of that estimator to the list of values
# that should be tried.
parameters = {
        'KNN': {
            "n_neighbors": list(range(1, 26, 2)),  # odd values 1..25
            "weights": ["uniform", "distance"],
            # 'minkowski' with the estimator's default p=2 is the same
            # distance as 'euclidean'
            "metric": ["euclidean", "manhattan", "chebyshev", "minkowski"]
        },
        'DecisionTree': {
            "criterion": ['gini', 'entropy'],
            "min_samples_split": list(range(2, 40, 2)),
            # NOTE(review): integer max_features must not exceed the number of
            # features the classifier receives - confirm against the PCA output
            "max_features": [1, 2, 3, 4, 5],
            "min_samples_leaf": list(range(1, 20, 2)),
        },
        'RF': {
            "criterion": ['gini', 'entropy'],
            "min_samples_split": list(range(2, 40, 2)),
            "max_features": [1, 2, 3, 4, 5],
            "min_samples_leaf": list(range(1, 20, 2)),
        },
        'LR': {
            "solver": ['newton-cg', 'lbfgs', 'liblinear'],
            "C": [100, 10, 1.0, 0.1, 0.01],
        },
        'GaussianNB': {
            # 101 log-spaced values from 1 down to 1e-9
            "var_smoothing": list(np.logspace(0, -9, num=101))
        },
        'MLPClassifier': {
            "hidden_layer_sizes": [(50, 50, 50), (50, 100, 50), (100,)],
            "activation": ['tanh', 'relu'],
            "solver": ['sgd', 'adam'],
            "alpha": [0.0001, 0.05],
            "learning_rate": ['constant', 'adaptive'],
        },
        # A grid like the other entries (this used to hold an SVC() estimator
        # instance, which is not a search space).
        'SVC': {
            "C": [100, 10, 1.0, 0.1, 0.01],
            "kernel": ['linear', 'rbf'],
            "gamma": ['scale', 'auto'],
        },
}

In [None]:
# Cross-validation scheme for the grid search: 5 stratified folds, taken in
# the existing row order (no shuffling).
cv_gridsearch = StratifiedKFold(n_splits=5, shuffle=False)

In [None]:
def _pipeline_param_grid(estimators, grids, step='classifier'):
    """Translate per-model search spaces into a GridSearchCV ``param_grid``.

    GridSearchCV needs a dict (or list of dicts) that maps *parameter names*
    to lists of candidate values. For a pipeline, the estimator of a step can
    itself be a parameter, and parameters of that step are addressed as
    ``<step>__<param>``.

    Parameters
    ----------
    estimators : dict
        Model name -> estimator instance.
    grids : dict
        Model name -> dict of ``{param_name: [values, ...]}``. A missing entry
        or an entry that is not a dict means "no hyper-parameters to tune";
        that model is then only evaluated with its current settings.
    step : str
        Name of the pipeline step that the estimators are plugged into.

    Returns
    -------
    list of dict
        One grid per model.
    """
    param_grid = []
    for model_name, estimator in estimators.items():
        grid = grids.get(model_name)
        if not isinstance(grid, dict):
            grid = {}
        # Swap the estimator into the pipeline step ...
        entry = {step: [estimator]}
        # ... and tune its hyper-parameters through the step prefix.
        for param, values in grid.items():
            entry[step + '__' + param] = values
        param_grid.append(entry)
    return param_grid


# `parameters` is keyed by model name, which GridSearchCV cannot interpret as
# parameter names of `clf`, so it is converted to one grid per model first.
grid_search = GridSearchCV(clf, _pipeline_param_grid(models, parameters), cv=cv_gridsearch, scoring='roc_auc')