In [25]:
# Run this cell when working in a Colab environment
#!pip install -q --upgrade git+https://github.com/naomiverkerk/TM10007.git


In [49]:
## Import
import statistics

import numpy as np
import pandas as pd
from sklearn import decomposition
from sklearn import feature_selection
from sklearn import metrics
from sklearn import model_selection
from sklearn import neighbors
from sklearn import preprocessing
from sklearn import svm
from sklearn.model_selection import train_test_split

# Classifiers and kernels
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

from load_data import load_data



In [59]:
## Loading data
data = load_data()
# Class labels are taken from the 'label' column; `data` itself is left unmodified.
Y = data['label']
# Feature matrix without the label column. Both +inf and -inf are turned into NaN so
# that they are handled as missing values; a remaining -inf would otherwise be mapped
# to a huge negative number by the np.nan_to_num calls further down the notebook.
X = data.drop(columns='label').replace([np.inf, -np.inf], np.nan)

In [60]:
## Split
# Hold out 20% of the samples as a test set; the split is stratified on the labels
# and seeded so that it is reproducible.
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=4,
)

In [61]:

## Remove features with too many missing values
# Fraction of the training samples that must have a value for a feature to be kept.
acceptabele_ratio = 0.5
train_size = len(X_train.index)
# Minimum number of non-missing training values per feature (the `thresh` of dropna).
removal_rate = round(train_size*acceptabele_ratio)

X_train = X_train.dropna(axis=1, thresh=removal_rate)
# Keep the same features in the test set, in the SAME ORDER as in the training set.
# Both frames are converted to plain arrays in the following cells, so the scaler and
# PCA match features purely by position; going through a set() would shuffle the
# test columns arbitrarily.
common_cols = [col for col in X_train.columns if col in X_test.columns]
X_test = X_test[common_cols]

In [92]:
## Imputation with median
# The per-feature medians are estimated on the training set only and reused for the
# test set, so no information from the held-out data enters the preprocessing.
# fillna with a Series matches the medians to the columns by name.
train_median = X_train.median()
# np.nan_to_num returns plain arrays and sets any value that is still missing to 0.
X_train_missing_median = np.nan_to_num(X_train.fillna(train_median))
X_test_missing_median = np.nan_to_num(X_test.fillna(train_median))

In [93]:
## Scaling
# The scaler is fitted on the training data only and then applied to both sets.
scaler = preprocessing.RobustScaler().fit(X_train_missing_median)
# nan_to_num guards against non-finite values coming out of the transformation.
X_train_scaled = np.nan_to_num(scaler.transform(X_train_missing_median))
X_test_scaled = np.nan_to_num(scaler.transform(X_test_missing_median))

In [95]:
## PCA
# A fractional n_components together with the 'full' solver asks PCA for the number
# of components needed to explain that share (here 99%) of the training variance.
pca = decomposition.PCA(svd_solver='full', n_components=0.99).fit(X_train_scaled)
X_train_final, X_test_final = (
    pca.transform(X_train_scaled),
    pca.transform(X_test_scaled),
)
# Cumulative explained variance per retained component.
explained_variance = pca.explained_variance_ratio_.cumsum()


[0.94191425 0.97492092 0.98729222 0.9899666  0.99189023]


## Random Forest

In [None]:



def randomforest(X_train_final, Y_train_final):
    """Tune and evaluate a random forest with nested stratified cross-validation.

    The data is split into 5 stratified outer folds. For every outer fold a grid
    search (5-fold stratified, scored on ROC AUC) selects the best hyperparameters
    on the outer-training part; the resulting classifier is then scored on the
    held-out part of the fold and, for comparison, on the data it was tuned on.
    A boxplot of the AUCs per set is drawn and, per hyperparameter, the median of
    the values selected over the outer folds is printed.

    Parameters
    ----------
    X_train_final : array-like
        Feature matrix, one row per sample.
    Y_train_final : array-like or pandas.Series
        Class labels, one per row of ``X_train_final``.

    Returns
    -------
    results : pandas.DataFrame
        Two rows per outer fold with the columns ``auc``, ``criterion``,
        ``min_samples_split``, ``max_features``, ``min_samples_leaf`` and ``set``.
        ``set == 'test'`` is the held-out part of the fold; ``set == 'validation'``
        is the data the grid search was fitted on (so that AUC is optimistic).
    optimal_parameters : dict
        Median of the selected value per hyperparameter over the ``'test'`` rows.
    """
    # Work on plain arrays: the fold indices produced by the splitter are positions,
    # while indexing a pandas Series with [] looks the numbers up as index labels.
    features = np.asarray(X_train_final)
    labels = np.asarray(Y_train_final)

    n_features = features.shape[1]
    parameters = {
        "criterion": ['gini', 'entropy'],
        "min_samples_split": list(range(2, 40, 2)),
        # Never ask for more features per split than the data provides.
        "max_features": list(range(1, min(5, n_features) + 1)),
        "min_samples_leaf": list(range(1, 20, 2)),
    }

    outer_cv = model_selection.StratifiedKFold(n_splits=5)
    results = []

    for train_index, test_index in outer_cv.split(features, labels):
        X_fold_train, y_fold_train = features[train_index], labels[train_index]
        X_fold_test, y_fold_test = features[test_index], labels[test_index]

        # Inner loop: hyperparameter selection on the outer-training part only.
        inner_cv = model_selection.StratifiedKFold(n_splits=5)
        grid_search = model_selection.GridSearchCV(
            RandomForestClassifier(), parameters, cv=inner_cv, scoring='roc_auc'
        )
        grid_search.fit(X_fold_train, y_fold_train)

        # Get resulting classifier
        clf = grid_search.best_estimator_
        print(f'Best classifier for criterion={clf.criterion} & min_samples_split={clf.min_samples_split} & max_features={clf.max_features} & min_samples_leaf={clf.min_samples_leaf}')

        # Score the tuned classifier on the held-out part first, then on the data
        # it was tuned on; the second column of predict_proba is used as the score.
        for set_name, X_part, y_part in (
            ('test', X_fold_test, y_fold_test),
            ('validation', X_fold_train, y_fold_train),
        ):
            scores = clf.predict_proba(X_part)[:, 1]
            results.append({
                'auc': metrics.roc_auc_score(y_part, scores),
                'criterion': clf.criterion,
                'min_samples_split': clf.min_samples_split,
                'max_features': clf.max_features,
                "min_samples_leaf": clf.min_samples_leaf,
                'set': set_name
            })

    results = pd.DataFrame(results)
    # pandas' own boxplot is used so that no plotting package beyond the notebook's
    # imports is required.
    results.boxplot(column='auc', by='set')

    # Summarise the hyperparameters over the outer folds: one 'test' row per fold.
    test_results = results[results['set'] == 'test']
    optimal_parameters = {}
    for item in parameters:
        optimal_parameters[item] = statistics.median(test_results[item])
        print(f"The optimal {item}={optimal_parameters[item]}")

    return results, optimal_parameters
  
   

## Decision tree

## K-nearest neighbour

## Logistic regression

## Naive Bayes

## SVM

## Neural Network