# TM10007 Group Assignment Machine Learning
#### Sara Arman, Judith Essenburg, George Franssen, Naomi Verkerk

## Google colab environment

In [65]:
# Run this to use from colab environment
#!pip install -q --upgrade git+https://github.com/naomiverkerk/TM10007.git

## Import


In [44]:
from sklearn import model_selection
from sklearn import metrics
from sklearn import feature_selection 
from sklearn import preprocessing
from sklearn import neighbors
from sklearn import svm
from sklearn import decomposition
from sklearn.utils.validation import check_array

from load_data import load_data
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split

# Classifiers
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier


## Loading Data

In [45]:
# Moeten hier nu wel even checken of X niet bevat in welke groep hij behoort
data = load_data() 
X = data
Y = data['label']

# Dit betekent dat ik nu label eruit heb gehaald voor input zodat missing values werkt
# En ook even andere 2 categorical variables zodat het scalen werkt
X.pop('label')
X.pop('VOLUME_ET_OVER_ED')
X.pop('VOLUME_NET_OVER_ED')



print(f'The number of samples/patients: {len(data.index)}')
print(f'The number of columns/features: {len(data.columns)}')


The number of samples/patients: 167
The number of columns/features: 722


## Splitting in train and test data

In [46]:
# the code to split, after that we inspect the data

# Misschien hier nog stratify = Y gebruiken? Zag ik in voorbeeld
X_train, X_test, Y_train, Y_test= train_test_split(X, Y, test_size = 0.2, random_state = 4)


## Checking/exploring the dataset

In [47]:
data.dtypes == 'object'
# Seperate numerical variables and categorical variables
num_vars = data.columns[data.dtypes != 'object']
cat_vars = data.columns[data.dtypes == 'object']

#print(len(num_vars))
#print(len(cat_vars))
#print(cat_vars)

In [48]:
# to check which features have most missing values
missing_values = data[num_vars].isnull().sum().sort_values(ascending=False)
missing_values_multiple = (missing_values > 0).sum()

#print(missing_values)
#print(f'The total number of features with 1 or more missing values is {missing_values_multiple}')




## Imputation --> Missing values

In [49]:
# In order to do scaling and PCA, first the missing values need to be filled in. 
# With scaling, it appeared that some values are infinite, so that is why those will be removed with a large finite number


X_train_missing_mean = X_train.fillna(X_train.mean())
X_train_missing_mean = np.nan_to_num(X_train_missing_mean)
X_test_missing_mean = X_test.fillna(X_test.mean())
X_test_missing_mean = np.nan_to_num(X_test_missing_mean)




## Scaling

In [50]:
scaler = preprocessing.StandardScaler()
scaler.fit(X_train_missing_mean)
X_train_scaled = scaler.transform(X_train_missing_mean)
X_train_scaled = np.nan_to_num(X_train_scaled)
X_test_scaled = scaler.transform(X_test_missing_mean)
X_test_scaled = np.nan_to_num(X_test_scaled)


  temp **= 2
  new_unnormalized_variance -= correction ** 2 / new_sample_count
  new_unnormalized_variance -= correction ** 2 / new_sample_count
  upper_bound = n_samples * eps * var + (n_samples * mean * eps) ** 2


## Perform a PCA

In [51]:
pca = decomposition.PCA(n_components=2)
pca.fit(X_train_scaled)
X_train_pca = pca.transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

print(type(X_train_pca))
print(type(Y_train))

<class 'numpy.ndarray'>
<class 'pandas.core.series.Series'>


## Basic Classifiers

In [55]:
basic_classifiers = [LinearDiscriminantAnalysis(), QuadraticDiscriminantAnalysis(), GaussianNB(), LogisticRegression(), KNeighborsClassifier()]
# De SGDClassifier() erbij lukt me nog niet helemaal, dan krijg ik allemaal errors
classifiers_fit = list()

for classifier in basic_classifiers:
    classifier.fit(X_train_pca, Y_train)
    Y_predicted = classifier.predict(X_train_pca)
    if hasattr(classifier,'predict_proba'):
        Y_score = classifier.predict_proba(X_train_pca)[:,1]
    else:
        Y_score = Y_predicted
   
    # Calculate some quantifiable things for classifier
    label = 'GBM'
    auc=metrics.roc_auc_score(Y_train, Y_score)
    accuracy=metrics.accuracy_score(Y_train, Y_predicted)
    F1=metrics.f1_score(Y_train, Y_predicted, pos_label=label)
    precision=metrics.precision_score(Y_train, Y_predicted, pos_label=label)
    recall=metrics.recall_score(Y_train, Y_predicted, pos_label=label)


# accuracy, AUC, f1score, precision, recall
    print(type(classifier))
    print('Acc:' +str(accuracy))
    print('AUC:' +str(auc))
    print('F1:' +str(F1))
    print('precision:' +str(precision))
    print('recall:' +str(recall))

<class 'sklearn.discriminant_analysis.LinearDiscriminantAnalysis'>
Acc:0.8270676691729323
AUC:0.8983739837398375
F1:0.8715083798882682
precision:0.8041237113402062
recall:0.9512195121951219
<class 'sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis'>
Acc:0.8646616541353384
AUC:0.9218077474892395
F1:0.8965517241379309
precision:0.8478260869565217
recall:0.9512195121951219
<class 'sklearn.naive_bayes.GaussianNB'>
Acc:0.8195488721804511
AUC:0.9084170253467241
F1:0.8666666666666666
precision:0.7959183673469388
recall:0.9512195121951219
<class 'sklearn.linear_model._logistic.LogisticRegression'>
Acc:0.8571428571428571
AUC:0.8947871831659493
F1:0.8875739644970414
precision:0.8620689655172413
recall:0.9146341463414634
<class 'sklearn.neighbors._classification.KNeighborsClassifier'>
Acc:0.8721804511278195
AUC:0.9450023912003825
F1:0.9017341040462428
precision:0.8571428571428571
recall:0.9512195121951219
