# TM10007 Assignment template

## Data loading and cleaning

Below are functions to load the dataset of your choice. After that, it is all up to you to create and evaluate a classification method. Beware, there may be missing values in these datasets. Good luck!

In [2]:
# # Run this to use from colab environment
# !pip install -q --upgrade git+https://github.com/karinvangarderen/tm10007_project.git

In [17]:
from load_data import load_data
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFECV
import seaborn
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from scipy import stats
from statsmodels.stats import weightstats
from sklearn.svm import SVC
import matplotlib.pyplot as plt

data = load_data()
print(f'The number of samples: {len(data.index)}')
print(f'The number of columns: {len(data.columns)}')

features = data.drop(columns=['label'])
label = data.label

# Splitting data in train and test group
X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=.2)

# functie van maken??
y_train_bin = []
for val in y_train:
  if val == 'T12':
    y_train_bin.append(0)
  else:
    y_train_bin.append(1) 


The number of samples: 113
The number of columns: 160


## Preprocessing

### Scaling

In [18]:
# Scale the dataset
scaler = RobustScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Feature selection

### T-test

In [19]:
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns = X_train.columns) # make df from numpy
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns = X_train.columns)
X_train_scaled_df['Label'] = y_train_bin
X_train_T12 = X_train_scaled_df.groupby('Label').get_group(0)
X_train_T34 = X_train_scaled_df.groupby('Label').get_group(1)
X_train_T12 = X_train_T12.drop(columns = ['Label'])
X_train_T34 = X_train_T34.drop(columns = ['Label'])

# ttest
_,pval = stats.ttest_ind(X_train_T12,X_train_T34)

sig_feat = []
for id, val in enumerate(pval):
  if val < 0.05/X_train_scaled_df.shape[1]:
    sig_feat.append(list(X_train.columns)[id])
print(f'Number of significant different features: {len(sig_feat)}')

X_train_sig = X_train_scaled_df[sig_feat]
X_test_sig = X_test_scaled_df[sig_feat]

# # Pairplot of sign features
# X_train_sig.columns =['Feature'+ str(pc) for pc in range(1,len(sig_feat)+1)]
# X_train_sig['Grade'] = y_train_bin
# pair_plot = seaborn.pairplot(X_train_sig, hue = 'Grade')

Number of significant different features: 22


### RFECV

In [26]:
# Create the RFE object and compute a cross-validated score.
svc = SVC(kernel="linear")
# svc = RandomForestClassifier()

# classifications
rfecv = RFECV(
    estimator=svc, step=1, 
    cv=StratifiedKFold(2),
    scoring='roc_auc')
rfecv.fit(X_train_scaled, y_train_bin)

transf_train = rfecv.transform(X_train_scaled)
transf_test = rfecv.transform(X_test_scaled)

# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
plt.show()

print(transf_train.shape)
print(transf_test.shape)



TypeError: __init__() missing 1 required positional argument: 'estimator'

In [34]:
rfecv.estimator_.Ranking_

AttributeError: 'RandomForestClassifier' object has no attribute 'Ranking_'

## PCA

In [11]:
N_COMP = 10
pca = PCA(n_components=N_COMP)
pca.fit(X_train_sig)
X_train_pca = pca.transform(X_train_sig)
X_test_pca = pca.transform(X_test_sig)


# seaborn.scatterplot(x=X_train_pca[:,0],y=X_train_pca[:,1],hue=y_train)
# scatter_data = pd.DataFrame(X_train_pca[:,:], columns = ['Principal component' + str(pc) for pc in range(1,N_COMP+1)])
# scatter_data['Stage'] = y_train_bin
# seaborn.pairplot(scatter_data, hue = 'Stage')
# print(scatter_data)

# Classifier

In [12]:
# kNN
knn = KNeighborsClassifier(n_neighbors=10)
knn.fit(X_train_pca, y_train)
score_train_kNN = knn.score(X_train_pca, y_train)
score_test_kNN = knn.score(X_test_pca, y_test)
print(f"Training result kNN: {score_train_kNN}")
print(f"Test result kNN: {score_test_kNN}")


# RF with PCA
# clf = RandomForestClassifier(n_estimators=5, bootstrap=True)
# clf.fit(X_train_pca, y_train)
# score_train_RF = clf.score(X_train_pca, y_train)
# score_test_RF = clf.score(X_test_pca, y_test)
# print(f"Training result Random Forest: {score_train_RF}")
# print(f"Test result Random Forest: {score_test_RF}")


# RF without PCA
clf = RandomForestClassifier(n_estimators=5, bootstrap=True)
clf.fit(X_train_scaled, y_train)
score_train_RF = clf.score(X_train_scaled, y_train)
score_test_RF = clf.score(X_test_scaled, y_test)
print(f"Training result Random Forest: {score_train_RF}")
print(f"Test result Random Forest: {score_test_RF}")

# Print result

# print(f"Test result: {score_test}")

Training result kNN: 0.7333333333333333
Test result kNN: 0.7391304347826086
Training result Random Forest: 0.9777777777777777
Test result Random Forest: 0.7391304347826086


In [None]:
-