# TM10007 Assignment template

## Data loading and cleaning

Below are functions to load the dataset of your choice. After that, it is all up to you to create and evaluate a classification method. Beware, there may be missing values in these datasets. Good luck!

In [23]:
# import packages
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import VarianceThreshold

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import *
from sklearn.decomposition import PCA


# Classifiers
from worclipo.load_data import load_data

### functions

In [24]:
# function to split the dataset into train and test
def split_set(X,y,test_size):

    if os.path.exists('./TEST_set.csv'):
        split_action = print('TEST_set.csv already exists')
    else:
        split_action = print('TEST_set.csv does not exist, generating new test and training sets')
        X_train_csv, X_test_csv, y_train_csv, y_test_csv = train_test_split(X, y, test_size=test_size, random_state=10)

        TESTSET = X_test_csv.merge(y_test_csv, left_index=True, right_index=True)
        TESTSET.to_csv('TEST_set.csv')

        TRAINSET = X_train_csv.merge(y_train_csv, left_index=True, right_index=True)
        TRAINSET.to_csv('TRAIN_set.csv')
        return split_action
    

# setting up the data to be processed

In [25]:
data = load_data()
print(f'The number of samples: {len(data.index)}')
print(f'The number of columns: {len(data.columns)}')
print(type(data))

# change lipoma = 1 and liposarcoma = 0 and encode labels
group_names = list(set(data.label))
data.loc[data['label'] == 'lipoma', 'label'] = 1
data.loc[data['label'] == 'liposarcoma', 'label'] = 0
data['label'] = pd.cut(data['label'], bins = 2, labels=group_names)
print(data['label'].unique())
label_diag = LabelEncoder()
data['label'] = label_diag.fit_transform(data['label'])

# assign X to measurements and y to outcome (lipoma/sarcoma)
X = data.drop('label', axis=1)
y = data['label']
test_size = 0.3

The number of samples: 115
The number of columns: 494
<class 'pandas.core.frame.DataFrame'>
['liposarcoma', 'lipoma']
Categories (2, object): ['liposarcoma' < 'lipoma']


In [26]:
# code that splits the data into test and validation sets if this is not done already
split_set(X,y,test_size)

TEST_set.csv already exists


## import the training set

In [27]:
TRAIN = pd.read_csv('TRAIN_set.csv', index_col=0)
X_train = TRAIN.drop('label', axis=1)
y_train = TRAIN['label']

TRAIN['label'] = pd.cut(TRAIN['label'], bins = 2, labels=group_names)
print(TRAIN['label'].unique())
label_diag = LabelEncoder()
TRAIN['label'] = label_diag.fit_transform(TRAIN['label'])

# split into training and validation set
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.3, random_state=10)

print(f'Size before preprocess: ', X_train.shape)


['liposarcoma', 'lipoma']
Categories (2, object): ['liposarcoma' < 'lipoma']
Size before preprocess:  (56, 493)


# Classify

In [33]:
N_FEATURES_OPTIONS = [15, 18, 19, 20, 22, 25]

# set up pipeline steps
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
scaler = StandardScaler()
variance = VarianceThreshold(threshold=0)
pca = PCA()

# set up classifiers
clf1 = RandomForestClassifier(random_state=1)
clf2 = SVC(probability=True, random_state=1, max_iter=1000)
clf3 = LogisticRegression(random_state=1)
clf4 = DecisionTreeClassifier(random_state=1)
clf5 = KNeighborsClassifier()
clf6 = MultinomialNB()

pipe = Pipeline(steps=[('imputer', imputer), ('scaler', scaler), ('variance', variance), ('pca', pca), ('classifier', clf2)])

# set up parameters
param1 = {}
param1['classifier__n_estimators'] = [1,10, 50, 100, 250]
param1['classifier__max_depth'] = [5, 10, 20]
param1['classifier__class_weight'] = [None, {0:1,1:5}, {0:1,1:10}, {0:1,1:25}]
param1['pca__n_components'] = N_FEATURES_OPTIONS
param1['classifier'] = [clf1]

param2 = {}
param2['classifier__C'] = [0.1, 1, 10, 100]
param2['classifier__class_weight'] = [None]
param2['classifier__kernel'] = ['sigmoid']
param2['pca__n_components'] = N_FEATURES_OPTIONS
param2['classifier'] = [clf2]

param3 = {}
param3['classifier__C'] = [10**-2, 10**-1, 10**0, 10**1, 10**2]
param3['classifier__penalty'] = [None, 'l2']
param3['classifier__class_weight'] = [None, {0:1,1:5}, {0:1,1:6}, {0:1,1:7}]
param3['pca__n_components'] = N_FEATURES_OPTIONS
param3['classifier'] = [clf3]

param4 = {}
param4['classifier__max_depth'] = [5,10,25,None]
param4['classifier__min_samples_split'] = [2,5,10]
param4['classifier__class_weight'] = [{0:1,1:2}, {0:1,1:3}, {0:1,1:4}, {0:1,1:5}]
param4['pca__n_components'] = N_FEATURES_OPTIONS
param4['classifier'] = [clf4]

param5 = {}
param5['classifier__n_neighbors'] = [2,5,10,25,50]
param5['pca__n_components'] = N_FEATURES_OPTIONS
param5['classifier'] = [clf5]

param6 = {}
param6['classifier__alpha'] = [10**0, 10**1, 10**2]
param6['pca__n_components'] = N_FEATURES_OPTIONS
param6['classifier'] = [clf6]

param_grid = [param1, param2, param3, param4, param5, param6]

grid = GridSearchCV(pipe, param_grid, n_jobs=-1, cv=5, scoring='accuracy')
grid.fit(X_train, y_train)
print(grid.best_params_)
print(grid.best_estimator_)
print(grid.best_score_)

# Test data performance
print("Test Precision:",precision_score(grid.predict(X_valid), y_valid))
print("Test Recall:",recall_score(grid.predict(X_valid), y_valid))
print("Test ROC AUC Score:",roc_auc_score(grid.predict(X_valid), y_valid))

best_clf = grid.best_estimator_
best_clf.fit(X_train,y_train)
pred_rfc = best_clf.predict(X_valid)

print('rfc', classification_report(y_valid, pred_rfc))
print(confusion_matrix(y_valid, pred_rfc))

results = pd.DataFrame(grid.cv_results_)
results = results.sort_values(by=['rank_test_score'])
results.to_csv('results')

print('GRID 2')
grid2 = GridSearchCV(pipe, param_grid, n_jobs=-1, cv=5, scoring='recall')
grid2.fit(X_train, y_train)
best_clf_2 = grid2.best_estimator_
best_clf_2.fit(X_train, y_train)
pred_rfc_2 = best_clf_2.predict(X_valid)
print('rfc2', classification_report(y_valid, pred_rfc_2))
print(confusion_matrix(y_valid, pred_rfc_2))

print('ENSEMBLE')
vc = VotingClassifier([('clf1', best_clf), ('clf2', best_clf_2)], voting='soft')
best_clf_3 = vc.fit(X_train, y_train)
pred_rfc_3 = best_clf_3.predict(X_valid)
print('rfc3', classification_report(y_valid, pred_rfc_3))
print(confusion_matrix(y_valid, pred_rfc_3))


90 fits failed out of a total of 4800.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
90 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\ethie\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\ethie\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\sklearn\pipeline.py", line 405, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\ethie\AppData\Local\Packages\PythonSoftwareFoundation.Pyth

{'classifier': SVC(C=10, kernel='sigmoid', max_iter=1000, probability=True, random_state=1), 'classifier__C': 10, 'classifier__class_weight': None, 'classifier__kernel': 'sigmoid', 'pca__n_components': 19}
Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', StandardScaler()),
                ('variance', VarianceThreshold(threshold=0)),
                ('pca', PCA(n_components=19)),
                ('classifier',
                 SVC(C=10, kernel='sigmoid', max_iter=1000, probability=True,
                     random_state=1))])
0.7303030303030302
Test Precision: 0.6153846153846154
Test Recall: 0.8888888888888888
Test ROC AUC Score: 0.7777777777777778
rfc               precision    recall  f1-score   support

           0       0.67      0.91      0.77        11
           1       0.89      0.62      0.73        13

    accuracy                           0.75        24
   macro avg       0.78      0.76      0.75        24
weighted avg       0.79      0.75      0.75        24

[[10

90 fits failed out of a total of 4800.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
90 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\ethie\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\ethie\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\sklearn\pipeline.py", line 405, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\ethie\AppData\Local\Packages\PythonSoftwareFoundation.Pyth