# TM10007 Assignment template

## Data loading and cleaning

Below are functions to load the dataset of your choice. After that, it is all up to you to create and evaluate a classification method. Beware, there may be missing values in these datasets. Good luck!

In [18]:
# import packages
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.feature_selection import VarianceThreshold
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Classifiers
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from worclipo.load_data import load_data

### functions

In [19]:
# function to split the dataset into train and test
def split_set(X,y,test_size):

    if os.path.exists('./TEST_set.csv'):
        split_action = print('TEST_set.csv already exists')
    else:
        split_action = print('TEST_set.csv does not exist, generating new test and training sets')
        X_train_csv, X_test_csv, y_train_csv, y_test_csv = train_test_split(X, y, test_size=test_size, random_state=10)

        TESTSET = X_test_csv.merge(y_test_csv, left_index=True, right_index=True)
        TESTSET.to_csv('TEST_set.csv')

        TRAINSET = X_train_csv.merge(y_train_csv, left_index=True, right_index=True)
        TRAINSET.to_csv('TRAIN_set.csv')
        return split_action
    

# setting up the data to be processed

In [20]:
data = load_data()
print(f'The number of samples: {len(data.index)}')
print(f'The number of columns: {len(data.columns)}')
print(type(data))

# change lipoma = 1 and liposarcoma = 0 and encode labels
group_names = list(set(data.label))
data.loc[data['label'] == 'lipoma', 'label'] = 1
data.loc[data['label'] == 'liposarcoma', 'label'] = 0
data['label'] = pd.cut(data['label'], bins = 2, labels=group_names)
print(data['label'].unique())
label_diag = LabelEncoder()
data['label'] = label_diag.fit_transform(data['label'])

# assign X to measurements and y to outcome (lipoma/sarcoma)
X = data.drop('label', axis=1)
y = data['label']
test_size = 0.3

The number of samples: 115
The number of columns: 494
<class 'pandas.core.frame.DataFrame'>
['liposarcoma', 'lipoma']
Categories (2, object): ['liposarcoma' < 'lipoma']


In [21]:
# code that splits the data into test and validation sets if this is not done already
split_set(X,y,test_size)

TEST_set.csv already exists


## import the training set

In [23]:
TRAIN = pd.read_csv('TRAIN_set.csv', index_col=0)
X_train = TRAIN.drop('label', axis=1)
y_train = TRAIN['label']

TRAIN['label'] = pd.cut(TRAIN['label'], bins = 2, labels=group_names)
print(TRAIN['label'].unique())
label_diag = LabelEncoder()
TRAIN['label'] = label_diag.fit_transform(TRAIN['label'])

# split into training and validation set
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.3, random_state=10)

from sklearn.pipeline import Pipeline

print(f'Size before removal zero variances: ', X_train.shape)
pipe_preprocess = Pipeline([
    ("check nan", SimpleImputer(missing_values=np.nan, strategy='mean')),
    ("scale", StandardScaler()),
    ("variance", VarianceThreshold(threshold=0))
])

X_train = pipe_preprocess.fit_transform(X_train)
X_valid = pipe_preprocess.transform(X_valid)
print(f'Size after removal zero variance: ', X_train.shape)



['liposarcoma', 'lipoma']
Categories (2, object): ['liposarcoma' < 'lipoma']
Size before removal zero variances:  (56, 493)
Size after removal zero variance:  (56, 474)
[[ 5.08835245e-01 -4.01195997e-01 -9.64928150e-01 ... -6.42103454e-01
   8.19756814e-01 -1.51026524e+00]
 [ 1.10985503e+00 -1.05314931e+00  1.57770125e+00 ... -1.84879419e-01
  -8.09784196e-01  7.59348456e-01]
 [ 7.27813145e-01 -1.04048580e+00 -9.75735731e-01 ... -5.45729694e-01
   6.90270135e-01 -8.67337800e-01]
 ...
 [-1.56661213e+00  1.49742162e+00  1.46879587e+00 ...  2.56633483e+00
  -1.36385565e-01  1.71512577e+00]
 [ 1.17039088e+00 -5.97218646e-01 -1.35640512e+00 ... -6.95018547e-01
  -6.25831253e-01 -3.17769751e+00]
 [ 2.85698868e-01  8.36518076e-01  8.90420545e-02 ... -5.41305012e-01
  -9.61163555e-01 -2.84981725e-03]]


# Classify