# TM10007 Assignment template

In [1]:
# Uncomment and run the line below when working in a Google Colab environment
#!pip install -q --upgrade git+https://github.com/karinvangarderen/tm10007_project.git

## Data loading and cleaning

Below are functions to load the dataset of your choice. After that, it is all up to you to create and evaluate a classification method. Beware, there may be missing values in these datasets. Good luck!

In [2]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer

from brats.load_data import load_data

In [3]:
# Load the dataset and report its dimensions (one row per sample).

data = load_data()
n_samples, n_columns = data.shape
print(f'The number of samples: {n_samples}')
print(f'The number of columns: {n_columns}')


The number of samples: 167
The number of columns: 725


In [4]:
# Split the data into the two classes, GBM and LGG.

GBM = data.loc[data['label'] == 'GBM']
LGG = data.loc[data['label'] == 'LGG']

# Check that every feature has at least one non-missing value per class.
# `DataFrame.count()` returns the number of non-NaN entries per column, so a
# count of 0 means the feature is completely empty within that class.
# (This replaces `DataFrame.iteritems()`, which was removed in pandas 2.0.)

empty_feature = []

for class_name, class_frame in (('GBM', GBM), ('LGG', LGG)):
    non_missing_per_feature = class_frame.count()
    for labels in non_missing_per_feature.index[non_missing_per_feature == 0]:
        empty_feature.append(labels)
        print(f'The feature {labels} does not have any values in the {class_name} dataset')

# Remove the features that have no data for at least one class.
# A feature that is empty in both classes appears twice in `empty_feature`,
# so the names are de-duplicated (order-preserving) before dropping.

data_empty_removed = data.drop(columns=list(dict.fromkeys(empty_feature)))



The feature TGM_Cog_X_6 does not have any values in the GBM dataset
The feature TGM_Cog_Y_6 does not have any values in the GBM dataset
The feature TGM_Cog_Z_6 does not have any values in the GBM dataset
The feature TGM_T_6 does not have any values in the GBM dataset


In [5]:
# Count the number of NaNs row-wise and column-wise.
# Column counting is done on the full data, and for GBM and LGG separately:
# it can be useful to see which class the NaNs come from.
# This should/might be taken into account when selecting the features.

no_nan_row = data.isnull().sum(axis=1)
no_nan_col = data.isnull().sum(axis=0)
GBM_no_nan_col = GBM.isnull().sum(axis=0)
LGG_no_nan_col = LGG.isnull().sum(axis=0)

# The column headers carry the sample count of each subset. The counts are
# derived from the frames themselves so they cannot go stale when the
# dataset (or the class split) changes.
frame = {
    f'Total (n={len(data)})': no_nan_col,
    f'GBM (n={len(GBM)})': GBM_no_nan_col,
    f'LGG (n={len(LGG)})': LGG_no_nan_col,
}
overview_nan = pd.DataFrame(frame)

# The series below 'bin' the NaN counts:
# - the index is a number of NaNs found in a feature
# - the value is the number of features that have exactly that many NaNs

aantal_NAN_GBM = GBM_no_nan_col.value_counts()
aantal_NAN_LGG = LGG_no_nan_col.value_counts()
aantal_NAN_total = no_nan_col.value_counts()


In [6]:
def _knn_impute_class(frame, class_label, knn_imputer):
    """Impute the missing feature values of one class with a KNN imputer.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data containing a 'label' column plus the feature columns.
    class_label : str
        Value of the 'label' column selecting the rows to impute.
    knn_imputer : object with a ``fit_transform`` method
        The imputer; it is (re)fitted on the selected rows only.

    Returns
    -------
    tuple
        ``(imputed_frame, imputed_values)``: the selected rows with the
        imputed feature values and the 'label' column restored (appended as
        the last column), and the raw array returned by ``fit_transform``.
    """
    features = frame.loc[frame['label'] == class_label].drop(columns=['label'])
    imputed_values = knn_imputer.fit_transform(features)
    # Write the imputed values back in place so index and column names are kept.
    features[:] = imputed_values
    features['label'] = class_label
    return features, imputed_values


# Imputation is done per class, so neighbours are only searched within the
# same label.
# NOTE(review): this uses the label during preprocessing and happens before
# any train/test split visible here - confirm this does not leak information
# into the evaluation.

imputer = KNNImputer(n_neighbors=5, weights="uniform")

# Imputation of the GBM dataset
GBM_imputed, array_imp_GBM = _knn_impute_class(data_empty_removed, 'GBM', imputer)

# Imputation of the LGG dataset
LGG_imputed, array_imp_LGG = _knn_impute_class(data_empty_removed, 'LGG', imputer)

# Combining the datasets again.
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the
# replacement and keeps the original row index, as `append` did.

data_imputed = pd.concat([GBM_imputed, LGG_imputed])