# TM10007 Assignment template

In [1]:
# Run this to use from colab environment
#!pip install -q --upgrade git+https://github.com/karinvangarderen/tm10007_project.git

## Data loading and cleaning

Below are functions to load the dataset of your choice. After that, it is all up to you to create and evaluate a classification method. Beware, there may be missing values in these datasets. Good luck!

In [2]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer

from brats.load_data import load_data

In [3]:
# Read the dataset through the imported loader and report its dimensions:
# rows are counted via the index, columns via the column labels.

data = load_data()
n_samples = len(data.index)
n_columns = len(data.columns)
print(f'The number of samples: {n_samples}')
print(f'The number of columns: {n_columns}')


The number of samples: 167
The number of columns: 725


In [4]:
"""     Method A: selection based on the number of values that you DO have
        --> Threshold = number of required values that you want             """

# Split the data into the two classes (GBM and LGG)

GBM = data.loc[data['label'] == 'GBM']
LGG = data.loc[data['label'] == 'LGG']

# Minimum number of non-missing values a feature must have within each class

threshold_GBM = 15
threshold_LGG = 15

# Non-missing values per column, computed once per class.
# (DataFrame.iteritems() was removed in pandas 2.0, so the per-column loop
#  is replaced by the equivalent vectorised DataFrame.count().)

GBM_value_counts = GBM.count()
LGG_value_counts = LGG.count()

# Columns failing the GBM threshold come first, then those failing the LGG
# threshold. A column failing both appears twice in this list; the
# duplicates are removed (via set) when the columns are dropped below.

features_count_removed = (
    GBM_value_counts[GBM_value_counts < threshold_GBM].index.tolist()
    + LGG_value_counts[LGG_value_counts < threshold_LGG].index.tolist()
)

# Remove every feature that has fewer than the required number of values
# in at least one of the two classes

data_A_removed = data.drop(columns=list(set(features_count_removed)))



In [5]:
"""     Method B: selection based on the number of NaN's
        --> Threshold = the maximum number of NaN's in a column    """

# Count the NaN's row-wise and column-wise.
# Column counts are computed on the full data and for GBM and LGG separately:
# it can be useful to see which class the NaN's come from, and this might be
# taken into account when selecting the features.

no_nan_row = data.isnull().sum(axis=1)
no_nan_col = data.isnull().sum(axis=0)
GBM_no_nan_col = GBM.isnull().sum(axis=0)
LGG_no_nan_col = LGG.isnull().sum(axis=0)

# Overview table: one row per column of `data`, with its NaN count in total
# and per class.
frame = {'Total': no_nan_col, 'GBM': GBM_no_nan_col, 'LGG': LGG_no_nan_col}
overview_nan = pd.DataFrame(frame)

# Maximum number of NaN's a feature may have within each class.
# NOTE: these names are also assigned in the Method A cell, where they mean
# a minimum number of present values; running this cell overwrites them.
threshold_GBM = 10
threshold_LGG = 10

# Features exceeding the GBM limit come first, then those exceeding the LGG
# limit. A feature exceeding both appears twice; the duplicates are removed
# (via set) when the columns are dropped below.
features_NAN_removed = (
    GBM_no_nan_col[GBM_no_nan_col > threshold_GBM].index.tolist()
    + LGG_no_nan_col[LGG_no_nan_col > threshold_LGG].index.tolist()
)

data_B_removed = data.drop(columns=list(set(features_NAN_removed)))


# The series below 'bin' the NaN counts ("aantal" is Dutch for "number of"):
# - the index is a number of NaN's
# - the value is the number of features that have exactly that many NaN's

aantal_NAN_GBM = GBM_no_nan_col.value_counts()
aantal_NAN_LGG = LGG_no_nan_col.value_counts()
aantal_NAN_total = no_nan_col.value_counts()


In [6]:
# Per-class KNN imputation of the remaining missing values.
# The input dataset is chosen in ONE place: `data_selected` is currently the
# result of Method A (data_A_removed); assign data_B_removed instead to use
# the result of Method B.
# NOTE(review): imputation is done per label and on all samples visible here;
# if a train/test split follows, confirm this does not leak label or test
# information into the features.

data_selected = data_A_removed

# 5 nearest neighbours, all weighted equally. The same imputer object is
# re-fitted for each class by the fit_transform calls below.
imputer = KNNImputer(n_neighbors=5, weights="uniform")

# Imputation of the GBM dataset
# (the label column is dropped before imputing and re-added afterwards)

GBM_imputed = data_selected.loc[data_selected['label'] == 'GBM'].drop(columns=['label'])
array_imp_GBM = imputer.fit_transform(GBM_imputed)
GBM_imputed[:] = array_imp_GBM
GBM_imputed['label'] = 'GBM'


# Imputation of the LGG dataset

LGG_imputed = data_selected.loc[data_selected['label'] == 'LGG'].drop(columns=['label'])
array_imp_LGG = imputer.fit_transform(LGG_imputed)
LGG_imputed[:] = array_imp_LGG
LGG_imputed['label'] = 'LGG'

# Combining the datasets again: GBM rows first, then LGG rows, keeping the
# original index. DataFrame.append() was removed in pandas 2.0; pd.concat is
# its replacement and yields the same frame.

data_imputed = pd.concat([GBM_imputed, LGG_imputed])