# TM10007 Assignment template

In [39]:
# Run this to use from colab environment
#!pip install -q --upgrade git+https://github.com/karinvangarderen/tm10007_project.git

## Data loading and cleaning

Below are functions to load the dataset of your choice. After that, it is all up to you to create and evaluate a classification method. Beware, there may be missing values in these datasets. Good luck!

In [40]:
import numpy as np
import pandas as pd
from math import floor
from sklearn.impute import KNNImputer

from brats.load_data import load_data

In [41]:
# Read the complete dataset and report its dimensions (rows = samples)
data = load_data()
print(f'The number of samples: {data.shape[0]}')
print(f'The number of columns: {data.shape[1]}')

# Partition the rows by the value of the 'label' column
GBM = data[data['label'].eq('GBM')]
LGG = data[data['label'].eq('LGG')]


The number of samples: 167
The number of columns: 725


In [42]:
# Overview of the NaN's in the dataset

# Missing-value counts over the complete dataset (GBM and LGG together)
NO_NAN_ROW_TOTAL = data.isna().sum(axis='columns')    # NaN's per patient (one entry per row)
NO_NAN_COL_TOTAL = data.isna().sum(axis='index')      # NaN's per feature (one entry per column)

# Missing-value counts per feature, computed separately for each class
GBM_NO_NAN_COL = GBM.isna().sum(axis='index')
LGG_NO_NAN_COL = LGG.isna().sum(axis='index')

# Per-feature NaN counts gathered under one name for easy inspection
OVERVIEW_NAN = dict(
    Total=NO_NAN_COL_TOTAL,
    GBM=GBM_NO_NAN_COL,
    LGG=LGG_NO_NAN_COL,
)


In [43]:
# Feature selection based on the number of NaN's.
# A feature is discarded when, in either class, its NaN count exceeds that
# class's threshold (threshold = the maximum number of NaN's allowed in a column).

# Percentage of patients without data for a feature above which the feature is discarded
PERC_MISSING_GBM = 5
PERC_MISSING_LGG = 5

# Translate the percentages into absolute NaN counts per class, rounded down
THRESHOLD_GBM = floor(PERC_MISSING_GBM / 100 * len(GBM))
THRESHOLD_LGG = floor(PERC_MISSING_LGG / 100 * len(LGG))

# Feature names whose NaN count is strictly above the class threshold
gbm_over_threshold = GBM_NO_NAN_COL[GBM_NO_NAN_COL > THRESHOLD_GBM].index
lgg_over_threshold = LGG_NO_NAN_COL[LGG_NO_NAN_COL > THRESHOLD_LGG].index

# GBM entries first, then LGG entries; a feature that fails in both classes
# therefore appears twice in this list
FEATURES_REMOVED = gbm_over_threshold.tolist() + lgg_over_threshold.tolist()

# Remove the flagged features from the dataset (set() collapses the duplicates)
DATA_REMOVED = data.drop(columns=list(set(FEATURES_REMOVED)))


# The (commented-out) series below bin the NaN counts:
# - the index is the number of NaN's a feature has in the dataset
# - the values are the number of features that have this number of NaN's

#aantal_NAN_GBM = GBM_NO_NAN_COL.value_counts()
#aantal_NAN_LGG = LGG_NO_NAN_COL.value_counts()
#aantal_NAN_total = NO_NAN_COL_TOTAL.value_counts()


In [44]:
# kNN imputation of missing values
#
# GBM and LGG patients are imputed separately, so the neighbours used to fill
# in a missing value always carry the same label as the patient being imputed.
# NOTE(review): this uses the label during imputation and is fitted on every
# row of DATA_REMOVED -- confirm that this is intended with respect to any
# later train/test split.

imputer = KNNImputer(n_neighbors=5, weights="uniform")

# Imputation of the GBM dataset
GBM_IMPUTED = DATA_REMOVED.loc[DATA_REMOVED['label'] == 'GBM']                      # Select GBM patients
GBM_IMPUTED = GBM_IMPUTED.drop(columns=['label'])                                   # Drop the string label column; only the features are imputed
ARRAY_IMP_GBM = imputer.fit_transform(GBM_IMPUTED)                                  # Fit on the GBM rows only and return the imputed values as an array
GBM_IMPUTED[:] = ARRAY_IMP_GBM                                                      # Overwrite original values with imputed values, keeping index and column names
GBM_IMPUTED['label'] = 'GBM'                                                        # Add column containing label

# Imputation of the LGG dataset
LGG_IMPUTED = DATA_REMOVED.loc[DATA_REMOVED['label'] == 'LGG']                      # Select LGG patients
LGG_IMPUTED = LGG_IMPUTED.drop(columns=['label'])                                   # Drop the string label column; only the features are imputed
ARRAY_IMP_LGG = imputer.fit_transform(LGG_IMPUTED)                                  # Re-fits the same imputer object, now on the LGG rows only
LGG_IMPUTED[:] = ARRAY_IMP_LGG                                                      # Overwrite original values with imputed values, keeping index and column names
LGG_IMPUTED['label'] = 'LGG'                                                        # Add column containing label

# Combine imputed GBM and LGG dataframes into one dataframe (GBM rows first).
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat stacks the rows the same way and keeps the original row index.
DATA_IMPUTED = pd.concat([GBM_IMPUTED, LGG_IMPUTED])