## Preprocess

In [164]:
# import libraries
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


In [165]:
#read dataset
df = pd.read_csv('../Data/originalData/brain_tumor_dataset.csv')
df

Unnamed: 0,Patient_ID,Age,Gender,Tumor_Type,Tumor_Size,Location,Histology,Stage,Symptom_1,Symptom_2,Symptom_3,Radiation_Treatment,Surgery_Performed,Chemotherapy,Survival_Rate,Tumor_Growth_Rate,Family_History,MRI_Result,Follow_Up_Required
0,1,73,Male,Malignant,5.375612,Temporal,Astrocytoma,III,Vision Issues,Seizures,Seizures,No,No,No,51.312579,0.111876,No,Positive,Yes
1,2,26,Male,Benign,4.847098,Parietal,Glioblastoma,II,Headache,Headache,Nausea,Yes,Yes,Yes,46.373273,2.165736,Yes,Positive,Yes
2,3,31,Male,Benign,5.588391,Parietal,Meningioma,I,Vision Issues,Headache,Seizures,No,No,No,47.072221,1.884228,No,Negative,No
3,4,29,Male,Malignant,1.436600,Temporal,Medulloblastoma,IV,Vision Issues,Seizures,Headache,Yes,No,Yes,51.853634,1.283342,Yes,Negative,No
4,5,54,Female,Benign,2.417506,Parietal,Glioblastoma,I,Headache,Headache,Seizures,No,No,Yes,54.708987,2.069477,No,Positive,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,19996,21,Male,Malignant,9.612013,Parietal,Medulloblastoma,III,Seizures,Nausea,Vision Issues,No,No,Yes,58.229662,0.353806,No,Negative,Yes
19996,19997,32,Female,Benign,1.543560,Temporal,Meningioma,III,Headache,Headache,Vision Issues,Yes,Yes,No,77.706856,2.341074,No,Positive,No
19997,19998,57,Female,Benign,3.618634,Temporal,Medulloblastoma,I,Seizures,Vision Issues,Nausea,No,No,Yes,89.543803,2.332881,No,Positive,Yes
19998,19999,68,Male,Malignant,8.519086,Parietal,Glioblastoma,III,Seizures,Headache,Vision Issues,Yes,Yes,Yes,83.306781,2.387202,No,Positive,No


## Remove Cols

In [166]:
# we dont need patent ID since it's just and index
df = df.drop(columns=['Patient_ID'])
df

Unnamed: 0,Age,Gender,Tumor_Type,Tumor_Size,Location,Histology,Stage,Symptom_1,Symptom_2,Symptom_3,Radiation_Treatment,Surgery_Performed,Chemotherapy,Survival_Rate,Tumor_Growth_Rate,Family_History,MRI_Result,Follow_Up_Required
0,73,Male,Malignant,5.375612,Temporal,Astrocytoma,III,Vision Issues,Seizures,Seizures,No,No,No,51.312579,0.111876,No,Positive,Yes
1,26,Male,Benign,4.847098,Parietal,Glioblastoma,II,Headache,Headache,Nausea,Yes,Yes,Yes,46.373273,2.165736,Yes,Positive,Yes
2,31,Male,Benign,5.588391,Parietal,Meningioma,I,Vision Issues,Headache,Seizures,No,No,No,47.072221,1.884228,No,Negative,No
3,29,Male,Malignant,1.436600,Temporal,Medulloblastoma,IV,Vision Issues,Seizures,Headache,Yes,No,Yes,51.853634,1.283342,Yes,Negative,No
4,54,Female,Benign,2.417506,Parietal,Glioblastoma,I,Headache,Headache,Seizures,No,No,Yes,54.708987,2.069477,No,Positive,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,21,Male,Malignant,9.612013,Parietal,Medulloblastoma,III,Seizures,Nausea,Vision Issues,No,No,Yes,58.229662,0.353806,No,Negative,Yes
19996,32,Female,Benign,1.543560,Temporal,Meningioma,III,Headache,Headache,Vision Issues,Yes,Yes,No,77.706856,2.341074,No,Positive,No
19997,57,Female,Benign,3.618634,Temporal,Medulloblastoma,I,Seizures,Vision Issues,Nausea,No,No,Yes,89.543803,2.332881,No,Positive,Yes
19998,68,Male,Malignant,8.519086,Parietal,Glioblastoma,III,Seizures,Headache,Vision Issues,Yes,Yes,Yes,83.306781,2.387202,No,Positive,No


In [167]:
## check nulls one more time
df.isna().sum()

Age                    0
Gender                 0
Tumor_Type             0
Tumor_Size             0
Location               0
Histology              0
Stage                  0
Symptom_1              0
Symptom_2              0
Symptom_3              0
Radiation_Treatment    0
Surgery_Performed      0
Chemotherapy           0
Survival_Rate          0
Tumor_Growth_Rate      0
Family_History         0
MRI_Result             0
Follow_Up_Required     0
dtype: int64

## Separate Target/label


In [168]:
y = df['Tumor_Type']
X = df.drop('Tumor_Type', axis=1)

In [169]:
# prevent data leakage
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


## Scale

In [170]:
scaler = StandardScaler()
# num columns to scale
numeric_cols = ['Age', 'Tumor_Size', 'Survival_Rate', 'Tumor_Growth_Rate']


In [171]:
# fit only on training data
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])

# test data using the same scaler
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])


In [172]:
# bools to int
# map Yes/No and pos/neg to 1/0
mapping = {'Yes': 1, 'No': 0, 'Positive': 1, 'Negative': 0}

# apply to both train and test
X_train[bool_cols] = X_train[bool_cols].replace(mapping).infer_objects(copy=False)
X_test[bool_cols] = X_test[bool_cols].replace(mapping).infer_objects(copy=False)

# amke integers
X_train[bool_cols] = X_train[bool_cols].astype(int)
X_test[bool_cols] = X_test[bool_cols].astype(int)


  X_train[bool_cols] = X_train[bool_cols].replace(mapping).infer_objects(copy=False)
  X_test[bool_cols] = X_test[bool_cols].replace(mapping).infer_objects(copy=False)


In [173]:
X_train

Unnamed: 0,Age,Gender,Tumor_Size,Location,Histology,Stage,Symptom_1,Symptom_2,Symptom_3,Radiation_Treatment,Surgery_Performed,Chemotherapy,Survival_Rate,Tumor_Growth_Rate,Family_History,MRI_Result,Follow_Up_Required
5894,-0.535442,Female,-0.585606,Occipital,Glioblastoma,III,Headache,Nausea,Seizures,No,Yes,No,0.202357,0.108763,No,1,1
3728,-0.420672,Female,0.302410,Frontal,Glioblastoma,III,Seizures,Nausea,Seizures,No,No,No,-1.530538,0.310344,Yes,0,0
8958,-0.191132,Male,0.259954,Temporal,Medulloblastoma,IV,Nausea,Vision Issues,Seizures,No,No,No,-0.366452,0.801447,No,0,0
7671,1.128725,Female,-1.314975,Parietal,Astrocytoma,IV,Seizures,Seizures,Vision Issues,No,Yes,No,0.381126,-0.687134,Yes,0,0
5999,-0.133747,Male,0.848646,Frontal,Meningioma,I,Seizures,Nausea,Headache,Yes,Yes,No,0.904089,0.844879,Yes,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11284,0.210564,Female,1.209881,Frontal,Meningioma,I,Headache,Vision Issues,Seizures,No,Yes,Yes,-0.323033,0.303367,No,0,0
11964,-1.109293,Female,1.198305,Occipital,Glioblastoma,IV,Headache,Vision Issues,Headache,No,Yes,No,-0.594213,1.567131,No,0,0
5390,-0.592827,Female,-0.268596,Frontal,Glioblastoma,I,Seizures,Headache,Headache,Yes,No,No,0.793494,1.352715,No,0,1
860,-0.650212,Male,-1.528459,Temporal,Meningioma,I,Vision Issues,Headache,Vision Issues,No,No,Yes,1.603353,-0.328701,No,0,0


## Encode values

In [174]:
# count non num columns
non_num_col = df.select_dtypes(include=['object']).columns

# print the number of string columns
print(len(non_num_col))

#print the non num col
non_num_col


14


Index(['Gender', 'Tumor_Type', 'Location', 'Histology', 'Stage', 'Symptom_1',
       'Symptom_2', 'Symptom_3', 'Radiation_Treatment', 'Surgery_Performed',
       'Chemotherapy', 'Family_History', 'MRI_Result', 'Follow_Up_Required'],
      dtype='object')

we want to one hot encode non ordered categories and label encode the rest like 

In [175]:
# columns to one-hot encode
categorical_cols = ['Gender', 'Location', 'Histology', 'Symptom_1', 'Symptom_2', 'Symptom_3']

# apply one-hot encoding
X_train = pd.get_dummies(X_train, columns=categorical_cols)
X_test = pd.get_dummies(X_test, columns=categorical_cols)


# make sure to align columns so that train and test have the same columns
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)


In [176]:
X_train

Unnamed: 0,Age,Tumor_Size,Stage,Radiation_Treatment,Surgery_Performed,Chemotherapy,Survival_Rate,Tumor_Growth_Rate,Family_History,MRI_Result,...,Symptom_1_Seizures,Symptom_1_Vision Issues,Symptom_2_Headache,Symptom_2_Nausea,Symptom_2_Seizures,Symptom_2_Vision Issues,Symptom_3_Headache,Symptom_3_Nausea,Symptom_3_Seizures,Symptom_3_Vision Issues
5894,-0.535442,-0.585606,III,No,Yes,No,0.202357,0.108763,No,1,...,False,False,False,True,False,False,False,False,True,False
3728,-0.420672,0.302410,III,No,No,No,-1.530538,0.310344,Yes,0,...,True,False,False,True,False,False,False,False,True,False
8958,-0.191132,0.259954,IV,No,No,No,-0.366452,0.801447,No,0,...,False,False,False,False,False,True,False,False,True,False
7671,1.128725,-1.314975,IV,No,Yes,No,0.381126,-0.687134,Yes,0,...,True,False,False,False,True,False,False,False,False,True
5999,-0.133747,0.848646,I,Yes,Yes,No,0.904089,0.844879,Yes,0,...,True,False,False,True,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11284,0.210564,1.209881,I,No,Yes,Yes,-0.323033,0.303367,No,0,...,False,False,False,False,False,True,False,False,True,False
11964,-1.109293,1.198305,IV,No,Yes,No,-0.594213,1.567131,No,0,...,False,False,False,False,False,True,True,False,False,False
5390,-0.592827,-0.268596,I,Yes,No,No,0.793494,1.352715,No,0,...,True,False,True,False,False,False,True,False,False,False
860,-0.650212,-1.528459,I,No,No,Yes,1.603353,-0.328701,No,0,...,False,True,True,False,False,False,False,False,False,True


In [178]:
# map Stage from Roman numerals to integers
stage_mapping = {'I': 1, 'II': 2, 'III': 3, 'IV': 4}
X_train['Stage'] = X_train['Stage'].map(stage_mapping)
X_test['Stage'] = X_test['Stage'].map(stage_mapping)


In [179]:
X_train

Unnamed: 0,Age,Tumor_Size,Stage,Radiation_Treatment,Surgery_Performed,Chemotherapy,Survival_Rate,Tumor_Growth_Rate,Family_History,MRI_Result,...,Symptom_1_Seizures,Symptom_1_Vision Issues,Symptom_2_Headache,Symptom_2_Nausea,Symptom_2_Seizures,Symptom_2_Vision Issues,Symptom_3_Headache,Symptom_3_Nausea,Symptom_3_Seizures,Symptom_3_Vision Issues
5894,-0.535442,-0.585606,3,No,Yes,No,0.202357,0.108763,No,1,...,False,False,False,True,False,False,False,False,True,False
3728,-0.420672,0.302410,3,No,No,No,-1.530538,0.310344,Yes,0,...,True,False,False,True,False,False,False,False,True,False
8958,-0.191132,0.259954,4,No,No,No,-0.366452,0.801447,No,0,...,False,False,False,False,False,True,False,False,True,False
7671,1.128725,-1.314975,4,No,Yes,No,0.381126,-0.687134,Yes,0,...,True,False,False,False,True,False,False,False,False,True
5999,-0.133747,0.848646,1,Yes,Yes,No,0.904089,0.844879,Yes,0,...,True,False,False,True,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11284,0.210564,1.209881,1,No,Yes,Yes,-0.323033,0.303367,No,0,...,False,False,False,False,False,True,False,False,True,False
11964,-1.109293,1.198305,4,No,Yes,No,-0.594213,1.567131,No,0,...,False,False,False,False,False,True,True,False,False,False
5390,-0.592827,-0.268596,1,Yes,No,No,0.793494,1.352715,No,0,...,True,False,True,False,False,False,True,False,False,False
860,-0.650212,-1.528459,1,No,No,Yes,1.603353,-0.328701,No,0,...,False,True,True,False,False,False,False,False,False,True


In [180]:
# Save X_train and X_test
X_train.to_csv('../Data/CleanData/X_train.csv', index=False)
X_test.to_csv('../Data/CleanData/X_test.csv', index=False)

# Save y_train and y_test
y_train.to_csv('../Data/CleanData/y_train.csv', index=False)
y_test.to_csv('../Data/CleanData/y_test.csv', index=False)
