In [48]:
# import libraries
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import StandardScaler


In [49]:
# Load the raw brain-tumor CSV into a DataFrame and display it.
raw_csv_path = '../Data/originalData/brain_tumor_dataset.csv'
df = pd.read_csv(raw_csv_path)
df

Unnamed: 0,Patient_ID,Age,Gender,Tumor_Type,Tumor_Size,Location,Histology,Stage,Symptom_1,Symptom_2,Symptom_3,Radiation_Treatment,Surgery_Performed,Chemotherapy,Survival_Rate,Tumor_Growth_Rate,Family_History,MRI_Result,Follow_Up_Required
0,1,73,Male,Malignant,5.375612,Temporal,Astrocytoma,III,Vision Issues,Seizures,Seizures,No,No,No,51.312579,0.111876,No,Positive,Yes
1,2,26,Male,Benign,4.847098,Parietal,Glioblastoma,II,Headache,Headache,Nausea,Yes,Yes,Yes,46.373273,2.165736,Yes,Positive,Yes
2,3,31,Male,Benign,5.588391,Parietal,Meningioma,I,Vision Issues,Headache,Seizures,No,No,No,47.072221,1.884228,No,Negative,No
3,4,29,Male,Malignant,1.436600,Temporal,Medulloblastoma,IV,Vision Issues,Seizures,Headache,Yes,No,Yes,51.853634,1.283342,Yes,Negative,No
4,5,54,Female,Benign,2.417506,Parietal,Glioblastoma,I,Headache,Headache,Seizures,No,No,Yes,54.708987,2.069477,No,Positive,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,19996,21,Male,Malignant,9.612013,Parietal,Medulloblastoma,III,Seizures,Nausea,Vision Issues,No,No,Yes,58.229662,0.353806,No,Negative,Yes
19996,19997,32,Female,Benign,1.543560,Temporal,Meningioma,III,Headache,Headache,Vision Issues,Yes,Yes,No,77.706856,2.341074,No,Positive,No
19997,19998,57,Female,Benign,3.618634,Temporal,Medulloblastoma,I,Seizures,Vision Issues,Nausea,No,No,Yes,89.543803,2.332881,No,Positive,Yes
19998,19999,68,Male,Malignant,8.519086,Parietal,Glioblastoma,III,Seizures,Headache,Vision Issues,Yes,Yes,Yes,83.306781,2.387202,No,Positive,No


## Remove Cols

In [50]:
# Patient_ID is only a running row identifier, so it is dropped
# before encoding and scaling.
df = df.drop('Patient_ID', axis='columns')
df

Unnamed: 0,Age,Gender,Tumor_Type,Tumor_Size,Location,Histology,Stage,Symptom_1,Symptom_2,Symptom_3,Radiation_Treatment,Surgery_Performed,Chemotherapy,Survival_Rate,Tumor_Growth_Rate,Family_History,MRI_Result,Follow_Up_Required
0,73,Male,Malignant,5.375612,Temporal,Astrocytoma,III,Vision Issues,Seizures,Seizures,No,No,No,51.312579,0.111876,No,Positive,Yes
1,26,Male,Benign,4.847098,Parietal,Glioblastoma,II,Headache,Headache,Nausea,Yes,Yes,Yes,46.373273,2.165736,Yes,Positive,Yes
2,31,Male,Benign,5.588391,Parietal,Meningioma,I,Vision Issues,Headache,Seizures,No,No,No,47.072221,1.884228,No,Negative,No
3,29,Male,Malignant,1.436600,Temporal,Medulloblastoma,IV,Vision Issues,Seizures,Headache,Yes,No,Yes,51.853634,1.283342,Yes,Negative,No
4,54,Female,Benign,2.417506,Parietal,Glioblastoma,I,Headache,Headache,Seizures,No,No,Yes,54.708987,2.069477,No,Positive,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,21,Male,Malignant,9.612013,Parietal,Medulloblastoma,III,Seizures,Nausea,Vision Issues,No,No,Yes,58.229662,0.353806,No,Negative,Yes
19996,32,Female,Benign,1.543560,Temporal,Meningioma,III,Headache,Headache,Vision Issues,Yes,Yes,No,77.706856,2.341074,No,Positive,No
19997,57,Female,Benign,3.618634,Temporal,Medulloblastoma,I,Seizures,Vision Issues,Nausea,No,No,Yes,89.543803,2.332881,No,Positive,Yes
19998,68,Male,Malignant,8.519086,Parietal,Glioblastoma,III,Seizures,Headache,Vision Issues,Yes,Yes,Yes,83.306781,2.387202,No,Positive,No


In [51]:
# Count the missing values in every remaining column.
df.isnull().sum()

Age                    0
Gender                 0
Tumor_Type             0
Tumor_Size             0
Location               0
Histology              0
Stage                  0
Symptom_1              0
Symptom_2              0
Symptom_3              0
Radiation_Treatment    0
Surgery_Performed      0
Chemotherapy           0
Survival_Rate          0
Tumor_Growth_Rate      0
Family_History         0
MRI_Result             0
Follow_Up_Required     0
dtype: int64

## Encode values

In [52]:
# Collect the names of all object-dtype (non-numeric) columns.
non_num_col = df.select_dtypes(include='object').columns

# Print how many such columns there are ...
print(non_num_col.size)

# ... and show their names as the cell output.
non_num_col


14


Index(['Gender', 'Tumor_Type', 'Location', 'Histology', 'Stage', 'Symptom_1',
       'Symptom_2', 'Symptom_3', 'Radiation_Treatment', 'Surgery_Performed',
       'Chemotherapy', 'Family_History', 'MRI_Result', 'Follow_Up_Required'],
      dtype='object')

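We want to one-hot encode the unordered categorical columns and label-encode the rest, i.e. the binary columns (`Tumor_Type`, `MRI_Result`, `Follow_Up_Required`) and the ordinal `Stage` column.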

In [53]:
# encode cols
df = pd.get_dummies(df, columns=[
    'Gender', 'Location', 'Histology', 
    'Symptom_1', 'Symptom_2', 'Symptom_3',
    'Radiation_Treatment', 'Surgery_Performed', 
    'Chemotherapy', 'Family_History',
], drop_first=True)
df

Unnamed: 0,Age,Tumor_Type,Tumor_Size,Stage,Survival_Rate,Tumor_Growth_Rate,MRI_Result,Follow_Up_Required,Gender_Male,Location_Occipital,...,Symptom_2_Nausea,Symptom_2_Seizures,Symptom_2_Vision Issues,Symptom_3_Nausea,Symptom_3_Seizures,Symptom_3_Vision Issues,Radiation_Treatment_Yes,Surgery_Performed_Yes,Chemotherapy_Yes,Family_History_Yes
0,73,Malignant,5.375612,III,51.312579,0.111876,Positive,Yes,True,False,...,False,True,False,False,True,False,False,False,False,False
1,26,Benign,4.847098,II,46.373273,2.165736,Positive,Yes,True,False,...,False,False,False,True,False,False,True,True,True,True
2,31,Benign,5.588391,I,47.072221,1.884228,Negative,No,True,False,...,False,False,False,False,True,False,False,False,False,False
3,29,Malignant,1.436600,IV,51.853634,1.283342,Negative,No,True,False,...,False,True,False,False,False,False,True,False,True,True
4,54,Benign,2.417506,I,54.708987,2.069477,Positive,Yes,False,False,...,False,False,False,False,True,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,21,Malignant,9.612013,III,58.229662,0.353806,Negative,Yes,True,False,...,True,False,False,False,False,True,False,False,True,False
19996,32,Benign,1.543560,III,77.706856,2.341074,Positive,No,False,False,...,False,False,False,False,False,True,True,True,False,False
19997,57,Benign,3.618634,I,89.543803,2.332881,Positive,Yes,False,False,...,False,False,True,True,False,False,False,False,True,False
19998,68,Malignant,8.519086,III,83.306781,2.387202,Positive,No,True,False,...,False,False,False,False,False,True,True,True,True,False


In [54]:
# Label-encode the columns that were not one-hot encoded.
#
# Series.map() silently turns every value that is missing from the mapping into
# NaN. That includes already-encoded 0/1 values, so running this cell twice
# would replace the columns with NaN without any error. Each column is
# therefore validated right after mapping, and the cell fails loudly instead of
# writing NaNs into the frame.
binary_cols = ['Tumor_Type', 'MRI_Result', 'Follow_Up_Required']
binary_mapping = {'Yes':1, 'No':0, 'Positive':1, 'Negative':0, 'Malignant':1, 'Benign':0}

# Stage: I → 1, II → 2, III → 3, IV → 4
stage_mapping = {'I':1, 'II':2, 'III':3, 'IV':4}


def _encode_labels(series, mapping):
    """Map ``series`` through ``mapping`` and reject labels the mapping lacks.

    Args:
        series: pandas Series holding the raw labels.
        mapping: dict from raw label to its encoded value.

    Returns:
        The encoded Series produced by ``series.map(mapping)``.

    Raises:
        ValueError: if a non-null value of ``series`` has no entry in ``mapping``.
    """
    encoded = series.map(mapping)
    # Values that were present before mapping but are NaN afterwards had no key.
    unmapped = series[series.notna() & encoded.isna()]
    if not unmapped.empty:
        raise ValueError(
            f"Column {series.name!r} contains labels with no encoding: "
            f"{sorted(unmapped.unique(), key=str)}"
        )
    return encoded


for col in binary_cols:
    df[col] = _encode_labels(df[col], binary_mapping)

df['Stage'] = _encode_labels(df['Stage'], stage_mapping)


In [55]:
# Display the frame after the label-encoding step.
df

Unnamed: 0,Age,Tumor_Type,Tumor_Size,Stage,Survival_Rate,Tumor_Growth_Rate,MRI_Result,Follow_Up_Required,Gender_Male,Location_Occipital,...,Symptom_2_Nausea,Symptom_2_Seizures,Symptom_2_Vision Issues,Symptom_3_Nausea,Symptom_3_Seizures,Symptom_3_Vision Issues,Radiation_Treatment_Yes,Surgery_Performed_Yes,Chemotherapy_Yes,Family_History_Yes
0,73,1,5.375612,3,51.312579,0.111876,1,1,True,False,...,False,True,False,False,True,False,False,False,False,False
1,26,0,4.847098,2,46.373273,2.165736,1,1,True,False,...,False,False,False,True,False,False,True,True,True,True
2,31,0,5.588391,1,47.072221,1.884228,0,0,True,False,...,False,False,False,False,True,False,False,False,False,False
3,29,1,1.436600,4,51.853634,1.283342,0,0,True,False,...,False,True,False,False,False,False,True,False,True,True
4,54,0,2.417506,1,54.708987,2.069477,1,1,False,False,...,False,False,False,False,True,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,21,1,9.612013,3,58.229662,0.353806,0,1,True,False,...,True,False,False,False,False,True,False,False,True,False
19996,32,0,1.543560,3,77.706856,2.341074,1,0,False,False,...,False,False,False,False,False,True,True,True,False,False
19997,57,0,3.618634,1,89.543803,2.332881,1,1,False,False,...,False,False,True,True,False,False,False,False,True,False
19998,68,1,8.519086,3,83.306781,2.387202,1,0,True,False,...,False,False,False,False,False,True,True,True,True,False


## Feature Scaling and normalization

In [56]:
# Create the StandardScaler instance that the next cell fits on the
# numeric columns.
scaler = StandardScaler()


In [57]:
# Standardize every int64/float64 column with the scaler created above.
# The one-hot indicator columns are still bool here, so they are not selected.
# NOTE(review): the label-encoded columns (Tumor_Type, MRI_Result,
# Follow_Up_Required, Stage) are int64 at this point and are standardized too,
# so they no longer hold 0/1 or 1-4 afterwards - confirm this is intended.
numerical_columns = df.select_dtypes(include=['float64', 'int64']).columns
scaled_values = scaler.fit_transform(df[numerical_columns])
df[numerical_columns] = scaled_values
df


Unnamed: 0,Age,Tumor_Type,Tumor_Size,Stage,Survival_Rate,Tumor_Growth_Rate,MRI_Result,Follow_Up_Required,Gender_Male,Location_Occipital,...,Symptom_2_Nausea,Symptom_2_Seizures,Symptom_2_Vision Issues,Symptom_3_Nausea,Symptom_3_Seizures,Symptom_3_Vision Issues,Radiation_Treatment_Yes,Surgery_Performed_Yes,Chemotherapy_Yes,Family_History_Yes
0,1.355253,0.997004,0.050488,0.452389,-1.089675,-1.717548,0.997104,0.989950,True,False,...,False,True,False,False,True,False,False,False,False,False
1,-1.347627,-1.003005,-0.141399,-0.443697,-1.375673,0.739298,0.997104,0.989950,True,False,...,False,False,False,True,False,False,True,True,True,True
2,-1.060087,-1.003005,0.127742,-1.339783,-1.335202,0.402556,-1.002904,-1.010152,True,False,...,False,False,False,False,True,False,False,False,False,False
3,-1.175103,0.997004,-1.379648,1.348475,-1.058346,-0.316229,-1.002904,-1.010152,True,False,...,False,True,False,False,False,False,True,False,True,True
4,0.262599,-1.003005,-1.023511,-1.339783,-0.893014,0.624153,0.997104,0.989950,False,False,...,False,False,False,False,True,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,-1.635167,0.997004,1.588598,0.452389,-0.689158,-1.428149,-1.002904,0.989950,True,False,...,True,False,False,False,False,True,False,False,True,False
19996,-1.002579,-1.003005,-1.340814,0.452389,0.438621,0.949040,0.997104,-1.010152,False,False,...,False,False,False,False,False,True,True,True,False,False
19997,0.435123,-1.003005,-0.587417,-1.339783,1.124010,0.939239,0.997104,0.989950,False,False,...,False,False,True,True,False,False,False,False,True,False
19998,1.067712,0.997004,1.191789,0.452389,0.762871,1.004218,0.997104,-1.010152,True,False,...,False,False,False,False,False,True,True,True,True,False


In [58]:
# Cast every boolean (True/False) column to integer 1/0.
bool_columns = [name for name, dtype in df.dtypes.items() if dtype == 'bool']
df = df.astype(dict.fromkeys(bool_columns, int))
df

Unnamed: 0,Age,Tumor_Type,Tumor_Size,Stage,Survival_Rate,Tumor_Growth_Rate,MRI_Result,Follow_Up_Required,Gender_Male,Location_Occipital,...,Symptom_2_Nausea,Symptom_2_Seizures,Symptom_2_Vision Issues,Symptom_3_Nausea,Symptom_3_Seizures,Symptom_3_Vision Issues,Radiation_Treatment_Yes,Surgery_Performed_Yes,Chemotherapy_Yes,Family_History_Yes
0,1.355253,0.997004,0.050488,0.452389,-1.089675,-1.717548,0.997104,0.989950,1,0,...,0,1,0,0,1,0,0,0,0,0
1,-1.347627,-1.003005,-0.141399,-0.443697,-1.375673,0.739298,0.997104,0.989950,1,0,...,0,0,0,1,0,0,1,1,1,1
2,-1.060087,-1.003005,0.127742,-1.339783,-1.335202,0.402556,-1.002904,-1.010152,1,0,...,0,0,0,0,1,0,0,0,0,0
3,-1.175103,0.997004,-1.379648,1.348475,-1.058346,-0.316229,-1.002904,-1.010152,1,0,...,0,1,0,0,0,0,1,0,1,1
4,0.262599,-1.003005,-1.023511,-1.339783,-0.893014,0.624153,0.997104,0.989950,0,0,...,0,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,-1.635167,0.997004,1.588598,0.452389,-0.689158,-1.428149,-1.002904,0.989950,1,0,...,1,0,0,0,0,1,0,0,1,0
19996,-1.002579,-1.003005,-1.340814,0.452389,0.438621,0.949040,0.997104,-1.010152,0,0,...,0,0,0,0,0,1,1,1,0,0
19997,0.435123,-1.003005,-0.587417,-1.339783,1.124010,0.939239,0.997104,0.989950,0,0,...,0,0,1,1,0,0,0,0,1,0
19998,1.067712,0.997004,1.191789,0.452389,0.762871,1.004218,0.997104,-1.010152,1,0,...,0,0,0,0,0,1,1,1,1,0


In [60]:
# Write the processed frame to disk without the index column.
clean_csv_path = '../Data/cleanData/data.csv'
df.to_csv(clean_csv_path, index=False)