In [None]:
# Mount Google Drive at /content/drive; the CSV paths used throughout this
# notebook all live under /content/drive/MyDrive/NCDB-GBM/.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import KNNImputer

from matplotlib import pyplot as plt

# Remove the row limit on pandas display output so printed Series/DataFrames
# (e.g. the per-column missing-value report) are never truncated.
pd.set_option('display.max_rows', None)

In [None]:
#Open csv file.

# The CSV column named 'Unnamed: 0' is used as the row index of the frame.
data = pd.read_csv("/content/drive/MyDrive/NCDB-GBM/raw_data.csv", index_col='Unnamed: 0')
# Last expression of the cell: displays (n_rows, n_columns) of the raw data.
data.shape

In [None]:
#See all columns.

# Print every column label of the raw frame as a plain Python list.
all_column_names = data.columns.tolist()
print(all_column_names)

In [None]:
# Keep only the variables used downstream (predictors plus the two outcome
# columns DX_LASTCONTACT_DEATH_MONTHS and PUF_VITAL_STATUS).
# .copy() makes the subset an independent frame: the following cells assign
# into it with .loc, which would otherwise raise SettingWithCopyWarning
# because a column-list selection is treated as a slice of the raw frame.
data = data[[
    'AGE',
    'SEX',
    'RACE',
    'SPANISH_HISPANIC_ORIGIN',
    'INSURANCE_STATUS',
    'FACILITY_TYPE_CD',
    'FACILITY_LOCATION_CD',
    'CDCC_TOTAL_BEST',
    'TUMOR_SIZE_SUMMARY_2016',
    'METHYLATION_O6MGMT',
    'RX_SUMM_SURG_PRIM_SITE',
    'NUMBER_PHASES_RAD_RX',
    'RX_SUMM_CHEMO',
    'RX_SUMM_IMMUNOTHERAPY',
    'DX_LASTCONTACT_DEATH_MONTHS',
    'PUF_VITAL_STATUS',
]].copy()

In [None]:
#Manual label encoding.

# Column name -> {raw text label: integer code}.
# Values that do not appear in a column's mapping are left untouched.
label_codes = {
    'SEX': {'Male': 0, 'Female': 1},
    'RACE': {'White': 0, 'Black': 1, 'Other': 2},
    'SPANISH_HISPANIC_ORIGIN': {'No': 0, 'Yes': 1},
    'INSURANCE_STATUS': {
        'Private insurance': 0,
        'Medicare': 1,
        'Medicaid': 2,
        'Other government': 3,
        'Not insured': 4,
    },
    'FACILITY_TYPE_CD': {
        'Academic/Research Program': 0,
        'Community Cancer Program': 1,
        'Integrated Network Cancer Program': 2,
    },
    'FACILITY_LOCATION_CD': {
        'Central': 0,
        'Atlantic': 1,
        'Pacific': 2,
        'New England': 3,
        'Mountain': 4,
    },
    'CDCC_TOTAL_BEST': {'0': 0, '1': 1, '>2': 2},
    'METHYLATION_O6MGMT': {'Unmethylated': 0, 'Methylated': 1},
    'RX_SUMM_SURG_PRIM_SITE': {
        'No resective surgery was performed': 0,
        'Gross total resection': 1,
        'Subtotal resection': 2,
    },
    'NUMBER_PHASES_RAD_RX': {'No': 0, 'Yes': 1},
    'RX_SUMM_CHEMO': {'No': 0, 'Yes': 1},
    'RX_SUMM_IMMUNOTHERAPY': {'No': 0, 'Yes': 1},
    'PUF_VITAL_STATUS': {'Alive': 0, 'Dead': 1},
}

# Apply the codes one (column, label) pair at a time, in table order, by
# overwriting the matching rows in place.
for column, codes in label_codes.items():
    for label, code in codes.items():
        data.loc[data[column] == label, column] = code

In [None]:
#Save unimputed data (label-encoded, missing values still present).

data.to_csv('/content/drive/MyDrive/NCDB-GBM/unimputed_data.csv')

In [None]:
#Impute length of stay as '0' for non-surgical patients.

# Only fill the column when it is actually part of the frame. Assigning through
# .loc to a label that does not exist would silently create a new column that
# is 0 for non-surgical rows and NaN everywhere else; the KNN imputation that
# follows would then fill it with a constant 0, which carries no information.
if 'SURG_DISCHARGE_DAYS' in data.columns:
    data.loc[data['RX_SUMM_SURG_PRIM_SITE'] == 0, 'SURG_DISCHARGE_DAYS'] = 0

In [None]:
#Check missing values before imputation.

# Percentage of missing entries per column, largest first.
missing_pct = (data.isna().mean().round(4) * 100).sort_values(ascending=False)
print(missing_pct, '\n')

# Keep only the columns that have at least one missing value.
missing_cols = missing_pct.to_frame(name='Value')
has_missing = missing_cols['Value'] > 0
missing_cols = missing_cols.loc[has_missing]

incomplete_columns = list(missing_cols.index)
print('Columns with missing values: ', incomplete_columns, '\n')
print('Number of columns with missing values: ', len(incomplete_columns), '\n')

In [None]:
# Remember the column labels: the imputer in the next cell returns a bare
# NumPy array, so the labels have to be re-attached afterwards.
columns = data.columns

In [None]:
#Impute missing values (numeric columns and integer-coded categoricals) with k-nearest neighbours.

imputer = KNNImputer(n_neighbors=5, weights='uniform', metric='nan_euclidean')
imputed_values = imputer.fit_transform(data)

# fit_transform returns a bare array; rebuild the frame with BOTH the original
# column labels and the original row index so rows can still be matched against
# the unimputed data saved earlier.
data = pd.DataFrame(imputed_values, columns=data.columns, index=data.index)

# KNN averages neighbours, so imputed category codes can be fractional.
# Round every value to the nearest integer (half-to-even, same as the built-in
# round) and store as integers. This replaces the deprecated element-wise
# DataFrame.applymap(round) with the equivalent vectorised call.
data = data.round().astype('int64')


In [None]:
#Check missing values after imputation.

# Per-column share of missing entries, as a percentage, in descending order.
missing_cols = data.isnull().mean().round(4).mul(100).sort_values(ascending=False)
print(missing_cols, '\n')

# Turn the report into a one-column frame and drop the fully observed columns.
missing_cols = missing_cols.rename('Value').to_frame()
missing_cols = missing_cols.query('Value > 0')

still_missing = missing_cols.index.tolist()
print('Columns with missing values: ', still_missing, '\n')
print('Number of columns with missing values: ', len(still_missing), '\n')

In [None]:
#Save imputed data (integer-coded, no missing values expected after the KNN step).

data.to_csv('/content/drive/MyDrive/NCDB-GBM/imputed_data.csv')

In [None]:
# Reversing manual label encoding.

# Work on a copy so the integer-coded `data` frame stays available for the
# later cells that save it.
as_data = data.copy()

# Column name -> {integer code: original text label}.
# Codes that do not appear in a column's mapping are left untouched.
code_labels = {
    'SEX': {0: 'Male', 1: 'Female'},
    'RACE': {0: 'White', 1: 'Black', 2: 'Other'},
    'SPANISH_HISPANIC_ORIGIN': {0: 'No', 1: 'Yes'},
    'INSURANCE_STATUS': {
        0: 'Private insurance',
        1: 'Medicare',
        2: 'Medicaid',
        3: 'Other government',
        4: 'Not insured',
    },
    'FACILITY_TYPE_CD': {
        0: 'Academic/Research Program',
        1: 'Community Cancer Program',
        2: 'Integrated Network Cancer Program',
    },
    'FACILITY_LOCATION_CD': {
        0: 'Central',
        1: 'Atlantic',
        2: 'Pacific',
        3: 'New England',
        4: 'Mountain',
    },
    'CDCC_TOTAL_BEST': {0: '0', 1: '1', 2: '>2'},
    'METHYLATION_O6MGMT': {0: 'Unmethylated', 1: 'Methylated'},
    'RX_SUMM_SURG_PRIM_SITE': {
        0: 'No resective surgery was performed',
        1: 'Gross total resection',
        2: 'Subtotal resection',
    },
    'NUMBER_PHASES_RAD_RX': {0: 'No', 1: 'Yes'},
    'RX_SUMM_CHEMO': {0: 'No', 1: 'Yes'},
    'RX_SUMM_IMMUNOTHERAPY': {0: 'No', 1: 'Yes'},
    'PUF_VITAL_STATUS': {0: 'Alive', 1: 'Dead'},
}

# Replace each column as a whole instead of writing strings into an integer
# column through .loc: setting values of an incompatible dtype in place is
# deprecated in recent pandas releases, while assigning a new column is not.
for column, labels in code_labels.items():
    as_data[column] = as_data[column].replace(labels)

In [None]:
#Save AutoScore data.

# The data dictionary maps each variable name ('Variable') to a human-readable
# field name ('Field Name').
data_dictionary = pd.read_csv("/content/drive/MyDrive/NCDB-GBM/Data_Dictionary.csv", encoding = 'latin1', index_col = None, low_memory = False)
FieldNames = dict(zip(data_dictionary['Variable'], data_dictionary['Field Name']))

# rename() keeps any column that has no dictionary entry under its current
# name, whereas Index.map() would turn it into a NaN label. It also makes the
# cell safe to re-run: already-renamed columns are simply left alone.
as_data = as_data.rename(columns=FieldNames)

as_data.to_csv('/content/drive/MyDrive/NCDB-GBM/autoscore_data.csv')

In [None]:
#Change variable names to field names.

# rename() leaves columns without an entry in FieldNames unchanged (Index.map()
# would replace them with NaN) and is a no-op if this cell is executed twice.
data = data.rename(columns=FieldNames)

In [None]:
#Save final data (integer-coded values, columns renamed via FieldNames).

data.to_csv('/content/drive/MyDrive/NCDB-GBM/final_data.csv')

In [None]:
#Save data for Gradio.

# Same values as the final data, but with column labels rewritten:
# spaces become underscores and hyphens are removed.
gradio = data.copy()

relabelled = gradio.columns.str.replace(' ', '_', regex=True)
relabelled = relabelled.str.replace('-', '', regex=True)
gradio.columns = relabelled

gradio.to_csv('/content/drive/MyDrive/NCDB-GBM/gradio_data.csv')