In [None]:
#Mount Google Drive at /content/drive; the file paths used by the cells below point into this mount.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import KNNImputer

from matplotlib import pyplot as plt

#Show every row when a Series/DataFrame is displayed (no truncation).
pd.options.display.max_rows = None

In [None]:
#Load the labeled CSV file, using 'PUF_CASE_ID' as the row index.

data = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/NCDB-PUFs/CNS - 2020 (Labeled).csv",
    index_col='PUF_CASE_ID',
)
data.shape

In [None]:
#Print the name of every column in the loaded data.

print(data.columns.tolist())

In [None]:
#Define variables of interest (predictor variables, inclusion/exclusion criteria, outcomes of interest).
#One name per line so that additions and removals are easy to review.

variables = [
    'DIAGNOSTIC_CONFIRMATION',
    'FACILITY_TYPE_CD',
    'FACILITY_LOCATION_CD',
    'AGE',
    'SEX',
    'RACE',
    'SPANISH_HISPANIC_ORIGIN',
    'INSURANCE_STATUS',
    'YEAR_OF_DIAGNOSIS',
    'CDCC_TOTAL_BEST',
    'HISTOLOGY',
    'PRIMARY_SITE',
    'LATERALITY',
    'DIAGNOSTIC_BIOPSY',
    'CS_SITESPECIFIC_FACTOR_1',
    'CS_SITESPECIFIC_FACTOR_2',
    'CS_SITESPECIFIC_FACTOR_3',
    'CS_SITESPECIFIC_FACTOR_8',
    'TUMOR_SIZE',
    'RX_SUMM_SURG_PRIM_SITE',
    'PHASE_I_COMBINED',
    'PHASE_II_COMBINED',
    'PHASE_III_COMBINED',
    'DX_LASTCONTACT_DEATH_MONTHS',
    'PUF_VITAL_STATUS',
]

In [None]:
#Keep only the variables of interest (in the order listed) and check data shape.

data = data.loc[:, variables]

data.shape

# Inclusion Criteria

In [None]:
#Identify meningioma patients with ICD-O-3 codes: keep rows whose 'HISTOLOGY' is one of the listed codes.

icd_codes = [
    9530, 9531, 9532, 9533, 9534,
    9535, 9537, 9538, 9539,
]
is_listed_histology = data['HISTOLOGY'].isin(icd_codes)
data = data.loc[is_listed_histology]

print('Number of included patients:', len(data))

In [None]:
#Confirm the histology codes with 'CS_SITESPECIFIC_FACTOR_1': keep only 'Grade II' and 'Grade III' rows.

is_grade_2_or_3 = data['CS_SITESPECIFIC_FACTOR_1'].isin(['Grade II', 'Grade III'])
data = data.loc[is_grade_2_or_3]

print('Number of included patients:', len(data))

In [None]:
#Apply inclusion criteria for diagnostic confirmation: keep 'Positive histology' rows.
#The column is constant after the filter, so it is dropped.

has_positive_histology = data['DIAGNOSTIC_CONFIRMATION'].eq('Positive histology')
data = data.loc[has_positive_histology].drop(columns=['DIAGNOSTIC_CONFIRMATION'])

print('Number of included patients:', len(data))

In [None]:
#Apply inclusion criteria for 'PRIMARY_SITE': keep rows coded 'C700' or 'C709', then drop the column.

sites = ['C700', 'C709']
is_included_site = data['PRIMARY_SITE'].isin(sites)
data = data.loc[is_included_site].drop(columns=['PRIMARY_SITE'])

print('Number of included patients:', len(data))

In [None]:
#Apply inclusion criteria for age: keep rows with 'AGE' strictly greater than 18.
#NOTE(review): patients aged exactly 18 are excluded by this comparison - confirm this matches the study protocol.

is_older_than_18 = data['AGE'].gt(18)
data = data.loc[is_older_than_18]

print('Number of included patients:', len(data))

In [None]:
#Apply inclusion criteria for the year of diagnosis: keep 2010 through 2017 (inclusive), then drop the column.

years = list(range(2010, 2018))
in_study_period = data['YEAR_OF_DIAGNOSIS'].isin(years)
data = data.loc[in_study_period].drop(columns=['YEAR_OF_DIAGNOSIS'])

print('Number of included patients:', len(data))

# Exclusion Criteria

In [None]:
#Exclude patients without outcome information.
#A row is kept when 'DX_LASTCONTACT_DEATH_MONTHS' is present and 'PUF_VITAL_STATUS' is not the label 'Unknown'.

before = len(data)
has_followup_months = data['DX_LASTCONTACT_DEATH_MONTHS'].notna()
status_not_unknown = data['PUF_VITAL_STATUS'].ne('Unknown')
data = data.loc[has_followup_months & status_not_unknown]
after = len(data)

excluded = before - after
print('Number of patients before exclusion: ', before)
print('Number of patients after exclusion: ', after)
print('Number of patients excluded with this criteria: ', excluded)

# Renaming and Merging Response Values

In [None]:
#Simplify the column 'HISTOLOGY' by replacing each ICD-O-3 code with a descriptive label.
#The labels are applied through one lookup that builds a new column, so text is never
#written element by element into the numeric code column (recent pandas versions warn
#about that incompatible-dtype assignment).

histology_labels = {
    9530: 'Meningioma (NOS)',
    9531: 'Meningothelial meningioma',
    9532: 'Fibrous meningioma',
    9533: 'Psammomatous meningioma',
    9534: 'Angiomatous meningioma',
    9535: 'Hemangioblastic meningioma',
    9537: 'Transitional meningioma',
    9538: 'Clear cell/rhabdoid meningioma',
    9539: 'Atypical meningioma/meningeal sarcomatosis',
}

#Values without an entry in the lookup are kept unchanged.
data = data.assign(
    HISTOLOGY=data['HISTOLOGY'].map(lambda code: histology_labels.get(code, code))
)

data['HISTOLOGY'].value_counts(normalize=False, dropna=False)

In [None]:
#Simplify the column 'LATERALITY': fold 'Not applicable' into 'Unknown'.

laterality_not_applicable = data['LATERALITY'].eq('Not applicable')
data.loc[laterality_not_applicable, 'LATERALITY'] = 'Unknown'

data['LATERALITY'].value_counts(dropna=False)

In [None]:
#Simplify the column 'TUMOR_SIZE': fold the two labels below into 'Unknown'.

tumor_size_to_unknown = ['Microscopic foci only', 'No mass or tumor found']
data.loc[data['TUMOR_SIZE'].isin(tumor_size_to_unknown), 'TUMOR_SIZE'] = 'Unknown'

data['TUMOR_SIZE'].value_counts(dropna=False)

In [None]:
#Create the column 'RESECTIVE_SURGERY' from 'RX_SUMM_SURG_PRIM_SITE'.
#Labels that are not listed in the lookup (and missing values) become NaN.

resective_surgery_by_procedure = {
    'Gross total resection': 'Yes',
    'Subtotal resection': 'Yes',
    'Surgery was performed but extent of resection is unknown': 'Yes',
    'Biopsy': 'No',
    'No surgery was performed': 'No',
    'Gross total lobectomy': 'Yes',
    'Partial lobectomy': 'Yes',
    'Resection of tumor of spinal cord or nerve': 'Yes',
    'Tumor destruction': 'Yes',
    'Unknown': 'Unknown',
}
data['RESECTIVE_SURGERY'] = data['RX_SUMM_SURG_PRIM_SITE'].map(resective_surgery_by_procedure)

data['RESECTIVE_SURGERY'].value_counts(dropna=False)

In [None]:
#Create the column 'EXTENT_OF_RESECTION' from 'RX_SUMM_SURG_PRIM_SITE', then drop the source column.
#Labels that are not listed in the lookup (and missing values) become NaN.

extent_by_procedure = {
    'Gross total resection': 'Gross total resection',
    'Subtotal resection': 'Subtotal resection',
    'Surgery was performed but extent of resection is unknown': 'Unknown',
    'Biopsy': 'No resective surgery was performed',
    'No surgery was performed': 'No resective surgery was performed',
    'Gross total lobectomy': 'Unknown',
    'Partial lobectomy': 'Unknown',
    'Resection of tumor of spinal cord or nerve': 'Unknown',
    'Tumor destruction': 'Unknown',
    'Unknown': 'Unknown',
}
data['EXTENT_OF_RESECTION'] = data['RX_SUMM_SURG_PRIM_SITE'].map(extent_by_procedure)

data = data.drop(columns=['RX_SUMM_SURG_PRIM_SITE'])

data['EXTENT_OF_RESECTION'].value_counts(dropna=False)

In [None]:
#Simplify the column 'PHASE_I_COMBINED' in a new column 'PHASE_I_RADIOTHERAPY'.
#'No radiation treatment' -> 'No', 'Unknown' -> 'Unknown', every other value -> 'Yes'.
#NOTE(review): rows where 'PHASE_I_COMBINED' is missing (NaN) also end up as 'Yes' - confirm this is intended.

phase_i_lookup = {'No radiation treatment': 'No', 'Unknown': 'Unknown'}
data['PHASE_I_RADIOTHERAPY'] = data['PHASE_I_COMBINED'].map(phase_i_lookup).fillna('Yes')
data = data.drop(columns=['PHASE_I_COMBINED'])

data['PHASE_I_RADIOTHERAPY'].value_counts(dropna=False)

In [None]:
#Simplify the column 'PHASE_II_COMBINED' in a new column 'PHASE_II_RADIOTHERAPY'.
#'No radiation treatment' -> 'No', 'Unknown' -> 'Unknown', every other value -> 'Yes'.
#NOTE(review): rows where 'PHASE_II_COMBINED' is missing (NaN) also end up as 'Yes' - confirm this is intended.

phase_ii_lookup = {'No radiation treatment': 'No', 'Unknown': 'Unknown'}
data['PHASE_II_RADIOTHERAPY'] = data['PHASE_II_COMBINED'].map(phase_ii_lookup).fillna('Yes')
data = data.drop(columns=['PHASE_II_COMBINED'])

data['PHASE_II_RADIOTHERAPY'].value_counts(dropna=False)

In [None]:
#Simplify the column 'PHASE_III_COMBINED' in a new column 'PHASE_III_RADIOTHERAPY'.
#'No radiation treatment' -> 'No', 'Unknown' -> 'Unknown', every other value -> 'Yes'.
#NOTE(review): rows where 'PHASE_III_COMBINED' is missing (NaN) also end up as 'Yes' - confirm this is intended.

phase_iii_lookup = {'No radiation treatment': 'No', 'Unknown': 'Unknown'}
data['PHASE_III_RADIOTHERAPY'] = data['PHASE_III_COMBINED'].map(phase_iii_lookup).fillna('Yes')
data = data.drop(columns=['PHASE_III_COMBINED'])

data['PHASE_III_RADIOTHERAPY'].value_counts(dropna=False)

In [None]:
#Merge 'PHASE_I_RADIOTHERAPY', 'PHASE_II_RADIOTHERAPY', and 'PHASE_III_RADIOTHERAPY' in 'RADIATION_TREATMENT' column.
#'Yes' if any phase is 'Yes'; 'No' if all three phases are 'No'; 'Unknown' for every other combination.

phase_columns = ['PHASE_I_RADIOTHERAPY', 'PHASE_II_RADIOTHERAPY', 'PHASE_III_RADIOTHERAPY']
phase_values = data[phase_columns]
any_phase_yes = phase_values.eq('Yes').any(axis=1)
all_phases_no = phase_values.eq('No').all(axis=1)

data.loc[any_phase_yes, 'RADIATION_TREATMENT'] = 'Yes'
data.loc[all_phases_no, 'RADIATION_TREATMENT'] = 'No'
data['RADIATION_TREATMENT'] = data['RADIATION_TREATMENT'].fillna('Unknown')
data = data.drop(columns=phase_columns)

data['RADIATION_TREATMENT'].value_counts(dropna=False)

In [None]:
#Save the filtered and relabeled data (before imputation) as a CSV file.

data.to_csv(path_or_buf='/content/drive/MyDrive/NCDB-Meningioma/data.csv')

# Imputation

In [None]:
#Define numerical and categorical columns from the column dtypes ('number' vs 'object').

num_cols = data.select_dtypes(include='number').columns.tolist()
print(f'Numerical columns: {num_cols}', '\n')

cat_cols = data.select_dtypes(include='object').columns.tolist()
print(f'Categorical columns: {cat_cols}')

In [None]:
#Remove outcomes from the column lists so that they are not treated as predictors in the steps below.

num_cols_remove = ['DX_LASTCONTACT_DEATH_MONTHS']
cat_cols_remove = ['PUF_VITAL_STATUS']

num_cols = [name for name in num_cols if name not in num_cols_remove]
cat_cols = [name for name in cat_cols if name not in cat_cols_remove]

In [None]:
#Check missing values for numerical columns.
#Percentage of missing values per column, rounded to two decimals and sorted from highest to lowest.

num_missing_pct = data[num_cols].isnull().mean().round(4).mul(100).sort_values(ascending=False)

print(num_missing_pct, '\n')

num_with_missing = list(num_missing_pct[num_missing_pct > 0].index)

print('Numerical variables with missing values: ', num_with_missing, '\n')

print('Number of numerical variables with missing values: ', len(num_with_missing), '\n')

#Columns with more than 25% missing values are marked for exclusion.
missing_num = list(num_missing_pct[num_missing_pct > 25].index)

print('Excluded numerical variables: ', missing_num)

In [None]:
#Drop numerical columns with missing values over 25%.

data = data.drop(columns=missing_num)

In [None]:
#Define new numerical columns: remove the dropped columns from the list.

num_cols = [name for name in num_cols if name not in missing_num]

In [None]:
#Impute missing numerical values.
#Each missing entry is estimated from the 5 nearest rows (uniform weights, NaN-aware Euclidean distance).

num_imputer = KNNImputer(n_neighbors=5, weights='uniform', metric='nan_euclidean')

#The imputer cannot be fitted on a selection with zero columns, so the step is skipped
#when no numerical predictor is left after the exclusions above.
if num_cols:
    data[num_cols] = num_imputer.fit_transform(data[num_cols])

In [None]:
#Check missing values for categorical columns.
#Percentage of missing values per column, rounded to two decimals and sorted from highest to lowest.

cat_missing_pct = data[cat_cols].isnull().mean().round(4).mul(100).sort_values(ascending=False)

print(cat_missing_pct, '\n')

cat_with_missing = list(cat_missing_pct[cat_missing_pct > 0].index)

print('Categorical variables with missing values: ', cat_with_missing, '\n')

print('Number of categorical variables with missing values: ', len(cat_with_missing), '\n')

#Columns with more than 25% missing values are marked for exclusion.
missing_cat = list(cat_missing_pct[cat_missing_pct > 25].index)

print('Excluded categorical variables: ', missing_cat)

In [None]:
#Drop categorical columns with missing values over 25%.

data = data.drop(columns=missing_cat)

In [None]:
#Define new categorical columns: remove the dropped columns from the list.

cat_cols = [name for name in cat_cols if name not in missing_cat]

In [None]:
#Replace missing categorical values with 'Unknown'.
#The filled column is assigned back explicitly. Calling fillna(inplace=True) on data[col]
#modifies a temporary object: pandas 2.x warns about it and, with Copy-on-Write enabled,
#the DataFrame is left unchanged.

for col in cat_cols:
    data[col] = data[col].fillna('Unknown')

In [None]:
#Save the imputed data as a CSV file.

data.to_csv(path_or_buf='/content/drive/MyDrive/NCDB-Meningioma/imputed_data.csv')

# Final Touches

In [None]:
#Change variable names to field names.
#The data dictionary CSV provides the 'Variable' -> 'Field Name' lookup.

data_dictionary = pd.read_csv("/content/drive/MyDrive/NCDB-Meningioma/Data_Dictionary.csv", encoding = 'latin1', index_col = None, low_memory = False)
FieldNames = dict(zip(data_dictionary['Variable'], data_dictionary['Field Name']))

#Report columns that have no dictionary entry instead of losing them silently.
columns_without_field_name = [column for column in data.columns if column not in FieldNames]
if columns_without_field_name:
    print('Columns without a field name in the data dictionary: ', columns_without_field_name)

#rename() leaves such columns under their current name; mapping the column index
#through the dictionary would replace their names with NaN.
data = data.rename(columns=FieldNames)

In [None]:
#Manual label encoding.
#For every column below, each label is replaced by its position in the list (first label -> 0).
#Values that are not listed are left unchanged before the integer cast.

label_order = {
    'Histology': [
        'Meningioma (NOS)',
        'Meningothelial meningioma',
        'Fibrous meningioma',
        'Psammomatous meningioma',
        'Angiomatous meningioma',
        'Hemangioblastic meningioma',
        'Transitional meningioma',
        'Clear cell/rhabdoid meningioma',
        'Atypical meningioma/meningeal sarcomatosis',
    ],
    'Grade': [
        'Grade II',
        'Grade III',
    ],
    'Facility Type': [
        'Academic/Research Program',
        'Comprehensive Community Cancer Program',
        'Integrated Network Cancer Program',
        'Community Cancer Program',
        'Other or Unknown',
    ],
    'Facility Location': [
        'South Atlantic',
        'East North Central',
        'Middle Atlantic',
        'Pacific',
        'West South Central',
        'West North Central',
        'East South Central',
        'New England',
        'Mountain',
        'Unknown or Other',
    ],
    'Sex': [
        'Female',
        'Male',
    ],
    'Race': [
        'White',
        'Black',
        'Asian Indian or Pakistani',
        'Chinese',
        'Filipino',
        'American Indian, Aleutian, or Eskimo',
        'Vietnamese',
        'Korean',
        'Japanese',
        'Hawaiian',
        'Other or Unknown',
    ],
    'Hispanic Ethnicity': [
        'No',
        'Yes',
        'Unknown',
    ],
    'Primary Payor': [
        'Private insurance',
        'Medicare',
        'Medicaid',
        'Other government',
        'Not insured',
        'Unknown',
    ],
    'Charlson-Deyo Score': [
        '0',
        '1',
        '2',
        'Greater than 3',
    ],
    'Laterality': [
        'Right',
        'Left',
        'Bilateral',
        'Midline',
        'Unknown',
    ],
    'Diagnostic Biopsy': [
        'No',
        'Yes',
        'Unknown',
    ],
    'Karnofsky Performance Scale': [
        'KPS 0-20',
        'KPS 21-40',
        'KPS 41-60',
        'KPS 61-80',
        'KPS 81-100',
        'Unknown',
    ],
    'Focality': [
        'Unifocal',
        'Multifocal',
        'Unknown',
    ],
    'Tumor Size': [
        '< 2 cm',
        '2 - 3.9 cm',
        '4 - 5.9 cm',
        '6 - 7.9 cm',
        '8 - 9.9 cm',
        '10 - 11.9 cm',
        '12 - 13.9 cm',
        '14 - 15.9 cm',
        '16 - 17.9 cm',
        '18 - 19.9 cm',
        '> 20 cm',
        'Unknown',
    ],
    'Ki-67 Labeling Index': [
        '0-20%',
        '21-40%',
        '41-60%',
        '61-80%',
        '81-100%',
        'Normal (no percentage available)',
        'Slightly elevated (no percentage available)',
        'Elevated (no percentage available)',
        'Unknown',
    ],
    'Extent of Resection': [
        'No resective surgery was performed',
        'Gross total resection',
        'Subtotal resection',
        'Unknown',
    ],
    'Resective Surgery': [
        'No',
        'Yes',
        'Unknown',
    ],
    'Radiation Treatment': [
        'No',
        'Yes',
        'Unknown',
    ],
}

#Every encoded column is cast to int, including 'Ki-67 Labeling Index',
#so that all encoded columns share the same integer dtype.
columns = list(label_order)

for column in columns:
    labels = label_order[column]
    codes = {label: code for code, label in enumerate(labels)}
    encoded = data[column].map(lambda value, codes=codes: codes.get(value, value))
    try:
        data[column] = encoded.astype(int)
    except (TypeError, ValueError) as error:
        #Name the column and the offending values so that the lookup above can be completed.
        unmapped = encoded[~encoded.isin(list(range(len(labels))))].unique().tolist()
        raise ValueError(
            "Column '{}' has values that cannot be encoded as integers: {}".format(column, unmapped)
        ) from error

In [None]:
#Save data for Gradio.
#Work on a copy whose column names have spaces replaced by underscores and hyphens removed.

gradio = data.copy()

gradio.columns = (
    gradio.columns
    .str.replace(' ', '_', regex=False)
    .str.replace('-', '', regex=False)
)

gradio.to_csv(path_or_buf='/content/drive/MyDrive/NCDB-Meningioma/gradio_data.csv')

In [None]:
#Save the final, label-encoded data as a CSV file.

data.to_csv(path_or_buf='/content/drive/MyDrive/NCDB-Meningioma/final_data.csv')