In [None]:
#Mount Google Drive at /content/drive; the CSV paths used by later cells all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import KNNImputer

from matplotlib import pyplot as plt

#Never truncate rows when a Series/DataFrame is displayed (used for the value_counts outputs below).
pd.options.display.max_rows = None

In [None]:
#Load the labeled Bone & Joint 2020 CSV, using PUF_CASE_ID as the row index.

data = pd.read_csv(
    "/content/drive/MyDrive/NCDB-PUFs/Bone & Joint - 2020 (Labeled).csv",
    index_col='PUF_CASE_ID',
)
data.shape

In [None]:
#Print the name of every column in the loaded file.

print(data.columns.tolist())

In [None]:
#Columns kept for the analysis: predictors, fields needed for inclusion/exclusion, and outcomes.
#The order here is the column order of the filtered frame.

variables = [
    'FACILITY_TYPE_CD',
    'FACILITY_LOCATION_CD',
    'AGE',
    'SEX',
    'RACE',
    'SPANISH_HISPANIC_ORIGIN',
    'INSURANCE_STATUS',
    'CDCC_TOTAL_BEST',
    'YEAR_OF_DIAGNOSIS',
    'PRIMARY_SITE',
    'HISTOLOGY',
    'DIAGNOSTIC_CONFIRMATION',
    'DIAGNOSTIC_BIOPSY',
    'TUMOR_SIZE',
    'CS_SITESPECIFIC_FACTOR_1',
    'CS_SITESPECIFIC_FACTOR_2',
    'TNM_PATH_N',
    'TNM_PATH_M',
    'RX_SUMM_SURG_PRIM_SITE',
    'RX_SUMM_SURGICAL_MARGINS',
    'PHASE_I_COMBINED',
    'PHASE_II_COMBINED',
    'PHASE_III_COMBINED',
    'RX_SUMM_CHEMO',
    #Outcomes (excluded from imputation later on).
    'DX_LASTCONTACT_DEATH_MONTHS',
    'PUF_VITAL_STATUS',
]

In [None]:
#Keep only the variables of interest (in the listed order) and confirm the resulting shape.

data = data.loc[:, variables]

data.shape

#Inclusion Criteria

In [None]:
#Keep chordoma patients only, identified by ICD-O-3 histology codes 9370-9372
#(the same codes are relabelled as chordoma subtypes further down).

icd_codes = [9370, 9371, 9372]
data = data[data['HISTOLOGY'].isin(icd_codes)]

print('Number of included patients:', data.shape[0])

In [None]:
#Keep only cases confirmed by positive histology; the column is constant afterwards, so it is dropped.

histologically_confirmed = data['DIAGNOSTIC_CONFIRMATION'].eq('Positive histology')
data = data.loc[histologically_confirmed].drop(columns='DIAGNOSTIC_CONFIRMATION')

print('Number of included patients:', len(data))

In [None]:
#Keep only tumors whose 'PRIMARY_SITE' code is C412 or C414.

sites = ['C412', 'C414']
data = data.loc[data['PRIMARY_SITE'].isin(sites)]

print('Number of included patients:', len(data))

In [None]:
#Keep patients whose AGE is greater than 18.
#NOTE(review): the comparison is strict, so patients aged exactly 18 are excluded - confirm this is intended.

data = data.loc[data['AGE'].gt(18)]

print('Number of included patients:', len(data))

In [None]:
#Show the distribution of diagnosis years, then drop the column.
#The counts are printed explicitly: a bare expression is only displayed when it is the last
#statement of a cell, so the original value_counts() result was silently discarded.

print(data['YEAR_OF_DIAGNOSIS'].value_counts(normalize=False, dropna=False))
data = data.drop(['YEAR_OF_DIAGNOSIS'], axis=1)

#Exclusion Criteria

In [None]:
#Drop patients with no follow-up time or with an 'Unknown' vital status, and report how many were removed.

before = len(data)
has_followup = data['DX_LASTCONTACT_DEATH_MONTHS'].notna()
known_vital_status = data['PUF_VITAL_STATUS'].ne('Unknown')
data = data.loc[has_followup & known_vital_status]
after = len(data)

excluded = before - after
print('Number of patients before exclusion: ', before)
print('Number of patients after exclusion: ', after)
print('Number of patients excluded with this criteria: ', excluded)

#Renaming and Merging Response Values

In [None]:
#Replace the ICD-O-3 histology codes in 'HISTOLOGY' with descriptive labels.
#The whole column is rebuilt in one step instead of being overwritten label-by-label with .loc:
#writing strings into a numerically typed column is an incompatible-dtype setitem, which pandas
#has deprecated (FutureWarning since 2.1, PDEP-6). Codes without an entry are left unchanged.
#Assumes the codes are stored as numbers, as in the inclusion filter above.

histology_labels = {
    9370: 'Chordoma, NOS',
    9371: 'Chondroid chordoma',
    9372: 'Dedifferentiated chordoma',
}
data['HISTOLOGY'] = data['HISTOLOGY'].map(lambda code: histology_labels.get(code, code))

data['HISTOLOGY'].value_counts(normalize=False, dropna=False)

In [None]:
#Simplify the column 'PRIMARY_SITE': replace each site code with a readable site name.

site_labels = {
    'C412': 'Spine',
    'C414': 'Sacrum/Pelvis',
}
for site_code, site_label in site_labels.items():
    data.loc[data['PRIMARY_SITE'] == site_code, 'PRIMARY_SITE'] = site_label

data['PRIMARY_SITE'].value_counts(normalize=False, dropna=False)

In [None]:
#Simplify the column 'TUMOR_SIZE': entries that carry no size bracket are folded into 'Unknown'.

tumor_size_without_bracket = ['Microscopic foci only', 'No mass or tumor found']
data.loc[data['TUMOR_SIZE'].isin(tumor_size_without_bracket), 'TUMOR_SIZE'] = 'Unknown'

data['TUMOR_SIZE'].value_counts(normalize=False, dropna=False)

In [None]:
#Simplify the column 'CS_SITESPECIFIC_FACTOR_1': entries that carry no size bracket are folded into 'Unknown'.

ssf1_without_bracket = ['Microscopic foci only', 'No mass or tumor found']
data.loc[data['CS_SITESPECIFIC_FACTOR_1'].isin(ssf1_without_bracket), 'CS_SITESPECIFIC_FACTOR_1'] = 'Unknown'

data['CS_SITESPECIFIC_FACTOR_1'].value_counts(normalize=False, dropna=False)

In [None]:
#Simplify the column 'CS_SITESPECIFIC_FACTOR_2': entries that carry no size bracket are folded into 'Unknown'.

ssf2_without_bracket = ['Microscopic foci only', 'No mass or tumor found']
data.loc[data['CS_SITESPECIFIC_FACTOR_2'].isin(ssf2_without_bracket), 'CS_SITESPECIFIC_FACTOR_2'] = 'Unknown'

data['CS_SITESPECIFIC_FACTOR_2'].value_counts(normalize=False, dropna=False)

In [None]:
#Simplify the column 'TNM_PATH_N' to No / Yes / Unknown or not applicable.
#Values not listed in the table (including missing ones) are left as they are.

n_stage_labels = {
    'pNX': 'Unknown or not applicable',
    'pN0': 'No',
    'pN1': 'Yes',
}
for n_stage, n_label in n_stage_labels.items():
    data.loc[data['TNM_PATH_N'] == n_stage, 'TNM_PATH_N'] = n_label

data['TNM_PATH_N'].value_counts(normalize=False, dropna=False)

In [None]:
#Simplify the column 'TNM_PATH_M' to No / Yes / Unknown or not applicable.
#Values not listed in the table (including missing ones) are left as they are.

m_stage_labels = {
    'pMX': 'Unknown or not applicable',
    'pM0': 'No',
    'pM1': 'Yes',
    'pM1a': 'Yes',
    'pM1b': 'Yes',
    'c1': 'Yes',
    'c1A': 'Yes',
    'c1B': 'Yes',
}
for m_stage, m_label in m_stage_labels.items():
    data.loc[data['TNM_PATH_M'] == m_stage, 'TNM_PATH_M'] = m_label

data['TNM_PATH_M'].value_counts(normalize=False, dropna=False)

In [None]:
#Simplify the column 'RX_SUMM_SURG_PRIM_SITE' in place: every listed procedure becomes 'Yes',
#'No surgery was performed' becomes 'No'. Any other value is left untouched.

surgical_procedures = [
    'Radical excision or resection of lesion with limb salvage',
    'Local excision',
    'Partial resection',
    'Surgery, NOS',
    'Local tumor destruction or excision, NOS',
    'Major amputation, NOS',
    'Partial amputation of limb',
    'Hindquarter, including ilium or hip bone',
    'Hemipelvectomy, NOS',
    'Internal hemipelvectomy',
    'Local tumor destruction',
    'Amputation of limb',
]
data.loc[data['RX_SUMM_SURG_PRIM_SITE'].isin(surgical_procedures), 'RX_SUMM_SURG_PRIM_SITE'] = 'Yes'
data.loc[data['RX_SUMM_SURG_PRIM_SITE'].eq('No surgery was performed'), 'RX_SUMM_SURG_PRIM_SITE'] = 'No'

data['RX_SUMM_SURG_PRIM_SITE'].value_counts(normalize=False, dropna=False)

In [None]:
#Simplify the column 'RX_SUMM_SURGICAL_MARGINS': non-evaluable margins count as 'Unknown',
#and microscopic/macroscopic residual tumor are merged into a single 'Residual tumor' level.

margin_labels = {
    'Margins not evaluable': 'Unknown',
    'Microscopic residual tumor': 'Residual tumor',
    'Macroscopic residual tumor': 'Residual tumor',
}
for margin_status, margin_label in margin_labels.items():
    data.loc[data['RX_SUMM_SURGICAL_MARGINS'] == margin_status, 'RX_SUMM_SURGICAL_MARGINS'] = margin_label

data['RX_SUMM_SURGICAL_MARGINS'].value_counts(normalize=False, dropna=False)

In [None]:
#Collapse 'PHASE_I_COMBINED' into a new column 'PHASE_I_RADIOTHERAPY' (No / Unknown / Yes), then drop the source.
#Only the two listed labels are mapped explicitly; every other value becomes 'Yes'.
#NOTE(review): missing (NaN) source values also end up as 'Yes' - confirm this is intended.

phase_i_labels = {'No radiation treatment': 'No', 'Unknown': 'Unknown'}
data['PHASE_I_RADIOTHERAPY'] = data['PHASE_I_COMBINED'].map(phase_i_labels).fillna('Yes')
data = data.drop(columns='PHASE_I_COMBINED')

data['PHASE_I_RADIOTHERAPY'].value_counts(normalize=False, dropna=False)

In [None]:
#Collapse 'PHASE_II_COMBINED' into a new column 'PHASE_II_RADIOTHERAPY' (No / Unknown / Yes), then drop the source.
#Only the two listed labels are mapped explicitly; every other value becomes 'Yes'.
#NOTE(review): missing (NaN) source values also end up as 'Yes' - confirm this is intended.

phase_ii_labels = {'No radiation treatment': 'No', 'Unknown': 'Unknown'}
data['PHASE_II_RADIOTHERAPY'] = data['PHASE_II_COMBINED'].map(phase_ii_labels).fillna('Yes')
data = data.drop(columns='PHASE_II_COMBINED')

data['PHASE_II_RADIOTHERAPY'].value_counts(normalize=False, dropna=False)

In [None]:
#Collapse 'PHASE_III_COMBINED' into a new column 'PHASE_III_RADIOTHERAPY' (No / Unknown / Yes), then drop the source.
#Only the two listed labels are mapped explicitly; every other value becomes 'Yes'.
#NOTE(review): missing (NaN) source values also end up as 'Yes' - confirm this is intended.

phase_iii_labels = {'No radiation treatment': 'No', 'Unknown': 'Unknown'}
data['PHASE_III_RADIOTHERAPY'] = data['PHASE_III_COMBINED'].map(phase_iii_labels).fillna('Yes')
data = data.drop(columns='PHASE_III_COMBINED')

data['PHASE_III_RADIOTHERAPY'].value_counts(normalize=False, dropna=False)

In [None]:
#Merge the three per-phase flags into a single 'RADIATION_TREATMENT' column, then drop the per-phase columns.
#  'Yes'     - at least one phase is 'Yes'
#  'No'      - all three phases are 'No'
#  'Unknown' - anything else (no 'Yes' and at least one phase that is not 'No')

phase_columns = ['PHASE_I_RADIOTHERAPY', 'PHASE_II_RADIOTHERAPY', 'PHASE_III_RADIOTHERAPY']
any_phase_given = data[phase_columns].eq('Yes').any(axis=1)
no_phase_given = data[phase_columns].eq('No').all(axis=1)

radiation_treatment = pd.Series('Unknown', index=data.index, dtype=object)
radiation_treatment[any_phase_given.to_numpy()] = 'Yes'
radiation_treatment[no_phase_given.to_numpy()] = 'No'

data['RADIATION_TREATMENT'] = radiation_treatment.to_numpy()
data = data.drop(columns=phase_columns)

data['RADIATION_TREATMENT'].value_counts(normalize=False, dropna=False)

In [None]:
#Simplify the column 'RX_SUMM_CHEMO' to Yes / No / Unknown.
#'Yes' means chemotherapy was administered; recommended-but-not-given, refused, contraindicated
#and not-planned all count as 'No'. Labels not listed here are left unchanged.
#NOTE(review): the last 'No' label contains a doubled "was was"; it only matches if the source
#file spells it the same way - confirm against the labeled CSV.

chemo_labels = {
    'None, chemotherapy was not part of the planned first course of therapy': 'No',
    'Single-agent chemotherapy administered as first course therapy': 'Yes',
    'Multiagent chemotherapy administered as first course therapy': 'Yes',
    'Chemotherapy was recommended by the physician, but was refused by the patient, a family member, or guardian': 'No',
    'Chemotherapy administered as first course therapy, but the type and number of agents is not documented in patient record': 'Yes',
    'Chemotherapy was not recommended/administered because it was contraindicated due to patient risk factors': 'No',
    'Chemotherapy was not administered because the patient died prior to planned or recommended therapy': 'No',
    'Chemotherapy was was recommended but was not administered as part of the first course of therapy': 'No',
    'Unknown': 'Unknown',
}
for chemo_status, chemo_label in chemo_labels.items():
    data.loc[data['RX_SUMM_CHEMO'] == chemo_status, 'RX_SUMM_CHEMO'] = chemo_label

data['RX_SUMM_CHEMO'].value_counts(normalize=False, dropna=False)

In [None]:
#Write the filtered and relabelled cohort to Drive (state before imputation and encoding).

filtered_csv_path = '/content/drive/MyDrive/NCDB-Chordoma/data.csv'
data.to_csv(filtered_csv_path)

#Imputation

In [None]:
#Split the columns by dtype: numeric columns vs. object (string-valued) columns.

num_cols = data.select_dtypes(include='number').columns.tolist()
print(f'Numerical columns: {num_cols}', '\n')

cat_cols = data.select_dtypes(include='object').columns.tolist()
print(f'Categorical columns: {cat_cols}')

In [None]:
#Take the outcome columns out of both lists so they are neither checked for missingness nor imputed.

num_cols_remove = ['DX_LASTCONTACT_DEATH_MONTHS']
cat_cols_remove = ['PUF_VITAL_STATUS']

num_cols = [name for name in num_cols if name not in num_cols_remove]
cat_cols = [name for name in cat_cols if name not in cat_cols_remove]

In [None]:
#Report the percentage of missing values per numerical column (highest first) and
#collect the columns above the 25% threshold in `missing_num` for removal.

num_missing_pct = data[num_cols].isna().mean().round(4).mul(100).sort_values(ascending=False)

print(num_missing_pct, '\n')

num_with_missing = list(num_missing_pct[num_missing_pct > 0].index)

print('Numerical variables with missing values: ', num_with_missing, '\n')

print('Number of numerical variables with missing values: ', len(num_with_missing), '\n')

missing_num = list(num_missing_pct[num_missing_pct > 25].index)

print('Excluded numerical variables: ', missing_num)

In [None]:
#Remove the numerical columns flagged above (more than 25% missing).

data = data.drop(columns=missing_num)

In [None]:
#Keep `num_cols` in sync with the frame: forget the columns that were just dropped.

num_cols = [name for name in num_cols if name not in missing_num]

In [None]:
#Fill the remaining gaps in the numerical predictors with a k-nearest-neighbours imputer
#(5 neighbours, uniform weights, NaN-aware Euclidean distance). Outcome columns are not part of `num_cols`.

num_imputer = KNNImputer(n_neighbors=5, weights='uniform', metric='nan_euclidean')
imputed_numerical = num_imputer.fit_transform(data[num_cols])
data[num_cols] = imputed_numerical

In [None]:
#Report the percentage of missing values per categorical column (highest first) and
#collect the columns above the 25% threshold in `missing_cat` for removal.

cat_missing_pct = data[cat_cols].isna().mean().round(4).mul(100).sort_values(ascending=False)

print(cat_missing_pct, '\n')

cat_with_missing = list(cat_missing_pct[cat_missing_pct > 0].index)

print('Categorical variables with missing values: ', cat_with_missing, '\n')

print('Number of categorical variables with missing values: ', len(cat_with_missing), '\n')

missing_cat = list(cat_missing_pct[cat_missing_pct > 25].index)

print('Excluded categorical variables: ', missing_cat)

In [None]:
#Remove the categorical columns flagged above (more than 25% missing).

data = data.drop(columns=missing_cat)

In [None]:
#Keep `cat_cols` in sync with the frame: forget the columns that were just dropped.

cat_cols = [name for name in cat_cols if name not in missing_cat]

In [None]:
#Replace missing categorical values with 'Unknown'.
#The filled column is assigned back explicitly. `data[col].fillna(..., inplace=True)` works on an
#intermediate object (chained assignment): pandas 2.2 warns about it, and under Copy-on-Write it
#leaves `data` unchanged.

for col in cat_cols:
    data[col] = data[col].fillna('Unknown')

In [None]:
#Write the imputed cohort to Drive (state before renaming and label encoding).

imputed_csv_path = '/content/drive/MyDrive/NCDB-Chordoma/imputed_data.csv'
data.to_csv(imputed_csv_path)

#Final Touches

In [None]:
#Change variable names to the field names listed in the data dictionary ('Variable' -> 'Field Name').
#rename() is used instead of Index.map(): map() replaces every column that is missing from the
#dictionary with NaN (and wipes all names if the cell is run twice), whereas rename() leaves
#unmapped columns untouched. Unmapped columns are reported so a gap in the dictionary is visible.

data_dictionary = pd.read_csv("/content/drive/MyDrive/NCDB-Chordoma/Data_Dictionary.csv", encoding = 'latin1', index_col = None, low_memory = False)
FieldNames = dict(zip(data_dictionary['Variable'], data_dictionary['Field Name']))

unmapped_columns = [name for name in data.columns if name not in FieldNames]
if unmapped_columns:
    print('Columns without a field name in the data dictionary (kept as-is):', unmapped_columns)

data = data.rename(columns=FieldNames)

In [None]:
#Manual label encoding.
#Every categorical label is replaced by a fixed integer code taken from the tables below, then the
#encoded columns (plus 'Age at Diagnosis') are cast to int.
#Each column is rebuilt with map() rather than overwritten label-by-label with .loc, so integers are
#never written in place into a string-typed column. Values without an entry in the table are passed
#through unchanged; if such a value cannot be converted, the final astype(int) raises.

#Code tables shared by several columns.
yes_no_unknown_codes = {'No': 0, 'Yes': 1, 'Unknown': 2}
yes_no_not_applicable_codes = {'No': 0, 'Yes': 1, 'Unknown or not applicable': 2}
tumor_size_codes = {
    '< 2 cm': 0,
    '2 - 3.9 cm': 1,
    '4 - 5.9 cm': 2,
    '6 - 7.9 cm': 3,
    '8 - 9.9 cm': 4,
    '10 - 11.9 cm': 5,
    '12 - 13.9 cm': 6,
    '14 - 15.9 cm': 7,
    '16 - 17.9 cm': 8,
    '18 - 19.9 cm': 9,
    '> 20 cm': 10,
    'Unknown': 11,
}

#Column name -> {label: integer code}.
label_codes = {
    'Facility Type': {
        'Academic/Research Program': 0,
        'Comprehensive Community Cancer Program': 1,
        'Integrated Network Cancer Program': 2,
        'Community Cancer Program': 3,
        'Other or Unknown': 4,
    },
    'Facility Location': {
        'South Atlantic': 0,
        'East North Central': 1,
        'Middle Atlantic': 2,
        'Pacific': 3,
        'West South Central': 4,
        'West North Central': 5,
        'East South Central': 6,
        'New England': 7,
        'Mountain': 8,
        'Unknown or Other': 9,
    },
    'Sex': {
        'Female': 0,
        'Male': 1,
    },
    'Race': {
        'White': 0,
        'Black': 1,
        'Asian Indian or Pakistani': 2,
        'American Indian, Aleutian, or Eskimo': 3,
        'Chinese': 4,
        'Filipino': 5,
        'Vietnamese': 6,
        'Hawaiian': 7,
        'Japanese': 8,
        'Korean': 9,
        'Other or Unknown': 10,
    },
    'Hispanic Ethnicity': yes_no_unknown_codes,
    'Insurance Status': {
        'Private insurance': 0,
        'Medicare': 1,
        'Medicaid': 2,
        'Other government': 3,
        'Not insured': 4,
        'Unknown': 5,
    },
    #The score labels are strings in the source column.
    'Charlson-Deyo Score': {
        '0': 0,
        '1': 1,
        '2': 2,
        'Greater than 3': 3,
    },
    'Diagnostic Biopsy': yes_no_unknown_codes,
    'Primary Site': {
        'Spine': 0,
        'Sacrum/Pelvis': 1,
    },
    'Histology': {
        'Chordoma, NOS': 0,
        'Chondroid chordoma': 1,
        'Dedifferentiated chordoma': 2,
    },
    'Tumor Size (Largest Diameter)': tumor_size_codes,
    'Tumor Size (2nd Largest Diameter)': tumor_size_codes,
    'Tumor Size (3rd Largest Diameter)': tumor_size_codes,
    'Regional Lymph Nodes': yes_no_not_applicable_codes,
    'Distant Metastasis': yes_no_not_applicable_codes,
    'Surgery': yes_no_unknown_codes,
    'Surgical Margins': {
        'No residual tumor': 0,
        'Residual tumor': 1,
        'No surgery was performed': 2,
        'Unknown': 3,
    },
    'Radiation Treatment': yes_no_unknown_codes,
    'Chemotherapy': yes_no_unknown_codes,
}

for column, codes in label_codes.items():
    #Bind the current table as a default argument so each column uses its own mapping.
    data[column] = data[column].map(lambda value, table=codes: table.get(value, value))

columns = ['Facility Type', 'Facility Location', 'Age at Diagnosis', 'Sex', 'Race', 'Hispanic Ethnicity', 'Insurance Status', 'Charlson-Deyo Score', 'Histology', 'Primary Site', 'Diagnostic Biopsy', 'Tumor Size (Largest Diameter)', 'Tumor Size (2nd Largest Diameter)', 'Tumor Size (3rd Largest Diameter)', 'Regional Lymph Nodes', 'Distant Metastasis', 'Surgery', 'Surgical Margins', 'Radiation Treatment', 'Chemotherapy']

#Cast every encoded predictor (and the imputed age) to a plain integer dtype.
for column in columns:
    data[column] = data[column].astype(int)

In [None]:
#Save a copy for Gradio whose headers have spaces replaced by underscores and hyphens removed.
#`data` itself keeps its original column names.

gradio = data.copy()

gradio.columns = (
    gradio.columns
    .str.replace(' ', '_', regex=True)
    .str.replace('-', '', regex=True)
)

gradio.to_csv('/content/drive/MyDrive/NCDB-Chordoma/gradio_data.csv')

In [None]:
#Write the fully encoded cohort (field-name headers, integer-coded predictors) to Drive.

final_csv_path = '/content/drive/MyDrive/NCDB-Chordoma/final_data.csv'
data.to_csv(final_csv_path)