In [None]:
from google.colab import drive
#Mount Google Drive at /content/drive; the input CSV is read from, and the output CSV is written to, paths under this mount point.
drive.mount('/content/drive')

In [None]:
import numpy as np
import pandas as pd

In [None]:
#Read the source CSV from the mounted Drive into a DataFrame and show its (rows, columns) shape.

ncdb_brain_csv = "/content/drive/MyDrive/NCDB-PUFs/Brain - 2020.csv"
data = pd.read_csv(ncdb_brain_csv)
data.shape

In [None]:
#Lift the row limit on displayed output so the value_counts tables below are never truncated.
pd.options.display.max_rows = None

In [None]:
#Print every column name of the loaded file as a plain Python list.
print(data.columns.tolist())

In [None]:
#Define variables of interest.
#The order below is the column order of the DataFrame once it is subset with this list.

variables = [
    #Cohort selection.
    'BRAIN_MOL_MARKERS',
    'DIAGNOSTIC_CONFIRMATION',
    'YEAR_OF_DIAGNOSIS',
    #Patient characteristics.
    'AGE',
    'SEX',
    'RACE',
    'SPANISH_HISPANIC_ORIGIN',
    'INSURANCE_STATUS',
    #Facility and area-level characteristics.
    'FACILITY_TYPE_CD',
    'FACILITY_LOCATION_CD',
    'NO_HSD_QUAR_2016',
    'MED_INC_QUAR_2016',
    'UR_CD_13',
    #Comorbidity and tumor characteristics.
    'CDCC_TOTAL_BEST',
    'TUMOR_SIZE_SUMMARY_2016',
    'METHYLATION_O6MGMT',
    #Treatment.
    'RX_SUMM_SURG_PRIM_SITE',
    'READM_HOSP_30_DAYS',
    'SURG_DISCHARGE_DAYS',
    'NUMBER_PHASES_RAD_RX',
    'RX_SUMM_CHEMO',
    'RX_SUMM_IMMUNOTHERAPY',
    #Follow-up.
    'DX_LASTCONTACT_DEATH_MONTHS',
    'PUF_VITAL_STATUS',
]

In [None]:
#Keep only the columns listed in `variables` (in that order) and show the resulting shape.

data = data.loc[:, variables]

data.shape

In [None]:
#Identify GBMs with 'BRAIN_MOL_MARKERS': keep rows coded 5, then drop the column since it is constant afterwards.

is_gbm = data['BRAIN_MOL_MARKERS'].eq(5)
data = data.loc[is_gbm].drop(columns=['BRAIN_MOL_MARKERS'])

print('Number of included patients:', len(data))

In [None]:
#Apply inclusion criteria for 'DIAGNOSTIC_CONFIRMATION': keep rows coded 1 or 3, then drop the column.

accepted_confirmation_codes = [1, 3]
is_confirmed = data['DIAGNOSTIC_CONFIRMATION'].isin(accepted_confirmation_codes)
data = data.loc[is_confirmed].drop(columns=['DIAGNOSTIC_CONFIRMATION'])

print('Number of included patients:', len(data))

In [None]:
#Change response values for 'AGE': the sentinel 999 is recoded as missing (NaN).
#Series.mask builds a new column, so an integer 'AGE' column is upcast to float without the
#"incompatible dtype" FutureWarning that an in-place data.loc assignment of NaN triggers on pandas >= 2.1.
#The former "000 -> 0" assignment was removed: the literal 000 is the integer 0, so it changed nothing.

data['AGE'] = data['AGE'].mask(data['AGE'] == 999)

data['AGE'].value_counts(normalize=False, dropna=False)

In [None]:
#Apply inclusion criteria for age: keep rows whose 'AGE' is strictly greater than 18.
#Rows with a missing 'AGE' compare as False and are therefore dropped as well.
#NOTE(review): the strict comparison also drops patients aged exactly 18 - confirm this is the intended criterion.

is_older_than_18 = data['AGE'].gt(18)
data = data.loc[is_older_than_18]

print('Number of included patients:', len(data))

In [None]:
#Exclude patients with missing survival data, i.e. rows missing follow-up time or vital status, and report the counts.

survival_columns = ['DX_LASTCONTACT_DEATH_MONTHS', 'PUF_VITAL_STATUS']

before = len(data)
data = data.dropna(subset=survival_columns)
after = len(data)
excluded = before - after

print('Number of patients before exclusion: ', before)
print('Number of patients after exclusion: ', after)
print('Number of patients excluded with this criteria: ', excluded)

In [None]:
#See the years of diagnosis for the included patients, then drop the column.
#The counts are captured before the drop and returned as the cell's last expression:
#a notebook only displays the value of the final expression, so counts computed on an
#earlier line and followed by an assignment were never shown.

year_counts = data['YEAR_OF_DIAGNOSIS'].value_counts(normalize=False, dropna=False)
data = data.drop(['YEAR_OF_DIAGNOSIS'], axis=1)

year_counts

In [None]:
#Change response values to strings for 'SEX'.
#The codes are translated with Series.map instead of in-place data.loc assignments, because writing
#strings into a numeric column in place is deprecated since pandas 2.1.
#Codes without an entry in the lookup are passed through unchanged.

sex_labels = {1: 'Male', 2: 'Female'}
data['SEX'] = data['SEX'].map(lambda code: sex_labels.get(code, code))

data['SEX'].value_counts(normalize=False, dropna=False)

In [None]:
#Change response values to strings for 'RACE'.
#The codes are translated with Series.map instead of in-place data.loc assignments, because writing
#strings into a numeric column in place is deprecated since pandas 2.1.
#Codes without an entry in the lookup are passed through unchanged; code 99 becomes missing (NaN).

race_labels = {
    1: 'White',
    2: 'Black',
    3: 'American Indian, Aleutian, or Eskimo',
    4: 'Chinese',
    5: 'Japanese',
    6: 'Filipino',
    7: 'Hawaiian',
    8: 'Korean',
    10: 'Vietnamese',
    11: 'Laotian',
    12: 'Hmong',
    13: 'Kampuchean',
    14: 'Thai',
    15: 'Asian Indian or Pakistani',
    #Code 16 used to repeat the label of code 15, which silently merged the two codes into one category.
    #NOTE(review): label taken from the NCDB PUF data dictionary entry for code 16 - confirm.
    16: 'Asian Indian',
    17: 'Pakistani',
    20: 'Micronesian',
    21: 'Chamorro/Chamoru',
    22: 'Guamanian',
    25: 'Polynesian',
    26: 'Tahitian',
    27: 'Samoan',
    28: 'Tongan',
    30: 'Melanesian',
    31: 'Fiji Islander',
    32: 'New Guinean',
    #Codes 96 and 98 intentionally share the label 'Other'.
    96: 'Other',
    97: 'Pacific Islander',
    98: 'Other',
    99: np.nan,
}
data['RACE'] = data['RACE'].map(lambda code: race_labels.get(code, code))

data['RACE'].value_counts(normalize=False, dropna=False)

In [None]:
#Merge infrequent values (less than 100) to the value 'Other' for 'RACE'.
#Missing values are not counted as a category and stay missing.

min_category_size = 100

race_counts = data['RACE'].value_counts(dropna=True)
infrequent_categories = race_counts.index[race_counts < min_category_size]
is_infrequent = data['RACE'].isin(infrequent_categories)
data['RACE'] = data['RACE'].mask(is_infrequent, 'Other')

data['RACE'].value_counts(dropna=False)

In [None]:
#Change response values to strings for 'SPANISH_HISPANIC_ORIGIN'.
#The codes are translated with Series.map instead of in-place data.loc assignments, because writing
#strings into a numeric column in place is deprecated since pandas 2.1.
#Codes without an entry in the lookup are passed through unchanged; code 9 becomes missing (NaN).

spanish_hispanic_labels = {
    0: 'Non-Spanish, non-Hispanic',
    1: 'Mexican/Chicano',
    2: 'Puerto Rican',
    3: 'Cuban',
    4: 'South or Central America',
    5: 'Other specified Spanish/Hispanic origin',
    6: 'Spanish, NOS Hispanic, NOS Latino, NOS',
    7: 'Spanish surname only',
    8: 'Dominican Republic',
    9: np.nan,
}
data['SPANISH_HISPANIC_ORIGIN'] = data['SPANISH_HISPANIC_ORIGIN'].map(lambda code: spanish_hispanic_labels.get(code, code))

data['SPANISH_HISPANIC_ORIGIN'].value_counts(normalize=False, dropna=False)

In [None]:
#Simplify response values for 'SPANISH_HISPANIC_ORIGIN' to 'Yes' / 'No'.
#'Spanish surname only' is grouped with 'No'; any other value (including missing) is left as it is.

spanish_hispanic_categories = [
    'Mexican/Chicano',
    'Puerto Rican',
    'Cuban',
    'South or Central America',
    'Other specified Spanish/Hispanic origin',
    'Spanish, NOS Hispanic, NOS Latino, NOS',
    'Dominican Republic',
]
non_hispanic_categories = ['Non-Spanish, non-Hispanic', 'Spanish surname only']

#Both masks are computed before any value is overwritten.
is_hispanic = data['SPANISH_HISPANIC_ORIGIN'].isin(spanish_hispanic_categories)
is_non_hispanic = data['SPANISH_HISPANIC_ORIGIN'].isin(non_hispanic_categories)

data.loc[is_hispanic, 'SPANISH_HISPANIC_ORIGIN'] = 'Yes'
data.loc[is_non_hispanic, 'SPANISH_HISPANIC_ORIGIN'] = 'No'

data['SPANISH_HISPANIC_ORIGIN'].value_counts(dropna=False)

In [None]:
#Change response values to strings for 'INSURANCE_STATUS'.
#The codes are translated with Series.map instead of in-place data.loc assignments, because writing
#strings into a numeric column in place is deprecated since pandas 2.1.
#Codes without an entry in the lookup are passed through unchanged; code 9 becomes missing (NaN).

insurance_labels = {
    0: 'Not insured',
    1: 'Private insurance',
    2: 'Medicaid',
    3: 'Medicare',
    4: 'Other government',
    9: np.nan,
}
data['INSURANCE_STATUS'] = data['INSURANCE_STATUS'].map(lambda code: insurance_labels.get(code, code))

data['INSURANCE_STATUS'].value_counts(normalize=False, dropna=False)

In [None]:
#Change response values to strings for 'FACILITY_TYPE_CD'.
#The codes are translated with Series.map instead of in-place data.loc assignments, because writing
#strings into a numeric column in place is deprecated since pandas 2.1.
#Codes without an entry in the lookup (and missing values) are passed through unchanged.

facility_type_labels = {
    1: 'Community Cancer Program',
    2: 'Comprehensive Community Cancer Program',
    3: 'Academic/Research Program',
    4: 'Integrated Network Cancer Program',
}
data['FACILITY_TYPE_CD'] = data['FACILITY_TYPE_CD'].map(lambda code: facility_type_labels.get(code, code))

data['FACILITY_TYPE_CD'].value_counts(normalize=False, dropna=False)

In [None]:
#Simplify 'FACILITY_TYPE_CD': fold 'Comprehensive Community Cancer Program' into 'Community Cancer Program'.

is_comprehensive_community = data['FACILITY_TYPE_CD'].eq('Comprehensive Community Cancer Program')
data['FACILITY_TYPE_CD'] = data['FACILITY_TYPE_CD'].mask(is_comprehensive_community, 'Community Cancer Program')

data['FACILITY_TYPE_CD'].value_counts(dropna=False)

In [None]:
#Change response values to strings for 'FACILITY_LOCATION_CD'.
#The codes are translated with Series.map instead of in-place data.loc assignments, because writing
#strings into a numeric column in place is deprecated since pandas 2.1.
#Codes without an entry in the lookup (and missing values) are passed through unchanged.

facility_location_labels = {
    1: 'New England',
    2: 'Middle Atlantic',
    3: 'South Atlantic',
    4: 'East North Central',
    5: 'East South Central',
    6: 'West North Central',
    7: 'West South Central',
    8: 'Mountain',
    9: 'Pacific',
}
data['FACILITY_LOCATION_CD'] = data['FACILITY_LOCATION_CD'].map(lambda code: facility_location_labels.get(code, code))

data['FACILITY_LOCATION_CD'].value_counts(normalize=False, dropna=False)

In [None]:
#Simplify 'FACILITY_LOCATION_CD': merge the two Atlantic divisions into 'Atlantic' and the four Central divisions into 'Central'.
#'New England', 'Mountain' and 'Pacific' are left as they are.

atlantic_divisions = ['Middle Atlantic', 'South Atlantic']
central_divisions = ['East North Central', 'East South Central', 'West North Central', 'West South Central']

#Both masks are computed before any value is overwritten.
in_atlantic = data['FACILITY_LOCATION_CD'].isin(atlantic_divisions)
in_central = data['FACILITY_LOCATION_CD'].isin(central_divisions)

data.loc[in_atlantic, 'FACILITY_LOCATION_CD'] = 'Atlantic'
data.loc[in_central, 'FACILITY_LOCATION_CD'] = 'Central'

data['FACILITY_LOCATION_CD'].value_counts(dropna=False)

In [None]:
#Change response values to strings for 'NO_HSD_QUAR_2016'.
#The codes are translated with Series.map instead of in-place data.loc assignments, because writing
#strings into a numeric column in place is deprecated since pandas 2.1.
#Codes without an entry in the lookup (and missing values) are passed through unchanged.
#NOTE(review): the labels for codes 1 and 2 both contain 17.6% - confirm the quartile bounds against the data dictionary.

no_hsd_labels = {
    1: '> 17.6%',
    2: '10.9-17.6%',
    3: '6.3-10.8%',
    4: '< 6.3%',
}
data['NO_HSD_QUAR_2016'] = data['NO_HSD_QUAR_2016'].map(lambda code: no_hsd_labels.get(code, code))

data['NO_HSD_QUAR_2016'].value_counts(normalize=False, dropna=False)

In [None]:
#Change response values to strings for 'MED_INC_QUAR_2016'.
#The codes are translated with Series.map instead of in-place data.loc assignments, because writing
#strings into a numeric column in place is deprecated since pandas 2.1.
#Codes without an entry in the lookup (and missing values) are passed through unchanged.
#NOTE(review): the labels for codes 3 and 4 both contain $63,333 - confirm the quartile bounds against the data dictionary.

median_income_labels = {
    1: '< $40,227',
    2: '$40,227-$50,353',
    3: '$50,354-$63,333',
    4: '> $63,333',
}
data['MED_INC_QUAR_2016'] = data['MED_INC_QUAR_2016'].map(lambda code: median_income_labels.get(code, code))

data['MED_INC_QUAR_2016'].value_counts(normalize=False, dropna=False)

In [None]:
#Change response values to strings for 'UR_CD_13'.
#The codes are translated with Series.map instead of in-place data.loc assignments, because writing
#strings into a numeric column in place is deprecated since pandas 2.1.
#Codes without an entry in the lookup (and missing values) are passed through unchanged.

urban_rural_labels = {
    1: 'Counties in metro areas of 1 million population or more',
    2: 'Counties in metro areas of 250,000 to 1 million population',
    3: 'Counties in metro areas of fewer than 250,000 population',
    4: 'Urban population of 20,000 or more adjacent to a metro area',
    5: 'Urban population of 20,000 or more not adjacent to a metro area',
    6: 'Urban population of 2,500 to 19,999, adjacent to a metro area',
    7: 'Urban population of 2,500 to 19,999, not adjacent to a metro area',
    8: 'Completely rural or less than 2,500 urban population, adjacent to a metro area',
    9: 'Completely rural or less than 2,500 urban population, not adjacent to a metro area',
}
data['UR_CD_13'] = data['UR_CD_13'].map(lambda code: urban_rural_labels.get(code, code))

data['UR_CD_13'].value_counts(normalize=False, dropna=False)

In [None]:
#Simplify 'UR_CD_13' into three groups: 'Metro', 'Urban' and 'Rural'.

metro_labels = [
    'Counties in metro areas of 1 million population or more',
    'Counties in metro areas of 250,000 to 1 million population',
    'Counties in metro areas of fewer than 250,000 population',
]
urban_labels = [
    'Urban population of 20,000 or more adjacent to a metro area',
    'Urban population of 20,000 or more not adjacent to a metro area',
    'Urban population of 2,500 to 19,999, adjacent to a metro area',
    'Urban population of 2,500 to 19,999, not adjacent to a metro area',
]
rural_labels = [
    'Completely rural or less than 2,500 urban population, adjacent to a metro area',
    'Completely rural or less than 2,500 urban population, not adjacent to a metro area',
]

#All masks are computed before any value is overwritten.
is_metro = data['UR_CD_13'].isin(metro_labels)
is_urban = data['UR_CD_13'].isin(urban_labels)
is_rural = data['UR_CD_13'].isin(rural_labels)

data.loc[is_metro, 'UR_CD_13'] = 'Metro'
data.loc[is_urban, 'UR_CD_13'] = 'Urban'
data.loc[is_rural, 'UR_CD_13'] = 'Rural'

data['UR_CD_13'].value_counts(dropna=False)

In [None]:
#Change response values to strings for 'CDCC_TOTAL_BEST'.
#The codes are translated with Series.map instead of in-place data.loc assignments, because writing
#strings into a numeric column in place is deprecated since pandas 2.1.
#Codes without an entry in the lookup (and missing values) are passed through unchanged.
#NOTE(review): code 3 is labelled 'Greater than 3' - confirm against the data dictionary whether it means "3 or more".
#The label text is kept as is because the following 'Simplify' cell matches on it.

comorbidity_labels = {
    0: '0',
    1: '1',
    2: '2',
    3: 'Greater than 3',
}
data['CDCC_TOTAL_BEST'] = data['CDCC_TOTAL_BEST'].map(lambda code: comorbidity_labels.get(code, code))

data['CDCC_TOTAL_BEST'].value_counts(normalize=False, dropna=False)

In [None]:
#Simplify 'CDCC_TOTAL_BEST': '0' and '1' stay as they are, '2' and 'Greater than 3' are merged into '>2'.
#NOTE(review): the merged label '>2' also contains the rows labelled '2' - confirm the label wording is intended.

high_comorbidity_labels = ['2', 'Greater than 3']
has_high_comorbidity = data['CDCC_TOTAL_BEST'].isin(high_comorbidity_labels)
data.loc[has_high_comorbidity, 'CDCC_TOTAL_BEST'] = '>2'

data['CDCC_TOTAL_BEST'].value_counts(dropna=False)

In [None]:
#Clean 'TUMOR_SIZE_SUMMARY_2016': the column stays numeric; a value of 0 and every value of 200 or more are recoded as missing (NaN).
#Series.mask builds a new column, so an integer column is upcast to float without the
#"incompatible dtype" FutureWarning that an in-place data.loc assignment of NaN triggers on pandas >= 2.1.
#Values that are already missing compare as False and simply stay missing.

tumor_size = data['TUMOR_SIZE_SUMMARY_2016']
is_invalid_size = (tumor_size == 0) | (tumor_size >= 200)
data['TUMOR_SIZE_SUMMARY_2016'] = tumor_size.mask(is_invalid_size)

data['TUMOR_SIZE_SUMMARY_2016'].value_counts(normalize=False, dropna=False)

In [None]:
#Change response values to strings for 'METHYLATION_O6MGMT'.
#Code 0 is 'Unmethylated', codes 1-3 are all 'Methylated', and codes 6-9 become missing (NaN).
#The codes are translated with Series.map instead of in-place data.loc assignments, because writing
#strings into a numeric column in place is deprecated since pandas 2.1.
#Codes without an entry in the lookup are passed through unchanged.

methylation_labels = {
    0: 'Unmethylated',
    1: 'Methylated',
    2: 'Methylated',
    3: 'Methylated',
    6: np.nan,
    7: np.nan,
    8: np.nan,
    9: np.nan,
}
data['METHYLATION_O6MGMT'] = data['METHYLATION_O6MGMT'].map(lambda code: methylation_labels.get(code, code))

data['METHYLATION_O6MGMT'].value_counts(normalize=False, dropna=False)

In [None]:
#Change response values to strings for 'RX_SUMM_SURG_PRIM_SITE'.
#The codes are translated with Series.map instead of in-place data.loc assignments, because writing
#strings into a numeric column in place is deprecated since pandas 2.1.
#Codes without an entry in the lookup are passed through unchanged; code 99 becomes missing (NaN).

surgery_labels = {
    0: 'No surgery was performed',
    10: 'Tumor destruction',
    20: 'Biopsy',
    21: 'Subtotal resection',
    22: 'Resection of tumor of spinal cord or nerve',
    30: 'Gross total resection',
    40: 'Partial lobectomy',
    55: 'Lobectomy',
    90: 'Surgery was performed but extent of resection is unknown',
    99: np.nan,
}
data['RX_SUMM_SURG_PRIM_SITE'] = data['RX_SUMM_SURG_PRIM_SITE'].map(lambda code: surgery_labels.get(code, code))

data['RX_SUMM_SURG_PRIM_SITE'].value_counts(normalize=False, dropna=False)

In [None]:
#Simplify 'RX_SUMM_SURG_PRIM_SITE'.
#'No surgery was performed' and 'Biopsy' are merged into 'No resective surgery was performed';
#'Subtotal resection' and 'Gross total resection' are kept; every other labelled procedure becomes missing (NaN).

non_resective_labels = ['No surgery was performed', 'Biopsy']
discarded_surgery_labels = [
    'Tumor destruction',
    'Resection of tumor of spinal cord or nerve',
    'Partial lobectomy',
    'Lobectomy',
    'Surgery was performed but extent of resection is unknown',
]

#Both masks are computed before any value is overwritten.
is_non_resective = data['RX_SUMM_SURG_PRIM_SITE'].isin(non_resective_labels)
is_discarded = data['RX_SUMM_SURG_PRIM_SITE'].isin(discarded_surgery_labels)

data.loc[is_non_resective, 'RX_SUMM_SURG_PRIM_SITE'] = 'No resective surgery was performed'
data.loc[is_discarded, 'RX_SUMM_SURG_PRIM_SITE'] = np.nan

data['RX_SUMM_SURG_PRIM_SITE'].value_counts(dropna=False)

In [None]:
#Replace response values with descriptive strings for 'READM_HOSP_30_DAYS'.
#The codes are translated with Series.map instead of in-place data.loc assignments, because writing
#strings into a numeric column in place is deprecated since pandas 2.1.
#Codes without an entry in the lookup are passed through unchanged; code 9 becomes missing (NaN).

readmission_labels = {
    0: 'No surgery was performed or the patient was not readmitted',
    1: 'Unplanned readmission within 30 days of discharge',
    2: 'Planned readmission within 30 days of discharge',
    3: 'Both planned and unplanned readmissions within 30 days of discharge',
    9: np.nan,
}
data['READM_HOSP_30_DAYS'] = data['READM_HOSP_30_DAYS'].map(lambda code: readmission_labels.get(code, code))

data['READM_HOSP_30_DAYS'].value_counts(normalize=False, dropna=False)

In [None]:
#Simplify 'READM_HOSP_30_DAYS' to 'Yes' (any unplanned readmission), 'No', or 'No surgery was performed'.

#Step 1: rows without resective surgery are overridden first, whatever their readmission value was (including missing).
had_no_resection = data['RX_SUMM_SURG_PRIM_SITE'].eq('No resective surgery was performed')
data.loc[had_no_resection, 'READM_HOSP_30_DAYS'] = 'No surgery was performed'

#Step 2: collapse the remaining descriptive labels; the masks are taken after the override above.
unplanned_readmission_labels = [
    'Unplanned readmission within 30 days of discharge',
    'Both planned and unplanned readmissions within 30 days of discharge',
]
no_unplanned_readmission_labels = [
    'No surgery was performed or the patient was not readmitted',
    'Planned readmission within 30 days of discharge',
]
was_readmitted_unplanned = data['READM_HOSP_30_DAYS'].isin(unplanned_readmission_labels)
was_not_readmitted_unplanned = data['READM_HOSP_30_DAYS'].isin(no_unplanned_readmission_labels)

data.loc[was_readmitted_unplanned, 'READM_HOSP_30_DAYS'] = 'Yes'
data.loc[was_not_readmitted_unplanned, 'READM_HOSP_30_DAYS'] = 'No'

data['READM_HOSP_30_DAYS'].value_counts(dropna=False)

In [None]:
#See lengths of stay after surgery: frequency of each 'SURG_DISCHARGE_DAYS' value, missing values included.

discharge_days = data['SURG_DISCHARGE_DAYS']
discharge_days.value_counts(dropna=False)

In [None]:
#Change response values to strings for 'NUMBER_PHASES_RAD_RX'.
#The codes are translated with Series.map instead of in-place data.loc assignments, because writing
#strings into a numeric column in place is deprecated since pandas 2.1.
#Codes without an entry in the lookup are passed through unchanged; code 99 becomes missing (NaN).

radiation_phase_labels = {
    0: 'No Radiotherapy',
    1: '1 Phase',
    2: '2 Phases',
    3: '3 Phases',
    4: '4 or more Phases',
    99: np.nan,
}
data['NUMBER_PHASES_RAD_RX'] = data['NUMBER_PHASES_RAD_RX'].map(lambda code: radiation_phase_labels.get(code, code))

data['NUMBER_PHASES_RAD_RX'].value_counts(normalize=False, dropna=False)

In [None]:
#Simplify 'NUMBER_PHASES_RAD_RX' to 'Yes' (any number of phases) / 'No' (no radiotherapy).

any_phase_labels = ['1 Phase', '2 Phases', '3 Phases', '4 or more Phases']

#Both masks are computed before any value is overwritten.
had_no_radiotherapy = data['NUMBER_PHASES_RAD_RX'].eq('No Radiotherapy')
had_radiotherapy = data['NUMBER_PHASES_RAD_RX'].isin(any_phase_labels)

data.loc[had_no_radiotherapy, 'NUMBER_PHASES_RAD_RX'] = 'No'
data.loc[had_radiotherapy, 'NUMBER_PHASES_RAD_RX'] = 'Yes'

data['NUMBER_PHASES_RAD_RX'].value_counts(dropna=False)

In [None]:
#Replace response values with descriptive strings for 'RX_SUMM_CHEMO'.
#The codes are translated with Series.map instead of in-place data.loc assignments, because writing
#strings into a numeric column in place is deprecated since pandas 2.1.
#Codes without an entry in the lookup are passed through unchanged; codes 88 and 99 become missing (NaN).
#The label texts are kept exactly as they were because the following 'Simplify' cell matches on them.

chemo_labels = {
    0: 'None, chemotherapy was not part of the planned first course of therapy',
    1: 'Chemotherapy administered as first course therapy, but the type and number of agents is not documented in patient record',
    2: 'Single-agent chemotherapy administered as first course therapy',
    3: 'Multiagent chemotherapy administered as first course therapy',
    82: 'Chemotherapy was not recommended/administered because it was contraindicated due to patient risk factors',
    85: 'Chemotherapy was not administered because the patient died prior to planned or recommended therapy',
    86: 'Chemotherapy was was recommended but was not administered as part of the first course of therapy',
    87: 'Chemotherapy was recommended by the physician, but was refused by the patient, a family member, or guardian',
    88: np.nan,
    99: np.nan,
}
data['RX_SUMM_CHEMO'] = data['RX_SUMM_CHEMO'].map(lambda code: chemo_labels.get(code, code))

data['RX_SUMM_CHEMO'].value_counts(normalize=False, dropna=False)

In [None]:
#Simplify 'RX_SUMM_CHEMO' to 'Yes' (chemotherapy administered) / 'No' (not administered, for any documented reason).

chemo_given_labels = [
    'Chemotherapy administered as first course therapy, but the type and number of agents is not documented in patient record',
    'Single-agent chemotherapy administered as first course therapy',
    'Multiagent chemotherapy administered as first course therapy',
]
chemo_not_given_labels = [
    'None, chemotherapy was not part of the planned first course of therapy',
    'Chemotherapy was not recommended/administered because it was contraindicated due to patient risk factors',
    'Chemotherapy was not administered because the patient died prior to planned or recommended therapy',
    'Chemotherapy was was recommended but was not administered as part of the first course of therapy',
    'Chemotherapy was recommended by the physician, but was refused by the patient, a family member, or guardian',
]

#Both masks are computed before any value is overwritten.
received_chemo = data['RX_SUMM_CHEMO'].isin(chemo_given_labels)
did_not_receive_chemo = data['RX_SUMM_CHEMO'].isin(chemo_not_given_labels)

data.loc[received_chemo, 'RX_SUMM_CHEMO'] = 'Yes'
data.loc[did_not_receive_chemo, 'RX_SUMM_CHEMO'] = 'No'

data['RX_SUMM_CHEMO'].value_counts(dropna=False)

In [None]:
#Replace response values with descriptive strings for 'RX_SUMM_IMMUNOTHERAPY'.
#The codes are translated with Series.map instead of in-place data.loc assignments, because writing
#strings into a numeric column in place is deprecated since pandas 2.1.
#Codes without an entry in the lookup are passed through unchanged; codes 88 and 99 become missing (NaN).
#The label texts are kept exactly as they were because the following 'Simplify' cell matches on them.

immunotherapy_labels = {
    0: 'None, immunotherapy was not part of the planned first course of therapy',
    1: 'Immunotherapy administered as first course therapy, but the type and number of agents is not documented in patient record',
    82: 'Immunotherapy was not recommended/administered because it was contraindicated due to patient risk factors',
    85: 'Immunotherapy was not administered because the patient died prior to planned or recommended therapy',
    86: 'Immunotherapy was was recommended but was not administered as part of the first course of therapy',
    87: 'Immunotherapy was recommended by the physician, but was refused by the patient, a family member, or guardian',
    88: np.nan,
    99: np.nan,
}
data['RX_SUMM_IMMUNOTHERAPY'] = data['RX_SUMM_IMMUNOTHERAPY'].map(lambda code: immunotherapy_labels.get(code, code))

data['RX_SUMM_IMMUNOTHERAPY'].value_counts(normalize=False, dropna=False)

In [None]:
#Simplify 'RX_SUMM_IMMUNOTHERAPY' to 'Yes' (immunotherapy administered) / 'No' (not administered, for any documented reason).

immunotherapy_given_label = 'Immunotherapy administered as first course therapy, but the type and number of agents is not documented in patient record'
immunotherapy_not_given_labels = [
    'None, immunotherapy was not part of the planned first course of therapy',
    'Immunotherapy was not recommended/administered because it was contraindicated due to patient risk factors',
    'Immunotherapy was not administered because the patient died prior to planned or recommended therapy',
    'Immunotherapy was was recommended but was not administered as part of the first course of therapy',
    'Immunotherapy was recommended by the physician, but was refused by the patient, a family member, or guardian',
]

#Both masks are computed before any value is overwritten.
received_immunotherapy = data['RX_SUMM_IMMUNOTHERAPY'].eq(immunotherapy_given_label)
did_not_receive_immunotherapy = data['RX_SUMM_IMMUNOTHERAPY'].isin(immunotherapy_not_given_labels)

data.loc[received_immunotherapy, 'RX_SUMM_IMMUNOTHERAPY'] = 'Yes'
data.loc[did_not_receive_immunotherapy, 'RX_SUMM_IMMUNOTHERAPY'] = 'No'

data['RX_SUMM_IMMUNOTHERAPY'].value_counts(dropna=False)

In [None]:
#Change response values to strings for 'PUF_VITAL_STATUS' (0 -> 'Dead', 1 -> 'Alive').
#The codes are translated with Series.map instead of in-place data.loc assignments, because writing
#strings into a numeric column in place is deprecated since pandas 2.1.
#Codes without an entry in the lookup are passed through unchanged.

vital_status_labels = {0: 'Dead', 1: 'Alive'}
data['PUF_VITAL_STATUS'] = data['PUF_VITAL_STATUS'].map(lambda code: vital_status_labels.get(code, code))

data['PUF_VITAL_STATUS'].value_counts(normalize=False, dropna=False)

In [None]:
#Save labeled data to the mounted Drive as CSV.
#The DataFrame index is written too (to_csv default), so it appears as the first, unnamed column of the file.

labeled_data_csv = '/content/drive/MyDrive/NCDB-GBM/raw_data.csv'
data.to_csv(labeled_data_csv)