In [None]:
# Import the data, with missing values represented by NaN
import pandas as pd
import numpy as np

# Load the overall-survival (OS) and cancer-specific-survival (CSS) workbooks
# and discard the columns that are not used as model features.
unused_columns = [
    'Unnamed: 0',
    'Survival_months',
    'COD to site recode ICD-O-3 2023 Revision',
    'COD',
]
df_os = pd.read_excel('seerdata4paper2os.xlsx').drop(columns=unused_columns)
df_css = pd.read_excel('seerdata4paper2css.xlsx').drop(columns=unused_columns)

# Cast every categorical feature to the pandas 'category' dtype in both tables.
nan_cols = [
    'Sex', 'Race', 'Marital_status_at_diagnosis', 'Tumor_location',
    'Tumor_grade', 'Tumor_size', 'AJCC_Stage', 'Mitotic_rate', 'Surgery',
    'Regional_nodes_examined', 'Chemotherapy',
]
for frame in (df_os, df_css):
    for feature in nan_cols:
        frame[feature] = frame[feature].astype('category')

# Encode category labels as integer codes. The encoded tables are deep copies
# whose names carry a trailing 0 (df_os0 / df_css0); df_os / df_css keep labels.
df_os0 = df_os.copy(deep=True)
df_css0 = df_css.copy(deep=True)

# One label -> code table per feature. A label that is missing from its table
# is turned into NaN by Series.map.
label_to_code = {
    'Sex': {'Female': 0, 'Male': 1},
    'Race': {'others': 0, 'White': 1, 'Black': 2},
    'Marital_status_at_diagnosis': {'Single': 0, 'Married': 1},
    # 'Cardia' and 'Fundus' intentionally share code 0 (merged into one group).
    'Tumor_location': {'Cardia': 0, 'Fundus': 0, 'Body': 1, 'Antrum and Pylorus': 2},
    'Tumor_grade': {'Well/moderately differentiated': 0,
                    'Poorly differentiated/undifferentiated': 1},
    'Tumor_size': {'≤2 cm': 0, '2-5cm': 1, '5-10cm': 2, '>10cm': 3},
    'AJCC_Stage': {'Ⅰ': 0, 'Ⅱ': 1, 'Ⅲ': 2, 'Ⅳ': 3},
    # NOTE(review): the two labels end differently ('HPF' vs 'HP') - confirm
    # both spellings against the workbook.
    'Mitotic_rate': {'≤5/5mm2 HPF': 0, '>5/5mm2 HP': 1},
    'Surgery': {'No Surgery': 0, 'Local excision': 1, 'Radical excision': 2},
    'Regional_nodes_examined': {0: 0, '1-4': 1, '>4': 2},
    'Chemotherapy': {'No/Unknown': 0, 'Yes': 1},
}
for frame in (df_os0, df_css0):
    for feature, codes in label_to_code.items():
        frame[feature] = frame[feature].map(codes)

# Mapping Tumor_location changed its categories, so cast that column back to
# 'category' in both encoded tables, then show the resulting CSS dtypes.
col = 'Tumor_location'
for encoded in (df_os0, df_css0):
    encoded[col] = encoded[col].astype('category')
df_css0.dtypes

In [None]:
# Impute missing values in the encoded CSS table using MICE with CatBoost.
# The misscatboosts package must be installed before running this cell:
# https://github.com/llyong/MissCatboosts
from misscatboosts.misscatboosts import MissCatboosts

mc = MissCatboosts()
# Every name in `categorical` has to match a df_css0 column exactly. The
# column is 'Sex' (capital S); the earlier lowercase "sex" matched no column.
data_imputed = mc.fit_transform(
    X=df_css0,
    categorical=["Sex", "Race", "Marital_status_at_diagnosis", "Tumor_location", "Tumor_grade",
                 "Tumor_size", "AJCC_Stage", "Mitotic_rate", "Surgery", "Regional_nodes_examined", "Chemotherapy"]
)


In [None]:
# Rebuild a labelled DataFrame from the imputed data and decode the integer
# codes back into readable category labels.
column_names = ['Age_at_diagnosis', 'Sex', 'Race', 'Marital_status_at_diagnosis',
       'Tumor_location', 'Tumor_grade', 'Tumor_size', 'AJCC_Stage',
       'Mitotic_rate', 'Surgery', 'Regional_nodes_examined', 'Chemotherapy']
df_css0_complete = pd.DataFrame(data_imputed, columns=column_names)

# One code -> label table per feature. A value missing from its table
# becomes NaN after Series.map.
code_to_label = {
    'Sex': {0: 'Female', 1: 'Male'},
    'Race': {0: 'others', 1: 'White', 2: 'Black'},
    'Marital_status_at_diagnosis': {0: 'Single', 1: 'Married'},
    'Tumor_location': {0: 'Cardia_Fundus', 1: 'Body', 2: 'Antrum_Pylorus'},
    'Tumor_grade': {0: 'Well_moderately_differentiated',
                    1: 'Poorly_differentiated_undifferentiated'},
    'Tumor_size': {0: 'smaller_2cm', 1: '2_5cm', 2: '5_10cm', 3: 'bigger_10cm'},
    'AJCC_Stage': {0: 1, 1: 2, 2: 3, 3: 4},
    'Mitotic_rate': {0: 'smaller_5HPF', 1: 'bigger_5HPF'},
    'Surgery': {0: 'NoSurgery', 1: 'Local_excision', 2: 'Radical_excision'},
    'Regional_nodes_examined': {0: 0, 1: '1to4', 2: 'bigger_4'},
    'Chemotherapy': {0: 'No_Unknown', 1: 'Yes'},
}
for data in [df_css0_complete]:
    for feature, labels in code_to_label.items():
        data[feature] = data[feature].map(labels)

# Survival_months and COD were dropped from df_css right after it was loaded,
# so df_css['Survival_months'] raises KeyError on a fresh kernel. Re-read just
# those two outcome columns from the source workbook instead.
outcomes = pd.read_excel('seerdata4paper2css.xlsx', usecols=['Survival_months', 'COD'])
if len(outcomes) != len(df_css0_complete):
    raise ValueError(
        f"Row count mismatch: workbook has {len(outcomes)} rows, "
        f"imputed data has {len(df_css0_complete)}"
    )
# Attach by position: imputed rows are in the same order as the workbook rows.
df_css0_complete['Survival_months'] = outcomes['Survival_months'].to_numpy()
df_css0_complete['COD'] = outcomes['COD'].to_numpy()
df_css0_complete.to_csv('df_css0_complete_11.csv', index=False)
