In [1]:
# importing libraries
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns

# load dataset from SHRS_smallDS.xlsx
data = pd.read_excel('SHRS_smallDS.xlsx',index_col=False)

# clean dataset
missing_values = data.isnull().sum()
df_dropped = data.dropna()
data['Survival months'] = data['Survival months'].replace('Unknown', 0).astype(int)

# filter the dataset
def Survival(row):
    if (row['Survival months'] >= 60) and (row['Vital status recode (study cutoff used)'] == 'Alive'):
        return 1
    elif (row['Survival months'] < 60) and (row['SEER cause-specific death classification'] == 'Dead (attributable to this cancer dx)'):
        return 0
    else:
        data.drop(index=row.name, inplace=True)

data['Survived'] = data.apply(Survival, axis=1)

import re

# get the first two interger from a string
def get_first_two_ints(s):
    numbers = re.findall(r'\d+', s)
    return [int(num) for num in numbers[:2]]

# convert to numerical form
data['Age'] = None
data['Income'] = None
data['Tumor Size'] = None
data['Grade Recode'] = None

GR = {'Unknown': 0,'Undifferentiated; anaplastic; Grade IV': 4, 'Poorly differentiated; Grade III': 3, 'Moderately differentiated; Grade II': 2, 'Well differentiated; Grade I': 1}

for index, row in data.iterrows():
    data.at[index, 'Age'] = data.at[index, 'Age recode with <1 year olds'][:2]
    data.at[index, 'Income'] = get_first_two_ints(data.at[index, 'Median household income inflation adj to 2021'])[0]
    if data.at[index, 'CS Tumor Size/Ext Eval (2004-2015)'] == 'Blank(s)':
        data.at[index, 'Tumor Size'] = 0
    else:
        data.at[index, 'Tumor Size'] = data.at[index, 'CS Tumor Size/Ext Eval (2004-2015)']
    data.at[index, 'Grade Recode'] = GR[data.at[index, 'Grade Recode (thru 2017)']]
    
data['Survived'] = data['Survived'].astype(int)
data['Age'] = data['Age'].astype(int)
data['Income'] = data['Income'].astype(int)
data['Tumor Size'] = data['Tumor Size'].astype(int)
data['Grade Recode'] = data['Grade Recode'].astype(int)
data.head

<bound method NDFrame.head of       Site recode ICD-O-3/WHO 2008 Behavior code ICD-O-3  Primary Site  \
1                           Rectum             Malignant           209   
2            Rectosigmoid Junction             Malignant           199   
3                  Ascending Colon             Malignant           182   
4                Lung and Bronchus             Malignant           343   
5                           Breast             Malignant           506   
...                            ...                   ...           ...   
29993            Lung and Bronchus             Malignant           342   
29995                  Gallbladder             Malignant           239   
29997            Lung and Bronchus             Malignant           349   
29998            Lung and Bronchus             Malignant           341   
29999                 Corpus Uteri             Malignant           549   

       Histologic Type ICD-O-3                Grade Recode (thru 2017)  \
1      

In [2]:
# make a new data frame and store the useful attributes
final_data = data[['Site recode ICD-O-3/WHO 2008','Race recode (White, Black, Other)',
           'Sex',
           'Histologic Type ICD-O-3',
           'Regional nodes examined (1988+)',
           'Regional nodes positive (1988+)',
           'Primary Site',
           'Marital status at diagnosis',
           'Survived',
           'Age',
           'Income',
           'Tumor Size',
           'Grade Recode']]

# Initialize a dictionary to store the mappings
category_mappings = {}

# Manually encode each categorical column to numerical values
for column in final_data.select_dtypes(include=['object']):
    # Create a mapping for the current column
    categories = final_data[column].unique()
    mapping = {category: index for index, category in enumerate(categories)}
    category_mappings[column] = mapping

    # Replace the categorical values with numerical values
    final_data[column] = final_data[column].replace(mapping)

print("\nCategory Mappings:")
for column, mapping in category_mappings.items():
    print(f"{column}: {mapping}")


Category Mappings:
Site recode ICD-O-3/WHO 2008: {'Rectum': 0, 'Rectosigmoid Junction': 1, 'Ascending Colon': 2, 'Lung and Bronchus': 3, 'Breast': 4, 'Cecum': 5, 'Corpus Uteri': 6, 'Esophagus': 7, 'Sigmoid Colon': 8, 'Gallbladder': 9, 'Stomach': 10, 'Liver': 11, 'Hepatic Flexure': 12, 'Prostate': 13, 'Cervix Uteri': 14, 'Appendix': 15, 'Splenic Flexure': 16, 'Descending Colon': 17, 'Transverse Colon': 18, 'Large Intestine, NOS': 19}
Race recode (White, Black, Other): {'White': 0, 'Black': 1, 'Other (American Indian/AK Native, Asian/Pacific Islander)': 2}
Sex: {'Male': 0, 'Female': 1}
Marital status at diagnosis: {'Single (never married)': 0, 'Widowed': 1, 'Married (including common law)': 2, 'Divorced': 3, 'Separated': 4}


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_data[column] = final_data[column].replace(mapping)


In [3]:
final_data[:5]

Unnamed: 0,Site recode ICD-O-3/WHO 2008,"Race recode (White, Black, Other)",Sex,Histologic Type ICD-O-3,Regional nodes examined (1988+),Regional nodes positive (1988+),Primary Site,Marital status at diagnosis,Survived,Age,Income,Tumor Size,Grade Recode
1,0,0,0,8140,10,0,209,0,0,65,75,0,1
2,1,0,1,8140,8,1,199,0,1,35,75,3,2
3,2,1,0,8140,14,1,182,1,0,60,75,0,2
4,3,0,1,8012,1,1,343,2,0,35,75,0,1
5,4,0,1,8520,0,98,506,3,0,75,75,0,2


In [4]:
cancer_counts = final_data['Site recode ICD-O-3/WHO 2008'].value_counts()

# Get the top five most numerous cancer types
top_five_cancers = cancer_counts.head(5)

print(top_five_cancers)

Site recode ICD-O-3/WHO 2008
3     5496
4     3242
13    1045
10     689
6      499
Name: count, dtype: int64


In [8]:
# Select the cancer type
cancer_type = 10

cancer_type_df = final_data[final_data['Site recode ICD-O-3/WHO 2008'] == cancer_type]
cancer_type_df.shape

(689, 13)

In [9]:
# cancer_type_stage_df = cancer_type_df[cancer_type_df['Grade Recode'] == 4]
# cancer_type_stage_df.head

In [None]:
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from fairlearn.reductions import ExponentiatedGradient, EqualizedOdds
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from aif360.sklearn.inprocessing import ExponentiatedGradientReduction

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

stages = [1, 2, 3, 4]
model_types = ['Biased', 'Unbiased']
lr_accuracies = []
dt_accuracies = []
ada_accuracies = []
xgb_accuracies = []

# Loop through each stage
for stage in stages:
    cancer_type_stage_df = cancer_type_df[cancer_type_df['Grade Recode'] == stage]
    # separate array into input and output components
    X = cancer_type_stage_df.drop(['Survived', 'Site recode ICD-O-3/WHO 2008', 'Grade Recode'], axis=1)
    y = cancer_type_stage_df['Survived']

    (X_train, X_test,
     y_train, y_test) = train_test_split(X, y, train_size=0.8, random_state=42)
    
    prot_attr_cols = [colname for colname in X_train 
                  if "Sex" in colname or "Marital status at diagnosis" in colname 
                  or "Race recode (White, Black, Other)" in colname or "Income" in colname]
    print(cancer_type_stage_df.shape)
    
    for model_type in model_types:
        # Train model based on type
        if model_type == 'Biased':
            # Logistic Regression
            lr_model = LogisticRegression(solver='liblinear')
            lr_model.fit(X_train, y_train)
            
            # Decision Tree
            dt_model = DecisionTreeClassifier()
            dt_model.fit(X_train, y_train)
            
#             # AdaBoost
#             ada_model = AdaBoostClassifier()
#             ada_model.fit(X_train, y_train)
            
#             # XGBoost
#             xgb_model = XGBClassifier()
#             xgb_model.fit(X_train, y_train)
            
        else:
            # Logistic Regresion
            lr_estimator = LogisticRegression(solver='liblinear')
            lr_model = ExponentiatedGradientReduction(prot_attr=prot_attr_cols,
                                                             estimator=lr_estimator, 
                                                             constraints="EqualizedOdds",
                                                             drop_prot_attr=False)
            lr_model.fit(X_train, y_train)
            
            # Decision Tree
            dt_estimator = DecisionTreeClassifier()
            dt_model = ExponentiatedGradientReduction(prot_attr=prot_attr_cols, 
                                                          estimator=dt_estimator, 
                                                          constraints="EqualizedOdds",
                                                          drop_prot_attr=False)
            dt_model.fit(X_train, y_train)
            
#             # AdaBoost
#             ada_estimator = AdaBoostClassifier()
#             ada_model = ExponentiatedGradientReduction(prot_attr=prot_attr_cols, 
#                                                           estimator=ada_estimator, 
#                                                           constraints="EqualizedOdds",
#                                                           drop_prot_attr=False)
#             ada_model.fit(X_train, y_train)
            
#             # XGBoost
#             xgb_estimator = XGBClassifier()
#             xgb_model = ExponentiatedGradientReduction(prot_attr=prot_attr_cols, 
#                                                           estimator=xgb_estimator, 
#                                                           constraints="EqualizedOdds",
#                                                           drop_prot_attr=False)
#             xgb_model.fit(X_train, y_train)
            
        # Predict and calculate accuracy
        # LR
        lr_predictions = lr_model.predict(X_test)
        lr_accuracy = accuracy_score(y_test, lr_predictions)
        
        # DT
        dt_predictions =dt_model.predict(X_test)
        dt_accuracy = accuracy_score(y_test, dt_predictions)
        
#         # AdaBoost
#         ada_predictions =ada_model.predict(X_test)
#         ada_accuracy = accuracy_score(y_test, ada_predictions)
        
#         # XGBoost
#         xgb_predictions =xgb_model.predict(X_test)
#         xgb_accuracy = accuracy_score(y_test, xgb_predictions)
        
        # Append to accuracies list
        lr_accuracies.append({
            'Cancer Stage': stage,
            'Model Type': model_type,
            'Accuracy': lr_accuracy
        })
        
        dt_accuracies.append({
            'Cancer Stage': stage,
            'Model Type': model_type,
            'Accuracy': dt_accuracy
        })
        
#         ada_accuracies.append({
#             'Cancer Stage': stage,
#             'Model Type': model_type,
#             'Accuracy': ada_accuracy
#         })
        
#         xgb_accuracies.append({
#             'Cancer Stage': stage,
#             'Model Type': model_type,
#             'Accuracy': xgb_accuracy
#         })

# Create DataFrame
lr_df = pd.DataFrame(lr_accuracies)
dt_df = pd.DataFrame(dt_accuracies)
# ada_df = pd.DataFrame(ada_accuracies)
# xgb_df = pd.DataFrame(xgb_accuracies)



# Display the DataFrame
# print(lr_df)
# print(dt_df)
# print(ada_df)
# print(xgb_df)

(15, 13)


In [None]:
from tabulate import tabulate

# Convert the DataFrame to a format suitable for tabulate
print('Logistic Regression')
print(tabulate(lr_df, headers='keys', tablefmt='pretty', showindex=False))
print('Decision Tree')
print(tabulate(dt_df, headers='keys', tablefmt='pretty', showindex=False))
# print('AdaBoost')
# print(tabulate(ada_df, headers='keys', tablefmt='pretty', showindex=False))
# print('XGBoost')
# print(tabulate(xgb_df, headers='keys', tablefmt='pretty', showindex=False))