In [25]:
import matplotlib.pyplot as plt
import os
import seaborn as sns
import warnings
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn import metrics, preprocessing
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

import numpy as np
import pandas as pd
import sklearn as sklearn

CSV File

In [3]:
# Load the raw export; the path is resolved relative to the notebook's working directory.
csv_path = "bcfile.csv"
data = pd.read_csv(csv_path)

In [4]:
# Working frame used by every later cell. read_csv already returns a DataFrame,
# so this simply wraps the loaded table under the name `df`.
df = pd.DataFrame(data=data)

Clean data

In [9]:
# Discard every row that has a missing (NaN) value in any column.
df = df.dropna(axis=0, how="any")

In [10]:
# dropna() only removes real NaN values, so rows whose T stage holds the literal
# placeholder text 'Blank(s)' are not removed by it; filter them out explicitly.
# regex=False matches the parentheses literally, which avoids the invalid '\(' escape
# sequence that a non-raw regex pattern string would need.
df = df[~df['Breast - Adjusted AJCC 6th T (1988-2015)'].str.contains('Blank(s)', regex=False)]

In [11]:
# Remove records whose survival time is stored as text containing 'Unknown',
# so the column can later be converted to integers.
survival_unknown = df['Survival months'].str.contains('Unknown')
df = df[~survival_unknown]

In [12]:
# One-hot encode race: build the indicator columns, then swap them in for the
# original text column (indicator columns are appended at the end of the frame).
race_encode = 'Race recode (W, B, AI, API)'
race_encoded = pd.get_dummies(df[race_encode], prefix=race_encode)
df = pd.concat([df.drop(columns=race_encode), race_encoded], axis=1)

In [13]:
# One-hot encode the T stage: build the indicator columns, then swap them in for
# the original text column (indicator columns are appended at the end of the frame).
sixthstage_encode = 'Breast - Adjusted AJCC 6th T (1988-2015)'
sixthstage_encoded = pd.get_dummies(df[sixthstage_encode], prefix=sixthstage_encode)
df = pd.concat([df.drop(columns=sixthstage_encode), sixthstage_encoded], axis=1)

In [14]:
# One-hot encode the N stage: build the indicator columns, then swap them in for
# the original text column (indicator columns are appended at the end of the frame).
nstage_encode = 'Breast - Adjusted AJCC 6th N (1988-2015)'
nstage_encoded = pd.get_dummies(df[nstage_encode], prefix=nstage_encode)
df = pd.concat([df.drop(columns=nstage_encode), nstage_encoded], axis=1)

In [15]:
# One-hot encode the M stage: build the indicator columns, then swap them in for
# the original text column (indicator columns are appended at the end of the frame).
mstage_encode = 'Breast - Adjusted AJCC 6th M (1988-2015)'
mstage_encoded = pd.get_dummies(df[mstage_encode], prefix=mstage_encode)
df = pd.concat([df.drop(columns=mstage_encode), mstage_encoded], axis=1)

In [16]:
# Age is kept as a single numeric feature rather than being one-hot encoded.
age_col = 'Age recode with single ages and 85+'

# Strip the ' years' suffix so only the number (or '85+') remains.
age_text = df[age_col].str.replace(' years', '')

# The open-ended top bracket is stored as '85+'; treat it as 85.
age_text = age_text.replace('85+', '85')

# Store the cleaned values back as integers.
df[age_col] = age_text.astype(int)

In [17]:
# One-hot encode the income bracket: build the indicator columns, then swap them in
# for the original text column. `income_encoded` is reused later as the feature matrix.
income_encode = 'Median household income inflation adj to 2019'
income_encoded = pd.get_dummies(df[income_encode], prefix=income_encode)
df = pd.concat([df.drop(columns=income_encode), income_encoded], axis=1)

In [18]:
# Convert the target column from text to integers; all other columns keep their dtype.
df = df.astype({'Survival months': int})

Using LazyPredict

In [31]:
from lazypredict.Supervised import LazyRegressor

Decision Trees

In [None]:
# Target is survival time in months; every remaining column is a candidate feature.
y = df['Survival months']
X = df.drop('Survival months', axis=1)

Decision trees work well with categorical features, so they may also be a good fit for stage classification?

In [22]:
# Build the feature matrix from a list of feature frames. Only the one-hot income
# columns are included at the moment; append further frames to widen the feature set.
feature_blocks = [income_encoded]
X = pd.concat(feature_blocks, axis=1)

# Target: survival time in months.
y = df['Survival months']

In [23]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1234,
)

In [32]:
# Fit every regressor LazyPredict knows about and collect a comparison table
# (`models`) plus the per-model test-set predictions (`predictions`).
# NOTE(review): the recorded run was still in progress after more than two hours
# (26 of 42 models) — consider sub-sampling before re-running.
reg = LazyRegressor(predictions=True)
lazy_results = reg.fit(X_train, X_test, Y_train, Y_test)
models, predictions = lazy_results

 62%|██████▏   | 26/42 [2:15:27<46:25, 174.07s/it]    

In [29]:
# Create a shallow decision tree regression model (depth capped at 5).
# random_state is fixed because scikit-learn permutes the features at every split,
# so tied candidate splits could otherwise be resolved differently between runs;
# the seed matches the one used for the train/test split.
tree_reg = DecisionTreeRegressor(max_depth=5, random_state=1234)
tree_reg.fit(X_train, Y_train)

# Make predictions on the held-out testing set
y_pred = tree_reg.predict(X_test)

# Evaluate the model's performance: mean squared error and coefficient of determination
mse = mean_squared_error(Y_test, y_pred)
r2 = r2_score(Y_test, y_pred)
print('MSE:', mse)
print('R2:', r2)

MSE: 3647.70318240394
R2: 0.013105469737920594
