In [1]:
# import dependencies
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical

In [2]:
# set seed value for the notebook so the results are reproducible
# NumPy alone is not enough: the Keras models below draw their random numbers
# from TensorFlow (and, in newer Keras versions, from Python's `random`), so
# every generator is seeded here.
import random

import tensorflow as tf
from numpy.random import seed

seed(1)
random.seed(1)
tf.random.set_seed(1)

In [3]:
# load the thoracic-surgery csv into a DataFrame and discard its "id" column
df = pd.read_csv("Resources/Data/ThoracicSurgery.csv").drop(columns="id")

In [4]:
# set data for classifier
target = df["Risk1Yr"]
# Display names must follow the order in which scikit-learn sorts the string
# labels ("F" sorts before "T"); listing them as T, F would swap the classes.
target_names = ["F", "T"]
# one-hot encode every categorical feature column
data = pd.get_dummies(df.drop("Risk1Yr", axis=1))
feature_names = data.columns

In [5]:
# hold out 20% of the rows for testing; random_state pins the shuffle
split_parts = train_test_split(data, target, test_size=0.2, random_state=1)
X_train, X_test, y_train, y_test = split_parts

In [6]:
# fit a 200-tree random forest on the training split, then show its accuracy
# on the test split
rf = RandomForestClassifier(n_estimators=200).fit(X_train, y_train)
rf.score(X_test, y_test)

0.8617021276595744

In [7]:
# pair each importance with its feature name and rank from most to least important
importance_pairs = list(zip(rf.feature_importances_, feature_names))
importance_pairs.sort(reverse=True)
importance_pairs

[(0.18112493347465042, 'PRE5'),
 (0.17134144561789622, 'PRE4'),
 (0.159356351562549, 'AGE'),
 (0.03002138278667311, 'DGN_DGN5'),
 (0.027762292765678532, 'PRE14_OC11'),
 (0.0267437945513764, 'DGN_DGN3'),
 (0.025202968373976894, 'PRE14_OC12'),
 (0.023257866712346614, 'PRE14_OC14'),
 (0.02244120352880268, 'DGN_DGN2'),
 (0.02137556421800881, 'PRE11_F'),
 (0.02055635109184197, 'PRE11_T'),
 (0.02047555773552332, 'PRE6_PRZ1'),
 (0.019714743869018902, 'PRE8_T'),
 (0.01939140105620525, 'PRE17_T'),
 (0.019262892924332534, 'PRE8_F'),
 (0.01775123542926093, 'PRE14_OC13'),
 (0.016005591991755153, 'PRE17_F'),
 (0.01587028845481526, 'PRE10_F'),
 (0.015693540981493087, 'PRE7_T'),
 (0.01523623043631555, 'PRE9_T'),
 (0.015185140501332177, 'PRE10_T'),
 (0.015088366391897005, 'PRE6_PRZ0'),
 (0.013996162206847072, 'PRE7_F'),
 (0.013863661637729941, 'PRE30_F'),
 (0.013675864997954838, 'PRE30_T'),
 (0.013226252982040509, 'PRE9_F'),
 (0.013160973805714652, 'DGN_DGN4'),
 (0.010295785950856968, 'DGN_DGN8'),
 (0

In [8]:
# features (X) are the one-hot encoded columns, labels (y) the Risk1Yr column
X, y = data, target

In [9]:
# 80/20 split, stratified on y so both partitions keep the class proportions
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
    stratify=y,
)

In [10]:
# min-max scale the features; the scaler is fitted on the training split only
X_scaler = MinMaxScaler()
X_scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

# turn the string labels into integer codes (encoder fitted on y_train)
label_encoder = LabelEncoder().fit(y_train)
encoded_y_train, encoded_y_test = (
    label_encoder.transform(labels) for labels in (y_train, y_test)
)

# one-hot encode the integer codes
y_train_categorical, y_test_categorical = (
    to_categorical(codes) for codes in (encoded_y_train, encoded_y_test)
)

NameError: name 'to_categorical' is not defined

In [None]:
# create model and add layers
# The input width is read from the scaled training matrix instead of being
# hard-coded, so the network still builds if the one-hot encoding produces a
# different number of feature columns.
n_features = X_train_scaled.shape[1]

model = Sequential()
model.add(Dense(units=100, activation='relu', input_dim=n_features))
model.add(Dense(units=100, activation='relu'))
# two output units, one per one-hot encoded class
model.add(Dense(units=2, activation='softmax'))

In [None]:
# configure training (Adam optimiser, categorical cross-entropy loss, accuracy
# metric) and print the layer summary
compile_options = {
    'optimizer': 'adam',
    'loss': 'categorical_crossentropy',
    'metrics': ['accuracy'],
}
model.compile(**compile_options)
model.summary()

In [None]:
# train the network for 60 epochs on the scaled features / one-hot labels
model.fit(x=X_train_scaled, y=y_train_categorical, epochs=60, verbose=2, shuffle=True)

In [None]:
# score the trained network on the held-out test split
evaluation = model.evaluate(X_test_scaled, y_test_categorical, verbose=2)
model_loss, model_accuracy = evaluation
print(f"Normal Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

In [None]:
#  make predictions
# `Sequential.predict_classes` no longer exists in current Keras/TensorFlow
# releases, so the class index is taken as the arg-max of the softmax output.
class_probabilities = model.predict(X_test_scaled[:5])
encoded_predictions = np.argmax(class_probabilities, axis=-1)
# map the integer codes back to the original labels
prediction_labels = label_encoder.inverse_transform(encoded_predictions)
print(f"Predicted classes: {prediction_labels}")
print(f"Actual Labels: {list(y_test[:5])}")

In [None]:
# linear-kernel support vector classifier
# (note: this rebinds `model`, which held the Keras network until now)
from sklearn.svm import SVC

model = SVC(kernel="linear")

In [None]:
# show the estimator configured in the previous cell instead of constructing a
# second, unused SVC just to see its repr
model

In [None]:
# create the GridSearch estimator along with a parameter object containing the values to adjust
from sklearn.model_selection import GridSearchCV

# `gamma` only influences the 'rbf', 'poly' and 'sigmoid' kernels. `model` uses
# the linear kernel, which ignores it, so searching over gamma merely repeated
# every fit with identical results. Only `C` is tuned.
param_grid = {'C': [1, 5, 10, 50]}
grid = GridSearchCV(model, param_grid, verbose=3)

In [None]:
# train the model with GridSearch
# Runs the search over `param_grid` using the scaled training features and the
# original (string) y_train labels.
grid.fit(X_train_scaled, y_train)

In [None]:
 # report the outcome of the grid search fitted in the previous cell:
# the winning parameter combination and its mean cross-validated score
print(grid.best_params_)
print(grid.best_score_)

In [None]:
# make predictions with hypertuned model
# Predicts labels for the scaled test features with the fitted grid search and
# prints the resulting array.
predictions = grid.predict(X_test_scaled)
print(predictions)

In [None]:
# accuracy of the tuned classifier on the scaled test split, to three decimals
test_accuracy = grid.score(X_test_scaled, y_test)
print('Test Acc: %.3f' % test_accuracy)

In [None]:
# f1 score
from sklearn.metrics import classification_report
from sklearn import metrics
from sklearn.metrics import f1_score

# Average over every class that occurs in y_test or predictions. Restricting
# `labels` to the predicted classes silently drops any class the model never
# predicts and inflates the weighted score; `zero_division=0` instead scores
# such a class as 0 without emitting a warning.
metrics.f1_score(y_test, predictions, average='weighted', zero_division=0)

In [None]:
# calculate classification report
# The labels here are the strings "F" and "T", which scikit-learn reports in
# sorted order, so the display names must be given as F first, then T.
# Listing them the other way round swaps the two rows of the report.
print(classification_report(y_test, predictions,
                            target_names=["F", "T"]))

# Data Pre-Processing
### Data Cleaning

In [None]:
# Select the modelling columns into an independent copy. `new_df` is modified
# column-by-column further down; without the copy those writes target a slice
# of `df` and pandas raises SettingWithCopyWarning.
new_df = df[['DGN', 'PRE4', 'PRE5', 'PRE6', 'PRE7', 'PRE8', 'PRE9', 'PRE10', 'PRE11', 'PRE14', 'PRE17', 'PRE19', 'PRE25', 'PRE30', 'PRE32', 'AGE', 'Risk1Yr']].copy()
new_df.head()

In [None]:
# columns to keep, in display order
columns = [
    "DGN", "PRE4", "PRE5", "PRE6", "PRE7", "PRE8", "PRE9", "PRE10", "PRE11",
    "PRE14", "PRE17", "PRE19", "PRE25", "PRE30", "PRE32", "AGE", "Risk1Yr",
]

# rows whose Risk1Yr flag is "F"
is_risk_f = new_df["Risk1Yr"] == "F"
lived_df = new_df.loc[is_risk_f, columns]
lived_df

In [None]:
# counts
# value frequencies of each categorical column within lived_df
counted_columns = ["PRE7", "PRE8", "PRE9", "PRE10", "PRE11", "PRE14",
                   "PRE17", "PRE19", "PRE25", "PRE30", "PRE32"]
(pre7_df, pre8_df, pre9_df, pre10_df, pre11_df, pre14_df,
 pre17_df, pre19_df, pre25_df, pre30_df, pre32_df) = (
    lived_df[column].value_counts() for column in counted_columns
)

# print the tables one after another, separated by a dashed rule
count_tables = [pre7_df, pre8_df, pre9_df, pre10_df, pre11_df, pre14_df,
                pre17_df, pre19_df, pre25_df, pre30_df, pre32_df]
for position, counts in enumerate(count_tables):
    if position > 0:
        print(f"-------------------------")
    print(counts)

In [None]:
# how often each DGN value occurs among the rows in lived_df
DGN_df = lived_df["DGN"].value_counts()
DGN_df

In [None]:
# Cleaning dataframe - adjusting string values into numeric processible values
# Each categorical code is mapped to its digit string. Values that are not
# listed are left untouched. `replace` returns a new DataFrame, so nothing is
# written into the existing frame column-by-column.
code_digits = {
    'DGN': {'DGN1': '1', 'DGN2': '2', 'DGN3': '3', 'DGN4': '4',
            'DGN5': '5', 'DGN6': '6', 'DGN8': '8'},
    'PRE6': {'PRZ2': '2', 'PRZ1': '1', 'PRZ0': '0'},
    'PRE14': {'OC11': '11', 'OC12': '12', 'OC13': '13', 'OC14': '14'},
}
new_df = new_df.replace(code_digits)
new_df.head()

In [None]:
# Data Cleaning - copying dataframe to allow other adjustments for later
# `.copy()` is required for a real copy: plain assignment would only give the
# same DataFrame a second name, so edits to cleaned_df would also change new_df.
cleaned_df = new_df.copy()

In [None]:
# Recode the T/F columns as digit strings: 'T' -> '0' and 'F' -> '1'.
# One mapping applied to the listed columns replaces the per-column np.where
# pairs; `replace` returns a new DataFrame instead of writing into the existing
# one column at a time.
flag_columns = ['PRE7', 'PRE8', 'PRE9', 'PRE10', 'PRE11', 'PRE17',
                'PRE19', 'PRE25', 'PRE30', 'PRE32', 'Risk1Yr']
flag_codes = {'T': '0', 'F': '1'}
cleaned_df = cleaned_df.replace({column: flag_codes for column in flag_columns})
cleaned_df.head()

In [None]:
# Convert the digit-string columns to integers in a single `astype` call.
# A column -> dtype mapping replaces fourteen copy-pasted assignments and
# returns a new DataFrame rather than overwriting columns in place.
int_columns = ['DGN', 'PRE6', 'PRE7', 'PRE8', 'PRE9', 'PRE10', 'PRE11',
               'PRE14', 'PRE17', 'PRE19', 'PRE25', 'PRE30', 'PRE32', 'Risk1Yr']
cleaned_df = cleaned_df.astype({column: int for column in int_columns})

In [None]:
# display the dtype of every column to confirm the integer conversion above
cleaned_df.dtypes

# Modeling on Risk1Yr

In [None]:
# separate the Risk1Yr label (y) from the predictor columns (X), then report
# the shape of each
y = cleaned_df['Risk1Yr']
X = cleaned_df.drop(columns='Risk1Yr')
print(X.shape, y.shape)

In [None]:
# train/test split with the default test size and a pinned random_state
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

In [None]:
# fit the min-max scaler on the training split and apply it to both splits
X_scaler = MinMaxScaler()
X_train_scaled = X_scaler.fit_transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
# label-encode both target splits with an encoder fitted on y_train
label_encoder = LabelEncoder().fit(y_train)
encoded_y_train, encoded_y_test = (
    label_encoder.transform(labels) for labels in (y_train, y_test)
)

In [None]:
# one-hot encode the integer label codes of both splits
y_train_categorical, y_test_categorical = (
    to_categorical(codes) for codes in (encoded_y_train, encoded_y_test)
)

In [None]:
# Build the network. The input width comes from the scaled training matrix
# rather than a hard-coded 16, so adding or removing a feature column does not
# break the first layer.
n_features = X_train_scaled.shape[1]

model = Sequential()
model.add(Dense(units=100, activation='relu', input_dim=n_features))
model.add(Dense(units=100, activation='relu'))
# two output units, one per one-hot encoded class
model.add(Dense(units=2, activation='softmax'))

In [None]:
# configure training: Adam optimiser, categorical cross-entropy loss, accuracy metric
compile_options = {
    'optimizer': 'adam',
    'loss': 'categorical_crossentropy',
    'metrics': ['accuracy'],
}
model.compile(**compile_options)

In [None]:
# print the layer summary of the compiled network
model.summary()

In [None]:
# train the network for 60 epochs on the scaled features / one-hot labels
model.fit(x=X_train_scaled, y=y_train_categorical, epochs=60, verbose=2, shuffle=True)

In [None]:
# score the trained network on the held-out test split
evaluation = model.evaluate(X_test_scaled, y_test_categorical, verbose=2)
model_loss, model_accuracy = evaluation
print(f"Normal Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

In [None]:
# Predict the first five test rows. `Sequential.predict_classes` no longer
# exists in current Keras/TensorFlow releases, so the class index is taken as
# the arg-max of the softmax output.
class_probabilities = model.predict(X_test_scaled[:5])
encoded_predictions = np.argmax(class_probabilities, axis=-1)
# map the integer codes back to the original labels
prediction_labels = label_encoder.inverse_transform(encoded_predictions)

In [None]:
# compare the predicted labels with the first five true labels
actual_labels = list(y_test[:5])
print(f"Predicted classes: {prediction_labels}")
print(f"Actual Labels: {actual_labels}")

In [None]:
# linear-kernel support vector classifier
# (note: this rebinds `model`, which held the Keras network until now)
from sklearn.svm import SVC

model = SVC(kernel="linear")

In [None]:
# show the estimator configured in the previous cell instead of constructing a
# second, unused SVC just to see its repr
model

In [None]:
from sklearn.model_selection import GridSearchCV

# `gamma` only influences the 'rbf', 'poly' and 'sigmoid' kernels. `model` uses
# the linear kernel, which ignores it, so searching over gamma merely repeated
# every fit with identical results. Only `C` is tuned.
param_grid = {'C': [1, 5, 10, 50]}
grid = GridSearchCV(model, param_grid, verbose=3)

In [None]:
# run the grid search on the scaled training features and the y_train labels
grid.fit(X_train_scaled, y_train)

In [None]:
# show the grid search that was actually fitted in the previous cell, rather
# than constructing a separate, unfitted GridSearchCV with a different grid
grid

In [None]:
# winning parameter combination, then its cross-validated score, one per line
print(grid.best_params_, grid.best_score_, sep="\n")

In [None]:
# Make predictions with the hypertuned model
# Predicts labels for the scaled test features with the fitted grid search and
# prints the resulting array.
predictions = grid.predict(X_test_scaled)
print(predictions)

In [None]:
from sklearn.metrics import classification_report
from sklearn import metrics
from sklearn.metrics import f1_score

# Average over every class that occurs in y_test or predictions. Restricting
# `labels` to the predicted classes silently drops any class the model never
# predicts and inflates the weighted score; `zero_division=0` instead scores
# such a class as 0 without emitting a warning.
metrics.f1_score(y_test, predictions, average='weighted', zero_division=0)

In [None]:
# Risk1Yr was recoded as T -> 0 and F -> 1, so the display names are listed in
# that (ascending label) order
report = classification_report(y_test, predictions, target_names=["T", "F"])
print(report)

# Pete Visualization

In [None]:
# remove the listed columns from the raw frame; all other columns are kept
dropped_columns = [
    "DGN", "PRE6", "PRE7", 'PRE8', 'PRE9', 'PRE10',
    'PRE11', 'PRE17', 'PRE19', 'PRE25', 'PRE30', 'PRE32',
]
pete_df = df.drop(columns=dropped_columns)

In [None]:
# Work on a real copy: plain assignment only gives pete_df a second name, so
# the recoding that follows would silently modify pete_df as well.
# NOTE: this rebinds `cleaned_df`, replacing the frame used for modelling above.
cleaned_df = pete_df.copy()

In [None]:
# recode PRE14 ('OC11'..'OC14' -> '11'..'14') and Risk1Yr ('T' -> '0',
# 'F' -> '1') as digit strings, updating cleaned_df in place
recodes = {
    'PRE14': {'OC11': '11', 'OC12': '12', 'OC13': '13', 'OC14': '14'},
    'Risk1Yr': {'T': '0', 'F': '1'},
}
for column, mapping in recodes.items():
    cleaned_df[column] = cleaned_df[column].replace(mapping)
cleaned_df.head()

In [None]:
# store PRE14 as integers (it holds digit strings after the recoding above)
cleaned_df['PRE14'] = cleaned_df['PRE14'].astype(int)

In [None]:
# Rows whose Risk1Yr is '1' (recoded from 'F'), with readable column names and
# without the label column. The steps are chained so nothing is renamed in
# place on a filtered slice of cleaned_df, which can trigger pandas'
# SettingWithCopyWarning.
survived_df = (
    cleaned_df.loc[cleaned_df["Risk1Yr"] == '1']
    .rename(columns={"PRE4": "FVC", "PRE5": "FEV",
                     "PRE14": "TumorSize", "AGE": "Age"})
    .drop(columns="Risk1Yr")
)
survived_df.head()

In [None]:
# summarise every column of survived_df and keep only the 'mean' row
survived_df.describe(include='all').loc[['mean']]

In [None]:
# Rows whose Risk1Yr is '0' (recoded from 'T'), with readable column names and
# without the label column. The steps are chained so nothing is renamed in
# place on a filtered slice of cleaned_df, which can trigger pandas'
# SettingWithCopyWarning.
notSurvived_df = (
    cleaned_df.loc[cleaned_df["Risk1Yr"] == '0']
    .rename(columns={"PRE4": "FVC", "PRE5": "FEV",
                     "PRE14": "TumorSize", "AGE": "Age"})
    .drop(columns="Risk1Yr")
)
notSurvived_df.head()

In [None]:
# summarise every column of notSurvived_df and keep only the 'mean' row
notSurvived_df.describe(include='all').loc[['mean']]