In [1]:
# import dependencies
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical

In [2]:
# set seed value for the notebook so the results are reproducible
from numpy.random import seed
seed(1)

In [3]:
# read the csv file into a pandas DataFrame, drop id column
df = pd.read_csv("Resources/Data/ThoracicSurgery.csv")
df = df.drop("id", axis=1)

In [4]:
# set data for classifier 
target = df["Risk1Yr"]
target_names = ["T", "F"]
data = df.drop("Risk1Yr", axis=1)
data = pd.get_dummies(data)
feature_names = data.columns

In [5]:
# split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=0.2, random_state=1)

In [6]:
# create and score random forest classifier
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.8617021276595744

In [7]:
# sort the features by their importance
sorted(zip(rf.feature_importances_, feature_names), reverse=True) 

[(0.18112493347465042, 'PRE5'),
 (0.17134144561789622, 'PRE4'),
 (0.159356351562549, 'AGE'),
 (0.03002138278667311, 'DGN_DGN5'),
 (0.027762292765678532, 'PRE14_OC11'),
 (0.0267437945513764, 'DGN_DGN3'),
 (0.025202968373976894, 'PRE14_OC12'),
 (0.023257866712346614, 'PRE14_OC14'),
 (0.02244120352880268, 'DGN_DGN2'),
 (0.02137556421800881, 'PRE11_F'),
 (0.02055635109184197, 'PRE11_T'),
 (0.02047555773552332, 'PRE6_PRZ1'),
 (0.019714743869018902, 'PRE8_T'),
 (0.01939140105620525, 'PRE17_T'),
 (0.019262892924332534, 'PRE8_F'),
 (0.01775123542926093, 'PRE14_OC13'),
 (0.016005591991755153, 'PRE17_F'),
 (0.01587028845481526, 'PRE10_F'),
 (0.015693540981493087, 'PRE7_T'),
 (0.01523623043631555, 'PRE9_T'),
 (0.015185140501332177, 'PRE10_T'),
 (0.015088366391897005, 'PRE6_PRZ0'),
 (0.013996162206847072, 'PRE7_F'),
 (0.013863661637729941, 'PRE30_F'),
 (0.013675864997954838, 'PRE30_T'),
 (0.013226252982040509, 'PRE9_F'),
 (0.013160973805714652, 'DGN_DGN4'),
 (0.010295785950856968, 'DGN_DGN8'),
 (0

In [8]:
# establish X and y
X = data
y = target

In [9]:
# create train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1, stratify=y)

In [10]:
# scale data
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# label-encode target data set
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

# convert encoded labels to one-hot-encoding
y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)

In [11]:
# create model and add layers
model = Sequential()
model.add(Dense(units=100, activation='relu', input_dim=37))
model.add(Dense(units=100, activation='relu'))
model.add(Dense(units=2, activation='softmax'))

In [12]:
# compile the model
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 100)               3800      
_________________________________________________________________
dense_1 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 202       
Total params: 14,102
Trainable params: 14,102
Non-trainable params: 0
_________________________________________________________________


In [13]:
# fit (train) the model
model.fit(
    X_train_scaled,
    y_train_categorical,
    epochs=60,
    shuffle=True,
    verbose=2
)

Epoch 1/60
12/12 - 1s - loss: 0.4898 - accuracy: 0.7819
Epoch 2/60
12/12 - 0s - loss: 0.4372 - accuracy: 0.8511
Epoch 3/60
12/12 - 0s - loss: 0.4098 - accuracy: 0.8511
Epoch 4/60
12/12 - 0s - loss: 0.3881 - accuracy: 0.8511
Epoch 5/60
12/12 - 0s - loss: 0.3806 - accuracy: 0.8511
Epoch 6/60
12/12 - 0s - loss: 0.3733 - accuracy: 0.8511
Epoch 7/60
12/12 - 0s - loss: 0.3648 - accuracy: 0.8511
Epoch 8/60
12/12 - 0s - loss: 0.3602 - accuracy: 0.8511
Epoch 9/60
12/12 - 0s - loss: 0.3527 - accuracy: 0.8511
Epoch 10/60
12/12 - 0s - loss: 0.3465 - accuracy: 0.8590
Epoch 11/60
12/12 - 0s - loss: 0.3405 - accuracy: 0.8564
Epoch 12/60
12/12 - 0s - loss: 0.3335 - accuracy: 0.8670
Epoch 13/60
12/12 - 0s - loss: 0.3310 - accuracy: 0.8644
Epoch 14/60
12/12 - 0s - loss: 0.3193 - accuracy: 0.8750
Epoch 15/60
12/12 - 0s - loss: 0.3173 - accuracy: 0.8750
Epoch 16/60
12/12 - 0s - loss: 0.3032 - accuracy: 0.8883
Epoch 17/60
12/12 - 0s - loss: 0.3006 - accuracy: 0.8777
Epoch 18/60
12/12 - 0s - loss: 0.2909 - 

<tensorflow.python.keras.callbacks.History at 0x14a9a940820>

In [14]:
# evaluate the model using the testing data
model_loss, model_accuracy = model.evaluate(
    X_test_scaled, y_test_categorical, verbose=2)
print(
    f"Normal Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

3/3 - 0s - loss: 0.7031 - accuracy: 0.8404
Normal Neural Network - Loss: 0.7030636668205261, Accuracy: 0.8404255509376526


In [15]:
#  make predictions
encoded_predictions = model.predict_classes(X_test_scaled[:5])
prediction_labels = label_encoder.inverse_transform(encoded_predictions)
print(f"Predicted classes: {prediction_labels}")
print(f"Actual Labels: {list(y_test[:5])}")

Predicted classes: ['F' 'F' 'F' 'F' 'F']
Actual Labels: ['F', 'F', 'F', 'F', 'F']




In [16]:
# create the SVC Model
from sklearn.svm import SVC 

model = SVC(kernel='linear')

In [17]:
SVC(kernel='linear')

SVC(kernel='linear')

In [18]:
# create the GridSearch estimator along with a parameter object containing the values to adjust
from sklearn.model_selection import GridSearchCV

param_grid = {'C': [1, 5, 10, 50],
              'gamma': [0.0001, 0.0005, 0.001, 0.005]}
grid = GridSearchCV(model, param_grid, verbose=3)

In [19]:
# train the model with GridSearch
grid.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV 1/5] END ..............................C=1, gamma=0.0001; total time=   0.0s
[CV 2/5] END ..............................C=1, gamma=0.0001; total time=   0.0s
[CV 3/5] END ..............................C=1, gamma=0.0001; total time=   0.0s
[CV 4/5] END ..............................C=1, gamma=0.0001; total time=   0.0s
[CV 5/5] END ..............................C=1, gamma=0.0001; total time=   0.0s
[CV 1/5] END ..............................C=1, gamma=0.0005; total time=   0.0s
[CV 2/5] END ..............................C=1, gamma=0.0005; total time=   0.0s
[CV 3/5] END ..............................C=1, gamma=0.0005; total time=   0.0s
[CV 4/5] END ..............................C=1, gamma=0.0005; total time=   0.0s
[CV 5/5] END ..............................C=1, gamma=0.0005; total time=   0.0s
[CV 1/5] END ...............................C=1, gamma=0.001; total time=   0.0s
[CV 2/5] END ...............................C=1,

GridSearchCV(estimator=SVC(kernel='linear'),
             param_grid={'C': [1, 5, 10, 50],
                         'gamma': [0.0001, 0.0005, 0.001, 0.005]},
             verbose=3)

In [20]:
 # fit the model using the grid search estimator 
GridSearchCV(estimator=SVC(kernel='linear'),
             param_grid={'C': [1, 5, 10], 'gamma': [0.0001, 0.001, 0.01]},
             verbose=2)
 # list the best parameters, score for this dataset
print(grid.best_params_)
print(grid.best_score_)

{'C': 5, 'gamma': 0.0001}
0.8510877192982456


In [21]:
# make predictions with hypertuned model
predictions = grid.predict(X_test_scaled)
print(predictions)

['F' 'F' 'T' 'F' 'F' 'F' 'T' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F'
 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'T' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F'
 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F'
 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'T' 'T' 'F' 'F' 'F' 'F' 'F' 'F' 'F'
 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'T' 'F' 'F' 'F' 'F'
 'F' 'F' 'F' 'F']


In [22]:
# list the best score
print('Test Acc: %.3f' % grid.score(X_test_scaled, y_test))

Test Acc: 0.830


In [23]:
# f1 score
from sklearn.metrics import classification_report
from sklearn import metrics
from sklearn.metrics import f1_score
metrics.f1_score(y_test, predictions, average='weighted', labels=np.unique(predictions))

0.7997973657548125

In [24]:
# calculate classification report
print(classification_report(y_test, predictions,
                            target_names=["T", "F"]))

              precision    recall  f1-score   support

           T       0.86      0.95      0.90        80
           F       0.33      0.14      0.20        14

    accuracy                           0.83        94
   macro avg       0.60      0.55      0.55        94
weighted avg       0.78      0.83      0.80        94



# Data Pre-Processing
### Data Cleaning

In [25]:
new_df = df[['DGN', 'PRE4', 'PRE5', 'PRE6', 'PRE7', 'PRE8', 'PRE9', 'PRE10', 'PRE11', 'PRE14', 'PRE17', 'PRE19', 'PRE25', 'PRE30', 'PRE32', 'AGE', 'Risk1Yr']]
new_df.head()

Unnamed: 0,DGN,PRE4,PRE5,PRE6,PRE7,PRE8,PRE9,PRE10,PRE11,PRE14,PRE17,PRE19,PRE25,PRE30,PRE32,AGE,Risk1Yr
0,DGN2,2.88,2.16,PRZ1,F,F,F,T,T,OC14,F,F,F,T,F,60,F
1,DGN3,3.4,1.88,PRZ0,F,F,F,F,F,OC12,F,F,F,T,F,51,F
2,DGN3,2.76,2.08,PRZ1,F,F,F,T,F,OC11,F,F,F,T,F,59,F
3,DGN3,3.68,3.04,PRZ0,F,F,F,F,F,OC11,F,F,F,F,F,54,F
4,DGN3,2.44,0.96,PRZ2,F,T,F,T,T,OC11,F,F,F,T,F,73,T


In [26]:
columns = [
           "DGN",
           "PRE4",
           "PRE5",
           "PRE6",
           "PRE7",
           "PRE8",
           "PRE9",
           "PRE10",
           "PRE11",
           "PRE14",
           "PRE17",
           "PRE19",
           "PRE25",
           "PRE30",
           "PRE32",
           "AGE",
           "Risk1Yr",
]

lived_df = new_df.loc[new_df["Risk1Yr"] == "F", columns ]
lived_df

Unnamed: 0,DGN,PRE4,PRE5,PRE6,PRE7,PRE8,PRE9,PRE10,PRE11,PRE14,PRE17,PRE19,PRE25,PRE30,PRE32,AGE,Risk1Yr
0,DGN2,2.88,2.16,PRZ1,F,F,F,T,T,OC14,F,F,F,T,F,60,F
1,DGN3,3.40,1.88,PRZ0,F,F,F,F,F,OC12,F,F,F,T,F,51,F
2,DGN3,2.76,2.08,PRZ1,F,F,F,T,F,OC11,F,F,F,T,F,59,F
3,DGN3,3.68,3.04,PRZ0,F,F,F,F,F,OC11,F,F,F,F,F,54,F
5,DGN3,2.48,1.88,PRZ1,F,F,F,T,F,OC11,F,F,F,F,F,51,F
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
465,DGN2,3.88,2.12,PRZ1,F,F,F,T,F,OC13,F,F,F,T,F,63,F
466,DGN3,3.76,3.12,PRZ0,F,F,F,F,F,OC11,F,F,F,T,F,61,F
467,DGN3,3.04,2.08,PRZ1,F,F,F,T,F,OC13,F,F,F,F,F,52,F
468,DGN3,1.96,1.68,PRZ1,F,F,F,T,T,OC12,F,F,F,T,F,79,F


In [27]:
# counts
pre7_df = lived_df["PRE7"].value_counts()
pre8_df = lived_df["PRE8"].value_counts()
pre9_df = lived_df["PRE9"].value_counts()
pre10_df = lived_df["PRE10"].value_counts()
pre11_df = lived_df["PRE11"].value_counts()
pre14_df = lived_df["PRE14"].value_counts()
pre17_df = lived_df["PRE17"].value_counts()
pre19_df = lived_df["PRE19"].value_counts()
pre25_df = lived_df["PRE25"].value_counts()
pre30_df = lived_df["PRE30"].value_counts()
pre32_df = lived_df["PRE32"].value_counts()

print(pre7_df)
print(f"-------------------------")
print(pre8_df)
print(f"-------------------------")
print(pre9_df)
print(f"-------------------------")
print(pre10_df)
print(f"-------------------------")
print(pre11_df)
print(f"-------------------------")
print(pre14_df)
print(f"-------------------------")
print(pre17_df)
print(f"-------------------------")
print(pre19_df)
print(f"-------------------------")
print(pre25_df)
print(f"-------------------------")
print(pre30_df)
print(f"-------------------------")
print(pre32_df)

F    376
T     24
Name: PRE7, dtype: int64
-------------------------
F    346
T     54
Name: PRE8, dtype: int64
-------------------------
F    378
T     22
Name: PRE9, dtype: int64
-------------------------
T    268
F    132
Name: PRE10, dtype: int64
-------------------------
F    339
T     61
Name: PRE11, dtype: int64
-------------------------
OC12    218
OC11    159
OC13     13
OC14     10
Name: PRE14, dtype: int64
-------------------------
F    375
T     25
Name: PRE17, dtype: int64
-------------------------
F    398
T      2
Name: PRE19, dtype: int64
-------------------------
F    394
T      6
Name: PRE25, dtype: int64
-------------------------
T    323
F     77
Name: PRE30, dtype: int64
-------------------------
F    398
T      2
Name: PRE32, dtype: int64


In [28]:
DGN_df = lived_df["DGN"].value_counts()
DGN_df

DGN3    306
DGN2     40
DGN4     40
DGN5      8
DGN6      4
DGN8      1
DGN1      1
Name: DGN, dtype: int64

In [29]:
# Cleaning dataframe - adjusting string values into numeric processible values
new_df['DGN'] = np.where(new_df['DGN'] == 'DGN1', '1', new_df['DGN'] )
new_df['DGN'] = np.where(new_df['DGN'] == 'DGN2', '2', new_df['DGN'] )
new_df['DGN'] = np.where(new_df['DGN'] == 'DGN3', '3', new_df['DGN'] )
new_df['DGN'] = np.where(new_df['DGN'] == 'DGN4', '4', new_df['DGN'] )
new_df['DGN'] = np.where(new_df['DGN'] == 'DGN5', '5', new_df['DGN'] )
new_df['DGN'] = np.where(new_df['DGN'] == 'DGN6', '6', new_df['DGN'] )
new_df['DGN'] = np.where(new_df['DGN'] == 'DGN8', '8', new_df['DGN'] )
new_df['PRE6'] = np.where(new_df['PRE6'] == 'PRZ2', '2', new_df['PRE6'] )
new_df['PRE6'] = np.where(new_df['PRE6'] == 'PRZ1', '1', new_df['PRE6'] )
new_df['PRE6'] = np.where(new_df['PRE6'] == 'PRZ0', '0', new_df['PRE6'] )
new_df['PRE14'] = np.where(new_df['PRE14'] == 'OC11', '11', new_df['PRE14'] )
new_df['PRE14'] = np.where(new_df['PRE14'] == 'OC12', '12', new_df['PRE14'] )
new_df['PRE14'] = np.where(new_df['PRE14'] == 'OC13', '13', new_df['PRE14'] )
new_df['PRE14'] = np.where(new_df['PRE14'] == 'OC14', '14', new_df['PRE14'] )
new_df.head()

Unnamed: 0,DGN,PRE4,PRE5,PRE6,PRE7,PRE8,PRE9,PRE10,PRE11,PRE14,PRE17,PRE19,PRE25,PRE30,PRE32,AGE,Risk1Yr
0,2,2.88,2.16,1,F,F,F,T,T,14,F,F,F,T,F,60,F
1,3,3.4,1.88,0,F,F,F,F,F,12,F,F,F,T,F,51,F
2,3,2.76,2.08,1,F,F,F,T,F,11,F,F,F,T,F,59,F
3,3,3.68,3.04,0,F,F,F,F,F,11,F,F,F,F,F,54,F
4,3,2.44,0.96,2,F,T,F,T,T,11,F,F,F,T,F,73,T


In [30]:
# Data Cleaning - copying dataframe to allow other adjustments for later
cleaned_df = new_df

In [31]:
cleaned_df['PRE7'] = np.where(cleaned_df['PRE7'] == 'T', '0', cleaned_df['PRE7'] )
cleaned_df['PRE7'] = np.where(cleaned_df['PRE7'] == 'F', '1', cleaned_df['PRE7'] )
cleaned_df['PRE8'] = np.where(cleaned_df['PRE8'] == 'T', '0', cleaned_df['PRE8'] )
cleaned_df['PRE8'] = np.where(cleaned_df['PRE8'] == 'F', '1', cleaned_df['PRE8'] )
cleaned_df['PRE9'] = np.where(cleaned_df['PRE9'] == 'T', '0', cleaned_df['PRE9'] )
cleaned_df['PRE9'] = np.where(cleaned_df['PRE9'] == 'F', '1', cleaned_df['PRE9'] )
cleaned_df['PRE10'] = np.where(cleaned_df['PRE10'] == 'T', '0', cleaned_df['PRE10'] )
cleaned_df['PRE10'] = np.where(cleaned_df['PRE10'] == 'F', '1', cleaned_df['PRE10'] )
cleaned_df['PRE11'] = np.where(cleaned_df['PRE11'] == 'T', '0', cleaned_df['PRE11'] )
cleaned_df['PRE11'] = np.where(cleaned_df['PRE11'] == 'F', '1', cleaned_df['PRE11'] )
cleaned_df['PRE17'] = np.where(cleaned_df['PRE17'] == 'T', '0', cleaned_df['PRE17'] )
cleaned_df['PRE17'] = np.where(cleaned_df['PRE17'] == 'F', '1', cleaned_df['PRE17'] )
cleaned_df['PRE19'] = np.where(cleaned_df['PRE19'] == 'T', '0', cleaned_df['PRE19'] )
cleaned_df['PRE19'] = np.where(cleaned_df['PRE19'] == 'F', '1', cleaned_df['PRE19'] )
cleaned_df['PRE25'] = np.where(cleaned_df['PRE25'] == 'T', '0', cleaned_df['PRE25'] )
cleaned_df['PRE25'] = np.where(cleaned_df['PRE25'] == 'F', '1', cleaned_df['PRE25'] )
cleaned_df['PRE30'] = np.where(cleaned_df['PRE30'] == 'T', '0', cleaned_df['PRE30'] )
cleaned_df['PRE30'] = np.where(cleaned_df['PRE30'] == 'F', '1', cleaned_df['PRE30'] )
cleaned_df['PRE32'] = np.where(cleaned_df['PRE32'] == 'T', '0', cleaned_df['PRE32'] )
cleaned_df['PRE32'] = np.where(cleaned_df['PRE32'] == 'F', '1', cleaned_df['PRE32'] )
cleaned_df['Risk1Yr'] = np.where(cleaned_df['Risk1Yr'] == 'T', '0', cleaned_df['Risk1Yr'] )
cleaned_df['Risk1Yr'] = np.where(cleaned_df['Risk1Yr'] == 'F', '1', cleaned_df['Risk1Yr'] )
cleaned_df.head()

Unnamed: 0,DGN,PRE4,PRE5,PRE6,PRE7,PRE8,PRE9,PRE10,PRE11,PRE14,PRE17,PRE19,PRE25,PRE30,PRE32,AGE,Risk1Yr
0,2,2.88,2.16,1,1,1,1,0,0,14,1,1,1,0,1,60,1
1,3,3.4,1.88,0,1,1,1,1,1,12,1,1,1,0,1,51,1
2,3,2.76,2.08,1,1,1,1,0,1,11,1,1,1,0,1,59,1
3,3,3.68,3.04,0,1,1,1,1,1,11,1,1,1,1,1,54,1
4,3,2.44,0.96,2,1,0,1,0,0,11,1,1,1,0,1,73,0


In [32]:
cleaned_df['DGN'] = cleaned_df['DGN'].astype(int)
cleaned_df['PRE6'] = cleaned_df['PRE6'].astype(int)
cleaned_df['PRE7'] = cleaned_df['PRE7'].astype(int)
cleaned_df['PRE8'] = cleaned_df['PRE8'].astype(int)
cleaned_df['PRE9'] = cleaned_df['PRE9'].astype(int)
cleaned_df['PRE10'] = cleaned_df['PRE10'].astype(int)
cleaned_df['PRE11'] = cleaned_df['PRE11'].astype(int)
cleaned_df['PRE14'] = cleaned_df['PRE14'].astype(int)
cleaned_df['PRE17'] = cleaned_df['PRE17'].astype(int)
cleaned_df['PRE19'] = cleaned_df['PRE19'].astype(int)
cleaned_df['PRE25'] = cleaned_df['PRE25'].astype(int)
cleaned_df['PRE30'] = cleaned_df['PRE30'].astype(int)
cleaned_df['PRE32'] = cleaned_df['PRE32'].astype(int)
cleaned_df['Risk1Yr'] = cleaned_df['Risk1Yr'].astype(int)

In [33]:
cleaned_df.dtypes

DGN          int32
PRE4       float64
PRE5       float64
PRE6         int32
PRE7         int32
PRE8         int32
PRE9         int32
PRE10        int32
PRE11        int32
PRE14        int32
PRE17        int32
PRE19        int32
PRE25        int32
PRE30        int32
PRE32        int32
AGE          int64
Risk1Yr      int32
dtype: object

# Modeling on Risk1Yr

In [34]:
X = cleaned_df.drop('Risk1Yr', axis=1)
y = cleaned_df['Risk1Yr']
print(X.shape, y.shape)

(470, 16) (470,)


In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [36]:
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [37]:
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

In [38]:
y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)

In [39]:
model = Sequential()
model.add(Dense(units=100, activation='relu', input_dim=16))
model.add(Dense(units=100, activation='relu'))
model.add(Dense(units=2, activation='softmax'))

In [40]:
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [41]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 100)               1700      
_________________________________________________________________
dense_4 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_5 (Dense)              (None, 2)                 202       
Total params: 12,002
Trainable params: 12,002
Non-trainable params: 0
_________________________________________________________________


In [42]:
model.fit(
    X_train_scaled,
    y_train_categorical,
    epochs=60,
    shuffle=True,
    verbose=2
)

Epoch 1/60
11/11 - 1s - loss: 0.4920 - accuracy: 0.8153
Epoch 2/60
11/11 - 0s - loss: 0.4247 - accuracy: 0.8523
Epoch 3/60
11/11 - 0s - loss: 0.4080 - accuracy: 0.8523
Epoch 4/60
11/11 - 0s - loss: 0.4086 - accuracy: 0.8523
Epoch 5/60
11/11 - 0s - loss: 0.4007 - accuracy: 0.8523
Epoch 6/60
11/11 - 0s - loss: 0.3989 - accuracy: 0.8523
Epoch 7/60
11/11 - 0s - loss: 0.3941 - accuracy: 0.8523
Epoch 8/60
11/11 - 0s - loss: 0.3927 - accuracy: 0.8523
Epoch 9/60
11/11 - 0s - loss: 0.3853 - accuracy: 0.8523
Epoch 10/60
11/11 - 0s - loss: 0.3817 - accuracy: 0.8523
Epoch 11/60
11/11 - 0s - loss: 0.3797 - accuracy: 0.8523
Epoch 12/60
11/11 - 0s - loss: 0.3772 - accuracy: 0.8523
Epoch 13/60
11/11 - 0s - loss: 0.3800 - accuracy: 0.8523
Epoch 14/60
11/11 - 0s - loss: 0.3724 - accuracy: 0.8523
Epoch 15/60
11/11 - 0s - loss: 0.3638 - accuracy: 0.8523
Epoch 16/60
11/11 - 0s - loss: 0.3647 - accuracy: 0.8494
Epoch 17/60
11/11 - 0s - loss: 0.3665 - accuracy: 0.8523
Epoch 18/60
11/11 - 0s - loss: 0.3622 - 

<tensorflow.python.keras.callbacks.History at 0x14a9bec7c40>

In [43]:
model_loss, model_accuracy = model.evaluate(
    X_test_scaled, y_test_categorical, verbose=2)
print(
    f"Normal Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

4/4 - 0s - loss: 0.5128 - accuracy: 0.8390
Normal Neural Network - Loss: 0.5127676725387573, Accuracy: 0.8389830589294434


In [44]:
encoded_predictions = model.predict_classes(X_test_scaled[:5])
prediction_labels = label_encoder.inverse_transform(encoded_predictions)



In [45]:
print(f"Predicted classes: {prediction_labels}")
print(f"Actual Labels: {list(y_test[:5])}")

Predicted classes: [1 1 1 1 1]
Actual Labels: [1, 1, 1, 1, 1]


In [46]:
from sklearn.svm import SVC 

model = SVC(kernel='linear')

In [47]:
SVC(kernel='linear')

SVC(kernel='linear')

In [48]:
from sklearn.model_selection import GridSearchCV

param_grid = {'C': [1, 5, 10, 50],
              'gamma': [0.0001, 0.0005, 0.001, 0.005]}
grid = GridSearchCV(model, param_grid, verbose=3)

In [49]:
grid.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV 1/5] END ..............................C=1, gamma=0.0001; total time=   0.0s
[CV 2/5] END ..............................C=1, gamma=0.0001; total time=   0.0s
[CV 3/5] END ..............................C=1, gamma=0.0001; total time=   0.0s
[CV 4/5] END ..............................C=1, gamma=0.0001; total time=   0.0s
[CV 5/5] END ..............................C=1, gamma=0.0001; total time=   0.0s
[CV 1/5] END ..............................C=1, gamma=0.0005; total time=   0.0s
[CV 2/5] END ..............................C=1, gamma=0.0005; total time=   0.0s
[CV 3/5] END ..............................C=1, gamma=0.0005; total time=   0.0s
[CV 4/5] END ..............................C=1, gamma=0.0005; total time=   0.0s
[CV 5/5] END ..............................C=1, gamma=0.0005; total time=   0.0s
[CV 1/5] END ...............................C=1, gamma=0.001; total time=   0.0s
[CV 2/5] END ...............................C=1,

GridSearchCV(estimator=SVC(kernel='linear'),
             param_grid={'C': [1, 5, 10, 50],
                         'gamma': [0.0001, 0.0005, 0.001, 0.005]},
             verbose=3)

In [50]:
GridSearchCV(estimator=SVC(kernel='linear'),
             param_grid={'C': [1, 5, 10], 'gamma': [0.0001, 0.001, 0.01]},
             verbose=2)

GridSearchCV(estimator=SVC(kernel='linear'),
             param_grid={'C': [1, 5, 10], 'gamma': [0.0001, 0.001, 0.01]},
             verbose=2)

In [51]:
print(grid.best_params_)
print(grid.best_score_)

{'C': 1, 'gamma': 0.0001}
0.8523138832997988


In [52]:
# Make predictions with the hypertuned model
predictions = grid.predict(X_test_scaled)
print(predictions)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1]


In [53]:
from sklearn.metrics import classification_report
from sklearn import metrics
from sklearn.metrics import f1_score
metrics.f1_score(y_test, predictions, average='weighted', labels=np.unique(predictions))

0.9174311926605505

In [54]:
print(classification_report(y_test, predictions,
                            target_names=["T", "F"]))

              precision    recall  f1-score   support

           T       0.00      0.00      0.00        18
           F       0.85      1.00      0.92       100

    accuracy                           0.85       118
   macro avg       0.42      0.50      0.46       118
weighted avg       0.72      0.85      0.78       118



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Pete Visualization

In [55]:
pete_df = df.drop(columns = ["DGN", "PRE6", "PRE7", 'PRE8', 'PRE9', 'PRE10', 'PRE11', 'PRE17', 'PRE19',
             'PRE25', 'PRE30', 'PRE32'])

In [56]:
cleaned_df = pete_df

In [57]:
cleaned_df['PRE14'] = np.where(cleaned_df['PRE14'] == 'OC11', '11', cleaned_df['PRE14'] )
cleaned_df['PRE14'] = np.where(cleaned_df['PRE14'] == 'OC12', '12', cleaned_df['PRE14'] )
cleaned_df['PRE14'] = np.where(cleaned_df['PRE14'] == 'OC13', '13', cleaned_df['PRE14'] )
cleaned_df['PRE14'] = np.where(cleaned_df['PRE14'] == 'OC14', '14', cleaned_df['PRE14'] )
cleaned_df['Risk1Yr'] = np.where(cleaned_df['Risk1Yr'] == 'T', '0', cleaned_df['Risk1Yr'] )
cleaned_df['Risk1Yr'] = np.where(cleaned_df['Risk1Yr'] == 'F', '1', cleaned_df['Risk1Yr'] )
cleaned_df.head()

Unnamed: 0,PRE4,PRE5,PRE14,AGE,Risk1Yr
0,2.88,2.16,14,60,1
1,3.4,1.88,12,51,1
2,2.76,2.08,11,59,1
3,3.68,3.04,11,54,1
4,2.44,0.96,11,73,0


In [58]:
cleaned_df['PRE14'] = cleaned_df['PRE14'].astype(int)

In [59]:
survived_df = cleaned_df[cleaned_df["Risk1Yr"] == '1']
survived_df.rename(columns={"PRE4": "FVC", "PRE5": "FEV",
                  "PRE14": "TumorSize", "AGE": "Age"}, inplace=True)
survived_df = survived_df.drop("Risk1Yr", axis=1)
survived_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


Unnamed: 0,FVC,FEV,TumorSize,Age
0,2.88,2.16,14,60
1,3.4,1.88,12,51
2,2.76,2.08,11,59
3,3.68,3.04,11,54
5,2.48,1.88,11,51


In [63]:
print("Survivors")
survived_df.describe(include='all').loc[['mean']]

survivors


Unnamed: 0,FVC,FEV,TumorSize,Age
mean,3.298525,4.779375,11.685,62.3925


In [61]:
notSurvived_df = cleaned_df[cleaned_df["Risk1Yr"] == '0']
notSurvived_df.rename(columns={"PRE4": "FVC", "PRE5": "FEV",
                  "PRE14": "TumorSize", "AGE": "Age"}, inplace=True)
notSurvived_df = notSurvived_df.drop("Risk1Yr", axis=1)
notSurvived_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


Unnamed: 0,FVC,FEV,TumorSize,Age
4,2.44,0.96,11,73
6,4.36,3.28,12,59
7,3.19,2.5,11,66
13,3.98,3.06,14,80
24,4.32,3.2,11,58


In [64]:
print("Non-Survivors")
notSurvived_df.describe(include='all').loc[['mean']]

Non-Survivors


Unnamed: 0,FVC,FEV,TumorSize,Age
mean,3.185143,3.364857,12.028571,63.342857
