In [1]:
# import dependencies
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical

In [2]:
# set seed value for the notebook so the results are reproducible
from numpy.random import seed
seed(1)

In [3]:
# read the csv file into a pandas DataFrame, drop id column
df = pd.read_csv("Resources/Data/ThoracicSurgery.csv")
df = df.drop("id", axis=1)

In [4]:
# set data for classifier 
target = df["Risk1Yr"]
target_names = ["T", "F"]
data = df.drop("Risk1Yr", axis=1)
data = pd.get_dummies(data)
feature_names = data.columns

In [5]:
# split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=0.2, random_state=1)

In [6]:
# create and score random forest classifier
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.8617021276595744

In [7]:
# sort the features by their importance
sorted(zip(rf.feature_importances_, feature_names), reverse=True) 

[(0.18112493347465042, 'PRE5'),
 (0.17134144561789622, 'PRE4'),
 (0.159356351562549, 'AGE'),
 (0.03002138278667311, 'DGN_DGN5'),
 (0.027762292765678532, 'PRE14_OC11'),
 (0.0267437945513764, 'DGN_DGN3'),
 (0.025202968373976894, 'PRE14_OC12'),
 (0.023257866712346614, 'PRE14_OC14'),
 (0.02244120352880268, 'DGN_DGN2'),
 (0.02137556421800881, 'PRE11_F'),
 (0.02055635109184197, 'PRE11_T'),
 (0.02047555773552332, 'PRE6_PRZ1'),
 (0.019714743869018902, 'PRE8_T'),
 (0.01939140105620525, 'PRE17_T'),
 (0.019262892924332534, 'PRE8_F'),
 (0.01775123542926093, 'PRE14_OC13'),
 (0.016005591991755153, 'PRE17_F'),
 (0.01587028845481526, 'PRE10_F'),
 (0.015693540981493087, 'PRE7_T'),
 (0.01523623043631555, 'PRE9_T'),
 (0.015185140501332177, 'PRE10_T'),
 (0.015088366391897005, 'PRE6_PRZ0'),
 (0.013996162206847072, 'PRE7_F'),
 (0.013863661637729941, 'PRE30_F'),
 (0.013675864997954838, 'PRE30_T'),
 (0.013226252982040509, 'PRE9_F'),
 (0.013160973805714652, 'DGN_DGN4'),
 (0.010295785950856968, 'DGN_DGN8'),
 (0

In [8]:
# establish X and y
X = data
y = target

In [9]:
# create train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1, stratify=y)

In [10]:
# scale data
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# label-encode target data set
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

# convert encoded labels to one-hot-encoding
y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)

In [11]:
# create model and add layers
model = Sequential()
model.add(Dense(units=100, activation='relu', input_dim=37))
model.add(Dense(units=100, activation='relu'))
model.add(Dense(units=2, activation='softmax'))

In [12]:
# compile the model
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 100)               3800      
_________________________________________________________________
dense_1 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 202       
Total params: 14,102
Trainable params: 14,102
Non-trainable params: 0
_________________________________________________________________


In [13]:
# fit (train) the model
model.fit(
    X_train_scaled,
    y_train_categorical,
    epochs=60,
    shuffle=True,
    verbose=2
)

Train on 376 samples
Epoch 1/60
376/376 - 1s - loss: 0.4867 - accuracy: 0.7926
Epoch 2/60
376/376 - 0s - loss: 0.4326 - accuracy: 0.8511
Epoch 3/60
376/376 - 0s - loss: 0.4154 - accuracy: 0.8511
Epoch 4/60
376/376 - 0s - loss: 0.4021 - accuracy: 0.8511
Epoch 5/60
376/376 - 0s - loss: 0.3908 - accuracy: 0.8511
Epoch 6/60
376/376 - 0s - loss: 0.3832 - accuracy: 0.8511
Epoch 7/60
376/376 - 0s - loss: 0.3730 - accuracy: 0.8511
Epoch 8/60
376/376 - 0s - loss: 0.3737 - accuracy: 0.8511
Epoch 9/60
376/376 - 0s - loss: 0.3654 - accuracy: 0.8511
Epoch 10/60
376/376 - 0s - loss: 0.3537 - accuracy: 0.8564
Epoch 11/60
376/376 - 0s - loss: 0.3458 - accuracy: 0.8564
Epoch 12/60
376/376 - 0s - loss: 0.3424 - accuracy: 0.8590
Epoch 13/60
376/376 - 0s - loss: 0.3353 - accuracy: 0.8750
Epoch 14/60
376/376 - 0s - loss: 0.3273 - accuracy: 0.8803
Epoch 15/60
376/376 - 0s - loss: 0.3208 - accuracy: 0.8723
Epoch 16/60
376/376 - 0s - loss: 0.3114 - accuracy: 0.8910
Epoch 17/60
376/376 - 0s - loss: 0.3028 - ac

<tensorflow.python.keras.callbacks.History at 0x7fb6231e1518>

In [14]:
# evaluate the model using the testing data
model_loss, model_accuracy = model.evaluate(
    X_test_scaled, y_test_categorical, verbose=2)
print(
    f"Normal Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

94/1 - 0s - loss: 0.6239 - accuracy: 0.8191
Normal Neural Network - Loss: 0.6432622975491463, Accuracy: 0.8191489577293396


In [15]:
#  make predictions
encoded_predictions = model.predict_classes(X_test_scaled[:5])
prediction_labels = label_encoder.inverse_transform(encoded_predictions)
print(f"Predicted classes: {prediction_labels}")
print(f"Actual Labels: {list(y_test[:5])}")

Predicted classes: ['F' 'F' 'T' 'F' 'F']
Actual Labels: ['F', 'F', 'F', 'F', 'F']


In [16]:
# create the SVC Model
from sklearn.svm import SVC 

model = SVC(kernel='linear')

In [17]:
SVC(kernel='linear')

SVC(kernel='linear')

In [18]:
# create the GridSearch estimator along with a parameter object containing the values to adjust
from sklearn.model_selection import GridSearchCV

param_grid = {'C': [1, 5, 10, 50],
              'gamma': [0.0001, 0.0005, 0.001, 0.005]}
grid = GridSearchCV(model, param_grid, verbose=3)

In [19]:
# train the model with GridSearch
grid.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV 1/5] END ..............................C=1, gamma=0.0001; total time=   0.0s
[CV 2/5] END ..............................C=1, gamma=0.0001; total time=   0.0s
[CV 3/5] END ..............................C=1, gamma=0.0001; total time=   0.0s
[CV 4/5] END ..............................C=1, gamma=0.0001; total time=   0.0s
[CV 5/5] END ..............................C=1, gamma=0.0001; total time=   0.0s
[CV 1/5] END ..............................C=1, gamma=0.0005; total time=   0.0s
[CV 2/5] END ..............................C=1, gamma=0.0005; total time=   0.0s
[CV 3/5] END ..............................C=1, gamma=0.0005; total time=   0.0s
[CV 4/5] END ..............................C=1, gamma=0.0005; total time=   0.0s
[CV 5/5] END ..............................C=1, gamma=0.0005; total time=   0.0s
[CV 1/5] END ...............................C=1, gamma=0.001; total time=   0.0s
[CV 2/5] END ...............................C=1,

GridSearchCV(estimator=SVC(kernel='linear'),
             param_grid={'C': [1, 5, 10, 50],
                         'gamma': [0.0001, 0.0005, 0.001, 0.005]},
             verbose=3)

In [20]:
 # fit the model using the grid search estimator 
GridSearchCV(estimator=SVC(kernel='linear'),
             param_grid={'C': [1, 5, 10], 'gamma': [0.0001, 0.001, 0.01]},
             verbose=2)
 # list the best parameters, score for this dataset
print(grid.best_params_)
print(grid.best_score_)

{'C': 5, 'gamma': 0.0001}
0.8510877192982456


In [21]:
# make predictions with hypertuned model
predictions = grid.predict(X_test_scaled)
print(predictions)

['F' 'F' 'T' 'F' 'F' 'F' 'T' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F'
 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'T' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F'
 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F'
 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'T' 'T' 'F' 'F' 'F' 'F' 'F' 'F' 'F'
 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'T' 'F' 'F' 'F' 'F'
 'F' 'F' 'F' 'F']


In [22]:
# list the best score
print('Test Acc: %.3f' % grid.score(X_test_scaled, y_test))

Test Acc: 0.830


In [23]:
# f1 score
from sklearn.metrics import classification_report
from sklearn import metrics
from sklearn.metrics import f1_score
metrics.f1_score(y_test, predictions, average='weighted', labels=np.unique(predictions))

0.7997973657548125

In [24]:
# calculate classification report
print(classification_report(y_test, predictions,
                            target_names=["T", "F"]))

              precision    recall  f1-score   support

           T       0.86      0.95      0.90        80
           F       0.33      0.14      0.20        14

    accuracy                           0.83        94
   macro avg       0.60      0.55      0.55        94
weighted avg       0.78      0.83      0.80        94



# Data Attributes for use in JavaScript

In [25]:
new_df = df[['DGN', 'PRE4', 'PRE5', 'PRE6', 'PRE7', 'PRE8', 'PRE9', 'PRE10', 'PRE11', 'PRE14', 'PRE17', 'PRE19', 'PRE25', 'PRE30', 'PRE32', 'AGE', 'Risk1Yr']]
new_df.head()

Unnamed: 0,DGN,PRE4,PRE5,PRE6,PRE7,PRE8,PRE9,PRE10,PRE11,PRE14,PRE17,PRE19,PRE25,PRE30,PRE32,AGE,Risk1Yr
0,DGN2,2.88,2.16,PRZ1,F,F,F,T,T,OC14,F,F,F,T,F,60,F
1,DGN3,3.4,1.88,PRZ0,F,F,F,F,F,OC12,F,F,F,T,F,51,F
2,DGN3,2.76,2.08,PRZ1,F,F,F,T,F,OC11,F,F,F,T,F,59,F
3,DGN3,3.68,3.04,PRZ0,F,F,F,F,F,OC11,F,F,F,F,F,54,F
4,DGN3,2.44,0.96,PRZ2,F,T,F,T,T,OC11,F,F,F,T,F,73,T


In [26]:
columns = ["DGN", "PRE4", "PRE5", "PRE6", "PRE7", "PRE8", "PRE9", "PRE10", "PRE11", "PRE14", "PRE17", "PRE19", "PRE25", "PRE30", "PRE32", "AGE", "Risk1Yr"]
lived_df = new_df.loc[new_df["Risk1Yr"] == "F", columns ]

In [27]:
# counts
pre7_df = lived_df["PRE7"].value_counts()
pre8_df = lived_df["PRE8"].value_counts()
pre9_df = lived_df["PRE9"].value_counts()
pre10_df = lived_df["PRE10"].value_counts()
pre11_df = lived_df["PRE11"].value_counts()
pre14_df = lived_df["PRE14"].value_counts()
pre17_df = lived_df["PRE17"].value_counts()
pre19_df = lived_df["PRE19"].value_counts()
pre25_df = lived_df["PRE25"].value_counts()
pre30_df = lived_df["PRE30"].value_counts()
pre32_df = lived_df["PRE32"].value_counts()

print(pre7_df)
print(f"-------------------------")
print(pre8_df)
print(f"-------------------------")
print(pre9_df)
print(f"-------------------------")
print(pre10_df)
print(f"-------------------------")
print(pre11_df)
print(f"-------------------------")
print(pre14_df)
print(f"-------------------------")
print(pre17_df)
print(f"-------------------------")
print(pre19_df)
print(f"-------------------------")
print(pre25_df)
print(f"-------------------------")
print(pre30_df)
print(f"-------------------------")
print(pre32_df)

F    376
T     24
Name: PRE7, dtype: int64
-------------------------
F    346
T     54
Name: PRE8, dtype: int64
-------------------------
F    378
T     22
Name: PRE9, dtype: int64
-------------------------
T    268
F    132
Name: PRE10, dtype: int64
-------------------------
F    339
T     61
Name: PRE11, dtype: int64
-------------------------
OC12    218
OC11    159
OC13     13
OC14     10
Name: PRE14, dtype: int64
-------------------------
F    375
T     25
Name: PRE17, dtype: int64
-------------------------
F    398
T      2
Name: PRE19, dtype: int64
-------------------------
F    394
T      6
Name: PRE25, dtype: int64
-------------------------
T    323
F     77
Name: PRE30, dtype: int64
-------------------------
F    398
T      2
Name: PRE32, dtype: int64


In [28]:
DGN_df = lived_df["DGN"].value_counts()
DGN_df

DGN3    306
DGN4     40
DGN2     40
DGN5      8
DGN6      4
DGN8      1
DGN1      1
Name: DGN, dtype: int64

# PRE, AGE, Risk1Yr Visualization

In [29]:
clean_df = df.drop(columns = ["DGN", "PRE6", "PRE7", 'PRE8', 'PRE9', 'PRE10', 'PRE11', 'PRE17', 'PRE19',
             'PRE25', 'PRE30', 'PRE32'])

In [30]:
clean_df['PRE14'] = np.where(clean_df['PRE14'] == 'OC11', '11', clean_df['PRE14'] )
clean_df['PRE14'] = np.where(clean_df['PRE14'] == 'OC12', '12', clean_df['PRE14'] )
clean_df['PRE14'] = np.where(clean_df['PRE14'] == 'OC13', '13', clean_df['PRE14'] )
clean_df['PRE14'] = np.where(clean_df['PRE14'] == 'OC14', '14', clean_df['PRE14'] )
clean_df['Risk1Yr'] = np.where(clean_df['Risk1Yr'] == 'T', '0', clean_df['Risk1Yr'] )
clean_df['Risk1Yr'] = np.where(clean_df['Risk1Yr'] == 'F', '1', clean_df['Risk1Yr'] )
clean_df.head()

Unnamed: 0,PRE4,PRE5,PRE14,AGE,Risk1Yr
0,2.88,2.16,14,60,1
1,3.4,1.88,12,51,1
2,2.76,2.08,11,59,1
3,3.68,3.04,11,54,1
4,2.44,0.96,11,73,0


In [31]:
clean_df['PRE14'] = clean_df['PRE14'].astype(int)

In [32]:
survived_df = clean_df[clean_df["Risk1Yr"] == '1']
survived_df.rename(columns={"PRE4": "FVC", "PRE5": "FEV",
                  "PRE14": "TumorSize", "AGE": "Age"}, inplace=True)
survived_df = survived_df.drop("Risk1Yr", axis=1)
survived_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,FVC,FEV,TumorSize,Age
0,2.88,2.16,14,60
1,3.4,1.88,12,51
2,2.76,2.08,11,59
3,3.68,3.04,11,54
5,2.48,1.88,11,51


In [33]:
print("Survivors")
survived_df.describe(include='all').loc[['mean']]

Survivors


Unnamed: 0,FVC,FEV,TumorSize,Age
mean,3.298525,4.779375,11.685,62.3925


In [34]:
notSurvived_df = clean_df[clean_df["Risk1Yr"] == '0']
notSurvived_df.rename(columns={"PRE4": "FVC", "PRE5": "FEV",
                  "PRE14": "TumorSize", "AGE": "Age"}, inplace=True)
notSurvived_df = notSurvived_df.drop("Risk1Yr", axis=1)
print("Non-Survivors")
notSurvived_df.describe(include='all').loc[['mean']]

Non-Survivors


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,FVC,FEV,TumorSize,Age
mean,3.185143,3.364857,12.028571,63.342857
