In [104]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import f1_score, precision_recall_curve, precision_score, recall_score, accuracy_score, confusion_matrix


In [105]:
# Label Encoders to convert each qualitative variable into a quantitave variable

le = LabelEncoder()
le_gender = LabelEncoder()
le_rvsp = LabelEncoder()
le_rv_function = LabelEncoder()
le_size = LabelEncoder()
le_intervention = LabelEncoder()
le_sloe = LabelEncoder()

In [106]:
# Establishing feature columns and associated headers

included_cols = [3, 4, 5, 6, 7, 8, 9, 10, 19,
    20, 21, 22, 23, 25, 26, 27, 28, 29, 30]

cols = ['Sex (M/F)',
                'BMI',
                'DM (1/0)',
                'HTN (1/0)',
                'COPD (1/0)',
                'CTEPH (1/0)',
                'ESRD (1/0)',
                'Hx of Malignancy (1/0)',
                'Original EDA  (cm2)',
                'Original ESA (cm2)',
                'Original FAC (%)',
                'Original EndoGLS (%)',
                'Size/Location of Embolus',
                'RVSP',
                'RV Size',
                'RV Function',
                'McConnell\'s Sign',
                'TR Velocity',
                'Intervention']

# included_cols = [ 19, 20, 21, 22, 25, 26, 27, 29]

# cols = ['Original EDA  (cm2)',
#         'Original ESA (cm2)',
#         'Original FAC (%)',
#         'Original EndoGLS (%)',
#         'RVSP',
#         'RV Size',
#         'RV Function',
#         'TR Velocity']

In [107]:
# Function to seperate features and labels. The label_column defines the expected output in terms of suprevised learning. 

def seperate_features_and_labels(file, label_column):
    features = []
    labels = []
    for row in file:
        filt_row = list(row[i] for i in included_cols)
        features.append(filt_row)
        labels.append(row[label_column])

    labels_encoded = le.fit_transform(labels)

    # print(len(cols))
    df_features = pd.DataFrame(features, columns=cols)
    df_features['Sex (M/F)'] = le_gender.fit_transform(df_features['Sex (M/F)'])
    df_features['RVSP'] = le_rvsp.fit_transform(df_features['RVSP'])
    df_features['RV Size'] = le_size.fit_transform(df_features['RV Size'])
    df_features['RV Function'] = le_rv_function.fit_transform(df_features['RV Function'])
    df_features['Intervention'] = le_intervention.fit_transform(df_features['Intervention'])
    df_features['Size/Location of Embolus'] = le_sloe.fit_transform(df_features['Size/Location of Embolus'])

    df_features.to_csv(r'df_features.csv')

    cleaned_features = scaler.fit_transform(clean_and_mean((df_features)))
    return(cleaned_features, labels_encoded)

In [108]:
# Establishing Random Forest Classifier and MinMaxScalar to normalize all features

# Function to remove 'N/A' From feature columns and replace with mean

def clean_and_mean(lists):
    lists = lists.replace('N/A', np.NaN)
    num_lists = lists.apply(pd.to_numeric, errors ='coerce')
    cleaned_lists = num_lists.apply(lambda x: x.fillna(x.mean()))
    return(np.array(cleaned_lists))

scaler = MinMaxScaler(feature_range=(0, 1))

In [109]:
# f, l = seperate_features_and_labels(df.values, 42)
f, l = seperate_features_and_labels(df.values, 16)
d_f = pd.DataFrame(data=f, columns=cols)
d_f.head(20)

Unnamed: 0,Sex (M/F),BMI,DM (1/0),HTN (1/0),COPD (1/0),CTEPH (1/0),ESRD (1/0),Hx of Malignancy (1/0),Original EDA (cm2),Original ESA (cm2),Original FAC (%),Original EndoGLS (%),Size/Location of Embolus,RVSP,RV Size,RV Function,McConnell's Sign,TR Velocity,Intervention
0,0.0,0.361929,0.0,1.0,0.0,1.0,0.0,0.0,0.280124,0.000779,0.390293,0.543733,0.5,0.681818,0.333333,0.75,0.0,0.819284,0.5
1,0.0,0.119293,0.0,1.0,1.0,1.0,0.0,1.0,0.238176,0.000808,0.222667,0.702885,0.5,0.090909,0.666667,0.0,0.0,0.412948,0.5
2,0.0,0.142366,1.0,1.0,0.0,0.0,0.0,0.0,0.190315,0.000443,0.671754,0.719567,0.0,0.454545,0.0,0.25,0.0,0.778237,0.5
3,0.0,0.243004,1.0,1.0,0.0,0.0,0.0,0.0,0.455236,0.00118,0.299725,0.639315,0.5,0.227273,0.333333,0.5,0.0,0.566391,0.5
4,0.0,0.16998,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.001354,0.955467,0.182597,0.5,0.727273,1.0,1.0,0.0,0.571157,0.5
5,1.0,0.100638,0.0,0.0,0.0,0.0,1.0,0.0,0.016329,0.000132,0.74631,0.343553,0.5,0.742424,0.0,1.0,0.0,0.571157,0.5
6,1.0,0.16863,1.0,0.0,0.0,0.0,0.0,0.0,0.798986,0.002033,0.140105,0.809288,0.5,0.272727,1.0,1.0,0.0,0.571157,0.0
7,1.0,0.103093,0.0,0.0,0.0,0.0,0.0,0.0,0.221847,0.000915,0.0,1.0,0.0,0.757576,0.666667,0.5,0.0,0.579339,0.5
8,1.0,0.09082,1.0,1.0,0.0,0.0,0.0,1.0,0.262387,0.000787,0.328496,0.675383,0.5,0.409091,0.0,0.25,0.0,0.787328,0.5
9,1.0,0.008959,0.0,1.0,0.0,0.0,0.0,1.0,0.274493,0.000692,0.502627,0.417944,0.5,0.772727,1.0,1.0,0.0,0.245455,0.5


In [110]:
# Establishing Random Forest Classifier and MinMaxScalar to normalize all features

model = RandomForestClassifier(n_estimators=100, 
                               bootstrap = True,
                               max_features = 'sqrt')
scaler = MinMaxScaler(feature_range=(0, 1))

In [111]:
train, test, train_labels, test_labels = train_test_split(f, l, stratify = l, test_size = 0.3, random_state = 50)
model.fit(train, train_labels)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='sqrt', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [112]:
y_preds = model.predict(test)

prec_temp = precision_score(test_labels, y_preds)
recall_temp = recall_score(test_labels, y_preds)

f2 = 5 * ((prec_temp * recall_temp) / (4 * prec_temp + recall_temp))

print("F1 Score: {}".format(f1_score(test_labels, y_preds)))
print("F2 Score: {}".format(f2))
print("Prec Score: {}".format(prec_temp))
print("Recall Score: {}".format(recall_temp))
print("Accuracy Score: {}".format(accuracy_score(test_labels, y_preds)))

tn, fp, fn, tp = confusion_matrix(test_labels, y_preds).ravel()
print("true negative: {}".format(tn))
print("false positive: {}".format(fp))
print("false negative: {}".format(fn))
print("true positive: {}".format(tp))

F1 Score: 0.6666666666666666
F2 Score: 0.5555555555555556
Prec Score: 1.0
Recall Score: 0.5
Accuracy Score: 0.9565217391304348
true negative: 21
false positive: 0
false negative: 1
true positive: 1


In [113]:
# explainerModel = shap.TreeExplainer(model, data = pd.DataFrame(test))
# shap_values_model = explainerModel.shap_values(pd.DataFrame(test))

In [114]:
# load JS visualization code to notebook
# shap.initjs()

In [115]:
# test_df = pd.DataFrame(test)
# test_df.head()

In [116]:
# test_df.columns = cols

In [117]:
# shap.force_plot(explainerModel.expected_value, shap_values_model[10,:], test_df.iloc[10,:])

In [118]:
# shaps_df = pd.DataFrame(shap_values_model)
# shaps_df.columns = cols
# shaps_df

In [120]:
# Extract feature importances
fi = pd.DataFrame({'feature': list(cols),
                   'importance': model.feature_importances_}).\
                    sort_values('importance', ascending = False)

# Display
fi.head(20)

Unnamed: 0,feature,importance
11,Original EndoGLS (%),0.217988
10,Original FAC (%),0.14692
13,RVSP,0.13344
8,Original EDA (cm2),0.095968
9,Original ESA (cm2),0.076574
17,TR Velocity,0.053558
1,BMI,0.052455
14,RV Size,0.034935
7,Hx of Malignancy (1/0),0.030915
18,Intervention,0.029173
