In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
data = pd.read_excel("PTA.xlsx")
pd.set_option("max_columns", 9999)
rand_state=5

# Data processing

In [2]:
# drop the 1 missing age row:

data['Age'] = data['Age'].dropna()

In [3]:
#convert continuous age variable to categorical:

for idx,row in data.iterrows():
    if row['Age'] < 18:
        data.loc[idx, 'Age_cat'] = "Teenager"
    elif row['Age'] >= 18 and row['Age'] < 30:
        data.loc[idx, 'Age_cat'] = "Young Adult"
    elif row['Age'] >= 30 and row['Age'] < 50:
        data.loc[idx, 'Age_cat'] = "Adult"
    elif row['Age'] >50:
        data.loc[idx, 'Age_cat'] = "50+"

def create_dummies(df,column_name):
    
    dummies = pd.get_dummies(df[column_name],prefix=column_name)
    df = pd.concat([df,dummies],axis=1)
    return df

data = create_dummies(data, 'Age_cat')

In [4]:
#convert gender to numerical:

for index, row in data.iterrows():
    if row['Gender'] == "M":
        data.loc[index,'Gender'] = 1
    else:
        data.loc[index,'Gender'] = 0

In [5]:
#drop NR (null) value from Duration of symptoms:

data = data [data['Duration Sxs (days)']!= "NR"] 

In [6]:
# deal with missing values for WBC count:

import numpy as np
for idx,row in data.iterrows():
    if row['WBC '] == 'Not Performed':
        data.loc[idx, 'WBC ']=np.nan
    elif row['WBC '] == 0:
        data.loc[idx, 'WBC ']=np.nan
        
data['WBC '] = data['WBC '].astype("float64")

In [7]:
# converting Previously Tx to categotical columnns:

for idx,row in data.iterrows():
    if row['Previously Tx'] == 1:
        data.loc[idx, 'Previously Tx'] = "Antibiotics Alone"
    elif row['Previously Tx'] == 2:
        data.loc[idx, 'Previously Tx'] = "Steroids Alone"
    elif row['Previously Tx'] == 3:
        data.loc[idx, 'Previously Tx'] = "Abx Steroids"
    elif row['Previously Tx'] == 4:
        data.loc[idx, 'Previously Tx'] = "Abx + Aspiration attempt"
    elif row['Previously Tx'] == 5:
        data.loc[idx, 'Previously Tx'] = "Pain Meds alone"
    elif row['Previously Tx'] == 0:
        data.loc[idx, 'Previously Tx'] = "no treatment"
        
data = create_dummies(data, 'Previously Tx').drop(columns='Previously Tx')

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1056 entries, 0 to 1056
Data columns (total 74 columns):
MRN                                       1056 non-null int64
Month                                     1055 non-null float64
Date                                      1055 non-null datetime64[ns]
Age                                       1055 non-null float64
Gender                                    1056 non-null int64
Duration Sxs (days)                       1055 non-null object
Fever                                     1049 non-null float64
Sore Throat                               1055 non-null float64
Worsening of Symptoms                     1051 non-null float64
Otalgia                                   1055 non-null float64
Trismus                                   1054 non-null float64
Cough                                     1048 non-null float64
Voice Change                              1054 non-null float64
Dysphagia                                 1055 non-null flo

In [9]:
#WBC was excluded because it is not very correlated and not always available pre-aspiration attempt

In [10]:
#convert to numeric, drop rows with missing data
data['Duration Sxs (days)'] = data['Duration Sxs (days)'].astype("float64")
data = data[(data['Tonsillectomy'] != "Previous") & (data['Tonsillectomy'] != "-")]
data['Tonsillectomy'] = data['Tonsillectomy'].astype('float64')
data = data[['Age','Gender', 'Duration Sxs (days)', 'Fever', 'Otalgia', 
           'Trismus','Cough', 'Dysphagia', 'Anorexia', 'Worsening of Symptoms', 'Age_cat_50+', 
            'Age_cat_Teenager', 'Age_cat_Adult', 'Age_cat_Young Adult', 'Pus', 'Tonsillectomy', 'Previously Tx_no treatment', 
             'Previously Tx_Steroids Alone', 'Previously Tx_Pain Meds alone', 'Previously Tx_Abx Steroids',
             'Previously Tx_Abx + Aspiration attempt', 'Previously Tx_6.0', 'Neck Pain']]

data = data.dropna()

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 892 entries, 0 to 1056
Data columns (total 23 columns):
Age                                       892 non-null float64
Gender                                    892 non-null int64
Duration Sxs (days)                       892 non-null float64
Fever                                     892 non-null float64
Otalgia                                   892 non-null float64
Trismus                                   892 non-null float64
Cough                                     892 non-null float64
Dysphagia                                 892 non-null float64
Anorexia                                  892 non-null float64
Worsening of Symptoms                     892 non-null float64
Age_cat_50+                               892 non-null uint8
Age_cat_Teenager                          892 non-null uint8
Age_cat_Adult                             892 non-null uint8
Age_cat_Young Adult                       892 non-null uint8
Pus                   

In [11]:
print('mean age = {}, ({}-{})'.format(np.mean(data['Age']), data['Age'].min(), data['Age'].max()))
print("Percentage male = {}".format(data['Gender'].sum()/data['Gender'].shape[0]))
print("")
print("percent successful aspiration: {}".format(data['Pus'].sum()/data.shape[0]))

mean age = 24.65493273542601, (1.2-88.0)
Percentage male = 0.5011210762331838

percent successful aspiration: 0.625560538116592


892 rows remain after dropping all with missing values

## Correlations

In [12]:
corr = data[['Age','Gender', 'Duration Sxs (days)', 'Fever', 'Otalgia', 
           'Trismus','Cough', 'Dysphagia', 'Anorexia', 'Worsening of Symptoms', 'Age_cat_50+', 
            'Age_cat_Teenager', 'Age_cat_Adult', 'Age_cat_Young Adult', 'Pus', 'Previously Tx_no treatment', 
             'Previously Tx_Steroids Alone', 'Previously Tx_Pain Meds alone', 'Previously Tx_Abx Steroids',
             'Previously Tx_Abx + Aspiration attempt', 'Previously Tx_6.0', 'Neck Pain']].corr()
corr['Pus'].sort_values(ascending=False)


Pus                                       1.000000
Trismus                                   0.402845
Worsening of Symptoms                     0.271783
Duration Sxs (days)                       0.118878
Otalgia                                   0.079844
Previously Tx_Steroids Alone              0.074299
Dysphagia                                 0.071082
Previously Tx_Abx Steroids                0.053464
Age_cat_Young Adult                       0.051062
Previously Tx_Abx + Aspiration attempt    0.046287
Gender                                    0.038797
Previously Tx_6.0                         0.036676
Anorexia                                  0.022655
Age                                       0.011300
Previously Tx_Pain Meds alone             0.006192
Age_cat_50+                              -0.002991
Age_cat_Adult                            -0.014064
Fever                                    -0.026715
Age_cat_Teenager                         -0.046934
Cough                          

In [13]:
pd.pivot_table(data, index=['Pus'],  values = ['Trismus', 'Worsening of Symptoms', 'Duration Sxs (days)', 
                                               'Otalgia', 'Dysphagia', 'Gender', 'Anorexia', 'Age', 'Fever',
                                               'Cough', 'Neck Pain', 'Previously Tx_no treatment' ])

Unnamed: 0_level_0,Age,Anorexia,Cough,Duration Sxs (days),Dysphagia,Fever,Gender,Neck Pain,Otalgia,Previously Tx_no treatment,Trismus,Worsening of Symptoms
Pus,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0.0,24.485629,0.464072,0.068862,5.907186,0.601796,0.452096,0.476048,0.257485,0.538922,0.55988,0.332335,0.58982
1.0,24.756272,0.487455,0.039427,7.137993,0.672043,0.424731,0.516129,0.179211,0.620072,0.460573,0.741935,0.835125


In [14]:
pd.pivot_table(data, index='Otalgia', values = 'Pus')
data['Pus'].value_counts()

1.0    558
0.0    334
Name: Pus, dtype: int64

Trismus looks much more predictive of successful aspiration than otalgia

# Models

### Random Forest


In [19]:
#dictionary for storing performance measures for comparison between models
accuracy_dict = {}

from sklearn.ensemble import RandomForestClassifier
# we include the 6 features most correlated with the presence of pus.
features = ['Duration Sxs (days)', 'Otalgia', 
           'Trismus','Worsening of Symptoms', 
            'Neck Pain', 'Previously Tx_no treatment']

In [20]:
# create training and holdout sets:

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize

X_train, X_test, y_train, y_test = train_test_split(data[features], data['Pus'], test_size=0.3, random_state=rand_state)
X_train = normalize(X_train)
X_test = normalize(X_test)

In [21]:
# Grid search for parameter optimization
# this may take a minute to run...

from sklearn.model_selection import GridSearchCV
rf = RandomForestClassifier()
grid = GridSearchCV(rf, {
            "n_estimators": [5, 10, 100],
            "criterion": ['gini'],
            "max_depth": [1, 3, 5, 7, 9, 15],
            "max_features": ["log2"],
            "min_samples_leaf": [1, 3, 5, 8, 10],
            "min_samples_split": [2, 3, 5, 8, 10],
            'random_state': [rand_state],
            'class_weight': ['balanced', 'balanced_subsample']
        }, cv = 10)
grid.fit(X_train, y_train)
best_rf = grid.best_estimator_
print(grid.best_params_)
print(grid.best_score_)

{'class_weight': 'balanced_subsample', 'criterion': 'gini', 'max_depth': 5, 'max_features': 'log2', 'min_samples_leaf': 10, 'min_samples_split': 2, 'n_estimators': 10, 'random_state': 5}
0.719551282051


#### Evaluating model performance

In [22]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import classification_report

rf_accuracy = []
rf_accuracy_train = []
rf_ppv = []
rf_npv = []
rf_sensitivity = []
rf_specificity = []

for i in range (300):

    X_train, X_test, y_train, y_test = train_test_split(data[features], data['Pus'], 
                                                        test_size=0.30, random_state=i)
    X_train = normalize(X_train)
    X_test = normalize(X_test)

    best_rf.fit(X_train, y_train)
    predictions = best_rf.predict(X_test)
    predictions_train = best_rf.predict(X_train)
#calculate sensitivity, specificity, ppv, npv
    precision, recall, f1_score, support = precision_recall_fscore_support(y_test, predictions)
    npv = precision[0]
    ppv = precision[1]
    specificity = recall[0]
    sensitivity = recall[1]
#append to lists 
    rf_accuracy.append(accuracy_score(y_test, predictions))
    rf_accuracy_train.append(accuracy_score(y_train, predictions_train))
    rf_ppv.append(ppv)
    rf_npv.append(npv)
    rf_sensitivity.append(sensitivity)
    rf_specificity.append(specificity)    

accuracy_dict['Random Forest'] = np.mean(rf_accuracy)
print("Accuracy: {}({}) \n Sensitivity: {}({}) \n Specificity: {} ({})".format(np.mean(rf_accuracy), 
                                                                               np.std(rf_accuracy), 
                                                                               np.mean(rf_sensitivity),
                                                                               np.std(rf_sensitivity),
                                                                               np.mean(rf_specificity),
                                                                              np.std(rf_specificity)))


Accuracy: 0.7061691542288557(0.02395118092798047) 
 Sensitivity: 0.7357076845030353(0.04099373414847039) 
 Specificity: 0.6578041380440068 (0.05058299407749282)


Accuracy on the test and training sets are similar, suggesting no overfitting.

### Logistic Regression

In [23]:
from sklearn.linear_model import LogisticRegression

X_train, X_test, y_train, y_test = train_test_split(data[features], data['Pus'], 
                                                    test_size=0.3, random_state=rand_state)
X_train = normalize(X_train)
X_test = normalize(X_test)

lr = LogisticRegression()
grid = GridSearchCV(lr, {"solver": ["newton-cg", "lbfgs", "liblinear"], 'class_weight': ['', 'balanced']}, cv=10)
grid.fit(X_train, y_train)
best_lr = grid.best_estimator_
print(grid.best_score_)
print(grid.best_params_)

0.711538461538
{'class_weight': 'balanced', 'solver': 'liblinear'}


In [24]:
# evaluate performance of LR
X_train, X_test, y_train, y_test = train_test_split(data[features], data['Pus'], 
                                                    test_size=0.3, random_state=rand_state)
X_train = normalize(X_train)
X_test = normalize(X_test)

best_lr.fit(X_train, y_train)
predictions = best_lr.predict(X_test)
predictions_train = best_lr.predict(X_train)

print(accuracy_score(y_test, predictions), "\n\n", classification_report(y_test, predictions))
print(roc_auc_score(y_test, predictions))

0.679104477612 

              precision    recall  f1-score   support

        0.0       0.58      0.70      0.64       107
        1.0       0.77      0.66      0.71       161

avg / total       0.69      0.68      0.68       268

0.682765426366


In [25]:
lr_accuracy = []
lr_accuracy_train = []
lr_ppv = []
lr_npv = []
lr_sensitivity = []
lr_specificity = []

for i in range (300):

    X_train, X_test, y_train, y_test = train_test_split(data[features], data['Pus'], 
                                                        test_size=0.30, random_state=i)
    X_train = normalize(X_train)
    X_test = normalize(X_test)

    best_lr.fit(X_train, y_train)
    predictions = best_lr.predict(X_test)
    predictions_train = best_lr.predict(X_train)
#calculate sensitivity, specificity, ppv, npv
    precision, recall, f1_score, support = precision_recall_fscore_support(y_test, predictions)
    npv = precision[0]
    ppv = precision[1]
    specificity = recall[0]
    sensitivity = recall[1]
#append to lists 
    lr_accuracy.append(accuracy_score(y_test, predictions))
    lr_accuracy_train.append(accuracy_score(y_train, predictions_train))
    lr_ppv.append(ppv)
    lr_npv.append(npv)
    lr_sensitivity.append(sensitivity)
    lr_specificity.append(specificity)    

accuracy_dict['Log regression'] = np.mean(lr_accuracy)
    
    
print("Accuracy: {}({}) \n Sensitivity: {}({}) \n Specificity: {} ({})".format(np.mean(lr_accuracy), 
                                                                               np.std(lr_accuracy), 
                                                                               np.mean(lr_sensitivity),
                                                                               np.std(lr_sensitivity),
                                                                               np.mean(lr_specificity),
                                                                              np.std(lr_specificity)))


Accuracy: 0.7125746268656716(0.02254786456915147) 
 Sensitivity: 0.7668510071227278(0.03928385333716861) 
 Specificity: 0.6229906638426396 (0.05560317476236639)


Logistic regression is less accurate than Random Forest

### K Nearest Neighbors

In [26]:
from sklearn.neighbors import KNeighborsClassifier

X_train, X_test, y_train, y_test = train_test_split(data[features], data['Pus'], 
                                                    test_size=0.3, random_state=rand_state)
X_train = normalize(X_train)
X_test = normalize(X_test)

knn = KNeighborsClassifier()
grid = GridSearchCV(knn, {
            'n_neighbors': range(1,20,2),
            'weights': ['distance', 'uniform'],
            'algorithm': ['ball_tree', 'kd_tree', 'brute'],
            'p': [1,2]
        }, cv=10)
grid.fit(X_train, y_train)
best_knn = grid.best_estimator_
print(grid.best_score_)
print(grid.best_params_)

0.735576923077
{'algorithm': 'brute', 'n_neighbors': 5, 'p': 2, 'weights': 'uniform'}


In [27]:
knn_accuracy = []
knn_accuracy_train = []
knn_ppv = []
knn_npv = []
knn_sensitivity = []
knn_specificity = []

for i in range (300):

    X_train, X_test, y_train, y_test = train_test_split(data[features], data['Pus'], 
                                                        test_size=0.30, random_state=i)
    X_train = normalize(X_train)
    X_test = normalize(X_test)

    best_knn.fit(X_train, y_train)
    predictions = best_knn.predict(X_test)
    predictions_train = best_knn.predict(X_train)
#calculate sensitivity, specificity, ppv, npv
    precision, recall, f1_score, support = precision_recall_fscore_support(y_test, predictions)
    npv = precision[0]
    ppv = precision[1]
    specificity = recall[0]
    sensitivity = recall[1]
#append to lists 
    knn_accuracy.append(accuracy_score(y_test, predictions))
    knn_accuracy_train.append(accuracy_score(y_train, predictions_train))
    knn_ppv.append(ppv)
    knn_npv.append(npv)
    knn_sensitivity.append(sensitivity)
    knn_specificity.append(specificity)    

accuracy_dict['KNN'] = np.mean(knn_accuracy)    
    
print("Accuracy: {}({}) \n Sensitivity: {}({}) \n Specificity: {} ({})".format(np.mean(knn_accuracy), 
                                                                               np.std(knn_accuracy), 
                                                                               np.mean(knn_sensitivity),
                                                                               np.std(knn_sensitivity),
                                                                               np.mean(knn_specificity),
                                                                              np.std(knn_specificity)))


Accuracy: 0.6999129353233831(0.022548385993197938) 
 Sensitivity: 0.8244571480287384(0.03534493819520581) 
 Specificity: 0.49646696069718377 (0.05468574954254248)


In [28]:
# evaluate knn performance:
X_train, X_test, y_train, y_test = train_test_split(data[features], data['Pus'], 
                                                    test_size=0.3, random_state=rand_state)
X_train = normalize(X_train)
X_test = normalize(X_test)

best_knn.fit(X_train, y_train)
predictions = best_knn.predict(X_test)
predictions_train = best_knn.predict(X_train)
    
print(accuracy_score(y_test, predictions) , "\n\n", classification_report(y_test, predictions))
print(roc_auc_score(y_test, predictions))

0.708955223881 

              precision    recall  f1-score   support

        0.0       0.67      0.52      0.59       107
        1.0       0.72      0.83      0.77       161

avg / total       0.70      0.71      0.70       268

0.677831311314


### Neural Network

In [29]:
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

X_train, X_test, y_train, y_test = train_test_split(data[features], data['Pus'], 
                                                    test_size=0.3, random_state=rand_state)

scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
mlp = MLPClassifier(max_iter = 2000, random_state=rand_state)
mlp.fit(X_train, y_train)
predictions = mlp.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print(accuracy , "\n\n", classification_report(y_test, predictions))
print(roc_auc_score(y_test, predictions))


0.731343283582 

              precision    recall  f1-score   support

        0.0       0.72      0.54      0.62       107
        1.0       0.74      0.86      0.79       161

avg / total       0.73      0.73      0.72       268

0.699599465955


In [30]:
#evaluate optimal parameters:
X_train, X_test, y_train, y_test = train_test_split(data[features], data['Pus'], 
                                                    test_size=0.3, random_state=rand_state)
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
mlp = MLPClassifier()
grid = GridSearchCV(mlp, {
            'alpha': [10, 1, .01, .001],
            'max_iter': [2000],
            'hidden_layer_sizes': [[300, 150], [300, 100, 50]],
            'random_state':[rand_state]}, cv=10)
grid.fit(X_train, y_train)
best_mlp = grid.best_estimator_
print(grid.best_score_)
print(grid.best_params_)

0.748397435897
{'alpha': 1, 'hidden_layer_sizes': [300, 150], 'max_iter': 2000, 'random_state': 5}


In [31]:
#evaluate performance:
best_mlp.fit(X_train, y_train)

predictions = best_mlp.predict(X_test)
predictions_train = best_mlp.predict(X_train)

print(accuracy_score(y_test, predictions), classification_report(y_test, predictions))
print(roc_auc_score(y_test, predictions))

0.731343283582              precision    recall  f1-score   support

        0.0       0.75      0.50      0.60       107
        1.0       0.73      0.89      0.80       161

avg / total       0.73      0.73      0.72       268

0.691762930284


In [32]:
mlp_accuracy = []
mlp_accuracy_train = []
mlp_ppv = []
mlp_npv = []
mlp_sensitivity = []
mlp_specificity = []

for i in range (300):

    X_train, X_test, y_train, y_test = train_test_split(data[features], data['Pus'], 
                                                        test_size=0.30, random_state=i)

    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    mlp = MLPClassifier(max_iter=2000, alpha=1, hidden_layer_sizes=[300, 150, 100])
    best_mlp.fit(X_train, y_train)
    predictions = best_mlp.predict(X_test)
    predictions_train = best_mlp.predict(X_train)
#calculate sensitivity, specificity, ppv, npv
    precision, recall, f1_score, support = precision_recall_fscore_support(y_test, predictions)
    npv = precision[0]
    ppv = precision[1]
    specificity = recall[0]
    sensitivity = recall[1]
#append to lists 
    mlp_accuracy.append(accuracy_score(y_test, predictions))
    mlp_accuracy_train.append(accuracy_score(y_train, predictions_train))
    mlp_ppv.append(ppv)
    mlp_npv.append(npv)
    mlp_sensitivity.append(sensitivity)
    mlp_specificity.append(specificity)    

accuracy_dict['MLP'] = np.mean(mlp_accuracy)

print("Accuracy: {}({}) \n Sensitivity: {}({}) \n Specificity: {} ({})".format(np.mean(mlp_accuracy), 
                                                                               np.std(mlp_accuracy), 
                                                                               np.mean(mlp_sensitivity),
                                                                               np.std(mlp_sensitivity),
                                                                               np.mean(mlp_specificity),
                                                                              np.std(mlp_specificity)))


Accuracy: 0.7233457711442787(0.02197677429020845) 
 Sensitivity: 0.8654378549875923(0.03567849810642601) 
 Specificity: 0.4910720529688987 (0.06072565707223755)


Superior to Random Forest

## SVM

In [33]:
from sklearn.ensemble import GradientBoostingClassifier

gbc_accuracy = []
gbc_accuracy_train = []
gbc_ppv = []
gbc_npv = []
gbc_sensitivity = []
gbc_specificity = []

for i in range (300):

    X_train, X_test, y_train, y_test = train_test_split(data[features], data['Pus'], 
                                                        test_size=0.30, random_state=i)

    X_train = normalize(X_train)
    X_test = normalize(X_test)
    gbc = GradientBoostingClassifier()
    gbc.fit(X_train, y_train)
    predictions = gbc.predict(X_test)
    predictions_train = gbc.predict(X_train)
#calculate sensitivity, specificity, ppv, npv
    precision, recall, f1_score, support = precision_recall_fscore_support(y_test, predictions)
    npv = precision[0]
    ppv = precision[1]
    specificity = recall[0]
    sensitivity = recall[1]
#append to lists 
    gbc_accuracy.append(accuracy_score(y_test, predictions))
    gbc_accuracy_train.append(accuracy_score(y_train, predictions_train))
    gbc_ppv.append(ppv)
    gbc_npv.append(npv)
    gbc_sensitivity.append(sensitivity)
    gbc_specificity.append(specificity)    

print("Accuracy: {}({}) \n Sensitivity: {}({}) \n Specificity: {} ({})".format(np.mean(gbc_accuracy), 
                                                                               np.std(gbc_accuracy), 
                                                                               np.mean(gbc_sensitivity),
                                                                               np.std(gbc_sensitivity),
                                                                               np.mean(gbc_specificity),
                                                                              np.std(gbc_specificity)))
accuracy_dict['SVM'] = np.mean(gbc_accuracy)

Accuracy: 0.7026616915422886(0.021830268091242517) 
 Sensitivity: 0.8143741837055172(0.03579171051552516) 
 Specificity: 0.5205591240577001 (0.053286871529534305)


In [34]:
print(accuracy_dict)


{'Random Forest': 0.70616915422885573, 'Log regression': 0.71257462686567163, 'KNN': 0.69991293532338306, 'MLP': 0.72334577114427867, 'SVM': 0.70266169154228864}


For reference, ENT specialists are only 64% accurate, so this is better than human prediction

0.730769230769
{'C': 1, 'class_weight': 'balanced', 'shrinking': True}


This is leftover code from other explorations

In [None]:
from sklearn.svm import SVC

svc = SVC()
grid = GridSearchCV(svc, {
            'C': [100, 10, 1, .01, .001, .0001],
            'shrinking': [True, False],
            'class_weight':['balanced','']}, cv=10)
grid.fit(X_train, y_train)
best_svc = grid.best_estimator_
print(grid.best_score_)
print(grid.best_params_)

In [36]:


svc_accuracy = []
svc_accuracy_train = []
for i in range (100):

    X_train, X_test, y_train, y_test = train_test_split(data[features], data['Pus'], 
                                                        test_size=0.3, random_state=i)


    X_train = normalize(X_train)
    X_test = normalize(X_test)
    best_svc.fit(X_train, y_train)
    predictions = best_svc.predict(X_test)
    predictions_train = best_svc.predict(X_train)
    svc_accuracy.append(accuracy_score(y_test, predictions))
    svc_accuracy_train.append(accuracy_score(y_train, predictions_train))

print(np.mean(svc_accuracy), np.mean(svc_accuracy_train))
print(roc_auc_score(y_test, predictions))



0.717014925373 0.725625
0.652934407365


In [40]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

def model_selector(df, features):
    
    models = [{
        "name": 'K Neighbors Classifier', 
        'estimator':KNeighborsClassifier(),
        'hyperparameters': {
            'n_neighbors': range(1,20,2),
            'weights': ['distance', 'uniform'],
            'algorithm': ['ball_tree', 'kd_tree', 'brute'],
            'p': [1,2]
        }
    }, 
    {
        "name": 'Logistic Regression',
        'estimator':LogisticRegression(),
        'hyperparameters': {
            "solver": ["newton-cg", "lbfgs", "liblinear"], 
        }
        
    }, 
    {
        'name': "Random Forest Classifier",
        'estimator':RandomForestClassifier(), 
        'hyperparameters': {
            "n_estimators": [10, 50, 200],
            "criterion": ["entropy", "gini"],
            "max_depth": [2, 5, 10],
            "max_features": ["log2", "sqrt"],
            "min_samples_leaf": [1, 5, 8],
            "min_samples_split": [2, 3, 5]
        }
    }]
    for model in models:
        print(model['name'], "\n_____________\n")
        grid = GridSearchCV(model["estimator"], 
                            param_grid=model["hyperparameters"], 
                           cv = 10)
        grid.fit(X_train, y_train)
        model['best model'] = grid.best_estimator_
        model['best parameters'] = grid.best_params_
        model['best score'] = grid.best_score_
        print('best score:',grid.best_score_)
        print('best parameters:', grid.best_params_)
        print("\n\n")
    return models
best_models = model_selector(data, best_features)
        
    

NameError: name 'best_features' is not defined

In [None]:
from sklearn.metrics import classification_report
rf = RandomForestClassifier(max_depth=2, max_features='log2', min_samples_leaf=5, 
                            min_samples_split=2, n_estimators=200)
rf.fit(X_train, y_train)
predictions = rf.predict(X_test)
predictions_train = rf.predict(X_train)
print(classification_report(y_test, predictions))
print(accuracy_score(y_test, predictions), accuracy_score(y_train, predictions_train))

## Predicting Tonsillectomy

In [None]:
corr_tonsillectomy = data.corr()
(corr_tonsillectomy['Tonsillectomy']).sort_values(ascending=False)

In [None]:
best_tonsil_model = model_selector(data, best_features, 'Pus')


In [None]:
X_train, X_test, y_train, y_test = train_test_split(data[best_features], data['Tonsillectomy'], test_size=0.33, random_state=42)
X_train = normalize(X_train)
X_test = normalize(X_test)

rf = RandomForestClassifier(criterion='gini', max_depth=10, max_features='log2', min_samples_leaf=20, 
                            min_samples_split=2, n_estimators=4, class_weight='balanced')
rf.fit(X_train, y_train)
predictions = rf.predict(X_test)
predictions_train = rf.predict(X_train)
accuracy = accuracy_score(y_test, predictions)
accuracy_train = accuracy_score(y_train, predictions_train)
print(accuracy_train, accuracy)
predictions

In [None]:
print(classification_report(y_test, predictions))


In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
mlp = MLPClassifier(max_iter = 2000)
mlp.fit(X_train, y_train)
predictions = mlp.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
accuracy

In [None]:
pos_pus = data[data['Pus']==1]
neg_pus = data[data['Pus']==0]
samp = pos_pus.sample(n=neg_pus.shape[0])
subsampled_data = pd.concat([neg_pus, samp], axis=0)
subsampled_data