### Data Mining on Hospital Dataset

This notebook explores the Hospital dataset and builds several models to predict whether a patient is readmitted (the `readmitted` column).

### Imports

In [278]:
import time
from operator import itemgetter

import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import HTML
from IPython.display import Markdown, display
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import roc_curve
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import Imputer
from sklearn.tree import DecisionTreeClassifier, export_graphviz


# pd.options.mode.chained_assignment = None 

### Helper functions

In [279]:
def print_bold(text):
    """Show ``text`` in bold by rendering it as Markdown in the cell output."""
    bold_markup = "**{}**".format(text)
    rendered = Markdown(bold_markup)
    display(rendered)
    
def print_color(text, color='red'):
    """Show ``text`` in the given colour by rendering a small HTML snippet.

    ``color`` is inserted verbatim into the element's ``style`` attribute and
    defaults to ``'red'``.
    """
    template = "<text style=color:{}>{}</text>"
    snippet = template.format(color, text)
    display(HTML(snippet))


def display_confusion_matrix(target_test, target_predict):
    """Print a cross-tabulation of actual versus predicted labels.

    Both arguments are wrapped in pandas Series (named 'Actual' and
    'Predicted') and tabulated with ``pd.crosstab``; the table is printed
    under a bold 'Confusion Matrix' heading.
    """
    print_bold('Confusion Matrix')
    actual = pd.Series(target_test, name='Actual')
    predicted = pd.Series(target_predict, name='Predicted')
    table = pd.crosstab(actual, predicted)
    print(table)
    
# Basic graph plotting utility
def plot_roc_curve(confusion_matrix_list, models, x_label, y_label, model_names=''):
    """Plot one (FPR, TPR) point per model, joined by a line in order of FPR.

    For every confusion matrix the true-positive rate ``tp / (tp + fn)`` and
    false-positive rate ``fp / (fp + tn)`` are computed, printed, rounded to
    two decimals and plotted. Each point is drawn in the colour of the model
    it belongs to, and a legend maps colours to model names.

    Parameters
    ----------
    confusion_matrix_list : sequence of 2x2 arrays
        One confusion matrix per model, in the same order as ``models``.
        Read as ``cm[0, 0]`` = TN, ``cm[0, 1]`` = FP, ``cm[1, 0]`` = FN,
        ``cm[1, 1]`` = TP, so each matrix must support ``cm[i, j]`` indexing.
    models : sequence of (name, estimator, color) tuples
        Only the name and the colour are used here.
    x_label, y_label : str
        Axis labels.
    model_names : str, optional
        Unused; kept so existing callers keep working.

    Raises
    ------
    ValueError
        If no confusion matrix is supplied.
    """
    # Keep the colour next to its point so sorting by FPR cannot separate a
    # marker from the model it describes.
    points = []
    for (model_name, _, color), cm in zip(models, confusion_matrix_list):
        tp = cm[1, 1]
        tn = cm[0, 0]
        fp = cm[0, 1]
        fn = cm[1, 0]

        tpr = tp / (tp + fn)
        fpr = fp / (fp + tn)

        # Separate print arguments: the rates are numbers, not strings.
        print(model_name, 'TPR:', tpr, 'FPR:', fpr)

        points.append((round(fpr, 2), round(tpr, 2), color))

    if not points:
        raise ValueError('confusion_matrix_list must contain at least one confusion matrix')

    points.sort(key=itemgetter(0))

    handles = [mpatches.Patch(color=color, label=model_name)
               for model_name, _, color in models]

    fpr_list = [point[0] for point in points]
    tpr_list = [point[1] for point in points]

    fig, ax = plt.subplots()
    ax.plot(fpr_list, tpr_list, 'bs-')
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)

    for fpr, tpr, color in points:
        ax.plot([fpr], [tpr], marker='o', markersize=10, color=color)

    ax.legend(handles=handles)
    plt.show()

### Read dataset

In [280]:
# Path of the raw CSV, relative to the notebook's working directory.
DATASET_NAME = 'datasets/diabetes_hospital.csv'

# Load the whole file into a DataFrame and preview the first rows.
df = pd.read_csv(filepath_or_buffer=DATASET_NAME)
df.head()

FileNotFoundError: File b'dataset/diabetes_hospital.csv' does not exist

### Find out missing values in the dataset

In [None]:
# Columns to scan for the '?' placeholder; print how often it occurs in each.
attributes = ['race', 'gender', 'age', 'medical_specialty', 'change', 'diabetesMed', 'readmitted']
for attr in attributes:
    placeholder_count = (df[attr] == '?').sum()
    print(attr, ": ", placeholder_count)

### Show statistics of the dataset

In [None]:
# Summary statistics, transposed so that each described column becomes a row.
df.describe().T

### Drop columns
Drop attributes like IDs and other irrelevant attributes

In [None]:
# Columns removed before modelling: identifiers and the attributes this
# analysis treats as irrelevant. Each label is listed exactly once.
bad_features = ['encounter_id', 'patient_nbr', 'weight', 'payer_code', 'admission_type_id',
                'discharge_disposition_id', 'admission_source_id', 'number_outpatient', 'number_emergency',
                'number_inpatient', 'diag_1', 'diag_2', 'diag_3', 'number_diagnoses', 'examide',
                'citoglipton', 'glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone',
                'metformin-rosiglitazone', 'metformin-pioglitazone', 'max_glu_serum', 'A1Cresult',
                'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide',
                'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose',
                'miglitol', 'troglitazone', 'tolazamide']

# Rebind `df` to the reduced frame rather than mutating the loaded frame in
# place. A label that is not present still raises KeyError, so typos surface.
df = df.drop(columns=bad_features)

### Replace missing values

In [None]:
# Replace the '?' / empty-string placeholders with an explicit category.
df['medical_specialty'] = df['medical_specialty'].replace(['?', ''], 'General')
df['race'] = df['race'].replace(['?', ''], 'Other')

# Collapse the two readmission labels '>30' and '<30' into a single 'YES'.
df['readmitted'] = df['readmitted'].replace(['>30', '<30'], 'YES')

# Preview only the first rows (transposed, one row per column) instead of
# rendering the entire frame.
df.head().transpose()

### Categorical to numeric

In [None]:
# Multi-valued categorical columns that are expanded into one-hot indicators.
dummy_columns = ['race', 'gender','age', 'insulin', 'medical_specialty']

# The target is taken from the original frame, before encoding.
df_target = df['readmitted']

# One-hot encode, then remove the target variable from the feature frame.
df_dummies = pd.get_dummies(df, columns=dummy_columns)
df_dummies.drop(columns='readmitted', inplace=True)

# df_attr is another name for the same frame, so the assignments below are
# visible through df_dummies as well.
df_attr = df_dummies

# Convert the remaining binary categorical columns to integer codes.
for binary_column in ('diabetesMed', 'change'):
    df_attr[binary_column] = pd.Categorical(df_attr[binary_column]).codes

df_attr.head()

### Split attributes for training and testing

In [None]:
# Fixed seed so the train/test partition, and every score computed from it,
# is the same each time the notebook is run.
SPLIT_SEED = 42
attr_train, attr_test, target_train, target_test = train_test_split(
    df_attr, df_target, random_state=SPLIT_SEED)

### Models
Fit various models and check results

In [None]:
# Seed shared by the estimators that use randomness, so results are repeatable.
MODEL_SEED = 42

# Each entry is (display name, unfitted estimator, colour used when plotting).
models = [('Decision Tree', DecisionTreeClassifier(max_depth=5, random_state=MODEL_SEED), 'red'),
          ('Logistic Regression', LogisticRegression(), 'green'),
          ('Random Forest', RandomForestClassifier(random_state=MODEL_SEED), 'yellow'),
          ('Naive Bayes', GaussianNB(), 'magenta'),
          ('Neural Network',
           MLPClassifier(hidden_layer_sizes=(5,), max_iter=500, random_state=MODEL_SEED),
           'blue')]

# Filled in the same order as `models`: test-set predictions and the wall-clock
# time in seconds spent on fit + predict for each model.
predicted_results = []
time_elapsed = []

for _, model, _ in models:

    start = time.perf_counter()

    model.fit(attr_train, target_train)
    target_predict = model.predict(attr_test)

    end = time.perf_counter()

    time_elapsed.append(end - start)
    predicted_results.append(target_predict)

### Results
Show the results of various models

In [None]:
# Not used by the visible cells; kept in case a later cell relies on them.
fpr_list = []
tpr_list = []

# One confusion matrix per model, in the same order as `models`.
confusion_matrix_list = []

for (model_name, _, _), predictions, seconds in zip(models, predicted_results, time_elapsed):
    print_bold(model_name)
    print_bold('Accuracy Score')
    print(accuracy_score(target_test.values, predictions), end='\n\n')

    # The recorded durations are differences of clock readings in seconds,
    # so they are labelled as seconds.
    print_bold('Time taken')
    print(round(seconds, 3), 's')

    display_confusion_matrix(target_test.values, predictions)
    cm = confusion_matrix(target_test.values, predictions)
    confusion_matrix_list.append(cm)

    print_bold('Classification Report')
    print(classification_report(target_test, predictions))

### Analysis

In [None]:
# Plot each model's false-positive / true-positive rate from its confusion matrix.
plot_roc_curve(
    confusion_matrix_list,
    models,
    x_label='False Positive Rate',
    y_label='True Positive Rate',
)