In [14]:
# Download and Clean the Data
import numpy as np
import pandas as pd
import zipfile
import urllib.request
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00296/dataset_diabetes.zip'
filename = 'dataset_diabetes.zip'
urllib.request.urlretrieve(url, filename)

with zipfile.ZipFile(filename, 'r') as zip_ref:
    zip_ref.extractall('data')

diabetic_data = pd.read_csv('data/dataset_diabetes/diabetic_data.csv', header=0, quotechar='"')

# drop columns with more than 30% missing values
diabetic_data = diabetic_data.drop(['weight', 'payer_code', 'medical_specialty'], axis=1)

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
le = LabelEncoder()


label_encoding_columns = ['race', 'gender', 'diag_1', 'diag_2', 'diag_3',
            'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide',
            'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide', 'glyburide',
            'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol',
            'troglitazone', 'tolazamide', 'examide', 'citoglipton', 'insulin',
            'glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone',
            'metformin-rosiglitazone', 'metformin-pioglitazone', 'change', 'diabetesMed']

for col in label_encoding_columns:
    diabetic_data[col] = le.fit_transform(diabetic_data[col])

# create binary values for readmitted column
diabetic_data['readmitted'] = diabetic_data['readmitted'].apply(lambda x: 0 if x in ['NO', '>30'] else 1)


oe = OrdinalEncoder(categories=[['[0-10)', '[10-20)', '[20-30)', '[30-40)', '[40-50)', '[50-60)', '[60-70)', '[70-80)', '[80-90)', '[90-100)']], dtype=int)
diabetic_data['age'] = oe.fit_transform(diabetic_data[['age']])

seed = 123
np.random.seed(seed) 


In [15]:
# Create 20% testing and 80% training data set split
X = diabetic_data.drop(columns=['readmitted'])
Y = diabetic_data['readmitted']

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=seed)

In [16]:
# Logistic Regression -> hw3
from sklearn.linear_model import LogisticRegression

# create a logistic regression model
model = LogisticRegression()

# fit the model to the training data
model.fit(X_train, y_train)

# make predictions on the test data
y_pred = model.predict(X_test)

# evaluate the model performance
accuracy = accuracy_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)

print(f'Accuracy: {accuracy}')


Accuracy: 0.8887273013127899


In [3]:
# SVM Classifier (TODO)


(81412, 49)


In [17]:
# Neural Network Classifier

import pandas as pd
import zipfile
import urllib.request
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

model = MLPClassifier(hidden_layer_sizes=(16, 8), activation='relu', solver='adam', alpha=0.001, max_iter=20, validation_fraction=0.1, random_state=seed)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# evaluate the model performance
accuracy = accuracy_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)

print(f'Accuracy: {accuracy}')


Accuracy: 0.8885307758823991




In [18]:
# Random Forest Classifier
model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=seed)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)

print(f'Accuracy: {accuracy}')

Accuracy: 0.8888059114849461
