In [None]:
# pip install mlxtend #un-comment to run install if running for first time.
# !pip install scikit-learn

In [None]:
import pandas as pd
import numpy as np

In [None]:
df = pd.read_csv("diabetic_data.csv",header=None)
df.columns = ['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id', 'time_in_hospital', 'payer_code', 'medical_specialty', 'num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1', 'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamid', 'examide', 'citoglipton', 'insulin', 'glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'
]
df.drop(index=df.index[0], axis=0, inplace=True)

data = df.copy()
print(f"The dataset has {df.shape[0]} rows and {df.shape[1]} columns")
print(data.describe())
print(data.head())
print(data.dtypes.value_counts())

In [None]:
data['weight'] = pd.to_numeric(data['weight'],errors='coerce') # weight in pounds
data['time_in_hospital'] = pd.to_numeric(data['time_in_hospital'],errors='coerce') # number of days between admission and discharge
data['num_lab_procedures'] = pd.to_numeric(data['num_lab_procedures'],errors='coerce') # number of lab test during the encounter
data['num_procedures'] = pd.to_numeric(data['num_procedures'],errors='coerce') # number of procedures other than lab test, during the encounter
data['num_medications'] = pd.to_numeric(data['num_medications'],errors='coerce') # number of distinct generic names administered during the encounter
data['number_outpatient'] = pd.to_numeric(data['number_outpatient'],errors='coerce') # number of outpatient visits in the year preceding the encounter
data['number_emergency'] = pd.to_numeric(data['number_emergency'],errors='coerce') # number of emergency visits in the year preceding the encounter
data['number_inpatient'] = pd.to_numeric(data['number_inpatient'],errors='coerce') # number of inpatient visits in the year preceding the encounter
data['number_diagnoses'] = pd.to_numeric(data['number_diagnoses'],errors='coerce') # number of diagnoses entered to the system 

In [None]:
print(data.dtypes.value_counts())

In [None]:
numeric_attributes = data.columns[data.dtypes!="object"]
categorical_attributes =data.columns[data.dtypes=="object"]
print(numeric_attributes)
print(categorical_attributes)

# Handling Missing Values

## Approach C - Replacing missing values with a user defined constant

In [None]:
print('Before replacing missing values:')
data.head(20)

### Replacing missing values with a user defined constant

In [None]:
data_1 = data.copy()
data_1 = data_1.fillna("?")
data_1 = data_1.replace("?", pd.NA)

### Changing current values with a user defined constant

In [None]:
data_1["admission_type_id"] = data_1["admission_type_id"].replace("1", "Emergency")
data_1["admission_type_id"] = data_1["admission_type_id"].replace("2", "Urgent")
data_1["admission_type_id"] = data_1["admission_type_id"].replace("3", "Elective")
data_1["admission_type_id"] = data_1["admission_type_id"].replace("4", "Newborn")
data_1["admission_type_id"] = data_1["admission_type_id"].replace("5", "Trauma Center")
data_1["admission_type_id"] = data_1["admission_type_id"].replace("6", "Transfer")
data_1["admission_type_id"] = data_1["admission_type_id"].replace("7", "Hospice")
data_1["admission_type_id"] = data_1["admission_type_id"].replace("8", "Unknown")

data_1["discharge_disposition_id"] = data_1["discharge_disposition_id"].replace("1", "Home")
data_1["discharge_disposition_id"] = data_1["discharge_disposition_id"].replace("2", "Short Term Hospital")
data_1["discharge_disposition_id"] = data_1["discharge_disposition_id"].replace("3", "SNF")
data_1["discharge_disposition_id"] = data_1["discharge_disposition_id"].replace("4", "ICF")
data_1["discharge_disposition_id"] = data_1["discharge_disposition_id"].replace("5", "Inpatient Care")
data_1["discharge_disposition_id"] = data_1["discharge_disposition_id"].replace("6", "Home w/ Home Health Service")
data_1["discharge_disposition_id"] = data_1["discharge_disposition_id"].replace("7", "Left Against Medical Advice")
data_1["discharge_disposition_id"] = data_1["discharge_disposition_id"].replace("8", "Home Under Care")
data_1["discharge_disposition_id"] = data_1["discharge_disposition_id"].replace("9", "Admitted as an inpatient")
data_1["discharge_disposition_id"] = data_1["discharge_disposition_id"].replace("10", "Nenonate Discharge/Neonatal Aftercare")
data_1["discharge_disposition_id"] = data_1["discharge_disposition_id"].replace("11", "Expired")
data_1["discharge_disposition_id"] = data_1["discharge_disposition_id"].replace("12", "Expected Return for Outpatient Services")
data_1["discharge_disposition_id"] = data_1["discharge_disposition_id"].replace("13", "Hospice/Home")
data_1["discharge_disposition_id"] = data_1["discharge_disposition_id"].replace("14", "Hospice/Medical Facility")
data_1["discharge_disposition_id"] = data_1["discharge_disposition_id"].replace("15", "Medicare-approved Swing Bed")
data_1["discharge_disposition_id"] = data_1["discharge_disposition_id"].replace("16", "Transferred for Outpatient Services")
data_1["discharge_disposition_id"] = data_1["discharge_disposition_id"].replace("17", "Referred for Outpatient Services")
data_1["discharge_disposition_id"] = data_1["discharge_disposition_id"].replace("18", "NULL")
data_1["discharge_disposition_id"] = data_1["discharge_disposition_id"].replace("19", "Expired at Home")
data_1["discharge_disposition_id"] = data_1["discharge_disposition_id"].replace("20", "Expired at Medical facility")
data_1["discharge_disposition_id"] = data_1["discharge_disposition_id"].replace("21", "Expired, Place Unknown")
data_1["discharge_disposition_id"] = data_1["discharge_disposition_id"].replace("22", "Rehabilitation Facility")
data_1["discharge_disposition_id"] = data_1["discharge_disposition_id"].replace("23", "Long Term Care")
data_1["discharge_disposition_id"] = data_1["discharge_disposition_id"].replace("24", "Nursing Facility-Medicaid Certified")
data_1["discharge_disposition_id"] = data_1["discharge_disposition_id"].replace("25", "Not Mapped")
data_1["discharge_disposition_id"] = data_1["discharge_disposition_id"].replace("26", "Unknown/Invalid")
data_1["discharge_disposition_id"] = data_1["discharge_disposition_id"].replace("27", "Federal Health Facility")
data_1["discharge_disposition_id"] = data_1["discharge_disposition_id"].replace("28", "Psychiatric Hospital/Unit")
data_1["discharge_disposition_id"] = data_1["discharge_disposition_id"].replace("29", "Critical Access Hospital")
data_1["discharge_disposition_id"] = data_1["discharge_disposition_id"].replace("30", "Health Care Facility not Defined Elsewhere")

# data_1["A1Cresult"] = data_1["A1Cresult"].replace(">7", "Moderate Risk")
# data_1["A1Cresult"] = data_1["A1Cresult"].replace(">8", "High Risk")


print('\After replacing missing values:')
data_1.head(50)

## Approach A - Removal of rows containing missing values

In [None]:
print(f"There are {data_1.isnull().sum().sum()} missing values in this dataset")

print('Number of instances = %d'%(data_1.shape[0]))
print('Number of attributes = %d'%(data_1.shape[1]))

print('Number of missing values:')
for col in data_1.columns:
    print('\t%s:%d'%(col,data_1[col].isna().sum()))

data_1.head(20)

In [None]:
data_1 = data_1.dropna(subset=["race", "diag_1", "diag_2", "diag_3"])

In [None]:
print(f"There are {data_1.isnull().sum().sum()} missing values in this dataset")

print('Number of instances = %d'%(data_1.shape[0]))
print('Number of attributes = %d'%(data_1.shape[1]))

print('Number of missing values:')
for col in data_1.columns:
    print('\t%s:%d'%(col,data_1[col].isna().sum()))
    
print(data_1.describe())
print(data_1.head())

## Approach D - Removal of attributes containing missing values

In [None]:
print('Number of missing values:')
for col in data_1.columns:
    print('\t%s:%d'%(col,data_1[col].isna().sum()))

In [None]:
data_1.drop('weight', axis=1, inplace=True) # 98053 records with missing values, deemed unusable
data_1.drop('payer_code', axis=1, inplace=True) # 38924 records with missing values, deemed unusable
data_1.drop('medical_specialty', axis=1, inplace=True) # 48318 records with missing values, deemed un

In [None]:
print('Number of missing values after removal:')
for col in data_1.columns:
    print('\t%s:%d'%(col,data_1[col].isna().sum()))
    
data_1.head(20)

## Classification - Naive Bayes

### Create data subset

In [None]:
# Some work was tried to create subset first before changing category names, but errors happened that were not easy to fix.
# Changing category names first was a way to avoid the errors, and still keep the data integrity.
class_data_1 = data_1.copy()
class_data_1.head()

#### Change category names

In [None]:
class_data_1["A1Cresult"] = class_data_1["A1Cresult"].replace(">7", "Moderate Risk")
class_data_1["A1Cresult"] = class_data_1["A1Cresult"].replace(">8", "High Risk")
class_data_1["A1Cresult"] = class_data_1["A1Cresult"].replace("Norm", "Normal")
class_data_1["A1Cresult"] = class_data_1["A1Cresult"].replace("None", pd.NA)

In [None]:
class_data_1 = class_data_1.dropna(subset=["A1Cresult"])

In [None]:
class_data_1.head()

In [None]:
class_data_1 = class_data_1[['age','race','gender','admission_type_id','time_in_hospital','num_procedures','A1Cresult','diabetesMed']]

#### Create data copy for later if needed

In [None]:
class_data_2 = class_data_1.copy()

#### Change data into correct format

##### Bin numerical data

In [None]:
class_data_2.describe()

In [None]:
time_in_hospital_bins = [0, 3, 7, 14]  # Define bin edges based on quartiles
class_data_2['time_in_hospital'] = pd.cut(class_data_2['time_in_hospital'], bins=time_in_hospital_bins, labels=["Low", "Medium", "High"])

In [None]:
num_procedure_bins = [-0.1, 2.5, 6.1]  # Define bin edges based on quartiles
class_data_2['num_procedures'] = pd.cut(class_data_2['num_procedures'], bins=num_procedure_bins, labels=["Low", "High"])

In [None]:
class_data_2.describe()

### Naive Bayes Classification

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.utils.class_weight import compute_class_weight

X = class_data_2[['age','race','gender','admission_type_id','time_in_hospital','num_procedures','diabetesMed']]
Y = class_data_2['A1Cresult']

X_encoded = pd.get_dummies(X)

#Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, Y, test_size=0.20, random_state=42)

gnb = GaussianNB()
gnb.fit(X_train, y_train)

#Make predictions on the testing data
y_pred = gnb.predict(X_test)

#Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy of GNB classifier on testing set:', accuracy)


In [None]:
from sklearn.model_selection import KFold

X_encoded = pd.get_dummies(X)

gnb = GaussianNB()

# Define number of folds for cross-validation
num_folds = 5
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Initialize a list to store accuracy scores for each fold
accuracy_scores = []

# Perform k-fold cross-validation
for train_index, test_index in kf.split(X_encoded):
    X_train, X_test = X_encoded.iloc[train_index], X_encoded.iloc[test_index]
    y_train, y_test = Y.iloc[train_index], Y.iloc[test_index]
    
    # Fit the classifier on the training data
    gnb.fit(X_train, y_train)
    
    # Make predictions on the testing data
    y_pred = gnb.predict(X_test)
    
    # Calculate accuracy for this fold and store it
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_scores.append(accuracy)

# Calculate the mean accuracy across all folds
mean_accuracy = np.mean(accuracy_scores)
print('Mean accuracy of GNB classifier with', num_folds, 'fold cross-validation:', mean_accuracy)


In [None]:
from sklearn.tree import DecisionTreeClassifier

# Initialize Decision Tree classifier
dt_classifier = DecisionTreeClassifier(random_state=42)

# Define number of folds for cross-validation
num_folds = 5
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Initialize a list to store accuracy scores for each fold
accuracy_scores = []

# Perform k-fold cross-validation
for train_index, test_index in kf.split(X_encoded):
    X_train, X_test = X_encoded.iloc[train_index], X_encoded.iloc[test_index]
    y_train, y_test = Y.iloc[train_index], Y.iloc[test_index]
    
    # Fit the classifier on the training data
    dt_classifier.fit(X_train, y_train)
    
    # Make predictions on the testing data
    y_pred = dt_classifier.predict(X_test)
    
    # Calculate accuracy for this fold and store it
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_scores.append(accuracy)

# Calculate the mean accuracy across all folds
mean_accuracy = np.mean(accuracy_scores)
print('Mean accuracy of Decision Tree classifier with', num_folds, 'fold cross-validation:', mean_accuracy)


In [29]:
from sklearn.tree import DecisionTreeClassifier

# Initialize Decision Tree classifier
dt_classifier = DecisionTreeClassifier(random_state=42)

# Define number of folds for cross-validation
num_folds = 5
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Initialize a list to store accuracy scores for each fold
accuracy_scores = []

# Perform k-fold cross-validation
for train_index, test_index in kf.split(X_encoded):
    X_train, X_test = X_encoded.iloc[train_index], X_encoded.iloc[test_index]
    y_train, y_test = Y.iloc[train_index], Y.iloc[test_index]
    
    # Fit the classifier on the training data
    dt_classifier.fit(X_train, y_train)
    
    # Make predictions on the testing data
    y_pred = dt_classifier.predict(X_test)
    
    # Calculate accuracy for this fold and store it
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_scores.append(accuracy)

# Calculate the mean accuracy across all folds
mean_accuracy = np.mean(accuracy_scores)
print('Mean accuracy of Decision Tree classifier with', num_folds, 'fold cross-validation:', mean_accuracy)


Mean accuracy of Decision Tree classifier with 5 fold cross-validation: 0.4713144369854657
