In [None]:
import numpy as np
import pandas as pd #import pandas to manipulate the dataset
from matplotlib import pyplot as plt #import the module matplotlib.pyplot to do visulization
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import PolynomialFeatures    # function to generate polynomial and interaction features
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error  # evaluation metrics
from sklearn.tree import DecisionTreeClassifier
import seaborn as sns  #data visualization library

In [None]:
# Load the raw records from 'kidney_disease.csv'.
df = pd.read_csv('kidney_disease.csv')

# Discard every column this analysis does not use.
unused_columns = [
    'id', 'sg', 'su', 'pc', 'dm', 'pcc', 'cad', 'ba', 'bgr', 'sod', 'rbc',
    'pot', 'wc', 'pe', 'ane', 'sc', 'al', 'hemo', 'htn', 'rc', 'pcv',
]
df = df.drop(columns=unused_columns)

# Preview the first 5 rows of 'df'.
# The 'classification' column is the label (CKD = Chronic Kidney Disease).
df.head(5)

In [None]:
# Choosing datapoints: people in the 45-55 age range (both bounds inclusive).
# Rows with a missing age fail the comparison and are excluded.
df = df[df["age"].between(45, 55)]

# Encode the label as a number: 1 means the person has CKD, 0 means they do not.
# Only the 'classification' column is touched (a frame-wide replace would also
# rewrite matching strings in other columns), and surrounding whitespace is
# stripped so that a label such as 'ckd' followed by a stray tab is still recognised.
# Values that are already numeric pass through unchanged, which keeps this cell
# safe to re-run.
label_codes = {'ckd': 1, 'notckd': 0}
labels = df["classification"].map(
    lambda v: label_codes.get(v.strip(), v) if isinstance(v, str) else v
)
# Any label that is still not numeric becomes NaN here, so dropna() below removes
# it together with the other incomplete rows instead of leaving mixed types in y.
df = df.assign(classification=pd.to_numeric(labels, errors="coerce"))

# Drop incomplete rows, then re-index so the index is contiguous.
df = df.dropna().reset_index(drop=True)
# Make the label dtype explicitly integer rather than relying on implicit downcasting.
df["classification"] = df["classification"].astype(int)

print(df)

In [None]:
# Feature matrix and label vector.
# X holds the single feature 'bu' (blood urea) as a column of shape (n_samples, 1).
X = df[['bu']].to_numpy()

# y holds the label: 1 stands for "has CKD", 0 for "not CKD".
y = df['classification'].to_numpy()

In [None]:
# Baseline: fit and evaluate on the whole data set, without splitting it into
# training and validation sets, so the scores below are training scores.
clf = LogisticRegression()
clf.fit(X, y)

y_pred = clf.predict(X)
accuracy = accuracy_score(y, y_pred)
# accuracy_score gives the fraction of correct predictions, so it is reported as
# accuracy; the error is its complement.
print('Training accuracy: ', accuracy)
print('Training error: ', 1 - accuracy)

In [None]:
# Compute the confusion matrix to see how well the model performs.
# labels=[0, 1] pins the row/column order (0 = not CKD, 1 = CKD) and keeps the
# matrix 2x2 even if one class is absent from y or y_pred.
conf_mat = confusion_matrix(y, y_pred, labels=[0, 1])

# Visualize the confusion matrix (rows: true labels, columns: predicted labels).
fig, ax = plt.subplots()
sns.heatmap(conf_mat, annot=True, fmt='g', ax=ax)

ax.set_xlabel('Predicted labels', fontsize=15)
ax.set_ylabel('True labels', fontsize=15)
ax.set_title('Confusion Matrix', fontsize=15)

# Tick labels name the two classes in the same order as labels=[0, 1] above.
class_names = ['not CKD', 'CKD']
ax.xaxis.set_ticklabels(class_names, fontsize=15)
ax.yaxis.set_ticklabels(class_names, fontsize=15)
plt.show()

In [None]:
# Precision of the model: of everything predicted as class 1, the share that
# really is class 1. Uses column 1 of conf_mat (rows 0 and 1).
true_positives = conf_mat[1, 1]
false_positives = conf_mat[0, 1]
precision = true_positives / (true_positives + false_positives)
print('Precision of the model: ', precision)

In [None]:
# Split the data into a training/validation portion and a held-out test set.
# test_size=0.2 reserves 20% of the whole dataset for testing; the fixed
# random_state makes the split reproducible.
X_perf, X_test, y_perf, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Print each portion's size and labels to check that both classes are present.
# (The original author recorded 69 datapoints for the first portion and 18 for
# the test set -- re-check after any change to the filtering above.)
for part_X, part_y in ((X_perf, y_perf), (X_test, y_test)):
    print(len(part_X), part_y)

In [None]:
# Cross-validation splitter: k=3 folds, shuffled with a fixed seed so the
# folds are the same on every run.
k = 3
shuffle = True
seed = 42
kf = KFold(n_splits=k, shuffle=shuffle, random_state=seed)
print(kf)

In [None]:
def _accuracy_and_error(model, features, labels):
    """Score a fitted classifier on one set of samples.

    Returns a tuple (accuracy, error), where error is 1 - accuracy.
    """
    acc = accuracy_score(labels, model.predict(features))
    return acc, 1 - acc


log_val_acc, log_val_err = [], []      # validation set: accuracy and error per fold, Logistic Regression
log_train_acc, log_train_err = [], []  # training set: accuracy and error per fold, Logistic Regression
tree_train_acc, tree_train_err = [], []  # training set: accuracy and error per fold, Decision Tree
tree_val_acc, tree_val_err = [], []      # validation set: accuracy and error per fold, Decision Tree

# K-Fold cross-validation over the training/validation portion only.
for train_index, val_index in kf.split(X_perf):
    # kf.split(X_perf) yields positions *within X_perf*, so the folds must be
    # taken from X_perf / y_perf. Indexing the full X / y would select
    # different rows and could pull held-out test rows into the folds.
    X_train, X_val = X_perf[train_index], X_perf[val_index]
    y_train, y_val = y_perf[train_index], y_perf[val_index]

    # Logistic Regression: fit on the training fold, score both folds.
    clf.fit(X_train, y_train)

    acc_val, err_log_val = _accuracy_and_error(clf, X_val, y_val)
    log_val_acc.append(acc_val)
    log_val_err.append(err_log_val)

    acc_train, err_log_train = _accuracy_and_error(clf, X_train, y_train)
    log_train_acc.append(acc_train)
    log_train_err.append(err_log_train)

    # Decision Tree: a fresh shallow tree per fold, scored the same way.
    model_tree = DecisionTreeClassifier(max_depth=2, criterion='entropy')
    model_tree.fit(X_train, y_train)

    acc_tree_val, err_tree_val = _accuracy_and_error(model_tree, X_val, y_val)
    tree_val_acc.append(acc_tree_val)
    tree_val_err.append(err_tree_val)

    acc_tree_train, err_tree_train = _accuracy_and_error(model_tree, X_train, y_train)
    tree_train_acc.append(acc_tree_train)
    tree_train_err.append(err_tree_train)

# Print the per-fold results
print("[Logistic Regression] validation set accuracy score: ", log_val_acc)
print("[Logistic Regression] validation set error score: ", log_val_err)
print("[Logistic Regression] training set accuracy score: ", log_train_acc)
print("[Logistic Regression] training set error score: ", log_train_err)

print("-"*8)

print("[Decision Tree] validation set accuracy score: ", tree_val_acc)
print("[Decision Tree] validation set error score: ", tree_val_err)
print("[Decision Tree] training set accuracy score: ", tree_train_acc)
print("[Decision Tree] training set error score: ", tree_train_err)

print("-"*8)

# Averages over the folds
avg_log_acc = np.average(log_val_acc)
avg_tree_acc = np.average(tree_val_acc)
print("[Logistic Regression] average validation accuracy: ", avg_log_acc)
print("[Decision Tree] average validation accuracy: ", avg_tree_acc)
print("-"*8)

avg_log_acc2 = np.average(log_train_acc)
avg_tree_acc2 = np.average(tree_train_acc)
print("[Logistic Regression] average training accuracy: ", avg_log_acc2)
print("[Decision Tree] average training accuracy: ", avg_tree_acc2)
print("-"*8)

avg_log_err = np.average(log_val_err)
avg_tree_err = np.average(tree_val_err)
print("[Logistic Regression] average validation error: ", avg_log_err)
print("[Decision Tree] average validation error: ", avg_tree_err)
print("-"*8)

avg_log_err2 = np.average(log_train_err)
avg_tree_err2 = np.average(tree_train_err)
print("[Logistic Regression] average training error: ", avg_log_err2)
print("[Decision Tree] average training error: ", avg_tree_err2)
print("-"*8)

In [None]:
# Plot the datapoints together with the decision tree's predicted label
# across the observed range of the feature.
fig, ax = plt.subplots()
ax.set_xlabel("Blood Urea (bu)")
ax.set_yticks([0, 1])
ax.set_ylabel('CKD classification')
ax.set_title("Blood Urea and CKD")
# X holds every datapoint (not only one training fold), hence the neutral label.
ax.scatter(X[:, 0], y, s=50, c="skyblue", label="datapoints")
# Evaluate the tree on a grid spanning the actual feature values, so the
# prediction curve covers the same x-range as the scattered datapoints.
X_fit = np.linspace(X.min(), X.max(), 100)
# Raw string: '\hat' in a normal string literal is an invalid escape sequence.
ax.scatter(X_fit, model_tree.predict(X_fit.reshape(-1, 1)), color='r', s=5,
           label=r'predicted label $\hat{y}$')
ax.legend()
plt.show()

In [None]:
# Evaluate the Decision Tree on the held-out test set.
# model_tree is whichever tree the last cross-validation fold left behind, so it
# was fit on only part of the data. Refit a tree with the same hyperparameters
# on the whole training/validation portion before scoring the test set.
final_tree = DecisionTreeClassifier(**model_tree.get_params())
final_tree.fit(X_perf, y_perf)
y_test_pred = final_tree.predict(X_test)

# Caveat: the test set is quite small, so this score is only a coarse estimate.

# Test accuracy
test_acc = accuracy_score(y_test, y_test_pred)
print("Test accuracy: ", test_acc)

# Test error
test_err = 1 - test_acc

print("Test error: ", test_err)