In [1]:
import pandas as pd

# Symptom columns whose values are compared against 'Yes' below.
COLUMNS = [
    'Polyuria',
    'Polydipsia',
    'sudden weight loss',
    'weakness',
    'Polyphagia',
    'Genital thrush',
    'visual blurring',
    'Itching',
    'Irritability',
    'delayed healing',
    'partial paresis',
    'muscle stiffness',
    'Alopecia',
    'Obesity',
]
DATA = '/kaggle/input/early-stage-diabetes-risk-prediction/diabetes_data_upload.csv'
TARGET = 'class'

# Read the raw file, then recode the string columns as booleans in one pass:
#   Gender  -> True where the value is 'Male'
#   COLUMNS -> True where the value is 'Yes'
#   TARGET  -> True where the value is 'Positive'
# Column positions are unchanged because assign() overwrites existing columns in place.
df = pd.read_csv(DATA)
df = df.assign(
    Gender=df['Gender'].eq('Male'),
    **{column: df[column].eq('Yes') for column in COLUMNS},
    **{TARGET: df[TARGET].eq('Positive')},
)
df.head()

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class
0,40,True,False,True,False,True,False,False,False,True,False,True,False,True,True,True,True
1,58,True,False,False,False,True,False,False,True,False,False,False,True,False,True,False,True
2,41,True,True,False,False,True,True,False,False,True,False,True,False,True,True,False,True
3,45,True,False,False,True,True,True,True,False,True,False,True,False,False,False,False,True
4,60,True,True,True,True,True,True,False,True,True,True,True,True,True,True,True,True


Is our target class balanced?

In [2]:
# Count the rows in each target class and show the tally as a plain dict.
class_counts = df[TARGET].value_counts()
class_counts.to_dict()

{True: 320, False: 200}

Wow. We have a lot of positives; they outnumber negatives 8:5.

Let's do some dimensionality reduction with UMAP and see how our data clusters.

In [3]:
import arrow
from umap import UMAP

# Project the symptom flags plus Age down to two dimensions and store the
# coordinates on the frame as 'x' and 'y' for plotting.
# NOTE(review): Age is numeric while the symptom flags are boolean; with UMAP's
# default distance metric Age may dominate the neighbour graph - consider
# scaling it first. TODO confirm.
time_start = arrow.now()
umap = UMAP(
    n_epochs=500,
    n_jobs=1,
    low_memory=False,
    random_state=2024,  # fixed seed so the embedding is repeatable
    verbose=True,
)
embedding = umap.fit_transform(X=df[[*COLUMNS, 'Age']])
df['x'], df['y'] = embedding[:, 0], embedding[:, 1]
print(f'done with UMAP in {arrow.now() - time_start}')

2024-07-26 18:39:44.273992: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-26 18:39:44.274201: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-26 18:39:44.425970: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


UMAP(low_memory=False, n_epochs=500, n_jobs=1, random_state=2024, verbose=True)
Fri Jul 26 18:39:55 2024 Construct fuzzy simplicial set
Fri Jul 26 18:39:55 2024 Finding Nearest Neighbors
Fri Jul 26 18:39:59 2024 Finished Nearest Neighbor Search
Fri Jul 26 18:40:02 2024 Construct embedding


Epochs completed:   0%|            0/500 [00:00]

	completed  0  /  500 epochs
	completed  50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  500 epochs
	completed  250  /  500 epochs
	completed  300  /  500 epochs
	completed  350  /  500 epochs
	completed  400  /  500 epochs
	completed  450  /  500 epochs
Fri Jul 26 18:40:04 2024 Finished embedding
done with UMAP in 0:00:09.719119


In [4]:
import warnings
from plotly import express

# Suppress FutureWarning messages from this point on so they do not clutter the plot output.
warnings.filterwarnings('ignore', category=FutureWarning)

# Scatter the 2-D embedding, coloured by the target and split into one facet per target value.
embedding_figure = express.scatter(df, x='x', y='y', color=TARGET, facet_col=TARGET)
embedding_figure

It would be nice if our positives clustered together and our negatives clustered together, but that's not the case. We do see some clustering; let's build a model and see how it does.

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Hold out 20% of the rows for testing, stratified on the target so both splits
# keep the same class ratio.
# NOTE(review): only the symptom flags in COLUMNS are used as features; Age and
# Gender are left out - confirm this is intentional.
X_train, X_test, y_train, y_test = train_test_split(
    df[COLUMNS],
    df[TARGET],
    test_size=0.2,
    stratify=df[TARGET],
    random_state=2024,
)

# Baseline: logistic regression with a very tight tolerance and an iteration cap of 200.
model = LogisticRegression(tol=1e-12, max_iter=200)
model.fit(X_train, y_train)
print(f'model fit in {model.n_iter_[0]} iterations')

baseline_accuracy = accuracy_score(y_true=y_test, y_pred=model.predict(X=X_test))
print(f'accuracy: {baseline_accuracy:5.4f}')

model fit in 16 iterations
accuracy: 0.9038


In [6]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 for the logistic regression baseline on the held-out rows.
baseline_predictions = model.predict(X=X_test)
baseline_report = classification_report(y_true=y_test, y_pred=baseline_predictions, zero_division=0)
print(baseline_report)

              precision    recall  f1-score   support

       False       0.84      0.93      0.88        40
        True       0.95      0.89      0.92        64

    accuracy                           0.90       104
   macro avg       0.90      0.91      0.90       104
weighted avg       0.91      0.90      0.90       104



An accuracy of about 0.9 isn't bad. Let's try a bunch of models and see if any can improve our results.

In [7]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.metrics import f1_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Candidate classifiers keyed by display name; insertion order is the order they are
# fitted and reported in. Every estimator that accepts a seed uses random_state=2024.
MODEL = {
    'Naive Bayes': GaussianNB(),
    'QDA': QuadraticDiscriminantAnalysis(),
    '3 Nearest Neighbors': KNeighborsClassifier(n_neighbors=3),
    '5 Nearest Neighbors': KNeighborsClassifier(n_neighbors=5),
    'Linear SVM': SVC(kernel='linear', C=0.025, random_state=2024),
    'Gaussian Process': GaussianProcessClassifier(1.0 * RBF(1.0), random_state=2024),
    'Decision Tree': DecisionTreeClassifier(max_depth=5, random_state=2024),
    '10 estimator Random Forest': RandomForestClassifier(
        max_depth=5, n_estimators=10, max_features=1, random_state=2024,
    ),
    '20 estimator Random Forest': RandomForestClassifier(
        max_depth=5, n_estimators=20, max_features=1, random_state=2024,
    ),
    'Neural Net': MLPClassifier(alpha=1, max_iter=1000, random_state=2024),
    'AdaBoost': AdaBoostClassifier(algorithm='SAMME', random_state=2024),
    'RBF SVM': SVC(gamma=2, C=1, random_state=2024),
}

# Fit each candidate on the training split, score it with F1 on the test split,
# and print one line per model: score, elapsed time (fit + predict + scoring), name.
result = []
for label, estimator in MODEL.items():
    time_start = arrow.now()
    estimator.fit(X=X_train, y=y_train)
    predictions = estimator.predict(X=X_test)
    score = f1_score(y_true=y_test, y_pred=predictions)
    result.append((score, label))
    print(f'{score:5.4f} {arrow.now() - time_start} {label}')

# Rank best-first by score; the sort is stable, so ties keep their insertion order.
result.sort(key=lambda scored: scored[0], reverse=True)
best_score, best_label = result[0]
print(f'best: {best_score} {best_label}')

0.8889 0:00:00.007407 Naive Bayes
0.9692 0:00:00.015931 QDA
0.9764 0:00:00.014517 3 Nearest Neighbors
0.9677 0:00:00.019844 5 Nearest Neighbors
0.9000 0:00:00.018281 Linear SVM
0.9844 0:00:03.063282 Gaussian Process
0.9355 0:00:00.024970 Decision Tree
0.9606 0:00:00.081022 10 estimator Random Forest
0.9524 0:00:00.044222 20 estimator Random Forest
0.9355 0:00:00.379397 Neural Net
0.9016 0:00:00.138886 AdaBoost
0.9692 0:00:00.014163 RBF SVM
best: 0.984375 Gaussian Process


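This looks like a big improvement: the best F1 score (0.98, from the Gaussian Process) is about 7% higher in relative terms than the logistic regression's F1 of 0.92 on the positive class.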

In [8]:
# Reuse the Gaussian Process classifier that the comparison loop already fitted on
# (X_train, y_train) instead of constructing and refitting an identical copy. The
# predictions are the same, the multi-second refit is skipped, and the report can no
# longer drift from the configuration that was actually compared.
gauss = MODEL['Gaussian Process']

# Per-class precision / recall / F1 for that model on the held-out rows.
print(classification_report(y_true=y_test, y_pred=gauss.predict(X=X_test), zero_division=0))

              precision    recall  f1-score   support

       False       0.97      0.97      0.97        40
        True       0.98      0.98      0.98        64

    accuracy                           0.98       104
   macro avg       0.98      0.98      0.98       104
weighted avg       0.98      0.98      0.98       104



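This is substantially better than the logistic regression baseline: accuracy rises from 0.90 to 0.98. Keep in mind that the Gaussian Process was chosen because it scored best on this same 104-row test set, so this estimate is likely optimistic.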