In [1]:
import pandas as pd

# Location of the Kaggle diabetes CSV and the name of the label column.
DATA = '/kaggle/input/100000-diabetes-clinical-dataset/diabetes_dataset.csv'
TARGET = 'diabetes'

# Load the raw table, then recode the label as a boolean (True where the raw value equals 1).
df = pd.read_csv(DATA)
df[TARGET] = df[TARGET].eq(1)
df.head()

Unnamed: 0,year,gender,age,location,race:AfricanAmerican,race:Asian,race:Caucasian,race:Hispanic,race:Other,hypertension,heart_disease,smoking_history,bmi,hbA1c_level,blood_glucose_level,diabetes
0,2020,Female,32.0,Alabama,0,0,0,0,1,0,0,never,27.32,5.0,100,False
1,2015,Female,29.0,Alabama,0,1,0,0,0,0,0,never,19.95,5.0,90,False
2,2015,Male,18.0,Alabama,0,0,0,0,1,0,0,never,23.76,4.8,160,False
3,2015,Male,41.0,Alabama,0,0,1,0,0,0,0,never,27.32,4.0,159,False
4,2016,Female,52.0,Alabama,1,0,0,0,0,0,0,never,23.75,6.5,90,False


In [2]:
# Number of records per value of the 'year' column, as a plain dict.
df.loc[:, 'year'].value_counts().to_dict()

{2019: 79745, 2015: 8760, 2016: 8760, 2018: 2678, 2020: 42, 2022: 8, 2021: 7}

About 80% of our data is from a single year (2019), which is odd but probably doesn't matter.

In [3]:
# Share of rows in each label class (fractions rather than counts).
df.loc[:, TARGET].value_counts(normalize=True).to_dict()

{False: 0.915, True: 0.085}

Fortunately for society, diabetes is relatively rare; unfortunately for us, it is also rare in our data (about 8.5% of records), which may make building a model to find it difficult.

In [4]:
from plotly import express

# Count of records in each smoking_history category.
express.histogram(df, x='smoking_history')

We have smoking history for a majority of our records, but the missing values are probably a data quality issue.

In [5]:
import arrow
from umap import UMAP

# Feature columns: every bool/float64/int64 column other than the label.
COLUMNS = [
    name
    for name, kind in df.dtypes.to_dict().items()
    if name != TARGET and str(kind) in {'bool', 'float64', 'int64'}
]

# Fit a UMAP embedding of the feature columns, store it on df as 'x'/'y',
# and report the wall-clock time for the whole fit.
time_start = arrow.now()
umap = UMAP(n_epochs=500, low_memory=False, n_jobs=1, verbose=True, random_state=2024)
embedding = umap.fit_transform(df[COLUMNS])
df[['x', 'y']] = embedding
elapsed = arrow.now() - time_start
print('done with UMAP in {}'.format(elapsed))

2024-07-24 17:53:11.886014: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-24 17:53:11.886203: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-24 17:53:12.127445: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


UMAP(low_memory=False, n_epochs=500, n_jobs=1, random_state=2024, verbose=True)
Wed Jul 24 17:53:26 2024 Construct fuzzy simplicial set
Wed Jul 24 17:53:26 2024 Finding Nearest Neighbors
Wed Jul 24 17:53:26 2024 Building RP forest with 21 trees
Wed Jul 24 17:53:34 2024 NN descent for 17 iterations
	 1  /  17
	 2  /  17
	 3  /  17
	Stopping threshold met -- exiting after 3 iterations
Wed Jul 24 17:54:02 2024 Finished Nearest Neighbor Search
Wed Jul 24 17:54:06 2024 Construct embedding


Epochs completed:   0%|            0/500 [00:00]

	completed  0  /  500 epochs
	completed  50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  500 epochs
	completed  250  /  500 epochs
	completed  300  /  500 epochs
	completed  350  /  500 epochs
	completed  400  /  500 epochs
	completed  450  /  500 epochs
Wed Jul 24 17:59:19 2024 Finished embedding
done with UMAP in 0:05:52.473184


In [6]:
import warnings
from plotly import express

# Suppress FutureWarning messages before plotting.
warnings.filterwarnings('ignore', category=FutureWarning)
# Scatter a seeded 3000-row sample of the embedding, coloured and faceted by the label.
express.scatter(df.sample(n=3000, random_state=2024), x='x', y='y', facet_col=TARGET, color=TARGET)

This is fascinating; our positives mostly cluster differently than our negatives. Let's build a model.

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Hold out 20% of the rows for testing, stratifying on the label.
X_train, X_test, y_train, y_test = train_test_split(
    df[COLUMNS],
    df[TARGET],
    test_size=0.2,
    stratify=df[TARGET],
    random_state=2024,
)
# Fit the logistic regression and report how many solver iterations it used.
model = LogisticRegression(tol=1e-12, max_iter=200)
model.fit(X_train, y_train)
print('model fit in {} iterations'.format(model.n_iter_[0]))

# Accuracy on the held-out rows.
print('accuracy: {:5.4f}'.format(accuracy_score(y_test, model.predict(X_test))))

model fit in 108 iterations
accuracy: 0.9594


In a dataset with about 8% positives, an accuracy of 0.96 could be good or bad: always predicting the negative class would already score about 0.92. Let's look at the classification report to see which one we have.

In [8]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 for the logistic regression on the test split.
print(classification_report(y_test, model.predict(X_test), zero_division=0))

              precision    recall  f1-score   support

       False       0.97      0.99      0.98     18300
        True       0.86      0.62      0.72      1700

    accuracy                           0.96     20000
   macro avg       0.91      0.81      0.85     20000
weighted avg       0.96      0.96      0.96     20000



Not surprisingly, our positive class is harder to find than the negative class: recall is 0.62 for positives versus 0.99 for negatives.

In [9]:
from plotly import express

# One bar per feature showing its fitted logistic-regression coefficient.
# A bar chart draws each value directly; a histogram would aggregate y per x
# and label the axis as a sum, which misdescribes a single coefficient.
# NOTE(review): no feature scaling is visible in this notebook, so the bar
# heights are on each feature's raw scale and are not directly comparable.
express.bar(x=COLUMNS, y=model.coef_[0], labels={'x': 'feature', 'y': 'coefficient'})

Let's see if we can do better with a different kind of model: a shallow decision tree.

In [10]:
from sklearn.tree import DecisionTreeClassifier

# Depth-2 decision tree fitted on the same training split, scored on the same test split.
tree = DecisionTreeClassifier(random_state=2024, max_depth=2)
tree.fit(X_train, y_train)
print(classification_report(y_test, tree.predict(X_test), zero_division=0))

              precision    recall  f1-score   support

       False       0.97      1.00      0.99     18300
        True       1.00      0.67      0.81      1700

    accuracy                           0.97     20000
   macro avg       0.99      0.84      0.90     20000
weighted avg       0.97      0.97      0.97     20000



We can do better with a decision tree model (positive-class recall rises from 0.62 to 0.67 and precision from 0.86 to 1.00), but this is still a hard problem.