In [1]:
import pandas as pd

# Columns passed as features to the embedding and to the classifiers below.
COLUMNS = [
    'HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker', 'Stroke',
    'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
    'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth',
    'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education', 'Income',
]
# Name of the label column.
TARGET = 'Diabetes_binary'
# Location of the CSV inside the Kaggle input directory.
DATA = '/kaggle/input/cdc-diabetes-health-indicators/diabetes_binary_5050split_health_indicators_BRFSS2015.csv'


# Read the file, then turn the label into booleans (True wherever the stored value is positive).
df = pd.read_csv(DATA)
df[TARGET] = df[TARGET].gt(0)
df.head()

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,False,1.0,0.0,1.0,26.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,3.0,5.0,30.0,0.0,1.0,4.0,6.0,8.0
1,False,1.0,1.0,1.0,26.0,1.0,1.0,0.0,0.0,1.0,...,1.0,0.0,3.0,0.0,0.0,0.0,1.0,12.0,6.0,8.0
2,False,0.0,0.0,1.0,26.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,1.0,0.0,10.0,0.0,1.0,13.0,6.0,8.0
3,False,1.0,1.0,1.0,28.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,3.0,0.0,3.0,0.0,1.0,11.0,6.0,8.0
4,False,0.0,0.0,1.0,29.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,8.0,5.0,8.0


We already know from the data card that the target classes are balanced.

Let's use dimensionality reduction and view a sample to see whether positive and negative instances look similar to or different from one another.

In [2]:
import arrow
from umap import UMAP

# Project the feature columns onto two dimensions with UMAP and report how long it took.
# NOTE(review): the features are passed in unscaled; for a distance-based embedding,
# wide-range columns such as BMI may outweigh the 0/1 flags -- consider standardizing
# first (TODO confirm the effect on the plot).
time_start = arrow.now()
umap = UMAP(
    n_epochs=500,
    low_memory=False,
    n_jobs=1,
    random_state=2024,  # fixed seed so the layout is repeatable
    verbose=True,
)
# The two embedding coordinates are stored on the frame as columns 'x' and 'y'.
df[['x', 'y']] = umap.fit_transform(df[COLUMNS])
print('done with UMAP in {}'.format(arrow.now() - time_start))

2024-07-27 20:12:52.080902: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-27 20:12:52.081062: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-27 20:12:52.278049: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


UMAP(low_memory=False, n_epochs=500, n_jobs=1, random_state=2024, verbose=True)
Sat Jul 27 20:13:04 2024 Construct fuzzy simplicial set
Sat Jul 27 20:13:04 2024 Finding Nearest Neighbors
Sat Jul 27 20:13:04 2024 Building RP forest with 18 trees
Sat Jul 27 20:13:11 2024 NN descent for 16 iterations
	 1  /  16
	 2  /  16
	 3  /  16
	Stopping threshold met -- exiting after 3 iterations
Sat Jul 27 20:13:35 2024 Finished Nearest Neighbor Search
Sat Jul 27 20:13:39 2024 Construct embedding


Epochs completed:   0%|            0/500 [00:00]

	completed  0  /  500 epochs
	completed  50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  500 epochs
	completed  250  /  500 epochs
	completed  300  /  500 epochs
	completed  350  /  500 epochs
	completed  400  /  500 epochs
	completed  450  /  500 epochs
Sat Jul 27 20:16:10 2024 Finished embedding
done with UMAP in 0:03:05.883616


In [3]:
import warnings
from plotly import express

# Hide FutureWarning messages; the filter stays installed for every cell that runs after this one.
warnings.filterwarnings('ignore', category=FutureWarning)

# Plot a seeded 1,000-row sample of the embedding, coloured and split into facets by the target.
express.scatter(
    df.sample(n=1000, random_state=2024),
    x='x',
    y='y',
    facet_col=TARGET,
    color=TARGET,
)

If we take a sample, it looks like there's a signal in the data; if we plot all 71K points, the signal is hard to see. Let's build a model.

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Hold out 20% of the rows for evaluation; stratifying on the target keeps the
# class proportions the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    df[COLUMNS],
    df[TARGET],
    test_size=0.2,
    stratify=df[TARGET],
    random_state=2024,
)

# Fit a logistic regression with a very tight tolerance and a raised iteration cap,
# then report how many iterations the solver actually used.
model = LogisticRegression(tol=1e-12, max_iter=1000)
model.fit(X_train, y_train)
print('model fit in {} iterations'.format(model.n_iter_[0]))

# Accuracy on the held-out split.
print('accuracy: {:5.4f}'.format(accuracy_score(y_test, model.predict(X_test))))

model fit in 377 iterations
accuracy: 0.7445


Is an accuracy of about 0.74 good or bad? Because our classes are balanced, always predicting a single class would score about 0.50, so we can be confident the model is better than nothing. Let's look at the classification report.

In [5]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 for the logistic regression on the held-out split.
print(
    classification_report(
        y_test,
        model.predict(X_test),
        zero_division=0,
    )
)

              precision    recall  f1-score   support

       False       0.75      0.73      0.74      7070
        True       0.74      0.76      0.75      7069

    accuracy                           0.74     14139
   macro avg       0.74      0.74      0.74     14139
weighted avg       0.74      0.74      0.74     14139



The good/bad news is that our model is getting roughly the same results for both classes. Can we do better with another classifier?

In [6]:
from sklearn.neural_network import MLPClassifier

# Train a multi-layer perceptron on the same split and print the same report for comparison.
# NOTE(review): the inputs are not standardized here; scikit-learn's MLP guidance
# recommends scaling features -- worth checking whether it changes the result.
neural = MLPClassifier(alpha=1, max_iter=1000, random_state=2024)
neural.fit(X_train, y_train)
print(classification_report(y_test, neural.predict(X_test), zero_division=0))

              precision    recall  f1-score   support

       False       0.80      0.65      0.72      7070
        True       0.71      0.84      0.77      7069

    accuracy                           0.75     14139
   macro avg       0.76      0.75      0.74     14139
weighted avg       0.76      0.75      0.74     14139



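We can perhaps do slightly better with an sklearn neural net (accuracy 0.75 vs. 0.74), but at the cost of less balanced recall (0.65 for the negative class vs. 0.84 for the positive class).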