In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import plotly.express as px


https://www.kaggle.com/datasets/alexteboul/diabetes-health-indicators-dataset/data

https://www.kaggle.com/code/alexteboul/diabetes-health-indicators-dataset-notebook

https://www.cdc.gov/brfss/annual_data/2015/pdf/codebook15_llcp.pdf

We want to... predict if someone is pre-diabetic or has diabetes from certain health metrics

### Step 1: Import Data
___

In [2]:
diabetes_df = pd.read_csv('./Data/CUSTOM_diabetes_012_health_indicators_BRFSS2015.csv')

#### Summarize what each column means

### Step 2: Explore & Summarize Data
___

In [3]:
diabetes_df['Diabetes_012'].value_counts()

Diabetes_012
0.0    208156
2.0     34340
1.0      4476
Name: count, dtype: int64

In [4]:
diabetes_df.head()

Unnamed: 0,Diabetes_012,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,Arthritis,Depression,Race_white,Race_black,Race_AMI_AKN,Race_asian,Race_HI_PI,Race_other,Race_multi,Race_hispanic
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1,0,0,0,0,0,0,0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1,0,0,0,0,0,0,0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,1,0,0,0,0,0,0,0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,1,0,0,0,0,0,0,0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0,0,0,0,0,0,1,0


In [5]:
diabetes_df.describe()

Unnamed: 0,Diabetes_012,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,Arthritis,Depression,Race_white,Race_black,Race_AMI_AKN,Race_asian,Race_HI_PI,Race_other,Race_multi,Race_hispanic
count,246972.0,246972.0,246972.0,246972.0,246972.0,246972.0,246972.0,246972.0,246972.0,246972.0,...,246972.0,246972.0,246972.0,246972.0,246972.0,246972.0,246972.0,246972.0,246972.0,246972.0
mean,0.296212,0.429239,0.424226,0.963672,28.37933,0.441908,0.040162,0.093492,0.75824,0.63504,...,0.356478,0.199043,0.803119,0.071996,0.012876,0.020006,0.002964,0.00362,0.017962,0.067457
std,0.697539,0.494969,0.494226,0.187105,6.592675,0.496615,0.19634,0.291122,0.428151,0.48142,...,0.47896,0.399281,0.397642,0.258482,0.11274,0.140022,0.054361,0.060056,0.132812,0.250812
min,0.0,0.0,0.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,1.0,24.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,1.0,1.0,1.0,31.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,2.0,1.0,1.0,1.0,98.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### Step 3: Clean Data / Adjust For Imbalance
___

In [6]:
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

In [7]:
#split the data 
X = diabetes_df.drop('Diabetes_012', axis=1)
y = diabetes_df['Diabetes_012']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [8]:
#oversample the minority class
ros = RandomOverSampler(random_state=0)
X_oversampled, y_oversampled = ros.fit_resample(X_train, y_train)

In [9]:
#undersample the majority class
rus = RandomUnderSampler(random_state=0)
X_undersampled, y_undersampled = rus.fit_resample(X_train, y_train)

### Step 4: Prepare for Modeling
Split, Scale, etc.
___

In [10]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA    

In [11]:
# scaler_ros = StandardScaler().fit(X_oversampled)
# X_oversampled_scaled = scaler_ros.transform(X_oversampled)
# x_test_scaled_ros = scaler_ros.transform(X_test)

# scaler_rus = StandardScaler().fit(X_undersampled)
# X_undersampled_scaled = scaler_rus.transform(X_undersampled)
# x_test_scaled_rus = scaler_rus.transform(X_test)

### Step 5: Test Various models with default params
___

In [12]:
#Import unsupervised models
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.cluster import AgglomerativeClustering
from sklearn.neighbors import KNeighborsClassifier

In [13]:
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, mean_squared_error, r2_score

In [18]:
lr_ros = LogisticRegression(max_iter=10000).fit(X_oversampled, y_oversampled)

lr_rus = LogisticRegression(max_iter=10000).fit(X_undersampled, y_undersampled)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [15]:
print(classification_report(y_oversampled, lr_rus.predict(X_oversampled)))

              precision    recall  f1-score   support

         0.0       0.61      0.66      0.63    156066
         1.0       0.45      0.34      0.39    156066
         2.0       0.51      0.59      0.55    156066

    accuracy                           0.53    468198
   macro avg       0.52      0.53      0.52    468198
weighted avg       0.52      0.53      0.52    468198



In [16]:
print(classification_report(y_test, lr_ros.predict(X_test)))

              precision    recall  f1-score   support

         0.0       0.95      0.66      0.78     52090
         1.0       0.03      0.34      0.06      1091
         2.0       0.35      0.58      0.44      8562

    accuracy                           0.65     61743
   macro avg       0.45      0.53      0.43     61743
weighted avg       0.85      0.65      0.72     61743



In [17]:
print(classification_report(y_undersampled, lr_rus.predict(X_undersampled_scaled)))

NameError: name 'X_undersampled_scaled' is not defined

In [None]:
print(classification_report(y_test, lr_rus.predict(x_test_scaled_rus)))

              precision    recall  f1-score   support

         0.0       0.95      0.66      0.78     52090
         1.0       0.03      0.33      0.06      1091
         2.0       0.35      0.58      0.44      8562

    accuracy                           0.65     61743
   macro avg       0.44      0.52      0.43     61743
weighted avg       0.85      0.65      0.72     61743



In [None]:
#random trees
rf_ros = RandomForestClassifier().fit(X_oversampled_scaled, y_oversampled)

rf_rus = RandomForestClassifier().fit(X_undersampled_scaled, y_undersampled)

In [None]:
#classification report
print(classification_report(y_test, rf_ros.predict(x_test_scaled_ros)))

              precision    recall  f1-score   support

         0.0       0.88      0.94      0.91     52090
         1.0       0.01      0.00      0.00      1091
         2.0       0.44      0.31      0.37      8562

    accuracy                           0.83     61743
   macro avg       0.44      0.42      0.43     61743
weighted avg       0.80      0.83      0.82     61743



In [None]:
#rus classification report
print(classification_report(y_test, rf_rus.predict(x_test_scaled_rus)))

              precision    recall  f1-score   support

         0.0       0.95      0.61      0.74     52090
         1.0       0.03      0.38      0.05      1091
         2.0       0.33      0.56      0.42      8562

    accuracy                           0.60     61743
   macro avg       0.44      0.52      0.41     61743
weighted avg       0.85      0.60      0.69     61743



### Step 6: Select Preferred Model & Optimize
___

In [None]:
from sklearn.model_selection import GridSearchCV


#### Step X: PCA, confidence, various accuracy metrics, (random forest impacts), feature correlation, unsupervised / supervised approach

Separate Parallel tasks:
* pca
* correlation
* clustering