In [2]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [3]:
# Load the raw diabetes dataset.
# The path is relative, so it resolves against the notebook's working directory.
DATA_FILE = 'diabetes_dataset.csv'
df = pd.read_csv(DATA_FILE)

# Preview the first rows to sanity-check the load
df.head()

Unnamed: 0,year,gender,age,location,race:AfricanAmerican,race:Asian,race:Caucasian,race:Hispanic,race:Other,hypertension,heart_disease,smoking_history,bmi,hbA1c_level,blood_glucose_level,diabetes
0,2020,Female,32.0,Alabama,0,0,0,0,1,0,0,never,27.32,5.0,100,0
1,2015,Female,29.0,Alabama,0,1,0,0,0,0,0,never,19.95,5.0,90,0
2,2015,Male,18.0,Alabama,0,0,0,0,1,0,0,never,23.76,4.8,160,0
3,2015,Male,41.0,Alabama,0,0,1,0,0,0,0,never,27.32,4.0,159,0
4,2016,Female,52.0,Alabama,1,0,0,0,0,0,0,never,23.75,6.5,90,0


In [4]:
# Summarise column names, non-null counts and dtypes to spot missing values and non-numeric columns
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 16 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   year                  100000 non-null  int64  
 1   gender                100000 non-null  object 
 2   age                   100000 non-null  float64
 3   location              100000 non-null  object 
 4   race:AfricanAmerican  100000 non-null  int64  
 5   race:Asian            100000 non-null  int64  
 6   race:Caucasian        100000 non-null  int64  
 7   race:Hispanic         100000 non-null  int64  
 8   race:Other            100000 non-null  int64  
 9   hypertension          100000 non-null  int64  
 10  heart_disease         100000 non-null  int64  
 11  smoking_history       100000 non-null  object 
 12  bmi                   100000 non-null  float64
 13  hbA1c_level           100000 non-null  float64
 14  blood_glucose_level   100000 non-null  int64  
 15  d

Examining the data:
- 3 categorical columns - 'gender', 'location', 'smoking_history'
- no missing (NA) values in any column
- our target variable 'diabetes' is already an int
- 0 = does not have diabetes, 1 = does


In [5]:
# Baseline estimator: a random forest with a fixed seed so repeated fits are comparable
model = RandomForestClassifier(random_state=42)

# Stand-alone one-hot encoder that drops the first level of each feature and emits ints.
# NOTE(review): the encoding cell further down builds its own OneHotEncoder, so this
# object does not appear to be used in the visible cells - confirm before relying on it.
ohe = OneHotEncoder(dtype='int', drop='first')


In [6]:
# Remove the columns judged not useful for the baseline:
#   - 'year'     : year the data was collected
#   - 'location'
# `df` itself is left untouched; the trimmed copy is what the model cells use.
columns_to_drop = ['location', 'year']
dropped_df = df.drop(columns=columns_to_drop)

In [7]:
# Separate the predictors from the target.
# Both are taken from `dropped_df` so X and y always come from the same frame, and the
# cell stays re-runnable after `df` is reassigned with renamed columns later in the notebook
# (at that point `df['diabetes']` no longer exists).

X = dropped_df.drop(columns='diabetes')
y = dropped_df['diabetes']

# Split into our test and train sets (default split size; seed fixed for reproducibility).
# NOTE(review): the target is imbalanced (~8.5% positives); consider `stratify=y` so both
# splits keep that ratio. Not applied here because it would change the recorded results.

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=27)

In [8]:
# One-hot encode the two string-valued predictors; every other column is passed through unchanged.

# Columns handed to the encoder
columns_to_encode = ['gender', 'smoking_history']

# handle_unknown='ignore' keeps transform() from failing on a category absent from the training split
categorical_step = ('encoder', OneHotEncoder(handle_unknown='ignore'), columns_to_encode)
ct = ColumnTransformer(transformers=[categorical_step], remainder='passthrough')

# Learn the categories from the training split only, then apply the same mapping to the test split
X_train_encoded = ct.fit_transform(X_train)
X_test_encoded = ct.transform(X_test)

In [9]:
# Fit the baseline model on the one-hot-encoded training features and their labels.

model.fit(X_train_encoded, y_train)

In [10]:
# Predict class labels for the held-out test features (encoded with the transformer fitted on train).

predictions = model.predict(X_test_encoded)

In [11]:
# Evaluate the baseline on the held-out split.
# All three metrics are computed first, then printed in the order:
# overall accuracy, per-class report, confusion matrix.
accuracy = accuracy_score(y_test, predictions)
baseline_report = classification_report(y_test, predictions)
baseline_confusion = confusion_matrix(y_test, predictions)

print(f"Accuracy: {accuracy}")
print(baseline_report)
print(baseline_confusion)

Accuracy: 0.96872
              precision    recall  f1-score   support

           0       0.97      1.00      0.98     22839
           1       0.96      0.66      0.79      2161

    accuracy                           0.97     25000
   macro avg       0.97      0.83      0.88     25000
weighted avg       0.97      0.97      0.97     25000

[[22784    55]
 [  727  1434]]


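Evaluation - Accuracy is high (about 0.97) without any cleaning, but that is driven by the majority class. Recall on the minority class (diabetes = 1) is only 0.66, i.e. 727 of the 2161 positive cases in the test set are missed.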

In [16]:
# Exploration and Cleanup

# Show the raw column names before renaming
print(df.columns)

# Map each raw column name to a human-readable display name
new_names = {
    # demographics
    'year': 'Year',
    'gender': 'Gender',
    'age': 'Age',
    'location': 'Location',
    # race indicator columns
    'race:AfricanAmerican': 'Race : African American',
    'race:Asian': 'Race : Asian',
    'race:Caucasian': 'Race : Caucasian',
    'race:Hispanic': 'Race : Hispanic',
    'race:Other': 'Race : Other',
    # medical history and measurements
    'hypertension': 'Hypertension',
    'heart_disease': 'Heart Disease',
    'smoking_history': 'Smoking History',
    'bmi': 'BMI',
    'hbA1c_level': 'hbA1c Level',
    'blood_glucose_level': 'Blood Glucose Level',
    # target
    'diabetes': 'Diabetes',
}

# rename() returns a new frame; keys that are not present are ignored by default,
# so running this cell a second time leaves the already-renamed columns as they are.
df = df.rename(new_names, axis='columns')

# Show the column names after renaming
df.columns

Index(['year', 'gender', 'age', 'location', 'race:AfricanAmerican',
       'race:Asian', 'race:Caucasian', 'race:Hispanic', 'race:Other',
       'hypertension', 'heart_disease', 'smoking_history', 'bmi',
       'hbA1c_level', 'blood_glucose_level', 'diabetes'],
      dtype='object')


Index(['Year', 'Gender', 'Age', 'Location', 'Race : African American',
       'Race : Asian', 'Race : Caucasian', 'Race : Hispanic', 'Race : Other',
       'Hypertension', 'Heart Disease', 'Smoking History', 'BMI',
       'hbA1c Level', 'Blood Glucose Level', 'Diabetes'],
      dtype='object')

In [14]:
# Checking for class imbalance
# `df` was reassigned with renamed columns in the clean-up cell above, so the target
# column is 'Diabetes' here; the old name 'diabetes' raises KeyError on Restart & Run All.

df['Diabetes'].value_counts(normalize=True)

diabetes
0    0.915
1    0.085
Name: proportion, dtype: float64

In [15]:
# Examine categorical columns
# `df` was reassigned with renamed columns in the clean-up cell above, so the renamed
# labels are used here; the old snake_case names raise KeyError on Restart & Run All.

# Absolute counts per smoking-history category
print(df['Smoking History'].value_counts())

# Share of each gender category
df['Gender'].value_counts(normalize=True)


smoking_history
No Info        35816
never          35095
former          9352
current         9286
not current     6447
ever            4004
Name: count, dtype: int64


gender
Female    0.58552
Male      0.41430
Other     0.00018
Name: proportion, dtype: float64

- smoking history has a "No Info" category