# Diabetes Classification

Our dataset contains information about family conditions and personal health data. Based on this health information, we are building a classification model to predict whether a person is at risk of developing diabetes.
 

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
import warnings
# NOTE(review): this silences every warning for the rest of the notebook,
# including deprecation and convergence warnings — consider narrowing it.
warnings.filterwarnings('ignore')
# Show all columns when displaying wide DataFrames instead of truncating them.
pd.set_option("display.max_columns",None)

### Importing Data

In [11]:
# Load the raw dataset; the path is relative to the notebook's working directory.
diabetics = pd.read_csv(filepath_or_buffer='Files/diabetes_dataset.csv')

# Preview the first five rows.
diabetics.head(n=5)

Unnamed: 0,age,gender,ethnicity,education_level,income_level,employment_status,smoking_status,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,sleep_hours_per_day,screen_time_hours_per_day,family_history_diabetes,hypertension_history,cardiovascular_history,bmi,waist_to_hip_ratio,systolic_bp,diastolic_bp,heart_rate,cholesterol_total,hdl_cholesterol,ldl_cholesterol,triglycerides,glucose_fasting,glucose_postprandial,insulin_level,hba1c,diabetes_risk_score,diabetes_stage,diagnosed_diabetes
0,58,Male,Asian,Highschool,Lower-Middle,Employed,Never,0,215,5.7,7.9,7.9,0,0,0,30.5,0.89,134,78,68,239,41,160,145,136,236,6.36,8.18,29.6,Type 2,1
1,48,Female,White,Highschool,Middle,Employed,Former,1,143,6.7,6.5,8.7,0,0,0,23.1,0.8,129,76,67,116,55,50,30,93,150,2.0,5.63,23.0,No Diabetes,0
2,60,Male,Hispanic,Highschool,Middle,Unemployed,Never,1,57,6.4,10.0,8.1,1,0,0,22.2,0.81,115,73,74,213,66,99,36,118,195,5.07,7.51,44.7,Type 2,1
3,74,Female,Black,Highschool,Low,Retired,Never,0,49,3.4,6.6,5.2,0,0,0,26.8,0.88,120,93,68,171,50,79,140,139,253,5.28,9.03,38.2,Type 2,1
4,46,Male,White,Graduate,Middle,Retired,Never,1,109,7.2,7.4,5.0,0,0,0,21.2,0.78,92,67,67,210,52,125,160,137,184,12.74,7.2,23.5,Type 2,1


### Data Inspection

In [13]:
# Dimensions of the raw dataset as (rows, columns).
diabetics.shape

(100000, 31)

In [14]:
# Per-column non-null counts and dtypes, used to check for missing values
# and to spot which columns are categorical (object dtype).
diabetics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 31 columns):
 #   Column                              Non-Null Count   Dtype  
---  ------                              --------------   -----  
 0   age                                 100000 non-null  int64  
 1   gender                              100000 non-null  object 
 2   ethnicity                           100000 non-null  object 
 3   education_level                     100000 non-null  object 
 4   income_level                        100000 non-null  object 
 5   employment_status                   100000 non-null  object 
 6   smoking_status                      100000 non-null  object 
 7   alcohol_consumption_per_week        100000 non-null  int64  
 8   physical_activity_minutes_per_week  100000 non-null  int64  
 9   diet_score                          100000 non-null  float64
 10  sleep_hours_per_day                 100000 non-null  float64
 11  screen_time_hours_per_day  

Our target is `diagnosed_diabetes`, so we do not need the columns that already indicate whether a patient has diabetes; keeping them would leak the answer into the model.

Therefore, we are removing `diabetes_risk_score` and `diabetes_stage`.

In [None]:
# Remove the columns that already indicate the diagnosis so they cannot be
# used as features. Reassigning instead of using inplace=True, together with
# errors='ignore', makes this cell idempotent: re-running it is a no-op rather
# than a KeyError on the already-removed columns.
diabetics = diabetics.drop(
    columns=['diabetes_risk_score', 'diabetes_stage'],
    errors='ignore',
)

In [47]:
# Summary statistics for the numeric columns, with extra upper percentiles
# (90/95/99) so the tail of each distribution can be compared with its maximum.
diabetics.describe(percentiles=[0.25,.5,.7,.9,.95,.99])

Unnamed: 0,age,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,sleep_hours_per_day,screen_time_hours_per_day,family_history_diabetes,hypertension_history,cardiovascular_history,bmi,waist_to_hip_ratio,systolic_bp,diastolic_bp,heart_rate,cholesterol_total,hdl_cholesterol,ldl_cholesterol,triglycerides,glucose_fasting,glucose_postprandial,insulin_level,hba1c,diagnosed_diabetes
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,50.12041,2.00367,118.91164,5.994787,6.997818,5.996468,0.21941,0.2508,0.0792,25.612653,0.856078,115.79961,75.23249,69.63287,185.97811,54.04279,103.00043,121.46265,111.11712,160.03505,9.061242,6.520776,0.59998
std,15.6046,1.417779,84.409662,1.780954,1.094622,2.468406,0.413849,0.433476,0.270052,3.586705,0.046837,14.284073,8.20425,8.371954,32.013005,10.267374,33.390256,43.372619,13.59561,30.935472,4.95406,0.813921,0.489904
min,18.0,0.0,0.0,0.0,3.0,0.5,0.0,0.0,0.0,15.0,0.67,90.0,50.0,40.0,100.0,20.0,50.0,30.0,60.0,70.0,2.0,4.0,0.0
25%,39.0,1.0,57.0,4.8,6.3,4.3,0.0,0.0,0.0,23.2,0.82,106.0,70.0,64.0,164.0,47.0,78.0,91.0,102.0,139.0,5.09,5.97,0.0
50%,50.0,2.0,100.0,6.0,7.0,6.0,0.0,0.0,0.0,25.6,0.86,116.0,75.0,70.0,186.0,54.0,102.0,121.0,111.0,160.0,8.79,6.52,1.0
70%,58.0,3.0,145.0,6.9,7.6,7.3,0.0,0.0,0.0,27.5,0.88,123.0,80.0,74.0,203.0,59.0,120.0,144.0,118.0,176.0,11.63,6.95,1.0
90%,71.0,4.0,232.0,8.3,8.4,9.2,1.0,1.0,0.0,30.2,0.92,135.0,86.0,80.0,227.0,67.0,147.0,178.0,129.0,200.0,15.77,7.57,1.0
95%,76.0,5.0,284.0,9.0,8.8,10.1,1.0,1.0,1.0,31.5,0.93,140.0,89.0,83.0,239.0,71.0,161.0,194.0,134.0,211.0,17.78,7.87,1.0
99%,87.0,6.0,394.0,10.0,9.5,11.8,1.0,1.0,1.0,34.0,0.97,150.0,94.0,89.0,261.0,78.0,185.0,224.0,143.0,233.0,21.45,8.42,1.0


There is no significant difference between the 99th percentile and the maximum value of each numeric column, which suggests that there are no extreme outliers.

### Data Preparation and preprocessing

Converting the categorical variables to numerical values using one-hot encoding (dummy features). The first level of each variable is dropped, so it serves as the reference category.

In [48]:
# Preview the data after dropping the diagnosis columns, before encoding.
diabetics.head()

Unnamed: 0,age,gender,ethnicity,education_level,income_level,employment_status,smoking_status,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,sleep_hours_per_day,screen_time_hours_per_day,family_history_diabetes,hypertension_history,cardiovascular_history,bmi,waist_to_hip_ratio,systolic_bp,diastolic_bp,heart_rate,cholesterol_total,hdl_cholesterol,ldl_cholesterol,triglycerides,glucose_fasting,glucose_postprandial,insulin_level,hba1c,diagnosed_diabetes
0,58,Male,Asian,Highschool,Lower-Middle,Employed,Never,0,215,5.7,7.9,7.9,0,0,0,30.5,0.89,134,78,68,239,41,160,145,136,236,6.36,8.18,1
1,48,Female,White,Highschool,Middle,Employed,Former,1,143,6.7,6.5,8.7,0,0,0,23.1,0.8,129,76,67,116,55,50,30,93,150,2.0,5.63,0
2,60,Male,Hispanic,Highschool,Middle,Unemployed,Never,1,57,6.4,10.0,8.1,1,0,0,22.2,0.81,115,73,74,213,66,99,36,118,195,5.07,7.51,1
3,74,Female,Black,Highschool,Low,Retired,Never,0,49,3.4,6.6,5.2,0,0,0,26.8,0.88,120,93,68,171,50,79,140,139,253,5.28,9.03,1
4,46,Male,White,Graduate,Middle,Retired,Never,1,109,7.2,7.4,5.0,0,0,0,21.2,0.78,92,67,67,210,52,125,160,137,184,12.74,7.2,1


In [50]:
# Categorical (object-dtype) columns that need to be one-hot encoded.
cat_cols = [
    'gender',
    'ethnicity',
    'education_level',
    'income_level',
    'employment_status',
    'smoking_status',
]

# One-hot encode, dropping the first level of each variable, and cast the
# resulting indicator columns to 0/1 integers.
dummies = (
    pd.get_dummies(diabetics[cat_cols], drop_first=True)
    .astype(int)
)
dummies.head()

Unnamed: 0,gender_Male,gender_Other,ethnicity_Black,ethnicity_Hispanic,ethnicity_Other,ethnicity_White,education_level_Highschool,education_level_No formal,education_level_Postgraduate,income_level_Low,income_level_Lower-Middle,income_level_Middle,income_level_Upper-Middle,employment_status_Retired,employment_status_Student,employment_status_Unemployed,smoking_status_Former,smoking_status_Never
0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1
1,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,1,0
2,1,0,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0,1
3,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1
4,1,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,1


Concatenating the dummy (one-hot encoded) columns onto the original dataset and dropping the original categorical columns.

In [52]:
# Replace the raw categorical columns with their one-hot encoded counterparts:
# the remaining original columns come first, followed by the dummy columns.
final_diabetics = pd.concat(
    [diabetics.drop(columns=cat_cols), dummies],
    axis=1,
)
final_diabetics.head()

Unnamed: 0,age,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,sleep_hours_per_day,screen_time_hours_per_day,family_history_diabetes,hypertension_history,cardiovascular_history,bmi,waist_to_hip_ratio,systolic_bp,diastolic_bp,heart_rate,cholesterol_total,hdl_cholesterol,ldl_cholesterol,triglycerides,glucose_fasting,glucose_postprandial,insulin_level,hba1c,diagnosed_diabetes,gender_Male,gender_Other,ethnicity_Black,ethnicity_Hispanic,ethnicity_Other,ethnicity_White,education_level_Highschool,education_level_No formal,education_level_Postgraduate,income_level_Low,income_level_Lower-Middle,income_level_Middle,income_level_Upper-Middle,employment_status_Retired,employment_status_Student,employment_status_Unemployed,smoking_status_Former,smoking_status_Never
0,58,0,215,5.7,7.9,7.9,0,0,0,30.5,0.89,134,78,68,239,41,160,145,136,236,6.36,8.18,1,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1
1,48,1,143,6.7,6.5,8.7,0,0,0,23.1,0.8,129,76,67,116,55,50,30,93,150,2.0,5.63,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,1,0
2,60,1,57,6.4,10.0,8.1,1,0,0,22.2,0.81,115,73,74,213,66,99,36,118,195,5.07,7.51,1,1,0,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0,1
3,74,0,49,3.4,6.6,5.2,0,0,0,26.8,0.88,120,93,68,171,50,79,140,139,253,5.28,9.03,1,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1
4,46,1,109,7.2,7.4,5.0,0,0,0,21.2,0.78,92,67,67,210,52,125,160,137,184,12.74,7.2,1,1,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,1


In [53]:
# Confirm that every column in the encoded dataset is numeric and non-null.
final_diabetics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 41 columns):
 #   Column                              Non-Null Count   Dtype  
---  ------                              --------------   -----  
 0   age                                 100000 non-null  int64  
 1   alcohol_consumption_per_week        100000 non-null  int64  
 2   physical_activity_minutes_per_week  100000 non-null  int64  
 3   diet_score                          100000 non-null  float64
 4   sleep_hours_per_day                 100000 non-null  float64
 5   screen_time_hours_per_day           100000 non-null  float64
 6   family_history_diabetes             100000 non-null  int64  
 7   hypertension_history                100000 non-null  int64  
 8   cardiovascular_history              100000 non-null  int64  
 9   bmi                                 100000 non-null  float64
 10  waist_to_hip_ratio                  100000 non-null  float64
 11  systolic_bp                

Now our dataset is ready for model building.

### Data splitting for model building

In [55]:
from sklearn.model_selection import train_test_split

# 70/30 split, stratified on the target so both parts keep the same class
# balance; the fixed random_state makes the split reproducible.
train, test = train_test_split(
    final_diabetics,
    train_size=0.7,
    random_state=87,
    stratify=final_diabetics['diagnosed_diabetes'],
)

In [None]:
# Print the target class proportions of the training set, then the test set,
# to verify that stratification preserved the class balance.
for subset in (train, test):
    print(subset['diagnosed_diabetes'].value_counts(normalize=True))

diagnosed_diabetes
1    0.599986
0    0.400014
Name: proportion, dtype: float64
diagnosed_diabetes
1    0.599967
0    0.400033
Name: proportion, dtype: float64


The data was properly split into training and testing sets: both keep the same class proportions (about 60% diagnosed, 40% not diagnosed).

In [None]:
X_train = 