In [139]:
import pandas as pd 
import numpy as np
from sklearn .model_selection  import train_test_split
from sklearn .linear_model import LogisticRegression
from sklearn .compose import ColumnTransformer
from sklearn .preprocessing import OneHotEncoder,OrdinalEncoder,StandardScaler
from sklearn.preprocessing  import FunctionTransformer
from sklearn .pipeline import Pipeline
from sklearn .impute import SimpleImputer
from sklearn .tree import DecisionTreeClassifier

In [2]:
data=pd.read_csv(r"C:\Users\mdfir\Downloads\diabetes_dataset.csv")

In [3]:
data.shape

(100000, 31)

In [4]:
data.head()

Unnamed: 0,age,gender,ethnicity,education_level,income_level,employment_status,smoking_status,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,...,hdl_cholesterol,ldl_cholesterol,triglycerides,glucose_fasting,glucose_postprandial,insulin_level,hba1c,diabetes_risk_score,diabetes_stage,diagnosed_diabetes
0,58,Male,Asian,Highschool,Lower-Middle,Employed,Never,0,215,5.7,...,41,160,145,136,236,6.36,8.18,29.6,Type 2,1
1,48,Female,White,Highschool,Middle,Employed,Former,1,143,6.7,...,55,50,30,93,150,2.0,5.63,23.0,No Diabetes,0
2,60,Male,Hispanic,Highschool,Middle,Unemployed,Never,1,57,6.4,...,66,99,36,118,195,5.07,7.51,44.7,Type 2,1
3,74,Female,Black,Highschool,Low,Retired,Never,0,49,3.4,...,50,79,140,139,253,5.28,9.03,38.2,Type 2,1
4,46,Male,White,Graduate,Middle,Retired,Never,1,109,7.2,...,52,125,160,137,184,12.74,7.2,23.5,Type 2,1


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 31 columns):
 #   Column                              Non-Null Count   Dtype  
---  ------                              --------------   -----  
 0   age                                 100000 non-null  int64  
 1   gender                              100000 non-null  object 
 2   ethnicity                           100000 non-null  object 
 3   education_level                     100000 non-null  object 
 4   income_level                        100000 non-null  object 
 5   employment_status                   100000 non-null  object 
 6   smoking_status                      100000 non-null  object 
 7   alcohol_consumption_per_week        100000 non-null  int64  
 8   physical_activity_minutes_per_week  100000 non-null  int64  
 9   diet_score                          100000 non-null  float64
 10  sleep_hours_per_day                 100000 non-null  float64
 11  screen_time_hours_per_day  

In [6]:
data.isnull().sum()

age                                   0
gender                                0
ethnicity                             0
education_level                       0
income_level                          0
employment_status                     0
smoking_status                        0
alcohol_consumption_per_week          0
physical_activity_minutes_per_week    0
diet_score                            0
sleep_hours_per_day                   0
screen_time_hours_per_day             0
family_history_diabetes               0
hypertension_history                  0
cardiovascular_history                0
bmi                                   0
waist_to_hip_ratio                    0
systolic_bp                           0
diastolic_bp                          0
heart_rate                            0
cholesterol_total                     0
hdl_cholesterol                       0
ldl_cholesterol                       0
triglycerides                         0
glucose_fasting                       0


In [7]:
data.head(2)

Unnamed: 0,age,gender,ethnicity,education_level,income_level,employment_status,smoking_status,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,...,hdl_cholesterol,ldl_cholesterol,triglycerides,glucose_fasting,glucose_postprandial,insulin_level,hba1c,diabetes_risk_score,diabetes_stage,diagnosed_diabetes
0,58,Male,Asian,Highschool,Lower-Middle,Employed,Never,0,215,5.7,...,41,160,145,136,236,6.36,8.18,29.6,Type 2,1
1,48,Female,White,Highschool,Middle,Employed,Former,1,143,6.7,...,55,50,30,93,150,2.0,5.63,23.0,No Diabetes,0


In [8]:
data["gender"].value_counts()

gender
Female    50216
Male      47771
Other      2013
Name: count, dtype: int64

In [9]:
data["ethnicity"].value_counts()

ethnicity
White       44997
Hispanic    20103
Black       17986
Asian       11865
Other        5049
Name: count, dtype: int64

In [10]:
data["education_level"].value_counts()

education_level
Highschool      44891
Graduate        35037
Postgraduate    14972
No formal        5100
Name: count, dtype: int64

In [11]:
data["income_level"].value_counts()

income_level
Middle          35152
Lower-Middle    25150
Upper-Middle    19866
Low             14830
High             5002
Name: count, dtype: int64

In [12]:
data["diabetes_stage"].value_counts()

diabetes_stage
Type 2          59774
Pre-Diabetes    31845
No Diabetes      7981
Gestational       278
Type 1            122
Name: count, dtype: int64

In [13]:
x=data.iloc[:,0:30]
y=data.iloc[:,-1]

In [14]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=.2,random_state=42)

In [15]:
x_train.head(1)

Unnamed: 0,age,gender,ethnicity,education_level,income_level,employment_status,smoking_status,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,...,cholesterol_total,hdl_cholesterol,ldl_cholesterol,triglycerides,glucose_fasting,glucose_postprandial,insulin_level,hba1c,diabetes_risk_score,diabetes_stage
75220,62,Female,White,Graduate,Upper-Middle,Employed,Never,0,79,7.9,...,130,51,50,112,104,146,6.42,6.28,29.9,Pre-Diabetes


In [16]:
y_train.head(1)

75220    0
Name: diagnosed_diabetes, dtype: int64

# columns transform

In [93]:
step_0=ColumnTransformer([
    ("SimpleImputer",SimpleImputer(strategy='most_frequent'),slice(0,30))
],remainder="passthrough")

In [149]:
step_1=ColumnTransformer([
("onehotencoder",OneHotEncoder(sparse_output=False,handle_unknown="ignore"),[1,2,5,6]),

("ordinalencoder",OrdinalEncoder(categories=[['No Diabetes','Pre-Diabetes', 'Type 1','Type 2','Gestational']]),[29]),
    ("OrdinalEncoder_1",OrdinalEncoder(categories=[["No formal","Highschool","Graduate","Postgraduate"]]),[3]),
  ("OrdinalEncoder_2",OrdinalEncoder(categories=[["Low","Lower-Middle","Middle","Upper-Middle","High"]]),[4])
    
],remainder="passthrough")

In [151]:
step_2=ColumnTransformer([
    ("StandardScaler",StandardScaler(),slice(0,41))
])

In [153]:
step_3=FunctionTransformer(func=np.log1p)

In [155]:
step_4=DecisionTreeClassifier()

In [157]:
pip=Pipeline([
    ("step_0",step_0),
("step_1",step_1),
    ("step_2",step_2),
    ("step_3",step_3),
("step_4",step_4),

    
])

In [159]:
pip

0,1,2
,steps,"[('step_0', ...), ('step_1', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('SimpleImputer', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,transformers,"[('onehotencoder', ...), ('ordinalencoder', ...), ...]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,categories,"[['No Diabetes', 'Pre-Diabetes', ...]]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,categories,"[['No formal', 'Highschool', ...]]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,categories,"[['Low', 'Lower-Middle', ...]]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,transformers,"[('StandardScaler', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,func,<ufunc 'log1p'>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [161]:
pip.fit(x_train,y_train)

  return func(X, **(kw_args if kw_args else {}))


0,1,2
,steps,"[('step_0', ...), ('step_1', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('SimpleImputer', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,transformers,"[('onehotencoder', ...), ('ordinalencoder', ...), ...]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,categories,"[['No Diabetes', 'Pre-Diabetes', ...]]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,categories,"[['No formal', 'Highschool', ...]]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,categories,"[['Low', 'Lower-Middle', ...]]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,transformers,"[('StandardScaler', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,func,<ufunc 'log1p'>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [163]:
pred=pip.predict(x_test)

  return func(X, **(kw_args if kw_args else {}))


In [169]:
from sklearn.metrics import accuracy_score

In [171]:
score=accuracy_score(y_test,pred)

In [173]:
score

0.99935

In [175]:
from sklearn .model_selection import cross_val_score

In [189]:
cvs=cross_val_score(pip,x_train,y_train,cv=5)

  return func(X, **(kw_args if kw_args else {}))
  return func(X, **(kw_args if kw_args else {}))
  return func(X, **(kw_args if kw_args else {}))
  return func(X, **(kw_args if kw_args else {}))
  return func(X, **(kw_args if kw_args else {}))
  return func(X, **(kw_args if kw_args else {}))
  return func(X, **(kw_args if kw_args else {}))
  return func(X, **(kw_args if kw_args else {}))
  return func(X, **(kw_args if kw_args else {}))
  return func(X, **(kw_args if kw_args else {}))


In [190]:
cvs.mean()

0.9992875

In [193]:
import pandas as pd 
import numpy as np 
import seaborn as sns
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing  import FunctionTransformer
from sklearn.compose import ColumnTransformer
import matplotlib.pyplot as plt
import scipy.stats as stats

0,1,2
,steps,"[('step_0', ...), ('step_1', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('SimpleImputer', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,transformers,"[('onehotencoder', ...), ('ordinalencoder', ...), ...]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,categories,"[['No Diabetes', 'Pre-Diabetes', ...]]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,categories,"[['No formal', 'Highschool', ...]]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,categories,"[['Low', 'Lower-Middle', ...]]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,transformers,"[('StandardScaler', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,func,<ufunc 'log1p'>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0
