In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler,RobustScaler
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix 
import joblib
import pickle
import matplotlib

In [24]:
# Load the dataset from 'diabetes1.csv' (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer='diabetes1.csv')

In [25]:
# Preview the first rows to sanity-check the columns that were loaded.
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,111.8939,53.588416,32.789797,123.380177,31.097706,0.46216,45,0
1,0,136.416478,72.594942,35.44514,139.550451,32.076151,0.807133,28,1
2,2,135.808122,63.744751,5.0,15.0,31.060984,0.541002,50,0
3,7,99.926368,73.510164,28.299795,224.089913,24.144927,0.459101,40,1
4,2,168.933776,58.240023,36.646652,150.31959,32.51753,0.45448,30,1


In [26]:
# Summarise column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               4000 non-null   int64  
 1   Glucose                   4000 non-null   float64
 2   BloodPressure             4000 non-null   float64
 3   SkinThickness             4000 non-null   float64
 4   Insulin                   4000 non-null   float64
 5   BMI                       4000 non-null   float64
 6   DiabetesPedigreeFunction  4000 non-null   float64
 7   Age                       4000 non-null   int64  
 8   Outcome                   4000 non-null   int64  
dtypes: float64(6), int64(3)
memory usage: 281.4 KB


In [27]:
# Exclude the 'Pregnancies' column from the modelling frame.
# Rebinding `df` (instead of inplace=True) together with errors="ignore" keeps
# this cell idempotent: re-running it no longer raises KeyError once the
# column has already been removed.
# NOTE(review): the reason for excluding this feature is not recorded in the
# notebook - confirm it is intentional.
df = df.drop(columns=["Pregnancies"], errors="ignore")

In [28]:
# Separate the target column ('Outcome') from the remaining feature columns.
y = df['Outcome']
x = df.drop(columns=['Outcome'])

In [29]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    train_size=0.8,
    random_state=42,
)

In [30]:
# Two-step pipeline: robust feature scaling followed by a random forest.
# The step names are referenced elsewhere (e.g. the "model__" prefix used in
# the hyperparameter grid), so they are kept exactly as written.
pipeline = Pipeline(
    steps=[
        ('robustscalar', RobustScaler()),
        ('model', RandomForestClassifier(random_state=42)),
    ]
)

In [31]:
# Display the (unfitted) pipeline and the default parameters of each step.
pipeline

0,1,2
,steps,"[('robustscalar', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,with_centering,True
,with_scaling,True
,quantile_range,"(25.0, ...)"
,copy,True
,unit_variance,False

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [32]:
# Hyperparameter grid for the search. The "model__" prefix routes each entry
# to the pipeline step named 'model'. 3 x 3 x 3 = 27 candidate combinations.
param_grid = {
    "model__n_estimators": [100, 200, 300],
    "model__max_depth": [None, 8, 12],
    "model__min_samples_split": [2, 5, 10],
}

In [33]:
# Exhaustive search over `param_grid`, scored by accuracy with 5-fold
# cross-validation; n_jobs=-1 runs the fits in parallel on all available cores.
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
)

In [34]:
# Run the cross-validated search on the training split only; the test split
# stays untouched until evaluation.
grid.fit(X=xtrain, y=ytrain)

0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'model__max_depth': [None, 8, ...], 'model__min_samples_split': [2, 5, ...], 'model__n_estimators': [100, 200, ...]}"
,scoring,'accuracy'
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,with_centering,True
,with_scaling,True
,quantile_range,"(25.0, ...)"
,copy,True
,unit_variance,False

0,1,2
,n_estimators,300
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [35]:
# Keep the best pipeline found by the search (refit on the full training split,
# since GridSearchCV was created with its default refit=True).
model=grid.best_estimator_

In [36]:
# Display the selected pipeline; the repr shows the chosen hyperparameters.
model

0,1,2
,steps,"[('robustscalar', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,with_centering,True
,with_scaling,True
,quantile_range,"(25.0, ...)"
,copy,True
,unit_variance,False

0,1,2
,n_estimators,300
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [37]:
# Show the hyperparameter combination with the best mean cross-validated accuracy.
grid.best_params_

{'model__max_depth': None,
 'model__min_samples_split': 2,
 'model__n_estimators': 300}

In [38]:
# Predict class labels for the held-out test features.
ypred = model.predict(X=xtest)

In [39]:
# Mean accuracy on the training split (compare with the test score to gauge overfitting).
model.score(X=xtrain, y=ytrain)

1.0

In [40]:
# Mean accuracy on the held-out test split.
model.score(X=xtest, y=ytest)

0.99625

In [41]:
# Accuracy computed from the stored predictions; this is the same quantity as
# model.score(xtest, ytest) and serves as a cross-check.
accuracy_score(y_true=ytest, y_pred=ypred)

0.99625

In [42]:
# Per-class precision / recall / F1 on the test split.
# classification_report returns a multi-line string, so it is printed: as a
# bare expression the cell would only show the escaped repr with literal "\n".
print(classification_report(ytest, ypred))

'              precision    recall  f1-score   support\n\n           0       1.00      0.99      1.00       398\n           1       1.00      1.00      1.00       402\n\n    accuracy                           1.00       800\n   macro avg       1.00      1.00      1.00       800\nweighted avg       1.00      1.00      1.00       800\n'

In [43]:
# Persist the fitted pipeline (scaler + classifier) to disk with joblib.
joblib.dump(value=model, filename="model.pkl")

['model.pkl']

In [44]:
# Also persist the fitted pipeline with the standard-library pickle module.
# NOTE(review): the ".pkll" extension looks like a typo for ".pkl" - confirm
# before renaming, because "model.pkl" is already written by the joblib.dump
# cell and any consumer may be loading this exact file name.
with open("model.pkll", "wb") as model_file:
    pickle.dump(model, model_file)