# End to end machine learning with deployment 

# data exploration 

In [1]:
# import the libraries 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')



from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV,GridSearchCV
from imblearn.over_sampling import SMOTE

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier


from sklearn.svm import SVC

from xgboost import XGBClassifier


from sklearn.metrics import accuracy_score, precision_score, recall_score,roc_auc_score,\
f1_score,confusion_matrix,ConfusionMatrixDisplay,classification_report




In [3]:
import shap 


import pickle 

import streamlit as st

In [4]:
data=pd.read_csv(r'diabetes.csv')
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,Yes
1,1,85,66,29,0,26.6,0.351,31,No
2,8,183,64,0,0,23.3,0.672,32,Yes
3,1,89,66,23,94,28.1,0.167,21,No
4,0,137,40,35,168,43.1,2.288,33,Tested_Positive


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    object 
dtypes: float64(2), int64(6), object(1)
memory usage: 54.1+ KB


In [6]:
data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0


# DATA CLEANING 

-- cehck for null values 
-- check for duplicates
--    corrupt characters
-- nonsensical numerical values
-- inconsisitent labels in categorical column 

In [8]:
data.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [9]:
data.duplicated().sum()

0

In [10]:
data[~data.applymap(np.isreal).any(1)]

TypeError: NDFrame._add_numeric_operations.<locals>.any() takes 1 positional argument but 2 were given

the dataset has 768 rows and 9 columns with non duplicates and corrupt characters 

In [None]:
data['Outcome'].unique()

In [None]:
data.describe().T

There are zeros in minimum. these are missing values. 

In [None]:
data['Outcome'].value_counts()

** we need to impute the zero values in columns 1 to 6 and collapse the differemt labels in outcome into zero and one. 
** impute with median 
** collapse outcome to 0 and 1

In [None]:
# get a copy of data
df=data.copy()

In [None]:
df

In [None]:
zerofill=lambda x: x.replace(0, x.median())
cols=df.columns[1:6]
df[cols]=df[cols].apply(zerofill,0)

In [None]:
d={'Yes':1,'Tested_Positive':1,'No':0,'Tested_Negative':0}
df['Outcome']=df['Outcome'].map(d)

In [None]:
df

In [None]:
df['Outcome'].value_counts()

# EDA

In [None]:
-- univatiate analysis 
-- bivariate analysis
-- scatter plots 
-- correlation matrix
-- heat maps

In [None]:
df.hist()
plt.tight_layout()
plt.show()

In [None]:
we see that pregnancies, skinthickness,diabeticspedigreefunction,age are right skewed. 

# before transforming check with the domain experts 

In [None]:
for i in df.select_dtypes(exclude=object).columns:
    print('boxplot for the columns',i)
    sns.boxplot(data=df, x=i)
    plt.show()


In [None]:
ax=sns.countplot(x=df['Outcome'], color='green')
for p in ax.patches:
    x=p.get_bbox().get_points()[:,0]
    y=p.get_bbox().get_points()[1,1]
    ax.annotate("{:.3g}%".format(100.*y/len(df)), (x.mean(),y), ha='center', va='bottom')

In [None]:
for i in df.select_dtypes(exclude=object).columns:
    print('bivariate bargraph for', i)
    df.groupby('Outcome')[i].mean().plot(kind='bar')
    plt.ylabel(i)
    plt.show()

In [None]:
sns.pairplot(df)

In [None]:
sns.pairplot(df, hue='Outcome')

 we see strong correlation between BMI and skinthickness and then glucose and insulin which we may have to treat by dropping one in each pair after consulting the domain expert 

In [None]:
sns.heatmap(df.corr(), annot=True, cmap='bone', vmax=+1,vmin=-1,mask=np.triu(df.corr()))

** EDA observations -- it is a must  point all the analysis here 
1. univariate --- observations 
2.
3.

-----


# Preprocess the data for modelling 

In [None]:
#-- sep features and label 

X=df.drop('Outcome', axis=1)
y=df['Outcome']

In [None]:
# solve for data imbalance

sm=SMOTE()
X,y=sm.fit_resample(X,y)    # check with the client before it.  for reducing bias and variance we have to do this. 



In [None]:
y.value_counts()

## split the data

In [None]:
x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=101,stratify=y)

In [None]:
x_train.shape,x_test.shape,y_train.shape,y_test.shape

In [None]:
# can dio standard scaling also here 

# Modelling 

In [None]:
def print_metrics(y_test,y_pred,model_name):
    print('metrics for model', model_name)
    print("")
    print('accuracy=',accuracy_score(y_test,y_pred))
    print('')
    print('Recall=', recall_score(y_test,y_pred))
    print('')
    print('Precision=',precision_score(y_test,y_pred))
    print('')
    print('ROC score=', roc_auc_score(y_test,y_pred))
    print('')
    print('f1 score=', f1_score(y_test,y_pred))
    print("")
    print(confusion_matrix(y_test,y_pred))
    print('')
    print(classification_report(y_test,y_pred))
    

# knn model 

In [None]:
knn=KNeighborsClassifier()

In [None]:
knn.fit(x_train,y_train)

In [None]:
y_pred=knn.predict(x_test)

In [None]:
print_metrics(y_test,y_pred,'knn')

In [None]:
# fit all the models together 

classifiers ={'logreg':LogisticRegression(),
              'naive bayes':GaussianNB(),
              'decisiontree':DecisionTreeClassifier(),
              'randomforest':RandomForestClassifier(),
              'gradientboost':GradientBoostingClassifier(),
              'svm':SVC(),
             "xgboost":XGBClassifier()}
    
    
    
model_report=pd.DataFrame(columns=['model_name','accuracy','recall','precision','f1','roc'])

for classifier, classifier_name in list(zip(classifiers.values(), classifiers.keys())):
    classifier.fit(x_train,y_train)
    y_pred=classifier.predict(x_test)
    print('fitting the model ....',classifier_name)
    t=pd.Series({
        'model_name':classifier_name, 
        'accuracy':accuracy_score(y_test,y_pred),
        'recall':recall_score(y_test,y_pred),
        'precision':precision_score(y_test,y_pred),
        'f1':f1_score(y_test,y_pred),
        'roc':roc_auc_score(y_test,y_pred)
    })
    model_report=model_report.append(t,ignore_index=True)
    
model_report=model_report.sort_values(by='f1', ascending=False)

In [None]:
model_report

In [None]:
random forest has performed best 

In [None]:
rfc=RandomForestClassifier()
rfc.fit(x_train,y_train)

In [None]:
param_dist={'n_estimators':range(100,1000,100),
            'max_depth':range(10,100,5),
            'min_samples_leaf':range(1,10,1),
            'min_samples_split':range(2,20,2),
            'max_features':['log2','sqrt'],
            'criterion':['entropy','gini']}
n_folds=10
cv=RandomizedSearchCV(estimator=rfc,param_distributions=param_dist,n_jobs=-1,verbose=3,
                      cv=n_folds,scoring='f1',return_train_score=True,n_iter=10)
cv.fit(x_train,y_train)

In [None]:
cv.best_score_

In [None]:
cv.best_estimator_

In [None]:
# try with gridsearch 

In [None]:
import shap

In [None]:
best_rfc=cv.best_estimator_
value=shap.TreeExplainer(best_rfc).shap_values(x_test)
shap.summary_plot(value,x_train,plot_type='bar',feature_names=X.columns)


the interpretation also shows that glucose,, BMI = and age are stronger predictors of diabetics. 

In [None]:
param_dist={'n_estimators':range(100,1000,100),
            'max_depth':range(10,100,5),
            'min_samples_leaf':range(1,10,1),
            'min_samples_split':range(2,20,2),
            'max_features':['log2','sqrt'],
            'criterion':['entropy','gini']}
n_folds=10
#cv=GridSearchCV(estimator=rfc,param_grid==params,n_jobs=-1,verbose=3,
                      # cv=n_folds,scoring='f1',return_train_score=True)
cv = GridSearchCV(estimator=rfc, param_grid=param_dist, n_jobs=-1, verbose=3,
                  cv=n_folds, scoring='f1', return_train_score=True)
cv.fit(x_train,y_train)