# End to End Model Development and Deployment

In [None]:
# data manipulation and EDA libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

# data preprocessing libraries
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split,GridSearchCV
from imblearn.over_sampling import SMOTE

# data modelling libraries
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

# data metrics 
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score,roc_auc_score,roc_curve
from sklearn.metrics import classification_report,confusion_matrix

# Model interpretation and deployment libraries
import shap
import pickle
from sklearn.pipeline import Pipeline
import streamlit as st
print("All libraries are imported")

In [None]:
# Load the raw dataset; the relative path means diabetes.csv must sit in the
# notebook's working directory.
data=pd.read_csv("diabetes.csv")
# Preview the first rows as the cell output
data.head()

# Step 2: Data Sanity Check
- Get the basic info of the data.
- Look for null values
- Look for corrupted data
- Get the data summary statistics (both numerical and categorical)
- Look for erroneous values in the data

In [None]:
# Report the dimensions of the raw dataset as (rows, columns)
data_shape=data.shape
n_rows,n_cols=data_shape
print("Rows =",n_rows,"\nColumns =",n_cols)

In [None]:
# Print the basic info (columns, non-null counts, dtypes, memory usage).
# DataFrame.info() writes its report to stdout and returns None, so there is
# nothing useful to capture or display from its return value.
data.info()

# Show the dtype of every column as the cell output
dtype=data.dtypes
dtype

In [None]:
# Number of distinct levels in the categorical Outcome column
data.Outcome.nunique()

In [None]:
# Frequency of each raw Outcome label (shows which spellings need merging)
data["Outcome"].value_counts()

In [None]:
# Check for nulls (count per column) and fully duplicated rows (single count)
nulls=data.isnull().sum()
dups=data.duplicated().sum()
# Display both results together as the cell output
nulls,dups

In [None]:
# Look for corrupt (non-numeric) entries in the feature columns.
# The target column holds text labels by design, so it is excluded here.
feature_cols=data.columns.drop("Outcome")
# Coerce every feature to numeric: anything that cannot be parsed becomes NaN.
coerced=data[feature_cols].apply(pd.to_numeric,errors="coerce")
# A cell is corrupt when it was present in the raw data but failed to parse
# (genuine missing values are reported by the null check instead).
corrupt_mask=coerced.isna()&data[feature_cols].notna()
# Show every row that contains at least one corrupt cell (empty frame = clean)
data[corrupt_mask.any(axis=1)]

In [None]:
# Summary statistics, transposed so that each variable is a row:
# numerical columns first, then the categorical (object-dtype) columns.
num_stats=data.describe().transpose()
cat_stats=data.describe(include="O").transpose()
print(num_stats,cat_stats,sep="\n")

**Data Summary**
1. The dataset has 768 rows and 9 columns
2. The dataset has 8 numerical variables(int 64 and float 64) and one categorical variable 
3. **The categorical variable Outcome has 4 levels, which need to be cleaned and reduced to 2 levels (Yes-1/No-0)**
4. There are no missing values or duplicates
5. There are no corrupt characters
6. **Many columns have a minimum value of 0, which is physiologically not feasible, so we have to impute these zeros with the column medians.**

## Step 3: Data Cleaning Step
- encode the categorical outcome variable
- impute columns with minimum value 0

In [None]:
# Work on a copy so that the raw `data` frame stays untouched by the cleaning steps
df=data.copy()

In [None]:
# Columns in which a value of 0 is not physiologically feasible and therefore
# stands for a missing measurement.
cols=[ 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
# Replace the zeros with the median of the observed (non-zero) values only.
# Including the placeholder zeros in the median would drag the imputed value
# down, badly so for columns where a large share of the entries are zero.
zerofill=lambda x:x.replace(0,x[x!=0].median())
df[cols]=df[cols].apply(zerofill,axis=0)

In [None]:
# Column-wise minimum of every column, to verify the imputed columns no longer bottom out at 0
df.min()

In [None]:
# Categorical encoding of the target: collapse the four raw labels into 1/0
d={"Yes":1,"Tested_Positive":1,"No":0,"Tested_Negative":0}
# Map from the untouched raw column rather than from df["Outcome"], so that
# re-running this cell is safe: mapping an already-encoded column would turn
# every value into NaN because 0/1 are not keys of the dictionary.
encoded=data["Outcome"].map(d)
# Series.map yields NaN for any label missing from the dictionary; fail loudly
# here instead of handing NaN targets to the resampling / modelling steps.
unmapped=data.loc[encoded.isna(),"Outcome"].unique()
if len(unmapped):
    raise ValueError("Unmapped Outcome labels: {}".format(list(unmapped)))
df["Outcome"]=encoded.astype(int)
# Class balance after encoding
df["Outcome"].value_counts()

In [None]:
# Step 4: Exploratory data analysis - start with a grid of histograms, one per column of df
df.hist()
# Avoid overlapping titles/tick labels between the subplots
plt.tight_layout()
plt.show()

In [None]:
# Create individual box plots and histplots
def histplot_boxplot(data, feature, figsize=(12, 7), bins=None):
    print('Univariate for ...', feature)
    fig, (ax_box, ax_hist) = plt.subplots(nrows=2, sharex=True, figsize=figsize)
    
    sns.boxplot(data=data, x=feature, color='violet', ax=ax_box, showmeans=True)
    sns.histplot(data=data, x=feature, ax=ax_hist, bins=bins) if bins else sns.histplot(data=data, x=feature, ax=ax_hist)
    plt.axvline(data[feature].mean(), color='green', linestyle='--')  # Use mean instead of data[feature]
    plt.axvline(data[feature].median(), color='black', linestyle='-')
    plt.show()

In [None]:
# Draw the box plot + histogram pair for every non-object (numeric) column of df
for col in df.select_dtypes(exclude='O').columns:
    histplot_boxplot(data=df, feature=col)

In [None]:
# Count, per column, the values falling outside the 1.5*IQR fences
num_outliers={}
for col in df.columns:
    series=df[col]
    q1,q3=series.quantile(0.25),series.quantile(0.75)
    iqr=q3-q1
    lower_fence=q1-1.5*iqr
    upper_fence=q3+1.5*iqr
    # Strict comparisons: values exactly on a fence are not outliers
    outliers=series.lt(lower_fence)|series.gt(upper_fence)
    num_outliers[col]=outliers.sum()
num_outliers

In [None]:
# Univariate bar chart for the categorical outcome, annotated with percentages
plt.figure(figsize=(12,7))
# Pass the frame and column by keyword: a positional Series is interpreted
# differently across seaborn versions (as `x` in old releases, as `data` in
# newer ones), so the keyword form is the only unambiguous one.
ax=sns.countplot(data=df,x="Outcome",color="orange")
for p in ax.patches:
    # Bar height is the class count; label it with its share of all rows,
    # centred horizontally on the bar and sitting just above its top edge.
    height=p.get_height()
    center=p.get_x()+p.get_width()/2
    ax.annotate("{:.2g}%".format(height*100/len(df)),(center,height),ha="center",va="bottom")
plt.title("Univariate Bar Chart for Outcome")
plt.show()

In [None]:
# List the column names (copied into the feature list used for the bivariate plots)
df.columns

In [None]:
# Bivariate view: mean of each feature per outcome class, one bar chart per feature
cols=['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age']
for col in cols:
    print(f"Bivariates between outcome and {col}")
    class_means=df.groupby("Outcome")[col].mean()
    class_means.plot.bar(color="orange")
    plt.ylabel(col)
    plt.show()

In [None]:
# Pairwise relationships between all columns of df, coloured by outcome class
sns.pairplot(df,hue="Outcome")
plt.show()

In [None]:
# Annotated correlation heatmap; the colour scale is pinned to [-1, +1] so
# that colours are comparable across the full correlation range.
sns.heatmap(df.corr(),annot=True,cmap="Spectral",vmax=+1,vmin=-1)

**Observations**
1. Pregnancies, Insulin, DiabetesPedigreeFunction and Age are right-skewed.
2. BloodPressure, Insulin, SkinThickness and DiabetesPedigreeFunction have many outliers.
3. Outlier counts have been obtained, but we will not treat these outliers.
4. The Outcome variable is highly imbalanced, with 65% having 0 and 35% having 1.
5. Women with higher Pregnancies, Glucose, BMI, Age and DiabetesPedigreeFunction are more prone to diabetes. To confirm this we will use pairplots and heatmaps.
6. Based on the KDE plots, the distributions of Pregnancies, Glucose, Age and DiabetesPedigreeFunction differ markedly between the two outcome classes, suggesting that they are risk factors for diabetes.
7. The scatter plots show a strong positive trend between Glucose and Insulin, Glucose and BMI, and Glucose and Age. These may be risk factors for diabetes. We confirm with a heatmap.
8. Heatmap shows that Glucose,BMI and Age are risk factors of Diabetes.

### Step 5: Data Preprocessing
- Separate features and label
- Do the label encoding 
- Solve for Data_imbalance
- Train_test_split
- Feature Scaling

In [None]:
def process(data,label):
    """Split a frame into train/test sets and balance the training classes.

    Parameters
    ----------
    data : DataFrame containing the feature columns and the target column.
    label : name of the target column in ``data``.

    Returns
    -------
    x_train, x_test, y_train, y_test
        An 80/20 stratified split of ``data``. Only the training part is
        oversampled with SMOTE; the test part keeps the original class ratio.
    """
    # Separate the features and label (use the arguments, not notebook globals)
    X=data.drop(label,axis=1)
    y=data[label]
    # Split first; stratify keeps the ratio of the classes in train and test
    x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42,stratify=y)
    # Solve data imbalance on the training set only. Resampling before the
    # split would place synthetic points interpolated from future test rows
    # into the training data and make the test scores optimistic.
    # The fixed seed makes the synthetic samples reproducible between runs.
    sm=SMOTE(random_state=42)
    x_train,y_train=sm.fit_resample(x_train,y_train)
    return x_train,x_test,y_train,y_test

In [None]:
# Build the train/test splits from the cleaned frame, with "Outcome" as the target
x_train,x_test,y_train,y_test=process(df,label="Outcome")

In [None]:
# Scale the features.
# NOTE: this cell overwrites x_train/x_test with their scaled versions, so
# running it twice in a row would scale already-scaled data.
sc=StandardScaler()
# fit_transform: learn the mean and std from the training data, then apply them to it
x_train=sc.fit_transform(x_train)
# transform only: reuse the training mean and std on the test data, so that
# no statistics are learned from the test set
x_test=sc.transform(x_test)

**We have preprocessed the data**

### Step 6: Fit and Evaluate ML Algorithms

In [None]:
# create a metrics function
def print_metrics(y_test,y_pred,model_name):
    print("Metrics for model...",model_name)
    print(" ")
    print("Accuracy Score=",accuracy_score(y_test,y_pred))
    print(" ")
    print("Recall Score=",recall_score(y_test,y_pred))
    print(" ")
    print("Precision Score=",precision_score(y_test,y_pred))
    print(" ")
    print("f1 Score=",f1_score(y_test,y_pred))
    print(" ")
    print("ROC AUC Score=",roc_auc_score(y_test,y_pred))
    print(" ")
    print("Confusion Matrix")
    print(confusion_matrix(y_test,y_pred))
    print(" ")
    print("Classification Report")
    print(classification_report(y_test,y_pred))

In [None]:
# Fit and evaluate a baseline KNN model using the estimator's default settings
knn=KNeighborsClassifier()
knn.fit(x_train,y_train)
y_pred=knn.predict(x_test)
print_metrics(y_test,y_pred,"KNN")

In [None]:
plt.style.use('fivethirtyeight') # switch all following plots to matplotlib's 'fivethirtyeight' style sheet

In [None]:
# Model complexity curves: train/test accuracy of KNN for k = 1..11
neighbors=np.arange(1,12)

# Fit one model per candidate k and collect (train accuracy, test accuracy)
accuracy_pairs=[]
for k in neighbors:
    knn=KNeighborsClassifier(n_neighbors=k).fit(x_train,y_train)
    accuracy_pairs.append((knn.score(x_train,y_train),knn.score(x_test,y_test)))
train_accuracies,test_accuracies=np.array(accuracy_pairs).T

# Plot both curves against the number of neighbours
for curve,curve_label in ((train_accuracies,"Training Metrics"),
                          (test_accuracies,"Test Metrics")):
    plt.plot(neighbors,curve,label=curve_label)
plt.legend()
plt.title("Model Complexity Curves")
plt.xlabel("Number of Neighbors")
plt.ylabel("Accuracy")
plt.show()

In [None]:
# Refit KNN with k=9
# NOTE(review): k=9 was presumably read off the complexity curves plotted earlier - confirm
knn=KNeighborsClassifier(n_neighbors=9)
knn.fit(x_train,y_train)
y_pred=knn.predict(x_test)
print_metrics(y_test,y_pred,"KNN")

In [None]:
# Fit all models with default settings to find the best candidate to optimize
clfs={"logreg":LogisticRegression(),
    "knn":KNeighborsClassifier(),
     "naive bayes":GaussianNB(),
     "decision tree":DecisionTreeClassifier(),
     "rfc":RandomForestClassifier(),
     "ABC":AdaBoostClassifier(),
     "GBC":GradientBoostingClassifier(),
     "SVM":SVC(),
     "XGB":XGBClassifier()}
# Collect one record per model and build the report frame once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies it on every iteration.
report_rows=[]
for clf_name,clf in clfs.items():
    clf.fit(x_train,y_train)
    y_pred=clf.predict(x_test)
    print("Fitting the model ...",clf_name)
    report_rows.append({"Model Name":clf_name,
                        "Accuracy":accuracy_score(y_test,y_pred),
                        "Recall":recall_score(y_test,y_pred),
                        "Precision":precision_score(y_test,y_pred),
                        "F1 Score":f1_score(y_test,y_pred)})
models_report=pd.DataFrame(report_rows,columns=["Model Name","Accuracy","Recall","Precision","F1 Score"])
# Best model (highest F1) first
models_report=models_report.sort_values(by="F1 Score",ascending=False)
print(models_report)

### Step 7: Model Optimizations

**Random Forest came out to be the best and we will optimize it.**

In [None]:
# Fit a default random forest to the data as the baseline before tuning
rfc=RandomForestClassifier()
rfc.fit(x_train,y_train)
y_pred=rfc.predict(x_test)
print_metrics(y_test,y_pred,"RFC")

In [None]:
# Check for overfitting: a train accuracy far above the test accuracy is the warning sign
train_acc=rfc.score(x_train,y_train)
test_acc=rfc.score(x_test,y_test)
train_acc,test_acc

There appears to be overfitting, which needs to be addressed; this could be because we balanced the data.

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
%%time
param_dict={"n_estimators":range(100,1000,50),
           "min_samples_leaf":range(1,5),
           "min_samples_split":range(2,10,2),
           "max_depth":range(10,100,5),
           "max_features":["auto","sqrt","log2"],
           "criterion":["gini","entropy"]}
n_folds=10
rs=RandomizedSearchCV(estimator=rfc,param_distributions=param_dict,scoring="f1",random_state=42,n_jobs=-1,n_iter=100,cv=n_folds)
rs.fit(x_train,y_train)

In [None]:
# Mean cross-validated score of the best configuration (F1, since the search uses scoring="f1")
rs.best_score_

In [None]:
# Hyperparameter values of the best configuration found by the search
rs.best_params_

In [None]:
# The estimator refit with the best hyperparameters
rs.best_estimator_

In [None]:
# Check the tuned model for overfitting.
# rs.score() would report F1 here because the search was built with
# scoring="f1"; score the refit best estimator directly so these numbers are
# accuracies, comparable with the train/test accuracies of the untuned forest.
train_acc=rs.best_estimator_.score(x_train,y_train)
test_acc=rs.best_estimator_.score(x_test,y_test)
train_acc,test_acc

In [None]:
# Tuned rfc object with hard-coded hyperparameters
# NOTE(review): these values were presumably copied from rs.best_params_ of an
# earlier run - confirm they still match the current search result
rfc_tuned=RandomForestClassifier(criterion='entropy', max_depth=70, max_features='log2',
                       min_samples_split=4, n_estimators=950)
rfc_tuned.fit(x_train,y_train)
y_pred=rfc_tuned.predict(x_test)
print_metrics(y_test,y_pred,"rfc_tuned")

In [None]:
# Train vs. test accuracy of the tuned forest (overfitting check)
train_acc=rfc_tuned.score(x_train,y_train)
test_acc=rfc_tuned.score(x_test,y_test)
train_acc,test_acc

### Step 8: Prepare for deployment by creating a pipeline

In [None]:
from sklearn.pipeline import Pipeline

In [None]:
# Bundle scaling and the tuned random forest into one estimator, so that a
# single object performs both steps on raw feature values.
sc=StandardScaler()
# Same hard-coded hyperparameters as the tuned forest above
model=RandomForestClassifier(criterion='entropy', max_depth=70, max_features='log2',
                       min_samples_split=4, n_estimators=950)
steps=[("scaler",sc),("model",model)]
pipeline=Pipeline(steps) 
# Rebuild the splits from df: x_train/x_test were overwritten with scaled
# arrays earlier, and the pipeline's own scaler needs unscaled inputs.
x_train,x_test,y_train,y_test=process(df,label="Outcome")
pipeline.fit(x_train,y_train)
y_pred=pipeline.predict(x_test)
print_metrics(y_test,y_pred,"Pipeline")