In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Location of the raw diabetes CSV, relative to the notebook's working directory.
data_path = "..//data//diabetes.csv"

# Read the whole file into a DataFrame; `df` is the raw input for the cells below.
df = pd.read_csv(filepath_or_buffer=data_path)

# Preview the first five rows to sanity-check the columns that were loaded.
df.head(5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Separate the target column from the feature matrix.
y = df["Outcome"]
X = df.drop(columns="Outcome")

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Every feature column is routed through the same numeric preprocessing:
# mean-impute missing values, then standardise.
# NOTE(review): SimpleImputer only fills NaN by default, and the preview of the
# data shows zeros in SkinThickness / Insulin. If those zeros stand for missing
# measurements they are NOT imputed here — confirm with the data owner.
numeric_features = X.columns
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
])

preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
])

In [3]:
from sklearn.ensemble import RandomForestClassifier

# Bundle preprocessing and the model into one estimator so that the imputer and
# scaler are fitted together with the classifier on the training split.
rf_classifier = RandomForestClassifier(random_state=42)
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", rf_classifier),
])

# Fit on the training split; as the last expression, the fitted pipeline is
# also what the cell displays.
pipeline.fit(X_train, y_train)

In [4]:
from sklearn.metrics import classification_report

# Predict labels for the held-out split with the fitted pipeline.
y_pred = pipeline.predict(X_test)

# Per-class precision / recall / F1 as a plain-text table.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.79      0.78      0.78        99
           1       0.61      0.62      0.61        55

    accuracy                           0.72       154
   macro avg       0.70      0.70      0.70       154
weighted avg       0.72      0.72      0.72       154



In [5]:
import pickle

# Persist the fitted pipeline (preprocessing + classifier) as a pickle file.
# Pickle files can execute arbitrary code when loaded, so this artefact should
# only ever be loaded from a trusted location.
with open("..//models//pipeline.pkl", mode="wb") as model_file:
    pickle.dump(pipeline, model_file)

In [6]:
# Disabled example request against a local /predict endpoint. The code is wrapped
# in a string literal, so nothing is sent when this cell runs; because the string
# is the cell's last expression, the notebook echoes it back as the cell output.
# The payload holds a single row of eight values.
# NOTE(review): presumably the endpoint serves the pipeline pickled in this
# notebook — confirm against the service code before re-enabling.
'''
import requests

url = "http://localhost:8000/predict"

data = {
    "data": [
        [1, 85, 66, 29, 0, 26.6, 0.351, 31]
    ]
}

response = requests.post(url, json=data)
print(response.json())
'''

'\nimport requests\n\nurl = "http://localhost:8000/predict"\n\ndata = {\n    "data": [\n        [1, 85, 66, 29, 0, 26.6, 0.351, 31]\n    ]\n}\n\nresponse = requests.post(url, json=data)\nprint(response.json())\n'

In [7]:
import numpy as np

def introduce_drift(data, drift_features, drift_amount=0.1, random_seed=42):
    """Return a copy of ``data`` with Gaussian noise added to selected columns.

    Despite the name, ``drift_amount`` is not a mean shift: each selected column
    receives zero-mean normal noise whose standard deviation is
    ``drift_amount``, drawn independently per column.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame. It is not modified; the result is built on a copy.
    drift_features : iterable of str
        Column names to perturb. Names that are not columns of ``data`` are
        skipped silently.
    drift_amount : float, default 0.1
        Standard deviation of the noise added to each selected column.
    random_seed : int, default 42
        Seed for the noise generator, so repeated calls give identical output.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with the same index and columns, where the selected
        columns have had noise added.
    """
    # A private RandomState produces the same number stream as seeding the
    # global generator with `random_seed`, but leaves NumPy's global RNG state
    # untouched for the rest of the notebook.
    rng = np.random.RandomState(random_seed)
    drifted_data = data.copy()
    n_rows = data.shape[0]

    for feature in drift_features:
        if feature not in data.columns:
            continue  # unknown column names are ignored
        noise = rng.normal(loc=0, scale=drift_amount, size=n_rows)
        # Plain reassignment lets integer columns become float without relying
        # on in-place arithmetic.
        drifted_data[feature] = drifted_data[feature] + noise
    return drifted_data

# Columns that receive synthetic noise.
features_to_drift = ["Glucose", "BloodPressure", "SkinThickness", "Pregnancies"]

# Perturb the held-out features (noise scale 50), then replace the row labels
# inherited from the train/test split with a fresh 0..n-1 index.
drifted_data = introduce_drift(
    X_test, features_to_drift, drift_amount=50
).reset_index(drop=True)


In [8]:
# Attach the labels to the reference (training) and current (drifted) frames.
#
# `assign` returns a new DataFrame, so X_train itself never gains an "Outcome"
# column and re-running earlier cells still sees the pure feature matrix.
# X_train and y_train come from the same split and share row labels, so the
# label Series is passed as-is and pandas aligns it row by row. Resetting only
# the label index would pair labels with the wrong rows and leave NaN wherever
# a row label falls outside 0..len(y_train)-1.
reference_data = X_train.assign(Outcome=y_train)

# drifted_data holds the X_test rows in their original order, so the test
# labels are attached by position; to_numpy() sidesteps index alignment.
drifted_data["Outcome"] = y_test.to_numpy()

In [9]:
drifted_data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,-28.986275,122.835708,72.653624,60.354869,190,34.0,0.43,43,0
1,12.698996,105.086785,39.282429,21.890367,0,35.7,0.148,21,0
2,-3.616402,140.384427,157.288726,-10.88406,0,30.8,0.158,21,0
3,-3.04848,183.151493,103.691646,54.938843,0,24.6,0.856,34,0
4,37.708335,124.292331,30.434825,41.270817,0,29.9,0.21,50,0


In [10]:
from evidently.metric_preset import DataDriftPreset
from evidently.report import Report

# Drift is assessed on the features only, so the label column is removed from
# both frames before they are handed to the report.
current_features = drifted_data.drop(columns="Outcome")
reference_features = reference_data.drop(columns="Outcome")

data_drift_report = Report(metrics=[DataDriftPreset()])
data_drift_report.run(
    reference_data=reference_features,
    current_data=current_features,
    column_mapping=None,
)

# Read the dataset-level drift flag from the first metric's result in the
# report's dictionary form.
report_json = data_drift_report.as_dict()
first_metric_result = report_json['metrics'][0]['result']
drift_detected = first_metric_result['dataset_drift']

In [12]:
# Display the dataset-level drift flag read from the drift report.
drift_detected

True

In [13]:
# Write the current (drifted) and reference frames to CSV, without the index
# column, one file per frame.
for frame, csv_path in (
    (drifted_data, '..//data//new_data.csv'),
    (reference_data, '..//data//reference_data.csv'),
):
    frame.to_csv(csv_path, index=False)