### Defined by User

In [None]:
# Restore every variable previously saved with IPython's %store magic.
# NOTE(review): cld_config, which the next cell reads, is presumably one of
# the restored variables - confirm against the notebook that stores it.
%store -r
## Number of times the script will push data
DATASET_SAMPLES  = 6

In [None]:
# Monitor / deployment identity.
MONITOR_NAME = cld_config['MONITOR_NAME']
DEPLOYMENT_ID = cld_config['SERVING_DEPLOYMENT_ID']
RUN_FREQUENCY = cld_config['RUN_FREQUENCY']

# Serving cluster credentials and endpoints.
DKUBE_URL = cld_config['SERVING_DKUBE_URL']
DKUBEUSERNAME = cld_config['SERVING_DKUBE_USERNAME']
TOKEN = cld_config['SERVING_DKUBE_TOKEN']
INFERENCE_URL = cld_config['INFERENCE_URL']

# Object storage used for the pushed datasets.
MINIO_ENDPOINT = cld_config['MINIO_ENDPOINT']
MINIO_BUCKET = cld_config['MINIO_BUCKET']
MINIO_KEY = cld_config['MINIO_KEY']
MINIO_SECRET_KEY = cld_config['MINIO_SECRET_KEY']

In [None]:
# Push frequency as "<minutes>m".  int() normalises a numeric string or a
# whole-number float (e.g. 5.0 -> "5m") so the value always matches the
# "<digits><unit>" form that InsuranceDataGenerator validates.
FREQUENCY = f"{int(RUN_FREQUENCY)}m"

### DATA GENERATION

In [None]:
import os

# Make sure $HOME/.local/bin is on PATH (presumably so that console scripts
# installed with `pip install --user` can be resolved - confirm).
HOME = os.getenv("HOME")
if HOME:
    EXECUTABLE_DIR = os.path.join(HOME, ".local", "bin")
    # PATH may be unset; fall back to an empty string instead of None.
    PATH = os.getenv("PATH", "")
    # Compare against whole PATH entries, not substrings of the raw string.
    if EXECUTABLE_DIR not in PATH.split(os.pathsep):
        os.environ["PATH"] = os.pathsep.join(filter(None, [PATH, EXECUTABLE_DIR]))
    PATH = os.getenv("PATH")

In [None]:
import sys
# DKUBE_JOB_CLASS is read from the environment; boto3 is installed only when
# it is unset or empty.
# NOTE(review): presumably the variable is set when running as a DKube job
# whose image already ships boto3 - confirm.
job_class = os.getenv("DKUBE_JOB_CLASS")
if not job_class:
    # IPython shell escape: run pip with the interpreter of this kernel and
    # install into the user site-packages.
    !{sys.executable} -m pip install boto3 --user

In [None]:
# Expose the user site-packages directory (target of `pip install --user`)
# to the running kernel so a freshly installed package can be imported.
if HOME:
    # Derive the "pythonX.Y" directory from the running interpreter instead
    # of hard-coding python3.6, which is wrong on any other version.
    PYTHON_DIR = f"python{sys.version_info.major}.{sys.version_info.minor}"
    USR_LOCAL_LIB_PATH = os.path.join(HOME, ".local", "lib", PYTHON_DIR, "site-packages")
    if USR_LOCAL_LIB_PATH not in sys.path:
        sys.path.append(USR_LOCAL_LIB_PATH)

In [None]:
import io
import re
import time
import uuid
import random
import boto3
import joblib

import requests
import json, os

from dkube.sdk.api import DkubeApi

## Dependencies for data generator 
import numpy as np
import pandas as pd
from configparser import ConfigParser
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

from sklearn import preprocessing as skpreprocessing
import datetime
from tqdm import tqdm_notebook as tqdm

from enum import Enum
from collections import namedtuple
from joblib import load
from urllib.parse import quote_plus
from sqlalchemy import create_engine

In [None]:
# Public CSV with the insurance training data.
REFERENCE_DATA_S3_PATH = "https://dkube-examples-data.s3.us-west-2.amazonaws.com/monitoring-insurance/training-data/insurance.csv"

# Minutes covered by the timestamps of each generated dataset.  The value is
# passed to datetime.timedelta(minutes=...), which rejects strings, so a
# numeric string coming from the stored configuration is coerced here.
MODEL_FREQUENCY = int(RUN_FREQUENCY)

In [None]:
class InsuranceDataGenerator:
    """Build prediction and drift datasets for the insurance monitor demo.

    The generator loads the insurance training data, derives the value range
    of every input feature from it, and exposes generators that yield
    ``{"name": ..., "df": ...}`` items holding either rows sampled from the
    training data or randomly drawn ("drifted") rows.  It can also score a
    dataset against the inference endpoint and upload CSV files through the
    shared S3 client.

    The upper-case class attributes are shared by all instances.  They may be
    assigned on the class beforehand; ``__init__`` only fills in the ones
    that are still unset.
    """

    BUCKET = None
    S3_CLIENT = None
    DB_ENGINE = None
    API_CLIENT = None
    TOKEN = None
    USERNAME = None
    INFERENCE_URL = None

    # Seconds represented by each frequency unit suffix.
    _SECONDS_PER_UNIT = {"s": 1, "m": 60, "h": 3600, "d": 86400, "w": 604800}

    def __init__(
        self,
        data_reference_s3path,
        monitor_name,
        n_predict_datasets: int = 1,
        n_groundtruth_datasets: int = 1,
        n_drift_datasets: int = 1,
        start_time: datetime.datetime = None,
        frequency="1H",
        model_frequency=10,
        duration: str = "10:24:12",
        margin=180,
    ):
        """Validate the arguments, load the training data and set up clients.

        Args:
            data_reference_s3path: Object key read lazily by ``reference_df``.
            monitor_name: Name of the monitor; stored on the instance.
            n_predict_datasets: Number of prediction datasets to generate.
            n_groundtruth_datasets: Number of ground-truth datasets; must not
                exceed ``n_predict_datasets``.
            n_drift_datasets: Number of drifted datasets; must not exceed
                ``n_predict_datasets``.
            start_time: Start used by ``end``; defaults to the current UTC
                time.
            frequency: Push frequency as ``<positive int><unit>`` where the
                unit is ``h`` or ``m`` (case-insensitive).
            model_frequency: Minutes spanned by the timestamps of each
                generated dataset.
            duration: ``"H[:M[:S]]"`` string added to ``start_time`` by
                ``end``.
            margin: Seconds subtracted from each frequency boundary when
                computing the next push time.

        Raises:
            ValueError: If ``frequency`` is malformed or zero, or if a
                ground-truth/drift count exceeds ``n_predict_datasets``.
        """
        if not re.fullmatch(r"^\d+[hmHM]{1}$", frequency):
            raise ValueError("frequency can have [time_value_int][time_unit] time_unit can be case case insensitive out of H, M")
        if int(frequency[:-1]) == 0:
            # A zero period would later divide by zero in the minute maths.
            raise ValueError("frequency value must be greater than zero")
        # Validate before init_train_data() so bad arguments fail without
        # first downloading the training data.  ValueError is a subclass of
        # Exception, so existing ``except Exception`` handlers still match.
        if n_groundtruth_datasets > n_predict_datasets or n_drift_datasets > n_predict_datasets:
            raise ValueError("GroundTruth datasets or drift_datsets cant be greater than predict datasets")

        self.n_predict_datasets = n_predict_datasets
        self.n_groundtruth_datasets = n_groundtruth_datasets
        self.n_drift_datasets = n_drift_datasets
        self.frequency = frequency
        self.margin = margin
        self.monitor_name = monitor_name
        self.data_reference_s3path = data_reference_s3path
        self.__reference_df = None
        self.start_time = start_time if start_time else datetime.datetime.utcnow()
        self.duration = duration
        # Seeds picked at random by generate_random_drift().
        self.drift_seeds = [230, 733, 881, 1163, 1456, 1499, 1631, 2084, 2255, 2968, 3541, 3688]
        self.input_features = ['age', 'sex', 'bmi', 'children', 'smoker', 'region']
        self.model_frequency = model_frequency
        self.predict_start = datetime.datetime.utcnow()
        self.drift_start = datetime.datetime.utcnow()
        self.train_data = None
        self.sex_values = None
        self.children_values = None
        self.smoker_values = None
        self.region_values = None
        self.age_min = self.age_max = None
        self.bmi_min = self.bmi_max = None
        self.model = None
        self.init_train_data()

        # Fill in only the shared resources nobody configured on the class.
        klass = type(self)
        if not klass.BUCKET:
            klass.BUCKET = MINIO_BUCKET
        if not klass.S3_CLIENT:
            klass.S3_CLIENT = boto3.client("s3", aws_access_key_id=MINIO_KEY,
                                                 aws_secret_access_key=MINIO_SECRET_KEY,
                                                 endpoint_url=MINIO_ENDPOINT)
        if not klass.TOKEN:
            klass.TOKEN = os.getenv("DKUBE_USER_ACCESS_TOKEN", TOKEN)
        if not klass.USERNAME:
            klass.USERNAME = DKUBEUSERNAME
        if not klass.API_CLIENT:
            klass.API_CLIENT = DkubeApi(URL=os.getenv('DKUBE_URL', DKUBE_URL), token=klass.TOKEN)

    def init_train_data(self):
        """Load the training CSV and record the value range of each feature.

        Object-typed ``sex``, ``smoker`` and ``region`` columns are label
        encoded in place before the ranges are collected.

        NOTE(review): the CSV location is hard-coded here and is independent
        of ``data_reference_s3path`` - confirm this is intended.
        """
        insurance = pd.read_csv("https://storage.googleapis.com/insurance-data/insurance/insurance.csv")
        for col in ['sex', 'smoker', 'region']:
            if insurance[col].dtype == 'object':
                le = preprocessing.LabelEncoder()
                insurance[col] = le.fit_transform(insurance[col])
        self.train_data = insurance
        self.sex_values = insurance["sex"].unique().tolist()
        self.children_values = insurance["children"].unique().tolist()
        self.smoker_values = insurance["smoker"].unique().tolist()
        self.region_values = insurance["region"].unique().tolist()
        self.age_min, self.age_max = insurance["age"].min(), insurance["age"].max()
        self.bmi_min, self.bmi_max = insurance["bmi"].min(), insurance["bmi"].max()

    def save_dataset(self, data, data_name: str, s3_prefix):
        """Upload ``data`` as ``<s3_prefix>/<data_name>.csv``.

        Returns:
            The object key on success, otherwise ``None``.
        """
        return type(self).save_dataset_to_s3(data, data_name, s3_prefix)

    @classmethod
    def save_dataset_to_s3(cls, data, name, s3_prefix):
        """Serialise ``data`` to CSV and put it into the shared bucket.

        Args:
            data: DataFrame to upload (written without its index).
            name: File name without the ``.csv`` extension.
            s3_prefix: Key prefix the file is stored under.

        Returns:
            The object key when the client reports HTTP 200, else ``None``.
        """
        # Object keys always use "/"; os.path.join would use the separator
        # of the local operating system.
        import posixpath

        file_path = posixpath.join(s3_prefix, name + ".csv")
        response = cls.S3_CLIENT.put_object(
            Bucket=cls.BUCKET, Key=file_path, Body=data.to_csv(index=False)
        )
        status = response.get("ResponseMetadata", {}).get("HTTPStatusCode")
        if status == 200:
            print(f"Successful S3 put_object response. Status - {status}")
            return file_path
        print(f"Unsuccessful S3 put_object response. Status - {status}")
        return None

    def _seconds_until_next_push(self, now):
        """Return the seconds to wait from ``now`` until the next push.

        For hour and minute frequencies the target is ``margin`` seconds
        before the next frequency boundary.  Whenever that target is already
        in the past, is less than a second away, or is further away than one
        full period, one full period is returned instead.  Other units
        simply return the period length.
        """
        value = int(self.frequency[:-1])
        unit = self.frequency[-1].lower()
        seconds_count = value * self._SECONDS_PER_UNIT[unit]
        margin = datetime.timedelta(seconds=self.margin)

        if unit == "h":
            boundary = (now + datetime.timedelta(hours=value)).replace(
                minute=0, second=0, microsecond=0
            )
            new_time = boundary - margin
        elif unit == "m":
            # Minutes left until the next multiple of ``value``.
            diff = abs(now.minute % -value)
            # Already on a boundary: aim for the following one.
            step = diff if diff else value
            boundary = (now + datetime.timedelta(minutes=step)).replace(
                second=0, microsecond=0
            )
            new_time = boundary - margin
            if diff and new_time < now:
                new_time = new_time + datetime.timedelta(minutes=value)
        else:
            return seconds_count

        # ``.seconds`` of a negative timedelta wraps to almost a day; the
        # comparison below turns that (and any overshoot) into one period.
        second_remaining = (new_time - now).seconds
        if second_remaining > seconds_count or second_remaining == 0:
            return seconds_count
        return second_remaining

    @property
    def frequency_ts(self):
        """Seconds to sleep before the next push, relative to UTC now."""
        result = self._seconds_until_next_push(datetime.datetime.utcnow())
        print(f"Next Push after {datetime.timedelta(seconds=result)}")
        return result

    @property
    def end(self):
        """``start_time`` plus ``duration`` (missing M/S parts count as 0)."""
        parts = self.duration.split(":")
        parts += ["0"] * (3 - len(parts))
        hours, minutes, seconds = (int(part) for part in parts[:3])
        return self.start_time + datetime.timedelta(
            hours=hours, minutes=minutes, seconds=seconds
        )

    @property
    def reference_df(self):
        """Reference DataFrame, fetched on first access and then cached."""
        # ``== None`` on a cached DataFrame is element-wise and raises when
        # used as a condition, so an identity test is required here.
        if self.__reference_df is None:
            self.__reference_df = self.get_df_from_s3(self.data_reference_s3path)
        return self.__reference_df

    @classmethod
    def get_df_from_s3(cls, path):
        """Read the CSV object ``path`` from the shared bucket.

        Returns:
            The parsed DataFrame on HTTP 200, otherwise ``None``.
        """
        response = cls.S3_CLIENT.get_object(Bucket=cls.BUCKET, Key=path)
        status = response.get("ResponseMetadata", {}).get("HTTPStatusCode")
        if status == 200:
            print(f"Successful S3 get_object response. Status - {status}")
            return pd.read_csv(response.get("Body"))
        print(f"Unsuccessful S3 get_object response. Status - {status}")
        return None

    @property
    def train_df(self):
        """Training data concatenated with an augmented copy of itself.

        For every row, half of the feature columns (chosen at random) are
        replaced by the values of randomly chosen rows; the remaining
        columns keep the row's own values.  ``charges`` of the augmented
        rows is the original value increased by 3%.
        """
        data = pd.read_csv(REFERENCE_DATA_S3_PATH)
        train = data.drop(["charges"], axis=1)
        y = data["charges"]
        n_rows, n_features = train.shape
        all_feature_index = np.arange(0, n_features)
        AUG_FEATURE_RATIO = 0.5
        AUG_FEATURE_COUNT = int(n_features * AUG_FEATURE_RATIO)
        train_aug = pd.DataFrame(
            index=train.index, columns=train.columns, dtype="float64"
        )

        for i in tqdm(range(0, n_rows)):
            aug_feature_index = np.random.choice(
                n_features, AUG_FEATURE_COUNT, replace=False
            )
            aug_feature_index.sort()

            # Columns that keep the row's own values.
            feature_index = np.where(
                np.isin(all_feature_index, aug_feature_index, invert=True)
            )[0]
            train_aug.iloc[i, feature_index] = train.iloc[i, feature_index]

            # One donor row per augmented column.
            rand_row_index = np.random.choice(
                n_rows, len(aug_feature_index), replace=True
            )
            for n, j in enumerate(aug_feature_index):
                train_aug.iloc[i, j] = train.iloc[rand_row_index[n], j]

        train_aug["charges"] = y + y * 0.03
        return pd.concat([data, train_aug])

    def train_test_split(self, test_size=0.1):
        """Split ``train_df`` into a training and a prediction DataFrame.

        Both frames get a fresh index, a ``unique_id`` column of UUIDs, and
        label-encoded ``sex``/``smoker``/``region`` columns (when those are
        object typed).

        Returns:
            ``(train_dataset, predict_data)``.
        """
        train_all = self.train_df
        train_dataset, predict_data = train_test_split(
            train_all, test_size=test_size, random_state=self.n_predict_datasets
        )
        train_dataset = train_dataset.reset_index(drop=True)
        # One column assignment instead of a per-row ``.loc`` write.
        train_dataset["unique_id"] = [uuid.uuid4() for _ in range(len(train_dataset))]

        for dataframe in [train_dataset, predict_data]:
            for col in ["sex", "smoker", "region"]:
                if dataframe[col].dtype == "object":
                    le = skpreprocessing.LabelEncoder()
                    dataframe[col] = le.fit_transform(dataframe[col])
                    print("Completed Label encoding on", col)

        predict_data = predict_data.reset_index(drop=True)
        predict_data["unique_id"] = [uuid.uuid4() for _ in range(len(predict_data))]
        return train_dataset, predict_data

    def sample_predict_data(self, predict_data=None, train_test_split=0.1):
        """Yield ``n_predict_datasets`` consecutive equal-sized row slices.

        Args:
            predict_data: Frame to slice; when ``None`` the prediction part
                of ``self.train_test_split(train_test_split)`` is used.
            train_test_split: Test size forwarded to ``train_test_split``.

        Yields:
            ``{"name": "<i>_predict_data", "df": <slice>}`` with ``i``
            starting at 1.
        """
        if predict_data is None:
            _, predict_data = self.train_test_split(train_test_split)
        n_predict_rows = predict_data.shape[0] // self.n_predict_datasets
        for i in range(1, self.n_predict_datasets + 1):
            first_row = (i - 1) * n_predict_rows
            pred_data = predict_data.iloc[first_row:first_row + n_predict_rows, :]
            yield {"name": str(i) + "_predict_data", "df": pred_data}

    def generate_all_predict(self):  # All features will not have drift
        """Yield ``n_predict_datasets`` samples of the training data.

        Each dataset holds between 80 and 99 rows sampled from
        ``train_data`` plus ``unique_id`` and evenly spaced ``timestamp``
        columns covering ``model_frequency`` minutes from ``predict_start``.
        """
        for i in range(1, self.n_predict_datasets + 1):
            no_of_samples = np.random.randint(80, 100)
            predict_df = self.train_data.sample(no_of_samples)
            start = self.predict_start
            end = start + datetime.timedelta(minutes=self.model_frequency)
            predict_df["unique_id"] = [str(uuid.uuid4()) for _ in range(no_of_samples)]
            predict_df["timestamp"] = pd.date_range(start, end, periods=no_of_samples)
            yield {"name": str(i) + "_predict_data", "df": predict_df}

    def generate_random_drift(self):
        """Yield ``n_drift_datasets`` datasets of randomly drawn feature rows.

        Every dataset is generated after re-seeding NumPy's global RNG with
        one of ``drift_seeds``.  The RNG state captured on entry is restored
        when the generator finishes or is closed early.
        """
        state = np.random.get_state()
        try:
            # Names are 1-based, matching generate_all_predict().
            for i in range(1, self.n_drift_datasets + 1):
                seed = np.random.choice(self.drift_seeds)
                np.random.seed(seed)
                start = self.predict_start
                end = start + datetime.timedelta(minutes=self.model_frequency)
                no_of_samples = np.random.randint(80, 100)
                predict_df = pd.DataFrame({
                    'age': np.random.randint(self.age_min + 10, self.age_max, no_of_samples).tolist(),
                    'sex': np.random.choice(self.sex_values, no_of_samples).tolist(),
                    'bmi': np.random.uniform(self.bmi_min, self.bmi_max, no_of_samples).tolist(),
                    'children': np.random.choice(self.children_values, no_of_samples).tolist(),
                    'smoker': np.random.choice(self.smoker_values, no_of_samples).tolist(),
                    'region': np.random.choice(self.region_values, no_of_samples).tolist(),
                    'unique_id': [str(uuid.uuid4()) for _ in range(no_of_samples)],
                    'timestamp': pd.date_range(start, end, periods=no_of_samples)
                })
                # One draw per row; without a size a single scalar would be
                # broadcast to the whole column.
                predict_df["charges"] = np.random.uniform(5000, 10000, no_of_samples)
                yield {"name": str(i) + "_drifted_data", "df": predict_df}
        finally:
            np.random.set_state(state)

    def predict_from_endpoint(self, sample_df):
        """Score ``sample_df`` on the inference endpoint.

        The input feature columns are sent as CSV text; the last element of
        every returned prediction entry is used as the predicted value.

        Returns:
            DataFrame with ``timestamp``, ``charges`` (predictions) and
            ``GT_target`` (the ``charges`` column of ``sample_df``).

        Raises:
            requests.HTTPError: If the endpoint answers with an error status.
        """
        payload = {
            'signatures': {
                'inputs': [[{'data': sample_df[self.input_features].to_csv(index=False)}]]
            },
            'instances': [],
            'token': self.TOKEN
        }
        # NOTE(security): verify=False disables TLS certificate checks.
        # Presumably needed for self-signed cluster certificates - confirm
        # before using against an untrusted network.
        r = requests.post(self.INFERENCE_URL, json=payload,
                          headers={'authorization': "Bearer " + self.TOKEN},
                          verify=False)
        # Surface HTTP failures instead of a KeyError on the missing result.
        r.raise_for_status()
        predictions = [p[-1] for p in r.json()["result"]]
        start = datetime.datetime.utcnow()
        end = start + datetime.timedelta(seconds=10)
        timestamps = pd.date_range(start, end, periods=len(predictions))
        labels = sample_df["charges"][:len(predictions)]
        return pd.DataFrame({
            "timestamp": timestamps,
            "charges": predictions,
            "GT_target": labels
        })

    def generate_drift_datasets(self, predict_data=None, train_test_split=0.1):
        """Yield up to ``n_drift_datasets`` drifted copies of predict slices.

        Even-positioned slices get ``age`` and/or ``bmi`` increased on every
        row.  Odd-positioned slices repeatedly overwrite the whole ``sex``
        and ``region`` columns with one value and set a single random row to
        the other value.  The input frame is never modified.

        Args:
            predict_data: Forwarded to ``sample_predict_data``.
            train_test_split: Forwarded to ``sample_predict_data``.

        Yields:
            ``{"name": "<j>_drifted_data", "df": <drifted copy>}`` with ``j``
            starting at 1.
        """
        region = ["southeast", "northwest"]
        sex = ["male", "female"]
        samples = self.sample_predict_data(predict_data, train_test_split)
        for j, data in enumerate(samples):
            # Stop before doing any work once enough datasets were produced.
            if j >= self.n_drift_datasets:
                return
            # Work on a copy: chained assignment on a slice is unreliable
            # and could leak changes into the caller's frame.
            drifted_data = data["df"].copy()
            n_rows = len(drifted_data)
            if j % 2 == 0:
                age_col = drifted_data.columns.get_loc("age")
                bmi_col = drifted_data.columns.get_loc("bmi")
                for i in range(n_rows):
                    rndm = random.randint(0, 2)
                    # 0 -> age only, 1 -> bmi only, 2 -> both.
                    if rndm != 1:
                        drifted_data.iat[i, age_col] += random.randint(15, 80)
                    if rndm != 0:
                        drifted_data.iat[i, bmi_col] += random.randint(15, 50)
            elif n_rows:
                sex_col = drifted_data.columns.get_loc("sex")
                region_col = drifted_data.columns.get_loc("region")
                random_rows_count = random.randint(0, n_rows - 1)
                for _ in range(random_rows_count):
                    random_index = random.randint(0, n_rows - 1)
                    random_gender_idx = random.randint(0, 1)
                    drifted_data["sex"] = sex[random_gender_idx]
                    # The picked row receives the opposite value.
                    drifted_data.iat[random_index, sex_col] = sex[1 - random_gender_idx]
                    random_index = random.randint(0, n_rows - 1)
                    random_region_index = random.randint(0, 1)
                    drifted_data["region"] = region[random_region_index]
                    drifted_data.iat[random_index, region_col] = region[1 - random_region_index]
            yield {"name": str(j + 1) + "_drifted_data", "df": drifted_data}

In [None]:
# Configure the resources shared by every InsuranceDataGenerator instance.
InsuranceDataGenerator.URL = DKUBE_URL
InsuranceDataGenerator.TOKEN = TOKEN
InsuranceDataGenerator.API_CLIENT = DkubeApi(URL=DKUBE_URL, token=TOKEN)
# Python 3 cannot raise a plain string (that itself fails with TypeError),
# so a real exception type is used.  An empty URL is rejected as well,
# which is what the message promises.
if not INFERENCE_URL:
    raise ValueError("INFERENCE_URL is Empty, Provide value for variable INFERENCE_URL")
InsuranceDataGenerator.INFERENCE_URL = INFERENCE_URL

In [None]:
# One generator configured to produce DATASET_SAMPLES datasets of each kind.
generator = InsuranceDataGenerator(
    data_reference_s3path=REFERENCE_DATA_S3_PATH,
    monitor_name=MONITOR_NAME,
    n_predict_datasets=DATASET_SAMPLES,
    n_groundtruth_datasets=DATASET_SAMPLES,
    n_drift_datasets=DATASET_SAMPLES,
    frequency=FREQUENCY,
    model_frequency=MODEL_FREQUENCY,
)

# Training / prediction split of the augmented training data.
train_predict = generator.train_test_split()
trainds, testds = train_predict

# Prediction data sampled from the training data (no drift).
predict_samples = list(generator.generate_all_predict())

# Randomly drawn feature rows (drifted data).
drift_datasets = list(generator.generate_random_drift())

In [None]:
# Display how many clean and drifted datasets were generated.
len(predict_samples), len(drift_datasets)

In [None]:
# Format an integer with its English ordinal suffix (1st, 2nd, 3rd, 4th...).
ordinal = lambda n: "%d%s" % (n,"tsnrhtdd"[(n//10%10!=1)*(n%10<4)*n%10::4])
push_count = 1
groundtruth_path = []
for i, data in enumerate(predict_samples):
    # Wait until the next push slot computed by the generator.
    second_remaining = generator.frequency_ts
    current_time = datetime.datetime.utcnow()
    time.sleep(second_remaining)
    pushed_date = datetime.datetime.utcnow()
    sample_count = round(random.uniform(0.5 ,0.9),2)
    print("Generating data")
    # Odd iterations push the clean dataset, even iterations the drifted
    # one.  Fall back to the clean dataset when no drifted dataset exists at
    # this position, instead of failing with IndexError.
    if i % 2 == 0 and i < len(drift_datasets):
        source = drift_datasets[i]
    else:
        source = data
    groundtruth_data = generator.predict_from_endpoint(source["df"])
    g_path = generator.save_dataset(groundtruth_data, source["name"], DEPLOYMENT_ID + "/livedata")
    if g_path:
        groundtruth_path.append(g_path)
        print(g_path)
    print(f"Pushed data for {ordinal(push_count)} time, Remaining pushes: {DATASET_SAMPLES-push_count}, Monitor name: {MONITOR_NAME}")
    push_count += 1
print("***************** DATA GENERATION COMPLETED ******************************")