In [None]:
import os
# Make user-level installs (`pip install --user`) usable from this kernel:
# ~/.local/bin must be on PATH so console scripts installed there are found.
HOME = os.getenv("HOME")
if HOME:
    EXECUTABLE_DIR = os.path.join(HOME, ".local", "bin")
    # PATH may be unset in a stripped-down container; treat that as empty
    # instead of failing on `in None`.
    PATH = os.environ.get("PATH", "")
    # Compare against whole PATH entries, not substrings, so that e.g.
    # "/home/user/.local/bin-old" does not count as already present.
    if EXECUTABLE_DIR not in PATH.split(os.pathsep):
        os.environ["PATH"] = os.pathsep.join(p for p in (PATH, EXECUTABLE_DIR) if p)
    PATH = os.environ["PATH"]

# Default bucket used by the data generator when none is configured.
if not os.getenv("AWS_BUCKET"):
    os.environ["AWS_BUCKET"] = "mm-workflow"


In [None]:
import sys
# Install the extra dependencies with the interpreter that runs this kernel
# (sys.executable), into the per-user site (--user).
# The kfp install redirects its output to /dev/null, so a failed install is
# silent; drop the redirect when debugging import errors.
!{sys.executable} -m pip install kfp==1.4.0 kfp-server-api==1.2.0 --user &> /dev/null
# NOTE(review): randomtimestamp and pymysql are not version-pinned, so a fresh
# run may pick up a different release than the one this notebook was written for.
!{sys.executable} -m pip install randomtimestamp --user
!{sys.executable} -m pip install pymysql --user
# pyodbc is currently not installed (its import below is commented out as well).
#!{sys.executable} -m pip install --upgrade pyodbc --user

In [None]:
# Packages installed above with `pip install --user` land in the user
# site-packages directory. Ask the running interpreter for that directory
# instead of hard-coding "python3.6", so the cell works on any Python version.
import importlib
import site

USR_LOCAL_LIB_PATH = site.getusersitepackages()
if USR_LOCAL_LIB_PATH not in sys.path:
    sys.path.append(USR_LOCAL_LIB_PATH)
# The import system caches directory listings; refresh them so packages that
# were installed after the kernel started can be imported without a restart.
importlib.invalidate_caches()

In [None]:
import json
import kfp
import kfp.dsl as dsl
import kfp.compiler as compiler
from kfp import components
from kubernetes import client as k8s_client
import os
import json
import kfp
import requests
import string
import random
import kfp.dsl as dsl
import kfp.compiler as compiler
from kfp import components
from kubernetes import client as k8s_client

from dkube.sdk.api import DkubeApi
from dkube.sdk.rsrcs import DkubeCode
from dkube.sdk.rsrcs import DkubeDataset
from dkube.sdk.rsrcs import DkubeModel


## Dependencies for data generator 
import numpy as np
import pandas as pd
from configparser import ConfigParser
import numpy.random,argparse,uuid
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import boto3
import time
import joblib
from randomtimestamp import randomtimestamp
from sklearn import preprocessing as skpreprocessing
import datetime
from tqdm import tqdm_notebook as tqdm
import io
import re
from enum import Enum
from collections import namedtuple
from joblib import dump, load
from sqlalchemy import create_engine
#import pyodbc

In [None]:
class DBCONFIG:
    """Connection settings for the MySQL database reached through PyMySQL.

    ``str(config)`` yields the SQLAlchemy connection URL (this is what is
    handed to ``create_engine``). ``repr(config)`` is safe to display: it
    never contains the password.
    """

    def __init__(self, hostname, databasename, username, password):
        self.hostname = hostname
        self.databasename = databasename
        self.username = username
        self.password = password

    def __str__(self):
        """Return the full connection URL, including the (URL-escaped) credentials."""
        from urllib.parse import quote_plus

        # Characters such as '@', '/' or ':' inside the user name or password
        # would otherwise be parsed as URL delimiters.
        username = quote_plus(str(self.username))
        password = quote_plus(str(self.password))
        return f"mysql+pymysql://{username}:{password}@{self.hostname}/{self.databasename}"

    def __repr__(self):
        """Return a debugging representation with the password masked.

        A notebook displays ``repr`` of the last expression in a cell, so the
        real password must not appear here.
        """
        return (
            f"DBCONFIG(hostname={self.hostname!r}, databasename={self.databasename!r}, "
            f"username={self.username!r}, password='***')"
        )

class DataSource(Enum):
    """Destinations the generated datasets can be written to."""
    # CSV files in a directory tree on the local disk.
    LOCAL = "local"
    # CSV objects uploaded to an S3 bucket.
    AWS_S3 = "aws_s3"
    # Rows appended to a SQL table.
    SQL = "sql"
    

# Per-dataset write settings consumed by InsuranceDataGenerator.save_dataset:
#   model_monitor  - monitor name, used as the top-level directory / key prefix
#   table          - SQL table name (SQL destination)
#   frequency_unit - last character of the push frequency ("h" or "m")
#   data_class     - sub-directory / key segment for this kind of data
#   add_prefix_ts  - whether S3 keys get a year/month/day/hour[/minute] prefix
DatasetSource = namedtuple(
    "DatasetSource",
    ["model_monitor", "table", "frequency_unit", "data_class", "add_prefix_ts"],
)

In [None]:
# Key (inside the AWS_BUCKET bucket) of the reference CSV read by
# InsuranceDataGenerator.reference_df.
REFERENCE_DATA_S3_PATH = "insurance-base/insurance.csv"
# Monitor name; used as the top-level directory / S3 key prefix and as the
# prefix of the DKube dataset names.
MONITOR_NAME ="mm-demo"
# Push interval: an integer followed by h/H or m/M (validated by the generator).
FREQUENCY = "5m"
# Width, in minutes, of the timestamp range given to each generated dataset.
MODEL_FREQUENCY = 4

# Number of predict / ground-truth / drift datasets to generate.
DATASET_SAMPLES  = 6

# Table names, used when the destination is SQL.
PREDICT_DATASET_TABLE   = "insurance_predict" 
LABELLED_DATASET_TABLE  = "insurance_gt"

PREDICT_DATA_CLASS = "predict" # used for s3 
LABELLED_DATA_CLASS = "groundtruth" #used for s3

# Whether S3 keys get a year/month/day/hour[/minute] prefix.
PREFIX_PREDICT_DATASET_WITH_TS = True
PREFIX_LABELLED_DATASET_WITH_TS = False

# Where the generated datasets are written.
DATASET_SOURCE = DataSource.AWS_S3

# SQL connection settings; left empty here.
# NOTE(review): get_password() below can fetch the password from DKube but is
# not called anywhere in the visible cells — confirm how PASSWORD is meant to be set.
HOSTNAME = ""
DATABASE_NAME = ""
USERNAME = ""
PASSWORD = ""

# MODEL FOR PREDICTION
# Loaded from the working directory; InsuranceDataGenerator.sample_predict_data
# reads this module-level name.
train_model = load('model.joblib')

In [None]:
def get_password(datum_name="sql-data"):
    """Fetch the SQL password stored on a DKube dataset.

    Args:
        datum_name: Name of the DKube dataset whose SQL password is read.

    Returns:
        The value at ``data.datum.sql.password`` of the controller response.

    Raises:
        RuntimeError: If DKUBE_USER_LOGIN_NAME or DKUBE_USER_ACCESS_TOKEN is
            not set in the environment.
        requests.HTTPError: If the controller answers with an error status.
    """
    user = os.getenv("DKUBE_USER_LOGIN_NAME")
    token = os.getenv("DKUBE_USER_ACCESS_TOKEN")
    if not user or not token:
        # Fail with a clear message instead of `"Bearer " + None`.
        raise RuntimeError(
            "DKUBE_USER_LOGIN_NAME and DKUBE_USER_ACCESS_TOKEN must be set to read the SQL password"
        )
    headers = {"authorization": "Bearer " + token}
    url = "http://dkube-controller-worker.dkube:5000/dkube/v2/controller/users/%s/datums/class/dataset/datum/%s"
    # A timeout keeps the notebook from hanging forever on an unreachable controller.
    response = requests.get(url % (user, datum_name), headers=headers, timeout=30)
    response.raise_for_status()
    return response.json()['data']['datum']['sql']['password']

In [None]:
class InsuranceDataGenerator:
    """Synthetic data generator for the insurance model-monitoring demo.

    Builds prediction, drifted and ground-truth datasets from the insurance
    training data and writes them to S3, a SQL table or the local disk,
    depending on ``dataset_source``.

    The bucket name, S3 client, DKube API client, token, user name and SQL
    engine are created on first instantiation and cached on the class.
    """
    # With no parameters or configuration, boto3 will look for
    # access keys in these places:
    # 1. Environment variables (AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY)
    # 2. Credentials file (~/.aws/credentials or
    #      C:\Users\USER_NAME\.aws\credentials)
    # 3. AWS IAM role for Amazon EC2 instance
    #    (http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/iam-roles-for-amazon-ec2.html)

    #    Define a ~/.aws/credentials file as following
    #    [default]
    #    aws_access_key_id=foo
    #    aws_secret_access_key=bar
    #    aws_session_token=baz # might not be required
    BUCKET = None
    S3_CLIENT = None
    DB_ENGINE = None
    API_CLIENT = None
    TOKEN = None
    USERNAME = None

    # Public CSV the feature value ranges are derived from.
    TRAIN_DATA_URL = "https://storage.googleapis.com/insurance-data/insurance/insurance.csv"
    # Serialized model, resolved relative to the working directory.
    MODEL_PATH = "model.joblib"

    def __init__(
        self,
        data_reference_s3path,
        monitor_name,
        n_predict_datasets: int = 1,
        n_groundtruth_datasets: int = 1,
        n_drift_datasets: int = 1,
        start_time: datetime.datetime = None,
        frequency="1H",
        model_frequency=10,
        duration: str = "10:24:12",
        margin=20,
        db_config:DBCONFIG = None,
        dataset_source: DataSource = DataSource.AWS_S3
    ):
        """Validate the settings, load the training data and model, set up clients.

        Args:
            data_reference_s3path: S3 key of the reference CSV (see ``reference_df``).
            monitor_name: Monitor name; used as directory / key prefix and in
                DKube dataset names.
            n_predict_datasets: Number of prediction datasets to generate.
            n_groundtruth_datasets: Maximum number of ground-truth datasets.
            n_drift_datasets: Number of drifted datasets.
            start_time: Start of the run; defaults to the current UTC time.
            frequency: Push interval, an integer followed by h/H or m/M.
            model_frequency: Width in minutes of each dataset's timestamp range.
            duration: Run length as ``"H[:M[:S]]"`` (see ``end``).
            margin: Seconds subtracted from the next interval boundary when
                computing the sleep time (see ``frequency_ts``).
            db_config: SQL connection settings; ``str(db_config)`` must be a
                SQLAlchemy URL.
            dataset_source: Destination of the generated datasets.

        Raises:
            ValueError: If ``frequency`` is malformed or its value is zero.
            Exception: If more ground-truth or drift datasets than prediction
                datasets are requested.
        """
        # Validate everything before doing any network or disk I/O.
        if not re.fullmatch(r"^\d+[hmHM]{1}$", frequency):
            raise ValueError("frequency can have [time_value_int][time_unit] time_unit can be case case insensitive out of H, M")
        if int(frequency[:-1]) <= 0:
            raise ValueError("frequency time value must be greater than zero")
        if n_groundtruth_datasets > n_predict_datasets or n_drift_datasets > n_predict_datasets:
            raise Exception("GroundTruth datasets or drift_datsets cant be greater than predict datasets")

        self.n_predict_datasets = n_predict_datasets
        self.n_groundtruth_datasets = n_groundtruth_datasets
        self.n_drift_datasets = n_drift_datasets
        self.frequency = frequency
        self.margin = margin
        self.monitor_name = monitor_name
        self.data_reference_s3path = data_reference_s3path
        self.__reference_df = None
        self.dataset_source = dataset_source
        self.start_time = start_time if start_time else datetime.datetime.utcnow()
        self.drift_seeds = [572, 1968, 2254, 2642, 2864, 3164]
        self.input_features = ['age', 'sex', 'bmi', 'children', 'smoker', 'region']
        self.model_frequency = model_frequency
        self.predict_start = datetime.datetime.utcnow()
        self.drift_start = datetime.datetime.utcnow()
        self.db_config = db_config
        self.duration = duration

        # Filled in by init_train_data() / load_model().
        self.train_data = None
        self.sex_values = None
        self.children_values = None
        self.smoker_values = None
        self.region_values = None
        self.age_min = self.age_max = None
        self.bmi_min = self.bmi_max = None
        self.model = None
        self.init_train_data()
        self.load_model()

        # Shared, lazily created class-level clients and settings.
        klass = type(self)
        if not klass.BUCKET:
            klass.BUCKET = os.getenv("AWS_BUCKET")
        if not klass.S3_CLIENT:
            klass.S3_CLIENT = boto3.client("s3")
        if not klass.TOKEN:
            klass.TOKEN = os.getenv("DKUBE_USER_ACCESS_TOKEN")
        if not klass.USERNAME:
            # Fall back to the variable get_password() reads when USERNAME is unset.
            klass.USERNAME = os.getenv("USERNAME") or os.getenv("DKUBE_USER_LOGIN_NAME")
        if not klass.API_CLIENT:
            klass.API_CLIENT = DkubeApi(token=klass.TOKEN)
        if not klass.DB_ENGINE:
            if self.db_config:
                klass.DB_ENGINE = create_engine(str(self.db_config))

    def init_train_data(self):
        """Download the training CSV, label-encode it and record the feature value ranges."""
        insurance = pd.read_csv(type(self).TRAIN_DATA_URL)
        for col in ['sex', 'smoker', 'region']:
            if insurance[col].dtype == 'object':
                le = preprocessing.LabelEncoder()
                le = le.fit(insurance[col])
                insurance[col] = le.transform(insurance[col])
        self.train_data = insurance
        self.sex_values = insurance["sex"].unique().tolist()
        self.children_values = insurance["children"].unique().tolist()
        self.smoker_values = insurance["smoker"].unique().tolist()
        self.region_values = insurance["region"].unique().tolist()
        self.age_min, self.age_max = insurance["age"].min(), insurance["age"].max()
        self.bmi_min, self.bmi_max = insurance["bmi"].min(), insurance["bmi"].max()

    def load_model(self):
        """Load the prediction model from ``MODEL_PATH``."""
        # NOTE: joblib files are pickles; only load a model file you trust.
        self.model = joblib.load(type(self).MODEL_PATH)

    @classmethod
    def save_dataset_to_s3(cls, data, monitor_name, name, typeofdata, prefix_dir_with_ts = True, frequency_unit="H",current_date=None):
        """Upload ``data`` as ``<name>.csv`` to the class bucket.

        The key is ``monitor_name/typeofdata[/YYYY/MM/DD/HH[/MM]]/name.csv``; the
        timestamp segments are added when ``prefix_dir_with_ts`` is true and the
        minute segment only when ``frequency_unit`` is ``m``/``M``.

        Returns:
            The object key on HTTP 200, otherwise ``None``.
        """
        if not current_date:
            # UTC, like every other timestamp this class produces.
            current_date = datetime.datetime.utcnow()
        # S3 keys always use "/" regardless of the local path separator.
        key_parts = [monitor_name, typeofdata]
        if prefix_dir_with_ts:
            key_parts += [
                current_date.strftime("%Y"),
                current_date.strftime("%m"),
                current_date.strftime("%d"),
                current_date.strftime("%H"),
            ]
            if frequency_unit.lower() == "m":
                key_parts.append(current_date.strftime("%M"))
        key_parts.append(name + ".csv")
        file_path = "/".join(key_parts)

        with io.StringIO() as csv_buffer:
            data.to_csv(csv_buffer, index=False)
            response = cls.S3_CLIENT.put_object(
                Bucket=cls.BUCKET, Key=file_path, Body=csv_buffer.getvalue()
            )
        status = response.get("ResponseMetadata", {}).get("HTTPStatusCode")
        if status == 200:
            print(f"Successful S3 put_object response. Status - {status}")
            return file_path
        print(f"Unsuccessful S3 put_object response. Status - {status}")
        return None

    @staticmethod
    def save_dataset_to_local(data, name, monitor_name,typeofdata, current_date=None):
        """Write ``data`` to ``<base>/monitor_name/typeofdata/YYYY/MM/DD/HH/name.csv``.

        ``<base>`` is the directory of this file when ``__file__`` exists and the
        current working directory otherwise (e.g. inside a notebook).

        Returns:
            The path of the written CSV file.
        """
        try:
            base_dir = os.path.dirname(os.path.realpath(__file__))
        except NameError:
            # __file__ is not defined when running in a notebook kernel.
            base_dir = os.getcwd()
        if not current_date:
            current_date = datetime.datetime.utcnow()
        data_dir = os.path.join(
            base_dir,
            monitor_name,
            typeofdata,
            current_date.strftime("%Y"),
            current_date.strftime("%m"),
            current_date.strftime("%d"),
            current_date.strftime("%H"),
        )
        os.makedirs(data_dir, exist_ok=True)
        file_path = os.path.join(data_dir, name + ".csv")
        data.to_csv(file_path, index=False)
        return file_path

    @classmethod
    def save_dataset_to_sql(cls, data, tablename):
        """Append ``data`` to ``tablename`` through the class-level SQL engine."""
        data.to_sql(tablename, cls.DB_ENGINE, if_exists="append", index=False)

    def save_dataset(self ,data, data_name:str, config: DatasetSource, current_date=None):
        """Write ``data`` to the destination selected by ``self.dataset_source``.

        Returns:
            The S3 key or local path for the S3 / local destinations, ``None``
            for SQL (and for a failed S3 upload).
        """
        klass = type(self)
        if self.dataset_source == DataSource.AWS_S3:
            return klass.save_dataset_to_s3(data, config.model_monitor, data_name, config.data_class, config.add_prefix_ts, config.frequency_unit, current_date)
        if self.dataset_source == DataSource.SQL:
            klass.save_dataset_to_sql(data, config.table)
            return None
        if self.dataset_source == DataSource.LOCAL:
            return klass.save_dataset_to_local(data, data_name, config.model_monitor, config.data_class, current_date)
        return None

    @property
    def frequency_ts(self):
        """Seconds to wait until ``margin`` seconds before the next interval boundary.

        For hours the boundary is the top of the hour ``value`` hours ahead; for
        minutes it is the next minute that is a multiple of ``value``. When the
        computed wait is not positive or exceeds one full interval, one full
        interval is returned instead.
        """
        value = int(self.frequency[:-1])
        unit = self.frequency[-1].lower()
        seconds_per_unit = {"m": 60, "h": 3600}
        if unit not in seconds_per_unit:
            raise ValueError(f"Unsupported frequency unit: {unit}")
        period_seconds = value * seconds_per_unit[unit]
        now = datetime.datetime.utcnow()

        if unit == "h":
            boundary = (now + datetime.timedelta(hours=value)).replace(minute=0, second=0, microsecond=0)
        else:
            # `x % -value` lies in (-value, 0], so abs() is the number of minutes
            # to the next multiple of `value`; 0 means we are on a boundary now.
            minutes_ahead = abs(now.minute % -value) or value
            boundary = (now + datetime.timedelta(minutes=minutes_ahead)).replace(second=0, microsecond=0)

        # total_seconds() keeps the sign and the day part, unlike `.seconds`.
        remaining = int((boundary - datetime.timedelta(seconds=self.margin) - now).total_seconds())
        result = remaining if 0 < remaining <= period_seconds else period_seconds
        print(f"Next Push after {datetime.timedelta(seconds=result)}")
        return result

    @property
    def awsS3Secret(self):
        """AWS credentials as ``{"access_key", "secret_key"}``, or ``None`` if not found.

        Looks at the environment first, then at the ``[default]`` section of
        ``$HOME/.aws/credentials``.
        """
        access_key = os.getenv("AWS_ACCESS_KEY_ID", "")
        # AWS_SECRET_ACCESS_KEY is the standard name (see the class comment);
        # the name this code used before is still honoured as a fallback.
        secret_key = os.getenv("AWS_SECRET_ACCESS_KEY") or os.getenv("AWS_ACCESS_SECRET_KEY", "")
        if access_key and secret_key:
            return {"access_key": access_key, "secret_key": secret_key}

        home_dir = os.getenv("HOME")
        if not home_dir:
            return None
        creds_path = os.path.join(home_dir, ".aws", "credentials")
        if not os.path.isfile(creds_path):
            return None
        config = ConfigParser()
        config.read(creds_path)
        if "default" not in config:
            return None
        access_key = config.get("default", "aws_access_key_id", fallback="")
        secret_key = config.get("default", "aws_secret_access_key", fallback="")
        if access_key and secret_key:
            return {"access_key": access_key, "secret_key": secret_key}
        return None

    def create_aws_dkube_dataset(self, path,ds_class):
        """Register ``<monitor_name>-<ds_class>`` as an S3-backed DKube dataset if it is missing.

        A failing ``get_repo`` lookup is taken to mean the dataset does not
        exist yet. Nothing is created when no AWS credentials are found.
        """
        klass = type(self)
        dataset_name = f"{self.monitor_name}-{ds_class}"
        try:
            klass.API_CLIENT.get_repo("dataset", klass.USERNAME, dataset_name)
        except Exception:
            ds = DkubeDataset(klass.USERNAME, dataset_name, remote=True)
            ds.update_dataset_source('aws_s3')
            secret = self.awsS3Secret
            if secret:
                ds.update_awss3_details(klass.BUCKET, path, secret["access_key"], secret["secret_key"])
                klass.API_CLIENT.create_dataset(ds)
            else:
                print(f"{dataset_name} dataset not created: no AWS credentials found")
        else:
            print(f"{self.monitor_name}-{ds_class} dataset already existing")

    @property
    def end(self):
        """``start_time`` plus ``duration`` (``"H[:M[:S]]"``; missing parts count as 0)."""
        parts = self.duration.split(":")
        parts += ["0"] * (3 - len(parts))
        return self.start_time + datetime.timedelta(
            hours=int(parts[0]), minutes=int(parts[1]), seconds=int(parts[2])
        )

    @property
    def reference_df(self):
        """Reference DataFrame, downloaded from S3 on first access and then cached."""
        # `is None`, not `== None`: comparing a DataFrame with == is element-wise
        # and its truth value raises ValueError.
        if self.__reference_df is None:
            self.__reference_df = self.get_df_from_s3(self.data_reference_s3path)
        return self.__reference_df

    @classmethod
    def get_df_from_s3(cls, path):
        """Read the CSV object ``path`` from the class bucket; ``None`` unless HTTP 200."""
        response = cls.S3_CLIENT.get_object(Bucket=cls.BUCKET, Key=path)
        status = response.get("ResponseMetadata", {}).get("HTTPStatusCode")
        if status == 200:
            print(f"Successful S3 get_object response. Status - {status}")
            return pd.read_csv(response.get("Body"))
        print(f"Unsuccessful S3 get_object response. Status - {status}")
        return None

    @property
    def train_df(self):
        """Reference data concatenated with an augmented copy of itself.

        For every row, half of the feature columns (chosen at random) are
        replaced by the values of randomly picked rows; the augmented rows get
        ``charges`` raised by 3%. Recomputed on every access.
        """
        data = self.reference_df
        train = data.drop(["charges"], axis=1)
        y = data["charges"]
        n_features = train.shape[1]
        a = np.arange(0, n_features)
        train_aug = pd.DataFrame(
            index=train.index, columns=train.columns, dtype="float64"
        )

        # Loop-invariant: how many features are swapped per row.
        AUG_FEATURE_RATIO = 0.5
        AUG_FEATURE_COUNT = np.floor(n_features * AUG_FEATURE_RATIO).astype("int16")

        for i in tqdm(range(0, len(train))):
            aug_feature_index = np.random.choice(
                n_features, AUG_FEATURE_COUNT, replace=False
            )
            aug_feature_index.sort()

            # Columns that keep the row's own values.
            feature_index = np.where(np.logical_not(np.isin(a, aug_feature_index)))[0]
            train_aug.iloc[i, feature_index] = train.iloc[i, feature_index]

            rand_row_index = np.random.choice(
                len(train), len(aug_feature_index), replace=True
            )
            for n, j in enumerate(aug_feature_index):
                train_aug.iloc[i, j] = train.iloc[rand_row_index[n], j]

        train_aug["charges"] = y + y * 0.03
        return pd.concat([data, train_aug])

    def train_test_split(self, test_size = 0.1):
        """Split the augmented data into a training and a prediction frame.

        The ``sex``, ``smoker`` and ``region`` columns are label-encoded when
        they hold strings, and every row gets a ``unique_id``.

        Returns:
            ``(train_dataset, predict_data)``, both with a fresh 0..n-1 index.
        """
        train_all = self.train_df
        # Encode before splitting so both halves share one label mapping even
        # if a category happens to be missing from one of them.
        for col in ["sex", "smoker", "region"]:
            if train_all[col].dtype == "object":
                le = skpreprocessing.LabelEncoder()
                le = le.fit(train_all[col])
                train_all[col] = le.transform(train_all[col])
                print("Completed Label encoding on", col)

        # Inside the method body this name is the module-level sklearn function.
        train_dataset, predict_data = train_test_split(train_all, test_size=test_size,random_state=self.n_predict_datasets)
        train_dataset = train_dataset.reset_index(drop=True)
        predict_data = predict_data.reset_index(drop=True)

        train_dataset["unique_id"] = [uuid.uuid4() for _ in range(len(train_dataset))]
        predict_data["unique_id"] = [uuid.uuid4() for _ in range(len(predict_data))]
        return train_dataset, predict_data

    def sample_predict_data(self, predict_data = None,train_test_split=0.1):
        """Yield ``n_predict_datasets`` equal-sized chunks of ``predict_data`` with predicted ``charges``.

        Args:
            predict_data: Frame to chunk; produced by ``self.train_test_split``
                when omitted. The caller's frame is not modified.
            train_test_split: Test fraction used when ``predict_data`` is omitted.

        Yields:
            ``{"name": "<i>_predict_data", "df": DataFrame}`` for i = 1..n.
        """
        if predict_data is None:
            _, predict_data = self.train_test_split(train_test_split)
        predict_data = predict_data.copy()
        n_predict_rows = predict_data.shape[0] // self.n_predict_datasets

        # NOTE(review): this column order differs from self.input_features, which
        # generate_all_predict uses — confirm which order the model was trained on.
        input_features = ["age","sex","bmi","region","children","smoker"]
        predict_data['charges'] = self.model.predict(predict_data[input_features])
        for i in range(1, self.n_predict_datasets + 1):
            first = (i - 1) * n_predict_rows
            # Copy so that consumers can modify a chunk without touching the others.
            pred_data = predict_data.iloc[first : first + n_predict_rows, :].copy()
            yield {"name": str(i) + "_predict_data", "df": pred_data}

    @staticmethod
    def _to_groundtruth(predicted):
        """Return a new frame where ``charges`` is replaced by ``GT_target`` (charges + 5%)."""
        charges = predicted["charges"]
        return predicted.drop(["charges"], axis=1).assign(GT_target=charges + charges * 0.05)

    def generate_groundtruth_samples(self, predict_data = None,train_test_split=0.1):
        """Yield at most ``n_groundtruth_datasets`` ground-truth datasets built from ``sample_predict_data``."""
        for i, data in enumerate(self.sample_predict_data(predict_data, train_test_split)):
            if i >= self.n_groundtruth_datasets:
                return
            yield {"name": str(i+1) + "_GTpredict_data", "df": self._to_groundtruth(data["df"])}

    def generate_gt_samples(self,predict_data):
        """Yield at most ``n_groundtruth_datasets`` ground-truth datasets for already generated predictions.

        Args:
            predict_data: Iterable of ``{"name", "df"}`` dicts whose frames have
                a ``charges`` column. The frames are left unchanged.
        """
        for i, data in enumerate(predict_data):
            if i >= self.n_groundtruth_datasets:
                return
            yield {"name": str(i+1) + "_GTpredict_data", "df": self._to_groundtruth(data["df"])}

    def generate_all_predict(self): # All features will not have drift
        """Yield ``n_predict_datasets`` datasets sampled from the training data, with predictions.

        Each dataset has 80-99 rows, a ``uuid`` per row and timestamps spread
        over ``model_frequency`` minutes starting at ``predict_start``.
        """
        for i in range(1, self.n_predict_datasets + 1):
            no_of_samples = np.random.randint(80,100)
            predict_df = self.train_data[self.input_features].sample(no_of_samples)
            predict_df["charges"] = self.model.predict(predict_df[self.input_features])
            start = self.predict_start
            end = start + datetime.timedelta(minutes=self.model_frequency)
            predict_df["uuid"] = [str(uuid.uuid4()) for _ in range(no_of_samples)]
            predict_df["timestamp"] = pd.date_range(start, end, no_of_samples)
            yield {"name": str(i) + "_predict_data", "df": predict_df}

    def generate_all_drift(self): ## All features will have drift
        """Yield ``n_drift_datasets`` datasets drawn under a seed picked from ``drift_seeds``.

        Ages come from the top 15 years of the training range and ``sex`` is a
        single value per dataset. The global NumPy random state is restored
        after each dataset.
        """
        for i in range(1, self.n_drift_datasets + 1):
            # Draw the seed BEFORE saving the state: restoring a state captured
            # before the draw would replay the same draw on every iteration.
            seed = np.random.choice(self.drift_seeds)
            state = np.random.get_state()
            np.random.seed(seed)
            try:
                start = self.drift_start
                end = start + datetime.timedelta(minutes=self.model_frequency)
                no_of_samples = np.random.randint(80,100)
                drift_df = pd.DataFrame({
                    'age' : np.random.randint(self.age_max-15,self.age_max,no_of_samples).tolist(),
                    'sex' : np.repeat([np.random.choice(self.sex_values)], no_of_samples).tolist(),
                    'bmi' : np.random.uniform(self.bmi_min, self.bmi_max,no_of_samples).tolist(),
                    'children' : np.random.choice(self.children_values, no_of_samples).tolist(),
                    'smoker' : np.random.choice(self.smoker_values, no_of_samples).tolist(),
                    'region' : np.random.choice(self.region_values, no_of_samples).tolist(),
                    'uuid' : [str(uuid.uuid4()) for _ in range(no_of_samples)],
                    'timestamp': pd.date_range(start, end, no_of_samples)
                })
                drift_df["charges"] = self.model.predict(drift_df[self.input_features])
            finally:
                np.random.set_state(state)
            yield {"name": str(i) + "_drifted_data", "df": drift_df}

    def generate_random_drift(self): # Some features will have drift
        """Yield ``n_drift_datasets`` datasets with features drawn uniformly from the training ranges.

        ``children`` is drawn from the first four observed values only.
        """
        for i in range(1, self.n_drift_datasets + 1):
            start = self.drift_start
            end = start + datetime.timedelta(minutes=self.model_frequency)
            no_of_samples = np.random.randint(80,100)
            predict_df = pd.DataFrame({
                'age' : np.random.randint(self.age_min,self.age_max,no_of_samples).tolist(),
                'sex' : np.random.choice(self.sex_values, no_of_samples).tolist(),
                'bmi' : np.random.uniform(self.bmi_min, self.bmi_max,no_of_samples).tolist(),
                'children' : np.random.choice(self.children_values[:4], no_of_samples).tolist(),
                'smoker' : np.random.choice(self.smoker_values, no_of_samples).tolist(),
                'region' : np.random.choice(self.region_values, no_of_samples).tolist(),
                'uuid' : [str(uuid.uuid4()) for _ in range(no_of_samples)],
                'timestamp': pd.date_range(start, end, no_of_samples)
            })
            predict_df["charges"] = self.model.predict(predict_df[self.input_features])
            yield {"name": str(i) + "_drifted_data", "df": predict_df}

    def generate_drift_datasets(self, predict_data = None,train_test_split=0.1):
        """Yield at most ``n_drift_datasets`` drifted variants of the ``sample_predict_data`` chunks.

        Even-numbered chunks get random increments on ``age`` and/or ``bmi`` in
        every row. Odd-numbered chunks have ``sex`` and ``region`` overwritten
        with one value for the whole column and the other value in one random
        row, repeated a random number of times.
        """
        region = ["southeast", "northwest"]
        sex = ["male","female"]
        for j, data in enumerate(self.sample_predict_data(predict_data,train_test_split)):
            if j >= self.n_drift_datasets:
                return
            drifted_data = data["df"].copy()
            n_rows = len(drifted_data)
            # Positional column indices: single-step .iloc[row, col] assignment
            # is guaranteed to write into the frame, chained indexing is not.
            if j % 2 == 0:
                age_col = drifted_data.columns.get_loc("age")
                bmi_col = drifted_data.columns.get_loc("bmi")
                for i in range(n_rows):
                    rndm = random.randint(0,2)
                    if rndm in (0, 2):
                        drifted_data.iloc[i, age_col] += random.randint(15,80)
                    if rndm in (1, 2):
                        drifted_data.iloc[i, bmi_col] += random.randint(15,50)
            elif n_rows > 0:
                sex_col = drifted_data.columns.get_loc("sex")
                region_col = drifted_data.columns.get_loc("region")
                random_rows_count = random.randint(0, n_rows - 1)
                for _ in range(random_rows_count):
                    random_index = random.randint(0, n_rows - 1)
                    random_gender_idx = random.randint(0,1)
                    drifted_data["sex"] = sex[random_gender_idx]
                    # One row gets the opposite value.
                    drifted_data.iloc[random_index, sex_col] = sex[1 - random_gender_idx]
                    random_index = random.randint(0, n_rows - 1)
                    random_region_index = random.randint(0,1)
                    drifted_data["region"] = region[random_region_index]
                    drifted_data.iloc[random_index, region_col] = region[1 - random_region_index]
            yield {"name": str(j+1) + "_drifted_data", "df": drifted_data}

In [None]:
# Build the generator with the settings from the configuration cell.
generator = InsuranceDataGenerator(REFERENCE_DATA_S3_PATH,
                                   MONITOR_NAME,
                                   DATASET_SAMPLES,
                                   DATASET_SAMPLES,
                                   DATASET_SAMPLES,
                                   frequency=FREQUENCY,
                                   model_frequency = MODEL_FREQUENCY,
                                   db_config = DBCONFIG(
                                       hostname=HOSTNAME,
                                       databasename = DATABASE_NAME,
                                       username = USERNAME,
                                       password = PASSWORD),
                                   dataset_source = DATASET_SOURCE)

# Where prediction (and drifted) batches are written.
predict_dataset_source = DatasetSource(model_monitor=MONITOR_NAME,
                                       table=PREDICT_DATASET_TABLE, 
                                       frequency_unit = generator.frequency[-1],
                                       data_class=PREDICT_DATA_CLASS,
                                       add_prefix_ts=PREFIX_PREDICT_DATASET_WITH_TS)
# Where ground-truth batches are written.
ground_dataset_source = DatasetSource(model_monitor=MONITOR_NAME,
                                      table=LABELLED_DATASET_TABLE,
                                      data_class=LABELLED_DATA_CLASS,
                                      frequency_unit = generator.frequency[-1],
                                      add_prefix_ts=PREFIX_LABELLED_DATASET_WITH_TS )

# NOTE(review): trainds/testds are only consumed by the commented-out lines
# here and in the next cell; this call reads the reference data from S3.
train_predict = generator.train_test_split()
trainds, testds = train_predict
#predict_samples = list(generator.sample_predict_data(testds))
#groundtruth_samples= list(generator.generate_groundtruth_samples(testds))
#drift_datasets = list(generator.generate_drift_datasets(testds))

# prediction data with no drift
predict_samples = list(generator.generate_all_predict())
groundtruth_samples= list(generator.generate_gt_samples(predict_samples))

## Random drift or all drift
# Pick the generator *method* first and call only the chosen one, instead of
# handing a list of generator objects to np.random.choice (which has to coerce
# them into an ndarray).
drift_generator = random.choice([generator.generate_random_drift, generator.generate_all_drift])
drift_datasets = list(drift_generator())


In [None]:
# Save Training Data
# training_path = InsuranceDataGenerator.save_dataset_to_s3(trainds, generator.monitor_name,"training","training",False)
# generator.create_aws_dkube_dataset(training_path,"training")
def _random_timestamps(count, window_start, window_end):
    """Return ``count`` random timestamps between ``window_start`` and ``window_end``."""
    return [
        randomtimestamp(start=window_start, end=window_end, pattern="'%Y-%m-%d %H:%M:%S.%f'")
        for _ in range(count)
    ]


drift_path = []
predict_path = []
groundtruth_path = []
for i, data in enumerate(predict_samples):
    # Wait until shortly before the next interval boundary, then push.
    second_remaining = generator.frequency_ts
    time.sleep(second_remaining)
    pushed_date = datetime.datetime.utcnow()
    window_start = pushed_date - datetime.timedelta(seconds=second_remaining)
    # Fraction of each batch that is actually pushed.
    sample_count = round(random.uniform(0.5 ,0.9),2)

    # Odd iterations push the regular prediction batch, even ones a drifted batch.
    if i%2:
        batch, saved_paths = data, predict_path
    else:
        batch, saved_paths = drift_datasets[i], drift_path
    p_ts = _random_timestamps(len(batch["df"]), window_start, pushed_date)
    batch["df"]["timestamp"] = p_ts
    # Always pass pushed_date so both kinds of batch land under the same
    # (UTC) time prefix as the ground truth pushed below.
    batch_path = generator.save_dataset(batch["df"].sample(frac=sample_count), batch["name"], predict_dataset_source, pushed_date)
    if len(saved_paths) == 0:
        generator.create_aws_dkube_dataset(f"{generator.monitor_name}/{predict_dataset_source.data_class}", predict_dataset_source.data_class)
    if batch_path:
        saved_paths.append(batch_path)

    # NOTE(review): on even iterations the pushed batch is a drifted dataset,
    # while the ground truth below is still derived from predict_samples[i]
    # (different rows / uuids) — confirm this is intended.
    gt_df = groundtruth_samples[i]["df"]
    # Reuse the batch timestamps where possible; the drifted batch can have
    # fewer rows than the ground-truth frame, so top up with fresh timestamps
    # rather than assigning a list of the wrong length.
    gt_ts = p_ts[:len(gt_df)]
    gt_ts = gt_ts + _random_timestamps(len(gt_df) - len(gt_ts), window_start, pushed_date)
    gt_df["timestamp"] = gt_ts
    g_path = generator.save_dataset(gt_df.sample(frac=sample_count), groundtruth_samples[i]["name"],ground_dataset_source, pushed_date)
    if len(groundtruth_path) == 0:
        generator.create_aws_dkube_dataset(f"{generator.monitor_name}/{ground_dataset_source.data_class}", ground_dataset_source.data_class)
    if g_path:
        groundtruth_path.append(g_path)
