### Defined by User

In [None]:
# Reload variables persisted with IPython's %store magic.
# NOTE(review): names used later but never assigned in this notebook
# (MONITOR_NAME, DATA_SOURCE, DKUBE_URL, DKUBEUSERNAME, TOKEN, ...) presumably
# come from this restore — confirm against the notebook that stores them.
%store -r
# Interval between dataset pushes, written as "<integer><unit>" where unit is
# h or m (case-insensitive); TitanicDataGenerator.__init__ validates the format.
FREQUENCY = "5m"

In [None]:
import os

# Guarantee that every credential / connection variable exists in the process
# environment so later os.getenv() lookups return a string instead of None.
# Values that are already set are left untouched; replace the empty default
# here when a value has to be supplied from the notebook itself.
for _env_name in (
    "AWS_ACCESS_KEY_ID",
    "AWS_SECRET_ACCESS_KEY",
    "DBHOSTNAME",
    "DBUSERNAME",
    "DATABASENAME",
    "DBPASSWORD",
):
    os.environ.setdefault(_env_name, "")

### DATA GENERATION

In [None]:
import logging
from os.path import exists

HOME = os.getenv("HOME")
if HOME:
    # `pip install --user` puts console scripts in ~/.local/bin; make sure the
    # shell commands launched from this notebook can find them.
    EXECUTABLE_DIR = os.path.join(HOME, ".local", "bin")
    # PATH may be unset; treat that as empty instead of failing on `in None`.
    PATH = os.getenv("PATH") or ""
    # Compare whole PATH entries: a substring test would also match
    # look-alike directories such as ~/.local/bin2.
    if EXECUTABLE_DIR not in PATH.split(os.pathsep):
        os.environ["PATH"] = os.pathsep.join(
            entry for entry in (PATH, EXECUTABLE_DIR) if entry
        )
    PATH = os.getenv("PATH")
# Bucket used by the S3 helpers when the caller did not configure one.
if not os.getenv("AWS_BUCKET"):
    os.environ["AWS_BUCKET"] = "mm-workflow"

In [None]:
import sys
# Install the pipeline and database client packages with the interpreter that
# runs this kernel. Output of the kfp install is redirected to /dev/null, so a
# failure of that command is not shown in the notebook.
!{sys.executable} -m pip install kfp==1.4.0 kfp-server-api==1.2.0 --user &> /dev/null
!{sys.executable} -m pip install pymysql --user
# NOTE(review): unlike the two installs above, this one omits --user — confirm that is intended.
!{sys.executable} -m pip install Flask-SQLAlchemy
#!{sys.executable} -m pip install --upgrade pyodbc --user

In [None]:
if HOME:
    # Packages installed with `pip install --user` land in a directory named
    # after the *running* interpreter's version, so derive that directory
    # instead of hard-coding python3.6.
    USR_LOCAL_LIB_PATH = os.path.join(
        HOME,
        ".local",
        "lib",
        f"python{sys.version_info.major}.{sys.version_info.minor}",
        "site-packages",
    )
    if USR_LOCAL_LIB_PATH not in sys.path:
        sys.path.append(USR_LOCAL_LIB_PATH)

In [None]:
import json
import kfp
import kfp.dsl as dsl
import kfp.compiler as compiler
from kfp import components
import tensorflow as tf
from kubernetes import client as k8s_client
import os
import json
import kfp
import requests
import string
import random
import kfp.dsl as dsl
import kfp.compiler as compiler
from kfp import components
from kubernetes import client as k8s_client

from dkube.sdk.api import DkubeApi
from dkube.sdk.rsrcs import DkubeCode
from dkube.sdk.rsrcs import DkubeDataset
from dkube.sdk.rsrcs import DkubeModel


## Dependencies for data generator 
import numpy as np
import pandas as pd
from configparser import ConfigParser
import numpy.random,argparse,uuid
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import boto3
import time
import joblib
from sklearn import preprocessing as skpreprocessing
import datetime
from tqdm import tqdm_notebook as tqdm
import io
import re
from enum import Enum
from collections import namedtuple
from joblib import dump, load
from sqlalchemy import create_engine
#import pyodbc

In [None]:
class DBCONFIG:
    """Connection settings for a MySQL database accessed through PyMySQL.

    Both ``str()`` and ``repr()`` render the SQLAlchemy connection URL.
    NOTE: that URL contains the password in clear text.
    """

    def __init__(self, hostname, databasename, username, password):
        self.hostname, self.databasename = hostname, databasename
        self.username, self.password = username, password

    def __repr__(self):
        return f"mysql+pymysql://{self.username}:{self.password}@{self.hostname}/{self.databasename}"

    def __str__(self):
        # Same text as repr(); kept as its own method so str() stays explicit.
        return f"mysql+pymysql://{self.username}:{self.password}@{self.hostname}/{self.databasename}"

class DataSource(Enum):
    """Storage backends that TitanicDataGenerator.save_dataset can write to."""
    # Written as CSV files via save_dataset_to_local.
    LOCAL = "local"
    # Uploaded as CSV objects via save_dataset_to_s3.
    AWS_S3 = "aws_s3"
    # Written to a database table via save_dataset_to_sql.
    SQL = "sql"
    

# Per-dataset destination settings consumed by TitanicDataGenerator.save_dataset:
# monitor name, SQL table, frequency unit, data class and the timestamp-prefix flag.
DatasetSource = namedtuple(
    'DatasetSource',
    ['model_monitor', 'table', 'frequency_unit', 'data_class', 'add_prefix_ts'],
)

In [None]:
# CSV with the reference (training) data; handed to the generator as data_reference_s3path.
REFERENCE_DATA_S3_PATH = "https://dkube-examples-data.s3.us-west-2.amazonaws.com/monitoring-titanic/training-data/titanic.csv"

# Handed to the generator as model_frequency.
MODEL_FREQUENCY = 4

# Number of predict / groundtruth / drift datasets requested from the generator.
DATASET_SAMPLES  = 6

# Table names used when the data source is SQL.
PREDICT_DATASET_TABLE   = "titanic_predict"
LABELLED_DATASET_TABLE  = "titanic_gt"

PREDICT_DATA_CLASS = "predict" # used for s3
LABELLED_DATA_CLASS = "groundtruth" #used for s3

# Whether the storage path gets a year/month/day/hour(/minute) prefix.
PREFIX_PREDICT_DATASET_WITH_TS = True
PREFIX_LABELLED_DATASET_WITH_TS = False

## Resolve DATA_SOURCE ("local", "aws_s3" or "sql") into a DataSource member.
## Any other value falls back to DataSource.LOCAL with a warning, so that
## DATASET_SOURCE is always defined for the cells below.
_DATA_SOURCES_BY_NAME = {source.value: source for source in DataSource}
DATASET_SOURCE = _DATA_SOURCES_BY_NAME.get(DATA_SOURCE)
if DATASET_SOURCE is None:
    logging.warning("Unknown DATA_SOURCE %r; falling back to 'local'", DATA_SOURCE)
    DATASET_SOURCE = DataSource.LOCAL

if DATASET_SOURCE is DataSource.SQL:
    DBHOSTNAME = os.getenv("DBHOSTNAME")
    # NOTE(review): the generator cell reads DATABASENAME and DBPASSWORD, not
    # DATABASE_NAME / PASSWORD — those names are presumably restored by
    # `%store -r`; confirm, or align the names.
    DATABASE_NAME = os.getenv("DATABASENAME")
    DBUSERNAME = os.getenv("DBUSERNAME")
    PASSWORD = os.getenv("DBPASSWORD")

# MODEL FOR PREDICTION
model = tf.keras.models.load_model('model/')

In [None]:
class TitanicDataGenerator:
    """Generate Titanic predict, drift and groundtruth datasets and store them.

    Datasets are sampled or synthesised from the training data, labelled with
    the loaded Keras model and written to local disk, S3 or SQL depending on
    ``dataset_source``. The clients below are class attributes, created by the
    first instance and then reused by every later one.
    """
    # With no parameters or configuration, boto3 will look for
    # access keys in these places:
    # 1. Environment variables (AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY)
    # 2. Credentials file (~/.aws/credentials or
    #      C:\Users\USER_NAME\.aws\credentials)
    # 3. AWS IAM role for Amazon EC2 instance
    #    (http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/iam-roles-for-amazon-ec2.html)

    #    Define a ~/.aws/credentials file as following
    #    [default]
    #    aws_access_key_id=foo
    #    aws_secret_access_key=bar
    #    aws_session_token=baz # might not be required
    # S3 bucket name, read from the AWS_BUCKET environment variable in __init__.
    BUCKET = None
    # boto3 S3 client used by save_dataset_to_s3 / get_df_from_s3.
    S3_CLIENT = None
    # SQLAlchemy engine built from db_config; stays None when no db_config is given.
    DB_ENGINE = None
    # DkubeApi client used to look up dataset versions for local storage.
    API_CLIENT = None
    # Access token passed to DkubeApi.
    TOKEN = None
    # User name passed to get_dataset_versions.
    USERNAME = None

    def __init__(
        self,
        data_reference_s3path,
        monitor_name,
        n_predict_datasets: int = 1,
        n_groundtruth_datasets: int = 1,
        n_drift_datasets: int = 1,
        start_time: datetime.datetime = None,
        frequency="1H",
        model_frequency=10,
        duration: str = "10:24:12",
        margin=20,
        db_config:DBCONFIG = None,
        dataset_source: DataSource = DataSource.AWS_S3
    ):
        """Validate the settings, load training data and model, set up clients.

        Args:
            data_reference_s3path: location of the reference data; stored on
                the instance.
            monitor_name: model monitor name; stored on the instance.
            n_predict_datasets: number of predict datasets to generate.
            n_groundtruth_datasets: number of groundtruth datasets; must not
                exceed ``n_predict_datasets``.
            n_drift_datasets: number of drift datasets; must not exceed
                ``n_predict_datasets``.
            start_time: reference time for ``end``; defaults to the current
                UTC time.
            frequency: push interval as ``<int><unit>`` with unit h or m
                (case-insensitive).
            model_frequency: stored on the instance.
            duration: ``"H:M:S"`` string consumed by ``end``.
            margin: seconds subtracted from the next boundary in
                ``frequency_ts``.
            db_config: when given, a SQLAlchemy engine is created from it.
            dataset_source: storage backend used by ``save_dataset``.

        Raises:
            ValueError: if ``frequency`` has the wrong format.
            Exception: if more groundtruth or drift datasets than predict
                datasets are requested.
        """
        if not re.fullmatch(r"^\d+[hmHM]{1}$",frequency):
            raise ValueError("frequency can have [time_value_int][time_unit] time_unit can be case case insensitive out of H, M")
        # Check the counts before the download and model load below, so bad
        # arguments fail immediately.
        if n_groundtruth_datasets > n_predict_datasets or n_drift_datasets > n_predict_datasets:
            raise Exception("GroundTruth datasets or drift_datsets cant be greater than predict datasets")

        self.n_predict_datasets = n_predict_datasets
        self.frequency  = frequency
        self.margin=margin
        self.monitor_name = monitor_name
        self.data_reference_s3path = data_reference_s3path
        # Was assigned from n_drift_datasets, which ignored the argument.
        self.n_groundtruth_datasets = n_groundtruth_datasets
        self.n_drift_datasets = n_drift_datasets
        self.dataset_source = dataset_source
        self.start_time = start_time if start_time else datetime.datetime.utcnow()
        self.drift_seeds = [772, 775, 777, 779, 781, 782, 784, 785, 786, 788, 789, 790, 791, 792, 794, 798, 799, 800, 801, 802]
        self.input_features = ['Age','Fare','Pclass','SibSp','Parch','Sex_female','Sex_male']
        self.model_frequency = model_frequency
        self.predict_start = datetime.datetime.utcnow()
        self.drift_start = datetime.datetime.utcnow()
        self.db_config = db_config
        self.duration = duration

        # Placeholders filled in by init_train_data() / load_model().
        self.train_data = None
        self.pclass_values = None
        self.sibsp_values = None
        self.parch_values = None
        self.sex_female_values = None
        self.sex_male_values = None
        self.age_min = self.age_max = None
        self.fare_min = self.fare_max = None
        self.model = None
        self.init_train_data()
        self.load_model()

        # Shared clients live on the class and are created only once.
        klass = type(self)
        if not klass.BUCKET:
            klass.BUCKET = os.getenv("AWS_BUCKET")
        if not klass.S3_CLIENT:
            klass.S3_CLIENT = boto3.client("s3")
        if not klass.TOKEN:
            klass.TOKEN = os.getenv("DKUBE_USER_ACCESS_TOKEN",TOKEN)
        if not klass.USERNAME:
            klass.USERNAME= DKUBEUSERNAME
        if not klass.API_CLIENT:
            klass.API_CLIENT = DkubeApi(URL=os.getenv('DKUBE_URL',DKUBE_URL),token=klass.TOKEN)
        if not klass.DB_ENGINE:
            if self.db_config:
                klass.DB_ENGINE = create_engine(str(self.db_config))
 
    def init_train_data(self):
        """Build the training frame and cache the value ranges drift sampling uses.

        Stores the frame in ``self.train_data``, the min/max of Age and Fare,
        and the distinct values of the categorical columns.
        """
        frame = self.train_df
        self.train_data = frame

        age_column, fare_column = frame["Age"], frame["Fare"]
        self.age_min, self.age_max = age_column.min(), age_column.max()
        self.fare_min, self.fare_max = fare_column.min(), fare_column.max()

        distinct_value_attrs = (
            ("pclass_values", "Pclass"),
            ("sibsp_values", "SibSp"),
            ("parch_values", "Parch"),
            ("sex_female_values", "Sex_female"),
            ("sex_male_values", "Sex_male"),
        )
        for attr_name, column in distinct_value_attrs:
            setattr(self, attr_name, frame[column].unique().tolist())
    
    
    def load_model(self):
        """Load the Keras model stored under ``model/`` into ``self.model``."""
        loaded_model = tf.keras.models.load_model('model/')
        self.model = loaded_model
    
    
    @classmethod
    def save_dataset_to_s3(cls, data, monitor_name, name, typeofdata, prefix_dir_with_ts = True, frequency_unit="H",current_date=None):
        """Upload ``data`` as ``<name>.csv`` to the class bucket.

        The object key is ``monitor_name/typeofdata/[YYYY/MM/DD/HH[/MM]/]name.csv``.
        The date components are added only when ``prefix_dir_with_ts`` is true,
        and the minute component only when ``frequency_unit`` is "m"
        (case-insensitive).

        Args:
            data: DataFrame to upload (written without the index).
            monitor_name: first key component.
            name: file name without extension.
            typeofdata: second key component.
            prefix_dir_with_ts: add the date components to the key.
            frequency_unit: "h" or "m".
            current_date: datetime for the date components; defaults to the
                current UTC time, matching the UTC timestamps used elsewhere
                in this notebook.

        Returns:
            The object key when S3 answers with HTTP 200, otherwise None.
        """
        # S3 keys always use "/" — os.path.join would use the host OS separator.
        import posixpath

        if not current_date:
            current_date = datetime.datetime.utcnow()

        key_parts = [monitor_name, typeofdata]
        if prefix_dir_with_ts:
            key_parts.extend(
                current_date.strftime(fmt) for fmt in ("%Y", "%m", "%d", "%H")
            )
            if frequency_unit.lower() =="m":
                key_parts.append(current_date.strftime("%M"))
        key_parts.append(name + ".csv")
        file_path = posixpath.join(*key_parts)

        with io.StringIO() as csv_buffer:
            data.to_csv(csv_buffer, index=False)

            response = cls.S3_CLIENT.put_object(
                Bucket=cls.BUCKET, Key=file_path, Body=csv_buffer.getvalue()
            )
            status = response.get("ResponseMetadata", {}).get("HTTPStatusCode")

            if status == 200:
                print(f"Successful S3 put_object response. Status - {status}")
                return file_path
            print(f"Unsuccessful S3 put_object response. Status - {status}")
            return None
      

    
    @classmethod
    def save_dataset_to_local(cls,data, name, monitor_name,typeofdata,frequency_unit="H", current_date=None):
        """Write ``data`` as ``<name>.csv`` below the local dataset directory.

        The target is ``<HOME>/dataset/<monitor_name>-<kind>/<version uuid>/data``
        where kind is ``predict`` or ``groundtruth``. Predict data additionally
        gets a ``YYYY/MM/DD/HH[/MM]`` sub-directory and is written without the
        ``GT_target`` column.

        Args:
            data: DataFrame to write (without the index).
            name: file name without extension.
            monitor_name: monitor whose datasets are the destination.
            typeofdata: ``'predict'`` selects the predict dataset; any other
                value selects the groundtruth dataset.
            frequency_unit: "m" adds the minute sub-directory for predict data.
            current_date: datetime for the sub-directories; defaults to the
                current UTC time, matching the UTC timestamps used elsewhere
                in this notebook.

        Returns:
            Path of the written CSV file.
        """
        if not current_date:
            current_date = datetime.datetime.utcnow()

        is_predict = typeofdata=='predict'
        # Only look up the dataset that is written, using the monitor name that
        # was passed in rather than a notebook global.
        dataset_name = monitor_name + ('-predict' if is_predict else '-groundtruth')
        version_uuid = cls.API_CLIENT.get_dataset_versions(cls.USERNAME, dataset_name)[0]['version']['uuid']
        home_dir = HOME or os.path.expanduser("~")
        data_dir = home_dir+'/dataset/'+dataset_name+'/'+version_uuid+'/data'

        if is_predict:
            # Predict files must not carry the label column.
            data = data.drop(columns='GT_target', errors='ignore')
            data_dir = os.path.join(
                data_dir,
                current_date.strftime("%Y"),
                current_date.strftime("%m"),
                current_date.strftime("%d"),
                current_date.strftime("%H"),
            )
            if frequency_unit.lower() =="m":
                data_dir = os.path.join(data_dir,current_date.strftime("%M"))

        os.makedirs(data_dir, exist_ok=True)
        file_path = data_dir+'/'+name + ".csv"
        data.to_csv(file_path, index=False)
        return file_path
    
    
    
    @classmethod
    def save_dataset_to_sql(cls, data, tablename):
        """Write ``data`` to table ``tablename`` through the shared engine.

        An existing table of that name is replaced; the index is not written.
        """
        data.to_sql(name=tablename, con=cls.DB_ENGINE, index=False, if_exists="replace")
    
    def save_dataset(self ,data, data_name:str, config: DatasetSource, current_date=None):
        """Store ``data`` on the backend selected by ``self.dataset_source``.

        Returns the path or key reported by the S3 and local writers; the SQL
        writer (and an unrecognised source) yields None.
        """
        generator_cls = type(self)
        source = self.dataset_source

        if source == DataSource.AWS_S3:
            return generator_cls.save_dataset_to_s3(
                data,
                config.model_monitor,
                data_name,
                config.data_class,
                config.add_prefix_ts,
                config.frequency_unit,
                current_date,
            )
        if source == DataSource.SQL:
            generator_cls.save_dataset_to_sql(data, config.table)
            return None
        if source == DataSource.LOCAL:
            return generator_cls.save_dataset_to_local(
                data,
                data_name,
                config.model_monitor,
                config.data_class,
                config.frequency_unit,
                current_date,
            )
        return None

    @property
    def frequency_ts(self):
        """Seconds to sleep before the next push.

        The wake-up time is ``margin`` seconds before the next frequency
        boundary (the top of the hour ``value`` hours ahead, or the next
        minute that is a multiple of ``value``), measured from the current
        UTC time. When that wake-up time is not in the future, or lies more
        than one full interval away, one full interval is returned instead.

        Returns:
            int: number of seconds to wait.

        Raises:
            ValueError: if the frequency unit is neither hours nor minutes.
        """
        value = int(self.frequency[:-1])
        unit = self.frequency[-1].lower()
        now = datetime.datetime.utcnow()

        if unit == "h":
            seconds_count = value * 3600
            boundary = (now + datetime.timedelta(hours=value)).replace(
                minute=0, second=0, microsecond=0
            )
        elif unit == "m":
            seconds_count = value * 60
            # Minutes until the next multiple of `value`; a full interval when
            # the current minute already is one.
            minutes_ahead = abs(now.minute % -value) or value
            boundary = (now + datetime.timedelta(minutes=minutes_ahead)).replace(
                second=0, microsecond=0
            )
        else:
            raise ValueError(f"Unsupported frequency unit: {unit!r}")

        wake_up = boundary - datetime.timedelta(seconds=self.margin)
        # total_seconds() keeps the sign. The old `.seconds` wrapped a negative
        # delta to almost 24 hours, which one branch returned unchecked.
        second_remaining = int((wake_up - now).total_seconds())
        if second_remaining <= 0 or second_remaining > seconds_count:
            result = seconds_count
        else:
            result = second_remaining
        print(f"Next Push after {datetime.timedelta(seconds=result)}")
        return result
        

    
    @property
    def awsS3Secret(self):
        """AWS credentials as ``{"access_key": ..., "secret_key": ...}``.

        Lookup order:
          1. ``AWS_ACCESS_KEY_ID`` / ``AWS_SECRET_ACCESS_KEY`` environment
             variables, falling back to the notebook globals ``ACCESS_KEY`` /
             ``SECRET_KEY`` when a variable is unset or empty;
          2. the ``[default]`` section of ``~/.aws/credentials``.

        Returns:
            The credentials dict, or None when no complete pair is found.
        """
        # globals().get() tolerates notebooks that never defined the fallback
        # names. The key is deliberately no longer printed.
        access_key = os.getenv("AWS_ACCESS_KEY_ID") or globals().get("ACCESS_KEY")
        secret_key = os.getenv("AWS_SECRET_ACCESS_KEY") or globals().get("SECRET_KEY")
        if access_key and secret_key:
            return {"access_key":access_key, "secret_key": secret_key}

        home_dir = os.getenv("HOME")
        if not home_dir:
            return None
        creds_path = os.path.join(home_dir, ".aws","credentials")
        if not os.path.isfile(creds_path):
            return None

        config = ConfigParser()
        config.read(creds_path)
        if "default" not in config:
            return None
        # .get() returns None for a missing option instead of raising KeyError.
        access_key = config["default"].get("aws_access_key_id")
        secret_key = config["default"].get("aws_secret_access_key")
        if access_key and secret_key:
            return {"access_key":access_key, "secret_key": secret_key}
        return None
                
        
    @property
    def end(self):
        """End time: ``start_time`` plus the ``"H[:M[:S]]"`` duration.

        Missing minute or second fields count as zero.
        """
        fields = self.duration.split(":")
        # Pad to hours, minutes, seconds; a non-positive count adds nothing.
        fields.extend(["0"] * (3 - len(fields)))
        hours = int(fields[0])
        minutes = int(fields[1])
        seconds = int(fields[2])
        offset = datetime.timedelta(hours=hours, minutes=minutes, seconds=seconds)
        return self.start_time + offset


    @classmethod
    def get_df_from_s3(cls, path):
        """Fetch the CSV object ``path`` from the class bucket as a DataFrame.

        Returns None (after printing the status) when S3 does not answer
        with HTTP 200.
        """
        s3_response = cls.S3_CLIENT.get_object(Bucket=cls.BUCKET, Key=path)
        status = s3_response.get("ResponseMetadata", {}).get("HTTPStatusCode")
        if status != 200:
            print(f"Unsuccessful S3 get_object response. Status - {status}")
            return None
        print(f"Successful S3 get_object response. Status - {status}")
        return pd.read_csv(s3_response.get("Body"))
    
    @property
    def train_df(self):
        """Download the reference data and return it with an augmented copy.

        Steps: fill missing ages with the median, keep rows with Fare < 100,
        one-hot encode the categorical features, then build one augmented row
        per original row. In each augmented row half of the columns (rounded
        down, chosen at random) take their value from randomly picked rows.

        NOTE: every access downloads the CSV again and draws new random
        numbers, so two accesses return different frames.

        Returns:
            DataFrame holding the prepared rows followed by the augmented rows.
        """
        default_url = 'https://dkube-examples-data.s3.us-west-2.amazonaws.com/monitoring-titanic/training-data/titanic.csv'
        # Honour the path given to the constructor; fall back to the bundled
        # URL when the attribute is missing or empty.
        source = getattr(self, "data_reference_s3path", None) or default_url
        data = pd.read_csv(source)
        # Assign the result back: fillna(inplace=True) on a column selection is
        # chained assignment and does not reliably update the frame.
        data["Age"] = data["Age"].fillna(data["Age"].median())
        data = data[data["Fare"] < 100]
        features = ["Pclass", "Sex", "SibSp", "Parch"]
        encoded = pd.get_dummies(data[features])
        data = pd.concat([data[["Age", "Fare","Survived","PassengerId","timestamp"]], encoded], axis=1)
        train = data
        n_rows, n_cols = train.shape
        all_columns = np.arange(0, n_cols)
        train_aug = pd.DataFrame(
            index=train.index, columns=train.columns, dtype="int"
        )

        # Loop-invariant: how many columns of each row get replaced.
        AUG_FEATURE_RATIO = 0.5
        AUG_FEATURE_COUNT = np.floor(n_cols * AUG_FEATURE_RATIO).astype("int")

        for i in tqdm(range(0, n_rows)):
            # Columns whose value is taken from another, randomly chosen row.
            aug_feature_index = np.random.choice(
                n_cols, AUG_FEATURE_COUNT, replace=False
            )
            aug_feature_index.sort()

            # The remaining columns keep the value of the original row.
            feature_index = np.where(np.logical_not(np.isin(all_columns, aug_feature_index)))[0]
            train_aug.iloc[i, feature_index] = train.iloc[i, feature_index]

            rand_row_index = np.random.choice(
                n_rows, len(aug_feature_index), replace=True
            )
            for n, j in enumerate(aug_feature_index):
                train_aug.iloc[i, j] = train.iloc[rand_row_index[n], j]

        for col in ["Pclass","SibSp","Parch","Sex_male","Sex_female","Survived","PassengerId"]:
            train_aug[col] = train_aug[col].astype(int)

        train_all = pd.concat([data, train_aug])
        return train_all
    
    def generate_all_predict(self): # All features will not have drift
        """Yield ``n_predict_datasets`` batches sampled from the training data.

        Each batch has 80-99 rows of the input features plus the model's
        thresholded prediction in ``Survived``, and is yielded as
        ``{"name": "<k>_predict_data", "df": DataFrame}`` with k starting at 1.
        """
        integer_columns = ("Survived", "Sex_female", "Sex_male")
        for dataset_no in range(1, self.n_predict_datasets + 1):
            batch_size = np.random.randint(80,100)
            predict_df = self.train_data[self.input_features].sample(batch_size)

            # Turn the model output into hard 0/1 labels at the 0.5 threshold.
            y_pred = self.model.predict(predict_df[self.input_features])
            y_pred[y_pred <= 0.5] = 0
            y_pred[y_pred > 0.5] = 1
            predict_df['Survived'] = y_pred

            for column in integer_columns:
                predict_df[column] = predict_df[column].astype(int)

            yield {"name": f"{dataset_no}_predict_data", "df": predict_df}
    
     
    def generate_gt_samples(self,predict_data, data_type):
        """Yield groundtruth datasets for at most ``n_groundtruth_datasets`` inputs.

        For ``data_type == "predict"`` the groundtruth equals the prediction:
        ``GT_target`` is a copy of ``Survived`` and the yielded frame drops
        ``Survived``. For any other ``data_type`` the first 5% of the labels
        are flipped before being stored in ``GT_target``.

        NOTE: ``GT_target`` is added to the *input* frame in place, and for
        the non-predict case the yielded frame is that same object. The local
        writer drops ``GT_target`` from predict data for that reason.

        Args:
            predict_data: sequence of ``{"name": ..., "df": DataFrame}`` items.
            data_type: ``"predict"`` or anything else (treated as drift).

        Yields:
            ``{"name": "<k>_GTpredict_data" | "<k>_GTdrift_data", "df": DataFrame}``
            with k starting at 1.
        """
        for i, data in enumerate(predict_data):
            # i is zero-based, so stop *at* the limit (the old `i > limit`
            # let one extra dataset through) and before touching the input.
            if i >= self.n_groundtruth_datasets:
                return
            gt_data = data["df"]
            if data_type == "predict":
                gt_data["GT_target"] = gt_data["Survived"]
                gt_data = gt_data.drop(["Survived"], axis=1)
                gt_name = str(i+1) + "_GTpredict_data"
            else:
                labels = gt_data["Survived"].tolist()
                flip_count = int(len(gt_data)*0.05)
                # Invert the leading 5% of labels to simulate wrong predictions.
                labels[:flip_count] = [1 if label == 0 else 0 for label in labels[:flip_count]]
                gt_data["GT_target"] = labels
                gt_name = str(i+1) + "_GTdrift_data"
            yield {"name": gt_name, "df": gt_data}
    
    
    def generate_random_drift(self):
        """Yield ``n_drift_datasets`` synthetic batches with drifted features.

        Every feature is drawn independently and uniformly from the range
        (Age, Fare) or the distinct values (other columns) seen in the
        training data, so the joint distribution differs from the training
        data. ``Survived`` holds the model's thresholded prediction.

        Yields:
            ``{"name": "<k>_drifted_data", "df": DataFrame}`` with k starting
            at 1.
        """
        # range(1, n) produced only n - 1 datasets; include the last one.
        for i in range(1, self.n_drift_datasets + 1):
            seed = np.random.choice(self.drift_seeds)
            # A private generator seeded per batch yields the same numbers as
            # np.random.seed(seed) did, without resetting the global RNG that
            # the rest of the notebook draws from.
            rng = np.random.RandomState(seed)
            no_of_samples = rng.randint(80,100)
            predict_df = pd.DataFrame({
                'Age' : rng.randint(self.age_min, self.age_max,no_of_samples).tolist(),
                'Fare' : rng.randint(self.fare_min, self.fare_max,no_of_samples).tolist(),
                'Pclass' : rng.choice(self.pclass_values, no_of_samples).tolist(),
                'SibSp' : rng.choice(self.sibsp_values, no_of_samples).tolist(),
                'Parch' : rng.choice(self.parch_values, no_of_samples).tolist(),
                'Sex_female': rng.choice(self.sex_female_values, no_of_samples).tolist(),
                'Sex_male': rng.choice(self.sex_male_values, no_of_samples).tolist(),
            })
            # Turn the model output into hard 0/1 labels at the 0.5 threshold.
            y_pred = self.model.predict(predict_df[self.input_features])
            y_pred[y_pred <= 0.5] = 0
            y_pred[y_pred > 0.5] = 1
            predict_df["Survived"] = y_pred
            predict_df["Survived"] = predict_df["Survived"].astype(int)
            drift_data_name = str(i) + "_drifted_data"
            yield {"name": drift_data_name, "df": predict_df}
 

In [None]:
# Build the generator; the same sample count is used for the predict,
# groundtruth and drift datasets.
generator = TitanicDataGenerator(
    REFERENCE_DATA_S3_PATH,
    MONITOR_NAME,
    n_predict_datasets=DATASET_SAMPLES,
    n_groundtruth_datasets=DATASET_SAMPLES,
    n_drift_datasets=DATASET_SAMPLES,
    frequency=FREQUENCY,
    model_frequency=MODEL_FREQUENCY,
    db_config=DBCONFIG(
        hostname=DBHOSTNAME,
        databasename=DATABASENAME,
        username=DBUSERNAME,
        password=DBPASSWORD,
    ),
    dataset_source=DATASET_SOURCE,
)

# Destination settings for prediction data and for labelled (groundtruth) data.
predict_dataset_source = DatasetSource(
    model_monitor=MONITOR_NAME,
    table=PREDICT_DATASET_TABLE,
    frequency_unit=generator.frequency[-1],
    data_class=PREDICT_DATA_CLASS,
    add_prefix_ts=PREFIX_PREDICT_DATASET_WITH_TS,
)
ground_dataset_source = DatasetSource(
    model_monitor=MONITOR_NAME,
    table=LABELLED_DATASET_TABLE,
    frequency_unit=generator.frequency[-1],
    data_class=LABELLED_DATA_CLASS,
    add_prefix_ts=PREFIX_LABELLED_DATASET_WITH_TS,
)

# prediction data with no drift
predict_samples = [*generator.generate_all_predict()]
groundtruth_samples = [*generator.generate_gt_samples(predict_samples, "predict")]

## drift in some features
drift_datasets = [*generator.generate_random_drift()]
drift_groundtruth_samples = [*generator.generate_gt_samples(drift_datasets, "drift")]

In [None]:
# Display how many datasets of each kind were generated.
len(predict_samples), len(groundtruth_samples), len(drift_datasets), len(drift_groundtruth_samples)

In [None]:
# Publish the reference (training) data for the backends that need it.
# generator.train_data is the frame the batches were sampled from; reading the
# train_df property again would download and randomly re-augment the data.
if DATASET_SOURCE==DataSource.AWS_S3:
    training_path = TitanicDataGenerator.save_dataset_to_s3(generator.train_data, generator.monitor_name,"training","training",False)
if DATASET_SOURCE==DataSource.SQL:
    training_path = TitanicDataGenerator.save_dataset_to_sql(generator.train_data,"titanic")

drift_path = []
predict_path = []
groundtruth_path = []
count_samples = 0
# One push per interval: odd iterations push a drift-free predict batch, even
# iterations push the drift batch with the same index.
for i, data in enumerate(predict_samples):
    second_remaining = generator.frequency_ts
    current_time = datetime.datetime.utcnow()
    time.sleep(second_remaining)
    pushed_date = datetime.datetime.utcnow()

    if i%2:
        batch, gt_batch, batch_paths = data, groundtruth_samples[i], predict_path
    else:
        batch, gt_batch, batch_paths = drift_datasets[i], drift_groundtruth_samples[i], drift_path
    batch_df = batch["df"]
    gt_df = gt_batch["df"]

    # Spread the row timestamps evenly over the interval that just elapsed and
    # continue the running PassengerId sequence.
    batch_df["timestamp"] = pd.date_range(current_time, pushed_date, len(batch_df))
    first_id = count_samples + 1
    count_samples += len(batch_df)
    batch_df["PassengerId"] = range(first_id, count_samples+1)

    # Store both files under the same push time; the predict branch used to
    # fall back to the writer's own default timestamp.
    batch_file = generator.save_dataset(batch_df, batch["name"], predict_dataset_source, pushed_date)
    if batch_file:
        batch_paths.append(batch_file)

    gt_df["timestamp"] = pd.date_range(current_time, pushed_date, len(gt_df))
    gt_df["PassengerId"] = batch_df["PassengerId"]
    g_path = generator.save_dataset(gt_df, gt_batch["name"], ground_dataset_source, pushed_date)
    if g_path:
        groundtruth_path.append(g_path)

    if DATASET_SOURCE!=DataSource.SQL:
        print(batch_file, g_path)

# NOTE(review): no logging configuration is visible in this notebook; with the
# root logger's default WARNING level this INFO message is not shown — confirm.
logging.info("***************** DATA GENERATION COMPLETED ******************************")