### Defined by User

In [None]:
# Reload every variable previously saved with IPython's %store magic.
# NOTE(review): ps_config, which the following cells read, is presumably one
# of the restored variables — confirm against the notebook that stores it.
%store -r
# Number of precomputed-score files pushed by the generation loop at the end
# of this notebook.
DATASET_SAMPLES  = 6

In [None]:
# Unpack the shared ps_config mapping into module-level constants, grouped by
# the subsystem that consumes them.

# -- Monitor and dataset settings --
MONITOR_NAME = ps_config["MONITOR_NAME"]
PRESCORE_DATASET = ps_config["PRESCORE_DATASET"]
PRECOMPUTED_DATA_SOURCE = ps_config["PRECOMPUTED_DATA_SOURCE"]
RUN_FREQUENCY = ps_config["RUN_FREQUENCY"]

# -- DKube access --
DKUBE_URL = ps_config["DKUBE_URL"]
DKUBEUSERNAME = ps_config["DKUBEUSERNAME"]
TOKEN = ps_config["TOKEN"]

# -- AWS credentials --
ACCESS_KEY = ps_config["ACCESS_KEY"]
SECRET_KEY = ps_config["SECRET_KEY"]

# -- SQL database connection --
DB_PROVIDER = ps_config["DB_PROVIDER"]
DBHOSTNAME = ps_config["DBHOSTNAME"]
DATABASENAME = ps_config["DATABASENAME"]
DBUSERNAME = ps_config["DBUSERNAME"]
DBPASSWORD = ps_config["DBPASSWORD"]

In [None]:
import os

# Mirror the configured credentials into the process environment so that
# libraries reading environment variables pick them up.  A variable is
# (re)written when it is unset/empty or differs from the configured value.
for _env_name, _configured in (
    ("AWS_ACCESS_KEY_ID", ACCESS_KEY),
    ("AWS_SECRET_ACCESS_KEY", SECRET_KEY),
    ("DBHOSTNAME", DBHOSTNAME),
    ("DBUSERNAME", DBUSERNAME),
    ("DATABASENAME", DATABASENAME),
    ("DBPASSWORD", DBPASSWORD),
):
    _current = os.getenv(_env_name)
    if not _current or _configured != _current:
        os.environ[_env_name] = _configured

### DATA GENERATION

In [None]:
# Make sure executables installed with ``pip install --user`` are reachable.
HOME = os.getenv("HOME")
if HOME:
    EXECUTABLE_DIR = os.path.join(HOME, ".local", "bin")
    # PATH may be missing from the environment; treat that as empty instead
    # of failing on ``in None``.
    PATH = os.getenv("PATH") or ""
    # Compare against whole PATH entries: a plain substring test would also
    # match e.g. "<HOME>/.local/bin2" and skip the update.
    if EXECUTABLE_DIR not in PATH.split(os.pathsep):
        os.environ["PATH"] = (
            f"{PATH}{os.pathsep}{EXECUTABLE_DIR}" if PATH else EXECUTABLE_DIR
        )
    PATH = os.getenv("PATH")

# Bucket used for S3 pushes when the environment does not name one.
if not os.getenv("AWS_BUCKET"):
    os.environ["AWS_BUCKET"] = "mm-workflow"

In [None]:
# System-level ODBC driver setup, only needed when the database is SQL Server.
# Every line is a shell command run through IPython's "!" escape and uses sudo.
if DB_PROVIDER == "mssql":
    # Register Microsoft's package signing key with apt.
    !sudo curl https://packages.microsoft.com/keys/microsoft.asc | sudo apt-key add -
    # Download the apt source list matching this machine's DISTRIB_RELEASE
    # (read from /etc/lsb-release) to a temporary file ...
    !curl https://packages.microsoft.com/config/ubuntu/$(cat /etc/lsb-release | grep DISTRIB_RELEASE | cut -f2 -d'=')/prod.list > /tmp/mssql-release.list
    # ... and install it where apt looks for additional sources.
    !sudo cp /tmp/mssql-release.list /etc/apt/sources.list.d/
    !sudo apt-get update -y
    # ACCEPT_EULA=Y answers the driver's licence prompt non-interactively.
    !sudo ACCEPT_EULA=Y apt-get install -y msodbcsql17
    !sudo apt-get install unixodbc-dev -y

In [None]:
import sys
# Install the Python dependencies into the user site of the interpreter that
# runs this kernel (sys.executable), discarding pip's standard output.
!{sys.executable} -m pip install pymysql Flask-SQLAlchemy boto3 --user > /dev/null
# Provider-specific database driver.
if DB_PROVIDER == "mssql":
    !{sys.executable} -m pip install pyodbc --user
elif DB_PROVIDER == "mysql":
    # NOTE(review): pymysql is already part of the install line above, so
    # this branch repeats that install.
    !{sys.executable} -m pip install pymysql --user

In [None]:
# Packages installed above with ``pip install --user`` land in the per-user
# site-packages directory; add it to sys.path so they import without
# restarting the kernel.
if HOME:
    # The directory name carries the running interpreter's major.minor
    # version, so derive it instead of hard-coding "python3.6".
    USR_LOCAL_LIB_PATH = os.path.join(
        HOME,
        ".local",
        "lib",
        f"python{sys.version_info.major}.{sys.version_info.minor}",
        "site-packages",
    )
    if USR_LOCAL_LIB_PATH not in sys.path:
        sys.path.append(USR_LOCAL_LIB_PATH)

In [None]:
import io
import re
import os
import time
import uuid
import random
import boto3
import joblib

from dkube.sdk.api import DkubeApi

## Dependencies for data generator 
import numpy as np
import pandas as pd
from configparser import ConfigParser
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

from sklearn import preprocessing as skpreprocessing
import datetime
from tqdm import tqdm_notebook as tqdm

from enum import Enum
from collections import namedtuple
from joblib import load
from urllib.parse import quote_plus
from sqlalchemy import create_engine

In [None]:
class DBCONFIG:
    """Connection settings for the SQL store.

    ``str(config)`` renders the settings as a SQLAlchemy connection URL
    (``mysql+pymysql://...`` or ``mssql+pyodbc:///?odbc_connect=...``).

    Args:
        hostname: Server address in ``host:port`` form.
        databasename: Name of the database to connect to.
        username: Login user.
        password: Login password.
        provider: ``"mysql"`` (default) or ``"mssql"``.

    Raises:
        ValueError: If ``provider`` is not one of the supported providers.
    """

    def __init__(self, hostname, databasename, username, password, provider="mysql"):
        self.hostname = hostname
        self.databasename = databasename
        self.username = username
        self.password = password
        if provider not in ["mysql", "mssql"]:
            raise ValueError(f"{provider} not a supported provider")
        self.provider = provider

    def __str__(self):
        """Return the connection URL, or ``""`` when one cannot be built.

        An empty string is returned when ``hostname`` is not exactly
        ``host:port`` or when the provider is unknown.
        """
        # Imported locally so the class does not depend on file-level imports.
        from urllib.parse import quote, quote_plus

        host_split = self.hostname.split(":")
        if len(host_split) != 2:
            return ""
        host, port = host_split

        if self.provider == "mysql":
            # Credentials are percent-encoded: a raw "@", ":" or "/" in the
            # user name or password would otherwise be parsed as URL syntax.
            username = quote(str(self.username), safe="")
            password = quote(str(self.password), safe="")
            return f"mysql+pymysql://{username}:{password}@{host}:{port}/{self.databasename}"

        if self.provider == "mssql":
            # The ODBC driver expects "host,port" and the whole connection
            # string is passed URL-encoded through ``odbc_connect``.
            params = quote_plus(
                "DRIVER={ODBC Driver 17 for SQL Server};"
                f"SERVER={host},{port};"
                f"DATABASE={self.databasename};"
                f"UID={self.username};"
                f"PWD={self.password}"
            )
            return f"mssql+pyodbc:///?odbc_connect={params}"

        return ""

    def __repr__(self):
        # NOTE(review): same text as __str__, so the password is included.
        return self.__str__()

class DataSource(Enum):
    """Destinations a generated dataset can be written to.

    InsuranceDataGenerator.save_dataset dispatches on these members.
    """
    # Written as a CSV file on the local filesystem.
    LOCAL = "local"
    # Uploaded as a CSV object to an S3 bucket.
    AWS_S3 = "aws_s3"
    # Appended to a SQL table.
    SQL = "sql"
    

# Per-dataset push settings: owning monitor, SQL table, frequency unit,
# data class (directory component for S3) and whether S3 keys are prefixed
# with a timestamp directory.
DatasetSource = namedtuple(
    'DatasetSource',
    ['model_monitor', 'table', 'frequency_unit', 'data_class', 'add_prefix_ts'],
)

In [None]:
# Push schedule: RUN_FREQUENCY is expressed in minutes.
FREQUENCY = f"{RUN_FREQUENCY}m"

MODEL_FREQUENCY = RUN_FREQUENCY

# Destination table when the data source is SQL.
LABELLED_DATASET_TABLE = "insurance_precomputed"

# Directory component used when the data source is S3.
LABELLED_DATA_CLASS = "groundtruth"

PREFIX_LABELLED_DATASET_WITH_TS = False

# The data source defaults to local; DataSource.AWS_S3 and DataSource.SQL are
# the other supported choices.  Binding the default first guarantees
# DATASET_SOURCE exists even when the configured value is unrecognised.
DATASET_SOURCE = DataSource.LOCAL
# Both spellings are accepted: this notebook compares against 'aws-s3' here
# and against 'aws_s3' elsewhere.
if PRECOMPUTED_DATA_SOURCE in ('aws-s3', 'aws_s3'):
    DATASET_SOURCE = DataSource.AWS_S3
elif PRECOMPUTED_DATA_SOURCE == 'sql':
    DATASET_SOURCE = DataSource.SQL
    # Re-read the connection settings exported to the environment earlier.
    DBHOSTNAME = os.getenv("DBHOSTNAME")
    DATABASE_NAME = os.getenv("DATABASENAME")
    DBUSERNAME = os.getenv("DBUSERNAME")
    PASSWORD = os.getenv("DBPASSWORD")

In [None]:
class InsuranceDataGenerator:
    """Push generated datasets to local disk, S3 or a SQL table.

    An instance holds a push schedule (``frequency``, ``margin``) and a
    destination (``dataset_source``).  The S3 client, bucket name, DKube API
    client, DKube credentials and SQL engine are stored on the class the
    first time they are needed, so all instances share them.
    """

    # With no parameters or configuration, boto3 will look for
    # access keys in these places:
    # 1. Environment variables (AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY)
    # 2. Credentials file (~/.aws/credentials or
    #      C:\Users\USER_NAME\.aws\credentials)
    # 3. AWS IAM role for Amazon EC2 instance
    #    (http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/iam-roles-for-amazon-ec2.html)

    #    Define a ~/.aws/credentials file as following
    #    [default]
    #    aws_access_key_id=foo
    #    aws_secret_access_key=bar
    #    aws_session_token=baz # might not be required
    BUCKET = None
    S3_CLIENT = None
    DB_ENGINE = None
    API_CLIENT = None
    TOKEN = None
    USERNAME = None

    def __init__(
        self,
        start_time: datetime.datetime = None,
        frequency="1H",
        model_frequency=10,
        duration: str = "10:24:12",
        margin=20,
        db_config: DBCONFIG = None,
        dataset_source: DataSource = DataSource.AWS_S3,
    ):
        """Configure the schedule and lazily create the shared clients.

        Args:
            start_time: Start of the generation window; defaults to the
                current UTC time.
            frequency: ``<int><unit>`` where unit is ``h`` or ``m`` (any case).
            model_frequency: Stored on the instance as ``model_frequency``.
            duration: ``H[:M[:S]]`` length of the window, used by ``end``.
            margin: Seconds subtracted from each schedule boundary.
            db_config: Connection settings used to build the SQL engine.
            dataset_source: Destination used by ``save_dataset``.

        Raises:
            ValueError: If ``frequency`` does not match the expected format.
        """
        if not re.fullmatch(r"^\d+[hmHM]{1}$", frequency):
            raise ValueError("frequency can have [time_value_int][time_unit] time_unit can be case case insensitive out of H, M")
        self.frequency = frequency
        self.margin = margin
        # Previously assigned MODEL_FREQUENCY, which is already stored as
        # ``model_frequency`` below; the attribute is meant to hold the name.
        self.monitor_name = MONITOR_NAME
        self.dataset_source = dataset_source
        self.start_time = start_time if start_time else datetime.datetime.utcnow()
        self.model_frequency = model_frequency
        self.db_config = db_config
        self.duration = duration

        klass = type(self)
        if not klass.BUCKET:
            klass.BUCKET = os.getenv("AWS_BUCKET")
        if not klass.S3_CLIENT:
            klass.S3_CLIENT = boto3.client("s3")
        if not klass.TOKEN:
            klass.TOKEN = os.getenv("DKUBE_USER_ACCESS_TOKEN", TOKEN)
        if not klass.USERNAME:
            klass.USERNAME = DKUBEUSERNAME
        if not klass.API_CLIENT:
            klass.API_CLIENT = DkubeApi(URL=os.getenv('DKUBE_URL', DKUBE_URL), token=klass.TOKEN)
        # str(None) is the truthy string "None", so test for a missing
        # config explicitly before asking it for a URL.
        if not klass.DB_ENGINE and self.db_config is not None:
            db_url = str(self.db_config)
            if db_url:
                klass.DB_ENGINE = create_engine(db_url)

    @classmethod
    def save_dataset_to_s3(cls, data, monitor_name, name, typeofdata, prefix_dir_with_ts = True, frequency_unit="H",current_date=None):
        """Upload ``data`` as ``<name>.csv`` to the class bucket.

        The object key is ``monitor_name/typeofdata[/YYYY/MM/DD/HH[/MM]]/<name>.csv``;
        the timestamp directories are added when ``prefix_dir_with_ts`` is
        true and the minute directory only when ``frequency_unit`` is ``m``.

        Returns:
            The object key on an HTTP 200 response, otherwise ``None``.
        """
        file_name = name + ".csv"
        # Was ``print(filename)``, which only worked when a notebook-level
        # global of that name happened to exist.
        print(file_name)
        if not current_date:
            current_date = datetime.datetime.now()
        data_dir = os.path.join(monitor_name, typeofdata)
        if prefix_dir_with_ts:
            data_dir = os.path.join(
                data_dir,
                current_date.strftime("%Y"),
                current_date.strftime("%m"),
                current_date.strftime("%d"),
                current_date.strftime("%H"),
            )
            if frequency_unit.lower() == "m":
                data_dir = os.path.join(data_dir, current_date.strftime("%M"))
        file_path = os.path.join(data_dir, file_name)
        with io.StringIO() as csv_buffer:
            data.to_csv(csv_buffer, index=False)

            response = cls.S3_CLIENT.put_object(
                Bucket=cls.BUCKET, Key=file_path, Body=csv_buffer.getvalue()
            )
        status = response.get("ResponseMetadata", {}).get("HTTPStatusCode")

        if status == 200:
            print(f"Successful S3 put_object response. Status - {status}")
            return file_path
        print(f"Unsuccessful S3 put_object response. Status - {status}")
        return None

    @classmethod
    def save_dataset_to_local(cls,data, name, monitor_name,typeofdata,frequency_unit="H", current_date=None):
        """Write ``data`` as ``<name>.csv`` under the DKube dataset directory.

        The target directory is
        ``$HOME/dataset/<PRESCORE_DATASET>/<uuid>/data`` where ``uuid`` is
        taken from the first entry returned by the DKube API for that
        dataset.  ``monitor_name``, ``typeofdata``, ``frequency_unit`` and
        ``current_date`` are accepted for signature parity with the S3
        variant and are not used.

        Returns:
            The path of the written file.
        """
        file_name = name + ".csv"
        versions = cls.API_CLIENT.get_dataset_versions(cls.USERNAME, PRESCORE_DATASET)
        data_dir = os.path.join(
            HOME, "dataset", PRESCORE_DATASET, versions[0]['version']['uuid'], "data"
        )
        print(data_dir)

        os.makedirs(data_dir, exist_ok=True)
        file_path = os.path.join(data_dir, file_name)
        data.to_csv(file_path, index=False)

        return file_path

    @classmethod
    def save_dataset_to_sql(cls, data, tablename):
        """Append ``data`` to ``tablename`` using the shared SQL engine."""
        data.to_sql(tablename, cls.DB_ENGINE, if_exists="append", index=False)

    def save_dataset(self ,data, data_name:str, config: DatasetSource, current_date=None):
        """Save ``data`` to the destination selected by ``dataset_source``.

        Returns:
            The S3 key or local file path for those destinations; ``None``
            for SQL (and for a failed S3 upload).
        """
        klass = type(self)
        if self.dataset_source == DataSource.AWS_S3:
            return klass.save_dataset_to_s3(data, config.model_monitor, data_name, config.data_class, config.add_prefix_ts, config.frequency_unit, current_date)
        elif self.dataset_source == DataSource.SQL:
            klass.save_dataset_to_sql(data, config.table)
        elif self.dataset_source == DataSource.LOCAL:
            return klass.save_dataset_to_local(data, data_name, config.model_monitor, config.data_class, config.frequency_unit,current_date)
        return None

    @property
    def frequency_ts(self):
        """Seconds to wait before the next push; also prints that delay.

        The target is the next schedule boundary (top of the hour for ``h``,
        next multiple of the interval within the hour for ``m``) minus
        ``margin`` seconds.  When that target is not in the future, or is
        further away than one full interval, one full interval is returned.
        Returns ``None`` for a unit other than ``h`` or ``m``.
        """
        value = int(self.frequency[:-1])
        unit = self.frequency[-1].lower()
        seconds_per_unit = {"s": 1, "m": 60, "h": 3600, "d": 86400, "w": 604800}
        seconds_count = value * seconds_per_unit[unit]
        now = datetime.datetime.utcnow()

        if unit == "h":
            boundary = (now + datetime.timedelta(hours=value)).replace(
                minute=0, second=0, microsecond=0
            )
        elif unit == "m":
            # Minutes until the next multiple of ``value``; when already on
            # a multiple, aim for the following one.
            minutes_ahead = abs(now.minute % -value) or value
            boundary = (now + datetime.timedelta(minutes=minutes_ahead)).replace(
                second=0, microsecond=0
            )
        else:
            return None

        target = boundary - datetime.timedelta(seconds=self.margin)
        # total_seconds() keeps the sign and the day component; the
        # ``.seconds`` attribute would turn a slightly negative delta into
        # almost 24 hours.
        second_remaining = int((target - now).total_seconds())
        if second_remaining <= 0 or second_remaining > seconds_count:
            result = seconds_count
        else:
            result = second_remaining
        print(f"Next Push after {datetime.timedelta(seconds=result)}")
        return result

    @property
    def awsS3Secret(self):
        """AWS credentials as ``{"access_key": ..., "secret_key": ...}``.

        When the configured data source is S3 the environment variables
        (falling back to the notebook configuration) are used.  Otherwise,
        or when either value is empty, the ``[default]`` section of
        ``~/.aws/credentials`` is consulted.  Returns ``None`` when no
        complete key pair is found.
        """
        AWS_ACCESS_KEY = None
        AWS_SECRET_KEY = None
        # Both spellings are accepted because the notebook uses each of them.
        if PRECOMPUTED_DATA_SOURCE in ('aws_s3', 'aws-s3'):
            AWS_ACCESS_KEY = os.getenv("AWS_ACCESS_KEY_ID", ACCESS_KEY)
            AWS_SECRET_KEY = os.getenv("AWS_SECRET_ACCESS_KEY", SECRET_KEY)
            # The access key is deliberately not printed: notebook output is
            # saved with the notebook.
        if AWS_ACCESS_KEY and AWS_SECRET_KEY:
            return {"access_key": AWS_ACCESS_KEY, "secret_key": AWS_SECRET_KEY}

        home_dir = os.getenv("HOME")
        if not home_dir:
            return None
        creds_path = os.path.join(home_dir, ".aws", "credentials")
        if not os.path.isfile(creds_path):
            return None
        config = ConfigParser()
        config.read(creds_path)
        if "default" not in config:
            return None
        AWS_ACCESS_KEY = config["default"].get("aws_access_key_id")
        AWS_SECRET_KEY = config["default"].get("aws_secret_access_key")
        if AWS_ACCESS_KEY and AWS_SECRET_KEY:
            return {"access_key": AWS_ACCESS_KEY, "secret_key": AWS_SECRET_KEY}
        return None

    @property
    def end(self):
        """``start_time`` plus ``duration`` (``H[:M[:S]]``, missing parts are 0)."""
        hours, minutes, seconds = (self.duration.split(":") + ["0", "0"])[:3]
        return self.start_time + datetime.timedelta(
            hours=int(hours), minutes=int(minutes), seconds=int(seconds)
        )

In [None]:
# Seed the class-level DKube settings before any generator is constructed.
# InsuranceDataGenerator.__init__ only fills TOKEN and API_CLIENT when they
# are still unset, so the values assigned here are the ones instances use.
InsuranceDataGenerator.URL = DKUBE_URL
InsuranceDataGenerator.TOKEN = TOKEN
InsuranceDataGenerator.API_CLIENT = DkubeApi(URL=DKUBE_URL, token=TOKEN)

In [None]:
# Build the generator for the configured schedule and destination.
# MONITOR_NAME used to be passed as the first positional argument, but that
# slot is ``start_time``: the monitor name ended up stored as the start time
# and ``generator.end`` could not be computed.  Leaving it out lets
# ``start_time`` default to the current UTC time; the monitor name travels in
# ``ground_dataset_source`` below.
generator = InsuranceDataGenerator(
    frequency=FREQUENCY,
    model_frequency=MODEL_FREQUENCY,
    db_config=DBCONFIG(
        hostname=DBHOSTNAME,
        databasename=DATABASENAME,
        username=DBUSERNAME,
        password=DBPASSWORD,
        provider=DB_PROVIDER,
    ),
    dataset_source=DATASET_SOURCE,
)

# Where and how the generated score files are stored for this monitor.
ground_dataset_source = DatasetSource(
    model_monitor=MONITOR_NAME,
    table=LABELLED_DATASET_TABLE,
    data_class=LABELLED_DATA_CLASS,
    frequency_unit=generator.frequency[-1],
    add_prefix_ts=PREFIX_LABELLED_DATASET_WITH_TS,
)

In [None]:
def generate_precomputed_scores(no_of_scores=3):
    """Build a DataFrame of random, plausible-looking model scores.

    Args:
        no_of_scores: Number of rows to generate.

    Returns:
        A DataFrame with ``no_of_scores`` rows and the columns ``timestamp``
        (evenly spaced over the 10 seconds following the current UTC time),
        ``accuracy``, ``precision``, ``recall`` and ``roc_auc_score``
        (uniform in [0.8, 0.95)) and ``samples`` (integers in [80, 100)).
    """
    window_start = datetime.datetime.utcnow()
    window_end = window_start + datetime.timedelta(seconds=10)

    columns = {
        "timestamp": pd.date_range(window_start, window_end, periods=no_of_scores)
    }
    # Drawn in this fixed order so the random stream is consumed the same
    # way on every call.
    for metric_name in ("accuracy", "precision", "recall", "roc_auc_score"):
        columns[metric_name] = np.random.uniform(low=0.8, high=0.95, size=no_of_scores)
    columns["samples"] = np.random.randint(low=80, high=100, size=no_of_scores)

    return pd.DataFrame(columns)

In [None]:
def ordinal(n):
    """Return ``n`` with its English ordinal suffix, e.g. ``1st``, ``12th``, ``23rd``."""
    # "tsnrhtdd" interleaves the suffixes th/st/nd/rd: reading every 4th
    # character from offset 0, 1, 2 or 3 yields "th", "st", "nd" or "rd".
    outside_teens = n // 10 % 10 != 1   # 11, 12, 13 (and 111, ...) take "th"
    low_last_digit = n % 10 < 4         # only last digits 1-3 get a special suffix
    offset = outside_teens * low_last_digit * n % 10
    suffix = "tsnrhtdd"[offset::4]
    return "%d%s" % (n, suffix)
# Push DATASET_SAMPLES score files, recording the location of each one that
# the storage backend reports back.
push_count = 1
groundtruth_path, predict_path, drift_path = [], [], []

for i in range(DATASET_SAMPLES):
    # Reading the property prints the delay until the next scheduled push;
    # the value is kept but no wait is performed between pushes.
    second_remaining = generator.frequency_ts

    print("Generating precomputed scores")
    filename = f"precomputed_score_{i+1}"
    precomputed_score = generate_precomputed_scores()

    g_path = generator.save_dataset(precomputed_score, filename, ground_dataset_source)
    if g_path:
        groundtruth_path += [g_path]

    print(f"Pushed data for {ordinal(push_count)} time, Remaining pushes: {DATASET_SAMPLES-push_count}, Monitor name: {MONITOR_NAME}")
    push_count += 1

print("***************** DATA GENERATION COMPLETED ******************************")