In [1]:
# Install datacompy with the %pip magic so the package lands in the environment of the
# running kernel (a `!pip` shell escape may resolve to a different interpreter).
# TODO: pin a datacompy version for reproducible runs.
%pip install datacompy



In [2]:
# nuclio: ignore
import nuclio
import v3io_frames as v3f
import os
import v3io.dataplane
import json
import mlrun.feature_store as fs
from mlrun.feature_store.steps import *
import pandas as pd
import mlrun
import datacompy

In [3]:
# Register the V3IO access key, user name and API endpoint with the nuclio env magic,
# taking each value from the variable of the same name.
# NOTE(review): `-c` presumably limits the setting to the function configuration rather
# than the local kernel — confirm against the nuclio-jupyter documentation.
%nuclio env -c V3IO_ACCESS_KEY=${V3IO_ACCESS_KEY}
%nuclio env -c V3IO_USERNAME=${V3IO_USERNAME}
%nuclio env -c V3IO_API=${V3IO_API}

In [4]:
# nuclio: start-code

In [5]:
def init_context(context):
    """Attach configuration values and v3io clients to the nuclio ``context``.

    Settings are read from environment variables. Several defaults are derived
    from ``V3IO_USERNAME``; they are resolved lazily, so an explicit override
    works even when ``V3IO_USERNAME`` is not set.

    Attributes set on ``context``:
        PROJECT_NAME      ``PROJECT_NAME`` or ``stocks-<user>``; also passed to
                          ``mlrun.set_environment``.
        V3IO_ACCESS_KEY   ``V3IO_ACCESS_KEY`` or ``None``.
        V3IO_FRAMESD      ``V3IO_FRAMESD`` or ``framesd:8081``.
        V3IO_API          ``V3IO_API`` or ``None``.
        stocks_kv         ``STOCKS_KV`` or ``<user>/stocks/stocks_kv``.
        stocks_stream     ``STOCKS_STREAM`` or ``<user>/stocks/stocks_stream``.
        stocks_tsdb       ``STOCKS_TSDB_TABLE`` or ``<user>/stocks/stocks_tsdb``.
        container         ``V3IO_CONTAINER`` or ``users``.
        limit             ``LIMIT`` converted to ``int`` (default 50).
        sym_to_url        dict keyed by ticker symbol; ``sym_to_url`` may override
                          the built-in mapping with a JSON object.
        stocks_vec        name of the feature vector (``stocks-vec``).
        v3io_client       v3io frames client.
        dataplane_client  v3io dataplane client.
        next_location     location returned by seeking shard 0 of the stream
                          with ``seek_type='EARLIEST'``.

    Raises:
        ValueError: if a user-derived default is required while
            ``V3IO_USERNAME`` is unset, if ``LIMIT`` is not an integer, or if
            ``sym_to_url`` is set but is not a JSON object.
    """
    context.logger.info("Initializing Data-checkup Context")

    username = os.getenv('V3IO_USERNAME')

    def _env_or_user_default(env_name, prefix='', suffix=''):
        # os.getenv evaluates its default eagerly, which used to fail with an opaque
        # TypeError ('str' + None) even when the override was supplied. Resolve the
        # user-derived default only when it is really needed.
        value = os.getenv(env_name)
        if value is not None:
            return value
        if username is None:
            raise ValueError(f"{env_name} is not set and its default cannot be derived "
                             f"because V3IO_USERNAME is not set")
        return f'{prefix}{username}{suffix}'

    context.PROJECT_NAME = _env_or_user_default('PROJECT_NAME', prefix='stocks-')
    mlrun.set_environment(project=context.PROJECT_NAME)
    context.V3IO_ACCESS_KEY = os.getenv("V3IO_ACCESS_KEY", None)
    context.V3IO_FRAMESD = os.getenv("V3IO_FRAMESD", 'framesd:8081')
    context.V3IO_API = os.getenv("V3IO_API", None)
    context.stocks_kv = _env_or_user_default('STOCKS_KV', suffix='/stocks/stocks_kv')
    context.stocks_stream = _env_or_user_default('STOCKS_STREAM', suffix='/stocks/stocks_stream')
    context.stocks_tsdb = _env_or_user_default('STOCKS_TSDB_TABLE', suffix='/stocks/stocks_tsdb')
    context.container = os.getenv('V3IO_CONTAINER', 'users')
    # Environment values are always strings; the record limit must be an integer.
    context.limit = int(os.getenv('LIMIT', 50))

    sym_to_url = {'GOOGL': 'google-inc', 'MSFT': 'microsoft-corp', 'AMZN': 'amazon-com-inc',
                  'AAPL': 'apple-computer-inc', 'INTC': 'intel-corp'}
    # An override arrives as text, so decode it; storing the raw string would break
    # every consumer that iterates the mapping.
    # NOTE(review): JSON is assumed as the override format — confirm with deployers.
    sym_to_url_override = os.getenv('sym_to_url')
    if sym_to_url_override:
        sym_to_url = json.loads(sym_to_url_override)
        if not isinstance(sym_to_url, dict):
            raise ValueError("sym_to_url must be a JSON object keyed by ticker symbol")
    context.sym_to_url = sym_to_url

    # Setting up feature vector
    context.stocks_vec = "stocks-vec"

    # Setting up v3io frames client (same container value as context.container)
    context.v3io_client = v3f.Client(context.V3IO_FRAMESD, container=context.container,
                                     token=context.V3IO_ACCESS_KEY)

    # Setting up stream (dataplane) client
    context.dataplane_client = v3io.dataplane.Client(endpoint=context.V3IO_API,
                                                     access_key=context.V3IO_ACCESS_KEY)

    # Seek shard 0 of the stream to its earliest position and keep the returned
    # location for later reads.
    resp = context.dataplane_client.seek_shard(container=context.container,
                                               path=f'{context.stocks_stream}/0',
                                               seek_type='EARLIEST')
    context.next_location = resp.output.location

In [19]:
def check_difference(context,df1,df2):
    """Log the rows that datacompy reports as present only in ``df1``.

    The two frames are compared with ``datacompy.Compare`` joined on the
    ``symbol`` column. When the comparison's ``df1_unq_rows`` is non-empty,
    two info messages are written to ``context.logger``. Nothing is returned.
    """
    comparison = datacompy.Compare(df1, df2, join_columns='symbol')
    only_in_first = comparison.df1_unq_rows

    # Nothing to report when every df1 row found a partner.
    if only_in_first.shape[0] <= 0:
        return

    context.logger.info('Feature vector is not updated with the latest data')
    context.logger.info(f'unmatched row : {only_in_first}')

In [20]:
def handler(context):
    """Compare the online feature vector against the stocks KV table.

    Steps:
      1. Read the KV table and turn its index into a regular column.
      2. Read records from shard 0 of the stocks stream and compute the latest
         ``time`` per ``symbol``.
      3. Query the online feature service once per symbol in
         ``context.sym_to_url`` and build a DataFrame from the results.
      4. Restrict the feature-vector frame to the KV columns and hand both
         frames to ``check_difference`` (called exactly once).

    Returns:
        Whatever ``check_difference`` returns.

    Raises:
        KeyError: if the feature-vector frame lacks a column present in the KV frame.
    """
    # read from KV; reset_index keeps the former index as a column
    kv_df = context.v3io_client.read("kv", table=context.stocks_kv).reset_index()

    # NOTE(review): the stream timestamps are computed but not yet consulted by the
    # comparison below; the read is kept so the stream is still exercised.
    most_recent = _latest_stream_timestamps(context)

    # reading from feature vector
    stock_list = []
    service = fs.get_online_feature_service(context.stocks_vec)
    try:
        for symbol in context.sym_to_url.keys():
            data = service.get([{"symbol": symbol}])[0]
            data["symbol"] = symbol
            stock_list.append(data)
    finally:
        # A service is created on every invocation, so release it every time.
        service.close()
    vector_df = pd.DataFrame(stock_list)

    vec_x_kv = vector_df[list(kv_df.columns)]
    # Compare once: a second call would repeat the comparison and duplicate the log lines.
    return check_difference(context, vec_x_kv, kv_df)


def _latest_stream_timestamps(context):
    """Return the latest ``time`` per ``symbol`` among records read from stream shard 0.

    An empty DataFrame is returned when no records come back or when the decoded
    records carry no ``symbol``/``time`` fields, instead of raising ``KeyError``.
    """
    resp = context.dataplane_client.get_records(container=context.container,
                                                path=f'{context.stocks_stream}/0',
                                                location=context.next_location,
                                                # may arrive as text from the environment
                                                limit=int(context.limit))

    # each record payload is UTF-8 encoded JSON
    records = [json.loads(rec.data.decode('utf-8')) for rec in resp.output.records]
    stream_df = pd.DataFrame(records)
    if stream_df.empty or not {"symbol", "time"}.issubset(stream_df.columns):
        return pd.DataFrame()

    return stream_df.groupby("symbol")["time"].max().to_frame()

In [21]:
# nuclio: end-code

In [22]:
# Local smoke test: run the init hook and then the handler once inside the notebook.
# NOTE(review): `context` is not defined in any visible cell — presumably it is provided
# by the nuclio import; confirm before relying on a fresh-kernel run.
init_context(context)
# `s` holds the handler's return value, i.e. whatever check_difference() returned.
s = handler(context)

Python> 2021-04-06 08:06:16,897 [info] Initializing Data-checkup Context
(0, 6)
(0, 6)


In [None]:
# Placeholder shell call: the braces are empty, so no URL is supplied yet.
# TODO: presumably meant to invoke the deployed function's HTTP endpoint — fill in the address.
!curl {}