Downloads information from the available resources and trains an RNN model

In [1]:
import nuclio 
import mlrun
import os

In [None]:
# Pass the V3IO access key, user name and API endpoint on to the function's environment.
# NOTE(review): `-c` presumably applies the setting to the function configuration only
# (not to the local kernel) — confirm against the nuclio-jupyter documentation.
%nuclio env -c V3IO_ACCESS_KEY=${V3IO_ACCESS_KEY}
%nuclio env -c V3IO_USERNAME=${V3IO_USERNAME}
%nuclio env -c V3IO_API=${V3IO_API}

In [None]:
%%nuclio config 
kind = "nuclio"
spec.build.baseImage = "mlrun/ml-models"

In [None]:
%%nuclio cmd -c
pip install keras

In [None]:
# nuclio: start-code

In [5]:
import mlrun.feature_store as fs
import mlrun
import datetime
import pandas as pd
from mlrun import MLClientCtx
import os
import numpy as np
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM,Dropout
import keras
import nuclio

In [98]:
def get_data_from_vector(context,ticker):
    """Fetch the feature rows of one ticker that were not trained on yet.

    Queries the offline features of ``context.vector`` from the ticker's last
    recorded training time (30 days back when none is recorded) up to now,
    keeps only the rows of ``ticker`` and records a new last-trained time for it.

    Args:
        context: object exposing ``vector`` (the feature vector to query) and
            ``last_trained_times`` (dict mapping ticker symbol -> datetime).
        ticker: ticker symbol, compared against the ``ticker`` column.

    Returns:
        The rows of the fetched DataFrame whose ``ticker`` column equals ``ticker``.
    """
    # Sample the clock once so the query window and the bookkeeping agree.
    now = datetime.datetime.now()

    window_start = context.last_trained_times.get(ticker)
    if not window_start:
        window_start = now - datetime.timedelta(30)

    # change to read only data from the correct time and correct ticker
    data = fs.get_offline_features(context.vector, entity_timestamp_column="Datetime",
                                   start_time=window_start,
                                   end_time=now).to_dataframe()

    # after changing to correct reading, remove all this section
    data = data[data["ticker"] == ticker]

    # Store under the ticker symbol itself: the lookup above uses `ticker` as the key,
    # so a literal "ticker" key would never be read back and every call would fall
    # back to the 30-day window. The next window starts one day before now.
    context.last_trained_times[ticker] = now - datetime.timedelta(1)
    return data

In [99]:
def get_model(X_train):
    """Build and compile the stacked-LSTM regressor.

    The network is four LSTM layers of 50 units, each followed by a 0.2
    dropout, and a single-unit dense output. The input shape of the first
    layer is taken from axes 1 and 2 of ``X_train``.

    Args:
        X_train: array-like whose ``shape`` has at least three entries.

    Returns:
        The compiled (untrained) ``Sequential`` model.
    """
    window_shape = (X_train.shape[1], X_train.shape[2])

    network = Sequential()

    # First recurrent layer declares the input shape.
    network.add(LSTM(units=50, return_sequences=True, input_shape=window_shape))
    network.add(Dropout(0.2))

    # Two intermediate recurrent layers that keep emitting full sequences.
    for _ in range(2):
        network.add(LSTM(units=50, return_sequences=True))
        network.add(Dropout(0.2))

    # Last recurrent layer emits only its final state, feeding the output unit.
    network.add(LSTM(units=50))
    network.add(Dropout(0.2))
    network.add(Dense(units=1))

    # Compiling the model
    network.compile(optimizer='adam', loss='mean_squared_error')
    return network

In [100]:
def modify_data(context,ticker_data,size_of_stamps=10):
    """Turn raw feature rows into sliding-window training arrays.

    Steps:
      1. Encode the ``ticker`` column as an integer (unknown symbols become NaN).
      2. Rescale every price, volume and sentiment column with the matching
         scaler held on ``context``. Each column is fitted separately, so after
         the call a scaler holds the fit of the last column it was applied to.
      3. Build windows of ``size_of_stamps`` consecutive rows (in the given row
         order, regardless of ticker); the target of a window is the scaled
         ``Close`` value of the row right after it.

    The caller's DataFrame is left untouched.

    Args:
        context: object exposing ``priceMMS``, ``volumeMMS`` and ``sentimentMMS``,
            each providing ``fit_transform``.
        ticker_data: DataFrame with at least the ``ticker`` and ``Close`` columns.
        size_of_stamps: number of rows per window (default 10).

    Returns:
        ``(X_train, y_train)`` as float32 arrays of shape
        ``(n_windows, size_of_stamps, n_columns)`` and ``(n_windows,)``.
        NaNs inside the windows are replaced by 0; windows whose target is NaN
        are dropped.

    Raises:
        ValueError: if ``size_of_stamps`` is smaller than 1.
    """
    if size_of_stamps < 1:
        raise ValueError("size_of_stamps must be a positive integer")

    ticker_to_int = {'GOOGL' : 0,'MSFT' : 1,'AMZN' : 2,'AAPL' : 3,'INTC' : 4}

    # Work on a copy: encoding in place would corrupt the caller's frame and make a
    # second call on the same frame map the already-encoded tickers to missing values.
    ticker_data = ticker_data.copy()
    ticker_data["ticker"] = ticker_data["ticker"].map(ticker_to_int)

    # Same order as the scaling passes always had: prices, then volumes, then sentiment.
    for col in [x for x in ticker_data.columns if "Open" in x or "Close" in x or "High" in x or "Low" in x]:
        ticker_data[[col]] = context.priceMMS.fit_transform(ticker_data[[col]])
    for col in [x for x in ticker_data.columns if "Volume" in x]:
        ticker_data[[col]] = context.volumeMMS.fit_transform(ticker_data[[col]])
    for col in [x for x in ticker_data.columns if "Sentiment" in x]:
        ticker_data[[col]] = context.sentimentMMS.fit_transform(ticker_data[[col]])

    data = ticker_data.values
    closing = ticker_data["Close"].values
    window_ends = range(size_of_stamps, data.shape[0])

    X_train = [data[i-size_of_stamps:i] for i in window_ends]
    y_train = [closing[i] for i in window_ends]

    # The reshape keeps a 3-D result even when there are too few rows for one window.
    X_train = np.asarray(X_train).astype('float32').reshape(-1, size_of_stamps, data.shape[1])
    y_train = np.asarray(y_train).astype('float32')

    # A NaN target would turn the training loss into NaN, so such windows are dropped.
    has_target = ~np.isnan(y_train)
    X_train = X_train[has_target]
    y_train = y_train[has_target]

    # Missing feature values inside a window are treated as 0.
    X_train[np.isnan(X_train)] = 0
    return X_train,y_train

In [152]:
def handler(context,event):
    """Train (or continue training) the model on the newest data of every ticker.

    Collects the rows of each ticker in ``context.sym_to_url``, converts them to
    training windows, loads the model stored at the ``model_path`` environment
    variable (default ``mymodel.h5``) or builds a new one, fits it for two
    epochs and saves it back to the same path.

    Args:
        context: object prepared by ``init_context`` (logger, ``sym_to_url``,
            feature vector, scalers, last-trained times).
        event: triggering event; not read by this handler.

    Returns:
        None. When no training window is available, training is skipped and the
        stored model is left as is.
    """
    model_path = os.getenv('model_path', 'mymodel.h5')

    # Collect the per-ticker frames first and concatenate once: DataFrame.append was
    # removed in pandas 2.0, and growing a frame inside a loop copies it every time.
    frames = [get_data_from_vector(context,ticker) for ticker in context.sym_to_url.keys()]
    all_data = pd.concat(frames) if frames else pd.DataFrame()

    X_train,y_train = modify_data(context,all_data)
    context.logger.info("Finished modifieing data")

    # Without at least one window neither the model builder nor fit() can work.
    if len(X_train) == 0:
        context.logger.info("No training samples available, skipping training")
        return

    if os.path.exists(model_path):
        context.logger.info("Previously trained model loaded")
        model = keras.models.load_model(model_path)
    else:
        context.logger.info("New model created")
        model = get_model(X_train)

    model.fit(X_train, y_train, epochs = 2, batch_size = 32)
    context.logger.info("Done training")
    model.save(model_path)
    context.logger.info(f"model saved at {model_path}")

In [178]:
def init_context(context):
    """Prepare the shared state the handler relies on.

    Attaches to ``context``:
      * ``PROJECT_NAME`` - taken from the ``PROJECT_NAME`` environment variable,
        otherwise derived as ``stocks-<V3IO_USERNAME>2``; it is also set as the
        MLRun environment project.
      * ``last_trained_times`` - empty dict of ticker -> last training time.
      * ``sym_to_url`` - ticker symbol -> URL slug mapping.
      * ``vector`` - the saved ``stocks-vec`` feature vector.
      * ``priceMMS`` / ``volumeMMS`` / ``sentimentMMS`` - separate (0, 1) min-max scalers.

    Args:
        context: function context; must expose ``logger.info`` and accept new attributes.

    Raises:
        RuntimeError: if neither ``PROJECT_NAME`` nor ``V3IO_USERNAME`` is set.
    """
    context.logger.info("Initalizing context")

    # Resolve the fallback lazily: building it eagerly concatenates None (TypeError)
    # whenever V3IO_USERNAME is unset, even if PROJECT_NAME is provided.
    project_name = os.getenv('PROJECT_NAME')
    if project_name is None:
        username = os.getenv('V3IO_USERNAME')
        if username is None:
            raise RuntimeError("Neither PROJECT_NAME nor V3IO_USERNAME is set; cannot derive the project name")
        project_name = 'stocks-' + username + "2"
    context.PROJECT_NAME = project_name
    mlrun.set_environment(project=context.PROJECT_NAME)

    context.last_trained_times = {}

    context.sym_to_url = {'GOOGL': 'google-inc', 'MSFT': 'microsoft-corp', 'AMZN': 'amazon-com-inc',
                          'AAPL': 'apple-computer-inc', 'INTC' : 'intel-corp'}

    # NOTE(review): a recorded run of this notebook failed with
    # "feature stocks.Opens_min_1h not found in feature set stocks" - confirm these
    # names against the feature sets that are actually deployed.
    features = ["stocks.Opens_min_1h",
                "stocks.Opens_max_1h",
                "stocks.Volumes_min_1h",
                "stocks.Volumes_max_1h",
                "stocks.Open",
                "stocks.High",
                "stocks.Low",
                "stocks.Close",
                "stocks.Volume",
                "stocks.ticker",
                "news.Sentiment"]
    vector = fs.FeatureVector("stocks-vec", features, description="stocks demo feature vector")
    vector.save()
    context.vector = vector

    # One scaler per value family, so prices, volumes and sentiment are scaled independently.
    context.priceMMS = MinMaxScaler(feature_range = (0, 1))
    context.volumeMMS = MinMaxScaler(feature_range = (0, 1))
    context.sentimentMMS = MinMaxScaler(feature_range = (0, 1))
    context.logger.info("Finished Initalizing context")

In [179]:
# nuclio: end-code

In [182]:
# Select the MLRun project for this notebook session.
# NOTE(review): the project name is hard-coded; init_context() calls
# mlrun.set_environment again with its own PROJECT_NAME - confirm which one should win.
mlrun.set_environment(project="stocks3-dani")

('stocks3-dani', 'v3io:///projects/{{run.project}}/artifacts')

In [183]:
# Local test: create an MLRun context, initialise it and invoke the handler once in-process.
context = mlrun.get_or_create_ctx(name="stocks3-dani")
init_context(context)
# The handler does not read the event, so an empty placeholder is enough.
event = ""
handler(context,event)

> 2021-06-29 09:06:39,288 [info] logging run results to: http://mlrun-api:8080
> 2021-06-29 09:06:39,391 [info] Initalizing context
> 2021-06-29 09:06:39,528 [info] Finished Initalizing context


MLRunInvalidArgumentError: feature stocks.Opens_min_1h not found in feature set stocks

In [158]:
# test remote deployment
from mlrun import code_to_function

# Create a function object named 'model_trainer' whose entry point is `handler`
# (presumably built from the code between the nuclio start-code/end-code markers - confirm).
fn = code_to_function('model_trainer',
                      handler='handler')

# Set parameters for current deployment
# PROJECT_NAME is read by init_context() and model_path by handler().
# NOTE(review): os.getenv('V3IO_USERNAME') returns None when the variable is unset,
# which makes the concatenation below raise TypeError.
# NOTE(review): os.getcwd() is evaluated in the notebook kernel - confirm that this
# path is valid inside the deployed function.
fn.set_envs({'PROJECT_NAME' : "stocks2-" + os.getenv('V3IO_USERNAME'),
             'model_path' : os.getcwd()+'/mymodel2.h5'})
# Cap the number of replicas at two.
fn.spec.max_replicas = 2

In [159]:
# Deploy the function into the "stocks2-<V3IO_USERNAME>" project and keep the returned value in `addr`.
addr = fn.deploy(project="stocks2-" + os.getenv('V3IO_USERNAME'))

> 2021-06-29 07:44:56,477 [info] Starting remote function deploy
2021-06-29 07:44:56  (info) Deploying function
2021-06-29 07:44:56  (info) Building
2021-06-29 07:44:56  (info) Staging files and preparing base images
2021-06-29 07:44:57  (info) Building processor image
2021-06-29 07:45:02  (info) Build complete
2021-06-29 07:45:08  (info) Function deploy complete
> 2021-06-29 07:45:09,119 [info] function deployed, address=default-tenant.app.vmdev31.lab.iguazeng.com:32241


In [186]:
# Select the hard-coded project again before the feature-store query in the following cells.
mlrun.set_environment(project="stocks3-dani")

('stocks3-dani', 'v3io:///projects/{{run.project}}/artifacts')

In [187]:
# Build a feature vector over the `stocks` and `news` feature sets
# (the `*` presumably selects every feature of a set - confirm against the MLRun docs).
# The vector reuses the name "stocks-vec" from init_context() and is not saved in this cell.
features = ["stocks.*",
            "news.*"]
vector = fs.FeatureVector("stocks-vec", features, description="stocks demo feature vector")

In [188]:
# Fetch the vector's offline features as a DataFrame.
# NOTE(review): the recorded run failed with a 404 for feature set `stocks` in project
# stocks3-dani - confirm that the feature sets exist in the selected project.
data = fs.get_offline_features(vector).to_dataframe()

MLRunNotFoundError: 404 Client Error: Not Found for url: http://mlrun-api:8080/api/projects/stocks3-dani/feature-sets/stocks/references/latest: None

In [None]:
# Preview only the first rows instead of rendering the whole frame.
data.head()