# Nuclio - Training function

## Environment

In [1]:
# nuclio: ignore
import nuclio

## Function

### Imports

In [1]:
# Utils
import os
import time
import yaml
import pandas as pd
import datetime
import itertools
import pickle

# DB Connection
import v3io_frames as v3f

# Parallelization
import dask.dataframe as dd
from dask.distributed import Client

# Function
import dask_ml.model_selection as dcv
import xgboost as xgb
from mlrun import get_or_create_ctx

In [2]:
# Obtain the MLRun context named 'training'. The cells below read run
# parameters from it (get_param) and log through it (logger).
mlruncontext = get_or_create_ctx('training')

[mlrun] 2020-03-10 18:07:07,814 logging run results to: http://10.233.60.111:8080


In [None]:

# Netops features table: directory that get_data_parquet() lists for parquet files.
# NOTE(review): str() turns a missing NETAPP_MOUNT_PATH into the literal 'None',
# which yields a relative 'None/<table>' path -- confirm the parameter is always set.
features_table = os.path.join(
    str(mlruncontext.get_param('NETAPP_MOUNT_PATH')),
    mlruncontext.get_param('FEATURES_TABLE', 'netops_features'),
)
mlruncontext.logger.info("FEATURES %s"%features_table)

# Set time to train on
train_on_last = mlruncontext.get_param('TRAIN_ON_LAST', '7d')

# Get saving configuration (FROM_TSDB == 1 enables the flag)
is_from_tsdb = (int(mlruncontext.get_param('FROM_TSDB', 0)) == 1)

# Create saving directory if needed.
# exist_ok=True avoids the check-then-create race of exists() + makedirs().
filepath = features_table
os.makedirs(filepath, exist_ok=True)

# Set training set size (fraction of the data used for training).
# The parameter is parsed once; both historical names are kept for any
# code that still refers to either of them.
train_size = float(mlruncontext.get_param('TRAIN_SIZE', 0.7))
train_set_size = train_size

# Dask shards / CV
shards = int(mlruncontext.get_param('NUMBER_OF_SHARDS', 4))

# Create save-to folder if needed.
# NOTE(review): the SAVE_TO default is an absolute path, so on POSIX
# os.path.join() discards APP_DIR whenever the default is used.
model_filepath = os.path.join(
    mlruncontext.get_param('APP_DIR'),
    mlruncontext.get_param('SAVE_TO', '/v3io/bigdata/netops/models'),
)
os.makedirs(model_filepath, exist_ok=True)

### Helper functions

In [None]:
def get_data_parquet():
    """Load the most recently modified entry of the features table with Dask.

    Lists the module-level ``features_table`` directory, selects the entry
    with the latest modification time and reads it via ``dd.read_parquet``.

    Returns:
        The Dask dataframe returned by ``dd.read_parquet`` for that entry.

    Raises:
        ValueError: if ``features_table`` contains no entries.
    """
    # Get parquet files: every entry directly under the features directory
    mpath = [os.path.join(features_table, file) for file in os.listdir(features_table)]
    if not mpath:
        # max() on an empty list raises a bare ValueError; keep the type but
        # say what is actually wrong.
        raise ValueError('No parquet files found in: %s' % features_table)

    # Get latest filename
    latest = max(mpath, key=os.path.getmtime)
    print(latest)
    # This function receives no context argument, so log through the
    # module-level MLRun context (a bare `context` name is undefined here).
    mlruncontext.logger.debug('Reading data from: %s'%latest)

    # Load parquet to dask
    df = dd.read_parquet(latest)

    return df

In [None]:
def get_train_test_sets_from_data(df: pd.DataFrame):
    """Split a features dataframe into train/test feature and label sets.

    Every column whose name contains ``'is_error'`` is removed from the
    feature matrix; the ``'is_error'`` column itself is the label. The split
    ratio comes from the module-level ``train_size``.

    NOTE(review): the annotation says pandas, but ``handler`` passes the
    result of ``dd.read_parquet`` -- confirm which type is intended.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` as produced by
        ``dcv.train_test_split``.
    """
    # Columns that carry (or derive from) the label must not leak into X
    label_like = [name for name in df.columns if 'is_error' in name]
    features = df.drop(label_like, axis=1)
    target = df.loc[:, 'is_error']

    # Whatever is not used for training is held out for testing
    holdout = 1-train_size
    X_train, X_test, y_train, y_test = dcv.train_test_split(
        features, target, train_size=train_size, test_size=holdout
    )
    return X_train, X_test, y_train, y_test

### Handler

In [None]:
def handler(context):
    """Train an XGBoost classifier on the latest features and persist it.

    Steps: load the newest parquet data, split it into train/test sets, fit
    an ``xgb.XGBClassifier``, log its test score as the ``accuracy`` result,
    pickle the model to disk and register it as an artifact.

    Args:
        context: run context providing ``get_param``, ``logger``,
            ``log_result`` and ``log_artifact``. The parameters ``APP_DIR``,
            ``SAVE_TO`` and ``MODEL_FILENAME`` determine the model path.
    """
    # Get data
    df = get_data_parquet()

    # Split to Train / Test datasets
    X_train, X_test, y_train, y_test = get_train_test_sets_from_data(df)

    # Train
    model = xgb.XGBClassifier()
    model.fit(X_train, y_train)

    # Score
    score = model.score(X_test, y_test)
    context.log_result('accuracy',score)

    # Save model. SAVE_TO uses the same default as the module-level
    # configuration so an unset parameter does not hand None to os.path.join.
    model_file = os.path.join(
        context.get_param('APP_DIR'),
        context.get_param('SAVE_TO', '/v3io/bigdata/netops/models'),
        context.get_param('MODEL_FILENAME'),
    )
    context.logger.info("Save model to %s"%model_file)

    # Write and close the file first, so the artifact is only registered
    # once the model is fully on disk.
    with open(model_file, 'wb') as model_fp:
        pickle.dump(model, model_fp)

    # NOTE(review): model_file is passed as the second positional argument,
    # unchanged from the original call -- confirm it maps to the intended
    # log_artifact parameter for the MLRun version in use.
    context.log_artifact('trained_model.pickle',model_file)

## Test

In [None]:
#output = handler(context)
#output

## Deployment