### Training for Language Model

In [1]:
import nuclio

In [2]:
# nuclio: start-code

In [3]:
%nuclio config spec.image = "mlrun/ml-models-gpu"
%nuclio config kind = "job"

%nuclio: setting spec.image to 'mlrun/ml-models-gpu'
%nuclio: setting kind to 'job'


In [4]:
from os import path
import pandas as pd
import numpy as np
import random
import fastai
import json
from fastai.text import *
from fastai.callbacks import *
# Make CUDA device index 0 the current device before any model is created.
# NOTE(review): `torch` is not imported explicitly in this cell; presumably it
# is brought into scope by the fastai star imports above -- confirm.
torch.cuda.set_device(0)

def get_min_grad(model):
    """Run the learning-rate finder on ``model`` and return its suggestion.

    A 200-iteration ``lr_find`` sweep is executed, the recorded curve is
    plotted with ``suggestion=True``, and the ``min_grad_lr`` attribute that
    the recorder exposes afterwards is returned.
    """
    model.lr_find(num_it=200)

    # Grab the recorder only after the sweep so we read the one it populated.
    lr_recorder = model.recorder
    lr_recorder.plot(suggestion=True)

    suggested_lr = lr_recorder.min_grad_lr
    return suggested_lr

def train_lm_model(bs, drop_mult, epochs, num_samples, data_lm_path):
    """Train an AWD_LSTM language model and return its best accuracy.

    Training happens in two stages: one cycle before ``unfreeze()`` is called,
    then ``epochs`` cycles afterwards. The learning rate for each stage is
    obtained from ``get_min_grad``. The best checkpoint (by accuracy) is saved
    during training and reloaded before returning.

    Args:
        bs: Batch size handed to ``load_data``; also part of the checkpoint name.
        drop_mult: Dropout multiplier handed to ``language_model_learner``;
            also part of the checkpoint name.
        epochs: Number of epochs for the second (post-unfreeze) stage.
        num_samples: Used only to build the checkpoint name (``sample_<n>``).
        data_lm_path: File name of the saved language-model data, loaded with
            an empty base path.

    Returns:
        A tuple ``(best_acc, learn_lm)`` where ``best_acc`` is the highest
        value of the first recorded metric (accuracy) as a Python number and
        ``learn_lm`` is the learner with the best checkpoint loaded.
    """
    # Load data
    version = f"sample_{num_samples}"
    data_lm = load_data("", str(data_lm_path), bs=bs)

    # Define metrics
    metrics = [accuracy]

    # Define model callbacks: early stopping monitors accuracy (higher is
    # better) with min_delta=0.01 and patience=5.
    early_stop = partial(EarlyStoppingCallback,
                         monitor='accuracy',
                         mode="max",
                         min_delta=0.01,
                         patience=5)

    callback_fns = [early_stop]

    # Define language model (converted to half precision)
    learn_lm = language_model_learner(data_lm,
                                      AWD_LSTM,
                                      drop_mult=drop_mult,
                                      metrics=metrics,
                                      callback_fns=callback_fns).to_fp16()

    # Define training callback: checkpoint under `model_name` whenever the
    # monitored accuracy improves.
    model_name = f"learn_lm_{version}_bs_{bs}_dropmult_{drop_mult}"
    save_best = SaveModelCallback(learn_lm,
                                  every='improvement',
                                  monitor="accuracy",
                                  mode="max",
                                  name=model_name)
    callbacks = [save_best]

    # Stage 1: a single cycle before unfreezing
    min_grad = get_min_grad(learn_lm)
    learn_lm.fit_one_cycle(1, min_grad, callbacks=callbacks)
    learn_lm.unfreeze()

    # Stage 2: re-run the LR finder on the unfrozen model, then train with a
    # learning-rate slice spanning [min_grad, 100 * min_grad]
    min_grad = get_min_grad(learn_lm)
    learn_lm.fit_one_cycle(epochs, slice(min_grad,min_grad*100), moms=(0.8,0.7), callbacks=callbacks)

    # Get accuracy: first entry of each recorded metrics row is `accuracy`.
    # NOTE(review): presumably the recorder only holds the most recent fit's
    # epochs -- confirm against the fastai Recorder docs.
    best_acc = max(epoch_metrics[0] for epoch_metrics in learn_lm.recorder.metrics).item()

    # Load best model
    learn_lm.load(model_name)

    return (best_acc, learn_lm)

def train_lm(context, train_lm_epochs, data_lm_path, num_samples, hyper_lm_best_params_path):
    """Job handler: train the language model with stored hyperparameters.

    Reads ``params.json`` from ``hyper_lm_best_params_path``, overrides the
    ``epochs``, ``num_samples`` and ``data_lm_path`` entries with the values
    passed in, trains via ``train_lm_model``, saves the model and its encoder,
    and logs the output locations on ``context``.

    Args:
        context: Object exposing ``log_result(key, value)`` (presumably the
            MLRun execution context -- confirm).
        train_lm_epochs: Number of training epochs; converted with
            ``int(str(...))``.
        data_lm_path: Location of the language-model data; converted with
            ``str(...)``.
        num_samples: Sample count; converted with ``int(str(...))``.
        hyper_lm_best_params_path: Prefix to which ``"params.json"`` is
            appended verbatim, so a directory must end with a path separator.

    Returns:
        None. Results are written to disk and logged through ``context``.
    """
    # Load best hyperparameters and update epochs for training.
    # The file is only read, so open it read-only: "r+" would additionally
    # require write permission and fail on read-only mounts.
    with open(str(hyper_lm_best_params_path) + "params.json", "r") as f:
        params = json.load(f)
    params['epochs'] = int(str(train_lm_epochs))
    params['num_samples'] = int(str(num_samples))
    params['data_lm_path'] = str(data_lm_path)

    # Train model (the JSON keys must match train_lm_model's parameters)
    best_acc, learn_lm = train_lm_model(**params)

    # Save model and encodings
    model = "/User/nlp/run/train_lm_model"
    model_enc = "/User/nlp/run/train_lm_model_enc"
    learn_lm.save(model)
    learn_lm.save_encoder(model_enc)

    # Log outputs
    context.log_result('train_lm_model', model)
    context.log_result('train_lm_model_enc', model_enc)

    # Persist the best accuracy as text and log the file's location.
    # Write-only access is sufficient here ("w" instead of "w+").
    train_lm_accuracy_path = "/User/nlp/run/train_lm_accuracy.txt"
    with open(train_lm_accuracy_path, "w") as f:
        f.write(str(best_acc))
    context.log_result('train_lm_accuracy', train_lm_accuracy_path)

In [5]:
# nuclio: end-code

In [6]:
from mlrun import mlconf
import os
from os import path

# Target location for storing pipeline artifacts: '../jobs' resolved to an
# absolute path relative to the current working directory.
artifact_path = path.abspath('../jobs')
# MLRun DB path or API service URL: keep the already-configured value when it
# is set (truthy), otherwise fall back to 'http://mlrun-api:8080'.
mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'

# Echo the effective settings for this notebook session.
print(f'Artifacts path: {artifact_path}\nMLRun DB path: {mlconf.dbpath}')

Artifacts path: /User/nlp/components/jobs
MLRun DB path: http://mlrun-api:8080


In [7]:
from mlrun import code_to_function 
# Create the job function object from the notebook code.
# NOTE(review): no kind/image is passed here; presumably they come from the
# `%nuclio config` lines at the top of the notebook -- confirm.
fn = code_to_function("train_lm")

# Add metadata (for templates and reuse)
fn.spec.default_handler = "train_lm"  # handler invoked when none is specified
fn.spec.description = "training for language model"
fn.metadata.categories = ["train", "ml"]
fn.metadata.labels = {"author": "nschenone"}
# Set the resource limits to one NVIDIA GPU (this assignment replaces any
# existing "limits" entry).
fn.spec.resources["limits"] = {'nvidia.com/gpu' : 1}
# Write the function spec to YAML; done last so every setting above is included.
fn.export("../yaml/train_lm.yaml")

> 2020-08-13 19:03:57,260 [info] function spec saved to path: ../yaml/train_lm.yaml


<mlrun.runtimes.kubejob.KubejobRuntime at 0x7f986c9d4080>