### Create data bunches

In [1]:
# nuclio: ignore
import nuclio

In [2]:
# nuclio: start-code

In [3]:
# Function configuration applied through the nuclio magics: spec.image is set
# to "mlrun/ml-models-gpu" and kind is set to "job".
# NOTE(review): presumably spec.image is the container image the job runs in - confirm.
%nuclio config spec.image = "mlrun/ml-models-gpu"
%nuclio config kind = "job"

%nuclio: setting spec.image to 'mlrun/ml-models-gpu'
%nuclio: setting kind to 'job'


In [4]:
from os import path
import pandas as pd
import numpy as np
import random
import fastai
from fastai.text import *
from fastai.callbacks import *
from pickle import dumps

def create_data_bunches(context, data_path, split):
    """Build and save the fastai language-model and classification data bunches.

    The pickled DataFrame at ``data_path`` is cut once, at row
    ``int(len(df) * split)``: rows before the cut are the training set and
    rows from the cut onward are the validation set. The same cut is used for
    both data bunches, so training and validation rows never overlap and no
    row is dropped.

    Args:
        context: Run context; must provide ``logger.info(msg)`` and
            ``log_result(key=..., value=...)``.
        data_path: Path of a pickled pandas DataFrame. It must contain a
            ``title`` column plus exactly two other columns; after ``title``
            is dropped the remaining columns are renamed, by position, to
            ``text`` and ``target``.
        split: Fraction of rows (taken from the top of the frame) used for
            training. Anything accepted by ``float(str(split))``; must lie
            strictly between 0 and 1.

    Raises:
        ValueError: If ``split`` is not strictly between 0 and 1, or if the
            frame is too small for both partitions to be non-empty.

    Side effects:
        Saves the data bunches to ``/User/nlp/run/data_lm.pkl`` and
        ``/User/nlp/run/data_clas.pkl`` and logs both paths as the run
        results ``data_lm`` and ``data_clas``.
    """
    split = float(str(split))
    if not 0.0 < split < 1.0:
        raise ValueError(
            f"split must be strictly between 0 and 1, got {split}"
        )

    # NOTE(review): unpickling can execute arbitrary code - only point
    # data_path at trusted files.
    df_sample = pd.read_pickle(str(data_path))

    # A single boundary index for both halves. The previous code started the
    # validation slice at int(len * (1 - split)), which overlaps the training
    # rows for any split > 0.5 and leaks training data into validation.
    n_rows = len(df_sample)
    split_idx = int(n_rows * split)
    if split_idx == 0 or split_idx == n_rows:
        raise ValueError(
            f"split={split} leaves an empty train or validation set "
            f"for {n_rows} rows"
        )

    # Create Language Model DataBunch
    print("Creating Language Model DataBunch")
    train_df = df_sample[:split_idx]
    valid_df = df_sample[split_idx:]

    data_out_path = '/User/nlp/run/data_lm.pkl'
    data_lm = TextLMDataBunch.from_df(path="", train_df=train_df, valid_df=valid_df)
    context.logger.info(f'Saving data_lm to {data_out_path} ...')
    data_lm.save(data_out_path)
    context.log_result(key='data_lm', value=data_out_path)

    # Create Classification DataBunch
    print("Creating Classification DataBunch")
    # Non-mutating drop, then rename the two remaining columns by position
    # and put the label column first.
    df_clas = df_sample.drop("title", axis=1)
    df_clas.columns = ['text', 'target']
    df_clas = df_clas[['target', 'text']]

    train_df = df_clas[:split_idx]
    valid_df = df_clas[split_idx:]

    data_out_path = '/User/nlp/run/data_clas.pkl'
    # Reuse the language-model vocabulary for the classifier data.
    data_clas = TextClasDataBunch.from_df(
        path="", train_df=train_df, valid_df=valid_df, vocab=data_lm.vocab
    )
    context.logger.info(f'Saving data_clas to {data_out_path} ...')
    data_clas.save(data_out_path)
    context.log_result(key='data_clas', value=data_out_path)

In [5]:
# nuclio: end-code

In [6]:
from mlrun import mlconf
import os
from os import path

# Target location for storing pipeline artifacts: '../jobs' made absolute
# against the current working directory.
artifact_path = path.abspath('../jobs')
# MLRun DB path or API service URL: keep the value already configured in
# mlconf and fall back to 'http://mlrun-api:8080' only when it is empty/falsy.
mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'

# Print the effective settings so they are recorded in the cell output.
print(f'Artifacts path: {artifact_path}\nMLRun DB path: {mlconf.dbpath}')

Artifacts path: /User/nlp/components/jobs
MLRun DB path: http://mlrun-api:8080


In [7]:
from mlrun import code_to_function 
# create job function object from notebook code
# NOTE(review): no source file is passed, so this presumably picks up the code
# between the "nuclio: start-code" and "nuclio: end-code" cells - confirm.
fn = code_to_function("create_data_bunches")

# add metadata (for templates and reuse)
# default_handler names the entry point to call; it matches the
# create_data_bunches function defined in this notebook.
fn.spec.default_handler = "create_data_bunches"
fn.spec.description = "create fastai data bunches"
fn.metadata.categories = ["data-source", "ml"]
fn.metadata.labels = {"author": "nschenone"}
# Write the function spec to YAML; kept last so it runs after the fields above
# have been set. As the final expression of the cell, its return value is
# displayed as the cell output.
fn.export("../yaml/create_data_bunches.yaml")

> 2020-08-13 18:58:54,193 [info] function spec saved to path: ../yaml/create_data_bunches.yaml


<mlrun.runtimes.kubejob.KubejobRuntime at 0x7f8e40c9fb00>