In [None]:
import tensorflowjs as tfjs

import os
import shutil
import tempfile
import json

import numpy as np
import pandas as pd

import mlflow
import mlflow.tensorflow
import mlflow.keras

from faculty import datasets

from google.cloud import storage

from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import confusion_matrix

import tensorflow as tf
from tensorflow import keras
# Short aliases for the Keras submodules used when building the network below.
layers = keras.layers
models = keras.models

In [None]:
# Select the MLflow experiment by name before any run is started in this notebook.
mlflow.set_experiment("Article type")

In [None]:
def train_test_split(data, train_size):
    """Cut ``data`` in two at position ``train_size`` without shuffling.

    Args:
        data: Any sliceable sequence (list, string, pandas Series, ...).
        train_size: Number of leading items that go into the first part.

    Returns:
        A ``(train, test)`` pair: the first ``train_size`` items, followed by
        everything after them, both in their original order.
    """
    head, tail = data[:train_size], data[train_size:]
    return head, tail

In [None]:
def prepare_data(split, max_words=1000, data_path="/project/bbc-text.csv"):
    """Load the article CSV and convert it into arrays a Keras model can consume.

    The text column is vectorised with a Keras ``Tokenizer`` and the category
    column is label-encoded and then one-hot encoded. Both the tokenizer and
    the label encoder are fitted on the training portion only.

    Args:
        split: When truthy, the first 80% of the rows are used for training
            and the remaining rows are held out as a test set. The cut is
            positional (no shuffling), so the result depends on row order in
            the CSV. When falsy, every row is used for training.
        max_words: Passed to the tokenizer as ``num_words``.
        data_path: Location of the CSV file; it must contain ``category`` and
            ``text`` columns. Defaults to the project's BBC dataset.

    Returns:
        If ``split`` is truthy:
            ``(x_train, x_test, y_train, y_test, tokenizer, encoder, num_classes)``
        otherwise:
            ``(x_train, y_train, tokenizer, encoder, num_classes)``
    """
    data = pd.read_csv(data_path)

    tokenizer = keras.preprocessing.text.Tokenizer(
        num_words=max_words,
        char_level=False
    )
    encoder = LabelEncoder()

    if split:
        train_size = int(len(data) * .8)
        print("Train size: %d" % train_size)
        print("Test size: %d" % (len(data) - train_size))
        train_cat, test_cat = train_test_split(data['category'], train_size)
        train_text, test_text = train_test_split(data['text'], train_size)
    else:
        print("Train size: %d" % len(data))
        train_cat = data['category']
        train_text = data['text']

    # The vocabulary is learned from the training text only.
    tokenizer.fit_on_texts(train_text)
    x_train = tokenizer.texts_to_matrix(train_text)

    # Label strings -> integer indices -> one-hot rows.
    encoder.fit(train_cat)
    # Every fitted class occurs in the training labels, so this equals
    # max(index) + 1 while yielding a plain Python int.
    num_classes = len(encoder.classes_)
    y_train = keras.utils.to_categorical(encoder.transform(train_cat), num_classes)

    if not split:
        print('x_train shape:', x_train.shape)
        print('y_train shape:', y_train.shape)
        return x_train, y_train, tokenizer, encoder, num_classes

    # Held-out rows reuse the tokenizer and encoder fitted on the training rows.
    x_test = tokenizer.texts_to_matrix(test_text)
    y_test = keras.utils.to_categorical(encoder.transform(test_cat), num_classes)

    print('x_train shape:', x_train.shape)
    print('x_test shape:', x_test.shape)
    print('y_train shape:', y_train.shape)
    print('y_test shape:', y_test.shape)
    return x_train, x_test, y_train, y_test, tokenizer, encoder, num_classes


In [None]:
def run_experiment(batch_size, epochs, drop_ratio, max_words=1000):
    """Train, evaluate and log one bag-of-words classifier as an MLflow run.

    Inside a single MLflow run this function:
      1. logs the hyperparameters,
      2. prepares an 80/20 train/test split via ``prepare_data``,
      3. fits a one-hidden-layer dense network (10% of the training data is
         used for validation during fitting),
      4. logs test-set loss and accuracy,
      5. logs the model in Keras format under ``model/python`` and in
         TensorFlow.js format under ``model/javascript``,
      6. logs the tokenizer vocabulary (``bag_of_words.json``) and the class
         labels (``label_classes.json``) as artifacts.

    Args:
        batch_size: Batch size used for both fitting and evaluation.
        epochs: Number of training epochs.
        drop_ratio: Rate of the dropout layer that follows the hidden layer.
        max_words: Tokenizer vocabulary size; also the model's input width.

    Returns:
        The fitted Keras model.
    """
    with mlflow.start_run():
        # Log params
        mlflow.log_param("batch_size", batch_size)
        mlflow.log_param("epochs", epochs)
        mlflow.log_param("drop_ratio", drop_ratio)
        mlflow.log_param("max_words", max_words)

        # Prepare data
        x_train, x_test, y_train, y_test, \
            tokenizer, encoder, num_classes = prepare_data(True, max_words)

        # Build model
        model = models.Sequential()
        model.add(layers.Dense(512, input_shape=(max_words,)))
        model.add(layers.Activation('relu'))
        model.add(layers.Dropout(drop_ratio))
        model.add(layers.Dense(num_classes))
        model.add(layers.Activation('softmax'))
        model.compile(
            loss='categorical_crossentropy',
            optimizer='adam',
            metrics=['accuracy']
        )

        # Fit model
        model.fit(
            x_train, y_train,
            batch_size=batch_size,
            epochs=epochs,
            verbose=0,
            validation_split=0.1
        )

        # Evaluate model; score is [loss, accuracy] given the compiled metrics.
        score = model.evaluate(
            x_test, y_test,
            batch_size=batch_size,
            verbose=0
        )

        # Log metrics
        mlflow.log_metric("loss", score[0])
        mlflow.log_metric("accuracy", score[1])

        # Save python model
        mlflow.set_tag("python_model", "yes")
        mlflow.keras.log_model(model, "model/python")

        # Stage the remaining artifacts in a private scratch directory rather
        # than fixed /tmp paths, so concurrent runs cannot clobber each other
        # and nothing is left behind afterwards. File names are kept because
        # mlflow.log_artifact stores each file under its base name.
        with tempfile.TemporaryDirectory() as scratch_dir:
            # Save javascript model
            mlflow.set_tag("javascript_model", "yes")
            js_model_dir = os.path.join(scratch_dir, "js-model")
            tfjs.converters.save_keras_model(model, js_model_dir)
            mlflow.log_artifacts(js_model_dir, "model/javascript")

            # Save bag of words
            bag_of_words_path = os.path.join(scratch_dir, "bag_of_words.json")
            with open(bag_of_words_path, "w") as f:
                json.dump(tokenizer.word_index, f)
            mlflow.log_artifact(bag_of_words_path)

            # Save class labels
            label_classes_path = os.path.join(scratch_dir, "label_classes.json")
            with open(label_classes_path, "w") as f:
                json.dump(list(encoder.classes_), f)
            mlflow.log_artifact(label_classes_path)

        return model

In [None]:
# Hyperparameters for a single training run.
batch_size = 16
epochs = 4
drop_ratio = 0.4
max_words = 1000
# Train, evaluate and log one run; keep a handle on the returned model.
model = run_experiment(batch_size, epochs, drop_ratio, max_words)

In [None]:
# Experiment id used to build the artifact path read by upload_to_gcloud.
# NOTE(review): hard-coded; presumably the id of the "Article type" experiment — confirm.
experiment_id = 1
# Google Cloud Storage bucket that upload_to_gcloud writes to.
bucket_name = "faculty-models"
def upload_to_gcloud(run_id):
    """Publish selected artifacts of one MLflow run to Google Cloud Storage.

    Each artifact is fetched from the project datasets store at
    ``.mlflow-artifacts/<experiment_id>/<run_id>/<path>``, uploaded to the
    bucket named by the module-level ``bucket_name`` under its base file
    name, and then made public.

    Args:
        run_id: Identifier of the MLflow run whose artifacts are published.

    NOTE(review): only ``model.json`` is taken from the JavaScript model
    directory; any weight files written next to it are not uploaded here —
    confirm they are published some other way.
    NOTE(review): every uploaded blob is made public; confirm that is intended
    for all three files.
    """
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)
    artifact_paths = [
        "bag_of_words.json",
        "label_classes.json",
        "model/javascript/model.json",
    ]
    # Download into a private scratch directory instead of fixed /tmp paths:
    # avoids clashes with other processes and cleans up automatically.
    with tempfile.TemporaryDirectory() as scratch_dir:
        for path in artifact_paths:
            name = os.path.basename(path)
            local_path = os.path.join(scratch_dir, name)
            datasets.get(
                f".mlflow-artifacts/{experiment_id}/{run_id}/{path}",
                local_path
            )
            blob = bucket.blob(name)
            blob.upload_from_filename(local_path)
            blob.make_public()

In [None]:
upload_to_gcloud("352c0653-9dff-4e6b-acc1-15499e40190b")