# Defining and saving prediction function

Run this notebook in a conda environment (here called `environment1`) that has every module needed to run the notebook and build the model installed.

For more info, check out the design document: https://docs.google.com/document/d/1V6mKEkl29phAUkVic0J1ySd0Vtqkdikb9cDVTwsaL-I

## Load in data

In [None]:
import os

import numpy as np
import pandas as pd

# Taken from https://gist.github.com/dcrankshaw/f851ea2fee582f544288d36ae97ef86d
def load_digits(digits_location, digits_filename):
    """Load a headerless, comma-separated digits file into features and labels.

    Args:
        digits_location: Folder that contains the data file. An empty string
            means the current working directory.
        digits_filename: Name of the data file inside ``digits_location``.

    Returns:
        A tuple ``(X, y)`` of NumPy arrays, where ``y`` is the first column
        of the file and ``X`` holds all remaining columns.
    """
    # os.path.join keeps an empty location relative to the working directory;
    # gluing the parts together with "/" would yield "/<file>", i.e. a path
    # in the filesystem root.
    digits_path = os.path.join(digits_location, digits_filename)
    # A single pre-formatted argument prints identically whether `print` is
    # the Python 2 statement or the Python 3 function.
    print("Source file: %s" % digits_path)
    df = pd.read_csv(digits_path, sep=",", header=None)
    data = df.values
    print("Number of image files: %d" % len(data))
    y = data[:, 0]
    X = data[:, 1:]
    return (X, y)

def normalize_digits(X):
    """Standardize ``X`` along axis 0 to zero mean and unit variance.

    The mean and variance are computed from ``X`` itself. Entries whose
    variance is not positive are only centered (divided by 1), so constant
    features do not cause a division by zero.

    Args:
        X: Array of samples, either 2-D (one sample per row) or a 1-D vector.

    Returns:
        Array of the same shape as ``X`` holding the standardized values.
    """
    print("Normalizing data")
    mu = np.mean(X, 0)
    variance = np.var(X, 0)
    # Vectorized on purpose: for 1-D input the variance is a scalar, which a
    # Python-level loop over it could not iterate.
    scale = np.where(variance > 0, np.sqrt(variance), 1.)
    return (X - mu) / scale

Provide the location of the MNIST data. It can be downloaded here: https://www.dropbox.com/s/n3jqkdg5ukx7ku5/mnist.zip?dl=0

In [None]:
# Set this to the path of the folder enclosing the .data files.
# "." means the current working directory.
digits_location = "."
train_data_fname = "train-mnist-dense-with-labels.data"

Load in the training data

In [None]:
# load_digits returns (X, y): y is column 0 of the file, X the remaining columns.
train_x, train_y = load_digits(digits_location, train_data_fname)

## Select subset of data to train on

In [None]:
import random

NUM_DATAPOINTS = 1000
SUBSET_SEED = 42  # fixed so "Restart & Run All" selects the same subset every time
n, d = train_x.shape

# Sample row indices without replacement from a dedicated, seeded generator:
# the subset is reproducible and the global `random` state stays untouched.
# Capping the size at n keeps the cell working when the dataset has fewer
# than NUM_DATAPOINTS rows; dtype=int keeps an empty selection usable as an
# index array.
subset_rng = random.Random(SUBSET_SEED)
indices = np.array(subset_rng.sample(range(n), min(NUM_DATAPOINTS, n)), dtype=int)

train_x_short = train_x[indices, :]
train_y_short = train_y[indices]

## Pre-process data and train model

In [None]:
from sklearn import linear_model as lm
def train_sklearn_model(m, train_x, train_y):
    """Fit the estimator ``m`` on ``(train_x, train_y)`` and return it.

    ``m`` is fitted in place through its ``fit`` method; the very same object
    is handed back so the call can be used directly in an assignment.
    """
    estimator = m
    estimator.fit(train_x, train_y)
    return estimator
# Standardize the training subset, then fit a logistic-regression classifier on it.
normalized_training_x_short = normalize_digits(train_x_short)
lr_model = train_sklearn_model(lm.LogisticRegression(), normalized_training_x_short, train_y_short)

## Define prediction function

In [None]:
# Normalization statistics of the training subset, computed once so that
# inference applies exactly the transformation the model was fitted on.
# Features with non-positive variance are only centered (scale 1), which
# mirrors what normalize_digits does.
_train_mean = np.mean(train_x_short, 0)
_train_variance = np.var(train_x_short, 0)
_train_scale = np.where(_train_variance > 0, np.sqrt(_train_variance), 1.)


def predict(data):
    """Predict labels for ``data`` with the trained ``lr_model``.

    Args:
        data: Raw (un-normalized) samples laid out like ``train_x_short``,
            one sample per row.

    Returns:
        Whatever ``lr_model.predict`` returns for the normalized samples.
    """
    # Normalizing with the statistics of `data` itself would make the result
    # depend on the batch composition, and a single-row batch would collapse
    # to all zeros (every value equals its own column mean).
    normalized_data = (np.asarray(data) - _train_mean) / _train_scale
    return lr_model.predict(normalized_data)

## Serialize prediction function and export it to a text file

Serialize the predict function

In [None]:
from io import BytesIO
from cloudpickle import CloudPickler

# Pickle output is binary, so collect it in a bytes buffer. io.BytesIO is
# available on Python 2.6+ and Python 3, whereas cStringIO exists on
# Python 2 only.
pickle_buffer = BytesIO()
# Protocol 2 is the newest pickle protocol that Python 2 can still read.
pickler = CloudPickler(pickle_buffer, 2)
pickler.dump(predict)
serialized_prediction_function = pickle_buffer.getvalue()

Write it to a file

In [None]:
# The pickle payload is binary: "wb" prevents newline translation from
# corrupting it and lets Python 3 accept the bytes. The context manager
# closes the file even if the write raises.
with open("serialized_function.txt", "wb") as serialized_function_file:
    serialized_function_file.write(serialized_prediction_function)

## Identify all installed packages and export them to a text file

Identify all installed Python packages

In [None]:
# pip.get_installed_distributions() was removed in pip 10, and pip does not
# support being imported as a library. Prefer the standard-library metadata
# API and fall back to the legacy pip call only where that API is missing
# (Python < 3.8).
try:
    from importlib.metadata import distributions
except ImportError:
    import pip
    installed_packages = [(m.key, m.version) for m in pip.get_installed_distributions()]
else:
    installed_packages = [
        (dist.metadata.get("Name"), dist.version)
        for dist in distributions()
    ]

# Lower-case the names (the legacy `key` attribute was lower-case too), skip
# distributions whose metadata lacks a name or version, and de-duplicate and
# sort so the exported list is stable between runs.
installed_packages_list = sorted(set(
    "{name}=={version}".format(name=name.lower(), version=version)
    for name, version in installed_packages
    if name and version
))

Write them to a file

In [None]:
installed_packages_str = '\n'.join(installed_packages_list)
# The context manager closes the file even if the write raises.
with open("dependencies.txt", "w") as dependencies_file:
    dependencies_file.write(installed_packages_str)