In [1]:
from tesseract import Tesseract, FileStore
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

ImportError: No module named matplotlib.pyplot

# Load the Data

In [2]:
# Load the iris dataset object via scikit-learn's load_iris().
iris = load_iris()

# Pull out the features (x) and the response labels (y) in one step.
x, y = iris.data, iris.target

# Tesseract Setup

In [26]:
# Create a file store rooted at ./local_file_store and a Tesseract client
# that uses it, pointed at the endpoint on localhost:8000.
fs = FileStore("./local_file_store")
ts = Tesseract(fs, "http://localhost:8000")

# Resource request attached to this client: one CPU core, 4 GB of RAM, and
# the listed libraries.
ts.with_resources(
    cpu_cores=1,
    ram_gb=4,
    libraries=["cloudpickle", "numpy", "scipy", "scikit-learn"],
)

# Basic ML Example

In [27]:
# Hold out a test set; random_state is pinned to a single value (4) so the
# same split is produced on every run.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=4)

# Fit KNN with K=5 through Tesseract: ts.run() is handed the bound knn.fit
# method plus its arguments and returns a future; the future's result() is
# used below as the trained model.
knn = KNeighborsClassifier(n_neighbors=5)
future = ts.run(knn.fit, x_train, y_train)
model = future.result()

# Classification accuracy of the returned model on the held-out data.
y_pred = model.predict(x_test)
accuracy = metrics.accuracy_score(y_test, y_pred)
print(accuracy)

0.973684210526


# K-fold Cross Validation

In [28]:
# Second Tesseract client, used for the cross-validation runs. It shares the
# file store `fs` and the same endpoint, but its resources are described by a
# Docker image name rather than a library list.
ts_cv = Tesseract(fs, "http://localhost:8000")
ts_cv.with_resources(
    docker="adamstruck/scikit-learn:latest",
)

In [29]:
# 10-fold cross-validation of KNN with K=5 (K is the n_neighbors parameter).
# This classifier instance is the one passed to every per-fold run below.
knn = KNeighborsClassifier(n_neighbors=5)

def train(knn, x_train, x_test, y_train, y_test):
    """Fit ``knn`` on the training split and score it on the test split.

    Args:
        knn: Object exposing ``fit(x, y)`` and ``predict(x)``; ``fit`` is
            called on this object directly.
        x_train: Features used for fitting.
        x_test: Features used for prediction.
        y_train: Labels used for fitting.
        y_test: True labels compared against the predictions.

    Returns:
        The value of ``metrics.accuracy_score(y_test, predictions)``.
    """
    knn.fit(x_train, y_train)
    return metrics.accuracy_score(y_test, knn.predict(x_test))

# Split the data into 10 folds and submit one `train` run per fold through
# the cross-validation client, keeping whatever ts_cv.run() returns.
kf = KFold(n_splits=10)
results = []

for train_index, test_index in kf.split(x):
    # Argument order matches train(knn, x_train, x_test, y_train, y_test).
    fold_args = (x[train_index], x[test_index], y[train_index], y[test_index])
    results.append(ts_cv.run(train, knn, *fold_args))

# Collect result() from every run, in submission order, and print the
# per-fold accuracies as a list.
print([r.result() for r in results])

[1.0, 1.0, 1.0, 1.0, 0.80000000000000004, 0.8666666666666667, 1.0, 0.8666666666666667, 0.80000000000000004, 1.0]
