## Build model

In [1]:
import tempfile
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, roc_curve
from sklearn.pipeline import Pipeline

In [2]:
%%bash

if [ ! -d ./data ]; then
    mkdir ./data
fi

if [ ! -f ./data/trainingandtestdata.zip ]; then
    wget -q -O ./data/trainingandtestdata.zip http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip
fi

unzip -n ./data/trainingandtestdata.zip -d ./data

Archive:  ./data/trainingandtestdata.zip


In [3]:
columns = ['polarity', 'tweetid', 'date', 'query_name', 'user', 'text']
df_train_file_path = './data/training.1600000.processed.noemoticon.csv'
df_train_name = 'training.1600000.processed.noemoticon'
df_train = pd.read_csv(df_train_file_path,
                      header=None,
                      encoding='ISO-8859-1')

df_test_file_path = './data/testdata.manual.2009.06.14.csv'
df_test_name = 'testdata.manual.2009.06.14'
df_test = pd.read_csv(df_test_file_path,
                     header=None,
                     encoding='ISO-8859-1')
df_train.columns = columns
df_test.columns = columns
df_train.shape[0]

df_train = df_train.sample(frac=1)[:160000]

In [4]:
sentiment_lr = Pipeline([
                         ('count_vect', CountVectorizer(min_df=100,
                                                        ngram_range=(1,2),
                                                        stop_words='english')), 
                         ('lr', LogisticRegression())])
#sentiment_lr.fit(df_train.text, df_train.polarity)
sentiment_lr.fit(df_train.text, df_train.polarity)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Pipeline(steps=[('count_vect',
                 CountVectorizer(min_df=100, ngram_range=(1, 2),
                                 stop_words='english')),
                ('lr', LogisticRegression())])

In [5]:
x_test, y_test = df_test.text[df_test.polarity != 2], df_test.polarity[df_test.polarity != 2]
print(classification_report(y_test, sentiment_lr.predict(x_test)))

              precision    recall  f1-score   support

           0       0.84      0.71      0.77       177
           4       0.75      0.87      0.81       182

    accuracy                           0.79       359
   macro avg       0.80      0.79      0.79       359
weighted avg       0.80      0.79      0.79       359



In [6]:
sentiment_lr.predict([x_test[0]])

array([4])

In [7]:
sentiment_lr.predict(['good', 'bad'])

array([4, 0])

In [8]:
from predictor import predict_func
def predict_function(model, text_list):
    return predict_func(model, text_list)

In [9]:
texts = ['some new text, sweet noodles', 'happy time', 'sad day']
predict_function(sentiment_lr, texts)

(array([[0.16322228, 0.83677772],
        [0.20672419, 0.79327581],
        [0.94389994, 0.05610006]]),
 ['negative', 'positive'],
 [0, 4])

### Predict function and model

In [10]:
predict_function(sentiment_lr, ["great"])

(array([[0.26012889, 0.73987111]]), ['negative', 'positive'], [0, 4])

# Unbox

### Pack model

In [11]:
import unboxapi
client = unboxapi.UnboxClient()

User is not logged in.


In [13]:
saved_path = client.pack_model(
    function=predict_function, 
    model=sentiment_lr,
    model_name="SklearnModel",
    local_imports=["predictor"]
)

[2021-05-12 12:21:47,523] INFO - BentoService bundle 'SklearnModel:20210512122144_1D24FF' saved to: /Users/gbayomi/bentoml/repository/SklearnModel/20210512122144_1D24FF


### Load

In [14]:
import bentoml
from pandas import DataFrame

In [15]:
bento_model = bentoml.load(saved_path)



### Test locally

In [16]:
bento_model.predict([{"text": "great"}])

[{'negative': 0.26012888950640967, 'positive': 0.7398711104935903}]

In [17]:
bento_model.predictbatch([{"batch": ["great", "terrible"]}])

[[{'negative': 0.26012888950640967, 'positive': 0.7398711104935903},
  {'negative': 0.7937340674938305, 'positive': 0.2062659325061696}]]

In [18]:
bento_model.predictactive([{"batch": ["great", "terrible"], "n_instances": 2}])

[([0, 1], ['great', 'terrible'])]

### Test as an endpoint

In [19]:
!bentoml run SklearnModel:latest predict --input '{"text": "Which baking dish is best to bake a banana bread ?"}'

[2021-05-12 12:22:20,313] INFO - Getting latest version SklearnModel:20210512122144_1D24FF
[2021-05-12 12:22:23,588] INFO - {'service_name': 'SklearnModel', 'service_version': '20210512122144_1D24FF', 'api': 'predict', 'task': {'data': '{"text": "Which baking dish is best to bake a banana bread ?"}', 'task_id': '535c28ee-03aa-4182-a6bd-233ff86186eb', 'cli_args': ('--input', '{"text": "Which baking dish is best to bake a banana bread ?"}'), 'inference_job_args': {}}, 'result': {'data': '{"negative": 0.2729064092809028, "positive": 0.7270935907190972}', 'http_status': 200, 'http_headers': (('Content-Type', 'application/json'),)}, 'request_id': '535c28ee-03aa-4182-a6bd-233ff86186eb'}
{"negative": 0.2729064092809028, "positive": 0.7270935907190972}


In [20]:
!bentoml run SklearnModel:latest predictbatch --input '{"batch": ["great", "terrible"]}'

[2021-05-12 12:22:27,228] INFO - Getting latest version SklearnModel:20210512122144_1D24FF
[2021-05-12 12:22:30,532] INFO - {'service_name': 'SklearnModel', 'service_version': '20210512122144_1D24FF', 'api': 'predictbatch', 'task': {'data': '{"batch": ["great", "terrible"]}', 'task_id': '1c16c044-c495-4e45-8a6b-46dee3be1eaa', 'cli_args': ('--input', '{"batch": ["great", "terrible"]}'), 'inference_job_args': {}}, 'result': {'data': '[{"negative": 0.26012888950640967, "positive": 0.7398711104935903}, {"negative": 0.7937340674938305, "positive": 0.2062659325061696}]', 'http_status': 200, 'http_headers': (('Content-Type', 'application/json'),)}, 'request_id': '1c16c044-c495-4e45-8a6b-46dee3be1eaa'}
[{"negative": 0.26012888950640967, "positive": 0.7398711104935903}, {"negative": 0.7937340674938305, "positive": 0.2062659325061696}]


In [21]:
!bentoml run SklearnModel:latest predictactive --input '{"batch": ["great", "terrible"], "n_instances": 2}'

[2021-05-12 12:22:34,146] INFO - Getting latest version SklearnModel:20210512122144_1D24FF
[2021-05-12 12:22:37,368] INFO - {'service_name': 'SklearnModel', 'service_version': '20210512122144_1D24FF', 'api': 'predictactive', 'task': {'data': '{"batch": ["great", "terrible"], "n_instances": 2}', 'task_id': '3d3b1920-d906-4e6a-99b1-5e194f52a9e9', 'cli_args': ('--input', '{"batch": ["great", "terrible"], "n_instances": 2}'), 'inference_job_args': {}}, 'result': {'data': '[[0, 1], ["great", "terrible"]]', 'http_status': 200, 'http_headers': (('Content-Type', 'application/json'),)}, 'request_id': '3d3b1920-d906-4e6a-99b1-5e194f52a9e9'}
[[0, 1], ["great", "terrible"]]
