# IMPORT

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, roc_curve
from sklearn.pipeline import Pipeline

import bentoml

# LOAD DATA, TRAIN MODEL

In [2]:
%%bash

if [ ! -f ./trainingandtestdata.zip ]; then
    wget -q http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip
    unzip -n trainingandtestdata.zip
fi

Archive:  trainingandtestdata.zip
  inflating: testdata.manual.2009.06.14.csv  
  inflating: training.1600000.processed.noemoticon.csv  


In [3]:
columns = ['polarity', 'tweetid', 'date', 'query_name', 'user', 'text']
dftrain = pd.read_csv('training.1600000.processed.noemoticon.csv',
                      header = None,
                      encoding ='ISO-8859-1')
dftest = pd.read_csv('testdata.manual.2009.06.14.csv',
                     header = None,
                     encoding ='ISO-8859-1')
dftrain.columns = columns
dftest.columns = columns

In [4]:
sentiment_lr = Pipeline([
                         ('count_vect', CountVectorizer(min_df = 100,
                                                        ngram_range = (1,2),
                                                        stop_words = 'english')), 
                         ('lr', LogisticRegression())])
sentiment_lr.fit(dftrain.text, dftrain.polarity)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Pipeline(steps=[('count_vect',
                 CountVectorizer(min_df=100, ngram_range=(1, 2),
                                 stop_words='english')),
                ('lr', LogisticRegression())])

In [5]:
Xtest, ytest = dftest.text[dftest.polarity!=2], dftest.polarity[dftest.polarity!=2]
print(classification_report(ytest,sentiment_lr.predict(Xtest)))

              precision    recall  f1-score   support

           0       0.87      0.82      0.84       177
           4       0.83      0.88      0.86       182

    accuracy                           0.85       359
   macro avg       0.85      0.85      0.85       359
weighted avg       0.85      0.85      0.85       359



In [6]:
sentiment_lr.predict([Xtest[0]])

array([4])

In [7]:
sentiment_lr.predict(["good", "bad"])

array([4, 0])

# UNBOX

In [8]:
import unbox

## Create function

In [9]:
class_dict = {4: "positive", 0: "negative", 2: "neutral"}
def predict_function(model, text_list):
    return [class_dict[d] for d in model.predict(text_list)]

In [10]:
texts = ["some new text, sweet noodles", "happy time", "sad day"]

predict_function(sentiment_lr, texts)

['positive', 'positive', 'negative']

# Package function and model

In [11]:
version, model_name = unbox.sklearn.add(
    name="hello",
    function=predict_function,
    model=sentiment_lr,
    inputs="text",
    location="my_models/sentiment/"
)

[2021-02-19 19:50:15,513] INFO - BentoService bundle 'SklearnTextTemplateModel:20210219194956_9DAFD1' saved to: /Users/gbayomi/bentoml/repository/SklearnTextTemplateModel/20210219194956_9DAFD1


In [12]:
version, model_name

('20210219194956_9DAFD1', 'SklearnTextTemplateModel')

In [None]:
#unbox.commit(model_name)
!bentoml serve SklearnTextTemplateModel:latest

In [None]:
#unbox.push(version)
!bentoml lambda deploy unbox-lambda-deploy SklearnTextTemplateModel:20210219194956_9DAFD1