# IMPORT

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, roc_curve
from sklearn.pipeline import Pipeline

# LOAD DATA, TRAIN MODEL

In [5]:
%%bash

if [ ! -f ./data/trainingandtestdata.zip ]; then
    wget -q http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip
    unzip -n ./data/trainingandtestdata.zip
fi

In [6]:
columns = ['polarity', 'tweetid', 'date', 'query_name', 'user', 'text']
dftrain = pd.read_csv('./data/training.1600000.processed.noemoticon.csv',
                      header = None,
                      encoding ='ISO-8859-1')
dftest = pd.read_csv('./data/testdata.manual.2009.06.14.csv',
                     header = None,
                     encoding ='ISO-8859-1')
dftrain.columns = columns
dftest.columns = columns

In [7]:
sentiment_lr = Pipeline([
                         ('count_vect', CountVectorizer(min_df = 100,
                                                        ngram_range = (1,2),
                                                        stop_words = 'english')), 
                         ('lr', LogisticRegression())])
sentiment_lr.fit(dftrain.text, dftrain.polarity)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Pipeline(steps=[('count_vect',
                 CountVectorizer(min_df=100, ngram_range=(1, 2),
                                 stop_words='english')),
                ('lr', LogisticRegression())])

In [8]:
Xtest, ytest = dftest.text[dftest.polarity!=2], dftest.polarity[dftest.polarity!=2]
print(classification_report(ytest,sentiment_lr.predict(Xtest)))

              precision    recall  f1-score   support

           0       0.87      0.82      0.84       177
           4       0.83      0.88      0.86       182

    accuracy                           0.85       359
   macro avg       0.85      0.85      0.85       359
weighted avg       0.85      0.85      0.85       359



In [9]:
sentiment_lr.predict([Xtest[0]])

array([4])

In [10]:
sentiment_lr.predict(["good", "bad"])

array([4, 0])

# UNBOX

In [11]:
import unboxapi
client = unboxapi.UnboxClient('YOUR_API_KEY_HERE')

## Create function

In [12]:
class_dict = {4: "positive", 0: "negative", 2: "neutral"}
def predict_function(model, text_list):
    return [class_dict[d] for d in model.predict(text_list)]

In [13]:
texts = ["some new text, sweet noodles", "happy time", "sad day"]

predict_function(sentiment_lr, texts)

['positive', 'positive', 'negative']

# Package (function, model) & Upload to Firebase

In [14]:
client.add(
    function=predict_function,
    model=sentiment_lr
)

[2021-02-22 21:43:02,885] INFO - Saving bento to an remote path. BentoML will first save the bento to a local temporary directory and then upload to the remote path.
[2021-02-22 21:44:22,618] INFO - BentoService bundle 'TemplateModel:20210222214302_219D66' created at: gs://unbox-ai.appspot.com/Tg0T6QuoZ5baPz5Hbw3fvz8tvGw1/models
