# IMPORT

In [1]:
import tempfile
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, roc_curve
from sklearn.pipeline import Pipeline

# LOAD DATA, TRAIN MODEL

In [2]:
%%bash

if [ ! -d ./data ]; then
    mkdir ./data
fi

if [ ! -f ./data/trainingandtestdata.zip ]; then
    wget -q -O ./data/trainingandtestdata.zip http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip
fi

unzip -n ./data/trainingandtestdata.zip -d ./data

Archive:  ./data/trainingandtestdata.zip


In [3]:
columns = ['polarity', 'tweetid', 'date', 'query_name', 'user', 'text']
df_train_file_path = './data/training.1600000.processed.noemoticon.csv'
df_train_name = 'training.1600000.processed.noemoticon'
df_train = pd.read_csv(df_train_file_path,
                      header=None,
                      encoding='ISO-8859-1')

df_test_file_path = './data/testdata.manual.2009.06.14.csv'
df_test_name = 'testdata.manual.2009.06.14'
df_test = pd.read_csv(df_test_file_path,
                     header=None,
                     encoding='ISO-8859-1')
df_train.columns = columns
df_test.columns = columns

In [4]:
import random
# Remove 'neutral' since it isn't in training dataset
df_test['polarity'] = df_test['polarity'].replace(2, random.choice([0, 4]))
# Make labels monotonically increasing [0,1]
df_test['polarity'] = df_test['polarity'].replace(4, 1)
df_train['polarity'] = df_train['polarity'].replace(4, 1)

In [5]:
sentiment_lr = Pipeline([
                         ('count_vect', CountVectorizer(min_df=100,
                                                        ngram_range=(1,2),
                                                        stop_words='english')), 
                         ('lr', LogisticRegression())])
sentiment_lr.fit(df_train.text, df_train.polarity)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Pipeline(steps=[('count_vect',
                 CountVectorizer(min_df=100, ngram_range=(1, 2),
                                 stop_words='english')),
                ('lr', LogisticRegression())])

In [6]:
x_test, y_test = df_test.text, df_test.polarity
print(classification_report(y_test, sentiment_lr.predict(x_test)))

              precision    recall  f1-score   support

           0       0.89      0.54      0.67       316
           1       0.52      0.88      0.66       182

    accuracy                           0.66       498
   macro avg       0.71      0.71      0.66       498
weighted avg       0.75      0.66      0.67       498



In [7]:
sentiment_lr.predict([x_test[0]])

array([1])

In [8]:
sentiment_lr.predict(['good', 'bad'])

array([1, 0])

# UNBOX

In [9]:
import unboxapi
# client = unboxapi.UnboxClient("YOUR_API_KEY_HERE")
client = unboxapi.UnboxClient("ebcc2698-d638-11eb-938e-1e008a21a07a")
unboxapi.api.UNBOX_ENDPOINT = "http://0.0.0.0:8080/api" # Use this for local testing

## Create function

In [10]:
def predict_function(model, text_list):
    return model.predict_proba(text_list)

In [11]:
texts = ['some new text, sweet noodles', 'happy time', 'sad day']

predict_function(sentiment_lr, texts)

array([[0.15715401, 0.84284599],
       [0.20192416, 0.79807584],
       [0.97165564, 0.02834436]])

# Package & Upload to Unbox

### Upload dataset from dataframe

In [None]:
dataset = client.add_dataframe(
    df=df_test,
    class_names=['negative', 'positive'],
    label_column_name='polarity',
    text_column_name='text',
    name=df_test_name,
    description='this is my sentiment test dataset'
)
dataset.to_dict()

### Upload model

In [12]:
from unboxapi.models import ModelType

model = client.add_model(
    function=predict_function, 
    model=sentiment_lr,
    model_type=ModelType.sklearn,
    class_names=['negative', 'positive'],
    name='05.15.2021.sentiment_analyzer',
    description='this is my sklearn sentiment model'
)
model.to_dict()

Connecting to Unbox server


100%|██████████| 55.4M/55.4M [00:16<00:00, 3.53MB/s]


{'_links': {'datasets': '/api/models/9043be49-a64a-41d1-9fa2-d61c7f32441b/datasets',
  'inferenceRuns': '/api/models/9043be49-a64a-41d1-9fa2-d61c7f32441b/inference-runs',
  'runReports': '/api/models/9043be49-a64a-41d1-9fa2-d61c7f32441b/run-reports',
  'self': '/api/models/9043be49-a64a-41d1-9fa2-d61c7f32441b'},
 'classNames': ['negative', 'positive'],
 'datasetCount': 0,
 'dateCreated': '2021-06-27T01:37:03.332631Z',
 'description': 'this is my sklearn sentiment model',
 'id': '9043be49-a64a-41d1-9fa2-d61c7f32441b',
 'inProgressNotifications': ['model_9043be49-a64a-41d1-9fa2-d61c7f32441b_create_endpoint'],
 'inferenceRunCount': 0,
 'modelApi': None,
 'name': '05.15.2021.sentiment_analyzer',
 'runReportCount': 0}

### Upload dataset from csv

In [None]:
print('\nUploading dataset (from file)...')
with tempfile.TemporaryDirectory() as tmpdir:
    df_train.to_csv(f'{tmpdir}/dataset.csv')
    dataset = client.add_dataset(
        file_path=f'{tmpdir}/dataset.csv', 
        class_names=['negative', 'positive'],
        label_column_name='polarity',
        text_column_name='text',
        name=df_train_name,
        description='this is my sentiment train dataset',
    )
dataset.to_dict()