# IMPORT

In [5]:
import tempfile
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, roc_curve
from sklearn.pipeline import Pipeline

# LOAD DATA, TRAIN MODEL

In [6]:
%%bash

if [ ! -d ./data ]; then
    mkdir ./data
fi

if [ ! -f ./data/trainingandtestdata.zip ]; then
    wget -q -O ./data/trainingandtestdata.zip http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip
fi

unzip -n ./data/trainingandtestdata.zip -d ./data

Archive:  ./data/trainingandtestdata.zip


In [7]:
columns = ['polarity', 'tweetid', 'date', 'query_name', 'user', 'text']
df_train_file_path = './data/training.1600000.processed.noemoticon.csv'
df_train_name = 'training.1600000.processed.noemoticon'
df_train = pd.read_csv(df_train_file_path,
                      header=None,
                      encoding='ISO-8859-1')

df_test_file_path = './data/testdata.manual.2009.06.14.csv'
df_test_name = 'testdata.manual.2009.06.14'
df_test = pd.read_csv(df_test_file_path,
                     header=None,
                     encoding='ISO-8859-1')
df_train.columns = columns
df_test.columns = columns
df_train.shape[0]

1600000

In [8]:
sentiment_lr = Pipeline([
                         ('count_vect', CountVectorizer(min_df=100,
                                                        ngram_range=(1,2),
                                                        stop_words='english')), 
                         ('lr', LogisticRegression())])
sentiment_lr.fit(df_train.text, df_train.polarity)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Pipeline(steps=[('count_vect',
                 CountVectorizer(min_df=100, ngram_range=(1, 2),
                                 stop_words='english')),
                ('lr', LogisticRegression())])

In [9]:
x_test, y_test = df_test.text[df_test.polarity != 2], df_test.polarity[df_test.polarity != 2]
print(classification_report(y_test, sentiment_lr.predict(x_test)))

              precision    recall  f1-score   support

           0       0.87      0.82      0.84       177
           4       0.83      0.88      0.86       182

    accuracy                           0.85       359
   macro avg       0.85      0.85      0.85       359
weighted avg       0.85      0.85      0.85       359



In [10]:
sentiment_lr.predict([x_test[0]])

array([4])

In [11]:
sentiment_lr.predict(['good', 'bad'])

array([4, 0])

# UNBOX

In [65]:
import unboxapi
client = unboxapi.UnboxClient(email="rishramanathan@gmail.com", password="00000000")

## Create function

In [13]:
# class_dict = { 4: 'positive', 2: 'neutral', 0: 'negative'}
class_names = ['negative', 'positive']
dataset_labels = [0, 4]
def predict_function(model, text_list):
    return model.predict_proba(text_list), class_names, dataset_labels

In [14]:
texts = ['some new text, sweet noodles', 'happy time', 'sad day']

predict_function(sentiment_lr, texts)

(array([[0.15715401, 0.84284599],
        [0.20192416, 0.79807584],
        [0.97165564, 0.02834436]]),
 ['negative', 'positive'],
 [0, 4])

# Package (function, model) & Upload to Unbox Server

In [15]:
print('Uploading model...')
response = client.add_model(function=predict_function, 
                 model=sentiment_lr, 
                 name='03.15.2021.sentiment_analyzer',
                 description='this is my sklearn sentiment model')
print(f'Complete. Response:')
response.json()

Uploading model...
Packaged bento content
Connecting to Unbox server
Complete. Response: {'_links': {'datasets': '/api/models/c24012ea-0485-4aa3-be17-a027e9841962/datasets', 'inferenceRuns': '/api/models/c24012ea-0485-4aa3-be17-a027e9841962/inference-runs', 'runReports': '/api/models/c24012ea-0485-4aa3-be17-a027e9841962/run-reports', 'self': '/api/models/c24012ea-0485-4aa3-be17-a027e9841962'}, 'datasetCount': 0, 'dateCreated': '2021-03-27T22:54:37.093992Z', 'description': 'this is my sklearn sentiment model', 'id': 'c24012ea-0485-4aa3-be17-a027e9841962', 'inferenceRunCount': 0, 'modelApi': None, 'name': '03.15.2021.sentiment_analyzer', 'runReportCount': 0}


In [88]:
print('\nUploading dataset (from file)...')
with tempfile.TemporaryDirectory() as tmpdir:
    df_train.to_csv(f'{tmpdir}/dataset.csv')
    response = client.add_dataset(file_path=f'{tmpdir}/dataset.csv', 
                                  name=df_train_name, 
                                  description='this is my sentiment train dataset',
                                  label_column_name='polarity',
                                  text_column_name='text')
print(f'Complete. Response:')
response.json()


Uploading dataset (from file)...
Complete. Response: {'_links': {'models': '/api/datasets/8/models', 'rows': '/api/datasets/8/rows', 'self': '/api/datasets/8', 'tags': '/api/datasets/8/tags'}, 'dateCreated': '2021-03-22T07:32:16.416655Z', 'description': 'this is my sentiment train dataset', 'id': 8, 'labelColumn': 'polarity', 'modelCount': 0, 'name': 'training.1600000.processed.noemoticon', 'tagCount': 0, 'textColumn': 'text'}


In [16]:
print('\nUploading dataset (from data frame)...')
response = client.add_dataframe(df=df_test, 
                                name=df_test_name, 
                                description='this is my sentiment test dataset',
                                label_column_name='polarity',
                                text_column_name='text')
print(f'Complete. Response:')
response.json()


Uploading dataset (from data frame)...
Complete. Response: {'_links': {'models': '/api/datasets/1/models', 'rows': '/api/datasets/1/rows', 'self': '/api/datasets/1', 'tags': '/api/datasets/1/tags'}, 'dateCreated': '2021-03-27T22:54:51.026300Z', 'description': 'this is my sentiment test dataset', 'id': 1, 'labelColumn': 'polarity', 'modelCount': 0, 'name': 'testdata.manual.2009.06.14', 'tagCount': 0, 'textColumn': 'text'}
