# IMPORT

In [1]:
import tempfile
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, roc_curve
from sklearn.pipeline import Pipeline

# LOAD DATA, TRAIN MODEL

In [2]:
%%bash

if [ ! -d ./data ]; then
    mkdir ./data
fi

if [ ! -f ./data/trainingandtestdata.zip ]; then
    wget -q -O ./data/trainingandtestdata.zip http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip
fi

unzip -n ./data/trainingandtestdata.zip -d ./data

Archive:  ./data/trainingandtestdata.zip


In [3]:
columns = ['polarity', 'tweetid', 'date', 'query_name', 'user', 'text']
df_train_file_path = './data/training.1600000.processed.noemoticon.csv'
df_train_name = 'training.1600000.processed.noemoticon'
df_train = pd.read_csv(df_train_file_path,
                      header=None,
                      encoding='ISO-8859-1')

df_test_file_path = './data/testdata.manual.2009.06.14.csv'
df_test_name = 'testdata.manual.2009.06.14'
df_test = pd.read_csv(df_test_file_path,
                     header=None,
                     encoding='ISO-8859-1')
df_train.columns = columns
df_test.columns = columns

In [4]:
sentiment_lr = Pipeline([
                         ('count_vect', CountVectorizer(min_df=100,
                                                        ngram_range=(1,2),
                                                        stop_words='english')), 
                         ('lr', LogisticRegression())])
sentiment_lr.fit(df_train.text, df_train.polarity)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Pipeline(steps=[('count_vect',
                 CountVectorizer(min_df=100, ngram_range=(1, 2),
                                 stop_words='english')),
                ('lr', LogisticRegression())])

In [5]:
x_test, y_test = df_test.text[df_test.polarity != 2], df_test.polarity[df_test.polarity != 2]
print(classification_report(y_test, sentiment_lr.predict(x_test)))

              precision    recall  f1-score   support

           0       0.87      0.82      0.84       177
           4       0.83      0.88      0.86       182

    accuracy                           0.85       359
   macro avg       0.85      0.85      0.85       359
weighted avg       0.85      0.85      0.85       359



In [6]:
sentiment_lr.predict([x_test[0]])

array([4])

In [7]:
sentiment_lr.predict(['good', 'bad'])

array([4, 0])

# UNBOX

In [8]:
import unboxapi
client = unboxapi.UnboxClient(email='me@vikasnair.com', password='00000000')

## Create function

In [9]:
# class_dict = { 4: 'positive', 2: 'neutral', 0: 'negative'}
class_names = ['negative', 'positive']
dataset_labels = [0, 4]
def predict_function(model, text_list):
    return model.predict_proba(text_list), class_names, dataset_labels

In [10]:
texts = ['some new text, sweet noodles', 'happy time', 'sad day']

predict_function(sentiment_lr, texts)

(array([[0.15715401, 0.84284599],
        [0.20192416, 0.79807584],
        [0.97165564, 0.02834436]]),
 ['negative', 'positive'],
 [0, 4])

# Package (function, model) & Upload to Firebase

In [11]:
print('Uploading model...')
client.add_model(function=predict_function, 
                 model=sentiment_lr, 
                 name='03.11.2021.sentiment_analyzer',
                 description='this is my sklearn sentiment model')
print('Complete.')

Uploading model...
[2021-03-11 18:34:13,920] INFO - Detected non-PyPI-released BentoML installed, copying local BentoML modulefiles to target saved bundle path..


no previously-included directories found matching 'e2e_tests'
no previously-included directories found matching 'tests'
no previously-included directories found matching 'benchmark'


UPDATING BentoML-0.11.0+33.g7e83376/bentoml/_version.py
set BentoML-0.11.0+33.g7e83376/bentoml/_version.py to '0.11.0+33.g7e83376'
Complete.


In [12]:
print('\nUploading dataset (from file)...')
with tempfile.TemporaryDirectory() as tmpdir:
    df_train.to_csv(f'{tmpdir}/dataset.csv')
    response_i = client.add_dataset(file_path=f'{tmpdir}/dataset.csv', 
                                    name=df_train_name, 
                                    description='this is my sentiment train dataset',
                                    label_column_name='polarity',
                                    text_column_name='text')
print(f'Complete. Response: {response_i}')


Uploading dataset (from file)...
Complete. Response: {'columns': ['Unnamed: 0', 'polarity', 'tweetid', 'date', 'query_name', 'user', 'text'], 'datasetID': '89b0098a-82db-11eb-9127-1e008a21a079', 'description': 'this is my sentiment train dataset', 'labelColumnName': 'polarity', 'name': 'training.1600000.processed.noemoticon', 'otherColumns': ['Unnamed: 0', 'polarity', 'tweetid'], 'textColumnName': 'text', 'textColumns': ['date', 'query_name', 'user', 'text']}


In [6]:
print('\nUploading dataset (from data frame)...')
response_j = client.add_dataframe(df=df_test, 
                                  name=df_test_name, 
                                  description='this is my sentiment test dataset',
                                  label_column_name='polarity',
                                  text_column_name='text')
print(f'Complete. Response: {response_j}')


Uploading dataset (from data frame)...
Complete. Response: {'datasetID': '68ac3c44-811a-11eb-9fe3-1e008a21a079', 'metadata': {'columns': ['polarity', 'tweetid', 'date', 'query_name', 'user', 'text'], 'description': 'this is my sentiment test dataset', 'label_column_name': 'polarity', 'name': 'testdata.manual.2009.06.14', 'other_columns': ['polarity', 'tweetid'], 'text_column_name': 'text', 'text_columns': ['date', 'query_name', 'user', 'text']}}


In [15]:
# Test error analysis / dataset-model association
client._test_associate(model_id='6fd90796-82db-11eb-9127-1e008a21a079', 
                       dataset_id='89b0098a-82db-11eb-9127-1e008a21a079')

<Response [201]>
