# IMPORT

In [1]:
import tempfile
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, roc_curve
from sklearn.pipeline import Pipeline

# LOAD DATA, TRAIN MODEL

In [2]:
%%bash
# Download the training/test archive into ./data (once) and unpack it.
# Abort on the first failing command so a broken download is reported by the
# cell instead of being silently cached.
set -euo pipefail

mkdir -p ./data

if [ ! -f ./data/trainingandtestdata.zip ]; then
    # wget -O creates the output file even when the transfer fails, so write to
    # a temporary name and only move it into place after a successful download.
    # Otherwise a truncated zip would satisfy the existence check above forever.
    wget -q -O ./data/trainingandtestdata.zip.part http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip
    mv ./data/trainingandtestdata.zip.part ./data/trainingandtestdata.zip
fi

# -n: never overwrite files that were already extracted on a previous run.
unzip -n ./data/trainingandtestdata.zip -d ./data

Archive:  ./data/trainingandtestdata.zip


In [3]:
# Neither CSV has a header row, so both are read with the same options and the
# column names are attached afterwards.
columns = ['polarity', 'tweetid', 'date', 'query_name', 'user', 'text']
csv_options = {'header': None, 'encoding': 'ISO-8859-1'}

# Training split.
df_train_name = 'training.1600000.processed.noemoticon'
df_train_file_path = './data/training.1600000.processed.noemoticon.csv'
df_train = pd.read_csv(df_train_file_path, **csv_options)
df_train.columns = columns

# Test split.
df_test_name = 'testdata.manual.2009.06.14'
df_test_file_path = './data/testdata.manual.2009.06.14.csv'
df_test = pd.read_csv(df_test_file_path, **csv_options)
df_test.columns = columns

In [4]:
# Remove 'neutral' (polarity == 2) since it isn't in the training dataset.
# The rows are dropped rather than relabelled: assigning them a random class
# would inject label noise and make the evaluation differ from run to run.
# The index is reset so positional/label lookups such as x_test[0] keep working
# even when the first row was neutral.
df_test = df_test[df_test['polarity'] != 2].reset_index(drop=True)
# Make labels monotonically increasing [0,1]
df_test['polarity'] = df_test['polarity'].replace(4, 1)
df_train['polarity'] = df_train['polarity'].replace(4, 1)

In [None]:
# Unigram + bigram counts (English stop words removed, terms seen in fewer than
# 100 documents discarded) feeding a default logistic regression.
vectorizer = CountVectorizer(min_df=100, ngram_range=(1, 2), stop_words="english")
classifier = LogisticRegression()

sentiment_lr = Pipeline(steps=[("count_vect", vectorizer), ("lr", classifier)])
sentiment_lr.fit(df_train["text"], df_train["polarity"])

In [None]:
# Score the fitted pipeline on the held-out test frame.
x_test = df_test["text"]
y_test = df_test["polarity"]
y_pred = sentiment_lr.predict(x_test)
print(classification_report(y_test, y_pred))

In [None]:
# Predict on the first test tweet; the pipeline expects a list of documents.
first_tweet = x_test.iloc[0]
sentiment_lr.predict([first_tweet])

In [None]:
# Sanity check on two single-word inputs.
sanity_inputs = ['good', 'bad']
sentiment_lr.predict(sanity_inputs)

# UNBOX

In [None]:
import os

import unboxapi

# Read the API key from the UNBOX_API_KEY environment variable so a real key
# never has to be pasted into (and committed with) the notebook. The original
# placeholder is kept as the fallback when the variable is not set.
client = unboxapi.UnboxClient(os.environ.get("UNBOX_API_KEY", "YOUR_API_KEY_HERE"))

## Create function

In [None]:
def predict_function(model, text_list):
    """Return `model.predict_proba(text_list)`.

    Args:
        model: Any object exposing a `predict_proba` method (here, the fitted
            sentiment pipeline).
        text_list: The collection of input texts handed to `predict_proba`.

    Returns:
        Whatever `model.predict_proba` returns for `text_list`, unchanged.
    """
    probabilities = model.predict_proba(text_list)
    return probabilities

In [None]:
# Smoke-test the wrapper on a few hand-written examples. The call now uses
# `texts`; previously the list was defined but a different literal was passed.
texts = ['some new text, sweet noodles', 'happy time', 'sad day']

predict_function(sentiment_lr, texts)

# Package & Upload to Unbox

## Create/Load Project

### Create Project

In [None]:
# First run: create the project.
# On later runs, comment this call out and uncomment the load_project call
# below to load the existing project instead.
project = client.create_project(
    name="Sentiment Analysis Sklearn Model",
    description="Project to Demo Sklearn Sentiment Analysis with Unbox"
)

# Use this for loading the project on subsequent runs.
# (Kept as real comments: a bare triple-quoted string at the end of a cell is
# an expression, so Jupyter would echo it as the cell's output.)
# project = client.load_project(
#     name="Sentiment Analysis Sklearn Model"
# )

### Upload dataset from dataframe

In [None]:
# Upload the test dataframe to the project as a labelled text dataset.
dataset = project.add_dataframe(
    name=df_test_name,
    description='this is my sentiment test dataset',
    df=df_test,
    text_column_name='text',
    label_column_name='polarity',
    # presumably indexed by label value (0 -> negative, 1 -> positive);
    # confirm against the Unbox documentation
    class_names=['negative', 'positive'],
)
dataset.to_dict()

### Upload model

In [None]:
from unboxapi.models import ModelType

# Upload the fitted pipeline together with the wrapper that is passed as
# `function` (it returns predict_proba output for a list of texts).
model = project.add_model(
    name='05.15.2021.sentiment_analyzer',
    description='this is my sklearn sentiment model',
    model=sentiment_lr,
    model_type=ModelType.sklearn,
    function=predict_function,
    class_names=['negative', 'positive'],
)
model.to_dict()