<a href="https://colab.research.google.com/github/oonid/growth-hacking-with-nlp-sentiment-analysis/blob/master/create_neural_network_based_sentiment_analyzers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Neural Network Based Sentiment Analyzers

### requirements setup

includes: 

*   transformers
*   simpletransformers
*   tokenizers



In [1]:
!pip install --upgrade transformers  # make sure compatible with tokenizers
!wget https://raw.githubusercontent.com/crow-intelligence/growth-hacking-sentiment/master/requirements.txt
!pip install -r requirements.txt

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/48/35/ad2c5b1b8f99feaaf9d7cdadaeef261f098c6e1a6a2935d4d07662a6b780/transformers-2.11.0-py3-none-any.whl (674kB)
[K     |████████████████████████████████| 675kB 2.7MB/s 
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 10.0MB/s 
Collecting tokenizers==0.7.0
[?25l  Downloading https://files.pythonhosted.org/packages/14/e5/a26eb4716523808bb0a799fcfdceb6ebf77a18169d9591b2f46a9adb87d9/tokenizers-0.7.0-cp36-cp36m-manylinux1_x86_64.whl (3.8MB)
[K     |████████████████████████████████| 3.8MB 19.4MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |███

### Install apex

make sure to switch Google Colab runtime type to GPU hardware accelerator


In [2]:
%%writefile setup.sh

export CUDA_HOME=/usr/local/cuda-10.1
git clone https://github.com/NVIDIA/apex
pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./apex

Writing setup.sh


In [3]:
!sh setup.sh

Cloning into 'apex'...
remote: Enumerating objects: 38, done.[K
remote: Counting objects:   2% (1/38)[Kremote: Counting objects:   5% (2/38)[Kremote: Counting objects:   7% (3/38)[Kremote: Counting objects:  10% (4/38)[Kremote: Counting objects:  13% (5/38)[Kremote: Counting objects:  15% (6/38)[Kremote: Counting objects:  18% (7/38)[Kremote: Counting objects:  21% (8/38)[Kremote: Counting objects:  23% (9/38)[Kremote: Counting objects:  26% (10/38)[Kremote: Counting objects:  28% (11/38)[Kremote: Counting objects:  31% (12/38)[Kremote: Counting objects:  34% (13/38)[Kremote: Counting objects:  36% (14/38)[Kremote: Counting objects:  39% (15/38)[Kremote: Counting objects:  42% (16/38)[Kremote: Counting objects:  44% (17/38)[Kremote: Counting objects:  47% (18/38)[Kremote: Counting objects:  50% (19/38)[Kremote: Counting objects:  52% (20/38)[Kremote: Counting objects:  55% (21/38)[Kremote: Counting objects:  57% (22/38)[Kremote: Counting obj

In [4]:
# connect google colab to google driver if needed (uncomment 2 lines below)
# from google.colab import drive
# drive.mount('/content/drive', force_remount=True)

In [5]:
# all imports and its related

%matplotlib inline

import pandas as pd
import numpy as np

from transformers import pipeline, BertModel, BertTokenizer
from simpletransformers.classification import ClassificationModel

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report


### load the small_corpus CSV

run process from [create_dataset.ipynb](https://github.com/oonid/growth-hacking-with-nlp-sentiment-analysis/blob/master/create_dataset.ipynb)

copy file **small_corpus.csv** to this Google Colab Files (via file upload or mount drive)


In [6]:
df = pd.read_csv('small_corpus.csv')
df

Unnamed: 0,ratings,reviews
0,1,Recently UBISOFT had to settle a huge class-ac...
1,1,"code didn't work, got me a refund."
2,1,"these do not work at all, all i get is static ..."
3,1,well let me start by saying that when i first ...
4,1,"Dont waste your money, you will just end up us..."
...,...,...
4495,5,"Nice long micro USB cable, battery lasts a lon..."
4496,5,I've been having a great time with this game. ...
4497,5,d
4498,5,"Really pretty, funny, interesting game. Works ..."


In [7]:
# check if any columns has null, and yes the reviews column has
df.isnull().any()

ratings    False
reviews     True
dtype: bool

In [8]:
# repair null in column reviews with empty string ''
df.reviews = df.reviews.fillna('')

# test again
df.isnull().any()

ratings    False
reviews    False
dtype: bool

In [9]:
# Setting up transformers pipeline and tokenizer
sentiment_analysis_pipeline = pipeline('sentiment-analysis')

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=629.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=230.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=267844284.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




In [10]:
for row in pd.concat([df.head(2), df.tail(2)]).itertuples(index=False):
    # print(row.reviews)
    review = row.reviews
    if len(review) > 512: # chop to 512, maximum sequence length for this model
        review = review[:512]
        print('[warning] review get truncated.')
        # TODO find alternative to handle long sentences,
        # TODO shall we tokenize to multiple sentences and process per sentence?
    e = tokenizer.encode(review)
    d = tokenizer.decode(e, skip_special_tokens=True)
    print(d)
    s = sentiment_analysis_pipeline(d)
    print(s)
    print('\n--\n')


recently ubisoft had to settle a huge class - action suit brought against the company for bundling ( the notoriously harmful ) starforce drm with its released games. so what the geniuses at the helm do next? they decide to make the same mistake yet again - by choosing the same drm scheme that made bioshock, mass effect and spore infamous : securom 7. xx with limited activations! mass effect can be found in clearance bins only months after its release ; spore not only undersold miserably but also made history as t
[{'label': 'NEGATIVE', 'score': 0.9991271495819092}]

--

code didn't work, got me a refund.
[{'label': 'NEGATIVE', 'score': 0.9996480345726013}]

--

really pretty, funny, interesting game. works well. i recommend it for all ages.
[{'label': 'POSITIVE', 'score': 0.9998774528503418}]

--

i had a lot of fun playing this game, if your looking for a game to jump into with not much overthinking this is the game
[{'label': 'POSITIVE', 'score': 0.9878434538841248}]

--



In [11]:
def score_review(review):
    if len(review) > 512: # chop to 512, maximum sequence length for this model
        # print('\n[warning] review get chopped.\n{}'.format(review))
        review = review[:512]
        # TODO find alternative to handle long sentences,
        # TODO shall we tokenize to multiple sentences and process per sentence?
    e = tokenizer.encode(review)
    d = tokenizer.decode(e, skip_special_tokens=True)
    s = sentiment_analysis_pipeline(d)
    if len(s) != 1:
        raise ValueError('got multiple sentiment analysis results.')

    # s[0] the first and only sentiment analysis result
    if s[0]['label'] == 'POSITIVE' and s[0]['score'] > 0.85:
        return 1
    elif s[0]['label'] == 'NEGATIVE' and s[0]['score'] > 0.95:
        return -1
    else:  # NEUTRAL
        return 0

sentiment_classes = df['reviews'].apply(score_review)
sentiment_classes

0      -1
1      -1
2      -1
3      -1
4      -1
       ..
4495   -1
4496    1
4497    1
4498    1
4499    1
Name: reviews, Length: 4500, dtype: int64

In [12]:
def categorize_rating(rating):
    if rating == 5:
        return 1  # 'positive'
    elif 2 <= rating <= 4:
        return 0  # 'neutral'
    else:  # rating == 1
        return -1  # 'negative'

rating_classes = df['ratings'].apply(categorize_rating)
rating_classes

0      -1
1      -1
2      -1
3      -1
4      -1
       ..
4495    1
4496    1
4497    1
4498    1
4499    1
Name: ratings, Length: 4500, dtype: int64

In [13]:
df['sentiment classes'] = sentiment_classes
df['rating classes'] = rating_classes
df

Unnamed: 0,ratings,reviews,sentiment classes,rating classes
0,1,Recently UBISOFT had to settle a huge class-ac...,-1,-1
1,1,"code didn't work, got me a refund.",-1,-1
2,1,"these do not work at all, all i get is static ...",-1,-1
3,1,well let me start by saying that when i first ...,-1,-1
4,1,"Dont waste your money, you will just end up us...",-1,-1
...,...,...,...,...
4495,5,"Nice long micro USB cable, battery lasts a lon...",-1,1
4496,5,I've been having a great time with this game. ...,1,1
4497,5,d,1,1
4498,5,"Really pretty, funny, interesting game. Works ...",1,1


In [14]:
y_pred = list(df['sentiment classes'])
y_true = list(df['rating classes'])

print('accuracy score: {}'.format(accuracy_score(y_true=y_true, y_pred=y_pred)))
print('accuracy score (normalized): {}'\
      .format(accuracy_score(y_true=y_true, y_pred=y_pred, normalize=True)))

accuracy score: 0.6106666666666667
accuracy score (normalized): 0.6106666666666667


In [15]:

target_names = ['class Negative', 'class Neutral', 'class Positive']

print('classification report:\n{}'\
      .format(classification_report(y_true=y_true, y_pred=y_pred,
                                    target_names=target_names)))

classification report:
                precision    recall  f1-score   support

class Negative       0.59      0.89      0.71      1500
 class Neutral       0.43      0.08      0.14      1500
class Positive       0.67      0.86      0.75      1500

      accuracy                           0.61      4500
     macro avg       0.56      0.61      0.53      4500
  weighted avg       0.56      0.61      0.53      4500



# Create a training set and a test set

Create a training set and a test set out of the reviews to create a neural network based our on settings.

In [16]:
def encode_labels(label):
    """encode [-1, 0, 1] to [0, 1, 2]"""
    if label == -1:
        return 0  # negative
    elif label == 0:
        return 1  # neutral
    elif label == 1:
        return 2  # positive
    else:
        raise ValueError('unknown label value')
def decode_labels(label):
    """decode [0, 1, 2] to [-1, 0, 1]"""
    if label == 0:  # negative
        return -1
    elif label == 1:  # neutral
        return 0
    elif label == 2:  # positive
        return 1
    else:
        raise ValueError('unknown label value')

encoded_labels = df['rating classes'].apply(encode_labels)
encoded_labels.unique()

array([0, 1, 2])

In [17]:
X = list(df['reviews'])
y = list(encoded_labels)  # positive, neutral, negative

# default test_size = 0.25

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y,
                                                    test_size=0.25)
print(len(X_train))
print(len(y_train))
print(len(X_test))
print(len(y_test))


3375
3375
1125
1125


In [18]:
train_df = pd.DataFrame({
    'text': X_train,
    'labels': y_train,
})
train_df

Unnamed: 0,text,labels
0,The PS3 felt like PS vita coming out the box. ...,2
1,Waste of money. Nothing new.,0
2,When Need for Speed Hot Persuit 2 failed to de...,1
3,Buggy hardly any servers available. Always ask...,0
4,Cool concept but ultimately feels uninspired a...,0
...,...,...
3370,too difficult and not enough hints to even be ...,0
3371,I have had this game since June 5th and have h...,1
3372,"It works fine, the game itself is like a sweat...",1
3373,Great game system. This is perfect for my 7 ye...,2


In [19]:
eval_df = pd.DataFrame({
    'text': X_test,
    'labels': y_test,
})
eval_df

Unnamed: 0,text,labels
0,"Good headset nice mic, picks up a lot of back...",1
1,Fun game. Cool little side story from the main...,2
2,"OK, so I finished Dantes Inferno and it kicked...",1
3,"all excellent , thank you very much",2
4,I have played plenty of rpg and plenty of beth...,0
...,...,...
1120,Best PC controller currently available. So muc...,2
1121,To start: Left 4 Dead 2 is not a sequel but th...,1
1122,If you like games with Amazing graphics and pr...,0
1123,Basically... This game sucks. The graphics suc...,0


# Create our own neural network-based sentiment classifier

### Create model

In [20]:
# Create a ClassificationModel
model = ClassificationModel(model_type='distilbert',
                            model_name='distilbert-base-uncased',
                            num_labels=3,
                            args={
                              'use_cuda': True, 
                              'max_seq_length': 128,
                              'num_train_epochs': 10,
                              'output_dir': 'model_dir/',
                              'best_model_dir': 'model_dir/',
                              'evaluate_during_training': True,
                              'num_training_epochs': 20,
                              'train_batch_size': 20,
                              'eval_batch_size': 20
                            })

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=267967963.0, style=ProgressStyle(descri…




### Training

also evaluate during training (see argument `evaluate_during_training` set as True).

In [21]:
# Train the model
model.train_model(train_df=train_df, eval_df=eval_df)

HBox(children=(FloatProgress(value=0.0, max=3375.0), HTML(value='')))


Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic


HBox(children=(FloatProgress(value=0.0, description='Epoch', max=10.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0', max=169.0, style=ProgressStyle(descript…

Running loss: 1.137975



Running loss: 1.074314



Running loss: 0.916869Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 32768.0
Running loss: 0.595267




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1', max=169.0, style=ProgressStyle(descript…

Running loss: 0.816381Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 16384.0
Running loss: 0.229146


HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2', max=169.0, style=ProgressStyle(descript…

Running loss: 0.365038


HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3', max=169.0, style=ProgressStyle(descript…

Running loss: 0.019992


HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4', max=169.0, style=ProgressStyle(descript…

Running loss: 0.104914


HBox(children=(FloatProgress(value=0.0, description='Running Epoch 5', max=169.0, style=ProgressStyle(descript…

Running loss: 0.002367


HBox(children=(FloatProgress(value=0.0, description='Running Epoch 6', max=169.0, style=ProgressStyle(descript…

Running loss: 0.012032


HBox(children=(FloatProgress(value=0.0, description='Running Epoch 7', max=169.0, style=ProgressStyle(descript…

Running loss: 0.014835


HBox(children=(FloatProgress(value=0.0, description='Running Epoch 8', max=169.0, style=ProgressStyle(descript…

Running loss: 0.001551


HBox(children=(FloatProgress(value=0.0, description='Running Epoch 9', max=169.0, style=ProgressStyle(descript…

Running loss: 0.000768



In [22]:
result, model_outputs, wrong_predictions = \
    model.eval_model(eval_df, acc=accuracy_score)
print(result)


HBox(children=(FloatProgress(value=0.0, max=1125.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=57.0, style=ProgressStyle(descri…


{'mcc': 0.4799381831618073, 'acc': 0.6515555555555556, 'eval_loss': 1.8006001091848143}


In [23]:
y_pred = list(model_outputs.argmax(axis=1))
y_true = y_test

print('accuracy score: {}'.format(accuracy_score(y_true=y_true, y_pred=y_pred)))
print('accuracy score (normalized): {}'\
      .format(accuracy_score(y_true=y_true, y_pred=y_pred, normalize=True)))

target_names = ['class Negative', 'class Neutral', 'class Positive']

print('classification report:\n{}'\
      .format(classification_report(y_true=y_true, y_pred=y_pred,
                                    target_names=target_names)))

accuracy score: 0.6515555555555556
accuracy score (normalized): 0.6515555555555556
classification report:
                precision    recall  f1-score   support

class Negative       0.70      0.63      0.66       375
 class Neutral       0.53      0.63      0.57       375
class Positive       0.77      0.69      0.73       375

      accuracy                           0.65      1125
     macro avg       0.67      0.65      0.66      1125
  weighted avg       0.67      0.65      0.66      1125



# Experiment with different parameter settings

**ATTN for Colab User with GPU**

before continue, please stop few minutes,
(after training above model).
if you continue the training directly, the GPU memory will not enough.

### Create model

In [24]:
# Create a ClassificationModel
model512 = ClassificationModel(model_type='distilbert',
                               model_name='distilbert-base-uncased',
                               num_labels=3,
                               args={
                                   'use_cuda': True, 
                                   'max_seq_length': 512,
                                   'sliding_window': True,
                                   'num_train_epochs': 10,
                                   'output_dir': 'model512_dir/',
                                   'best_model_dir': 'model512_dir/',
                                   'evaluate_during_training': True,
                                   'num_training_epochs': 20,
                                   'train_batch_size': 20,
                                   'eval_batch_size': 20,
                                })

### Training

also evaluate during training (see argument `evaluate_during_training` set as True).

In [25]:
# Train the model
model512.train_model(train_df=train_df, eval_df=eval_df)

HBox(children=(FloatProgress(value=0.0, max=3375.0), HTML(value='')))


Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic


HBox(children=(FloatProgress(value=0.0, description='Epoch', max=10.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0', max=193.0, style=ProgressStyle(descript…

Running loss: 1.112055



Running loss: 1.023448



Running loss: 0.877982Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 32768.0
Running loss: 0.806643




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1', max=193.0, style=ProgressStyle(descript…

Running loss: 0.896986Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 16384.0
Running loss: 0.113438


HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2', max=193.0, style=ProgressStyle(descript…

Running loss: 0.077462


HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3', max=193.0, style=ProgressStyle(descript…

Running loss: 0.033306


HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4', max=193.0, style=ProgressStyle(descript…

Running loss: 2.007909Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8192.0



HBox(children=(FloatProgress(value=0.0, description='Running Epoch 5', max=193.0, style=ProgressStyle(descript…

Running loss: 0.001328


HBox(children=(FloatProgress(value=0.0, description='Running Epoch 6', max=193.0, style=ProgressStyle(descript…

Running loss: 0.000496


HBox(children=(FloatProgress(value=0.0, description='Running Epoch 7', max=193.0, style=ProgressStyle(descript…

Running loss: 0.003133


HBox(children=(FloatProgress(value=0.0, description='Running Epoch 8', max=193.0, style=ProgressStyle(descript…

Running loss: 0.001142


HBox(children=(FloatProgress(value=0.0, description='Running Epoch 9', max=193.0, style=ProgressStyle(descript…

Running loss: 0.000359



In [26]:
result, model_outputs, wrong_predictions = \
    model512.eval_model(eval_df, acc=accuracy_score)
print(result)
if isinstance(model_outputs, list):
    flat_outputs = []
    for o in model_outputs:
        if o.shape != (1, 3):
            # get mean value from multiple results
            oo = np.sum(o, axis=0)
            oo = np.reshape(oo, (-1, 3))
            flat_outputs.append(oo)
        else:
            flat_outputs.append(o)
    # overwrite with flat outputs, get first index because after stack (1, 1125, 3)
    model_outputs = np.stack(flat_outputs, axis=1)[0]
    print(model_outputs.shape)

HBox(children=(FloatProgress(value=0.0, max=1125.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=66.0, style=ProgressStyle(descri…


{'mcc': 0.502159488736401, 'acc': 0.6675555555555556, 'eval_loss': 1.9439644136212089}
(1125, 3)


In [27]:

y_pred = list(model_outputs.argmax(axis=1))
y_true = y_test

print('accuracy score: {}'.format(accuracy_score(y_true=y_true, y_pred=y_pred)))
print('accuracy score (normalized): {}'\
      .format(accuracy_score(y_true=y_true, y_pred=y_pred, normalize=True)))

target_names = ['class Negative', 'class Neutral', 'class Positive']

print('classification report:\n{}'\
      .format(classification_report(y_true=y_true, y_pred=y_pred,
                                    target_names=target_names)))

accuracy score: 0.6737777777777778
accuracy score (normalized): 0.6737777777777778
classification report:
                precision    recall  f1-score   support

class Negative       0.71      0.67      0.69       375
 class Neutral       0.54      0.61      0.58       375
class Positive       0.79      0.73      0.76       375

      accuracy                           0.67      1125
     macro avg       0.68      0.67      0.68      1125
  weighted avg       0.68      0.67      0.68      1125

