In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Installing Transformers


In [None]:
pip install transformers

## load the pre-trained BERT model


In [None]:
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures

# Tokenizer and classifier are both loaded from the same pretrained checkpoint.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased")

In [4]:
model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  109482240 
_________________________________________________________________
dropout_37 (Dropout)         multiple                  0         
_________________________________________________________________
classifier (Dense)           multiple                  1538      
Total params: 109,483,778
Trainable params: 109,483,778
Non-trainable params: 0
_________________________________________________________________


We have the main BERT model, a dropout layer to prevent overfitting, and finally a dense layer for the classification task.

### Imports


In [1]:
import tensorflow as tf
import pandas as pd

## Large Movie Review Dataset

This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well. Raw text and already processed bag of words formats are provided. See the README file contained in the release for more details.

### Get the Data from the Stanford Repo


@InProceedings{maas-EtAl:2011:ACL-HLT2011,
  author    = {Maas, Andrew L.  and  Daly, Raymond E.  and  Pham, Peter T.  and  Huang, Dan  and  Ng, Andrew Y.  and  Potts, Christopher},
  title     = {Learning Word Vectors for Sentiment Analysis},
  booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies},
  month     = {June},
  year      = {2011},
  address   = {Portland, Oregon, USA},
  publisher = {Association for Computational Linguistics},
  pages     = {142--150},
  url       = {http://www.aclweb.org/anthology/P11-1015}
}

In [6]:
# Stanford-hosted archive of the Large Movie Review Dataset.
URL = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

# Download into the current directory and unpack the archive there.
dataset = tf.keras.utils.get_file(
    origin=URL,
    fname="aclImdb_v1.tar.gz",
    cache_dir='.',
    cache_subdir='',
    untar=True,
)

Downloading data from https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


### Preprocessing and preparing the data for the BERT model

@Credit:
Orhan G. Yalçın
Nov 28, 2020


In [7]:
# The shutil module offers a number of high-level
# operations on files and collections of files.
import os
import shutil

# Root of the extracted archive ("aclImdb"), located next to the download.
main_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')
# Training split ("aclImdb/train").
train_dir = os.path.join(main_dir, 'train')
# Remove the unlabeled "unsup" folder since this is a supervised learning task.
remove_dir = os.path.join(train_dir, 'unsup')
# Guarded so the cell can be re-run: an unconditional rmtree raises
# FileNotFoundError once the folder has already been deleted.
if os.path.isdir(remove_dir):
    shutil.rmtree(remove_dir)
# View the final train folder
print(os.listdir(train_dir))

['labeledBow.feat', 'pos', 'urls_unsup.txt', 'urls_pos.txt', 'urls_neg.txt', 'unsupBow.feat', 'neg']


## Train and Test Split

In [8]:
# Both subsets are read from "aclImdb/train": 80% for training and 20% held
# out for validation. The two calls share every option except `subset`.
split_options = dict(batch_size=30000, validation_split=0.2, seed=123)

train = tf.keras.preprocessing.text_dataset_from_directory(
    'aclImdb/train', subset='training', **split_options)
test = tf.keras.preprocessing.text_dataset_from_directory(
    'aclImdb/train', subset='validation', **split_options)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


In [9]:
# Pull the first batch of the training dataset out as NumPy arrays.
for review_batch, label_batch in train.take(1):
  train_feat, train_lab = review_batch.numpy(), label_batch.numpy()

# One row per review: raw text in DATA_COLUMN, class id in LABEL_COLUMN.
train = pd.DataFrame([train_feat, train_lab], index=['DATA_COLUMN', 'LABEL_COLUMN']).T
# The texts arrive as bytes; decode them to str.
train['DATA_COLUMN'] = train['DATA_COLUMN'].str.decode("utf-8")
train.head()

Unnamed: 0,DATA_COLUMN,LABEL_COLUMN
0,Canadian director Vincenzo Natali took the art...,1
1,I gave this film 10 not because it is a superb...,1
2,I admit to being somewhat jaded about the movi...,1
3,"For a long time, 'The Menagerie' was my favori...",1
4,A truly frightening film. Feels as if it were ...,0


In [10]:
# Pull the first batch of the validation dataset out as NumPy arrays.
for review_batch, label_batch in test.take(1):
  test_feat, test_lab = review_batch.numpy(), label_batch.numpy()

# One row per review: raw text in DATA_COLUMN, class id in LABEL_COLUMN.
test = pd.DataFrame([test_feat, test_lab], index=['DATA_COLUMN', 'LABEL_COLUMN']).T
# The texts arrive as bytes; decode them to str.
test['DATA_COLUMN'] = test['DATA_COLUMN'].str.decode("utf-8")
test.head()

Unnamed: 0,DATA_COLUMN,LABEL_COLUMN
0,I can't believe that so much talent can be was...,0
1,This movie blows - let's get that straight rig...,0
2,"The saddest thing about this ""tribute"" is that...",0
3,I'm only rating this film as a 3 out of pity b...,0
4,Something surprised me about this movie - it w...,1


In [11]:
# Minimal single-sentence example: only text_a and label carry data here.
InputExample(guid=None, text_a="Hello, world", text_b=None, label=1)

InputExample(guid=None, text_a='Hello, world', text_b=None, label=1)

In [12]:
def convert_data_to_examples(train, test, DATA_COLUMN, LABEL_COLUMN):
  """Wrap every row of two DataFrames in an ``InputExample``.

  Args:
    train: DataFrame holding the training rows.
    test: DataFrame holding the held-out rows.
    DATA_COLUMN: name of the column that contains the text.
    LABEL_COLUMN: name of the column that contains the label.

  Returns:
    A ``(train_InputExamples, validation_InputExamples)`` tuple. Each item is
    the result of ``DataFrame.apply(..., axis=1)``, i.e. one ``InputExample``
    per input row.
  """
  def to_example(row):
    # guid is a globally unique ID for bookkeeping, unused in this case;
    # text_b stays None because each row carries a single text.
    return InputExample(guid=None,
                        text_a=row[DATA_COLUMN],
                        text_b=None,
                        label=row[LABEL_COLUMN])

  train_InputExamples = train.apply(to_example, axis=1)
  validation_InputExamples = test.apply(to_example, axis=1)

  return train_InputExamples, validation_InputExamples
  
def convert_examples_to_tf_dataset(examples, tokenizer, max_length=128):
    """Tokenize examples and expose them as a ``tf.data.Dataset``.

    Args:
        examples: iterable of objects with ``text_a`` and ``label`` attributes.
        tokenizer: tokenizer object providing ``encode_plus``.
        max_length: length every encoded sequence is truncated or padded to.

    Returns:
        An unbatched dataset yielding ``(inputs, label)`` pairs, where
        ``inputs`` is a dict with int32 ``input_ids``, ``attention_mask`` and
        ``token_type_ids`` entries and ``label`` is an int64 scalar.
    """
    features = []  # InputFeatures, built up front and replayed by gen()

    for e in examples:
        input_dict = tokenizer.encode_plus(
            e.text_a,
            add_special_tokens=True,
            max_length=max_length,  # truncates if len(s) > max_length
            return_token_type_ids=True,
            return_attention_mask=True,
            # padding="max_length" replaces the deprecated
            # pad_to_max_length=True; pads to the right by default.
            padding="max_length",
            truncation=True,
        )

        features.append(
            InputFeatures(
                input_ids=input_dict["input_ids"],
                attention_mask=input_dict["attention_mask"],
                token_type_ids=input_dict["token_type_ids"],
                label=e.label,
            )
        )

    def gen():
        # Replays the pre-tokenized features in their original order.
        for f in features:
            yield (
                {
                    "input_ids": f.input_ids,
                    "attention_mask": f.attention_mask,
                    "token_type_ids": f.token_type_ids,
                },
                f.label,
            )

    return tf.data.Dataset.from_generator(
        gen,
        ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
        (
            {
                "input_ids": tf.TensorShape([None]),
                "attention_mask": tf.TensorShape([None]),
                "token_type_ids": tf.TensorShape([None]),
            },
            tf.TensorShape([]),
        ),
    )


# Column names shared by the train and test DataFrames.
DATA_COLUMN, LABEL_COLUMN = 'DATA_COLUMN', 'LABEL_COLUMN'

In [13]:
train_InputExamples, validation_InputExamples = convert_data_to_examples(
    train, test, DATA_COLUMN, LABEL_COLUMN)

# Training pipeline: tokenize, shuffle with a 100-element buffer, batch by 32,
# then repeat so that one pass over train_data covers the examples twice.
train_data = convert_examples_to_tf_dataset(list(train_InputExamples), tokenizer)
train_data = train_data.shuffle(100)
train_data = train_data.batch(32)
train_data = train_data.repeat(2)

# Validation pipeline: tokenized and batched only.
validation_data = convert_examples_to_tf_dataset(
    list(validation_InputExamples), tokenizer).batch(32)



* Adam as our optimizer:
Adam is an optimization algorithm that can be used instead of the classical stochastic gradient descent procedure to iteratively update network weights based on training data.

* SparseCategoricalCrossentropy as our loss function:
Sparse categorical crossentropy is the multi-class classification loss for labels given as integer class ids (here 0 = negative, 1 = positive) rather than one-hot vectors. It is used for tasks where an example can only belong to one out of many possible categories, and the model must decide which one.

* SparseCategoricalAccuracy as our accuracy metric :
This metric creates two local variables, `total` and `count` that are used to
compute the frequency with which `y_pred` matches `y_true`. This frequency is
ultimately returned as `sparse categorical accuracy`: an idempotent operation
that simply divides `total` by `count`.

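* epochs (here only 2):
An epoch is one full pass of `model.fit` over the training dataset it is given; the weights are updated once per batch within each pass. Note that `train_data` is built with `.repeat(2)`, so each of these two epochs iterates over the training examples twice.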


In [14]:

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0), 
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), 
              metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')])

%time model.fit(train_data, epochs=2, validation_data=validation_data)

Epoch 1/2
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Cause: while/else statement not yet supported
Cause: while/else statement not yet supported
Epoch 2/2
CPU times: user 13min 58s, sys: 12min 28s, total: 26min 26s
Wall time: 36min 31s


<tensorflow.python.keras.callbacks.History at 0x7f496a86ba50>

## Testing the model to predict one line

In [31]:
pred_sentences = ['This was an awesome movie. I watch it twice my time watching this beautiful movie if I have known it was this good']

In [32]:

tf_batch = tokenizer(pred_sentences, max_length=128, padding=True, truncation=True, return_tensors='tf')
tf_outputs = model(tf_batch)
# Element 0 of the model output holds the logits; softmax turns them into
# per-class probabilities.
tf_predictions = tf.nn.softmax(tf_outputs[0], axis=-1)
labels = ['Negative','Positive']
label = tf.argmax(tf_predictions, axis=1).numpy()
# Print each sentence next to the name of its predicted class.
for sentence, class_id in zip(pred_sentences, label):
  print(sentence, ": \n", labels[class_id])

This was an awesome movie. I watch it twice my time watching this beautiful movie if I have known it was this good : 
 Positive


In [37]:
tf_predictions


<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[4.9489609e-04, 9.9950504e-01]], dtype=float32)>

In [40]:
tf_outputs

TFSequenceClassifierOutput([('logits',
                             <tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[-3.8668618,  3.743806 ]], dtype=float32)>)])

In [65]:
tf.argmax(tf_predictions, axis=1)

<tf.Tensor: shape=(1,), dtype=int64, numpy=array([1])>

In [66]:
labels[label[0]]

'Positive'

## Reading the data we saved earlier from Jupyter

In [20]:
data = pd.read_csv('/content/drive/MyDrive/reviews_df.csv')

In [21]:
data.title

0     ‘Jakob’s Wife’ Review: A Cheap and Cheerless V...
1     ‘City of Lies’ Review: Johnny Depp Solves the ...
2     ‘Deadly Illusions’ Review: Kristin Davis Hires...
3     Snyder Cut Justice League review: Still a mess...
4     The Courier movie review: An intimate portraya...
5     ‘Falcon And The Winter Soldier’ Review: Thanks...
6     ‘Wojnarowicz’ Review: A Queer Biography as Bra...
7     Mumbai Saga Public Review: Interesting After T...
8     Movie Review | 'The Courier' is deliberate, in...
9     FilmWeek: ‘The Courier,’ ‘Zack Snyder’s Justic...
10                   The Feast (SXSW 2021 Movie Review)
11    Review: 'Zack Snyder's Justice League' has arr...
12    ‘Deadly Illusions’ Review: Kristin Davis Hires...
13    Mosagallu movie review: This Vishnu Manchu, Ka...
14    Sandeep Aur Pinky Faraar movie review: Two wor...
15    Gaia starring Monique Rockman - (SXSW Horror M...
16    Justice League: Why The Industry Can't Pretend...
17    SXSW Review: See You Then Is a Slow Burn L

In [41]:
for text in data.title:
  # Each title is tokenized and scored on its own.
  tf_batch = tokenizer(text, max_length=128, padding=True, truncation=True, return_tensors='tf')
  tf_outputs = model(tf_batch)
  rate = tf_predictions = tf.nn.softmax(tf_outputs[0], axis=-1)
  labels = ['Negative','Positive']
  label = tf.argmax(tf_predictions, axis=1)
  # score is the raw logit at class index 1; rate is the softmax over both classes.
  score = tf_outputs.logits[0][1]
  print(rate, score, sep="\n")

tf.Tensor([[0.9988527  0.00114728]], shape=(1, 2), dtype=float32)
tf.Tensor(-3.397287, shape=(), dtype=float32)
tf.Tensor([[9.992549e-01 7.451388e-04]], shape=(1, 2), dtype=float32)
tf.Tensor(-3.5379896, shape=(), dtype=float32)
tf.Tensor([[0.8820931  0.11790695]], shape=(1, 2), dtype=float32)
tf.Tensor(-1.1021264, shape=(), dtype=float32)
tf.Tensor([[0.991621   0.00837898]], shape=(1, 2), dtype=float32)
tf.Tensor(-2.4595747, shape=(), dtype=float32)
tf.Tensor([[6.2001473e-04 9.9937999e-01]], shape=(1, 2), dtype=float32)
tf.Tensor(3.5912619, shape=(), dtype=float32)
tf.Tensor([[0.9815859  0.01841402]], shape=(1, 2), dtype=float32)
tf.Tensor(-2.0974076, shape=(), dtype=float32)
tf.Tensor([[0.0020442 0.9979558]], shape=(1, 2), dtype=float32)
tf.Tensor(2.9894981, shape=(), dtype=float32)
tf.Tensor([[0.04809644 0.9519035 ]], shape=(1, 2), dtype=float32)
tf.Tensor(1.3163056, shape=(), dtype=float32)
tf.Tensor([[0.00278613 0.9972139 ]], shape=(1, 2), dtype=float32)
tf.Tensor(2.770474, shape=

### Create functions to get the score and rate from the model's output

In [42]:

def bert_Sent(text):
  """Return the model's raw logit at class index 1 for ``text``.

  Args:
    text: the text to score; it is truncated to 128 tokens by the tokenizer.

  Returns:
    ``tf_outputs.logits[0][1]`` as produced by the model, i.e. the
    un-normalized score of class index 1 (the 'Positive' entry of the label
    lists used in this notebook).
  """
  tf_batch = tokenizer(text, max_length=128, padding=True, truncation=True, return_tensors='tf')
  tf_outputs = model(tf_batch)
  # Only the logit is returned, so no softmax is computed here.
  return tf_outputs.logits[0][1]

import re

def get_score(text):
  """Extract the numeric payload from the string form of a scalar tensor.

  Args:
    text: a string such as ``"tf.Tensor(-3.397287, shape=(), dtype=float32)"``.

  Returns:
    The text between ``tf.Tensor(`` and ``, shape`` as a string,
    e.g. ``"-3.397287"``.

  Raises:
    ValueError: if ``text`` does not contain that pattern.
  """
  # Raw string: '\(' in a normal string literal is an invalid escape sequence.
  match = re.search(r'tf\.Tensor\((.+?), shape', text)
  if match is None:
    raise ValueError(f"not a scalar tf.Tensor string: {text!r}")
  return match.group(1)

def get_rate(text):
  """Return 'Negative' or 'Positive' for ``text``, whichever class scores higher.

  Args:
    text: the text to classify; it is truncated to 128 tokens by the tokenizer.

  Returns:
    ``'Negative'`` when class index 0 has the larger logit, ``'Positive'``
    when class index 1 does.
  """
  # max_length=128 matches bert_Sent and the default used when building the
  # training data, so score and label are derived from the same tokens.
  tf_batch = tokenizer(text, max_length=128, padding=True, truncation=True, return_tensors='tf')
  tf_outputs = model(tf_batch)
  labels = ['Negative','Positive']
  # softmax is monotonic, so argmax over the raw logits picks the same class.
  label = tf.argmax(tf_outputs[0], axis=1).numpy()
  return labels[label[0]]

## Get the score and rate for all 50 rows and add them to the table

In [None]:
# bert_Sent returns a scalar tensor; float() converts it directly, which
# avoids formatting the tensor as text and parsing the number back out.
transf_sc = [bert_Sent(text) for text in data.title]
data['Bert_sentiment'] = [float(score) for score in transf_sc]

tf_rate = [get_rate(text) for text in data.title]
data['Bert_rate'] = tf_rate

# Round the numeric sentiment scores to two decimals for display.
score_columns = ['nltk_sentiment', 'TextBlob_sentiment', 'Bert_sentiment']
data[score_columns] = data[score_columns].round(2)
data.head()

In [5]:
# Side-by-side view of every sentiment score/label column. (The bare
# `data.columns` expression that preceded this was never displayed, because
# only a cell's last expression is shown.)
data[[ 'afinn_sentimen','nltk_sentiment','TextBlob_sentiment','transformers_score','transformers_label','Bert_sentiment','Bert_rate']]

Unnamed: 0,afinn_sentimen,nltk_sentiment,TextBlob_sentiment,transformers_score,transformers_label,Bert_sentiment,Bert_rate
0,-2,-0.4,0.4,1.0,NEGATIVE,-3.4,Negative
1,-3,-0.84,0.0,0.96,NEGATIVE,-3.54,Negative
2,-3,0.2,-0.18,0.99,POSITIVE,-1.1,Negative
3,-1,-0.05,-0.11,1.0,NEGATIVE,-2.46,Negative
4,-2,-0.6,-0.2,1.0,POSITIVE,3.59,Positive
5,-1,-0.2,-0.3,0.92,NEGATIVE,-2.1,Negative
6,0,-0.3,0.0,1.0,POSITIVE,2.99,Positive
7,4,0.68,0.28,1.0,POSITIVE,1.32,Positive
8,2,0.46,0.57,1.0,POSITIVE,2.77,Positive
9,2,0.15,0.5,0.77,POSITIVE,1.3,Positive


In [64]:
data.to_excel('/content/drive/MyDrive/Movie_Reviews_sentiments.xlsx', index = False)
