# Sentiment Analysis with BERT

In [1]:
# Requires the Hugging Face `datasets` package (a separate library from `transformers`): pip install datasets
from datasets import load_dataset

In [2]:
# Load the IMDB movie-review dataset through Hugging Face `datasets`
# (a locally cached copy is reused when one exists).
raw_datasets = load_dataset("imdb")

Reusing dataset imdb (C:\Users\HP\.cache\huggingface\datasets\imdb\plain_text\1.0.0\2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [4]:
# Index the row first, then the column: this reads a single example instead of
# materialising the whole 'label' column just to take its first element.
raw_datasets['train'][0]['label']

0

In [5]:
# Index the row first, then the column: this reads a single example instead of
# materialising the whole 'text' column just to take its first element.
raw_datasets['train'][0]['text']

'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far between, ev

In [6]:
from transformers import AutoTokenizer
# Tokenizer for the 'bert-base-cased' checkpoint — the same checkpoint name the
# BERT model is loaded from further down, so vocab and model stay in sync.
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

In [7]:
def tokenize(sequence):
    """Encode one piece of text into fixed-length BERT inputs.

    The text is truncated or padded to exactly 512 tokens and the model's
    special tokens are added. Token-type ids are not requested.

    Args:
        sequence: The raw text to encode.

    Returns:
        A pair ``(input_ids, attention_mask)`` of TensorFlow tensors, as
        produced by the tokenizer with ``return_tensors='tf'``
        (presumably shape ``(1, 512)`` each — confirm against the caller).
    """
    # Call the tokenizer directly: `encode_plus` is deprecated in favour of
    # `__call__`, which handles a single string the same way.
    tokens = tokenizer(sequence, max_length=512,
                       truncation=True, padding='max_length',
                       add_special_tokens=True, return_token_type_ids=False,
                       return_tensors='tf')
    return tokens['input_ids'], tokens['attention_mask']

In [8]:
import numpy as np
num_of_elements = 5000
SEED = 42  # fixed seed so the same subset is drawn on every Restart & Run All

# Pre-allocated buffers for the token ids and attention masks (512 tokens per review).
Xids = np.zeros((num_of_elements, 512))
Xmask = np.zeros((num_of_elements, 512))

train_split = raw_datasets['train']

# Draw the subset WITHOUT replacement: sampling with replacement can pick the
# same review more than once, and those duplicates could then appear on both
# sides of a later train/validation split. The population size is taken from
# the dataset itself rather than hard-coded.
rng = np.random.default_rng(SEED)
idx = rng.choice(len(train_split), size=num_of_elements, replace=False)

small_dataset = np.array(train_split['text'])[idx]
small_dataset_labels = np.array(train_split['label'])[idx]

# Already a NumPy array; no extra copy needed.
labels = small_dataset_labels

In [9]:
# Tokenize each sampled review and store its ids / mask in the matching row.
for row, review in enumerate(small_dataset):
    ids, attention = tokenize(review)
    Xids[row, :] = ids
    Xmask[row, :] = attention

In [10]:
 # Peek at the token ids stored for the first sampled review.
 Xids[0]

array([  101., 11750.,   119.,  1327.,   170.,  7310.,  1273.,   119.,
        1109.,  5444.,  1110.,  2212.,  3264.,  1122.,  2691.,  1142.,
        1110.,  1103.,  1178.,  1273.,  1637.,  1118., 12590.,  7535.,
        1179., 12068.,   117.,   146.,  2810.,  1119.,  1144.,  1167.,
        2801.,  1107.,  1140.,   119.,   133.,  9304.,   120.,   135.,
         133.,  9304.,   120.,   135.,  1109.,  3176.,  1110.,  4841.,
       24891.,  1162.,   119., 16513.,  9257., 15330.,  1112.,  8166.,
        9747.,  1108.,  6929.,   118.,   118.,  1304.,  2379.,  5939.,
         117.,  1105., 17584., 16920.,  1108.,   170., 13657.,   118.,
        1304.,  1129.,  7174., 12598.,  1112.,  1103.,   179., 17535.,
        1385.,  2361.,  1776.,   119.,   133.,  9304.,   120.,   135.,
         133.,  9304.,   120.,   135.,  1109.,  5945.,  1108.,  1304.,
        3903.,   117., 18111.,  1443.,  1217., 10827.,   119.,   133.,
        9304.,   120.,   135.,   133.,  9304.,   120.,   135.,  1409.,
      

In [11]:
from transformers import TFAutoModel

In [12]:
# Load the pretrained TensorFlow BERT model from the 'bert-base-cased' checkpoint
# (the same checkpoint name used for the tokenizer).
bert = TFAutoModel.from_pretrained('bert-base-cased')

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [13]:
bert.summary()

Model: "tf_bert_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  108310272 
Total params: 108,310,272
Trainable params: 108,310,272
Non-trainable params: 0
_________________________________________________________________


In [14]:
import tensorflow as tf

# Two fixed-length (512-token) integer inputs, named to match the tokenizer's output keys.
input_ids = tf.keras.layers.Input(shape=(512,), name='input_ids', dtype='int32')
mask = tf.keras.layers.Input(shape=(512,), name='attention_mask', dtype='int32')

# `bert.bert` is the transformer main layer inside the loaded model object.
# Element [1] of its output is the pooled, per-sequence representation
# (`pooler_output`); element [0] would be the per-token hidden states.
embeddings = bert.bert(input_ids, attention_mask=mask)[1]

# Classifier head: one hidden layer, then a single sigmoid unit for the binary label.
x = tf.keras.layers.Dense(1024, activation ='relu')(embeddings)
y = tf.keras.layers.Dense(1, activation ='sigmoid', name='outputs')(x)

model = tf.keras.Model(inputs=[input_ids, mask], outputs=y)

# Freeze the pretrained encoder so that only the dense head is trained.
# The layer is referenced directly instead of via `model.layers[2]`: a
# positional index silently points at a different layer as soon as an input
# or layer is added or reordered.
bert.bert.trainable = False

Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.


In [15]:
# Inspect what the encoder returns: `last_hidden_state` (one vector per token)
# and `pooler_output` (one vector per sequence, i.e. element [1]).
bert.bert(input_ids, attention_mask=mask) # outputs of bert

TFBaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=<KerasTensor: shape=(None, 512, 768) dtype=float32 (created by layer 'bert')>, pooler_output=<KerasTensor: shape=(None, 768) dtype=float32 (created by layer 'bert')>, past_key_values=None, hidden_states=None, attentions=None, cross_attentions=None)

In [16]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 512)]        0                                            
__________________________________________________________________________________________________
attention_mask (InputLayer)     [(None, 512)]        0                                            
__________________________________________________________________________________________________
bert (TFBertMainLayer)          TFBaseModelOutputWit 108310272   input_ids[0][0]                  
                                                                 attention_mask[0][0]             
__________________________________________________________________________________________________
dense (Dense)                   (None, 1024)         787456      bert[0][1]                   

In [17]:
# Single sigmoid output with a binary label -> binary cross-entropy loss;
# accuracy is tracked as the reported metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [18]:
# `validation_split` is the fraction of the data HELD OUT for validation
# (Keras takes it from the end of the arrays, before shuffling). The previous
# value of 0.8 trained on only 20% of the samples and validated on the other
# 80%; 0.2 gives the conventional 80/20 train/validation split.
history = model.fit(
    [Xids, Xmask], labels,
    validation_split=0.2,
    batch_size = 16,
    verbose = 1,
    epochs=1)

