In [None]:
# Sentiment Analysis
'''
    Model : bert-base-uncased
    Dataset : IMDB Large Movie Review Dataset (aclImdb)
    Info : 0 -> Negative / 1 -> Positive
'''

##### Configuration

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
# TODO: pin a transformers version once a known-good one has been chosen.
%pip install transformers

In [None]:
import tensorflow as tf

# Enable memory growth on every visible GPU; with no GPU present the loop
# body simply never runs.
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

# Get the data

In [None]:
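# NOTE: one-time setup, left commented out. The cells below read ./aclImdb/train,
# so uncomment this download when running on a fresh environment.
# URL = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"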

# dataset = tf.keras.utils.get_file(fname="aclImdb_v1.tar.gz", 
#                                   origin=URL,
#                                   untar=True,
#                                   cache_dir='.',
#                                   cache_subdir='')

In [None]:
# # The shutil module offers a number of high-level operations on files and collections of files.
# import os
# import shutil
# # Create main directory path ("/aclImdb")
# main_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')
# # Create sub directory path ("/aclImdb/train")
# train_dir = os.path.join(main_dir, 'train')
# # Remove unsup folder since this is a supervised learning task
# remove_dir = os.path.join(train_dir, 'unsup')
# shutil.rmtree(remove_dir)
# # View the final train folder
# print(os.listdir(train_dir))

In [None]:
# Build the training and validation subsets from "aclImdb/train" (80/20 split).
# Both calls receive the same seed and validation_split; only `subset` differs.
# batch_size is 30000 so that a later `.take(1)` can pull a whole subset as a
# single batch (assumes each subset holds fewer than 30000 reviews - TODO confirm).
_SPLIT_KWARGS = dict(batch_size=30000, validation_split=0.2, seed=123)
train = tf.keras.preprocessing.text_dataset_from_directory(
    'aclImdb/train', subset='training', **_SPLIT_KWARGS)
test = tf.keras.preprocessing.text_dataset_from_directory(
    'aclImdb/train', subset='validation', **_SPLIT_KWARGS)

In [None]:
# Materialize the training subset as a pandas DataFrame with 'text'/'label' columns.
# Only the first batch is read; the dataset was created with batch_size=30000,
# presumably large enough to hold the whole subset in that one batch - TODO confirm.
# NOTE(review): `train` is rebound below from a tf.data.Dataset to a DataFrame, so
# this cell is not safely re-runnable without re-running the dataset-creation cell.
import pandas as pd
for i in train.take(1):
  train_feat = i[0].numpy()  # first component of the batch: the review texts
  train_lab = i[1].numpy()   # second component of the batch: the labels

# Two rows (texts, labels) transposed into two columns.
train = pd.DataFrame([train_feat, train_lab]).T
train.columns = ['text', 'label']
# The texts come out of TensorFlow as bytes; decode them to str.
train['text'] = train['text'].str.decode("utf-8")
train.head()

In [None]:
# Materialize the validation subset as a pandas DataFrame with 'text'/'label' columns.
# Only the first batch is read; the dataset was created with batch_size=30000,
# presumably large enough to hold the whole subset in that one batch - TODO confirm.
# NOTE(review): `test` is rebound below from a tf.data.Dataset to a DataFrame, so
# this cell is not safely re-runnable without re-running the dataset-creation cell.
for j in test.take(1):
  test_feat = j[0].numpy()  # first component of the batch: the review texts
  test_lab = j[1].numpy()   # second component of the batch: the labels

# Two rows (texts, labels) transposed into two columns.
test = pd.DataFrame([test_feat, test_lab]).T
test.columns = ['text', 'label']
# The texts come out of TensorFlow as bytes; decode them to str.
test['text'] = test['text'].str.decode("utf-8")
test.head()

# Loading pre-trained BERT

In [None]:
from transformers import BertTokenizer, TFBertForSequenceClassification

In [None]:
# Load the tokenizer and the sequence-classification model from the same
# pre-trained checkpoint (bert-base-uncased).
PRETRAINED_CHECKPOINT = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)
model = TFBertForSequenceClassification.from_pretrained(PRETRAINED_CHECKPOINT)

In [None]:
# Display the tokenizer object's repr as the cell output.
tokenizer

In [None]:
# Display the model object's repr as the cell output.
model

In [None]:
# Print the summary of the loaded classification model.
model.summary()

# Prepare the data

#### Steps for preparing data : 
#### 1) Filtering / removing unnecessary words/tokens from the dataset
#### 2) Tokenization
#### 3) Padding
#### 4) Positional Embeddings / Word Embeddings

In [None]:
# from sklearn.model_selection import train_test_split
# train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [None]:
# print(train_df.shape, test_df.shape)

In [None]:
# train_df["text"][0]

In [None]:
# train_df["label"][0]

In [None]:
from transformers import InputExample, InputFeatures

def convert_data_to_examples(train, test, data, label):
    """Wrap every row of two frames in an ``InputExample``.

    Args:
        train: Frame holding the training rows.
        test: Frame holding the held-out rows.
        data: Name of the column containing the text.
        label: Name of the column containing the class label.

    Returns:
        A ``(train_examples, test_examples)`` pair, each being the result of
        applying the row-to-``InputExample`` conversion along ``axis=1``.
    """
    def to_example(row):
        # Single-sentence task: only text_a is populated, text_b stays empty.
        return InputExample(
            guid=None,
            text_a=row[data],
            text_b=None,
            label=row[label],
        )

    return train.apply(to_example, axis=1), test.apply(to_example, axis=1)


# Convert both DataFrames, reading the text from 'text' and the class from 'label'.
train_input_examples, test_input_examples = convert_data_to_examples(train, test, 'text', 'label')

In [None]:
# def convert_examples_to_tf_dataset(examples, tokenizer, max_length=128):
#     features = []
    
#     for example in examples:
#         input_dict = tokenizer.encode_plus(
#             example.text_a,
#             add_special_tokens = True,
#             max_length = max_length,
#             return_token_type_ids = True,
#             return_attention_mask = True,
#             pad_to_max_length = True,
#             truncation = True
#         )
# #         method 'input_dict' returns a dictionary with following keys
#         input_ids, token_type_ids, attention_mask = (input_dict["input_ids"], input_dict["token_type_ids"], input_dict["attention_mask"])
    
#         features.append(
#             InputFeatures(
#                 input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, label=example.label
#             )
#         )
        
#         def gen():
#             for f in features:
#                 yield(
#                     {
#                         "input_ids":f.input_ids,
#                         "token_type_ids":f.token_type_ids,
#                         "attention_mask":f.attention_mask,
#                     },
#                     f.label,
#                 )
                
#         return tf.data.Dataset.from_generator(
#             gen,
#             ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
#             (
#                 {
#                     "input_ids": tf.TensorShape([None]),
#                     "attention_mask": tf.TensorShape([None]),
#                     "token_type_ids": tf.TensorShape([None]),
#                 },
#                 tf.TensorShape([]),
#             ),
#         ) 

In [None]:
# Quick check: print the tokens the loaded tokenizer produces for a sample sentence.
sentence = "My name is rajesh."
print(tokenizer.tokenize(sentence))

In [None]:
import tensorflow as tf
def convert_examples_to_tf_dataset(examples, tokenizer, max_length=128):
    """Tokenize ``examples`` and wrap the encoded features in a ``tf.data.Dataset``.

    Args:
        examples: Iterable of objects exposing ``text_a`` (the raw text) and
            ``label`` (e.g. ``InputExample`` instances).
        tokenizer: Tokenizer providing ``encode_plus``.
        max_length: Every sequence is truncated or padded to exactly this
            many token ids.

    Returns:
        An unbatched ``tf.data.Dataset`` yielding
        ``({"input_ids", "token_type_ids", "attention_mask"}, label)`` pairs,
        one per example. Empty ``examples`` produce an empty dataset.
    """
    features = []

    for example in examples:
        # Pass the raw string. Wrapping it in list() would hand the tokenizer
        # a sequence of single characters instead of text to be tokenized.
        input_dict = tokenizer.encode_plus(
            example.text_a,
            add_special_tokens=True,
            max_length=max_length,
            return_token_type_ids=True,
            return_attention_mask=True,
            padding="max_length",  # replaces the deprecated pad_to_max_length=True
            truncation=True,
        )

        features.append(
            InputFeatures(
                input_ids=input_dict["input_ids"],
                attention_mask=input_dict["attention_mask"],
                token_type_ids=input_dict["token_type_ids"],
                label=example.label,
            )
        )

    # The generator and the return live AFTER the loop so that the dataset
    # covers every example, not only the first one encoded.
    def gen():
        for f in features:
            yield (
                {
                    "input_ids": f.input_ids,
                    "token_type_ids": f.token_type_ids,
                    "attention_mask": f.attention_mask,
                },
                f.label,
            )

    return tf.data.Dataset.from_generator(
        gen,
        ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
        (
            {
                "input_ids": tf.TensorShape([None]),
                "attention_mask": tf.TensorShape([None]),
                "token_type_ids": tf.TensorShape([None]),
            },
            tf.TensorShape([]),
        ),
    )

In [None]:
# Encode both splits with the shared tokenizer (max_length left at its default of 128).
# list(...) turns the result of the DataFrame.apply call into a plain Python list.
train_data = convert_examples_to_tf_dataset(list(train_input_examples),tokenizer)

test_data = convert_examples_to_tf_dataset(list(test_input_examples),tokenizer)

In [None]:
# Print the cardinality TensorFlow reports for the training dataset.
print(train_data.cardinality())

In [None]:
# Peek at a single element only: iterating the whole dataset here would print
# every encoded example into the cell output.
for data, label in train_data.take(1):
    print('Input IDs:', data['input_ids'])
    print('Attention Mask:', data['attention_mask'])
    print('Token Type IDs:', data['token_type_ids'])
    print('Labels:', label)

In [None]:
# Walk the dataset once and tally its elements.
count = sum(1 for _ in train_data)

print("Dataset Size:", count)

In [None]:
# Our data is now ready to be fed into the model

# Select a model to train

In [None]:
# Configuration of our model
from keras.optimizers import Adam
from keras.losses import SparseCategoricalCrossentropy
from keras.metrics import SparseCategoricalAccuracy
model.compile(optimizer=Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0), 
              loss=SparseCategoricalCrossentropy(from_logits=True), 
              metrics=[SparseCategoricalAccuracy('accuracy')])

In [None]:
# Display the physical devices TensorFlow can see, as the cell output.
tf.config.list_physical_devices()

In [None]:
# Keras treats every element of a tf.data.Dataset as one batch, while
# convert_examples_to_tf_dataset yields single, unbatched examples. Group them
# into batches here so the model receives (batch, sequence) inputs.
BATCH_SIZE = 32
model.fit(
    train_data.batch(BATCH_SIZE),
    epochs=2,
    validation_data=test_data.batch(BATCH_SIZE),
)

# Fine tune the model

# Test (test.csv)

In [None]:
# Sample review(s) to classify with the fine-tuned model; add more strings to the list to score several at once.
pred_sentences = ['I grew up (b. 1965) watching and loving the Thunderbirds. All my mates at school watched. We played Thunderbirds before school, during lunch and after school. We all wanted to be Virgil or Scott. No one wanted to be Alan. Counting down from 5 became an art form. I took my children to see the movie hoping they would get a glimpse of what I loved as a child. How bitterly disappointing. The only high point was the snappy theme tune. Not that it could compare with the original score of the Thunderbirds. Thankfully early Saturday mornings one television channel still plays reruns of the series Gerry Anderson and his wife created. Jonatha Frakes should hand in his directors chair, his version was completely hopeless. A waste of film. Utter rubbish. A CGI remake may be acceptable but replacing marionettes with Homo sapiens subsp. sapiens was a huge error of judgment.']

In [None]:
# Encode the sample sentences as one padded/truncated batch of TF tensors.
tf_batch = tokenizer(pred_sentences, max_length=128, padding=True, truncation=True, return_tensors='tf')
tf_outputs = model(tf_batch)
# The first model output is normalised with a softmax over the class axis.
tf_predictions = tf.nn.softmax(tf_outputs[0], axis=-1)
labels = ['Negative','Positive']
# Index of the highest-scoring class for each sentence, as a NumPy array.
label = tf.argmax(tf_predictions, axis=1).numpy()
for sentence, class_index in zip(pred_sentences, label):
  print(sentence, ": \n", labels[class_index])

# Deployment (AWS/Gradio)