In [19]:
import sys
import numpy as np
import random as rn
from tqdm import tqdm
import pandas as pd
import tensorflow as tf
import zipfile
import torch
from pytorch_pretrained_bert import BertModel
from torch import nn
# from torchnlp.datasets import imdb_dataset      # --> We are using our own uploaded dataset.
from pytorch_pretrained_bert import BertTokenizer
from keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.optim import Adam
from torch.nn.utils import clip_grad_norm_
from IPython.display import clear_output
import matplotlib.pyplot as plt
%matplotlib inline

In [20]:
rn.seed(321)
np.random.seed(321)
torch.manual_seed(321)
torch.cuda.manual_seed(321)

In [21]:
# zip_file_path = 'dataset/train.csv.zip'
zip_file_path = 'dataset/IMDB Dataset.csv.zip'
target_directory = 'dataset/'

with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(target_directory)


In [22]:
# train_data = pd.read_csv('dataset/train.csv')
# print(train_data['sentiment'].isna().any())
# print(train_data['text'].isna().any())
# changing positive and negative into numeric values

def cat2num(value):
    if value=='positive': 
        return 1
    else: 
        return 0

# df=pd.read_csv("dataset/train.csv")
df = pd.read_csv("dataset/IMDB Dataset.csv")
df['sentiment']  =  df['sentiment'].apply(cat2num)
train = df[:45000]
test = df[45000:]

In [23]:
train.head(5)
print(train.shape)
print(test.shape)

(45000, 2)
(5000, 2)


In [24]:
# train = train_data[:20000]
# test = train_data[20000:]
# train['sentiment'] = train['sentiment'].map({'positive' : 0, 'negative' : 1})
# test['sentiment']=test['sentiment'].map({'positive' : 0, 'negative' : 1})

In [25]:
train.sample()

Unnamed: 0,review,sentiment
32575,I've been trying to write a plot summary for s...,0


In [26]:
!pip install transformers
from transformers import BertTokenizer, TFBertForSequenceClassification



In [27]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased")


All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
# But first see BERT tokenizer exmaples and other required stuff!

example='In this Kaggle notebook, I will do sentiment analysis using BERT with Huggingface'
tokens=tokenizer.tokenize(example)
token_ids = tokenizer.convert_tokens_to_ids(tokens)
print(tokens)
print(token_ids)

['in', 'this', 'ka', '##ggle', 'notebook', ',', 'i', 'will', 'do', 'sentiment', 'analysis', 'using', 'bert', 'with', 'hugging', '##face']
[1999, 2023, 10556, 24679, 14960, 1010, 1045, 2097, 2079, 15792, 4106, 2478, 14324, 2007, 17662, 12172]


In [29]:
#This will accept our train and convert each row into an InputExample object.

from transformers import InputExample, InputFeatures

def convert_data_to_examples(train, test, review, sentiment): 
    train_InputExamples = train.apply(lambda x: InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this case
                                                          text_a = x[review], 
                                                          label = x[sentiment]), axis = 1)

    validation_InputExamples = test.apply(lambda x: InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this case
                                                          text_a = x[review], 
                                                          label = x[sentiment]), axis = 1,)
  
    return train_InputExamples, validation_InputExamples

train_InputExamples, validation_InputExamples = convert_data_to_examples(train,  test, 'review',  'sentiment')


In [30]:
#This function will tokenize the InputExample objects, then create the required input format with the tokenized 
#objects, finally, create an input dataset that we can feed to the model.

def convert_examples_to_tf_dataset(examples, tokenizer, max_length=128):
    features = [] # -> will hold InputFeatures to be converted later

    for e in tqdm(examples):
        input_dict = tokenizer.encode_plus(
            e.text_a,
            add_special_tokens=True,    # Add 'CLS' and 'SEP'
            max_length=max_length,    # truncates if len(s) > max_length
            return_token_type_ids=True,
            return_attention_mask=True,
            pad_to_max_length=True, # pads to the right by default # CHECK THIS for pad_to_max_length
            truncation=True
        )

        input_ids, token_type_ids, attention_mask = (input_dict["input_ids"],input_dict["token_type_ids"], input_dict['attention_mask'])
        features.append(InputFeatures( input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, label=e.label) )

    def gen():
        for f in features:
            yield (
                {
                    "input_ids": f.input_ids,
                    "attention_mask": f.attention_mask,
                    "token_type_ids": f.token_type_ids,
                },
                f.label,
            )

    return tf.data.Dataset.from_generator(
        gen,
        ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
        (
            {
                "input_ids": tf.TensorShape([None]),
                "attention_mask": tf.TensorShape([None]),
                "token_type_ids": tf.TensorShape([None]),
            },
            tf.TensorShape([]),
        ),
    )


DATA_COLUMN = 'review'
LABEL_COLUMN = 'sentiment'

In [31]:
train_data = convert_examples_to_tf_dataset(list(train_InputExamples), tokenizer)
train_data = train_data.shuffle(100).batch(32).repeat(2)

100%|████████████████████████████████████| 45000/45000 [06:02<00:00, 124.22it/s]


In [32]:
validation_data = convert_examples_to_tf_dataset(list(validation_InputExamples), tokenizer)
validation_data = validation_data.batch(32)

100%|██████████████████████████████████████| 5000/5000 [00:39<00:00, 127.62it/s]


In [33]:
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0), 
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), 
              metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')])

model.fit(train_data, epochs=2, validation_data=validation_data)

Epoch 1/2
Epoch 2/2


<keras.src.callbacks.History at 0x7f2258124160>

In [34]:
# model.save_pretrained("output_model")
model.save_pretrained("output_model3")

In [43]:
test = pd.read_csv('dataset/test.csv')
test['class_index'] = test['class_index'].replace({1: 0, 2: 1})
test.drop('review_title', axis=1, inplace=True)
test.sample()

Unnamed: 0,class_index,review_text
126676,1,This one is super stable because it is running...


In [44]:
# finetune = test[25000:25100]
test = test[:25000]
print(test.shape)

(25000, 2)


In [45]:
#This will accept our train and convert each row into an InputExample object.

from transformers import InputExample, InputFeatures

def convert_data_to_examples(test, review, sentiment): 

    validation_InputExamples = test.apply(lambda x: InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this case
                                                          text_a = x[review], 
                                                          label = x[sentiment]), axis = 1,)
  
    return validation_InputExamples

validation_InputExamples = convert_data_to_examples(test, 'review_text',  'class_index')
# finetune_InputExamples = convert_data_to_examples(finetune, 'review_text',  'class_index')

In [46]:
validation_data = convert_examples_to_tf_dataset(list(validation_InputExamples), tokenizer)
validation_data = validation_data.batch(32)

100%|████████████████████████████████████| 25000/25000 [00:31<00:00, 803.45it/s]


In [39]:
# finetune_data = convert_examples_to_tf_dataset(list(finetune_InputExamples), tokenizer)
# finetune_data = finetune_data.batch(32)

In [47]:
loaded_model = TFBertForSequenceClassification.from_pretrained("output_model")
loaded_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0), 
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), 
              metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')])


Some layers from the model checkpoint at output_model were not used when initializing TFBertForSequenceClassification: ['dropout_227']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at output_model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


In [48]:
# loaded_model.fit(finetune_data, epochs=2)

In [49]:
# loaded_model = TFBertForSequenceClassification.from_pretrained("output_model")
result = loaded_model.evaluate(validation_data)
dict(zip(loaded_model.metrics_names, result))



{'loss': 6.183012008666992, 'accuracy': 0.48627999424934387}