In [2]:
import sys
import numpy as np
import random as rn
from tqdm import tqdm
import pandas as pd
import tensorflow as tf
import zipfile
import torch
from pytorch_pretrained_bert import BertModel
from torch import nn
from pytorch_pretrained_bert import BertTokenizer
from keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.optim import Adam
from torch.nn.utils import clip_grad_norm_
from IPython.display import clear_output
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
rn.seed(321)
np.random.seed(321)
torch.manual_seed(321)
torch.cuda.manual_seed(321)

In [4]:
#############################################Prepare the data#####################################################

In [5]:
zip_file_path = 'dataset/test.csv.zip'
target_directory = 'dataset/'

with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(target_directory)

In [6]:
train = pd.read_csv('dataset/test.csv')

In [7]:
train.head(5)

Unnamed: 0,class_index,review_title,review_text
0,2,Great CD,My lovely Pat has one of the GREAT voices of h...
1,2,One of the best game music soundtracks - for a...,Despite the fact that I have only played a sma...
2,1,Batteries died within a year ...,I bought this charger in Jul 2003 and it worke...
3,2,"works fine, but Maha Energy is better",Check out Maha Energy's website. Their Powerex...
4,2,Great for the non-audiophile,Reviewed quite a bit of the combo players and ...


In [8]:
train['class_index'] = train['class_index'].replace({1: 0, 2: 1})

In [9]:
train.drop('review_title', axis=1, inplace=True)

In [10]:
train.sample()

Unnamed: 0,class_index,review_text
341808,1,"A friend sent it, or I would never have believ..."


In [11]:
train = train[:20000]
validate = train[20000:25000]
print(train.shape)

(20000, 2)


In [12]:
##########################################Intialize model and tokenizer###########################################

In [13]:
!pip install transformers
from transformers import BertTokenizer, TFBertForSequenceClassification



In [14]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased")

2023-07-31 21:44:11.394285: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1960] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
###########################################Preprocess the data ###################################################

In [16]:
#This will accept our train and convert each row into an InputExample object.

from transformers import InputExample, InputFeatures

def convert_data_to_examples(train, test, review, sentiment): 
    train_InputExamples = train.apply(lambda x: InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this case
                                                          text_a = x[review], 
                                                          label = x[sentiment]), axis = 1)

    validation_InputExamples = test.apply(lambda x: InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this case
                                                          text_a = x[review], 
                                                          label = x[sentiment]), axis = 1,)
  
    return train_InputExamples, validation_InputExamples

train_InputExamples, validation_InputExamples = convert_data_to_examples(train,  validate, 'review_text',  'class_index')



In [17]:
#This function will tokenize the InputExample objects, then create the required input format with the tokenized 
#objects, finally, create an input dataset that we can feed to the model.

def convert_examples_to_tf_dataset(examples, tokenizer, max_length=128):
    features = [] # -> will hold InputFeatures to be converted later

    for e in tqdm(examples):
        input_dict = tokenizer.encode_plus(
            e.text_a,
            add_special_tokens=True,    # Add 'CLS' and 'SEP'
            max_length=max_length,    # truncates if len(s) > max_length
            return_token_type_ids=True,
            return_attention_mask=True,
            pad_to_max_length=True, # pads to the right by default # CHECK THIS for pad_to_max_length
            truncation=True
        )

        input_ids, token_type_ids, attention_mask = (input_dict["input_ids"],input_dict["token_type_ids"], input_dict['attention_mask'])
        features.append(InputFeatures( input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, label=e.label) )

    def gen():
        for f in features:
            yield (
                {
                    "input_ids": f.input_ids,
                    "attention_mask": f.attention_mask,
                    "token_type_ids": f.token_type_ids,
                },
                f.label,
            )

    return tf.data.Dataset.from_generator(
        gen,
        ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
        (
            {
                "input_ids": tf.TensorShape([None]),
                "attention_mask": tf.TensorShape([None]),
                "token_type_ids": tf.TensorShape([None]),
            },
            tf.TensorShape([]),
        ),
    )


DATA_COLUMN = 'review'
LABEL_COLUMN = 'sentiment'

In [18]:
train_data = convert_examples_to_tf_dataset(list(train_InputExamples), tokenizer)
train_data = train_data.shuffle(100).batch(32).repeat(2)

100%|████████████████████████████████████| 20000/20000 [00:23<00:00, 846.44it/s]


In [19]:
validation_data = convert_examples_to_tf_dataset(list(validation_InputExamples), tokenizer)
validation_data = validation_data.batch(32)

  0%|                                                     | 0/2 [00:00<?, ?it/s]


AttributeError: 'str' object has no attribute 'text_a'

In [20]:
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0), 
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), 
              metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')])

model.fit(train_data, epochs=2)

Epoch 1/2
Epoch 2/2


<keras.src.callbacks.History at 0x7f74ce6d2be0>

In [21]:
model.save_pretrained("output_model2")

In [23]:
########################################Test the model on IMDB data#################################################

In [25]:
test = pd.read_csv('dataset/train.csv')
test['sentiment'] = test['sentiment'].replace({'neg': 0, 'pos': 1})
test.sample()

Unnamed: 0,text,sentiment
19714,"I am a great fan of Martin Amis, on whose book...",0


In [27]:
test = test[:25000]
print(test.shape)

(25000, 2)


In [28]:
#This will accept our train and convert each row into an InputExample object.

from transformers import InputExample, InputFeatures

def convert_data_to_examples(test, review, sentiment): 

    validation_InputExamples = test.apply(lambda x: InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this case
                                                          text_a = x[review], 
                                                          label = x[sentiment]), axis = 1,)
  
    return validation_InputExamples

validation_InputExamples = convert_data_to_examples(test, 'text',  'sentiment')
# finetune_InputExamples = convert_data_to_examples(finetune, 'review_text',  'class_index')

In [29]:
validation_data = convert_examples_to_tf_dataset(list(validation_InputExamples), tokenizer)
validation_data = validation_data.batch(32)

100%|████████████████████████████████████| 25000/25000 [01:32<00:00, 270.90it/s]


In [30]:
loaded_model = TFBertForSequenceClassification.from_pretrained("output_model2")

Some layers from the model checkpoint at output_model2 were not used when initializing TFBertForSequenceClassification: ['dropout_37']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at output_model2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


In [32]:
loaded_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0), 
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), 
              metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')])
result = loaded_model.evaluate(validation_data)
dict(zip(loaded_model.metrics_names, result))



{'loss': 0.63717120885849, 'accuracy': 0.852840006351471}