<h3>- Description:</h3>
<p>Classify Persian review.
Fine-tune a persian pretrained Bert Model from hugging face. Also use on of the hugging face dataset.</p>

In [2]:
# Connect to a GPU runtime in colab.
# Mount the notebook to google-drive.
# Change the current active directory into project's root folder.
%cd /content/drive/MyDrive/Colab\ Notebooks/Persian-Sentiment-Analysis

/content/drive/MyDrive/Colab Notebooks/Persian-Sentiment-Analysis


<h4>1. Install and import required libs.</h4>

In [3]:
!pip install -qr requirements.txt

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m452.9/452.9 KB[0m [31m31.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m91.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m108.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m213.0/213.0 KB[0m [31m27.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.0/132.0 KB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.4/182.4 KB[0m [31m22.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m95.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m588.3/588.3 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━

In [4]:
import numpy as np
import tensorflow as tf

from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import AutoConfig, AutoTokenizer
from transformers import TFBertForSequenceClassification
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [5]:
# Define a class to hold the constant and hyperparametrs in one place.
class Params(object):
  def __init__(self, **kwargs):
    self.__dict__.update(**kwargs)

params = Params(
    dataset_name="sepidmnorozy/Persian_sentiment",
    model_name = "HooshvareLab/bert-base-parsbert-uncased",
    batch_size=32,
    max_sequence_len=128,
)

for param, value in params.__dict__.items():
  print(f"{param:15}: {value}")

dataset_name   : sepidmnorozy/Persian_sentiment
model_name     : HooshvareLab/bert-base-parsbert-uncased
batch_size     : 32
max_sequence_len: 128


<h4>2. Download dataset from hugging face 🤗.</h4>

In [6]:
train_ds = load_dataset(params.dataset_name, split="train")
valid_ds = load_dataset(params.dataset_name, split="validation")
test_ds  = load_dataset(params.dataset_name, split="test")



Downloading and preparing dataset csv/sepidmnorozy--Persian_sentiment to /root/.cache/huggingface/datasets/sepidmnorozy___csv/sepidmnorozy--Persian_sentiment-fa8d6a1018e1ade1/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/9.29M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/sepidmnorozy___csv/sepidmnorozy--Persian_sentiment-fa8d6a1018e1ade1/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.




<h4>3. Seprate sentences from their labels and add special BERT tokens to each of them.</h4>

In [7]:
# seprate sentences from labels in dataset
train_sentences , train_labels = train_ds["text"], train_ds["label"]
valid_sentences , valid_labels = valid_ds["text"], valid_ds["label"]  
test_sentences, test_labels = test_ds["text"], test_ds["label"]

# add BERT special tokens to each sentence.
train_sentences = ["[CLS] " + s + " [SEP]" for s in train_sentences]
valid_sentences = ["[CLS] " + s + " [SEP]" for s in valid_sentences]
test_sentences  = ["[CLS] "  + s + " [SEP]" for s in test_sentences]

<h4>4. Load tokenizer of desired model.</h4>

In [8]:
tokenizer = AutoTokenizer.from_pretrained(params.model_name)

Downloading:   0%|          | 0.00/434 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.22M [00:00<?, ?B/s]

<h4>5. Tokenize the datasets and convert tokens into their coresponding Ids.</h4>

In [9]:
train_sentences = [tokenizer.tokenize(s) for s in train_sentences]
valid_sentences = [tokenizer.tokenize(s) for s in valid_sentences]
test_sentences  = [tokenizer.tokenize(s) for s in test_sentences]

train_sentences = [tokenizer.convert_tokens_to_ids(s) for s in train_sentences]
valid_sentences = [tokenizer.convert_tokens_to_ids(s) for s in valid_sentences]
test_sentences = [tokenizer.convert_tokens_to_ids(s) for s in test_sentences]

<h4>6. Pad the tokenized datasets into same lengths.</h4>

In [10]:
train_ids = pad_sequences(
    train_sentences,
    maxlen=params.max_sequence_len,
    padding="post",
    truncating="post",
    dtype="long",
)

valid_ids = pad_sequences(
    valid_sentences,
    maxlen=params.max_sequence_len,
    padding="post",
    truncating="post",
    dtype="long",
)

test_ids = pad_sequences(
    test_sentences,
    maxlen=params.max_sequence_len,
    padding="post",
    truncating="post",
    dtype="long",
)

<h4>7. Create attention mask.</h4>

In [11]:
# Create attention mask
atten_mask = {
    "train" : (train_ids != 0).astype(np.float32),
    "valid" : (valid_ids != 0).astype(np.float32),
    "test"  : (test_ids != 0).astype(np.float32),
}

<h4>8. Create tf-dataset.</h4>

In [12]:
train_ds = tf.data.Dataset.from_tensor_slices(
    (train_ids, train_labels, atten_mask["train"])
).batch(params.batch_size).prefetch(1)

valid_ds = tf.data.Dataset.from_tensor_slices(
    (valid_ids, valid_labels, atten_mask["valid"])
).batch(params.batch_size).prefetch(1)

test_ds = tf.data.Dataset.from_tensor_slices(
    (test_ids, test_labels, atten_mask["test"])
).batch(params.batch_size).prefetch(1)

<h4>9. Explore model's config.</h4>

In [13]:
config = AutoConfig.from_pretrained(params.model_name)
print(config)

BertConfig {
  "_name_or_path": "HooshvareLab/bert-base-parsbert-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.25.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 100000
}



<h4>10. Load pretrained model.</h4>

In [14]:
model = TFBertForSequenceClassification.from_pretrained(params.model_name, num_labels=2)

Downloading:   0%|          | 0.00/963M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at HooshvareLab/bert-base-parsbert-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<h4>11. Define metric, loss and optimizer.</h4>

In [15]:
def flat_accuracy(y_true, y_preds):
  num_instances = tf.cast(tf.shape(y_true)[0], tf.float32)
  y_preds = tf.reshape(tf.argmax(y_preds, axis=1), shape=(-1,))
  y_true = tf.cast(tf.reshape(y_true, shape=(-1,)), dtype=tf.int64)
  return tf.reduce_sum(tf.cast(tf.math.equal(y_preds, y_true), tf.float32)) / num_instances

loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(3e-5)

params.num_epochs = 3

<h4>12. Fine-tune the model.</h4>

In [16]:
num_batches = 0
num_valid_batches = 0
for _ in train_ds:
  num_batches += 1

for _ in valid_ds:
  num_valid_batches += 1

print(f"Total number of batches: {num_batches}")
print(f"Total number of validation batches: {num_valid_batches}")

Total number of batches: 1772
Total number of validation batches: 197


In [17]:
# Fine tuning.
loss_history, val_history = [], []
prev_valid_acc = None
for epoch in range(params.num_epochs):
    print(f"Epoch #{epoch+1}/{params.num_epochs}: [Previous valid accuracy: {prev_valid_acc}]")
    
    # =============== Train On one epoch =================== #
    for i, batch in enumerate(train_ds):
        inputs, labels, mask = batch
        with tf.GradientTape() as tape:
            outputs = model(
                inputs,
                token_type_ids=None,
                attention_mask=mask,
                labels=labels
            )
            loss = tf.reduce_sum(outputs["loss"]) / params.batch_size

        grads = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))
        loss_history.append(loss.numpy())
        print(f"[{i+1}/{num_batches}... BatchLoss: {loss}]")

    # ========== Calculate new accuracy on valid set =============== #
    acc_set = []
    for batch in valid_ds:
        inputs, labels, mask = batch
        logits = model(inputs, token_type_ids=None, attention_mask=mask)
        logits = logits['logits']
        batch_acc = flat_accuracy(labels.numpy(), logits.numpy())
        acc_set.append(batch_acc)
    prev_valid_acc = sum(acc_set) / len(acc_set)
    val_history.append(prev_valid_acc)


Epoch #1/3: [Previous valid accuracy: None]




[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[316/1772... BatchLoss: 0.2005525678396225]
[317/1772... BatchLoss: 0.447568416595459]
[318/1772... BatchLoss: 0.4078226089477539]
[319/1772... BatchLoss: 0.3467343747615814]
[320/1772... BatchLoss: 0.4235711097717285]
[321/1772... BatchLoss: 0.288144588470459]
[322/1772... BatchLoss: 0.45168358087539673]
[323/1772... BatchLoss: 0.2409929633140564]
[324/1772... BatchLoss: 0.41004180908203125]
[325/1772... BatchLoss: 0.2726551294326782]
[326/1772... BatchLoss: 0.30984026193618774]
[327/1772... BatchLoss: 0.4035831093788147]
[328/1772... BatchLoss: 0.3804897665977478]
[329/1772... BatchLoss: 0.24880999326705933]
[330/1772... BatchLoss: 0.3263188898563385]
[331/1772... BatchLoss: 0.2627061605453491]
[332/1772... BatchLoss: 0.34267914295196533]
[333/1772... BatchLoss: 0.3045702576637268]
[334/1772... BatchLoss: 0.2874337136745453]
[335/1772... BatchLoss: 0.39503926038742065]
[336/1772... BatchLoss: 0.28594309091567993]
[337/1