<h3>- Description:</h3>
<p>Classify Persian reviews by sentiment.
Fine-tune a pretrained Persian BERT model from Hugging Face, using one of the Hugging Face datasets.</p>

In [1]:
# Prerequisites (Google Colab) -- this cell only performs step 3:
#   1. Connect the notebook to a GPU runtime.
#   2. Mount Google Drive so that /content/drive exists.
#   3. Change the working directory to the project's root folder, so that
#      relative paths such as requirements.txt resolve against it.
%cd /content/drive/MyDrive/Colab\ Notebooks/Persian-Sentiment-Analysis

/content/drive/MyDrive/Colab Notebooks/Persian-Sentiment-Analysis


<h4>1. Install and import required libs.</h4>

In [2]:
!pip install -qr requirements.txt

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m452.9/452.9 KB[0m [31m28.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m90.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m92.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.0/132.0 KB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.4/182.4 KB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m213.0/213.0 KB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m99.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m588.3/588.3 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
import numpy as np
import tensorflow as tf

from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import AutoConfig, AutoTokenizer
from transformers import TFBertForSequenceClassification
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [4]:
# Lightweight container that keeps constants and hyperparameters in one place.
class Params:
  """Attribute bag: every keyword argument becomes an instance attribute."""

  def __init__(self, **kwargs):
    # Store each keyword/value pair directly in the instance namespace,
    # preserving the order in which the keywords were given.
    for name, value in kwargs.items():
      self.__dict__[name] = value

# Experiment configuration: Hugging Face dataset and model identifiers,
# plus the batch size and the maximum sequence length.
params = Params(
    dataset_name="sepidmnorozy/Persian_sentiment",
    model_name="HooshvareLab/bert-base-parsbert-uncased",
    batch_size=32,
    max_sequence_len=128,
)

# Echo the configuration, one "name: value" line per setting
# (names are left-aligned in a 15-character column).
print("\n".join(f"{param:15}: {value}" for param, value in vars(params).items()))

dataset_name   : sepidmnorozy/Persian_sentiment
model_name     : HooshvareLab/bert-base-parsbert-uncased
batch_size     : 32
max_sequence_len: 128


<h4>2. Download dataset from hugging face 🤗.</h4>

In [5]:
# Load the three splits of the configured dataset, in the order
# train -> validation -> test.
train_ds, valid_ds, test_ds = (
    load_dataset(params.dataset_name, split=split_name)
    for split_name in ("train", "validation", "test")
)



Downloading and preparing dataset csv/sepidmnorozy--Persian_sentiment to /root/.cache/huggingface/datasets/sepidmnorozy___csv/sepidmnorozy--Persian_sentiment-fa8d6a1018e1ade1/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/9.29M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/sepidmnorozy___csv/sepidmnorozy--Persian_sentiment-fa8d6a1018e1ade1/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.




<h4>3. Separate the sentences from their labels and add the special BERT tokens to each sentence.</h4>

In [6]:
# The "label" column of each split holds the targets.
train_labels, valid_labels, test_labels = (
    split["label"] for split in (train_ds, valid_ds, test_ds)
)

# The "text" column holds the sentences; wrap each one in the BERT special
# tokens: "[CLS]" in front and "[SEP]" at the end.
train_sentences, valid_sentences, test_sentences = (
    ["[CLS] " + text + " [SEP]" for text in split["text"]]
    for split in (train_ds, valid_ds, test_ds)
)

<h4>4. Load tokenizer of desired model.</h4>

In [7]:
# Load the tokenizer registered under the checkpoint name stored in
# params.model_name; it is used below to tokenize every dataset split.
tokenizer = AutoTokenizer.from_pretrained(params.model_name)

Downloading:   0%|          | 0.00/434 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.22M [00:00<?, ?B/s]

<h4>5. Tokenize the datasets and convert the tokens into their corresponding IDs.</h4>

In [8]:
# Turn every sentence into a list of vocabulary ids in a single pass:
# first split the text into tokens, then map those tokens to their ids.
# After this cell the *_sentences names hold lists of ids, not strings.
train_sentences = [
    tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
    for text in train_sentences
]
valid_sentences = [
    tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
    for text in valid_sentences
]
test_sentences = [
    tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
    for text in test_sentences
]