In [None]:
!pip install hazm
!pip install transformers

* Some parts are copied from Hooshvare's [notebook](https://colab.research.google.com/github/hooshvare/parsbert/blob/master/notebooks/Taaghche_Sentiment_Analysis.ipynb)

In [1]:
import numpy as np
import pandas as pd
import hazm

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.utils import shuffle


import plotly.express as px
import plotly.graph_objects as go

from tqdm.notebook import tqdm

import os
import collections

from transformers import get_linear_schedule_with_warmup
from transformers import BertConfig, BertTokenizer
from transformers import BertModel


import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import AdamW




### Load the data using Pandas

In [3]:
data = pd.read_csv('/kaggle/input/snappfood-persian-sentiment-analysis/Snappfood - Sentiment Analysis.csv', on_bad_lines='skip' , delimiter='\t')
data.head()

Unnamed: 0.1,Unnamed: 0,comment,label,label_id
0,,واقعا حیف وقت که بنویسم سرویس دهیتون شده افتضاح,SAD,1.0
1,,قرار بود ۱ ساعته برسه ولی نیم ساعت زودتر از مو...,HAPPY,0.0
2,,قیمت این مدل اصلا با کیفیتش سازگاری نداره، فقط...,SAD,1.0
3,,عالللی بود همه چه درست و به اندازه و کیفیت خوب...,HAPPY,0.0
4,,شیرینی وانیلی فقط یک مدل بود.,HAPPY,0.0


### Data Cleaning

In [4]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  520 non-null    object 
 1   comment     70000 non-null  object 
 2   label       70000 non-null  object 
 3   label_id    69480 non-null  float64
dtypes: float64(1), object(3)
memory usage: 2.1+ MB
None


In [5]:
unique_labels = list(sorted(data['label'].unique()))
unique_labels

['0', '1', 'HAPPY', 'SAD']

In [6]:
data[data['label']=="0"]  #invalid rows

Unnamed: 0.1,Unnamed: 0,comment,label,label_id
243,- سریع رسید ۲- بسته بندی مناسب بود ۳- دور چین ...,HAPPY,0,
415,ساعت ۳۰ دقیقه طول کشید تا آورد اصلا وقت مردم ...,HAPPY,0,
2173,. سس‌هایی که در تصویر کنار سیب‌زمینی هستن نبود...,HAPPY,0,
3905,تا نوشابه سفارش داده شده بود، یکی آوردن، به پ...,HAPPY,0,
5114,روز تا تاریخ انقضای پنیرها مانده است.,HAPPY,0,
...,...,...,...,...
68501,دقیقه زودتر از زمان اعلام شده بدستم رسید. ممنون,HAPPY,0,
68759,عدد بستنی معجون کاله سفارش داده بودم که ۴ تا ...,HAPPY,0,
69168,. غذا عالی بود ۲. در عرض کمتر از نیم ساعت غذا ...,HAPPY,0,
69467,. مقدار مخلفات داخل برگر خیلی کم. ۲. کاغذ بسته...,HAPPY,0,


In [7]:
print(data.isnull().sum())

Unnamed: 0    69480
comment           0
label             0
label_id        520
dtype: int64


In [8]:
data.dropna(inplace=True, subset=['label', 'label_id'])
data.drop_duplicates(keep='first', inplace=True)
data['label_id'] = data['label_id'].astype(int)

In [9]:
data['comment_len'] = data['comment'].apply(lambda t: len(hazm.word_tokenize(t)))

In [10]:
min_max_len = data["comment_len"].min(), data["comment_len"].max()
print(f'Min: {min_max_len[0]} \tMax: {min_max_len[1]}')

Min: 2 	Max: 378


In [11]:
data_length = data['comment_len'].values
data_glt = sum([1 for length in data_length if length <= 100])
data_glt_rate = (data_glt / len(data_length)) * 100
print(f'Texts with word length of less than 100 includes {data_glt_rate:.2f}% of the whole!')

Texts with word length of less than 100 includes 99.53% of the whole!


In [12]:
data['comment_len'] = data['comment_len'].apply(lambda len_t: len_t if len_t <= 100 else None)
data = data.dropna(subset=['comment_len'])
data = data.reset_index(drop=True)

In [13]:
fig = go.Figure()

fig.add_trace(go.Histogram(
    x=data['comment_len']
))

fig.update_layout(
    title_text='Distribution of word counts within comments',
    xaxis_title_text='Word Count',
    yaxis_title_text='Frequency',
    bargap=0.2,
    bargroupgap=0.2)

fig.show()

In [14]:
fig = go.Figure()

groupby_rate = data.groupby('label_id')['label_id'].count()

fig.add_trace(go.Bar(
    x=list(sorted(groupby_rate.index)),
    y=groupby_rate.tolist(),
    text=groupby_rate.tolist(),
    textposition='auto'
))

fig.update_layout(
    title_text='Distribution of label id within comments',
    xaxis_title_text='Lable ID',
    yaxis_title_text='Frequency',
    bargap=0.2,
    bargroupgap=0.2)

fig.show()

In [15]:
normalizer = hazm.Normalizer()

In [16]:
data['normalized_comment'] = data['comment'].apply(lambda c: normalizer.normalize(c.strip()))

In [17]:
data = data[['normalized_comment','label_id']]
data.columns = ['comment', 'label_id']
data.head()

Unnamed: 0,comment,label_id
0,واقعا حیف وقت که بنویسم سرویس دهیتون شده افتضاح,1
1,قرار بود ۱ ساعته برسه ولی نیم ساعت زودتر از مو...,0
2,قیمت این مدل اصلا با کیفیتش سازگاری نداره، فقط...,1
3,عالی بود همه چه درست و به اندازه و کیفیت خوب، ...,0
4,شیرینی وانیلی فقط یک مدل بود.,0


### Data splitting


In [18]:
train, remain = train_test_split(data, test_size=0.20, random_state=1)
test, valid = train_test_split(remain, test_size=0.50, random_state=1)

train = train.reset_index(drop=True)
valid = valid.reset_index(drop=True)
test = test.reset_index(drop=True)

print(train.shape)
print(valid.shape)
print(test.shape)

(55324, 2)
(6916, 2)
(6916, 2)


### Configuration

In [19]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f'device: {device}')

train_on_gpu = torch.cuda.is_available()

if not train_on_gpu:
    print('CUDA is not available.')
else:
    print('CUDA is available!')

device: cuda:0
CUDA is available!


In [20]:
MODEL_PATH = 'HooshvareLab/bert-fa-base-uncased'
OUTPUT_PATH = '/content/bert-fa-base-uncased-sentiment-taaghceh/pytorch_model.bin'
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)

In [21]:
tokenizer = BertTokenizer.from_pretrained(MODEL_PATH)
config = BertConfig.from_pretrained(MODEL_PATH)

print(config.to_json_string())

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/1.20M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/440 [00:00<?, ?B/s]

{
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.33.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 100000
}



### Input Embeddings

In [22]:
idx = np.random.randint(0, len(train))
sample_comment = train.iloc[idx]['comment']
tokens = tokenizer.tokenize(sample_comment)
token_ids = tokenizer.convert_tokens_to_ids(tokens)

print(f'  Comment: {sample_comment}')
print(f'   Tokens: {tokenizer.convert_tokens_to_string(tokens)}')
print(f'Token IDs: {token_ids}')

  Comment: گوشت چیزبرگر خام بود و خوب پخته نشده بود
   Tokens: گوشت چیزبرگر خام بود و خوب پخته نشده بود
Token IDs: [5835, 4370, 20215, 5014, 2834, 1379, 4124, 11208, 4338, 2834]


In [23]:
encoding = tokenizer.encode_plus(
    sample_comment,
    max_length=32,
    truncation=True,
    add_special_tokens=True,
    return_token_type_ids=True,
    return_attention_mask=True,
    padding='max_length',
    return_tensors='pt', 
)

for k in encoding.keys():
    print(f'{k}:\n{encoding[k]}')

input_ids:
tensor([[    2,  5835,  4370, 20215,  5014,  2834,  1379,  4124, 11208,  4338,
          2834,     4,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0]])
token_type_ids:
tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]])
attention_mask:
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]])


### Dataset

In [25]:
class SnappFoodDataset(torch.utils.data.Dataset):

    def __init__(self, tokenizer, comments, targets=None, max_len=128):
        self.comments = comments
        self.targets = targets

        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.comments)

    def __getitem__(self, item):
        comment = str(self.comments[item])
        target = self.targets[item]

        encoding = self.tokenizer.encode_plus(
            comment,
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_len,
            return_token_type_ids=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt')

        inputs = {
            'comment': comment,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'token_type_ids': encoding['token_type_ids'].flatten(),
        }

        inputs['targets'] = torch.tensor(target, dtype=torch.long)
        return inputs


In [26]:
train_data_loader = torch.utils.data.DataLoader(SnappFoodDataset(comments=train['comment'].to_numpy(), targets=train['label_id'].to_numpy(), tokenizer=tokenizer), batch_size=16)
valid_data_loader = torch.utils.data.DataLoader(SnappFoodDataset(comments=valid['comment'].to_numpy(), targets=valid['label_id'].to_numpy(), tokenizer=tokenizer), batch_size=16)
test_data_loader = torch.utils.data.DataLoader(SnappFoodDataset(comments=test['comment'].to_numpy(), targets=test['label_id'].to_numpy(), tokenizer=tokenizer), batch_size=16)

### Defining the Model

In [27]:
class SentimentModel(nn.Module):

    def __init__(self, config):
        super(SentimentModel, self).__init__()

        self.bert = BertModel.from_pretrained(MODEL_PATH, return_dict=False)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, input_ids, attention_mask, token_type_ids):
        _, pooled_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids)

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits

In [28]:
pt_model = SentimentModel(config=config)
pt_model = pt_model.to(device)

print('pt_model', type(pt_model))

Downloading pytorch_model.bin:   0%|          | 0.00/654M [00:00<?, ?B/s]

pt_model <class '__main__.SentimentModel'>


### Training

In [29]:
def acc_and_f1(y_true, y_pred, average='weighted'):
    acc = (y_true == y_pred).mean()
    f1 = f1_score(y_true=y_true, y_pred=y_pred, average=average)
    return {
        "acc": acc,
        "f1": f1,
    }

def y_loss(y_true, y_pred, losses):
    y_true = torch.stack(y_true).cpu().detach().numpy()
    y_pred = torch.stack(y_pred).cpu().detach().numpy()
    y = [y_true, y_pred]
    loss = np.mean(losses)

    return y, loss

In [30]:
def eval_op(model, data_loader, loss_fn):
    model.eval()

    losses = []
    y_pred = []
    y_true = []

    with torch.no_grad():
        for dl in tqdm(data_loader, total=len(data_loader), desc="Evaluation... "):

            input_ids = dl['input_ids']
            attention_mask = dl['attention_mask']
            token_type_ids = dl['token_type_ids']
            targets = dl['targets']

            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            token_type_ids = token_type_ids.to(device)
            targets = targets.to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids)

            _, preds = torch.max(outputs, dim=1)

            loss = loss_fn(outputs, targets)

            losses.append(loss.item())

            y_pred.extend(preds)
            y_true.extend(targets)

    eval_y, eval_loss = y_loss(y_true, y_pred, losses)
    return eval_y, eval_loss

In [31]:
def train_op(model,
             data_loader,
             loss_fn,
             optimizer,
             scheduler,
             step=0,
             print_every_step=100,
             eval=False,
             eval_cb=None,
             eval_loss_min=np.Inf,
             eval_data_loader=None,
             clip=0.0):

    model.train()

    losses = []
    y_pred = []
    y_true = []

    for dl in tqdm(data_loader, total=len(data_loader), desc="Training... "):
        step += 1

        input_ids = dl['input_ids']
        attention_mask = dl['attention_mask']
        token_type_ids = dl['token_type_ids']
        targets = dl['targets']

        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        token_type_ids = token_type_ids.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids)

        _, preds = torch.max(outputs, dim=1)

        loss = loss_fn(outputs, targets)

        losses.append(loss.item())

        loss.backward()

        if clip > 0.0:
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=clip)

        optimizer.step()

        scheduler.step()

        y_pred.extend(preds)
        y_true.extend(targets)

        if eval:
            train_y, train_loss = y_loss(y_true, y_pred, losses)
            train_score = acc_and_f1(train_y[0], train_y[1], average='weighted')

            if step % print_every_step == 0:
                eval_y, eval_loss = eval_op(model, eval_data_loader, loss_fn)
                eval_score = acc_and_f1(eval_y[0], eval_y[1], average='weighted')

                if hasattr(eval_cb, '__call__'):
                    eval_loss_min = eval_cb(model, step, train_score, train_loss, eval_score, eval_loss, eval_loss_min)

    train_y, train_loss = y_loss(y_true, y_pred, losses)

    return train_y, train_loss, step, eval_loss_min

In [32]:
EPOCHS = 3
optimizer = AdamW(pt_model.parameters(), lr=5e-5)
total_steps = len(train_data_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

loss_fn = nn.CrossEntropyLoss()

step = 0
eval_loss_min = np.Inf
history = collections.defaultdict(list)

In [33]:
def eval_callback(epoch, epochs, output_path):
    def eval_cb(model, step, train_score, train_loss, eval_score, eval_loss, eval_loss_min):
        statement = ''
        statement += 'Epoch: {}/{}...'.format(epoch, epochs)
        statement += 'Step: {}...'.format(step)

        statement += 'Train Loss: {:.6f}...'.format(train_loss)
        statement += 'Train Acc: {:.3f}...'.format(train_score['acc'])

        statement += 'Valid Loss: {:.6f}...'.format(eval_loss)
        statement += 'Valid Acc: {:.3f}...'.format(eval_score['acc'])

        print(statement)

        if eval_loss <= eval_loss_min:
            print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
                eval_loss_min,
                eval_loss))

            torch.save(model.state_dict(), output_path)
            eval_loss_min = eval_loss

        return eval_loss_min


    return eval_cb

In [34]:
for epoch in tqdm(range(1, EPOCHS + 1), desc="Epochs... "):
    train_y, train_loss, step, eval_loss_min = train_op(
        model=pt_model,
        data_loader=train_data_loader,
        loss_fn=loss_fn,
        optimizer=optimizer,
        scheduler=scheduler,
        step=step,
        print_every_step=1000,
        eval=True,
        eval_cb=eval_callback(epoch, EPOCHS, OUTPUT_PATH),
        eval_loss_min=eval_loss_min,
        eval_data_loader=valid_data_loader)

    train_score = acc_and_f1(train_y[0], train_y[1], average='weighted')

    eval_y, eval_loss = eval_op(
        model=pt_model,
        data_loader=valid_data_loader,
        loss_fn=loss_fn)

    eval_score = acc_and_f1(eval_y[0], eval_y[1], average='weighted')

    history['train_acc'].append(train_score['acc'])
    history['train_loss'].append(train_loss)
    history['val_acc'].append(eval_score['acc'])
    history['val_loss'].append(eval_loss)

Epochs... :   0%|          | 0/3 [00:00<?, ?it/s]

Training... :   0%|          | 0/3458 [00:00<?, ?it/s]

Evaluation... :   0%|          | 0/433 [00:00<?, ?it/s]

Epoch: 1/3...Step: 1000...Train Loss: 0.371943...Train Acc: 0.839...Valid Loss: 0.381707...Valid Acc: 0.855...
Validation loss decreased (inf --> 0.381707).  Saving model ...


Evaluation... :   0%|          | 0/433 [00:00<?, ?it/s]

Epoch: 1/3...Step: 2000...Train Loss: 0.349686...Train Acc: 0.851...Valid Loss: 0.315084...Valid Acc: 0.870...
Validation loss decreased (0.381707 --> 0.315084).  Saving model ...


Evaluation... :   0%|          | 0/433 [00:00<?, ?it/s]

Epoch: 1/3...Step: 3000...Train Loss: 0.340710...Train Acc: 0.856...Valid Loss: 0.318467...Valid Acc: 0.865...


Evaluation... :   0%|          | 0/433 [00:00<?, ?it/s]

Training... :   0%|          | 0/3458 [00:00<?, ?it/s]

Evaluation... :   0%|          | 0/433 [00:00<?, ?it/s]

Epoch: 2/3...Step: 4000...Train Loss: 0.294543...Train Acc: 0.877...Valid Loss: 0.314284...Valid Acc: 0.868...
Validation loss decreased (0.315084 --> 0.314284).  Saving model ...


Evaluation... :   0%|          | 0/433 [00:00<?, ?it/s]

Epoch: 2/3...Step: 5000...Train Loss: 0.274472...Train Acc: 0.888...Valid Loss: 0.349641...Valid Acc: 0.868...


Evaluation... :   0%|          | 0/433 [00:00<?, ?it/s]

Epoch: 2/3...Step: 6000...Train Loss: 0.265856...Train Acc: 0.893...Valid Loss: 0.327696...Valid Acc: 0.870...


Evaluation... :   0%|          | 0/433 [00:00<?, ?it/s]

Training... :   0%|          | 0/3458 [00:00<?, ?it/s]

Evaluation... :   0%|          | 0/433 [00:00<?, ?it/s]

Epoch: 3/3...Step: 7000...Train Loss: 0.247675...Train Acc: 0.895...Valid Loss: 0.351747...Valid Acc: 0.869...


Evaluation... :   0%|          | 0/433 [00:00<?, ?it/s]

Epoch: 3/3...Step: 8000...Train Loss: 0.200101...Train Acc: 0.923...Valid Loss: 0.388242...Valid Acc: 0.868...


Evaluation... :   0%|          | 0/433 [00:00<?, ?it/s]

Epoch: 3/3...Step: 9000...Train Loss: 0.173868...Train Acc: 0.935...Valid Loss: 0.372067...Valid Acc: 0.864...


Evaluation... :   0%|          | 0/433 [00:00<?, ?it/s]

Epoch: 3/3...Step: 10000...Train Loss: 0.162670...Train Acc: 0.940...Valid Loss: 0.386659...Valid Acc: 0.867...


Evaluation... :   0%|          | 0/433 [00:00<?, ?it/s]

### Prediction

In [35]:
def predict(model, test_data_loader):
    predictions = []
    prediction_probs = []

    model.eval()
    with torch.no_grad():
        for dl in tqdm(test_data_loader, position=0):
            input_ids = dl['input_ids']
            attention_mask = dl['attention_mask']
            token_type_ids = dl['token_type_ids']

            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            token_type_ids = token_type_ids.to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids)

            _, preds = torch.max(outputs, dim=1)

            predictions.extend(preds)
            prediction_probs.extend(F.softmax(outputs, dim=1))

    predictions = torch.stack(predictions).cpu().detach().numpy()
    prediction_probs = torch.stack(prediction_probs).cpu().detach().numpy()

    return predictions, prediction_probs

In [36]:
preds, probs = predict(pt_model, test_data_loader)
print(preds.shape, probs.shape)

  0%|          | 0/433 [00:00<?, ?it/s]

(6916,) (6916, 2)


In [37]:
y_test, y_pred = test['label_id'].values, preds
label_list = ['HAPPY', 'SAD']
print(classification_report(y_test, y_pred, target_names=label_list))

              precision    recall  f1-score   support

       HAPPY       0.89      0.84      0.87      3560
         SAD       0.84      0.89      0.87      3356

    accuracy                           0.87      6916
   macro avg       0.87      0.87      0.87      6916
weighted avg       0.87      0.87      0.87      6916

