In [None]:
# Mount Google Drive at /content/drive (Colab only); the dataset JSON files
# opened later in this notebook are read from /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


<div style="text-align: right" dir="rtl"> 
<h2>تمرین سری چهارم</h2>
<h3>مثبت منفی بودن خبر</h3>

این notebook مربوط به تسک طبقه‌بندی اخبار بر اساس احساس خبر به سه کلاس مثبت، منفی و خنثی روی دیتاست downsampleشده‌ی دیتاست اصلی است که در آن از هر کلاس، 400 داده در اختیار داریم. 
مشابه همین روند برای دیتاست back_tran_fa_en_fa نیز انجام شده است که از حیث پیاده‌سازی هیچ تفاوتی با این نسخه نداشته و فقط داده‌ی آن تغییر یافته است.
کد مربوط به دیتاست مذکور نیز در notebookی به نام upsampled.ipynb قابل مشاهده است.
</div>

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 5.0 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 14.3 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 48.3 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 67.2 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling P

In [None]:
!pip install tqdm

In [None]:
import pandas as pd
import csv
import re
import json
import os
import numpy as np

import collections
import numpy as np
from tqdm import tqdm
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix


from transformers import BertConfig, BertTokenizer
from transformers import BertModel

from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

import torch
import torch.nn as nn
import torch.nn.functional as F

<div style="text-align: right" dir="rtl"> 
<h2>آماده‌سازی داده</h2>
در بخش آماده‌سازی داده‌ها به موارد زیر پرداخته می‌شود:
<ol>
  <li>خواندن داده‌ها</li>
  <li>استخراج تگ نهایی برای داده‌های eval و test</li>
  <li>پیش‌پردازش داده‌ها</li>
  <li>آماده کردن داده‌ها به فرمتی که مورد نیاز مدل است</li>
</ol>
</div>

In [None]:
# Load the annotated corpus; its "eval" and "test" splits are used further down.
with open('/content/drive/MyDrive/dataset_annotated_sentiment.json', encoding='utf-8') as corpus_file:
    data = json.load(corpus_file)

In [None]:
# Load the down-sampled corpus; its "train" split is used for fine-tuning.
with open('/content/drive/MyDrive/down_sampled.json', encoding='utf-8') as downsampled_file:
    downsampled_data = json.load(downsampled_file)

In [None]:
# Maps every Persian (U+06F0-U+06F9) and Arabic-Indic (U+0660-U+0669) digit to
# its ASCII counterpart. The hand-written table only covered the Arabic-Indic
# digits 4, 5 and 6, so the other seven Arabic-Indic digits were left untouched.
persian_to_english = {
    chr(block_start + value): str(value)
    for block_start in (0x06F0, 0x0660)
    for value in range(10)
}

_DIGIT_TABLE = str.maketrans(persian_to_english)
# Characters dropped outright: underscore, double quote, guillemets, keshida.
_DELETE_TABLE = str.maketrans('', '', '_"«»ـ')

_URL_RE = re.compile(r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))")
_LATIN_OR_TAB_RE = re.compile(r"[A-Za-z\t]")
# Bracketed references such as [۱] or [12]; ASCII, Persian and Arabic-Indic digits.
_REFERENCE_RE = re.compile(r"\[[0-9\u06F0-\u06F9\u0660-\u0669]*\]")


def preprocess(text):
    """Normalise one news text before tokenisation.

    Steps, in order:
      1. remove URLs;
      2. remove ``_``, ``"``, ``«``, ``»`` and the keshida character
         (used for stretching words, e.g. تـــابع);
      3. remove Latin letters and tabs;
      4. replace bracketed numeric references such as ``[۱]`` with a space;
      5. convert Persian and Arabic-Indic digits to ASCII digits.

    Args:
        text: the raw text (``str``).

    Returns:
        The cleaned text (``str``).
    """
    text = _URL_RE.sub('', text)
    text = text.translate(_DELETE_TABLE)
    text = _LATIN_OR_TAB_RE.sub('', text)
    # References are removed before digit conversion, so the pattern has to
    # recognise non-ASCII digits itself.
    text = _REFERENCE_RE.sub(' ', text)
    return text.translate(_DIGIT_TABLE)




In [None]:
def get_eval_final_tag():
    """Derive one final sentiment tag per article of the ``data["eval"]`` split.

    Articles with an empty annotation list are skipped and counted. For the
    rest, the text is preprocessed (and written back onto the article dict),
    and the annotations are summed as votes: 'خنثی' counts 0, 'منفی' counts -1
    and anything else counts +1. A negative sum yields 'منفی', zero yields
    'خنثی', a positive sum yields 'مثبت'.

    Returns:
        (rows, skipped): ``rows`` is a list of
        ``(text, annotations, final_tag)`` tuples and ``skipped`` is the
        number of articles that had no annotations.
    """
    skipped = 0
    rows = []
    for article in data["eval"]:
        votes = article["annotations"]
        if votes == []:
            skipped += 1
            continue

        article["text"] = preprocess(article["text"])
        balance = sum(
            0 if vote == 'خنثی' else (-1 if vote == 'منفی' else 1)
            for vote in votes
        )
        final_tag = 'منفی' if balance < 0 else ('خنثی' if balance == 0 else 'مثبت')
        rows.append((article["text"], votes, final_tag))

    return rows, skipped

In [None]:
# Build the labelled validation rows; `not_tagged` is the number of eval
# articles skipped because they had no annotations.
eval_data, not_tagged = get_eval_final_tag()

In [None]:
def get_test_final_tag():
    """Derive one final sentiment tag per article of the ``data["test"]`` split.

    Articles with an empty annotation list are skipped and counted. For the
    rest, the text is preprocessed (and written back onto the article dict),
    and the annotations are summed as votes: 'خنثی' counts 0, 'منفی' counts -1
    and anything else counts +1. A negative sum yields 'منفی', zero yields
    'خنثی', a positive sum yields 'مثبت'.

    Returns:
        (rows, skipped): ``rows`` is a list of
        ``(text, annotations, final_tag)`` tuples and ``skipped`` is the
        number of articles that had no annotations.
    """
    skipped = 0
    rows = []
    for article in data["test"]:
        votes = article["annotations"]
        if votes == []:
            skipped += 1
            continue

        article["text"] = preprocess(article["text"])
        balance = sum(
            0 if vote == 'خنثی' else (-1 if vote == 'منفی' else 1)
            for vote in votes
        )
        final_tag = 'منفی' if balance < 0 else ('خنثی' if balance == 0 else 'مثبت')
        rows.append((article["text"], votes, final_tag))

    return rows, skipped

In [None]:
# Build the labelled test rows.
# NOTE(review): this rebinds `not_tagged`, so the count returned for the eval
# split by the earlier cell is no longer available afterwards.
test_data, not_tagged = get_test_final_tag()

In [None]:
# Clean every down-sampled training article (the cleaned text is written back
# onto the article dict) and collect (text, annotations, tag) rows. The value
# under the "fa" key is what ends up in the 'final_tag' CSV column.
downsampled_train_data = []
for article in downsampled_data["train"]:
    cleaned_text = preprocess(article["text"])
    article["text"] = cleaned_text
    downsampled_train_data.append((cleaned_text, article["annotations"], article["fa"]))

In [None]:
# Persist the training rows as CSV: one header row followed by the data rows.
train_header = ['text', 'annotations', 'final_tag']
with open('downsampled_train_data.csv', 'w', newline='', encoding='utf-8') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerows([train_header, *downsampled_train_data])

In [None]:
# Persist the validation rows as CSV: one header row followed by the data rows.
eval_header = ['text', 'annotations', 'final_tag']

with open('eval_data.csv', 'w', newline='', encoding='utf-8') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerows([eval_header, *eval_data])

In [None]:
# Persist the test rows as CSV: one header row followed by the data rows.
test_header = ['text', 'annotations', 'final_tag']

with open('test_data.csv', 'w', newline='', encoding='utf-8') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerows([test_header, *test_data])

In [None]:
# Reload the training split from CSV. The `annotations` column was written as
# the text form of a Python list, so it is read back as a plain string.
downsampled_df = pd.read_csv('downsampled_train_data.csv')
downsampled_df.head()

Unnamed: 0,text,annotations,final_tag
0,آتش سوزی کارخانه جمیل نخ گسترده است / اعزام نی...,"['منفی', 'منفی', 'منفی']",منفی
1,تجهیزات باکیفیت صنعتی آشپزخانه و کافی شاپ\n\nب...,"['مثبت', 'مثبت', 'خنثی']",مثبت
2,برگزاری جشنواره رسانه ابوذر در دی ماه سال جاری...,"['مثبت', 'خنثی', 'خنثی']",مثبت
3,افزایش 80 درصدی فروش سلاح در آمریکا در ژانویه ...,"['منفی', 'منفی', 'منفی']",منفی
4,بازداشت دو تروریست که در بمب گذاری مسیر زائران...,"['خنثی', 'مثبت']",مثبت


In [None]:
# Reload the validation split written by the eval CSV cell and preview it.
eval_df = pd.read_csv('eval_data.csv')
eval_df.head()

Unnamed: 0,text,annotations,final_tag
0,نجات از عوارض غذا های فرآوری شده با امگا 3؟\n\...,"['خنثی', 'منفی', 'خنثی']",منفی
1,معاون رئیس جمهور: دولت از ظرفیت نهادهای انقلاب...,"['مثبت', 'مثبت', 'مثبت']",مثبت
2,شرکت های دانش بنیان ضربات اقتصادی همه گیری کرو...,"['مثبت', 'مثبت', 'مثبت']",مثبت
3,الهام علی اف: ایران کشور دوست و برادر جمهوری...,"['مثبت', 'مثبت', 'مثبت']",مثبت
4,مرگ یک کارگر چاه کن بر اثر ریزش چاه\n\nبه گزار...,"['منفی', 'منفی', 'منفی']",منفی


In [None]:
# Reload the test split written by the test CSV cell and preview it.
test_df = pd.read_csv('test_data.csv')
test_df.head()

Unnamed: 0,text,annotations,final_tag
0,رکوردشکنی رمزارز دوم\n\nجریان مداوم اخبار دربا...,"['مثبت', 'خنثی', 'مثبت']",مثبت
1,نشست گستره قلمرو و حکم حاکم در مذهب امامیه و ح...,"['خنثی', 'مثبت', 'خنثی']",مثبت
2,اعلام آمادگی دفتر آیت الله سیستانی برای کمک به...,"['خنثی', 'خنثی']",خنثی
3,چشم جهان نمای ایران با منظومه سازی ماهواره ها ...,"['مثبت', 'مثبت']",مثبت
4,اجرای 2 پروژه از طرح های پتروپالایش در خوزستان...,"['مثبت', 'مثبت', 'مثبت']",مثبت


In [None]:
# One sub-frame per sentiment class (positive, neutral, negative).
downsampled_df_positive = downsampled_df.loc[downsampled_df['final_tag'].eq('مثبت')]
downsampled_df_neutral = downsampled_df.loc[downsampled_df['final_tag'].eq('خنثی')]
downsampled_df_negative = downsampled_df.loc[downsampled_df['final_tag'].eq('منفی')]


<div style="text-align: right" dir="rtl"> 
می‌بینیم که برای هر یک از کلاس‌های مثبت، منفی و خنثی 400 نمونه داریم.
</div>

In [None]:
print(len(downsampled_df_positive),len(downsampled_df_neutral),len(downsampled_df_negative))

400 400 400


In [None]:
# Re-assemble the three class subsets, shuffle the rows and renumber the index.
# Fix: the shuffle used to sample `downsampled_df`, which silently threw away
# the concatenated frame built on the previous line.
downsampled_train_df = (
    pd.concat([downsampled_df_negative, downsampled_df_positive, downsampled_df_neutral])
    .sample(frac=1)
    .reset_index(drop=True)
)
downsampled_train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200 entries, 0 to 1199
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   text         1200 non-null   object
 1   annotations  1200 non-null   object
 2   final_tag    1200 non-null   object
dtypes: object(3)
memory usage: 28.2+ KB


In [None]:
# NOTE(review): head() is not the last expression of the cell, so its result
# is computed but never displayed.
downsampled_train_df.head()
# Save the shuffled training frame; to_csv writes the index as well by default.
downsampled_train_df.to_csv('downsampled_balanced_data.csv', encoding='utf-8')


In [None]:
# Plain-Python text/label lists for each split.
# Fix: y_train used to be taken from the unshuffled `downsampled_df` while
# x_train came from the shuffled `downsampled_train_df`, so the labels did not
# line up with the texts. Both now come from the same frame.
x_train, y_train = downsampled_train_df['text'].values.tolist(), downsampled_train_df['final_tag'].values.tolist()
x_valid, y_valid = eval_df['text'].values.tolist(), eval_df['final_tag'].values.tolist()
x_test, y_test = test_df['text'].values.tolist(), test_df['final_tag'].values.tolist()

In [None]:
print(downsampled_df.shape)
print(eval_df.shape)
print(test_df.shape)

(1200, 3)
(75, 3)
(75, 3)


<div style="text-align: right" dir="rtl"> 
<h2>پیاده‌سازی مدل</h2>
کد پایه‌ای که برای پیاده‌سازی این مدل استفاده شده است، فایل notebookیست که در بخش NLP Task Tutorials گیت‌هاب مدل parsbert موجود است. 
    (https://github.com/hooshvare/parsbert#nlp-tasks-tutorial--hugs)
    <div>
    لینک کد:
    https://colab.research.google.com/github/hooshvare/parsbert/blob/master/notebooks/Taaghche_Sentiment_Analysis.ipynb
    </div>
در این پیاده‌سازی از کتابخانه‌ی transformers و فریم‌ورک pytorch استفاده شده است.
</div>

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Pick the compute device once; the training/evaluation helpers move every
# batch to `device`.
train_on_gpu = torch.cuda.is_available()
device = torch.device("cuda:0" if train_on_gpu else "cpu")
print(f'device: {device}')

if train_on_gpu:
    print('CUDA is available!  Training on GPU ...')
else:
    print('CUDA is not available.  Training on CPU ...')

device: cuda:0
CUDA is available!  Training on GPU ...


In [None]:
# general config
# MAX_LEN is the token length every text is truncated/padded to by the data loaders.
MAX_LEN = 128
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 16
TEST_BATCH_SIZE = 16

EPOCHS = 3
# Passed to train_op as `print_every_step`: in-epoch validation (and the
# checkpoint callback) only runs when the global step is a multiple of it.
# NOTE(review): the recorded run has 75 batches per epoch (225 steps in total),
# so with 1000 that path never fires and no checkpoint is written.
EEVERY_EPOCH = 1000
LEARNING_RATE = 1e-5
# train_op only clips gradients when clip > 0.0, so 0.0 disables clipping.
CLIP = 0.0

MODEL_NAME_OR_PATH = 'HooshvareLab/bert-fa-base-uncased'
OUTPUT_PATH = '/content/bert-fa-base-uncased-sentiment-hw4/pytorch_model.bin'
# Make sure the checkpoint directory exists before anything tries to save into it.
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)

In [None]:
# Sentiment label <-> class index (negative=0, positive=1, neutral=2).
# id2label is derived from label2id so the two cannot drift apart.
label2id = {'منفی': 0, 'مثبت': 1, 'خنثی': 2}
id2label = {index: label for label, index in label2id.items()}



In [None]:
# Tokenizer and model configuration for the pretrained checkpoint; the label
# maps are attached to the config, which is what gives it three labels.
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME_OR_PATH)
config = BertConfig.from_pretrained(
    MODEL_NAME_OR_PATH,
    label2id=label2id,
    id2label=id2label,
)

print(config.to_json_string())

Downloading:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440 [00:00<?, ?B/s]

{
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "\u0645\u0646\u0641\u06cc",
    "1": "\u0645\u062b\u0628\u062a",
    "2": "\u062e\u0646\u062b\u06cc"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "\u062e\u0646\u062b\u06cc": 2,
    "\u0645\u062b\u0628\u062a": 1,
    "\u0645\u0646\u0641\u06cc": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.20.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 100000
}



In [None]:
# Pick one random training row to illustrate tokenisation in the next cells.
idx = np.random.randint(0, len(downsampled_train_df))
sample_text, sample_tag = downsampled_train_df.iloc[idx][['text', 'final_tag']]

print(f'Sample: \n{sample_text}\n{sample_tag}')

Sample: 
طرز تهیه پد تای، غذای معروف تایلندی

پد تای که اکثر گردشگران طعم آن را امتحان کرده اند، با روش و مواد مختلف و ساده تهیه و آماده می شود. میگو: 12 عددبادام زمینی: 20 عددتمرهندی: 4 بستهتخم مرغ: 1 عددپیاز: 1 عددجوانه ماش: 1 لیوانفتوچینی پخته: 3 لیوانسس سویا: 3 قاشق غذاخوریساقه سیر: 3 لیوانشکر: 2 قاشق غذاخوریآب لیموترش: 3 قاشق غذاخوریروغن: 2 قاشق غذاخوریفلفل قرمز: به مقدار لازم
خنثی


In [None]:
# Tokenise the sample and map the tokens to vocabulary ids.
tokens = tokenizer.tokenize(sample_text)
token_ids = tokenizer.convert_tokens_to_ids(tokens)

# The "Tokens" line shows the tokens joined back into a single string.
print(
    f'  Text: {sample_text}\n',
    f'   Tokens: {tokenizer.convert_tokens_to_string(tokens)}\n',
    f'Token IDs: {token_ids}',
    sep='\n',
)

  Text: طرز تهیه پد تای، غذای معروف تایلندی

پد تای که اکثر گردشگران طعم آن را امتحان کرده اند، با روش و مواد مختلف و ساده تهیه و آماده می شود. میگو: 12 عددبادام زمینی: 20 عددتمرهندی: 4 بستهتخم مرغ: 1 عددپیاز: 1 عددجوانه ماش: 1 لیوانفتوچینی پخته: 3 لیوانسس سویا: 3 قاشق غذاخوریساقه سیر: 3 لیوانشکر: 2 قاشق غذاخوریآب لیموترش: 3 قاشق غذاخوریروغن: 2 قاشق غذاخوریفلفل قرمز: به مقدار لازم

   Tokens: طرز تهیه پد تای ، غذای معروف تایلندی پد تای که اکثر گردشگران طعم ان را امتحان کرده اند ، با روش و مواد مختلف و ساده تهیه و اماده می شود . میگو : [UNK] عددبادام زمینی : [UNK] عددتمرهندی : [UNK] بستهتخم مرغ : [UNK] عددپیاز : [UNK] عددجوانه ماش : [UNK] لیوانفتوچینی پخته : [UNK] لیوانسس سویا : [UNK] قاشق غذاخوریساقه سیر : [UNK] لیوانشکر : [UNK] قاشق غذاخوریاب لیموترش : [UNK] قاشق غذاخوریروغن : [UNK] قاشق غذاخوریفلفل قرمز : به مقدار لازم

Token IDs: [8274, 4211, 12223, 4075, 1348, 7037, 4387, 17245, 12223, 4075, 2800, 5378, 6679, 7773, 2808, 2803, 7216, 3027, 3145, 1348, 2799, 3541, 1379, 3725, 3431, 1

In [None]:
# Encode the sample into fixed-length (32-token) model inputs.
encoding = tokenizer.encode_plus(
    sample_text,
    add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
    truncation=True,
    max_length=32,
    padding='max_length',
    return_token_type_ids=True,
    return_attention_mask=True,
    return_tensors='pt',  # Return PyTorch tensors
)

print(f'Keys: {encoding.keys()}\n')
for field_name, field_tensor in encoding.items():
    print(f'{field_name}:\n{field_tensor}')

Keys: dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

input_ids:
tensor([[    2,  8274,  4211, 12223,  4075,  1348,  7037,  4387, 17245, 12223,
          4075,  2800,  5378,  6679,  7773,  2808,  2803,  7216,  3027,  3145,
          1348,  2799,  3541,  1379,  3725,  3431,  1379,  4613,  4211,  1379,
          4788,     4]])
token_type_ids:
tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]])
attention_mask:
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1]])


In [None]:
class newsDataset(torch.utils.data.Dataset):
    """PyTorch dataset that tokenises news texts on access.

    Args:
        tokenizer: object exposing ``encode_plus(text, ...)`` that returns
            ``input_ids``, ``attention_mask`` and ``token_type_ids`` tensors.
        texts: indexable collection of texts.
        targets: optional list / numpy array of labels aligned with ``texts``.
            Anything else (e.g. ``None``) means "no targets".
        label_list: optional list / tuple / numpy array of class names; the
            position of a name is its class index.
        max_len: length every text is truncated/padded to.
    """

    def __init__(self, tokenizer, texts, targets=None, label_list=None, max_len=128):
        self.texts = texts
        self.targets = targets
        self.has_target = isinstance(targets, (list, np.ndarray))

        self.tokenizer = tokenizer
        self.max_len = max_len

        # Keys are stringified because __getitem__ looks targets up by str().
        # Tuples and arrays are accepted too (they used to be silently ignored).
        if isinstance(label_list, (list, tuple, np.ndarray)):
            self.label_map = {str(label): i for i, label in enumerate(label_list)}
        else:
            self.label_map = {}

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        """Return the encoded inputs (plus ``targets`` when available) for one row."""
        text = str(self.texts[item])

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_len,
            return_token_type_ids=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt')

        inputs = {
            'text': text,
            # encode_plus returns (1, max_len) tensors; flatten to (max_len,)
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'token_type_ids': encoding['token_type_ids'].flatten(),
        }

        if self.has_target:
            raw_target = self.targets[item]
            # Fix: fall back to the raw value rather than its string form, so
            # targets that are already class indices can be turned into a tensor.
            target = self.label_map.get(str(raw_target), raw_target)
            inputs['targets'] = torch.tensor(target, dtype=torch.long)

        return inputs


def create_data_loader(x, y, tokenizer, max_len, batch_size, label_list):
    """Wrap texts and optional labels in a ``newsDataset`` and batch it.

    Args:
        x: texts.
        y: labels aligned with ``x``, or ``None`` when there are none.
        tokenizer: tokenizer handed to the dataset.
        max_len: token length each text is truncated/padded to.
        batch_size: number of rows per batch.
        label_list: class names whose positions are the class indices.

    Returns:
        A ``torch.utils.data.DataLoader`` over the dataset; no shuffle or
        sampler argument is passed.
    """
    return torch.utils.data.DataLoader(
        newsDataset(tokenizer, x, targets=y, label_list=label_list, max_len=max_len),
        batch_size=batch_size,
    )

In [None]:
# Class names in index order; this matches label2id (negative=0, positive=1, neutral=2).
label_list = ['منفی', 'مثبت', 'خنثی']

train_data_loader = create_data_loader(
    downsampled_train_df['text'].to_numpy(),
    downsampled_train_df['final_tag'].to_numpy(),
    tokenizer, MAX_LEN, TRAIN_BATCH_SIZE, label_list)
valid_data_loader = create_data_loader(
    eval_df['text'].to_numpy(),
    eval_df['final_tag'].to_numpy(),
    tokenizer, MAX_LEN, VALID_BATCH_SIZE, label_list)
# The test loader is built without targets.
test_data_loader = create_data_loader(
    test_df['text'].to_numpy(),
    None,
    tokenizer, MAX_LEN, TEST_BATCH_SIZE, label_list)

In [None]:
class SentimentModel(nn.Module):
    """Pretrained BERT encoder followed by dropout and a linear classifier.

    The encoder is loaded from ``MODEL_NAME_OR_PATH``; ``config`` supplies the
    dropout probability, the hidden size and the number of labels.
    """

    def __init__(self, config):
        super().__init__()

        # return_dict=False makes the encoder return a plain tuple.
        self.bert = BertModel.from_pretrained(MODEL_NAME_OR_PATH, return_dict=False)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return the unnormalised class scores (logits) for a batch."""
        _sequence_output, pooled = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids)

        return self.classifier(self.dropout(pooled))

In [None]:
# Build the classifier and move it to the selected device in one step.
pt_model = SentimentModel(config=config).to(device)

print('pt_model', type(pt_model))

Downloading:   0%|          | 0.00/624M [00:00<?, ?B/s]

Some weights of the model checkpoint at HooshvareLab/bert-fa-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


pt_model <class '__main__.SentimentModel'>


In [None]:
!pip install tqdm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
def simple_accuracy(y_true, y_pred):
    """Return the fraction of positions where ``y_pred`` equals ``y_true``.

    Both arguments must support elementwise ``==`` whose result has a
    ``.mean()`` method (e.g. numpy arrays).
    """
    matches = y_true == y_pred
    return matches.mean()

def acc_and_f1(y_true, y_pred, average='weighted'):
    """Return ``{"acc": accuracy, "f1": F1 score}`` for the given labels.

    ``average`` is forwarded to ``f1_score``.
    """
    return dict(
        acc=simple_accuracy(y_true, y_pred),
        f1=f1_score(y_true=y_true, y_pred=y_pred, average=average),
    )

def y_loss(y_true, y_pred, losses):
    """Collapse per-sample tensors and per-batch losses into summary values.

    Args:
        y_true: list of label tensors.
        y_pred: list of prediction tensors.
        losses: list of batch loss values.

    Returns:
        ``([y_true_array, y_pred_array], mean_loss)`` where the arrays are the
        stacked tensors converted to numpy.
    """
    def as_numpy(tensors):
        return torch.stack(tensors).cpu().detach().numpy()

    return [as_numpy(y_true), as_numpy(y_pred)], np.mean(losses)


def eval_op(model, data_loader, loss_fn):
    """Run one gradient-free pass over ``data_loader``.

    The model is switched to eval mode and is left in that mode on return.

    Args:
        model: module called with ``input_ids``, ``attention_mask`` and
            ``token_type_ids`` that returns per-class scores.
        data_loader: yields batches that contain those inputs plus ``targets``.
        loss_fn: callable ``loss_fn(outputs, targets)``.

    Returns:
        ``([y_true, y_pred], mean_loss)`` as produced by ``y_loss``.
    """
    model.eval()

    batch_losses, predicted, expected = [], [], []

    with torch.no_grad():
        for batch in tqdm(data_loader, total=len(data_loader), desc="Evaluation... "):
            # move the batch to the selected device
            targets = batch['targets'].to(device)
            scores = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
                token_type_ids=batch['token_type_ids'].to(device))

            # index of the highest score is the predicted class
            batch_preds = torch.max(scores, dim=1)[1]

            batch_losses.append(loss_fn(scores, targets).item())
            predicted.extend(batch_preds)
            expected.extend(targets)

    return y_loss(expected, predicted, batch_losses)


def train_op(model, 
             data_loader, 
             loss_fn, 
             optimizer, 
             scheduler, 
             step=0, 
             print_every_step=100, 
             eval=False,
             eval_cb=None,
             eval_loss_min=np.inf,
             eval_data_loader=None, 
             clip=0.0):
    """Train ``model`` for one pass over ``data_loader``.

    Args:
        model: module called with ``input_ids``, ``attention_mask`` and
            ``token_type_ids`` that returns per-class scores.
        data_loader: yields batches containing those inputs plus ``targets``.
        loss_fn: callable ``loss_fn(outputs, targets)``.
        optimizer: optimizer stepped once per batch.
        scheduler: learning-rate scheduler stepped once per batch.
        step: global step count before this pass; incremented per batch.
        print_every_step: when ``eval`` is true, validation runs every time
            the global step is a multiple of this value.
        eval: enable the periodic validation described above.
        eval_cb: optional callable invoked after each periodic validation as
            ``eval_cb(model, step, train_score, train_loss, eval_score,
            eval_loss, eval_loss_min)``; its return value becomes the new
            ``eval_loss_min``.
        eval_loss_min: best validation loss seen so far.
        eval_data_loader: loader used for the periodic validation.
        clip: max gradient norm; values <= 0 disable clipping.

    Returns:
        ``(train_y, train_loss, step, eval_loss_min)`` where ``train_y`` is
        ``[y_true, y_pred]`` for the whole pass and ``train_loss`` the mean
        batch loss.
    """
    model.train()

    losses = []
    y_pred = []
    y_true = []

    for dl in tqdm(data_loader, total=len(data_loader), desc="Training... "):
        step += 1

        # move tensors to GPU if CUDA is available
        input_ids = dl['input_ids'].to(device)
        attention_mask = dl['attention_mask'].to(device)
        token_type_ids = dl['token_type_ids'].to(device)
        targets = dl['targets'].to(device)

        # clear the gradients of all optimized variables
        optimizer.zero_grad()

        # compute predicted outputs by passing inputs to the model
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids)
        
        # index of the highest score is the predicted class
        _, preds = torch.max(outputs, dim=1)

        # calculate the batch loss
        loss = loss_fn(outputs, targets)

        # accumulate all the losses
        losses.append(loss.item())

        # compute gradient of the loss with respect to model parameters
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        if clip > 0.0:
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=clip)

        # perform optimization step
        optimizer.step()

        # perform scheduler step
        scheduler.step()

        y_pred.extend(preds)
        y_true.extend(targets)

        if eval and step % print_every_step == 0:
            # Running train metrics are only needed when a report is due;
            # recomputing them on every batch re-stacked all predictions so far.
            train_y, train_loss = y_loss(y_true, y_pred, losses)
            train_score = acc_and_f1(train_y[0], train_y[1], average='weighted')

            eval_y, eval_loss = eval_op(model, eval_data_loader, loss_fn)
            eval_score = acc_and_f1(eval_y[0], eval_y[1], average='weighted')

            if callable(eval_cb):
                eval_loss_min = eval_cb(model, step, train_score, train_loss, eval_score, eval_loss, eval_loss_min)

            # Fix: eval_op leaves the model in eval mode; switch back so the
            # remaining batches train with dropout enabled.
            model.train()

    train_y, train_loss = y_loss(y_true, y_pred, losses)

    return train_y, train_loss, step, eval_loss_min

In [None]:
import collections
import numpy as np
from tqdm import tqdm
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

# Optimizer over every model parameter (encoder and classifier head).
optimizer = AdamW(pt_model.parameters(), lr=LEARNING_RATE, correct_bias=False)
# The scheduler is stepped once per batch, so the schedule spans all batches
# of all epochs; the learning rate decays linearly with no warm-up.
total_steps = len(train_data_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

loss_fn = nn.CrossEntropyLoss()

step = 0
# np.inf is the canonical spelling; the np.Inf alias was removed in NumPy 2.0.
eval_loss_min = np.inf
# Per-epoch metrics, filled in by the training loop.
history = collections.defaultdict(list)


def eval_callback(epoch, epochs, output_path):
    """Build the reporting/checkpoint callback for one epoch.

    Args:
        epoch: current epoch number, used in the report line.
        epochs: total number of epochs, used in the report line.
        output_path: file the model ``state_dict`` is saved to.

    Returns:
        ``eval_cb(model, step, train_score, train_loss, eval_score, eval_loss,
        eval_loss_min)``: prints one progress line, saves the model when
        ``eval_loss <= eval_loss_min``, and returns the (possibly updated)
        minimum validation loss.
    """
    def eval_cb(model, step, train_score, train_loss, eval_score, eval_loss, eval_loss_min):
        report = ''.join([
            'Epoch: {}/{}...'.format(epoch, epochs),
            'Step: {}...'.format(step),
            'Train Loss: {:.6f}...'.format(train_loss),
            'Train Acc: {:.3f}...'.format(train_score['acc']),
            'Valid Loss: {:.6f}...'.format(eval_loss),
            'Valid Acc: {:.3f}...'.format(eval_score['acc']),
        ])
        print(report)

        improved = eval_loss <= eval_loss_min
        if not improved:
            return eval_loss_min

        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
            eval_loss_min,
            eval_loss))
        torch.save(model.state_dict(), output_path)
        return eval_loss

    return eval_cb


# One training pass plus one full validation pass per epoch; the resulting
# metrics are appended to `history`.
for epoch in tqdm(range(1, EPOCHS + 1), desc="Epochs... "):
    train_y, train_loss, step, eval_loss_min = train_op(
        pt_model,
        train_data_loader,
        loss_fn,
        optimizer,
        scheduler,
        step=step,
        print_every_step=EEVERY_EPOCH,
        eval=True,
        eval_cb=eval_callback(epoch, EPOCHS, OUTPUT_PATH),
        eval_loss_min=eval_loss_min,
        eval_data_loader=valid_data_loader,
        clip=CLIP)

    train_score = acc_and_f1(train_y[0], train_y[1], average='weighted')

    eval_y, eval_loss = eval_op(pt_model, valid_data_loader, loss_fn)
    eval_score = acc_and_f1(eval_y[0], eval_y[1], average='weighted')

    for metric_name, metric_value in (
            ('train_acc', train_score['acc']),
            ('train_loss', train_loss),
            ('val_acc', eval_score['acc']),
            ('val_loss', eval_loss)):
        history[metric_name].append(metric_value)

Epochs... :   0%|          | 0/3 [00:00<?, ?it/s]
Training... :   0%|          | 0/75 [00:00<?, ?it/s][A
Training... :   1%|▏         | 1/75 [00:00<00:42,  1.75it/s][A
Training... :   3%|▎         | 2/75 [00:00<00:33,  2.15it/s][A
Training... :   4%|▍         | 3/75 [00:01<00:30,  2.34it/s][A
Training... :   5%|▌         | 4/75 [00:01<00:30,  2.35it/s][A
Training... :   7%|▋         | 5/75 [00:02<00:28,  2.43it/s][A
Training... :   8%|▊         | 6/75 [00:02<00:28,  2.42it/s][A
Training... :   9%|▉         | 7/75 [00:02<00:27,  2.48it/s][A
Training... :  11%|█         | 8/75 [00:03<00:26,  2.50it/s][A
Training... :  12%|█▏        | 9/75 [00:03<00:26,  2.51it/s][A
Training... :  13%|█▎        | 10/75 [00:04<00:25,  2.53it/s][A
Training... :  15%|█▍        | 11/75 [00:04<00:25,  2.52it/s][A
Training... :  16%|█▌        | 12/75 [00:04<00:24,  2.53it/s][A
Training... :  17%|█▋        | 13/75 [00:05<00:24,  2.52it/s][A
Training... :  19%|█▊        | 14/75 [00:05<00:24,  2.50it

In [None]:
def predict(model, comments, tokenizer, max_len=128, batch_size=32):
    """Predict a class and class probabilities for every text in ``comments``.

    Args:
        model: module called with ``input_ids``, ``attention_mask`` and
            ``token_type_ids`` that returns per-class scores.
        comments: texts to classify.
        tokenizer: tokenizer handed to ``create_data_loader``.
        max_len: token length each text is truncated/padded to.
        batch_size: number of texts per forward pass.

    Returns:
        ``(predictions, prediction_probs)`` as numpy arrays: the index of the
        highest-scoring class per text, and the softmax of the scores per text.
    """
    data_loader = create_data_loader(comments, None, tokenizer, max_len, batch_size, None)

    predictions, prediction_probs = [], []

    def as_numpy(tensors):
        return torch.stack(tensors).cpu().detach().numpy()

    model.eval()
    with torch.no_grad():
        for batch in tqdm(data_loader, position=0):
            # move the batch to the selected device and score it
            scores = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
                token_type_ids=batch['token_type_ids'].to(device))

            # index of the highest score is the predicted class
            predictions.extend(torch.max(scores, dim=1)[1])
            prediction_probs.extend(F.softmax(scores, dim=1))

    return as_numpy(predictions), as_numpy(prediction_probs)

In [None]:
# Classify the test texts with the model as it stands after the final epoch.
# MAX_LEN replaces the literal 128 so inference always uses the same sequence
# length as the training/validation loaders.
test_comments = test_df['text'].to_numpy()
preds, probs = predict(pt_model, test_comments, tokenizer, max_len=MAX_LEN)

print(preds.shape, probs.shape)

100%|██████████| 3/3 [00:00<00:00,  5.30it/s]

(75,) (75, 3)





<div style="text-align: right" dir="rtl"> 
<h2>تست</h2>
تست مدل روی داده‌ی تست و نتایج حاصل از آن از جمله f1 score و confusion matrix در زیر قابل مشاهده است.
</div>

In [None]:
# Convert the true tags to class indices (position in label_list) and compare
# them with the predicted indices.
y_pred = preds
y_test = [label_list.index(tag) for tag in test_df['final_tag'].values]

weighted_f1 = f1_score(y_test, y_pred, average="weighted")
print(f'F1: {weighted_f1}')
print()
print(classification_report(y_test, y_pred, target_names=label_list))

F1: 0.7357351770375027

              precision    recall  f1-score   support

        منفی       0.62      0.79      0.70        19
        مثبت       0.85      0.76      0.80        37
        خنثی       0.67      0.63      0.65        19

    accuracy                           0.73        75
   macro avg       0.71      0.73      0.72        75
weighted avg       0.75      0.73      0.74        75



In [None]:
label_list

['منفی', 'مثبت', 'خنثی']

In [None]:
from sklearn.metrics import confusion_matrix
# Rows are true classes and columns are predicted classes, both ordered by
# class index (the order of label_list).
confusion_matrix(y_test, y_pred)

array([[15,  2,  2],
       [ 5, 28,  4],
       [ 4,  3, 12]])