In [None]:
!pip install transformers
!pip install hazm
from hazm import *
import pickle
import transformers
from transformers import AdamW
from transformers.optimization import get_linear_schedule_with_warmup
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, recall_score, roc_auc_score
import math
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import AdamW
import seaborn as sn
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import json
from copy import deepcopy
import numpy as np
import random
import re
import string
import codecs
from shutil import copyfile
# Seed every RNG this notebook draws from. `random.seed` alone only covers
# `random.shuffle`; torch-based shuffling and weight initialisation use
# torch's own generator, and numpy has a separate one as well.
SEED = 12345
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
# Shared encoder that maps the string labels to integer class ids.
label_encoder = preprocessing.LabelEncoder()

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m44.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.0-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m31.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.12.0 tokenizers-0.13.2 transformers-4.26.1
Looking in indexes: https://pypi.org/simple, https://us

In [None]:
# Load the labelled corpus (the columns used later are 'sentence' and 'label').
news = pd.read_csv('classification.csv')
# `DataFrame.dropna` returns a new frame instead of modifying in place, so the
# result must be assigned; a bare `news.dropna()` silently keeps every row.
# Only the two columns the pipeline actually reads are checked.
news = news.dropna(subset=['sentence', 'label'])
news

Unnamed: 0,sentence,label
0,دستور برای لغو حمله‌ی فردا صبح,رسمی
1,تو که نمی دونی تو زورکی به زبون ما حرف میزنی,محاوره
2,پس حالا همه چی به تو محول شده؟,محاوره
3,درست میگم ستوان؟ بله قربان همینطوره,رسمی
4,و گردان دوم دوون دارن به سمت این نقطه پیشرو...,محاوره
...,...,...
19995,قربان آنجا دست آلمانی هاست,رسمی
19996,چون که چندشه ماله منه,محاوره
19997,باید تا تموم شدن موج اول صبر کنی,محاوره
19998,اگه بگم آره نمیدونم که ما از روی عشق,محاوره


In [None]:
class Preprocessing:
  """Stateless text-cleaning helpers; unwanted characters become spaces."""

  @staticmethod
  def remove_punctuations(text):
    """Return `text` with every punctuation character replaced by a space.

    Covers ASCII punctuation plus the guillemets (U+00AB, U+00BB) and the
    Arabic comma, semicolon and question mark (U+060C, U+061B, U+061F).
    """
    marks = string.punctuation + '\u00AB' + '\u00BB' + '\u060C' + '\u061B' + '\u061F'
    return ''.join(' ' if ch in marks else ch for ch in text)
  
  @staticmethod
  def remove_numbers(text):
    """Return `text` with ASCII and Persian digits replaced by a space."""
    digits = '0123456789۰۱۲۳۴۵۶۷۸۹'
    return ''.join(' ' if ch in digits else ch for ch in text)

  @staticmethod
  def remove_extra_space(text):
    """Collapse each run of whitespace in `text` into a single space."""
    return re.sub(r'\s+',' ',text)

# a class to hold our data structure
class Data:
  """One labelled sentence built from a row exposing 'sentence' and 'label'."""

  def __init__(self, data, preprocessing=False):
    # `preprocessing` is accepted for callers but is not read here.
    sentence, category = data['sentence'], data['label']
    self.text = sentence
    self.category = category
    # Integer class id from the module-level `label_encoder`, which must
    # already be fitted when an instance is created.
    self.label = label_encoder.transform([category])[0]

# Fit the shared label encoder on the two known classes and keep the
# resulting class order for later lookups.
all_labels = ['رسمی','محاوره']
classes = label_encoder.fit(all_labels).classes_

# Wrap every dataframe row in a Data object.
news_data = [Data(row, True) for _, row in news.iterrows()]

In [None]:
def split_data(data, portions=(.6, .2, .2)):
  """Shuffle `data` and cut it into train / eval / test lists.

  Args:
    data: sequence of samples. It is copied before shuffling, so the
      caller's sequence keeps its order and may be any sequence type.
    portions: fractions for the train and eval splits in positions 0 and 1.
      The test split receives whatever remains; position 2 is not read.

  Returns:
    A `(train_data, eval_data, test_data)` tuple of lists.
  """
  # Shuffling a copy draws from the `random` module exactly as an in-place
  # shuffle would, so the resulting order is unchanged for a given seed.
  shuffled = list(data)
  random.shuffle(shuffled)
  total = len(shuffled)
  train_end = int(portions[0] * total)
  eval_end = int((portions[0] + portions[1]) * total)
  return shuffled[:train_end], shuffled[train_end:eval_end], shuffled[eval_end:]
# a function to get a portion of data with acquired preprocessings
def get_data(dataset, preprocess=None):
  """Clean the text of every sample and return train / eval / test splits.

  Args:
    dataset: list of objects with a mutable `.text` attribute. The list is
      shuffled in place and each kept object's `.text` is overwritten with
      its cleaned form.
    preprocess: optional dict of switches. Recognised keys are
      'remove_punctuations' and 'remove_numbers'; a missing key defaults
      to True, and any other key is ignored.

  Returns:
    The `(train, eval, test)` tuple produced by `split_data`.
  """
  # Merge caller options over the defaults so a partial dict no longer
  # raises KeyError and no mutable default is shared between calls.
  options = {'remove_punctuations': True, 'remove_numbers': True}
  if preprocess:
    options.update(preprocess)

  random.shuffle(dataset)
  new_dataset = []

  for item in tqdm(dataset):
    # Missing sentences read from the CSV stringify to 'nan'; skip them.
    if str(item.text) == 'nan':
      continue
    text = item.text
    if options['remove_punctuations']:
      text = Preprocessing.remove_punctuations(text)
    if options['remove_numbers']:
      text = Preprocessing.remove_numbers(text)
    # Whitespace is always normalised, since the removals above leave gaps.
    item.text = Preprocessing.remove_extra_space(text)
    new_dataset.append(item)

  return split_data(new_dataset)

# Clean and split the corpus, then pickle each split to its own file.
train_data, eval_data, test_data = get_data(
    news_data,
    preprocess={'remove_punctuations': True, 'remove_numbers': True},
)
for pickle_path, split in (
    ("train.pickle", train_data),
    ("eval_data.pickle", eval_data),
    ("test_data.pickle", test_data),
):
  with open(pickle_path, "wb") as f:
    pickle.dump(split, f)

100%|██████████| 20000/20000 [00:01<00:00, 18434.32it/s]


In [None]:
# Load the pretrained Persian BERT checkpoint: its config, its tokenizer and
# the encoder weights. The checkpoint named below is a BERT model, not RoBERTa.
from transformers import AutoConfig, AutoTokenizer, AutoModel, TFAutoModel
# v3.0
model_name_or_path = "HooshvareLab/bert-fa-zwnj-base"
config = AutoConfig.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
# TensorFlow alternative (unused in this notebook):
# model = TFAutoModel.from_pretrained(model_name_or_path)  For TF
# PyTorch encoder; the classification head is added later by TransformerModel.
parsbert = AutoModel.from_pretrained(model_name_or_path)

Downloading (…)lve/main/config.json:   0%|          | 0.00/565 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/292 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/426k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/134 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/473M [00:00<?, ?B/s]

Some weights of the model checkpoint at HooshvareLab/bert-fa-zwnj-base were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at HooshvareLab/bert-fa-zwnj-base and are newly initialized: ['bert.pooler.dense.weight', 'bert.p

In [None]:
# a class for loading data
class NewsDataloader(Dataset):
  """Dataset that tokenizes one sample's text on each access.

  Despite its name this is a `Dataset`; wrap it in a `DataLoader` to batch.

  Args:
    dataset: indexable collection of objects with `.text` and `.label`.
    tokenizer: object exposing a Hugging Face style `encode_plus` method.
    max_length: length every sequence is padded or truncated to.
  """

  def __init__(self, dataset, tokenizer, max_length=512):
    self.dataset = dataset
    self.tokenizer = tokenizer
    self.max_length = max_length

  def __len__(self):
    return len(self.dataset)

  def __getitem__(self, idx):
    """Return `[encoding, label]` for the sample at position `idx`."""
    sample = self.dataset[idx]
    text_tokens = self.tokenizer.encode_plus(
        str(sample.text),
        add_special_tokens=True,
        max_length=self.max_length,
        # `padding="max_length"` replaces the deprecated
        # `pad_to_max_length=True` and pads to the same fixed length.
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )

    return [text_tokens, sample.label]

# defining our transformer model
class TransformerModel(nn.Module):
  """Pretrained encoder followed by a single linear classification head.

  Args:
    bert: encoder module whose output exposes `pooler_output`.
    num_classes: number of output logits. When omitted, it falls back to the
      number of classes in the module-level `label_encoder`.
  """

  def __init__(self, bert, num_classes=None):
    super().__init__()
    self.bert = bert
    # Read the encoder width from its config when available, so other
    # checkpoint sizes work; 768 remains the fallback.
    hidden_size = getattr(getattr(bert, 'config', None), 'hidden_size', 768)
    if num_classes is None:
      num_classes = len(label_encoder.classes_)
    # we only use one linear head on the parsbert
    self.linear_head = nn.Linear(hidden_size, num_classes)

  def forward(self, x):
    """Return class logits for a dict with 'input_ids' and 'attention_mask'."""
    # Keyword arguments make the call independent of parameter order.
    encoded = self.bert(input_ids=x['input_ids'], attention_mask=x['attention_mask'])
    logits = self.linear_head(encoded.pooler_output)
    return logits

In [None]:
import warnings
from tqdm import tqdm
warnings.filterwarnings("ignore", category=DeprecationWarning) 

# Transformer Training: hyper-parameters
epochs = 2
lr = 8e-6
# Fall back to the CPU so the cell still runs on a machine without a GPU.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
k_step_loss = 0
k = 50  # the summed loss is reported every k optimisation steps
batch_size = 11

# train dataloader
train_dataset = NewsDataloader(train_data, tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# eval dataloader (metrics do not depend on order, so no shuffling)
eval_dataset = NewsDataloader(eval_data, tokenizer)
eval_dataloader = DataLoader(eval_dataset, batch_size=batch_size, shuffle=False)
# number of optimisation steps (batches) per epoch
num_steps = len(train_dataloader)

loss_collection = []
f1_collection = []
eval_f1_micro = []
eval_f1_macro = []
eval_accuracy = []

model = TransformerModel(parsbert).to(device)
# Per-class loss weights, indexed by the encoded label id.
loss_fn = nn.CrossEntropyLoss(torch.tensor([.3, 1])).to(device)
optimizer = AdamW(model.parameters(), lr=lr)
# The scheduler is stepped once per batch, so the schedule length must be
# counted in batches. Counting samples would leave the learning rate almost
# undecayed by the end of training.
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, 
    num_training_steps=epochs * num_steps
)


def _batch_to_device(batch):
  """Unpack one collated batch into (model inputs, labels) on `device`."""
  encoded, labels = batch
  # Each sample is tokenized with return_tensors="pt", so the collated
  # tensors carry an extra axis at position 1 (presumably of size 1).
  # Squeeze only that axis: a bare squeeze() would also drop the batch
  # axis whenever a batch holds a single sample.
  feed_dict = {
      'input_ids': encoded['input_ids'].squeeze(1).to(device),
      'attention_mask': encoded['attention_mask'].squeeze(1).to(device)
  }
  return feed_dict, labels.to(device)


for epoch in tqdm(range(epochs)):
  # Re-enable dropout after the previous epoch's evaluation pass.
  model.train()
  for step, data in enumerate(train_dataloader):
    feed_dict, labels = _batch_to_device(data)

    optimizer.zero_grad()
    output = model(feed_dict)
    loss = loss_fn(output, labels)
    loss.backward()
    optimizer.step()
    scheduler.step()
    k_step_loss += loss.item()
    if (step + 1) % k == 0:
      loss_collection.append(k_step_loss)
      print(f'EPOCH {epoch + 1}/{epochs} | STEP [{step + 1}/{num_steps}] | Loss {k_step_loss}')
      k_step_loss = 0

  # Discard the partial window so the next epoch starts from zero.
  k_step_loss = 0
  print(f'################## Epoch {epoch + 1} Evaluation ##################')
  # Switch dropout off; torch.no_grad() alone only disables gradient
  # tracking and leaves the model in training mode.
  model.eval()
  with torch.no_grad():
    all_pred = []
    all_label = []
    for j, data in enumerate(eval_dataloader):
      feed_dict, labels = _batch_to_device(data)
      output = model(feed_dict)
      pred = output.argmax(dim=1)
      all_label.extend(list(labels.cpu().numpy()))
      all_pred.extend(list(pred.view(-1).cpu().numpy()))

    f1_macro = f1_score(all_label, all_pred, average='macro')
    f1_micro = f1_score(all_label, all_pred, average='micro')
    accuracy = accuracy_score(all_label, all_pred)
    recall = recall_score(all_label, all_pred)
    # NOTE(review): ROC-AUC is computed from hard 0/1 predictions here, not
    # from class probabilities, so it reflects a single operating point.
    roc_auc = roc_auc_score(all_label, all_pred)

    print(f'ROC-AUC {roc_auc}')
    print(f'F1-macro {f1_macro}')
    print(f'F1-micro {f1_micro}')
    print(f'Accuracy {accuracy}')
    print(f'Recall {recall}')
    f1_collection.append(f1_macro)
    eval_f1_macro.append(f1_macro)
    eval_f1_micro.append(f1_micro)
    eval_accuracy.append(accuracy)




EPOCH 1/2 | STEP [50/1091] | Loss 15.83951485902071
EPOCH 1/2 | STEP [100/1091] | Loss 11.735836260020733
EPOCH 1/2 | STEP [150/1091] | Loss 9.851149681955576
EPOCH 1/2 | STEP [200/1091] | Loss 9.749537782743573
EPOCH 1/2 | STEP [250/1091] | Loss 9.290751338005066
EPOCH 1/2 | STEP [300/1091] | Loss 8.296915283426642
EPOCH 1/2 | STEP [350/1091] | Loss 8.195375287905335
EPOCH 1/2 | STEP [400/1091] | Loss 8.004482726566494
EPOCH 1/2 | STEP [450/1091] | Loss 8.751614678651094
EPOCH 1/2 | STEP [500/1091] | Loss 8.319938362576067
EPOCH 1/2 | STEP [550/1091] | Loss 7.178441652096808
EPOCH 1/2 | STEP [600/1091] | Loss 6.381812838837504
EPOCH 1/2 | STEP [650/1091] | Loss 7.4244587840512395
EPOCH 1/2 | STEP [700/1091] | Loss 7.807592405937612
EPOCH 1/2 | STEP [750/1091] | Loss 7.305850729346275
EPOCH 1/2 | STEP [800/1091] | Loss 6.730348370503634
EPOCH 1/2 | STEP [850/1091] | Loss 5.819150706054643
EPOCH 1/2 | STEP [900/1091] | Loss 7.263721534516662
EPOCH 1/2 | STEP [950/1091] | Loss 6.75377915

 50%|█████     | 1/2 [19:05<19:05, 1145.36s/it]

ROC-AUC 0.881111878772512
F1-macro 0.8808661286464188
F1-micro 0.88275
Accuracy 0.88275
Recall 0.9940857565303105




EPOCH 2/2 | STEP [50/1091] | Loss 5.016316732624546
EPOCH 2/2 | STEP [100/1091] | Loss 5.778187513817102
EPOCH 2/2 | STEP [150/1091] | Loss 6.049368872772902
EPOCH 2/2 | STEP [200/1091] | Loss 6.229437901172787
EPOCH 2/2 | STEP [250/1091] | Loss 5.524201939348131
EPOCH 2/2 | STEP [300/1091] | Loss 6.083549274713732
EPOCH 2/2 | STEP [350/1091] | Loss 5.12982754711993
EPOCH 2/2 | STEP [400/1091] | Loss 4.980200759135187
EPOCH 2/2 | STEP [450/1091] | Loss 6.196510016452521
EPOCH 2/2 | STEP [500/1091] | Loss 5.146400447352789
EPOCH 2/2 | STEP [550/1091] | Loss 6.38922706595622
EPOCH 2/2 | STEP [600/1091] | Loss 4.265706078556832
EPOCH 2/2 | STEP [650/1091] | Loss 5.753321456140839
EPOCH 2/2 | STEP [700/1091] | Loss 5.475281512539368
EPOCH 2/2 | STEP [750/1091] | Loss 5.574459435418248
EPOCH 2/2 | STEP [800/1091] | Loss 5.9286115624709055
EPOCH 2/2 | STEP [850/1091] | Loss 5.423551284940913
EPOCH 2/2 | STEP [900/1091] | Loss 5.209138585079927
EPOCH 2/2 | STEP [950/1091] | Loss 4.90547961858

100%|██████████| 2/2 [38:08<00:00, 1144.10s/it]

ROC-AUC 0.9090094942461653
F1-macro 0.9093758980431156
F1-micro 0.90975
Accuracy 0.90975
Recall 0.9600788565795959



