In [None]:
import pandas as pd
import os
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup, RobertaModel, RobertaTokenizer,XLMRobertaTokenizer  ,AutoConfig,AutoTokenizer,AutoModel
from transformers import XLMRobertaModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm
import matplotlib.pyplot as plt

In [None]:
class XLMRobertaClassifier(nn.Module):
  """XLM-RoBERTa encoder followed by dropout and a single linear classification layer.

  Attributes:
    bert: encoder obtained from ``XLMRobertaModel.from_pretrained(model_name)``.
    dropout: dropout with p=0.1 applied to the pooled encoder output.
    fc: linear layer mapping the encoder's hidden size to ``num_classes`` logits.
  """

  def __init__(self, model_name, num_classes):
    """Build the encoder and the classification head.

    Args:
      model_name: checkpoint identifier forwarded to ``from_pretrained``.
      num_classes: size of the output (logit) dimension.
    """
    super().__init__()
    self.bert = XLMRobertaModel.from_pretrained(model_name)
    self.dropout = nn.Dropout(0.1)
    # The head's input width is read from the encoder config.
    self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

  def forward(self, input_ids, attention_mask):
    """Return the classification logits computed from the encoder's pooled output."""
    encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
    return self.fc(self.dropout(encoded.pooler_output))

In [None]:
# Clone the arman-text-emotion repository into ./arman-text-emotion (shell command).
# The commented-out Path option in the configuration cell points into this checkout.
! git clone https://github.com/Arman-Rayan-Sharif/arman-text-emotion.git

Cloning into 'arman-text-emotion'...
remote: Enumerating objects: 52, done.[K
remote: Counting objects: 100% (52/52), done.[K
remote: Compressing objects: 100% (46/46), done.[K
remote: Total 52 (delta 10), reused 9 (delta 2), pack-reused 0[K
Receiving objects: 100% (52/52), 572.39 KiB | 20.44 MiB/s, done.
Resolving deltas: 100% (10/10), done.


In [None]:
 pip install hazm

Collecting hazm
  Downloading hazm-0.10.0-py3-none-any.whl (892 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m892.6/892.6 kB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting fasttext-wheel<0.10.0,>=0.9.2 (from hazm)
  Downloading fasttext_wheel-0.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.4/4.4 MB[0m [31m59.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting flashtext<3.0,>=2.7 (from hazm)
  Downloading flashtext-2.7.tar.gz (14 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting numpy==1.24.3 (from hazm)
  Downloading numpy-1.24.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.3/17.3 MB[0m [31m74.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting python-crfsuite<0.10.0,>=0.9.9 (from hazm)
  Downloading python_crfsuite-0.9.10-cp310-cp310-manylinux_2_17_x86_6

In [None]:
# Mount Google Drive at /content/drive (requires the google.colab package).
# The serialised model is read from /content/drive/MyDrive further down.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# hazm text normalizer; predict() and evaluate() apply normalizer.normalize to every
# input text before it is tokenized.
# NOTE(review): persian_numbers / persian_style are switched off here — presumably to
# leave digits and punctuation style as they appear in the data; confirm against the hazm docs.
from hazm import Normalizer
normalizer = Normalizer(persian_numbers=False,persian_style=False)

In [None]:
# Restore the trained classifier from Google Drive (Drive must be mounted first).
# The commented-out dump call documents how the file was produced.
# NOTE(review): joblib.load is pickle-based and can execute arbitrary code while
# loading — only load files you created yourself.
# NOTE(review): presumably the stored object is an XLMRobertaClassifier, in which case
# that class must already be defined in this session for unpickling to work — confirm.
import joblib
# joblib.dump(xlm_roberta_model_instance, '/content/drive/MyDrive/model_proj.joblib')
loaded_model = joblib.load('/content/drive/MyDrive/model_proj.joblib')

In [None]:
# Emotion label name -> integer class id.
label_Dict = dict(OTHER=0, HAPPY=1, SURPRISE=2, FEAR=3, HATE=4, ANGRY=5, SAD=6)

# Integer class id -> emotion label name, derived from label_Dict so the two
# mappings cannot drift apart.
reversed_label_Dict = {code: name for name, code in label_Dict.items()}
# Tokenizer for the 'xlm-roberta-base' checkpoint (downloaded from the Hugging Face Hub
# on first use); it is the default tokenizer argument of predict_emoji.
# NOTE(review): should match the checkpoint the loaded model was fine-tuned from — confirm.
Tokenizer= XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

In [None]:
def predict_emoji(texts, model, tokenizer= Tokenizer, label_dict= reversed_label_Dict, max_length=128):
    """Classify each text on its own and return the predicted labels and class ids.

    Args:
        texts: iterable of input strings; each one is tokenized and scored separately.
        model: module called as ``model(input_ids=..., attention_mask=...)`` that returns
            logits with one row per input text. It is moved to the inference device and
            switched to eval mode (both happen in place on the passed object).
        tokenizer: callable tokenizer returning ``input_ids`` and ``attention_mask`` tensors.
        label_dict: mapping from integer class id to label name.
        max_length: every text is padded / truncated to this many tokens.

    Returns:
        ``(predicts, predict_codes)``: two lists aligned with ``texts`` holding the label
        name and the integer class id of the highest-scoring class. Both are empty when
        ``texts`` is empty.

    Raises:
        KeyError: if the winning class id is missing from ``label_dict``.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu") ## if you can use cuda(with gpu) use it, otherwise use cpu
    # Inputs are sent to `device` below, so the model has to live there as well.
    model.to(device)
    model.eval()

    predicts = []
    predict_codes = []

    for text in tqdm(texts, position=0):
        encoding = tokenizer(text, return_tensors='pt', max_length=max_length, padding='max_length', truncation=True)
        input_ids = encoding['input_ids'].to(device)
        attention_mask = encoding['attention_mask'].to(device)

        with torch.no_grad():
            logits = model(input_ids=input_ids, attention_mask=attention_mask)

        # The decision uses only this text's logits; nothing is carried over between
        # iterations, so the result does not depend on input order or on len(texts).
        label_idx = int(logits[0].argmax(dim=-1).item())
        predicts.append(label_dict[label_idx])
        predict_codes.append(label_idx)

    return predicts , predict_codes


In [None]:
def predict(path, from_idx =0 , to_idx=-1):
    """Predict labels for a window of rows of a TSV file and print a comparison table.

    The file is read with ``pd.read_table(path, header=None)``; column 0 holds the text
    and column 1 the label name (a key of ``label_Dict``).

    Args:
        path: path of the tab-separated input file.
        from_idx: index of the first row to use.
        to_idx: index one past the last row to use; ``-1`` means "through the last row".

    Returns:
        ``(predicts, predict_codes)`` exactly as returned by ``predict_emoji`` for the
        selected rows.

    Raises:
        KeyError: if a selected row's label is not a key of ``label_Dict``.
    """
    test_df = pd.read_table(path, header=None)
    # test_df = pd.read_csv(path, header=None)

    # One slice object for texts and labels keeps every table row aligned with its
    # prediction, including when from_idx is not 0.
    window = slice(from_idx, None if to_idx == -1 else to_idx)
    texts = test_df[0].iloc[window].map(normalizer.normalize).tolist()
    label_codes = test_df[1].iloc[window].map(label_Dict).tolist()

    predicts, predict_codes = predict_emoji(texts=texts, model=loaded_model)

    rows = [
        [predicted, reversed_label_Dict[code], text]
        for predicted, code, text in zip(predicts, label_codes, texts)
    ]
    df = pd.DataFrame(rows, columns=['Predict', 'Label', 'Text'])
    print()
    print(df.to_string())
    return predicts, predict_codes


In [None]:
def evaluate(path, from_idx=0, to_idx=-1):
    """Score the loaded model on a window of rows of a TSV file.

    The file is read without a header; column 0 is the text and column 1 the label
    name, which is mapped to its class id through ``label_Dict``.

    Args:
        path: path of the tab-separated input file.
        from_idx: index of the first row to evaluate.
        to_idx: index one past the last row to evaluate; ``-1`` means "through the last row".

    Returns:
        ``(score, report)``: the accuracy and the ``classification_report`` text, both
        of which are also printed.
    """
    frame = pd.read_table(path, header=None)
    # test_df = pd.read_csv(path, header=None)

    frame[1] = frame[1].map(label_Dict)
    normalized_texts = frame[0].map(normalizer.normalize)

    # A stop of None slices to the end, which is what the -1 sentinel asks for.
    stop = None if to_idx == -1 else to_idx
    selected_texts = normalized_texts.tolist()[from_idx:stop]
    gold_codes = frame[1].tolist()[from_idx:stop]

    _, predict_codes = predict_emoji(texts=selected_texts, model=loaded_model)
    score = accuracy_score(gold_codes, predict_codes)
    report = classification_report(gold_codes, predict_codes)

    print()
    print('accuracy: ', score)
    print(report)
    return score, report

In [None]:
# Input file and row window passed to the predict()/evaluate() calls below.
# The file is read as tab-separated text without a header row: column 0 is the text,
# column 1 the label name.
# Path = '/content/arman-text-emotion/dataset/test.tsv'
Path ='/content/Project_Test - testset.tsv'
From_idx = 0  # first row to use
To_idx = -1  # -1 means "through the last row"

In [None]:
# Print the predicted-vs-actual label table for the configured file and row window.
predicts, predict_codes = predict(Path,From_idx,To_idx)

100%|██████████| 33/33 [00:02<00:00, 12.08it/s]


     Predict     Label                                                                                                                                                                                                                                                                                          Text
0      HAPPY     HAPPY                                                                                                                                                                                                                                                            این فیلم عالی بود، حس خوبی بهم داد
1       FEAR      FEAR                                                                                                                                                                                                                                                                 دلم برای این موقعیت نگران است
2      HAPPY     HAPPY                                                  




In [None]:
# Print accuracy and the per-class classification report for the configured file and row window.
score, report = evaluate(Path,From_idx,To_idx)

100%|██████████| 33/33 [00:00<00:00, 38.77it/s]


accuracy:  0.7878787878787878
              precision    recall  f1-score   support

           0       1.00      0.60      0.75         5
           1       0.89      0.89      0.89         9
           2       0.75      0.75      0.75         4
           3       1.00      1.00      1.00         2
           5       1.00      0.40      0.57         5
           6       0.62      1.00      0.76         8

    accuracy                           0.79        33
   macro avg       0.88      0.77      0.79        33
weighted avg       0.85      0.79      0.78        33




