In [None]:
!pip install datasets
!pip install evaluate
!pip install accelerate -U
!pip install transformers[torch]

In [1]:
from transformers import AutoModel , AutoTokenizer
from transformers import AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv02")
model = (AutoModelForSequenceClassification.from_pretrained("aubmindlab/bert-base-arabertv02", num_labels=11).to("cuda"))

Some weights of the model checkpoint at aubmindlab/bert-base-arabertv02 were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification wer

In [2]:
import pandas as pd
import nltk
from pyarabic.araby import strip_tashkeel, strip_harakat, strip_lastharaka, strip_tatweel, normalize_hamza
import re

def delete_links(input_text):
    pettern  = r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))'''
    out_text = re.sub(pettern, ' ', input_text)
    return out_text

def delete_repeated_characters(input_text):
    pattern  = r'(.)\1{2,}'
    out_text = re.sub(pattern, r"\1\1", input_text)
    return out_text

def replace_letters(input_text):
    replace = {"أ": "ا","ة": "ه","إ": "ا","آ": "ا","": ""}
    replace = dict((re.escape(k), v) for k, v in replace.items())
    pattern = re.compile("|".join(replace.keys()))
    out_text = pattern.sub(lambda m: replace[re.escape(m.group(0))], input_text)
    return out_text

def clean_text(input_text):
    replace = r'[/(){}\[\]|@âÂ,;\?\'\"\*…؟–’،!&\+-:؛-]'
    out_text = re.sub(replace, " ", input_text)
    words = nltk.word_tokenize(out_text)
    out_text = ' '.join(words)
    return out_text

def remove_vowelization(input_text):
    vowelization = re.compile(""" ّ|َ|ً|ُ|ٌ|ِ|ٍ|ْ|ـ""", re.VERBOSE)
    out_text = re.sub(vowelization, '', input_text)
    return out_text

def delete_stopwords(input_text):
    stop_words = set(nltk.corpus.stopwords.words("arabic") + ['خلال' , 'الى' , 'ان' , 'او' , 'انه'])
    tokenizer = nltk.tokenize.WhitespaceTokenizer()
    tokens = tokenizer.tokenize(input_text)
    out_text = [w for w in tokens if not w in stop_words]
    out_text = ' '.join(out_text)
    return out_text


# improved the rouge L
def preprocess_text(text):
    text = delete_links(text)
    text = delete_repeated_characters(text)
    text = strip_tashkeel(text)
    text = strip_tatweel(text)
    text= clean_text(text) 
    text = remove_vowelization(text)
    text = replace_letters(text)
    text = delete_stopwords(text)
    return text


In [3]:
import glob
import pandas as pd


file_pattern = f"/kaggle/input/hespress/stories*.csv"
csv_files = glob.glob(file_pattern)

dataframes = []

for file in csv_files:
    df = pd.read_csv(file )
    dataframes.append(df)
    

for df in dataframes:
    df.story = df.story.apply(preprocess_text)
    
for i , df in enumerate(dataframes):
    df.drop(['Unnamed: 0' , 'id' , 'title' , 'date' , 'author' ] , axis = 1 , inplace=  True)
    df['topic'] = i
    
train_sets = []
test_sets = []

for df in dataframes:
    train_sets.append(df[:800])
    test_sets.append(df[800:])

In [4]:
train_set = pd.concat(train_sets)
test_set = pd.concat(test_sets)

In [5]:
file_pattern = f"/kaggle/input/hespress/stories*.csv"
csv_files = glob.glob(file_pattern)

labels = {}
for i,file in enumerate(csv_files):
    labels[file.split('/')[-1][:-4]] = i

In [6]:
from datasets import Dataset
train_set = Dataset.from_pandas(train_set)
test_set = Dataset.from_pandas(test_set)

In [39]:
import torch
class AraBertDataset(torch.utils.data.Dataset):
   def __init__(self, texts, labels=None):
       self.encodings = tokenizer(texts, padding=True, truncation=True, max_length=512)
       self.labels = labels
    
   def __getitem__(self, idx):
       item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
       if self.labels:
           item["labels"] = torch.tensor(self.labels[idx])
       return item

   def __len__(self):
       return len(self.encodings["input_ids"])

In [8]:
train_dataset = AraBertDataset(train_set["story"], train_set['topic'])
test_dataset = AraBertDataset(test_set["story"], test_set['topic'])

In [27]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import numpy as np
def compute_metrics(p):
   pred, labels = p
   pred = np.argmax(pred, axis=1)
   accuracy = accuracy_score(y_true=labels, y_pred=pred)
   recall = recall_score(y_true=labels, y_pred=pred, pos_label='positive', average='weighted')
   precision = precision_score(y_true=labels, y_pred=pred, pos_label='positive', average='weighted')
   f1 = f1_score(y_true=labels, y_pred=pred, pos_label='positive', average='weighted')
   return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

In [31]:
from transformers import Trainer, TrainingArguments

batch_size = 16

args = TrainingArguments(
  output_dir="output",
  evaluation_strategy="steps",
  eval_steps=200,
  per_device_train_batch_size=batch_size,
  per_device_eval_batch_size=batch_size,
  num_train_epochs=1,)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [32]:
from transformers import Trainer
trainer = Trainer(
  model=model,
  args=args,
  train_dataset=train_dataset,
  eval_dataset=test_dataset,
  compute_metrics=compute_metrics
)

In [33]:
trainer.train()
# res = trainer.evaluate()

***** Running training *****
  Num examples = 8800
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 550
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
200,No log,0.642451,0.855,0.856851,0.855,0.853789
400,No log,0.512293,0.861364,0.860911,0.861364,0.859744


***** Running Evaluation *****
  Num examples = 2200
  Batch size = 16
***** Running Evaluation *****
  Num examples = 2200
  Batch size = 16
Saving model checkpoint to output/checkpoint-500
Configuration saved in output/checkpoint-500/config.json
Model weights saved in output/checkpoint-500/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=550, training_loss=0.28114410053599964, metrics={'train_runtime': 543.8838, 'train_samples_per_second': 16.18, 'train_steps_per_second': 1.011, 'total_flos': 2315564386713600.0, 'train_loss': 0.28114410053599964, 'epoch': 1.0})

In [None]:
topics = []
for df in test_sets:
    topic = df.topic[800]
    df = Dataset.from_pandas(df)
    test_df = AraBertDataset(df["story"], df['topic'])
    trainer = Trainer(
      model=model,
      args=args,
      train_dataset=train_dataset,
      eval_dataset=test_df,
      compute_metrics=compute_metrics
    )
    topics.append((trainer.evaluate() , topic))

In [46]:
topics

[({'eval_loss': 0.11061007529497147,
   'eval_accuracy': 0.97,
   'eval_precision': 1.0,
   'eval_recall': 0.97,
   'eval_f1': 0.9847715736040609,
   'eval_runtime': 3.4874,
   'eval_samples_per_second': 57.349,
   'eval_steps_per_second': 3.728},
  0),
 ({'eval_loss': 0.11591026932001114,
   'eval_accuracy': 0.965,
   'eval_precision': 1.0,
   'eval_recall': 0.965,
   'eval_f1': 0.9821882951653944,
   'eval_runtime': 3.4314,
   'eval_samples_per_second': 58.285,
   'eval_steps_per_second': 3.789},
  1),
 ({'eval_loss': 0.47806477546691895,
   'eval_accuracy': 0.865,
   'eval_precision': 1.0,
   'eval_recall': 0.865,
   'eval_f1': 0.9276139410187667,
   'eval_runtime': 3.4462,
   'eval_samples_per_second': 58.034,
   'eval_steps_per_second': 3.772},
  2),
 ({'eval_loss': 0.10588047653436661,
   'eval_accuracy': 0.985,
   'eval_precision': 1.0,
   'eval_recall': 0.985,
   'eval_f1': 0.9924433249370278,
   'eval_runtime': 3.4462,
   'eval_samples_per_second': 58.035,
   'eval_steps_per_s

In [44]:
topics

[({'eval_loss': 0.11061007529497147,
   'eval_accuracy': 0.97,
   'eval_precision': 1.0,
   'eval_recall': 0.97,
   'eval_f1': 0.9847715736040609,
   'eval_runtime': 3.4874,
   'eval_samples_per_second': 57.349,
   'eval_steps_per_second': 3.728},
  0),
 ({'eval_loss': 0.11591026932001114,
   'eval_accuracy': 0.965,
   'eval_precision': 1.0,
   'eval_recall': 0.965,
   'eval_f1': 0.9821882951653944,
   'eval_runtime': 3.4314,
   'eval_samples_per_second': 58.285,
   'eval_steps_per_second': 3.789},
  1),
 ({'eval_loss': 0.47806477546691895,
   'eval_accuracy': 0.865,
   'eval_precision': 1.0,
   'eval_recall': 0.865,
   'eval_f1': 0.9276139410187667,
   'eval_runtime': 3.4462,
   'eval_samples_per_second': 58.034,
   'eval_steps_per_second': 3.772},
  2),
 ({'eval_loss': 0.10588047653436661,
   'eval_accuracy': 0.985,
   'eval_precision': 1.0,
   'eval_recall': 0.985,
   'eval_f1': 0.9924433249370278,
   'eval_runtime': 3.4462,
   'eval_samples_per_second': 58.035,
   'eval_steps_per_s