The trained model is available at: https://huggingface.co/Noorrabie/bert_content

In [None]:
! pip install evaluate datasets

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding, TrainingArguments, AutoModelForSequenceClassification, Trainer , PretrainedConfig, set_seed
import evaluate
import numpy as np
import pandas as pd
import datasets
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix, precision_score , recall_score


In [None]:
# ---- Experiment configuration ----

# Pretrained checkpoint that is fine-tuned below.
checkpoint = "aubmindlab/bert-base-arabertv02"

# CSV file holding the whole corpus (all splits).
data_path = '../data/All_data_1M_morph_clean.csv' # DATA PATH

# Name of the CSV column that is fed to the model as input text.
input_text = "word_sents"

# Key used to pick the trainer class from `losses_dict`.
loss_type = "CE"

# Distance-matrix flavour and number of output labels.
d_mat_type = "default" #don't change
n_levels = 13 #don't change

In [None]:
# Build the pairwise label-distance matrix: d_matrix[row][col] is the distance
# between label `row` and label `col`.
if d_mat_type != "default":
    # NOTE(review): barec_7_dict / barec_5_dict / barec_3_dict are not defined in
    # any visible cell, so this branch would raise NameError if selected, and it
    # is hard-wired to 19 levels instead of n_levels -- confirm before enabling.
    d_matrix = [
        [
            (abs(col - row) / 18)
            + (abs(barec_7_dict[col + 1] - barec_7_dict[row + 1]) / 6)
            + (abs(barec_5_dict[col + 1] - barec_5_dict[row + 1]) / 4)
            + (abs(barec_3_dict[col + 1] - barec_3_dict[row + 1]) / 2)
            for col in range(19)
        ]
        for row in range(19)
    ]
else:
    # Default: plain absolute difference between the two label indices.
    d_matrix = [
        [abs(col - row) for col in range(n_levels)]
        for row in range(n_levels)
    ]


In [None]:
# Registry of available trainer classes, keyed by loss name.
losses_dict = {
    "CE": Trainer,
}

# Trainer class selected by the `loss_type` setting.
loss_function = losses_dict[loss_type]


In [None]:
# Load the full corpus; the first CSV row holds the column names.
all_df = pd.read_csv(filepath_or_buffer=data_path, header=0)


In [None]:
# RL_num levels (1-19) bucketed into 8 coarser groups; the position of each
# tuple (1-based) is the group id assigned to the levels it contains.
_levels_per_group = [
    (1, 2, 3, 4),
    (5, 6),
    (7,),
    (8,),
    (9, 10),
    (11, 12),
    (13, 14),
    (15, 16, 17, 18, 19),
]
group_mapping = {
    level: group_id
    for group_id, levels in enumerate(_levels_per_group, start=1)
    for level in levels
}

# Attach the group id of every row, looked up from its RL_num_19 value.
all_df['group'] = all_df['RL_num_19'].map(group_mapping)

In [None]:
# Canonical column names used by the rest of the notebook.
DATA_COLUMN = 'text'
LABEL_COLUMN = 'label'

# Rename the configured input column and the group column to the canonical names.
real_names = {input_text: DATA_COLUMN, 'group': LABEL_COLUMN}
all_df = all_df.rename(columns=real_names)

In [None]:
# Shift the 1-based labels to 0-based class ids: maps k -> k - 1 for k in 1..n_levels.
minus_mapper = {level: level - 1 for level in range(1, n_levels + 1)}
all_df = all_df.replace({LABEL_COLUMN: minus_mapper})

In [None]:

# Partition the corpus by its 'Split' column, keeping only the text and label
# columns. The grouped view gets its own name so that `all_df` remains a
# DataFrame and this cell can be re-run without restarting the kernel.
split_groups = all_df.groupby('Split')[[DATA_COLUMN, LABEL_COLUMN]]

# One DataFrame per split; get_group raises KeyError if a split is missing.
train_df = split_groups.get_group('Train')
#train_df = train_df.head(4565)
dev_df = split_groups.get_group('Dev')
test_df = split_groups.get_group('Test')
#tune_df = split_groups.get_group('Tune')



In [None]:
# Fix the random seed so that training runs are repeatable.
set_seed(seed=42)

In [None]:
# Convert each split's DataFrame into a Hugging Face Dataset.
train = datasets.Dataset.from_pandas(train_df)
dev = datasets.Dataset.from_pandas(dev_df)
test = datasets.Dataset.from_pandas(test_df)
#tune = datasets.Dataset.from_pandas(tune_df)

# Assemble the splits into a DatasetDict directly instead of downloading an
# unrelated hub dataset just to obtain a container object.
dataset = datasets.DatasetDict({
    'train': train,
    #'tune': tune,
    'dev': dev,
    'test': test,
})

In [None]:
# Number of examples in each split.
train_size, dev_size, test_size = (
    len(dataset[split_name]) for split_name in ('train', 'dev', 'test')
)

# Training-set sizes to use. Only the full training set is enabled; the smaller
# fractions (train_size // 8, // 6, // 4, // 2) are left disabled.
subset_sizes = [train_size]

In [None]:
def model_init():
  """Create a fresh sequence-classification model from `checkpoint`.

  The model is loaded with `n_levels` output labels and gets an extra
  `dist_matix` attribute holding the absolute-difference distance between
  every pair of label indices.

  Returns:
    The initialised model.
  """
  model = AutoModelForSequenceClassification.from_pretrained(
      checkpoint,
      num_labels=n_levels,
      ignore_mismatched_sizes=True,
  )

  label_ids = range(model.num_labels)
  model.dist_matix = [[abs(col - row) for col in label_ids] for row in label_ids]

  # Force every parameter tensor into contiguous memory
  # (presumably required for saving checkpoints -- TODO confirm).
  for param in model.parameters():
    param.data = param.data.contiguous()

  return model

In [None]:
def compute_metrics(p): #p should be of type EvalPrediction
  """Score a batch of predictions and print a detailed report.

  Args:
    p: object exposing `predictions` (per-class scores, one row per example)
      and `label_ids` (gold class ids).

  Returns:
    Dict with macro F1, macro precision, macro recall and accuracy.

  Side effects:
    Prints the per-class classification report and the confusion matrix.
  """
  gold = p.label_ids
  preds = np.argmax(p.predictions, axis=1)  # highest-scoring class per example
  assert len(preds) == len(gold)

  print(classification_report(gold, preds, digits=4))
  print(confusion_matrix(gold, preds))

  return {
      'macro_f1': f1_score(gold, preds, average='macro'),
      'macro_precision': precision_score(gold, preds, average='macro'),
      'macro_recall': recall_score(gold, preds, average='macro'),
      'accuracy': accuracy_score(gold, preds),
  }

In [None]:

def argmax(iterable):
    """Return the index of the largest item in `iterable`.

    When several items share the maximum, the index of the first one is
    returned. An empty iterable raises ValueError (from `max`).
    """
    best_index, _best_value = max(enumerate(iterable), key=lambda pair: pair[1])
    return best_index

In [None]:
# Tokenizer matching the pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Where checkpoints and the final model are written.
save_dir = "content/bert_content"  # SAVE DIRECTORY
# Spreadsheet receiving the dev-set predictions.
out_xlsx = "../data/content_pred.xlsx"

# (Disabled) training on a subset of the data:
# train_subset = dataset['train'].select(range(subset_size))

def tokenize_function(example):
    """Tokenize the text column of `example`, truncating to at most 512 tokens.

    Args:
        example: mapping that holds the input text under DATA_COLUMN.

    Returns:
        Whatever the tokenizer returns for that text.
    """
    raw_text = example[DATA_COLUMN]
    return tokenizer(raw_text, truncation=True, max_length=512)

# Tokenize every split in batches; padding is applied later, per batch, by the collator.
tokenized_datasets = dataset.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Evaluate and checkpoint once per epoch, keep a single checkpoint on disk, and
# reload the checkpoint with the lowest eval loss when training ends.
# Optional settings left disabled: overwrite_output_dir, save_steps,
# push_to_hub / hub_token, learning_rate.
training_args = TrainingArguments(
    output_dir=save_dir,
    num_train_epochs=6,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

# `loss_function` is the trainer class chosen via `loss_type`.
trainer = loss_function(
    model_init=model_init,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['dev'],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()
trainer.save_model(save_dir)

# Run inference on the dev split. The label ids returned by predict() are not
# used: gold labels are read from dev_df below, so they get a throwaway name
# instead of being bound to `labels` and immediately overwritten.
preds, _label_ids, metrics = trainer.predict(tokenized_datasets['dev'])

# Build the export columns in one vectorised pass each (the previous loop
# rebuilt full Python lists from dev_df on every iteration).
# Labels and predictions are shifted by +1 to undo the 0-based class ids.
texts = dev_df['text'].tolist()
labels = (dev_df['label'] + 1).tolist()
predictions = (np.argmax(preds, axis=1) + 1).tolist()  # highest-scoring class per row

# pandas raises ValueError here if the three columns differ in length.
final_df = pd.DataFrame({
    'text': texts,
    'label': labels,
    'prediction': predictions,
})
final_df.to_excel(out_xlsx, index=False)

Map:   0%|          | 0/52521 [00:00<?, ? examples/s]

Map:   0%|          | 0/7268 [00:00<?, ? examples/s]

Map:   0%|          | 0/8393 [00:00<?, ? examples/s]

  trainer = loss_function(model_init=model_init,
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv02 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv02 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mnour-rabih[0m ([33mnoor-rabie[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Macro F1,Macro Precision,Macro Recall,Accuracy
1,1.2469,1.023278,0.582677,0.589554,0.587055,0.599071
2,0.8745,1.038051,0.591179,0.607137,0.600125,0.612415
3,0.6973,1.091695,0.593912,0.614896,0.606803,0.619087
4,0.4263,1.31794,0.580141,0.582574,0.595075,0.600143
5,0.2791,1.527498,0.58237,0.592132,0.587428,0.60193
6,0.2086,1.693421,0.580508,0.586231,0.589197,0.599071


              precision    recall  f1-score   support

           0     0.5947    0.6230    0.6085       252
           1     0.4561    0.6218    0.5262       468
           2     0.5115    0.5691    0.5387       666
           3     0.6324    0.6392    0.6358       740
           4     0.7038    0.6953    0.6995      1480
           5     0.5720    0.6091    0.5899      2466
           6     0.6267    0.5551    0.5887      1787
           7     0.6193    0.3839    0.4740       534

    accuracy                         0.5991      8393
   macro avg     0.5896    0.5871    0.5827      8393
weighted avg     0.6046    0.5991    0.5986      8393

[[ 157   49   11    2    2   30    1    0]
 [  31  291   61   12   13   58    0    2]
 [  27  138  379   19   26   75    1    1]
 [   7   34   76  473   53   85   12    0]
 [   6   35   46   78 1029  240   41    5]
 [  34   84  143  123  257 1502  293   30]
 [   2    5   23   30   75  572  992   88]
 [   0    2    2   11    7   64  243  205]]
    

              precision    recall  f1-score   support

           0     0.5947    0.6230    0.6085       252
           1     0.4561    0.6218    0.5262       468
           2     0.5115    0.5691    0.5387       666
           3     0.6324    0.6392    0.6358       740
           4     0.7038    0.6953    0.6995      1480
           5     0.5720    0.6091    0.5899      2466
           6     0.6267    0.5551    0.5887      1787
           7     0.6193    0.3839    0.4740       534

    accuracy                         0.5991      8393
   macro avg     0.5896    0.5871    0.5827      8393
weighted avg     0.6046    0.5991    0.5986      8393

[[ 157   49   11    2    2   30    1    0]
 [  31  291   61   12   13   58    0    2]
 [  27  138  379   19   26   75    1    1]
 [   7   34   76  473   53   85   12    0]
 [   6   35   46   78 1029  240   41    5]
 [  34   84  143  123  257 1502  293   30]
 [   2    5   23   30   75  572  992   88]
 [   0    2    2   11    7   64  243  205]]


In [None]:
# Run inference on the test split. The label ids returned by predict() are not
# used: gold labels are read from test_df below, so they get a throwaway name
# instead of being bound to `labels` and immediately overwritten.
preds, _label_ids, metrics = trainer.predict(tokenized_datasets['test'])

# Build the export columns in one vectorised pass each (the previous loop
# rebuilt full Python lists from test_df on every iteration).
# Labels and predictions are shifted by +1 to undo the 0-based class ids.
texts = test_df['text'].tolist()
labels = (test_df['label'] + 1).tolist()
predictions = (np.argmax(preds, axis=1) + 1).tolist()  # highest-scoring class per row

# NOTE(review): absolute, machine-specific output path -- consider deriving it
# from a configurable directory.
out_xlsx = "/content/drive/MyDrive/MBZ/Thesis/content/content_test_pred.xlsx"

# pandas raises ValueError here if the three columns differ in length.
final_df = pd.DataFrame({
    'text': texts,
    'label': labels,
    'prediction': predictions,
})
final_df.to_excel(out_xlsx, index=False)

              precision    recall  f1-score   support

           0     0.5699    0.5263    0.5473       209
           1     0.4497    0.6818    0.5420       374
           2     0.5145    0.5722    0.5419       526
           3     0.6123    0.6571    0.6339       560
           4     0.7441    0.7018    0.7223      1214
           5     0.5578    0.5969    0.5767      1965
           6     0.5865    0.5554    0.5706      1678
           7     0.6412    0.4191    0.5069       742

    accuracy                         0.5919      7268
   macro avg     0.5845    0.5889    0.5802      7268
weighted avg     0.5999    0.5919    0.5917      7268

[[ 110   65   13    0    2   19    0    0]
 [  26  255   42    7    6   38    0    0]
 [  27  106  301   14   12   64    2    0]
 [   7   30   46  368   45   54    7    3]
 [   3   35   40   68  852  173   33   10]
 [  18   71  124  102  156 1173  287   34]
 [   2    4   14   27   63  509  932  127]
 [   0    1    5   15    9   73  328  311]]
