In [None]:
import numpy as np
import pandas as pd

# Import Zaebuc data and split into training and testing

In [None]:
!pip install xmltodict

In [None]:
import xmltodict

#file_path = path to Zaebuc/AR-all.extracted.corrected.analyzed.corrected-FINAL.tsv
all_extracted = pd.read_csv(file_path, sep='\t')

# Keep only the rows whose 'Document' cell is an XML tag. Empty cells are read
# as NaN (a float), so check the type before calling .startswith on them.
docs = all_extracted.loc[
    all_extracted['Document'].apply(lambda x: isinstance(x, str) and x.startswith('<')),
    'Document',
]

# Per-document metadata, in file order.
grades = []
word_count = []

for xml in docs:
    # Closing tags carry no attributes; only the opening <doc ...> tag is parsed.
    if xml != "</doc>":
        doc = xmltodict.parse(xml)
        grades.append(doc["doc"]["@CEFR"])
        word_count.append(doc["doc"]["@word_count"])

In [None]:
#file_path = path to Zaebuc/AR-all.alignment-FINAL.tsv
# NOTE: file_path must be re-assigned to the alignment file before running this
# cell; the same variable name is used for the extracted file in the cell above.
cor_raw_aligned = pd.read_csv(file_path, sep='\t')

In [None]:
# Rebuild every essay as one string: drop alignment rows without a raw token,
# then join the remaining 'Raw' tokens of each document with single spaces.
raw_essays = (
    cor_raw_aligned
    .dropna(subset=['Raw'])
    .groupby('Document')
    .agg({'Raw': ' '.join})
)
# grades is attached positionally -- assumes its order matches the grouped
# (sorted) Document index; TODO confirm against the source files.
raw_essays['grade'] = grades

from sklearn.model_selection import train_test_split
# Stratified 80/20 split on the CEFR grade.
raw_train, raw_test = train_test_split(
    raw_essays,
    test_size=0.2,
    random_state=42,
    stratify=raw_essays['grade'],
)

# Import original and augmented essays

In [None]:
# Optional: reload the original essays from disk instead of rebuilding them above.
# raw_essays = pd.read_csv('raw_essays.csv')
# Augmented essays; the first CSV column is used as the index.
augmented_essays = pd.read_csv('augmented_essays(2).csv', index_col=0)

In [None]:
# Remove essays graded 'Unassessable'; the label_map used for training has no entry for them.
raw_essays = raw_essays[raw_essays['grade']!= 'Unassessable']

In [None]:
from sklearn.model_selection import train_test_split
# Redo the stratified 80/20 split on the filtered essays; this overwrites the
# raw_train / raw_test produced before 'Unassessable' essays were removed.
raw_train, raw_test = train_test_split(raw_essays, test_size=0.2, random_state=42, stratify = raw_essays['grade'])

In [None]:
# Training pool = augmented essays (labelled with their target grade) + original training essays.
X_train = pd.concat(
    (
        augmented_essays
        .rename({'to_grade': 'grade'}, axis=1)  # the grade the essay was augmented *to* is its label
        .drop('from_grade', axis=1),
        raw_train,
    )
)

# Train Arabic-BERT Model

In [None]:
%pip install transformers==4.16
%pip install arabert
%pip install farasapy
%pip install pyarabic==0.6.14
%pip install sentencepiece==0.1.96

In [None]:
import torch
import random
import matplotlib.pyplot as plt
import copy

from arabert.preprocess import ArabertPreprocessor
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, f1_score, precision_score,
                             recall_score)
from torch.utils.data import DataLoader, Dataset
from transformers import (AutoConfig, AutoModelForSequenceClassification,
                          AutoTokenizer, BertTokenizer, Trainer,
                          TrainingArguments)
from transformers.data.processors.utils import InputFeatures

In [None]:
class ClassificationDataset(Dataset):
    """Dataset that tokenizes a text on access and pairs it with its integer label."""

    def __init__(self, text, target, model_name, max_len, label_map):
        """
        Args:
        text (List[str]): List of the training text
        target (List[str]): List of the training labels
        model_name (str): Name or path used to load the tokenizer.
        max_len (int): Maximum sentence length
        label_map (dict): Maps each value found in `target` to an integer class id
        """
        # Zero-argument super() binds to this instance; super(ClassificationDataset)
        # alone is an unbound proxy and never reaches the parent initialiser.
        super().__init__()
        self.text = text
        self.target = target
        self.tokenizer_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.max_len = max_len
        self.label_map = label_map

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.text)

    def __getitem__(self, item):
        """Tokenize the text at position `item` and return it with its mapped label."""
        text = str(self.text[item])
        # Collapse every run of whitespace into a single space.
        text = " ".join(text.split())

        # Pad/truncate so every example has exactly max_len tokens.
        inputs = self.tokenizer(
            text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True
        )
        return InputFeatures(**inputs, label=self.label_map[self.target[item]])

In [None]:
# Choose the pretrained checkpoint. The AraBERT v2 variant and its preprocessor
# are kept below as commented-out alternatives.
from arabert.preprocess import ArabertPreprocessor
# model_name = 'aubmindlab/bert-base-arabertv2'
model_name = 'asafaya/bert-base-arabic'
# arabic_prep = ArabertPreprocessor(model_name)
# Standalone tokenizer for the chosen checkpoint.
tok = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Hold out 10% of the original training essays for validation (stratified by grade).
raw_train, raw_val = train_test_split(raw_train, test_size=0.1, random_state=42, stratify = raw_train['grade'])

# X_train was assembled before this split, so it still contains the essays that
# were just moved into raw_val. Rebuild it from the reduced raw_train so the
# validation essays are not trained on.
X_train = pd.concat((augmented_essays.rename({'to_grade': 'grade'}, axis = 1).drop('from_grade', axis = 1), raw_train))
# NOTE(review): augmented essays derived from validation essays (if any) would
# still leak into training -- confirm how augmented_essays was generated.

In [None]:
max_len = 512
# A1/A2 are merged into class 0 and C1/C2 into class 3, giving four classes.
label_map = {'A1' : 0, 'A2': 0, 'B1':1, 'B2':2, 'C1':3,'C2': 3 }

# Build the train / validation / test datasets the same way from their frames.
train_dataset, val_dataset, test_dataset = (
    ClassificationDataset(
        frame['Raw'].tolist(),
        frame['grade'].tolist(),
        model_name,
        max_len,
        label_map,
    )
    for frame in (X_train, raw_val, raw_test)
)

In [None]:
# path_to_best_model = <set this to the checkpoint to load before running>
def model_init():
    """Load a 4-class sequence classifier from ``path_to_best_model``."""
    classifier = AutoModelForSequenceClassification.from_pretrained(
        path_to_best_model,
        return_dict=True,
        num_labels=4,
    )
    return classifier

In [None]:
# inverse_label_map = {0:'Unassessable', 1:'A1', 2:'A2', 3:'B1',4: 'B2',5: 'C1',6:'C2'}
# Class id -> display name for the four merged classes.
inverse_label_map = {0: 'A', 1: 'B1', 2: 'B2', 3: 'C'}

def compute_metrics(p): #p should be of type EvalPrediction
    """Print a classification report and confusion matrix, and return summary scores.

    Args:
        p: object exposing `predictions` (per-class scores, one row per example)
           and `label_ids` (gold class ids).

    Returns:
        dict with 'macro_f1', 'accuracy', 'macro_precision' and 'macro_recall'.
    """
    gold = p.label_ids
    predicted = np.argmax(p.predictions, axis=1)
    assert len(predicted) == len(gold)

    # Human-readable report uses the class names; the matrix uses the raw ids.
    gold_names = [inverse_label_map[class_id] for class_id in gold]
    predicted_names = [inverse_label_map[class_id] for class_id in predicted]
    print(classification_report(gold_names, predicted_names))
    print(confusion_matrix(gold, predicted))

    macro = {
        name: scorer(gold, predicted, average='macro')
        for name, scorer in (
            ('f1', f1_score),
            ('precision', precision_score),
            ('recall', recall_score),
        )
    }
    return {
        'macro_f1': macro['f1'],
        'accuracy': accuracy_score(gold, predicted),
        'macro_precision': macro['precision'],
        'macro_recall': macro['recall'],
    }

In [None]:
def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch (CPU and CUDA) generators and make cuDNN deterministic."""
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Deterministic kernels only, and no auto-tuner picking algorithms per run.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

In [None]:
training_args = TrainingArguments(
    output_dir="./14-12",
    seed=25,
    # --- optimisation ---
    learning_rate=1e-5,
    adam_epsilon=1e-8,
    warmup_ratio=0,
    num_train_epochs=10,
    fp16=True,  # enable this when using V100 or T4 GPU
    # --- batching ---
    per_device_train_batch_size=32,  # up to 64 on 16GB with max len of 128
    gradient_accumulation_steps=2,  # use this to scale batch size without needing more memory
    per_device_eval_batch_size=128,
    # --- evaluation and checkpointing, both once per epoch ---
    do_eval=True,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,  # reload the best checkpoint when training finishes
    metric_for_best_model='accuracy',
    greater_is_better=True,
)

# Seed the global generators with the same seed the training arguments carry.
set_seed(training_args.seed)

In [None]:
# Essay-level trainer: validates on val_dataset and reports compute_metrics.
trainer = Trainer(
    args=training_args,
    model=model_init(),
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [None]:
# Run training for the essay-level model configured above.
trainer.train()

# Train Arabic-BERT on 30-Word Chunks and Aggregate to Essay Level

In [None]:
# Renumber the rows 0..n-1. reset_index moves the old index into a column that is
# then dropped by name -- assumes the old index is unnamed so that column is
# called 'index'; TODO confirm. Not idempotent: re-running this cell changes the result.
X_train = X_train.reset_index().drop('index', axis = 1)

In [None]:
# Split every training essay into consecutive chunks of at most `chunk_size`
# words. Each chunk keeps the Document id and grade of the essay it came from.
chunk_size = 30

X_train['split'] = [x.split() for x in X_train['Raw'].tolist()]

# Collect plain records and build the frame once instead of concatenating in a loop.
train_records = []
for document, words, grade in zip(X_train['Document'], X_train['split'], X_train['grade']):
    # Slice the words of *this* essay; the final chunk holds the remainder and
    # no empty chunk is produced when the length is a multiple of chunk_size.
    for start in range(0, len(words), chunk_size):
        train_records.append({
            'Document': document,
            'chunk': ' '.join(words[start:start + chunk_size]),
            'grade': grade,
        })

# 'chunk' already holds the joined text, so no separate join step is needed.
chunks_df = pd.DataFrame(train_records, columns=['Document', 'chunk', 'grade'])

In [None]:
# Inspect the chunk table (columns: Document, chunk, grade).
chunks_df

In [None]:
# Stratified 90/10 split of the chunks by grade.
# NOTE(review): the split is made per chunk, so chunks of the same Document can
# land in both chunks_train and chunks_val -- consider a group-aware split by 'Document'.
chunks_train, chunks_val = train_test_split(chunks_df, test_size=0.1, random_state=42, stratify = chunks_df['grade'])

In [None]:
# Chunks are short, so a much smaller token budget than for whole essays is used.
max_len = 60
# label_map = {'Unassessable': 0, 'A2': 1, 'B1':2, 'B2':3, 'C1':4}
train_dataset = ClassificationDataset(
    text=chunks_train['chunk'].tolist(),
    target=chunks_train['grade'].tolist(),
    model_name=model_name,
    max_len=max_len,
    label_map=label_map,
)
val_dataset = ClassificationDataset(
    text=chunks_val['chunk'].tolist(),
    target=chunks_val['grade'].tolist(),
    model_name=model_name,
    max_len=max_len,
    label_map=label_map,
)

In [None]:
def model_init():
    """Load the pretrained `model_name` checkpoint with one output per class in `label_map`."""
    # The classifier head must match the label space: label_map (and
    # inverse_label_map) define four class ids, so an extra fifth output could
    # yield a prediction that cannot be decoded back to a grade.
    num_labels = len(set(label_map.values()))
    return AutoModelForSequenceClassification.from_pretrained(model_name, return_dict=True, num_labels=num_labels)

In [None]:
# Chunk-level trainer; reuses the training arguments defined for the essay-level run.
trainer = Trainer(
    args=training_args,
    model=model_init(),
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [None]:
# Run training for the chunk-level model.
trainer.train()

### Map chunk predictions back to final essay grades

In [None]:
# Predicted class id per validation chunk: argmax over the last axis of the model outputs.
chunks_val['preds'] = trainer.predict(val_dataset).predictions.argmax(-1)

In [None]:
# Document-level preview (displayed only, not stored): the highest predicted
# class id among a document's chunks, next to the chunk grades passed through unchanged.
chunks_val.groupby('Document').agg({'preds': lambda x: np.max(x), 'grade': lambda x: x})

In [None]:
# chunks_val['grade'] =chunks_val['grade'].apply(lambda x: np.array(x).reshape(-1,1)[0])
# chunks_val['preds'] =chunks_val['preds'].apply(lambda x: inverse_label_map[x])

In [None]:
# Split every test essay into consecutive chunks of at most `chunk_size` words.
# Each chunk keeps the Document id (the raw_test index) and grade of its essay.
chunk_size = 30

raw_test['split'] =raw_test['Raw'].apply(lambda x: x.split())

# Collect plain records and build the frame once instead of concatenating in a loop.
test_records = []
for document, words, grade in zip(raw_test.index, raw_test['split'], raw_test['grade']):
    # Stepping through the essay's own words avoids a stale/undefined inner index
    # for essays shorter than one chunk and never emits an empty trailing chunk.
    for start in range(0, len(words), chunk_size):
        test_records.append({
            'Document': document,
            # Store the chunk as text: the dataset calls str() on each item, and
            # a list of words would be tokenized as its Python repr.
            'chunk': ' '.join(words[start:start + chunk_size]),
            'grade': grade,
        })

chunks_df_test = pd.DataFrame(test_records, columns=['Document', 'chunk', 'grade'])

In [None]:
# Chunk-level test set, built with the same tokenizer settings as the chunk training data.
test_dataset = ClassificationDataset(
    text=chunks_df_test['chunk'].tolist(),
    target=chunks_df_test['grade'].tolist(),
    model_name=model_name,
    max_len=max_len,
    label_map=label_map,
)

In [None]:
# Predicted class id per test chunk: argmax over the last axis of the model outputs.
chunks_df_test['prediction'] = trainer.predict(test_dataset).predictions.argmax(-1)

In [None]:
# One row per document. The essay prediction is the mean chunk class id rounded
# to the nearest integer (floor(mean + 0.5), so halves round up) and decoded via
# inverse_label_map; the float result still matches the integer dictionary keys.
# 'grade' is passed through unchanged here and reduced to one value in the next cells.
chunks_df_test = chunks_df_test.groupby('Document').agg({'prediction': lambda x: inverse_label_map[np.floor(np.mean(x)+0.5)], 'grade': lambda x: x})

In [None]:
# Reshape each document's grades into a column and keep its first row, i.e. a
# one-element array holding the first chunk's grade.
chunks_df_test['grade'] =chunks_df_test['grade'].apply(lambda x: np.array(x).reshape(-1,1)[0])


In [None]:
# Unwrap the one-element array so 'grade' holds a single label per document.
chunks_df_test['grade'] = chunks_df_test['grade'].apply(lambda x: x[0])

In [None]:
# Predictions use the merged class names ('A', 'B1', 'B2', 'C') while 'grade'
# still holds the original CEFR grades ('A1', 'A2', ..., 'C2'). Collapse the gold
# grades through the same label_map -> inverse_label_map route so both sides of
# the report share one label set. Values not in label_map are left untouched.
gold_labels = chunks_df_test['grade'].map(
    lambda g: inverse_label_map[label_map[g]] if g in label_map else g
)
print(classification_report(gold_labels, chunks_df_test['prediction']))