In [6]:
import pandas as pd
import numpy as np

# Import Data 

In [289]:
ZAEBUC_COR = pd.read_csv('ZAEBUC-v1.0/AR-all.extracted.corrected.analyzed.corrected-FINAL.tsv', encoding='utf_8',sep='\t')
ZAEBUC_aligned = pd.read_csv('ZAEBUC-v1.0/AR-all.alignment-FINAL.tsv', encoding='utf_8',sep='\t')

In [154]:
## group raw essays
ZAEBUC_RAW = ZAEBUC_aligned[['Document', 'Raw']].dropna( subset = ['Raw']).groupby('Document').agg({'Raw': lambda x: ' '.join(np.array(x, dtype=str))})
ZAEBUC_RAW = ZAEBUC_RAW.rename_axis('Document').reset_index()

In [None]:
import xmltodict
docs = ZAEBUC_COR['Document'].apply(lambda x: x if x.startswith('<') else np.nan).dropna()

grades = []
word_count = []

for xml in docs:
    if xml != "</doc>":
        doc = xmltodict.parse(xml)
        grades.append(doc["doc"]["@CEFR"])
        word_count.append(doc["doc"]["@word_count"])

# Working with Raw Data:

## Using doc2vec:

In [455]:
from camel_tools.disambig.mle import MLEDisambiguator
from camel_tools.tokenizers.morphological import MorphologicalTokenizer

# The tokenizer expects pre-tokenized text
sentence = 'فتنفست الصعداء'.split()

# Load a pretrained disambiguator to use with a tokenizer
mle = MLEDisambiguator.pretrained('calima-msa-r13')


# By specifying `split=True`, the morphological tokens are output as seperate
# strings.
tokenizer = MorphologicalTokenizer(mle, scheme='d3tok', split=True)
tokens = tokenizer.tokenize(sentence)
print(tokens)



['ف+', 'تنفست', 'ال+', 'صعداء']


In [456]:
tokenizer.tokenize('DDE فتنفست '.split())

['DDE', 'ف+', 'تنفست']

In [115]:
# tokenize raw data and add it to the dataframe
ZAEBUC_RAW['Tokenized'] = ZAEBUC_RAW['Raw'].apply(lambda x: tokenizer.tokenize(x.split()))

In [None]:
#get doc2vec vectors for raw data
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

#tag the documents
tagged_data = [TaggedDocument(words=doc, tags=[str(i)]) for i, doc in enumerate(ZAEBUC_RAW['Tokenized'])]

#train the model
max_epochs = 100
vec_size = 20
alpha = 0.025

model = Doc2Vec(vector_size=vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                dm =1)

model.build_vocab(tagged_data)

for epoch in range(max_epochs):
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.epochs)
    
    # decrease the learning rate
    model.alpha -= 0.0002
    
    # fix the learning rate, no decay
    model.min_alpha = model.alpha

model.save("d2v.model")

In [116]:
model= Doc2Vec.load("d2v.model")

# get vectors for the raw data
vectors = []
for i in range(len(ZAEBUC_RAW)):
    vectors.append(model.docvecs[i])

#add vectors to dataframe
ZAEBUC_RAW['Doc2Vec Embeddings'] = vectors



  vectors.append(model.docvecs[i])


In [54]:
# get the pair with highest cosine similarity
from sklearn.metrics.pairwise import cosine_similarity
from itertools import combinations

# get all combinations of vectors
comb = combinations(vectors, 2)
# get the pair with highest cosine similarity
max_cos = 1
for i in list(comb):
    cos = cosine_similarity([i[0]], [i[1]])
    if cos > max_cos:
        max_cos = cos
        pair = i

for k in range(len(vectors)):
    if(all(vectors[k] == pair[0])):
        index1 = k
    if(all(vectors[k] == pair[1])):
        index2 = k

# get the raw data of the pair
raw1 = ZAEBUC_RAW['Raw'][index1]
raw2 = ZAEBUC_RAW['Raw'][index2]

  raw1 = ZAEBUC_RAW['Raw'][index1]
  raw2 = ZAEBUC_RAW['Raw'][index2]


## Using Bert Embeddings:

In [198]:
#split the raw data into sentences with delimiter . or ،

ZAEBUC_RAW['Sentences'] = ZAEBUC_RAW['Raw'].apply(lambda x: x.replace('،', '.').split('.'))




In [199]:
import xmltodict
docs = ZAEBUC_COR['Document'].apply(lambda x: x if x.startswith('<') else np.nan).dropna()

grades = []
word_count = []

for xml in docs:
    if xml != "</doc>":
        doc = xmltodict.parse(xml)
        grades.append(doc["doc"]["@CEFR"])
        word_count.append(doc["doc"]["@word_count"])
ZAEBUC_RAW['grade'] = grades

In [218]:
def essay2chunks(essay: str):
    sents = []
    split_essay = essay.replace('.', '.+').split('+')
    if('' in split_essay):
        split_essay.remove('')
    for sent in split_essay:
        if(len(sent.split())<50):
            sents.append(sent)
        else:
            for i in range(len(sent.split())//50):
                sents.append(' '.join(sent.split()[i*50:(i+1)*50]))
            sents.append(' '.join(sent.split()[(i+1)*50:]))
    return sents

In [219]:
ZAEBUC_RAW['3 Sentences'] = ZAEBUC_RAW['Sentences'].apply(lambda x: [x[i:i+3]  for i in range(0, len(x), 3)])


In [220]:
ZAEBUC_RAW['chunks'] = ZAEBUC_RAW['Raw'].apply(lambda x: essay2chunks(x))
ZAEBUC_RAW

Unnamed: 0,Document,Raw,Sentences,grade,3 Sentences,chunks
0,AR-030-268469,وسائل التواصل الاجتماعي لها اضرار و فوائد كثير...,[وسائل التواصل الاجتماعي لها اضرار و فوائد كثي...,B1,[[وسائل التواصل الاجتماعي لها اضرار و فوائد كث...,[وسائل التواصل الاجتماعي لها اضرار و فوائد كثي...
1,AR-030-386369,تعد وسائل التواصل الاجتماعي من اكبر المؤثرات ع...,[تعد وسائل التواصل الاجتماعي من اكبر المؤثرات ...,B2,[[تعد وسائل التواصل الاجتماعي من اكبر المؤثرات...,[تعد وسائل التواصل الاجتماعي من اكبر المؤثرات ...
2,AR-030-81027,قام انتشارالوساءل للتواصل الاجتماعية بشكل كبير...,[قام انتشارالوساءل للتواصل الاجتماعية بشكل كبي...,A2,[[قام انتشارالوساءل للتواصل الاجتماعية بشكل كب...,[قام انتشارالوساءل للتواصل الاجتماعية بشكل كبي...
3,AR-030-81757,وسائل التواصل الاجتماعي لقد تطورت وسائل المعرف...,[وسائل التواصل الاجتماعي لقد تطورت وسائل المعر...,B2,[[وسائل التواصل الاجتماعي لقد تطورت وسائل المع...,[وسائل التواصل الاجتماعي لقد تطورت وسائل المعر...
4,AR-030-83625,من اشهر وساءل الاتصال بالآخرين هي الاجتماعية،,"[من اشهر وساءل الاتصال بالآخرين هي الاجتماعية, ]",Unassessable,[[من اشهر وساءل الاتصال بالآخرين هي الاجتماعية...,[من اشهر وساءل الاتصال بالآخرين هي الاجتماعية،]
...,...,...,...,...,...,...
209,AR-130-99351,ظهور الأجهزة الإلكترونية أدى إلى ظهور وسائل ال...,[ظهور الأجهزة الإلكترونية أدى إلى ظهور وسائل ا...,B2,[[ظهور الأجهزة الإلكترونية أدى إلى ظهور وسائل ...,[ظهور الأجهزة الإلكترونية أدى إلى ظهور وسائل ا...
210,AR-130-99438,وسائل التواصل الاجتماعي منذ انتشار وسائل التوا...,[وسائل التواصل الاجتماعي منذ انتشار وسائل التو...,B2,[[وسائل التواصل الاجتماعي منذ انتشار وسائل الت...,[وسائل التواصل الاجتماعي منذ انتشار وسائل التو...
211,AR-130-99442,وسائل التواصل الإجتماعي .إنّ التواصل الإجتماعي...,"[وسائل التواصل الإجتماعي , إنّ التواصل الإجتما...",B2,"[[وسائل التواصل الإجتماعي , إنّ التواصل الإجتم...","[وسائل التواصل الإجتماعي ., إنّ التواصل الإجتم..."
212,AR-130-99590,التسامح أمر مهم جداً يجب على الفرد اخذه بجدية،...,[التسامح أمر مهم جداً يجب على الفرد اخذه بجدية...,B1,[[التسامح أمر مهم جداً يجب على الفرد اخذه بجدي...,[التسامح أمر مهم جداً يجب على الفرد اخذه بجدية...


In [167]:
df = pd.DataFrame(columns=['Document', '3 Sentences', 'grade'])
for i in range(len(ZAEBUC_RAW)):
    df = pd.concat((df,pd.DataFrame({'Document': [ZAEBUC_RAW['Document'][i]]*len(ZAEBUC_RAW['3 Sentences'][i]), '3 Sentences': ZAEBUC_RAW['3 Sentences'][i],
                                     'grade': [ZAEBUC_RAW['grade'][i]]*len(ZAEBUC_RAW['3 Sentences'][i])})))

In [195]:
df['3 Sentences'] = df['3 Sentences'].apply(lambda x: ' '.join(x))
# drop empty sentences
df = df[df['3 Sentences'] != '']

In [224]:
#make a dataframe for chunked data and grades
chunks_df = pd.DataFrame(columns=['Document', 'chunks', 'grade'])
for i in range(len(ZAEBUC_RAW)):
    chunks_df = pd.concat((chunks_df,pd.DataFrame({'Document': [ZAEBUC_RAW['Document'][i]]*len(ZAEBUC_RAW['chunks'][i]), 'chunks': ZAEBUC_RAW['chunks'][i],
                                     'grade': [ZAEBUC_RAW['grade'][i]]*len(ZAEBUC_RAW['chunks'][i])})))

In [416]:
#Get the Arabert embeddings for the chunked data
import torch
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv2")
model = AutoModel.from_pretrained("aubmindlab/bert-base-arabertv2")

def get_embeddings(text):
    input_ids = torch.tensor(tokenizer.encode(text)).unsqueeze(0)  # Batch size 1
    outputs = model(input_ids)
    last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
    return last_hidden_states.detach().numpy()

chunks_df['Arabert Embeddings'] = chunks_df['chunks'].apply(lambda x: get_embeddings(x))
chunks_df['Arabert Embeddings'] = chunks_df['Arabert Embeddings'].apply(lambda x: x[0][0])

Some weights of the model checkpoint at aubmindlab/bert-base-arabertv2 were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Working with Human Annotated Data for a Baseline

In [407]:
corrected_essays= ZAEBUC_COR[['Document', 'Word']].dropna( subset = ['Word']).groupby('Document').agg({'Word': lambda x: ' '.join(np.array(x, dtype=str))})
corrected_essays = corrected_essays.rename_axis('Document').reset_index()
corrected_essays.rename(columns={'Word': 'Corrected essay'}, inplace=True)

## Feature Extraction:

### Using AraBert Embeddings


In [437]:
#Split corrected data into chunks and get the Arabert embeddings
df = corrected_essays.dropna()

In [439]:
df['Sentences'] = corrected_essays['Corrected essay'].apply(lambda x: essay2chunks(x))
df['grade'] = ZAEBUC_RAW['grade']

In [442]:
sent_df = pd.DataFrame(columns=['Document', 'sent', 'grade'])
for i in range(len(df)):
    sent_df = pd.concat((sent_df,pd.DataFrame({'Document': [df['Document'][i]]*len(df['Sentences'][i]), 'sent': df['Sentences'][i],
                                     'grade': [df['grade'][i]]*len(df['Sentences'][i])})))

sent_df['Arabert Embeddings'] = sent_df['sent'].apply(lambda x: get_embeddings(x))
sent_df['Arabert Embeddings'] = sent_df['Arabert Embeddings'].apply(lambda x: x[0][0])


In [453]:
# drop empty or nan sentences
sent_df = sent_df[sent_df['sent'] != '']
sent_df = sent_df[sent_df['sent'].notna()]

#drop sentnces with less than 3 words
sent_df = sent_df[sent_df['sent_len'] > 3]


### Using doc2vec

In [303]:
df = corrected_essays.rename_axis('Document').reset_index()
df.rename(columns={'Word': 'Corrected essays'}, inplace=True)

In [304]:
## add grades to the dataframe for each document
df = pd.merge(df, ZAEBUC_RAW[['Document', 'grade']], on='Document')


In [305]:
# tokenize and get embeddings for the corrected essays
df['Tokenized'] = df['Corrected essays'].apply(lambda x: tokenizer.tokenize(x.split()))

#tag the documents
tagged_data = [TaggedDocument(words=doc, tags=[str(i)]) for i, doc in enumerate(df['Tokenized'])]

#train the model
max_epochs = 100
vec_size = 20
alpha = 0.025

model = Doc2Vec(vector_size=vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                dm =1)

model.build_vocab(tagged_data)

for epoch in range(max_epochs):
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.epochs)
    
    # decrease the learning rate
    model.alpha -= 0.0002
    
    # fix the learning rate, no decay
    model.min_alpha = model.alpha

model.save("d2v.model")
# get vectors for the corrected essays
vectors = []
for i in range(len(df)):
    vectors.append(model.docvecs[i])

#add vectors to dataframe
df['Doc2Vec Embeddings'] = vectors

  vectors.append(model.docvecs[i])


In [306]:
df = df[['Document', 'grade', 'Doc2Vec Embeddings']]

### Using TF-IDF

In [290]:
tokenized_essays_df = ZAEBUC_COR[['Document','Auto_Tokenization']].dropna(subset='Auto_Tokenization').groupby(by = 'Document').agg({'Auto_Tokenization': ' '.join})

In [291]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Applying TFIDF
vectorizer = TfidfVectorizer(ngram_range = (1, 1) )
doc2vec = vectorizer.fit_transform(tokenized_essays_df['Auto_Tokenization'])
doc2vec = (doc2vec.toarray())
print("\n\nScores : \n", doc2vec)



Scores : 
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [292]:
df = pd.DataFrame({'Document': tokenized_essays_df.index, 'TFIDF Embeddings': doc2vec.tolist()})

### Other Features:

In [307]:
df['Auto_POS']= np.array(ZAEBUC_COR[['Document', 'Auto_POS']].dropna(subset= 'Auto_POS').groupby(by = 'Document', as_index = True).agg({'Auto_POS': ' '.join})['Auto_POS'])
df['Auto_POS'] = df['Auto_POS'].dropna().apply(lambda x: x.replace('+', ' '))

In [311]:
error_rate_df = ZAEBUC_aligned[['Document','Operation']].groupby('Document').aggregate({'Operation': (lambda x: 1- np.sum(x=='NO_CHANGE')/len(x) )}).rename(columns = {'Operation':'error_rate'})
df = pd.merge(df, error_rate_df, on='Document')

In [312]:
# Applying TFIDF
vectorizer = TfidfVectorizer(ngram_range = (1, 2))
pos2vec = vectorizer.fit_transform(df['Auto_POS'])
pos2vec = (pos2vec.toarray())
print("\n\nScores : \n", pos2vec)



Scores : 
 [[0.19436809 0.         0.07920732 ... 0.         0.         0.02296755]
 [0.29714894 0.0171133  0.08351155 ... 0.         0.         0.        ]
 [0.32720461 0.         0.08333737 ... 0.         0.         0.        ]
 ...
 [0.1976791  0.04306373 0.07004921 ... 0.         0.0155246  0.        ]
 [0.2220965  0.         0.0808098  ... 0.         0.02865505 0.        ]
 [0.26979631 0.02503347 0.07635085 ... 0.         0.         0.        ]]


In [315]:
df['POS2Vec Embeddings'] = pos2vec.tolist()
df = df.drop(columns = ['Auto_POS'])

In [318]:
## add word count to the dataframe
df['word count'] = corrected_essays['Corrected essay'].apply(lambda x: len(x.split()))

In [323]:
X = np.concatenate((np.array(df['Doc2Vec Embeddings'].tolist()), np.array(df['error_rate'].tolist()).reshape(-1,1), np.array(df['POS2Vec Embeddings'].tolist()), np.array(df['word count'].tolist()).reshape(-1,1)), axis = 1)

In [326]:
grades = df['grade']

## Modelling

## Using entire essays

### Using Doc2Vec

In [374]:
# SVM classifier for X and grades

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix

X_train, X_test, y_train, y_test = train_test_split(X, grades, test_size = 0.20)

svclassifier = SVC(kernel='linear')
svclassifier.fit(X_train, y_train)

y_pred = svclassifier.predict(X_test)

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))



[[ 1  0  0  0  0]
 [ 0  1  2  0  0]
 [ 0  0 14  2  0]
 [ 0  0 11 10  0]
 [ 0  0  0  1  1]]
              precision    recall  f1-score   support

          A0       1.00      1.00      1.00         1
          A2       1.00      0.33      0.50         3
          B1       0.52      0.88      0.65        16
          B2       0.77      0.48      0.59        21
          C1       1.00      0.50      0.67         2

    accuracy                           0.63        43
   macro avg       0.86      0.64      0.68        43
weighted avg       0.71      0.63      0.62        43



In [375]:
grades = list(map(lambda x:'A0' if (x == 'Unassessable') else x, grades))


In [376]:
grades_to_num = { 'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6, 'A0': 0}
num_grades = list(map(lambda x: grades_to_num[x], grades))

In [377]:
X_train, X_test, y_train, y_test = train_test_split(X, num_grades, test_size = 0.20, stratify = num_grades)

In [378]:
## given a dataset X and grades y, return a dataset of pair-wise differences and labels (+,-) 
def to_pairs(X, y):
    paired_X = list()
    paired_y = list()
    for i in range(len(X)):
        for k in range(i+1, len(X), 1):
                paired_X.append(np.subtract(X[i], X[k]))
                paired_y.append(y[i] > y[k])
    return paired_X, paired_y

In [379]:
X_train_diff, y_train_diff = to_pairs(X_train, y_train)

In [380]:
X_test_diff, y_test_diff = to_pairs(X_test, y_test)

In [381]:
from sklearn.model_selection import GridSearchCV
# param_grid = {'C': [0.1,1, 10, 100], 'kernel' : ['linear']}

# grid = GridSearchCV(SVC(),param_grid,refit=True,verbose=2)
# grid.fit(X_train_diff,y_train_diff)

svclassifier = SVC(kernel='linear')
svclassifier.fit(X_train_diff, y_train_diff)

y_pred = svclassifier.predict(X_test_diff)

print(confusion_matrix(y_test_diff,y_pred))
print(classification_report(y_test_diff,y_pred))



[[504 143]
 [ 99 157]]
              precision    recall  f1-score   support

       False       0.84      0.78      0.81       647
        True       0.52      0.61      0.56       256

    accuracy                           0.73       903
   macro avg       0.68      0.70      0.69       903
weighted avg       0.75      0.73      0.74       903



### Final Mapping to Grades with a Linear Classifier (SVC)

In [387]:
svm_model_linear = SVC(kernel = 'linear', C = 1).fit(svc_fitted_X_train, y_train) 
svm_predictions = svm_model_linear.predict(svc_fitted_X_test)

# model accuracy for X_test   
accuracy = svm_model_linear.score(svc_fitted_X_test, y_test) 
print('accuracy= {}'.format(accuracy))
# creating a confusion matrix 
cm = confusion_matrix(y_test, svm_predictions)
print(cm)

accuracy= 0.5813953488372093
[[ 0  0  1  0  0]
 [ 1  0  1  0  0]
 [ 2  0 14  6  0]
 [ 0  0  7  9  0]
 [ 0  0  0  0  2]]


In [388]:
print(classification_report(y_test,svm_predictions))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           2       0.00      0.00      0.00         2
           3       0.61      0.64      0.62        22
           4       0.60      0.56      0.58        16
           5       1.00      1.00      1.00         2

    accuracy                           0.58        43
   macro avg       0.44      0.44      0.44        43
weighted avg       0.58      0.58      0.58        43



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Using Sentences

In [None]:
X = 

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix

X_train, X_test, y_train, y_test = train_test_split(X, grades, test_size = 0.20)

svclassifier = SVC(kernel='linear')
svclassifier.fit(X_train, y_train)

y_pred = svclassifier.predict(X_test)

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))