In [1]:
import pandas as pd
import numpy as np

In [2]:
ZAEBUC_AR_COR = pd.read_csv('ZAEBUC-v1.0/AR-all.extracted.corrected.analyzed.corrected-FINAL.tsv', encoding='utf_8',sep='\t')
words_df = ZAEBUC_AR_COR[ZAEBUC_AR_COR['Word'].notna()]

In [3]:
ZAEBUC_AR_ALL = pd.read_csv('ZAEBUC-v1.0/AR-all.alignment-FINAL.tsv', encoding='utf_8',sep='\t')

In [114]:
ZAEBUC_AR_ALL


Unnamed: 0,Document,Raw,Corrected,Operation
0,AR-030-268469,وسائل,وسائل,NO_CHANGE
1,AR-030-268469,التواصل,التواصل,NO_CHANGE
2,AR-030-268469,الاجتماعي,الاجتماعي,NO_CHANGE
3,AR-030-268469,لها,لها,NO_CHANGE
4,AR-030-268469,اضرار,أضرار,EDIT
...,...,...,...,...
33761,AR-130-99787,المجتمع,المجتمع,NO_CHANGE
33762,AR-130-99787,و,,DELETE
33763,AR-130-99787,كيفية,وكيفية,EDIT
33764,AR-130-99787,إستعمالهم,استعمالهم,EDIT


In [4]:
ZAEBUC_AR_ALL['Operation'].unique()

array(['NO_CHANGE', 'EDIT', 'DELETE', 'INSERT'], dtype=object)

In [5]:
error_rate_df = ZAEBUC_AR_ALL[['Document','Operation']].groupby('Document').aggregate({'Operation': (lambda x: 1- np.sum(x=='NO_CHANGE')/len(x) )}).rename(columns = {'Operation':'error_rate'})

In [6]:
error_rate_df

Unnamed: 0_level_0,error_rate
Document,Unnamed: 1_level_1
AR-030-268469,0.455621
AR-030-386369,0.237179
AR-030-81027,0.585366
AR-030-81757,0.493927
AR-030-83625,0.428571
...,...
AR-130-99351,0.245283
AR-130-99438,0.059322
AR-130-99442,0.311475
AR-130-99590,0.126214


In [7]:
tokenized_essays_df = words_df[['Document','Auto_Tokenization']].groupby(by = 'Document').agg({'Auto_Tokenization': ' '.join})
tokenized_essays_df['Auto_Tokenization'] = tokenized_essays_df['Auto_Tokenization'].apply(lambda x : x.replace('+', ' '))

In [8]:
tokenized_essays_df

Unnamed: 0_level_0,Auto_Tokenization
Document,Unnamed: 1_level_1
AR-030-268469,وسائل التواصل الاجتماعي ل ها أضرار و فوائد كثي...
AR-030-386369,تعد وسائل التواصل الاجتماعي من أكبر المؤثرات ع...
AR-030-81027,قام انتشار وسائل التواصل الاجتماعي ب شكل كبير ...
AR-030-81757,وسائل التواصل الاجتماعي لقد تطورت وسائل المعرف...
AR-030-83625,من أشهر وسائل الاتصال ب الآخرين هي الاجتماعية .
...,...
AR-130-99351,ظهور الأجهزة الإلكترونية أدى إلى ظهور وسائل ال...
AR-130-99438,وسائل التواصل الاجتماعي منذ انتشار وسائل التوا...
AR-130-99442,وسائل التواصل الاجتماعي إن التواصل الاجتماعي ل...
AR-130-99590,التسامح أمر مهم جدا يجب على الفرد أخذ ه ب جدية...


In [98]:
# def tokenize(doc):
#     doc = doc.split(' ')
#     doc = tokenizer.tokenize(doc)
#     tokens = []
#     for word in doc:
#         word = word.replace('+_', ',').replace('_+',',').split(',')
#         for tok in word:
#             tokens.append(tok)
#     return tokens

In [9]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Applying TFIDF
vectorizer = TfidfVectorizer(ngram_range = (1, 1), min_df=2, max_df=0.9 )
doc2vec = vectorizer.fit_transform(tokenized_essays_df['Auto_Tokenization'])
doc2vec = (doc2vec.toarray())
print("\n\nScores : \n", doc2vec)



Scores : 
 [[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.10340822 0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


In [10]:
tokenized_essays_df['Auto_POS'] = words_df[['Document', 'Auto_POS']].groupby(by = 'Document', as_index = True).agg({'Auto_POS': ' '.join})
tokenized_essays_df['Auto_POS'] = tokenized_essays_df['Auto_POS'].apply(lambda x: x.replace('+', ' '))

In [12]:
# Applying TFIDF
vectorizer = TfidfVectorizer(ngram_range = (1, 3), min_df=2, max_df=0.9)
pos2vec = vectorizer.fit_transform(tokenized_essays_df['Auto_POS'])
pos2vec = (pos2vec.toarray())
print("\n\nScores : \n", pos2vec)



Scores : 
 [[0.         0.         0.         ... 0.         0.         0.        ]
 [0.03757249 0.         0.05357857 ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.09170559 0.         0.04359089 ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.04480017 0.         0.06388529 ... 0.         0.         0.        ]]


In [13]:
import xmltodict
docs = ZAEBUC_AR_COR['Document'].apply(lambda x: x if x.startswith('<') else np.nan).dropna()

grades = []
word_count = []

for xml in docs:
    if xml != "</doc>":
        doc = xmltodict.parse(xml)
        grades.append(doc["doc"]["@CEFR"])
        word_count.append(doc["doc"]["@word_count"])

In [14]:
X = np.concatenate((doc2vec,pos2vec, np.array(word_count).reshape(-1,1), np.array(error_rate_df['error_rate']).reshape(-1,1) ), axis = 1).astype(float)

In [20]:
# SVM classifier for X and grades

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix

X_train, X_test, y_train, y_test = train_test_split(X, grades, test_size = 0.20, stratify=grades, random_state = 30)

svclassifier = SVC(kernel='linear')
svclassifier.fit(X_train, y_train)

y_pred = svclassifier.predict(X_test)

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))



[[ 0  0  0  0  2]
 [ 1 16  4  0  1]
 [ 0  1 15  0  0]
 [ 0  0  2  0  0]
 [ 0  0  0  0  1]]
              precision    recall  f1-score   support

          A2       0.00      0.00      0.00         2
          B1       0.94      0.73      0.82        22
          B2       0.71      0.94      0.81        16
          C1       0.00      0.00      0.00         2
Unassessable       0.25      1.00      0.40         1

    accuracy                           0.74        43
   macro avg       0.38      0.53      0.41        43
weighted avg       0.75      0.74      0.73        43



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [37]:
# random forest classifier for X and grades

from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(n_estimators=20, criterion='gini', random_state=22, max_depth=5, min_samples_leaf=5)

rfc.fit(X_train, y_train)

rfc_pred = rfc.predict(X_test)

print(confusion_matrix(y_test,rfc_pred))
print(classification_report(y_test,rfc_pred))

[[ 0  0  1  0  0]
 [ 0  0  2  0  0]
 [ 0  0 20  2  0]
 [ 0  0 12  4  0]
 [ 0  0  2  0  0]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           2       0.00      0.00      0.00         2
           3       0.54      0.91      0.68        22
           4       0.67      0.25      0.36        16
           5       0.00      0.00      0.00         2

    accuracy                           0.56        43
   macro avg       0.24      0.23      0.21        43
weighted avg       0.52      0.56      0.48        43



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [38]:
# naive bayes classifier for X and grades

from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()

gnb.fit(X_train, y_train)

gnb_pred = gnb.predict(X_test)

print(confusion_matrix(y_test,gnb_pred))
print(classification_report(y_test,gnb_pred))

[[ 0  0  0  1  0]
 [ 0  0  1  1  0]
 [ 0  0 14  8  0]
 [ 0  0 12  4  0]
 [ 0  0  2  0  0]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           2       0.00      0.00      0.00         2
           3       0.48      0.64      0.55        22
           4       0.29      0.25      0.27        16
           5       0.00      0.00      0.00         2

    accuracy                           0.42        43
   macro avg       0.15      0.18      0.16        43
weighted avg       0.35      0.42      0.38        43



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [44]:
# neural network classifier for X and grades

from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(hidden_layer_sizes=(30,30,30))

mlp.fit(X_train, y_train)

mlp_pred = mlp.predict(X_test)

print(confusion_matrix(y_test,mlp_pred))
print(classification_report(y_test,mlp_pred))

[[ 1  0  0  0  0]
 [ 0  0  2  0  0]
 [ 1  0 20  1  0]
 [ 0  0 11  5  0]
 [ 0  0  1  1  0]]
              precision    recall  f1-score   support

           0       0.50      1.00      0.67         1
           2       0.00      0.00      0.00         2
           3       0.59      0.91      0.71        22
           4       0.71      0.31      0.43        16
           5       0.00      0.00      0.00         2

    accuracy                           0.60        43
   macro avg       0.36      0.44      0.36        43
weighted avg       0.58      0.60      0.54        43



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [21]:
grades = list(map(lambda x:'A0' if (x == 'Unassessable') else x, grades))

In [22]:
def grades_to_num(grades):
    num_grades = []
    for x in grades:
        match x:
            case 'A0':
                num_grades.append(0)
            case 'A1':
                num_grades.append(1)
            case 'A2':
                num_grades.append(2)
            case 'B1':
                num_grades.append(3)
            case 'B2':
                num_grades.append(4)
            case 'C1':
                num_grades.append(5)
            case 'C2':
                num_grades.append(6)
    return num_grades
num_grades = grades_to_num(grades)

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, num_grades, test_size = 0.20, stratify = num_grades)

In [24]:
## given a dataset X and grades y, return a dataset of pair-wise differences and labels (+,-) 
def to_pairs(X, y):
    paired_X = list()
    paired_y = list()
    for i in range(len(X)):
        for k in range(i+1, len(X), 1):
                paired_X.append(np.subtract(X[i], X[k]))
                paired_y.append(y[i] > y[k])
    return paired_X, paired_y

In [25]:
X_train_diff, y_train_diff = to_pairs(X_train, y_train)

In [26]:
X_test_diff, y_test_diff = to_pairs(X_test, y_test)

In [27]:
from sklearn.model_selection import GridSearchCV
# param_grid = {'C': [0.1,1, 10, 100], 'kernel' : ['linear']}

# grid = GridSearchCV(SVC(),param_grid,refit=True,verbose=2)
# grid.fit(X_train_diff,y_train_diff)

svclassifier = SVC(kernel='linear')
svclassifier.fit(X_train_diff, y_train_diff)

y_pred = svclassifier.predict(X_test_diff)

print(confusion_matrix(y_test_diff,y_pred))
print(classification_report(y_test_diff,y_pred))



[[483  49]
 [178 193]]
              precision    recall  f1-score   support

       False       0.73      0.91      0.81       532
        True       0.80      0.52      0.63       371

    accuracy                           0.75       903
   macro avg       0.76      0.71      0.72       903
weighted avg       0.76      0.75      0.74       903



In [28]:
from sklearn.linear_model import LinearRegression

svc_fitted_X_train = svclassifier.coef_ @ np.transpose(X_train)
svc_fitted_X_train = svc_fitted_X_train.reshape(-1,1) 
svc_fitted_X_test = svclassifier.coef_ @ np.transpose(X_test)
svc_fitted_X_test = svc_fitted_X_test.reshape(-1,1) 

lm= LinearRegression(fit_intercept=True).fit(svc_fitted_X_train, y_train)
print(lm.predict(svc_fitted_X_train))
print(y_train)

[5.18665215 4.04730381 4.04858788 4.04648785 4.0466915  4.05201913
 2.91404336 2.91371026 4.04546076 2.91232702 4.04618315 4.05231736
 2.91237224 4.0492652  2.91219017 2.91130448 2.91166974 4.05266667
 1.78179111 0.83264642 4.04551925 2.91146243 2.90842551 5.18419844
 0.83259025 4.04969945 2.90763601 2.91249213 5.18875025 2.90890169
 2.91044502 2.90945366 2.91459067 2.91351035 2.90970602 2.91301125
 5.18539528 4.05312152 4.04801162 4.04750743 2.91035789 2.91141282
 1.7798873  5.19113363 4.05108492 5.18478311 2.91237724 2.91116106
 4.04664531 4.04775854 4.04821061 2.91166476 4.05020035 2.91022924
 2.90979001 2.90704884 4.04584475 4.04741216 2.91307265 4.05287043
 4.04749495 2.91164641 2.90547357 4.05013206 2.9150042  4.04760643
 2.9121568  2.9110789  4.0471286  4.04568905 2.91125736 4.04552821
 2.91219958 2.91268866 4.05007154 5.18229021 2.91354098 2.90739792
 2.9079553  0.82344613 2.90589888 2.9100898  2.91311098 4.05164965
 4.0483985  2.90763309 2.91198123 4.04605917 2.91409127 5.1864

In [29]:
print(np.concatenate((np.array(lm.predict(svc_fitted_X_test)).reshape(-1,1), np.array(y_test).reshape(-1,1)), axis = 1))

[[2.56405993 3.        ]
 [3.92196842 5.        ]
 [3.95858607 4.        ]
 [4.18418967 5.        ]
 [3.71563022 4.        ]
 [4.47749463 4.        ]
 [2.30277808 3.        ]
 [3.36015901 3.        ]
 [3.13563513 3.        ]
 [3.53387512 4.        ]
 [3.82528595 4.        ]
 [2.64501417 2.        ]
 [3.69560611 4.        ]
 [2.62662155 3.        ]
 [2.57996462 2.        ]
 [4.81864589 4.        ]
 [2.96885315 4.        ]
 [3.126192   4.        ]
 [2.72698879 4.        ]
 [3.07727581 3.        ]
 [3.36371476 3.        ]
 [3.22932604 3.        ]
 [3.12771925 3.        ]
 [2.79229051 3.        ]
 [3.37710568 4.        ]
 [2.60633526 3.        ]
 [3.04484355 3.        ]
 [3.58762539 4.        ]
 [3.86327705 4.        ]
 [3.41133849 4.        ]
 [2.98200527 3.        ]
 [3.13494847 3.        ]
 [3.42902048 3.        ]
 [3.53590606 3.        ]
 [3.62238079 4.        ]
 [3.58887415 3.        ]
 [3.54300519 3.        ]
 [2.18900118 0.        ]
 [3.97448157 4.        ]
 [3.82367153 3.        ]


In [30]:
predicted_grades = np.floor(np.array(lm.predict(svc_fitted_X_test)).reshape(-1,1) + 0.5)
print(np.concatenate((predicted_grades, np.array(y_test).reshape(-1,1)), axis = 1))

[[3. 3.]
 [4. 5.]
 [4. 4.]
 [4. 5.]
 [4. 4.]
 [4. 4.]
 [2. 3.]
 [3. 3.]
 [3. 3.]
 [4. 4.]
 [4. 4.]
 [3. 2.]
 [4. 4.]
 [3. 3.]
 [3. 2.]
 [5. 4.]
 [3. 4.]
 [3. 4.]
 [3. 4.]
 [3. 3.]
 [3. 3.]
 [3. 3.]
 [3. 3.]
 [3. 3.]
 [3. 4.]
 [3. 3.]
 [3. 3.]
 [4. 4.]
 [4. 4.]
 [3. 4.]
 [3. 3.]
 [3. 3.]
 [3. 3.]
 [4. 3.]
 [4. 4.]
 [4. 3.]
 [4. 3.]
 [2. 0.]
 [4. 4.]
 [4. 3.]
 [3. 3.]
 [3. 3.]
 [3. 3.]]


In [31]:
np.corrcoef(np.floor(np.array(lm.predict(svc_fitted_X_test)) + 0.5), y_test)

array([[1.        , 0.61396877],
       [0.61396877, 1.        ]])

In [32]:
cm = confusion_matrix(y_test, np.floor(np.array(lm.predict(svc_fitted_X_test)) + 0.5))
print(cm)

[[ 0  1  0  0  0]
 [ 0  0  2  0  0]
 [ 0  1 17  4  0]
 [ 0  0  5 10  1]
 [ 0  0  0  2  0]]


In [33]:
print(classification_report(y_test,np.floor(np.array(lm.predict(svc_fitted_X_test)) + 0.5)))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           2       0.00      0.00      0.00         2
           3       0.71      0.77      0.74        22
           4       0.62      0.62      0.62        16
           5       0.00      0.00      0.00         2

    accuracy                           0.63        43
   macro avg       0.27      0.28      0.27        43
weighted avg       0.59      0.63      0.61        43



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Linear Classifier (SVC)

In [34]:
svm_model_linear = SVC(kernel = 'linear', C = 1).fit(svc_fitted_X_train, y_train) 
svm_predictions = svm_model_linear.predict(svc_fitted_X_test)

# model accuracy for X_test   
accuracy = svm_model_linear.score(svc_fitted_X_test, y_test) 
print('accuracy= {}'.format(accuracy))
# creating a confusion matrix 
cm = confusion_matrix(y_test, svm_predictions)
print(cm)

accuracy= 0.627906976744186
[[ 0  1  0  0  0]
 [ 0  0  2  0  0]
 [ 0  1 17  4  0]
 [ 0  0  5 10  1]
 [ 0  0  0  2  0]]


In [36]:
print(classification_report(y_test,svm_predictions))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           2       0.00      0.00      0.00         2
           3       0.71      0.77      0.74        22
           4       0.62      0.62      0.62        16
           5       0.00      0.00      0.00         2

    accuracy                           0.63        43
   macro avg       0.27      0.28      0.27        43
weighted avg       0.59      0.63      0.61        43



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
