In [14]:
# Load the two tab-separated inputs: the augmented essays and the raw essays.
import pandas as pd

augmented_df = pd.read_csv('augmented_essays.tsv', sep='\t', encoding='utf_8')

zaebuc_df = pd.read_csv('raw_essays.tsv', sep='\t', encoding='utf_8')

In [15]:
# Pull the two columns used from the augmented table:
# the essay text ('Raw') and its label ('to_grade').
essays = augmented_df['Raw']
grades = augmented_df['to_grade']

In [26]:
# Flag every row of the augmented table with augmented = 1.
augmented_df['augmented'] = 1

In [54]:
# Inspect zaebuc_df.
# NOTE(review): the saved output already shows the 'augmented' column, which is
# only added by the cell that follows — the cells were executed out of order.
zaebuc_df

Unnamed: 0,Document,Raw,grade,augmented
0,AR-030-268469,وسائل التواصل الاجتماعي لها اضرار و فوائد كثير...,B1,0
1,AR-030-386369,تعد وسائل التواصل الاجتماعي من اكبر المؤثرات ع...,B2,0
2,AR-030-81027,قام انتشارالوساءل للتواصل الاجتماعية بشكل كبير...,A2,0
3,AR-030-81757,وسائل التواصل الاجتماعي لقد تطورت وسائل المعرف...,B2,0
4,AR-030-83625,من اشهر وساءل الاتصال بالآخرين هي الاجتماعية،,Unassessable,0
...,...,...,...,...
209,AR-130-99351,ظهور الأجهزة الإلكترونية أدى إلى ظهور وسائل ال...,B2,0
210,AR-130-99438,وسائل التواصل الاجتماعي منذ انتشار وسائل التوا...,B2,0
211,AR-130-99442,وسائل التواصل الإجتماعي .إنّ التواصل الإجتماعي...,B2,0
212,AR-130-99590,التسامح أمر مهم جداً يجب على الفرد اخذه بجدية،...,B1,0


In [28]:
# Flag every row of the raw-essay table with augmented = 0.
zaebuc_df['augmented'] = 0

In [16]:
# Drop the first and last character of every augmented essay (the original note
# says these are wrapping ' characters — NOTE(review): not verified here).
# Reading from augmented_df['Raw'] instead of from `essays` keeps this cell
# idempotent: re-running it no longer strips two more characters each time.
essays = augmented_df['Raw'].str[1:-1]


In [17]:
# Append the essays from zaebuc_df['Raw'] after the augmented ones and renumber
# the index 0..n-1.
# pd.concat replaces Series.append, which was removed in pandas 2.0.
essays = pd.concat([essays, zaebuc_df['Raw']], ignore_index=True)


In [11]:
# Inspect the grade labels.
grades

0      A1
1      A2
2      C1
3      C2
4      C1
       ..
619    B2
620    B2
621    B2
622    B1
623    B1
Length: 624, dtype: object

In [18]:
# Append the grades from zaebuc_df['grade'] after the augmented ones and renumber
# the index 0..n-1 so it lines up with `essays`.
# pd.concat replaces Series.append, which was removed in pandas 2.0.
grades = pd.concat([grades, zaebuc_df['grade']], ignore_index=True)

In [19]:
from camel_tools.disambig.mle import MLEDisambiguator
from camel_tools.tokenizers.morphological import MorphologicalTokenizer
from camel_tools.tokenizers.word import simple_word_tokenize


# Pretrained MLE disambiguator for the 'calima-msa-r13' database.
mle_msa = MLEDisambiguator.pretrained('calima-msa-r13')

# Morphological tokenizer driven by that disambiguator ('atbtok' scheme, split output).
# NOTE(review): not referenced by any other cell visible in this notebook.
msa_d3_tokenizer = MorphologicalTokenizer(
    disambiguator=mle_msa,
    scheme='atbtok',
    split=True,
)

In [31]:
# Spot-check the first essay.
essays[0]

'كان هناك الكثير من التحسينات من الناس الذين جاءوا إلينا من أماكن مختلفة. هم ساعدوا الامارات. أعطونا كل ما نحتاجه. كل ما نريده هو مكان آمن و بلا مشاكل. الامارات أعطتنا مكان آمن أولا. هذا هو السبب الناس يأتون من أماكن بعيدة إلى هنا، بنوا لنا مطاعم جميلة و مدارس و جامعات عالية، أيضا بنوا لنا شوارع و طرق جيدة و قوانين للجميع في الامارات. الأفكار الجميلة جاءت من أماكن مختلفة و الامارات بحثت كثيرا و ذهبت إلى أماكن أخرى لفهم العالم و الأفكار و كيفية بنائها بشكل جيد، هذا يأخذ الكثير من الوقت و العمل، أنا فخورة جدا بأن أكون من الامارات. '

In [21]:
# Split every essay into word tokens; one token list per essay.
simple_tokenized_essays = [simple_word_tokenize(text) for text in essays]

In [22]:
from camel_tools.disambig.mle import MLEDisambiguator
from camel_tools.tagger.default import DefaultTagger

# Two taggers share one pretrained disambiguator: one emits the 'pos' feature,
# the other the 'lex' feature, for each token.
mled = MLEDisambiguator.pretrained()
pos_tagger = DefaultTagger(mled, 'pos')
lex_tagger = DefaultTagger(mled, 'lex')

# Tag each tokenized essay with both taggers (POS first, then lex, per essay).
tag_pairs = [
    (pos_tagger.tag(tokens), lex_tagger.tag(tokens))
    for tokens in simple_tokenized_essays
]
pos_tagged_sentences = [pos_tags for pos_tags, _ in tag_pairs]
lex_tagged_sentences = [lex_tags for _, lex_tags in tag_pairs]


In [32]:

# One [essay index, POS tag, lex tag] triple per token, in essay order.
combined_words = [
    [essay_id, pos_tag, lex_tag]
    for essay_id, (pos_tags, lex_tags) in enumerate(
        zip(pos_tagged_sentences, lex_tagged_sentences)
    )
    for pos_tag, lex_tag in zip(pos_tags, lex_tags)
]

In [33]:
# Token-level table: essay id plus the POS and lex tag of each token.
token_columns = ['id', 'pos', 'lex']
words_df = pd.DataFrame(combined_words, columns=token_columns)

In [34]:
# Concatenate the per-essay token lists into one flat list and store it as the
# surface form of each row (relies on it having exactly one entry per row of words_df).
flat_simple_tokenized_essays = []
for essay_tokens in simple_tokenized_essays:
    flat_simple_tokenized_essays.extend(essay_tokens)
words_df['word'] = flat_simple_tokenized_essays

In [35]:
# Load the SAMER readability lexicon (tab-separated).
samer_df = pd.read_csv(
    './samer-readability-lexicon/SAMER-Readability-Lexicon.tsv',
    sep='\t',
    encoding='utf_8',
)

In [36]:
def get_levels(lemmas_pos, lexicon=None):
    """Look up the readability level of each 'lemma#pos' key.

    Args:
        lemmas_pos: iterable of 'lemma#pos' keys (e.g. a list or a Series).
        lexicon: DataFrame with the columns 'lemma#pos' and
            'readability (rounded average)'. Defaults to the notebook-level
            ``samer_df``.

    Returns:
        A list with one entry per input key, in input order: the value of
        'readability (rounded average)' from the first lexicon row whose
        'lemma#pos' equals the key, or 0 when no row matches.
    """
    if lexicon is None:
        lexicon = samer_df

    # Build the key -> level mapping once per call instead of scanning the whole
    # lexicon for every lemma. Rows without a key can never match, and for
    # duplicated keys only the first row is kept (same row the scan returned).
    keyed_rows = lexicon.dropna(subset=['lemma#pos']).drop_duplicates(
        subset='lemma#pos', keep='first'
    )
    level_by_key = dict(
        zip(keyed_rows['lemma#pos'], keyed_rows['readability (rounded average)'])
    )

    return [level_by_key.get(lemma, 0) for lemma in lemmas_pos]

In [37]:
# Build the lexicon key 'lemma#pos' for every token, then look up its readability level.
lemma_pos_keys = words_df['lex'] + '#' + words_df['pos']
words_df['lemma#pos'] = lemma_pos_keys
words_df['readability'] = get_levels(words_df['lemma#pos'])

In [38]:
# Preview the first 100 token rows.
words_df[:100]

Unnamed: 0,id,pos,lex,word,lemma#pos,readability
0,0,verb,كان,كان,كان#verb,1
1,0,adv,هُناكَ,هناك,هُناكَ#adv,1
2,0,noun,كَثِير,الكثير,كَثِير#noun,0
3,0,prep,مِن,من,مِن#prep,1
4,0,noun,تَحْسِين,التحسينات,تَحْسِين#noun,3
...,...,...,...,...,...,...
95,0,noun,شَكْل,بشكل,شَكْل#noun,1
96,0,adj,جَيِّد,جيد,جَيِّد#adj,1
97,0,punc,،,،,،#punc,0
98,0,pron_dem,هٰذا,هذا,هٰذا#pron_dem,1


In [39]:
# Empty frame that will hold the per-essay features.
documents_df = pd.DataFrame()

In [40]:
# Count, for every essay id, how many tokens fall on each readability level.
# A single crosstab replaces the row-by-row DataFrame.append loop: append was
# removed in pandas 2.0, was quadratic, and duplicated rows when the cell was
# re-run. Levels that never occur in an essay are counted as 0 instead of NaN.
level_counts = pd.crosstab(words_df['id'], words_df['readability'])

# The columns are put in the fixed order 1, 0, 3, 2, 4, 5 — the order in which
# the column-naming cell further down lists the readability columns — so the
# result no longer depends on which level happens to be seen first.
# rename_axis drops the 'id' / 'readability' axis names so the frame (and the
# exported file header) keeps unnamed axes, indexed by essay id.
documents_df = (
    level_counts
    .reindex(columns=[1, 0, 3, 2, 4, 5], fill_value=0)
    .rename_axis(index=None, columns=None)
)

In [41]:
# Per-essay part-of-speech features: for each tag below, the share of the
# essay's tokens carrying exactly that tag, plus the essay's token count and
# its grade. Columns are added in the same order as before:
# the seven *_ratio columns, then word_count, then grade.
ratio_tags = ['noun', 'verb', 'adj', 'adv', 'prep', 'conj', 'pron']

# One row per essay id, one column per POS tag, cell = number of tokens.
# Computed in one pass instead of filtering words_df once per essay.
tag_counts = pd.crosstab(words_df['id'], words_df['pos'])
word_counts = tag_counts.sum(axis=1)

# Tags that never occur count as 0; essays without any token row get ratio 0.0.
# The ratios are created as floats directly, rather than writing floats cell by
# cell into int columns (an incompatible-dtype assignment pandas has deprecated).
tag_ratios = (
    tag_counts
    .reindex(columns=ratio_tags, fill_value=0)
    .div(word_counts, axis=0)
    .reindex(documents_df.index, fill_value=0.0)
)

for tag in ratio_tags:
    documents_df[f'{tag}_ratio'] = tag_ratios[tag]
documents_df['word_count'] = word_counts.reindex(documents_df.index, fill_value=0)
documents_df['grade'] = grades

In [42]:
# Name each readability-count column after its level (1 -> 'readability_1', ...).
# Renaming by label instead of assigning a positional list keeps every count
# under the right name whatever order the level columns were created in.
# Columns that already have string names are left untouched, so re-running
# this cell is harmless.
documents_df = documents_df.rename(
    columns=lambda label: label if isinstance(label, str) else f'readability_{int(label)}'
)

# Fail loudly if a column the following cells rely on is absent.
expected_columns = ['readability_1', 'readability_0', 'readability_3', 'readability_2', 'readability_4', 'readability_5', 'noun_ratio', 'verb_ratio', 'adj_ratio', 'adv_ratio', 'prep_ratio', 'conj_ratio', 'pron_ratio', 'word_count', 'grade']
missing_columns = [name for name in expected_columns if name not in documents_df.columns]
assert not missing_columns, f'documents_df is missing columns: {missing_columns}'

In [43]:
# Turn each readability count into a share of the essay's tokens.
# NOTE(review): re-running this cell divides by word_count again.
for level_column in (
    'readability_1',
    'readability_0',
    'readability_3',
    'readability_2',
    'readability_4',
    'readability_5',
):
    documents_df[level_column] = documents_df[level_column] / documents_df['word_count']

Error Analyser

In [44]:
def count_different_letters(word1, word2):
    """Return a position-wise letter difference between two words.

    The result is the number of positions, within the shorter word's length,
    where the two words hold different characters, plus the difference in
    length between the two words.
    """
    length_gap = abs(len(word1) - len(word2))
    # zip stops at the shorter word, so only the shared positions are compared.
    mismatches = sum(1 for left, right in zip(word1, word2) if left != right)
    return length_gap + mismatches

In [45]:
# Inspect the token table.
words_df

Unnamed: 0,id,pos,lex,word,lemma#pos,readability
0,0,verb,كان,كان,كان#verb,1
1,0,adv,هُناكَ,هناك,هُناكَ#adv,1
2,0,noun,كَثِير,الكثير,كَثِير#noun,0
3,0,prep,مِن,من,مِن#prep,1
4,0,noun,تَحْسِين,التحسينات,تَحْسِين#noun,3
...,...,...,...,...,...,...
93469,623,conj,وَ,و,وَ#conj,1
93470,623,noun,كَيْفِيَّة,كيفية,كَيْفِيَّة#noun,4
93471,623,noun,ٱِسْتِعْمال,إستعمالهم,ٱِسْتِعْمال#noun,1
93472,623,prep,لِ,لها,لِ#prep,1


In [46]:
from camel_tools.tokenizers.word import simple_word_tokenize
from camel_tools.disambig.mle import MLEDisambiguator

from camel_tools.utils.dediac import dediac_ar

mle = MLEDisambiguator.pretrained()


def _token_error_score(word):
    """Score one token against the first analysis the disambiguator returns.

    Non-string values score 0. For strings, the token is disambiguated on its
    own and both the token and the analysis' 'diac' form are dediacritized:
    a token that is exactly 'و' scores 2, a token equal to the analysis form
    scores 0, and anything else scores count_different_letters(token, form).
    """
    if not isinstance(word, str):
        return 0

    disambiguated = mle.disambiguate([str(word)])
    plain_word = dediac_ar(word)
    # NOTE(review): analyses[0] assumes the disambiguator returned at least one
    # analysis for the token — confirm; an empty list would raise IndexError.
    plain_analysis = dediac_ar(disambiguated[0].analyses[0].analysis['diac'])

    if plain_word == 'و':
        return 2
    if plain_word == plain_analysis:
        return 0
    return count_different_letters(plain_word, plain_analysis)


# One score per row of words_df, in row order.
errors = [_token_error_score(token) for token in words_df['word']]

In [47]:
# Attach the per-token error scores (one entry of `errors` per row of words_df).
words_df['error'] = errors

In [49]:
# Save the token-level table.
# NOTE(review): the file is tab-separated (sep='\t') despite its .csv extension.
words_df.to_csv('words_df.csv', encoding='utf_8',sep='\t')

Error rate per document

In [50]:
# NOTE(review): errors_df is created empty and not used by any cell visible here.
errors_df = pd.DataFrame()

# Error rate per essay: summed token error scores divided by the token count.
# The sum is indexed by essay id and aligned with documents_df's index.
errors_per_document = words_df.groupby('id')['error'].sum()
documents_df['error_rate'] = errors_per_document / documents_df['word_count']

In [51]:
# Stack the 'augmented' flags of augmented_df and zaebuc_df (in that order),
# renumber them 0..n-1 and store them on documents_df.
# pd.concat replaces Series.append, which was removed in pandas 2.0.
documents_df['augmented'] = pd.concat(
    [augmented_df['augmented'], zaebuc_df['augmented']], ignore_index=True
)

In [55]:
# Stack the 'Document' identifiers of augmented_df and zaebuc_df (in that order),
# renumber them 0..n-1 and store them on documents_df.
# pd.concat replaces Series.append, which was removed in pandas 2.0.
documents_df['Document'] = pd.concat(
    [augmented_df['Document'], zaebuc_df['Document']], ignore_index=True
)

In [57]:
# Save the per-essay feature table.
# NOTE(review): the file is tab-separated (sep='\t') despite its .csv extension.
documents_df.to_csv('documents_features.csv', encoding='utf_8',sep='\t')