POS, Readability

In [1]:
import pandas as pd

# Load the alignment table shipped under ZAEBUC-v1.0 (tab-separated, UTF-8).
df = pd.read_csv('ZAEBUC-v1.0/AR-all.alignment-FINAL.tsv', encoding='utf_8',sep='\t')

# The 'Document' column of the table: one document identifier per row.
documents = df['Document']

In [2]:
# Load the SAMER readability lexicon (tab-separated, UTF-8).
# Its 'lemma#pos' and 'readability (rounded average)' columns are read by get_levels.
samer_df = pd.read_csv('./samer-readability-lexicon/SAMER-Readability-Lexicon.tsv', encoding='utf_8',sep='\t')

In [3]:
# Map every document identifier to the list of its 'Raw' words, in row order.
# A single groupby pass replaces one full-table scan per document, and sort=False keeps the
# documents in order of first appearance, so the key order (and everything derived from it)
# is identical on every run. Rows whose 'Document' is missing are skipped by groupby.
words_for_documents = {
    document: raw_words.tolist()
    for document, raw_words in df.groupby('Document', sort=False)['Raw']
}

In [5]:
# Split each document's flat word list into sentences.
# A sentence is closed right after a word ending in '.' or '،', or at the last word of the
# document, so no partial sentence carries over into the next document.
sentences_per_document = {}
sentence = []
document_ids = []

for document_num, document in words_for_documents.items():
    doc = []
    last_index = len(document) - 1
    for i, word in enumerate(document):
        # NaN is the only value that is not equal to itself: this detects missing words.
        is_missing = word != word
        # Missing words are kept as the placeholder 'n'.
        sentence.append('n' if is_missing else word)
        ends_sentence = (not is_missing) and word[-1] in ('.', '،')
        if ends_sentence or i == last_index:
            doc.append(sentence)
            sentence = []
    sentences_per_document[document_num] = doc

HERE
HERE
HERE


In [6]:
from camel_tools.disambig.mle import MLEDisambiguator
from camel_tools.tokenizers.morphological import MorphologicalTokenizer

# Pretrained MLE disambiguator for the 'calima-msa-r13' model.
mle_msa = MLEDisambiguator.pretrained('calima-msa-r13')

# Morphological tokenizer driven by that disambiguator.
# NOTE(review): the variable name says "d3" but the scheme requested is 'atbtok' — confirm which one is intended.
msa_d3_tokenizer = MorphologicalTokenizer(disambiguator=mle_msa, scheme='atbtok')

DatasetInfo(component='MorphologyDB', name='calima-msa-r13', description='Database for analyzing Modern Standard Arabic', license='GPL v2', version='0.4.0', path=PosixPath('/Users/noor/.camel_tools/data/morphology_db/calima-msa-r13'))


In [7]:
# Run the morphological tokenizer over every sentence, keeping the per-document grouping.
tokenized_sentences_per_document = {
    document_num: [msa_d3_tokenizer.tokenize(sentence) for sentence in sentences]
    for document_num, sentences in sentences_per_document.items()
}

In [8]:
# Flatten the tokenizer output into plain tokens, per document and per sentence:
#   * a token containing '+' has its '_' characters removed and is split on '+',
#   * a trailing '.' is detached and emitted as its own token.
# NOTE(review): a token that is a literal '+' (or whose last piece is just '.') yields empty
# strings here — confirm whether those should be filtered out.
fixed_tokenized_sentences_per_document = {}
for document_num, sentences in tokenized_sentences_per_document.items():
    fixed_tokenized_sentences = []
    for sentence in sentences:
        new_sentence = []
        for token in sentence:
            has_clitics = '+' in token
            if has_clitics:
                pieces = token.replace('_', '').split('+')
                # Everything before the last piece is emitted unchanged.
                new_sentence.extend(pieces[:-1])
                stem = pieces[-1]
            else:
                stem = token

            if not has_clitics and stem == '.':
                # A bare period stays a single token.
                new_sentence.append('.')
            elif token[-1] == '.':
                # Detach the sentence-final period from the word it is glued to.
                new_sentence.extend([stem[:-1], '.'])
            else:
                new_sentence.append(stem)
        fixed_tokenized_sentences.append(new_sentence)

    fixed_tokenized_sentences_per_document[document_num] = fixed_tokenized_sentences

In [9]:
# Start with empty lists; both names are rebound per document in the tagging loop below.
pos_tagged_sentences = []
lex_tagged_sentences = []
from camel_tools.disambig.mle import MLEDisambiguator
from camel_tools.tagger.default import DefaultTagger

# No model name is passed, so the library's default pretrained model is loaded.
mled = MLEDisambiguator.pretrained()
# Both taggers share the same disambiguator; they differ only in the feature requested.
pos_tagger = DefaultTagger(mled, 'pos')
lex_tagger = DefaultTagger(mled, 'lex')

# Tag every cleaned sentence twice — once for 'pos', once for 'lex' — and store the results
# per document, in the same sentence order as fixed_tokenized_sentences_per_document.
pos_tagged_sentences_per_document = {}
lex_tagged_sentences_per_document = {}

for document_num, sentences in fixed_tokenized_sentences_per_document.items():
    pos_tagged_sentences = []
    lex_tagged_sentences = []
    for tokens in sentences:
        pos_tagged_sentences.append(pos_tagger.tag(tokens))
        lex_tagged_sentences.append(lex_tagger.tag(tokens))
    pos_tagged_sentences_per_document[document_num] = pos_tagged_sentences
    lex_tagged_sentences_per_document[document_num] = lex_tagged_sentences

DatasetInfo(component='MorphologyDB', name='calima-msa-r13', description='Database for analyzing Modern Standard Arabic', license='GPL v2', version='0.4.0', path=PosixPath('/Users/noor/.camel_tools/data/morphology_db/calima-msa-r13'))


In [10]:
# One record per sentence: its document, its tokens, and the parallel POS / LEX tag lists.
sentences_df = []

for document_num, sentences in pos_tagged_sentences_per_document.items():
    lex_sentences = lex_tagged_sentences_per_document[document_num]
    token_sentences = fixed_tokenized_sentences_per_document[document_num]
    for i, pos_tags in enumerate(sentences):
        sentences_df.append({
            'Document': document_num,
            'Sentence': token_sentences[i],
            'POS_of_sentence': pos_tags,
            'LEX_of_Sentence': lex_sentences[i],
        })
        

In [11]:
# One record per token: its document, the token itself, its POS tag and its LEX tag.
# Note that the 'Sentence' key holds a single token here, not a whole sentence.
words_df = []
for document_num, sentences in pos_tagged_sentences_per_document.items():
    lex_sentences = lex_tagged_sentences_per_document[document_num]
    token_sentences = fixed_tokenized_sentences_per_document[document_num]
    for i, pos_tags in enumerate(sentences):
        tokens = token_sentences[i]
        lemmas = lex_sentences[i]
        for j, pos in enumerate(pos_tags):
            words_df.append({
                'Document': document_num,
                'Sentence': tokens[j],
                'POS': pos,
                'LEX': lemmas[j],
            })

In [12]:
# Rebind words_df from the list of per-token records to a DataFrame (one row per token).
words_df = pd.DataFrame(words_df)

In [14]:
# Rebind sentences_df from the list of per-sentence records to a DataFrame (one row per sentence).
sentences_df = pd.DataFrame(sentences_df)

In [15]:
# Export the per-sentence table. The file is tab-separated (sep='\t') despite its .csv name.
sentences_df.to_csv('sentences_features.csv', encoding='utf_8',sep='\t')

In [16]:
def get_levels(lemmas_pos):
    """Look up the readability level of each ``lemma#pos`` key in ``samer_df``.

    Args:
        lemmas_pos: iterable of ``lemma#pos`` keys (e.g. a list or a pandas Series).

    Returns:
        A list with one entry per input key, in input order. Each entry is the
        'readability (rounded average)' value of the first ``samer_df`` row whose
        'lemma#pos' equals the key, or 0 when no row matches (missing/NaN keys
        never match and therefore also give 0).
    """
    # Build the key -> level table once instead of scanning the whole lexicon for every word.
    # NaN keys are dropped because NaN never equals anything, and keep='first' makes a
    # duplicated key resolve to its first row.
    lexicon = samer_df.dropna(subset=['lemma#pos']).drop_duplicates(subset='lemma#pos', keep='first')
    level_by_key = dict(zip(
        lexicon['lemma#pos'].tolist(),
        lexicon['readability (rounded average)'].to_numpy(),
    ))
    return [level_by_key.get(lemma, 0) for lemma in lemmas_pos]

In [17]:
# Build the lexicon lookup key "<LEX>#<POS>" for every token row.
words_df['lemma#pos'] = words_df['LEX'] + '#' + words_df['POS']
# Readability level per token; get_levels yields 0 for keys that are not found in samer_df.
words_df['readability'] = get_levels(words_df['lemma#pos'])

# Export the per-token table. The file is tab-separated (sep='\t') despite its .csv name.
words_df.to_csv('words_features.csv', encoding='utf_8',sep='\t')

Get the ratio of each POS tag per document

In [35]:
# For each document, compute the ratio of tokens tagged with a given POS.
def get_ratio_of_pos(pos):
    """Return ``{document: ratio}`` for the POS tag ``pos``.

    For every document in ``pos_tagged_sentences_per_document`` the number of
    tags equal to ``pos`` is divided by the number of entries that document has
    in ``words_for_documents``.

    NOTE(review): the numerator counts tags of the cleaned, '+'-split tokens
    while the denominator counts raw words, so the two are not in the same
    unit — confirm this is the intended definition.

    Raises:
        ZeroDivisionError: if a document has no entries in ``words_for_documents``.
    """
    ratios = {}
    for document_num, tagged_sentences in pos_tagged_sentences_per_document.items():
        matches = sum(tag == pos for tags in tagged_sentences for tag in tags)
        ratios[document_num] = matches / len(words_for_documents[document_num])
    return ratios

In [60]:
# Load the per-document error rates (tab-separated despite the .csv name).
error_rates_df = pd.read_csv('error_rates_per_document.csv', encoding='utf_8',sep='\t')

# Per-document ratio dictionaries, one per POS tag of interest.
nouns_ratio = get_ratio_of_pos('noun')
verbs_ratio = get_ratio_of_pos('verb')
adj_ratio = get_ratio_of_pos('adj')
prep_ratio = get_ratio_of_pos('prep')
conj_ratio = get_ratio_of_pos('conj')
adv_ratio = get_ratio_of_pos('adv')

# Attach each ratio as its own column, matched on the 'Document' value of every row.
# The adjective ratio goes into 'ratio_of_adj'; writing it to 'ratio_of_verbs' would
# silently overwrite the verb ratio and leave no adjective column at all.
ratio_columns = {
    'ratio_of_nouns': nouns_ratio,
    'ratio_of_verbs': verbs_ratio,
    'ratio_of_adj': adj_ratio,
    'ratio_of_prep': prep_ratio,
    'ratio_of_conj': conj_ratio,
    'ratio_of_adv': adv_ratio,
}
for column_name, ratio_by_document in ratio_columns.items():
    error_rates_df[column_name] = error_rates_df['Document'].map(ratio_by_document)

In [58]:
# Export the error rates together with the POS-ratio columns. Tab-separated despite the .csv name.
error_rates_df.to_csv('rates_per_document.csv', encoding='utf_8',sep='\t')