# Parser

In [14]:
from parsing import ParserUDpipe
from tqdm import tqdm_notebook as tqdm

In [2]:
parser = ParserUDpipe("They go.")
df = parser.conllu2df()

In [3]:
df.head(30)

Unnamed: 0,Id,Form,Lemma,UPosTag,XPosTag,Feats,Head,DepRel,Deps,Misc
0,1,They,they,PRON,PE,Number=Plur|Person=3|PronType=Prs,2,nsubj,_,_
1,2,go,go,VERB,V,Mood=Ind|Number=Plur|Tense=Pres|VerbForm=Fin,0,root,_,SpaceAfter=No
2,3,.,.,PUNCT,FS,_,2,punct,_,SpaceAfter=No


In [4]:
# проверить на punct
# сначала спеллчеккер
# разобраться с D
# https://github.com/kristopherkyle/lexical_diversity отсюда взять критерии
# нужно бы чтобы токены и лексемы были все в нижнем регистре

# Lexical Complexity

In [5]:
import collections
import json
import math

open_class = ["NOUN", "VERB", "ADV", "ADJ"]
with open('lists.json') as data_file:
    lists = json.load(data_file)
fivetfrequentCOCA = lists['5000frequentCOCA']
frequentverbsCOCAfromfivet = lists['frequentverbsCOCAfrom5000']
uwl = lists['UWL']

class LexicalComplexity:
    """Returns values of lexical criteria."""
    
    def __init__(self, text):
        self.text = text
    
    def get_verb_lemmas(self):
        df = parser.conllu2df()
        return df[df['UPosTag'] == 'VERB']['Lemma']
    
    def get_noun_lemmas(self):
        df = parser.conllu2df()
        return df[df['UPosTag'] == 'NOUN']['Lemma']

    def get_adj_lemmas(self):
        df = parser.conllu2df()
        return df[df['UPosTag'] == 'ADJ']['Lemma']
    
    def get_adv_lemmas(self):
        df = parser.conllu2df()
        return df[df['UPosTag'] == 'ADV']['Lemma']
    
    def get_lex_lemmas(self):
        df = parser.conllu2df()
        return df[df['UPosTag'].isin(open_class)]['Lemma']
    
    def get_lemmas(self):
        df = parser.conllu2df()
        return df['Lemma']
    
    def get_forms(self):
        df = parser.conllu2df()
        return df['Form']
    
    def safe_divide(self, numerator, denominator):
        if denominator == 0 or denominator == 0.0:
            index = 0
        else: index = numerator/denominator
        return index

    def division(self, list1, list2):
        try:
            return len(list1)/len(list2)
        except:
            return 0
        
    def corrected_division(self, list1, list2):
        try:
            return len(list1)/math.sqrt(2*len(list2))
        except:
            return 0
        
    def root_division(self, list1, list2):
        try:
            return len(list1)/math.sqrt(len(list2))
        except:
            return 0
        
    def squared_division(self, list1, list2):
        try:
            return len(list1)**2/len(list2)
        except:
            return 0
        
    def log_division(self, list1, list2):
        try:
            return math.log(len(list1))/math.log(len(list2))
        except:
            return 0
        
    def uber(self, list1, list2):
        try:
            return math.log(len(list1))**2/math.log(len(set(list2))/len(list1))
        except:
            return 0

    def density(self, punct=False):
        """
        number of lexical tokens/number of tokens
        """
        lex_lemmas = self.get_lex_lemmas()
        lemmas = self.get_lemmas()
        return self.division(lex_lemmas, lemmas)
    
    def LS(self):
        """
        number of sophisticated lexical tokens/number of lexical tokens
        """
        lex_lemmas = self.get_lex_lemmas()
        soph_lex_lemmas = [i for i in lex_lemmas if i not in fivetfrequentCOCA]
        return self.division(soph_lex_lemmas, lex_lemmas)
    
    def VS(self):
        """
        number of sophisticated verb lemmas/number of verb tokens
        """
        verb_lemmas = self.get_verb_lemmas()
        soph_verbs = set([i for i in verb_lemmas if i not in frequentverbsCOCAfromfivet])
        VSI = self.division(soph_verbs, verb_lemmas)
        VSII = self.corrected_division(soph_verbs, verb_lemmas)
        VSIII = self.squared_division(soph_verbs, verb_lemmas)
        return VSI, VSII, VSIII

    def LFP(self):
        """
        Lexical Frequency Profile is the proportion of tokens:
        first - 1000 most frequent words
        second list - the second 1000
        third - University Word List (Xue & Nation 1989)
        none - list of those that are not in these lists
        """
        lemmas = self.get_lemmas()
        first = [i for i in lemmas if i in fivetfrequentCOCA[0:1000]]
        second = [i for i in lemmas if i in fivetfrequentCOCA[1000:2000]]
        third = [i for i in lemmas if i in uwl]
        first_procent = self.division(first, lemmas)
        second_procent = self.division(second, lemmas)
        third_procent = self.division(third, lemmas)
        none = 1 - (first_procent + second_procent + third_procent)
        return first_procent, second_procent , third_procent, none
    
    def NDW(self):
        """
        number of lemmas
        """
        lemmas = self.get_lemmas()
        return len(set(lemmas))
    
    def TTR(self):
        """
        number of lemmas/number of tokens
        """
        lemmas = set(self.get_lemmas())
        tokens = self.get_lemmas()
        TTR = self.division(lemmas, tokens)
        CTTR = self.corrected_division(lemmas, tokens)
        RTTR = self.root_division(lemmas, tokens)
        LogTTR = self.log_division(lemmas, tokens)
        Uber = self.uber(lemmas, tokens)
        return TTR, CTTR, RTTR, LogTTR, Uber

    def choose(self, n, k):
        """
        Calculates binomial coefficients
        """
        if 0 <= k <= n:
            ntok = 1
            ktok = 1
            for t in range(1, min(k, n - k) + 1):
                ntok *= n
                ktok *= t
                n -= 1
            return ntok // ktok
        else:
            return 0
    
    def hyper(self, successes, sample_size, population_size, freq):
        """
        Calculates hypergeometric distribution
        """
        # probability a word will occur at least once in a sample of a particular size
        try:
            prob_1 = 1.0 - (float((self.choose(freq, successes) * 
                                   self.choose((population_size - freq),
                                               (sample_size - successes)))) /
                            float(self.choose(population_size, sample_size)))
            prob_1 = prob_1 * (1/sample_size)
        except ZeroDivisionError:
            prob_1 = 0
        return prob_1
    
    def D(self):
        prob_sum = 0.0
        tokens = self.get_forms()
        num_tokens = len(tokens)
        types_list = list(set(tokens))
        frequency_dict = collections.Counter(tokens)

        for items in types_list:
            # random sample is 42 items in length
            prob = self.hyper(0, 42, num_tokens, frequency_dict[items])
            prob_sum += prob

        return prob_sum

    def LV(self):
        """
        number of lexical lemmas/number of lexical tokens
        """
        lex_lemmas = set(self.get_lex_lemmas())
        lex_tokens = self.get_lex_lemmas()
        return len(lex_lemmas)/len(lex_tokens)
    
    def VV(self):
        """
        VVI: number of verb lemmas/number of verb tokens
        VVII: number of verb lemmas/number of lexical tokens
        """
        verb_lemmas = set(self.get_verb_lemmas())
        verb_tokens = self.get_verb_lemmas()
        lex_tokens = self.get_lex_lemmas()
        VVI = self.division(verb_lemmas, verb_tokens)
        SVVI = self.squared_division(verb_lemmas, verb_tokens)
        CVVI = self.corrected_division(verb_lemmas, verb_tokens)
        VVII = self.division(verb_lemmas, lex_tokens)
        return VVI, SVVI, CVVI, VVII
        
    def NV(self):
        """
        number of noun lemmas/number of lexical tokens
        """
        noun_lemmas = set(self.get_noun_lemmas())
        lex_tokens = self.get_lex_lemmas()
        return self.division(noun_lemmas, lex_tokens)

    def AdjV(self):
        """
        number of adjective lemmas/number of lexical tokens
        """
        adj_lemmas = set(self.get_adj_lemmas())
        lex_tokens = self.get_lex_lemmas()
        return self.division(adj_lemmas, lex_tokens)
    
    def AdvV(self):
        """
        number of adverb lemmas/number of lexical tokens
        """
        adv_lemmas = set(self.get_adv_lemmas())
        lex_tokens = self.get_lex_lemmas()
        return self.division(adv_lemmas, lex_tokens)
    
    def ModV(self):
        return self.AdjV() + self.AdvV()

In [6]:
with open('result_criteria.csv', 'r') as file:
    f = file.read()
paths = ['/Users/irene/Desktop/Диплом/new_data/'+x+'.txt' for x in f.split('\n')]

In [7]:
#paths

In [8]:
import enchant
import enchant.checker
from enchant.checker.CmdLineChecker import CmdLineChecker
def check_spelling(text):
    chkr = enchant.checker.SpellChecker("en_GB")
    chkr.set_text(text)
    for err in chkr:
        sug = err.suggest()[0]
        err.replace(sug)
    c = chkr.get_text()
    return c

In [9]:
with open('/Users/irene/Desktop/Диплом/new_data/1.txt', 'r') as file:
    text = file.read()
text

'The graph contains information about money people spend on petrol. The research was done in the USA and the UK. Three classes were compared: the poorest, the richest and middle-income people.\nResults in two countries are absolutely the opposite. The UK-line gradually goes up, and reaches the peak on the point of 4 per cent. While the USA-line declines from the point of 5,3 per cent to 2,2 per cent. It means that the biggest amount of money is spent in the USA by poorest people. The same class in the UK spends only 0,5 per cent of the income. The difference in part of rich people is modest - about 1 per cent.\nOverall, people from the USA spend bigger part of their income on petrol.'

In [10]:
parser = ParserUDpipe(text)
LC = LexicalComplexity(text)
dict_lex_comp = {'density': LC.density(), 'LS': LC.LS(), 'VSI': LC.VS()[0],
                 'VSII': LC.VS()[1], 'VSIII': LC.VS()[2], 'LFP_first': LC.LFP()[0], 
                 'LFP_second': LC.LFP()[1], 'LFP_third': LC.LFP()[2], 'LFP_none': LC.LFP()[3], 
                 'NDW': LC.NDW(), 'TTR': LC.TTR()[0], 'CTTR': LC.TTR()[1], 'RTTR': LC.TTR()[2], 
                 'LogTTR': LC.TTR()[3], 'Uber': LC.TTR()[4], 'D': LC.D(), 
                 'LV': LC.LV(), 'VVI': LC.VV()[0], 'SVVI': LC.VV()[1], 'CVVI': LC.VV()[2],
                 'VVII': LC.VV()[3], 'NV': LC.NV(), 'AdjV': LC.AdjV(), 'AdvV': LC.AdvV(), 
                 'ModV': LC.ModV()}
lex_criteria = list(dict_lex_comp.keys())

In [11]:
#paths[191:]

In [12]:
#lex_criteria

In [15]:
r = ''
for path in tqdm(paths):
    string = path + ','
    with open(path, 'r') as file:
        text = file.read()
    text = check_spelling(text)
    text = text.replace('\n', ' ')
    parser = ParserUDpipe(text)
    LC = LexicalComplexity(text)
    dict_lex_comp = {'density': LC.density(), 'LS': LC.LS(), 'VSI': LC.VS()[0],
                     'VSII': LC.VS()[1], 'VSIII': LC.VS()[2], 'LFP_first': LC.LFP()[0], 
                     'LFP_second': LC.LFP()[1], 'LFP_third': LC.LFP()[2], 'LFP_none': LC.LFP()[3], 
                     'NDW': LC.NDW(), 'TTR': LC.TTR()[0], 'CTTR': LC.TTR()[1], 'RTTR': LC.TTR()[2], 
                     'LogTTR': LC.TTR()[3], 'Uber': LC.TTR()[4], 'D': LC.D(), 
                     'LV': LC.LV(), 'VVI': LC.VV()[0], 'SVVI': LC.VV()[1], 'CVVI': LC.VV()[2],
                     'VVII': LC.VV()[3], 'NV': LC.NV(), 'AdjV': LC.AdjV(), 'AdvV': LC.AdvV(), 
                     'ModV': LC.ModV()}
    for c in lex_criteria:
        string += str(round(dict_lex_comp[c], 5)) + ','
    string += '\n'
    r += string

A Jupyter Widget




In [16]:
with open('lex_criteria.csv', 'w') as file:
    file.write(r)

In [17]:
lex_criteria

['density',
 'LS',
 'VSI',
 'VSII',
 'VSIII',
 'LFP_first',
 'LFP_second',
 'LFP_third',
 'LFP_none',
 'NDW',
 'TTR',
 'CTTR',
 'RTTR',
 'LogTTR',
 'Uber',
 'D',
 'LV',
 'VVI',
 'SVVI',
 'CVVI',
 'VVII',
 'NV',
 'AdjV',
 'AdvV',
 'ModV']

In [18]:
parser = ParserUDpipe(text)
LC = LexicalComplexity(text)

In [19]:
dict_lex_comp = {'density': LC.density(), 'LS': LC.LS(), 'VSI': LC.VS()[0],
                 'VSII': LC.VS()[1], 'VSIII': LC.VS()[2], 'LFP_first': LC.LFP()[0], 
                 'LFP_second': LC.LFP()[1], 'LFP_third': LC.LFP()[2], 'LFP_none': LC.LFP()[3], 
                 'NDW': LC.NDW(), 'TTR': LC.TTR()[0], 'CTTR': LC.TTR()[1], 'RTTR': LC.TTR()[2], 
                 'LogTTR': LC.TTR()[3], 'Uber': LC.TTR()[4], 'D': LC.D(), 
                 'LV': LC.LV(), 'VVI': LC.VV()[0], 'SVVI': LC.VV()[1], 'CVVI': LC.VV()[2],
                 'VVII': LC.VV()[3], 'NV': LC.NV(), 'AdjV': LC.AdjV(), 'AdvV': LC.AdvV(), 
                 'ModV': LC.ModV()}  

In [20]:
dict_lex_comp

{'AdjV': 0.21739130434782608,
 'AdvV': 0.043478260869565216,
 'CTTR': 4.571428571428571,
 'CVVI': 2.456769074559977,
 'D': 0.8416985044788518,
 'LFP_first': 0.8163265306122449,
 'LFP_none': 0.04081632653061229,
 'LFP_second': 0.09183673469387756,
 'LFP_third': 0.05102040816326531,
 'LS': 0.043478260869565216,
 'LV': 0.8695652173913043,
 'LogTTR': 0.9070692655306586,
 'ModV': 0.2608695652173913,
 'NDW': 64,
 'NV': 0.34782608695652173,
 'RTTR': 6.464976285134148,
 'SVVI': 12.071428571428571,
 'TTR': 0.6530612244897959,
 'Uber': 0,
 'VSI': 0.0,
 'VSII': 0.0,
 'VSIII': 0.0,
 'VVI': 0.9285714285714286,
 'VVII': 0.2826086956521739,
 'density': 0.46938775510204084}

# Morphological Complexity

In [21]:
# лучше выделять суффиксы
# научиться выделять приставки
# MCI: went -> ed

In [22]:
import collections
import json
import math
import random

from nltk.stem.porter import *
porter_stemmer = PorterStemmer()

with open('suffixes.json') as data_file:
    suffixes_levels = json.load(data_file)

class MorphologicalComplexity:
    """Returns values of morphological criteria."""
    
    def __init__(self, text):
        self.text = text

    def get_forms(self):
        df = parser.conllu2df()
        return df['Form']
   
    def get_inf(self):
        df = parser.conllu2df()
        return df[df['Feats'].str.match('.*VerbForm=Inf.*')]
    
    def get_gerund(self):
        df = parser.conllu2df()
        return df[df['Feats'].str.match('.*VerbForm=Ger.*')]
    
    def get_pres_sg(self):
        df = parser.conllu2df()
        return df[df['Feats'].str.match('.*Mood=Ind.+Number=Sing.+Person=3.+Tense=Pres.+VerbForm=Fin.*')]
    
    def get_pres_pl(self):
        df = parser.conllu2df()
        return df[df['Feats'].str.match('.*Mood=Ind.+Number=Plur.+Tense=Pres.+VerbForm=Fin.*')]
    
    def get_part(self):
        df = parser.conllu2df()
        return df[df['Feats'].str.match('.*Tense=Past.+VerbForm=Part.*')]
    
    def get_past(self):
        df = parser.conllu2df()
        return df[df['Feats'].str.match('.*Mood=Ind.+Person=3.+Tense=Past.+VerbForm=Fin.*')]     
        
    def get_verb_forms(self):
        df = parser.conllu2df()
        return df[df['UPosTag'] == 'VERB']['Form']
    
    def get_verb_feats(self):
        df = parser.conllu2df()
        return df[df['Feats'].str.contains('VerbForm=')]['Feats']
    
    def get_aux(self):
        df = parser.conllu2df()
        return df[df['UPosTag'] == 'AUX']['Form']
    
    def division(self, list1, list2):
        try:
            return len(list1)/len(list2)
        except:
            return 0
        
    def one_random_list(self, l, length):
        result = []
        for i in range(length):
            random_element = random.choice(l)
            l.remove(random_element)
            result.append(random_element)
        return result, l

    def two_random_lists(self, l, length=10):
        list1, list2 = [], []
        if len(l) < length*2:
            return self.two_random_lists(l, length=length-1)
        else:
            list1, l = self.one_random_list(l, length)
            list2, l = self.one_random_list(l, length)
            return list1, list2
        
    def num_uniques(self, l):
        counter=collections.Counter(l)
        return list(counter.values()).count(1)

    def get_suffix(self, word):
        root = porter_stemmer.stem(word)
        suffix = word[len(root):]
        return suffix
    
    def get_suffixes(self):
        forms = self.get_forms()
        suffixes = [self.get_suffix(word) for word in forms]
        return list(filter(lambda s: s != "", suffixes))
        
    def derivational_suffixation(self):
        """
        number of suffixes on n's level/number of suffixes
        """
        suffixes = self.get_suffixes()
        level3_suffixes = [i for i in suffixes if i in suffixes_levels["level3"]]
        level4_suffixes = [i for i in suffixes if i in suffixes_levels["level4"]]
        level5_suffixes = [i for i in suffixes if i in suffixes_levels["level5"]]
        level6_suffixes = [i for i in suffixes if i in suffixes_levels["level6"]]
        der_suff3 = self.division(level3_suffixes, suffixes)
        der_suff4 = self.division(level4_suffixes, suffixes)
        der_suff5 = self.division(level5_suffixes, suffixes)
        der_suff6 = self.division(level6_suffixes, suffixes)
        return der_suff3, der_suff4, der_suff5, der_suff6
    
    def MCI(self):
        """
        MCI represents the average inflectional diversity for the parts of speech in the sample
        """
        verb_forms = self.get_verb_forms()
        suff_verb = [self.get_suffix(verb) for verb in verb_forms]
        list1, list2 = self.two_random_lists(suff_verb)
        diversity1=len(set(list1))
        diversity2=len(set(list2))
        mean_diversity = (diversity1+diversity2)/2
        num_uni = self.num_uniques(list1+list2)
        IUV = num_uni/2
        MCI = mean_diversity + IUV/2 - 1
        return MCI
    
    def freq_finite_forms(self):
        """
        frequency of tensed(finite) forms
        """
        verb_feats = self.get_verb_feats()
        finite_forms = [word for word in verb_feats if "VerbForm=Fin" in word]
        return self.division(finite_forms, verb_feats)
    
    def freq_aux(self):
        """
        frequency of modals(auxilaries)
        """
        verb_feats = self.get_verb_feats()
        aux = self.get_aux()
        return self.division(aux, verb_feats)

    def num_verb_forms(self):
        """
        number of different verb forms: 
        infinitives, gerunds, present singular, present plural, past participle, past simple
        """
        inf = self.get_inf()
        gerund = self.get_gerund()
        pres_sg = self.get_pres_sg()
        pres_pl = self.get_pres_pl()
        part = self.get_part()
        past = self.get_past()
        return len(inf), len(gerund), len(pres_sg), len(pres_pl), len(part), len(past)

In [23]:
parser = ParserUDpipe(text)
MC = MorphologicalComplexity(text)
dict_morph_comp = {'der_suff_level3': MC.derivational_suffixation()[0], 
                   'der_suff_level4': MC.derivational_suffixation()[1], 
                   'der_suff_level5': MC.derivational_suffixation()[2], 
                   'der_suff_level6': MC.derivational_suffixation()[3],
                   'MCI': MC.MCI(), 'freq_finite_forms': MC.freq_finite_forms(), 
                   'freq_aux': MC.freq_aux(), 'num_inf': MC.num_verb_forms()[0], 
                   'num_inf': MC.num_verb_forms()[0], 'num_gerund': MC.num_verb_forms()[1], 
                   'num_pres_sg': MC.num_verb_forms()[2], 'num_pres_pl': MC.num_verb_forms()[3], 
                   'num_part': MC.num_verb_forms()[4], 'num_past': MC.num_verb_forms()[5]} 
morph_criteria = list(dict_morph_comp.keys())

In [24]:
r = ''
for path in tqdm(paths):
    string = path + ','
    with open(path, 'r') as file:
        text = file.read()
    text = check_spelling(text)
    text = text.replace('\n', ' ')
    parser = ParserUDpipe(text)
    MC = MorphologicalComplexity(text)
    dict_morph_comp = {'der_suff_level3': MC.derivational_suffixation()[0], 
                       'der_suff_level4': MC.derivational_suffixation()[1], 
                       'der_suff_level5': MC.derivational_suffixation()[2], 
                       'der_suff_level6': MC.derivational_suffixation()[3],
                       'MCI': MC.MCI(), 'freq_finite_forms': MC.freq_finite_forms(), 
                       'freq_aux': MC.freq_aux(), 'num_inf': MC.num_verb_forms()[0], 
                       'num_inf': MC.num_verb_forms()[0], 'num_gerund': MC.num_verb_forms()[1], 
                       'num_pres_sg': MC.num_verb_forms()[2], 'num_pres_pl': MC.num_verb_forms()[3], 
                       'num_part': MC.num_verb_forms()[4], 'num_past': MC.num_verb_forms()[5]}  
    for c in morph_criteria:
        string += str(round(dict_morph_comp[c], 5)) + ','
    string += '\n'
    r += string

A Jupyter Widget




In [25]:
with open('morph_criteria.csv', 'w') as file:
    file.write(r)

In [26]:
morph_criteria

['der_suff_level3',
 'der_suff_level4',
 'der_suff_level5',
 'der_suff_level6',
 'MCI',
 'freq_finite_forms',
 'freq_aux',
 'num_inf',
 'num_gerund',
 'num_pres_sg',
 'num_pres_pl',
 'num_part',
 'num_past']

In [13]:
parser = ParserUDpipe(text)
MC = MorphologicalComplexity(text)

In [14]:
dict_morph_comp = {'der_suff_level3': MC.derivational_suffixation()[0], 
                   'der_suff_level4': MC.derivational_suffixation()[1], 
                   'der_suff_level5': MC.derivational_suffixation()[2], 
                   'der_suff_level6': MC.derivational_suffixation()[3],
                   'MCI': MC.MCI(), 'freq_finite_forms': MC.freq_finite_forms(), 
                   'freq_aux': MC.freq_aux(), 'num_inf': MC.num_verb_forms()[0], 
                   'num_inf': MC.num_verb_forms()[0], 'num_gerund': MC.num_verb_forms()[1], 
                   'num_pres_sg': MC.num_verb_forms()[2], 'num_pres_pl': MC.num_verb_forms()[3], 
                   'num_part': MC.num_verb_forms()[4], 'num_past': MC.num_verb_forms()[5]}  

In [15]:
dict_morph_comp

{'MCI': 2.75,
 'der_suff_level3': 0.038461538461538464,
 'der_suff_level4': 0.038461538461538464,
 'der_suff_level5': 0.07692307692307693,
 'der_suff_level6': 0.0,
 'freq_aux': 0.3333333333333333,
 'freq_finite_forms': 0.8,
 'num_gerund': 0,
 'num_inf': 0,
 'num_part': 3,
 'num_past': 1,
 'num_pres_pl': 2,
 'num_pres_sg': 8}

# Syntactic Complexity

number of tokens, minimum/maximum/average depth of the sentence, number of relative clauses, number of adverbial clauses, number of modifier clauses, number of sentences, number of clauses, number of T-units, number of complex T-units, number of coordinate phrases, number of noun phrases (possessive structures,  prepositional phrases, infinitives or gerunds in the position of object or subject, phrases like “adjective + noun”, “participle + noun”, “noun + infinitive”), number of complex noun phrases, number of verb phrases, Coordination Index, variety of constructions, average number of tokens before the root of the sentence, mean length of the sentence, mean length of the clause, number of clauses per sentence, number of clauses per T-unit, number of dependent clauses per clause, number of dependent clauses per T-unit, number of coordinate phrases per clause, number of T-units per sentence, number of possessive structures per sentence,  number of prepositional phrases per sentence, number of infinitives or gerunds in the position of object or subject per sentence, number of phrases like “adjective + noun”, “participle + noun”, “noun + infinitive” per sentence,  number of verb phrases per sentence

In [18]:
# дописать класс
# дополнить критерием

In [16]:
import copy
import numpy as np
import collections

class SyntacticComplexity:
    """Returns values of syntactical criteria."""
    
    def __init__(self, text):
        self.text = text

    def get_forms(self):
        df = parser.conllu2df()
        return df['Form']
    
    def num_tokens(self):
        return len(self.get_forms())

    def order_head(self, sent):
        _id = sent['Id']
        _head = sent['Head']
        _form = sent['Form']
        return(list(zip(_id, _head)))
    
    def get_dep_rel(self):
        df = parser.conllu2df()
        return df['DepRel']

    def find_root(self, order_head_lst):
        for every_order_head in order_head_lst:
            if every_order_head[1] == 0:
                root = every_order_head
        return root

    def root_children(self, sent):
        order_head_lst = self.order_head(sent)
        root = self.find_root(order_head_lst)
        chains = []
        for every_order_head in order_head_lst:
            if every_order_head[1] == root[0]:
                chains.append([root[0], every_order_head[0]])
        return chains, order_head_lst

    def chains_heads(self, chains, order_head_lst):
        length_chains = len(chains)
        i = 0
        for chain in chains:
            if i < length_chains:
                heads = []
                if 'stop' not in chain:
                    for order_head in order_head_lst:
                        if chain[-1] == order_head[1]:
                            heads.append(order_head[0])
                    if heads == [] and 'stop' not in chain:
                        chain.append('stop')
                    else:
                        ind_head = 0
                        for head in heads:
                            new_chain = copy.copy(chain)[:-1]
                            if ind_head == 0:
                                chain.append(head)
                                ind_head += 1
                            else:
                                new_chain.append(head)
                                chains.append(new_chain)
            i += 1
        while all(item[-1] == 'stop' for item in chains) is False:
            self.chains_heads(chains, order_head_lst)
        return chains

    def count_depth_for_one_sent(self, sent):
        chains, order_head_lst = self.root_children(sent)
        chains = self.chains_heads(chains, order_head_lst)
        depths = []
        for chain in chains:
            depths.append(len(chain)-2)
        return max(depths)

    def count_depths(self):
        max_depths = []
        sentances, df_sentences = parser.conllu2df(sentences=True)
        for sent in df_sentences:
            max_depths.append(self.count_depth_for_one_sent(sent))
        return max_depths
    
    def av_depth(self):
        max_depths = self.count_depths()
        return round(np.mean(max_depths), 2)
    
    def max_depth(self):
        max_depths = self.count_depths()
        return round(np.max(max_depths),2)
    
    def min_depth(self):
        max_depths = self.count_depths()
        return round(np.min(max_depths), 2)
    
    def find_in_dict(self, d, v):
        try:
            return d[v]
        except:
            return 0
        
    def count_dep_sent(self):
        dep_rel = self.get_dep_rel()           
        dict_dep_rel = collections.Counter(dep_rel)
        acl = self.find_in_dict(dict_dep_rel, 'acl')
        rel_cl = self.find_in_dict(dict_dep_rel, 'acl:relcl')
        advcl = self.find_in_dict(dict_dep_rel, 'advcl')
        return acl, rel_cl, advcl
    
    def count_sent(parsed_text):
        sentances, df_sentences = parser.conllu2df(sentences=True)
        return len(sentances)

In [21]:
from parsing import ParserUDpipe
with open('/Users/irene/Desktop/Диплом/new_data/1.txt', 'r') as file:
    text = file.read()
text
#parser = ParserUDpipe(text)
#SC = SyntacticComplexity(text)
#SC.count_dep_sent()

'The graph contains information about money people spend on petrol. The research was done in the USA and the UK. Three classes were compared: the poorest, the richest and middle-income people.\nResults in two countries are absolutely the opposite. The UK-line gradually goes up, and reaches the peak on the point of 4 per cent. While the USA-line declines from the point of 5,3 per cent to 2,2 per cent. It means that the biggest amount of money is spent in the USA by poorest people. The same class in the UK spends only 0,5 per cent of the income. The difference in part of rich people is modest - about 1 per cent.\nOverall, people from the USA spend bigger part of their income on petrol.'

# Rhetorical Complexity

In [27]:
# научиться выделять discourse-organising nouns

In [28]:
import json
with open('linkings.json') as data_file:
    linkings = json.load(data_file)

with open('ngrams.txt') as data_file:
    ngrams = [x.split() for x in data_file.read().split('\n')]
    
with open('functional_ngrams.json') as data_file:
    func_ngrams = json.load(data_file)
    
import re

    
class RhetoricalComplexity:
    """Returns values of rhetorical criteria."""
    
    def __init__(self, text):
        self.text = text
        
    def get_forms(self):
        df = parser.conllu2df()
        return df['Form']

    def subfinder(self, mylist, pattern):
        matches = []
        for i in range(len(mylist)):
            if mylist[i] == pattern[0] and mylist[i:i+len(pattern)] == pattern:
                matches.append(pattern)
        return matches
    
    def num_dict_2_levels(self, d, prefix):
        num_all = 0
        result = {}
        for group in d:
            num_group = 0
            #print(d[group])
            for subgroup in d[group]:
                num_subgroup = 0
                name_subgroup = list(subgroup.keys())[0]
                for word in list(subgroup.values())[0]:
                    num = len(re.findall(word.lower(), self.text.lower()))
                    num_all += num
                    num_subgroup += num
                    num_group += num
                    result[prefix+name_subgroup+"("+word+")"] = num
                result[prefix+name_subgroup] = num_subgroup
            result[prefix+group] = num_group
        result[prefix+'all'] = num_all
        return result
    
    def num_linkings(self):
        """
        number of linking phrases (Swales & Feak 2009)
        """
        num_links_d = self.num_dict_2_levels(linkings, 'link_')
        return num_links_d
    
    def num_4grams(self):
        """
        """
        num_all = 0
        for ngram in ngrams:
            num = len(self.subfinder([x.lower() if type(x) == str else x for x in self.get_forms()], ngram))
            num_all += num
        return num_all
    
    def num_func_ngrams(self):
        """
        number of linking phrases (Swales & Feak 2009)
        """
        num_links_d = self.num_dict_2_levels(func_ngrams, '4grams_')
        return num_links_d

In [29]:
parser = ParserUDpipe(text)
RC = RhetoricalComplexity(text)
_func_ngrams = RC.num_func_ngrams()

In [30]:
num_grams = {'num_4grams': RC.num_4grams()}
_func_ngrams = RC.num_func_ngrams()
num_linkings = RC.num_linkings()
_dict_rhet_comp = {**num_linkings, **_func_ngrams}
dict_rhet_comp = {**_dict_rhet_comp, **num_grams}

In [31]:
rhet_criteria = list(dict_rhet_comp.keys())

In [32]:
r = ''
for path in tqdm(paths):
    string = path + ','
    with open(path, 'r') as file:
        text = file.read()
    text = check_spelling(text)
    text = text.replace('\n', ' ')
    parser = ParserUDpipe(text)
    RC = RhetoricalComplexity(text)
    num_grams = {'num_4grams': RC.num_4grams()}
    _func_ngrams = RC.num_func_ngrams()
    num_linkings = RC.num_linkings()
    _dict_rhet_comp = {**num_linkings, **_func_ngrams}
    dict_rhet_comp = {**_dict_rhet_comp, **num_grams} 
    for c in rhet_criteria:
        string += str(dict_rhet_comp[c]) + ','
    string += '\n'
    r += string

A Jupyter Widget




In [35]:
string

'/Users/irene/Desktop/Диплом/new_data/259.txt,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,5,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,\n'

In [33]:
with open('rhet_criteria.csv', 'w') as file:
    file.write(r)

In [34]:
rhet_criteria

['link_Sentence connectors(Furthermore)',
 'link_Sentence connectors(In addition)',
 'link_Sentence connectors(Moreover)',
 'link_Sentence connectors',
 'link_Phrases linkers(In addition)',
 'link_Phrases linkers',
 'link_Addition',
 'link_subordinators(Although)',
 'link_subordinators(Even though)',
 'link_subordinators',
 'link_Sentence connectors(However)',
 'link_Sentence connectors(Nevertheless)',
 'link_Phrase linkers(despite)',
 'link_Phrase linkers(In spite of)',
 'link_Phrase linkers',
 'link_Adversativity',
 'link_subordinators(Because)',
 'link_subordinators(since)',
 'link_Sentence connectors(Therefore)',
 'link_Sentence connectors(As a result)',
 'link_Sentence connectors(consequently)',
 'link_Sentence connectors(Hence)',
 'link_Sentence connectors(Thus)',
 'link_Phrase linkers(Because of)',
 'link_Phrase linkers(Due to)',
 'link_Phrase linkers(As a result of)',
 'link_Cause and effect',
 'link_Sentence connectors(In other words)',
 'link_Sentence connectors(That is)',
 '

In [16]:
from parsing import ParserUDpipe
#with open('/Users/irene/Desktop/Диплом/new_data/1.txt', 'r') as file:
#    text = file.read()
text = 'I have a large amount of fruits.'
parser = ParserUDpipe(text)
RC = RhetoricalComplexity(text)
RC.num_func_ngrams()

and to be a
are more and more
as well as the
but there are still
can be divided into
how to deal with
if you don’t know
if you want to
in order to make
is a kind of
is based on the
is more important than
is totally different from
it is a good
it is a very
it is also a
it is because the
it is not a
on the other hand
there will be a
to cope with the
want to be a
my point of view
the best way to
a very important role
as far as the
as I have mentioned
him or her to
is one of my
is one of the
is the most important
is very important for
it is very important
one of the most
the most important thing
we can say that
we can see that
we can see the
I am going to
I would like to
if there is a
a great deal of
a great number of
a large amount of
a lot of people
a lot of problem
a lot of problems
a lot of time
all of them are
and a lot of
bring a lot of
has a lot of
has a lot of
more and more people
most of the people
most of them are
some of them are
that it is more
the rest of the
the rest of the w

{'Attitudinal/modality': 0,
 'Attitudinal/modality(I hope I can)': 0,
 'Attitudinal/modality(are not allowed to)': 0,
 'Attitudinal/modality(is very important to)': 0,
 'Attitudinal/modality(it is difficult to)': 0,
 'Attitudinal/modality(it is hard to)': 0,
 'Attitudinal/modality(it is not easy)': 0,
 'Attitudinal/modality(it is very difficult)': 0,
 'Attitudinal/modality(necessary for us to)': 0,
 'Attitudinal/modality(should learn how to)': 0,
 'Attitudinal/modality(will not be able to)': 0,
 'Discourse organizers': 0,
 'Epistemic': 0,
 'Epistemic(I think it is)': 0,
 'Epistemic(I think that this)': 0,
 'Epistemic(I think the most)': 0,
 'Epistemic(I think this is)': 0,
 'Epistemic(as a matter of)': 0,
 'Epistemic(as we all know)': 0,
 'Epistemic(become more and more)': 0,
 'Epistemic(it is believed that)': 0,
 'Epistemic(it is obvious that)': 0,
 'Epistemic(it is true that)': 0,
 'Epistemic(some people think that)': 0,
 'Framing': 0,
 'Framing(as a result of)': 0,
 'Framing(as the 

# Частотные 4-граммы из REALEC

In [34]:
from parsing import ParserUDpipe
from nltk import FreqDist
from nltk.util import ngrams
import os
from tqdm import tqdm_notebook as tqdm

In [2]:
paths = []
for root, dirs, files in os.walk('/Users/irene/Downloads/data'):
    for name in files:
        if name.endswith(".txt"):
            path = root+'/'+name
            paths.append(path)

In [3]:
len(paths)

14359

In [4]:
d_freq = {}
len_corpus = 0
for path in tqdm(paths):
    with open(path, 'r') as file:
        text = file.read()
    parser = ParserUDpipe(text)
    sents, dfs = parser.conllu2df(sentences=True)
    for sent in dfs:
        tokens = [x.lower() if type(x) == str else x for x in sent['Form']]     
        len_corpus += len(tokens)
        grams = ngrams(tokens, 4)
        for gram in grams:
            gram = ' '.join([str(g) for g in gram])
            if gram in d_freq:
                d_freq[gram] += 1
            else:
                d_freq[gram] = 1

A Jupyter Widget

  if sys.path[0] == '':





In [5]:
#sorted_by_value = sorted(d_freq.items(), key=lambda kv: kv[1], reverse=True)

In [6]:
len_corpus

4088807

In [7]:
len(d_freq.keys())

1494911

In [None]:
# Отобрали с частотностью >= 200

In [8]:
import json

import io
try:
    to_unicode = unicode
except NameError:
    to_unicode = str

# Write JSON file
data = sorted(d_freq.items(), key=lambda kv: kv[1], reverse=True)
with io.open('sorted_ngrams_REALEC.json', 'w', encoding='utf8') as outfile:
    str_ = json.dumps(data,
                      indent=4, sort_keys=True,
                      separators=(',', ': '), ensure_ascii=False)
    outfile.write(to_unicode(str_))

# Read JSON file
with open('sorted_ngrams_REALEC.json') as data_file:
    data_loaded = json.load(data_file)

print(data == data_loaded)

False


In [10]:
with open('sorted_ngrams_REALEC.json') as data_file:
    data_loaded = json.load(data_file)

In [11]:
data_loaded[0]

['to sum up ,', 2306]

In [14]:
file = open('REALECngrams.txt', 'a')
for x in data_loaded:
    if x[1] >= 200:
        file.write(x[0]+'\n')
    else:
        break
file.close()

In [17]:
with open('REALECngrams.txt', 'r') as file:
    f1 = file.read()
with open('ChenBakergrams.txt', 'r') as file:
    f2 = file.read()
with open('ngrams.txt', 'w') as file:
    file.write(f1+'\n'+f2)

# Запись списков

In [65]:
#with open('5000frequentCOCA.csv', 'r') as file:
#    f = file.read()
#f = f.replace('  ', '')
#with open('5000frequentCOCA.csv', 'w') as file:
#    file.write(f)

In [41]:
import pandas as pd

In [42]:
df1 = pd.read_csv('5000frequentCOCA.csv')
df2 = pd.read_csv('frequentverbsCOCAfrom5000.csv')
with open('UWL.txt', 'r') as file:
    f = file.read()
uwl = f.split()

In [43]:
import json

import io
try:
    to_unicode = unicode
except NameError:
    to_unicode = str

# Write JSON file
data = {'5000frequentCOCA': list(df1['Word']), 
        'frequentverbsCOCAfrom5000': list(df1['Word']), 
        'UWL': uwl}
with io.open('lists.json', 'w', encoding='utf8') as outfile:
    str_ = json.dumps(data,
                      indent=4, sort_keys=True,
                      separators=(',', ': '), ensure_ascii=False)
    outfile.write(to_unicode(str_))

# Read JSON file
with open('lists.json') as data_file:
    data_loaded = json.load(data_file)

print(data == data_loaded)

True


In [None]:
# проверить ord/lemma/token/verb

In [66]:
df = pd.read_csv('5000frequentCOCA.csv')

In [67]:
df.head()

Unnamed: 0,Rank,Word,Part_of_speech,Frequency,Dispersion
0,1,the,a,22038615,0.98
1,2,be,v,12545825,0.97
2,3,and,c,10741073,0.99
3,4,of,i,10343885,0.97
4,5,a,a,10144200,0.98
