# Критерии syntactic complexity

### 1. Количество слов

In [108]:
from model import Model
import re
import copy
import numpy as np
from spellchecker import check_spelling
from statistics import mean

#### Функция, которая добавляет пробелы после .?!

In [109]:
def space(string):
    string = re.sub('([a-zA-Z]| )([\.\?!])', '\\1\\2 ', string)
    string = re.sub(': ', ' : ', string)
    string = re.sub('; ', ' ; ', string)
    string = re.sub('  +', ' ', string)
    return string

#### Функция, которая парсит текст udpipe. На выходе получается строка с форматом 'conllu'.

In [110]:
def get_parsed_text(model_name, text_path):
    model = Model(model_name)
    text = open(text_path, 'r')
    text = text.read()
    text = check_spelling(text)
    text = space(text)
    sentences = model.tokenize(text)
    for s in sentences:
        model.tag(s)
        model.parse(s)
    output = model.write(sentences, "conllu")
    return output

#### Функция, которая считает токены в тексте. Не считаются знаки препинания.

In [111]:
def count_tokens(parsed_text):
    num_tokens = 0
    lst_str = parsed_text.split('\n')
    for every_str in lst_str:
        #print(every_str)
        if ('PUNCT' not in every_str) and (every_str.startswith('#') == False) and (every_str != ''):
            #print(every_str)
            num_tokens += 1
            #print(num_tokens)
    return num_tokens

In [112]:
#count_tokens(parsed_text)

### 2. Глубина дерева

#### Функция, которая считает глубину дерева.

In [113]:
def order_head(parsed_sent):
    sent_lst = parsed_sent.split('\n')
    #print(sent_lst[0])
    order_head_lst = []
    for token in sent_lst:
        token = re.sub(r'\|', '$', token)
        if '\t_\t_\t_\t_\t_' not in token:
            order = re.search('([0-9]+)\t', token).group(1)
        #print(order)
            head = re.search('.+\t.+\t([0-9]+)', token).group(1)
        #print(head)
            token = re.search('^[0-9]+\t(.+?)\t', token).group(1)
            order_head_lst.append((int(order), int(head), token))
    return order_head_lst

In [114]:
def find_root(order_head_lst):
    for every_order_head in order_head_lst:
        if every_order_head[1] == 0:
            root = every_order_head
    return root

In [115]:
def root_children(parsed_sent):
    order_head_lst = order_head(parsed_sent)
    #print(order_head_lst)
    root = find_root(order_head_lst)
    chains = []
    for every_order_head in order_head_lst:
        if every_order_head[1] == root[0]:
            chains.append([root[0], every_order_head[0]])
    return chains, order_head_lst

In [116]:
def chains_heads(chains, order_head_lst):
    length_chains = len(chains)
    i = 0
    for chain in chains:
        if i < length_chains:
            heads = []
            if 'stop' not in chain:
                for order_head in order_head_lst:
                    if chain[-1] == order_head[1]:
                        heads.append(order_head[0])
                if heads == [] and 'stop' not in chain:
                    chain.append('stop')
                else:
                    ind_head = 0
                    for head in heads:
                        new_chain = copy.copy(chain)[:-1]
                        if ind_head == 0:
                            chain.append(head)
                            ind_head += 1
                        else:
                            new_chain.append(head)
                            chains.append(new_chain)
        i += 1
    while all(item[-1] == 'stop' for item in chains) is False:
        chains_heads(chains, order_head_lst)
    return chains

In [117]:
def count_max_depth_for_one_sent(sent):
    chains, order_head_lst = root_children(sent)
    chains = chains_heads(chains, order_head_lst)
    depths = []
    #print(chains)
    for chain in chains:
        depths.append(len(chain)-2)
    return max(depths)

In [118]:
def count_depths_for_one_text(parsed_text):
    sent_lst = re.findall('(1\t.+?)\n\n', parsed_text, re.DOTALL)
    max_depths = []
    for sent in sent_lst:
        #print(sent)
        max_depths.append(count_max_depth_for_one_sent(sent))
    return max_depths

In [119]:
def av_depth_for_one_text(parsed_text):
    max_depths = count_depths_for_one_text(parsed_text)
    return round(np.mean(max_depths), 2)

In [120]:
def max_depth_for_one_text(parsed_text):
    max_depths = count_depths_for_one_text(parsed_text)
    return round(np.max(max_depths),2)

In [121]:
def min_depth_for_one_text(parsed_text):
    max_depths = count_depths_for_one_text(parsed_text)
    return round(np.min(max_depths), 2)

In [122]:
#av_depth_for_one_text(parsed_text)

In [123]:
#max_depth_for_one_text(parsed_text)

In [124]:
#min_depth_for_one_text(parsed_text)

### 3. Количество зависимых клауз: acl, acl:relcl, advcl

In [125]:
# Возвращает словарь, где ключ - номер предложения, значения - массив [кол-во acl, кол-во acl:relcl, кол-во advcl]
def count_dependent_sent(parsed_text):
    sent_lst = re.findall('(1\t.+?)\n\n', parsed_text, re.DOTALL)
    num_sent = 1
    d_sent = {}
    for sent in sent_lst:
        acl = len(re.findall('\t(acl)\t',sent))
        relcl = len(re.findall('\t(acl:relcl)\t',sent))
        advcl = len(re.findall('\t(advcl)\t',sent))
        d_sent[num_sent] = [acl, relcl, advcl]
        num_sent += 1
    return d_sent

In [126]:
def count_dependent_sent_text(parsed_text):
    d_sent = count_dependent_sent(parsed_text)
    acl = 0
    rel_cl = 0
    advcl = 0
    for sent in d_sent:
        acl = acl + d_sent[sent][0]
        rel_cl = rel_cl + d_sent[sent][1]
        advcl = advcl + d_sent[sent][2]
    return acl, rel_cl, advcl

In [127]:
def count_acl(parsed_text):
    acl = count_dependent_sent_text(parsed_text)[0]
    return acl

In [128]:
def count_acl_relcl(parsed_text):
    acl_relcl = count_dependent_sent_text(parsed_text)[1]
    return acl_relcl

In [129]:
def count_advcl(parsed_text):
    advcl = count_dependent_sent_text(parsed_text)[2]
    return advcl

In [131]:
#count_acl(parsed_text)

In [133]:
#count_acl_relcl(parsed_text)

In [134]:
#count_advcl(parsed_text)

### 4. Количество предложений

In [135]:
def count_sent(parsed_text):
    sent_lst = re.findall('(1\t.+?)\n\n', parsed_text, re.DOTALL)
    return len(sent_lst)

In [136]:
#count_sent(parsed_text)

### 5. Количество клауз

In [137]:
def parsing_things(string):
    token = re.search('[0-9]+\t(.+?)\t', string).group(1)
    order = re.search('([0-9]+)\t', string).group(1)
    head = re.search('\t([0-9]+)\t', string).group(1)
    rel_type = re.search('\t[0-9]+\t(.+?)\t', string).group(1)
    pos = re.search('[0-9]+\t.+?\t.+?\t(.+?)\t', string).group(1)
    #grammar = re.search('[VERB|AUX]\t.+?\t(.+?)\t', every_str).group(1)
    return order, token, head, rel_type, pos

#### Функция, которая возвращает словарь, где ключи - номера предложений, а значения - количество клауз.

***There was a woman next door, and she was a singer.*** - 2 T-units, 2 clauses

***There was a woman next door who was a singer.*** - 1 T-units, 2 clauses

***But while they were trying they killed a whale and used the oil for the lamps.*** - 2 clauses

In [138]:
def count_clauses_every_sent(parsed_text):
    sent_lst = re.findall('(1\t.+?)\n\n', parsed_text, re.DOTALL)
    verb_cl = {}
    all_num_sent = count_sent(parsed_text)
    for sent in range(1, all_num_sent+1):
        verb_cl[sent] = []
    num_sent = 1
    for every_sent in sent_lst:
        lst_str = every_sent.split('\n')
        for every_str in lst_str:
            if ('VerbForm=Fin' in every_str):
                sent_id = str(num_sent)
                order, token, head, rel_type, pos = parsing_things(every_str)
                if head not in verb_cl[int(sent_id)] and rel_type != 'conj':
                    verb_cl[int(sent_id)].append([order, head])
        num_sent += 1
    for key, value in verb_cl.items():
        if verb_cl[key] == []:
            verb_cl[key] = [None]
        verb_cl[key] = len(verb_cl[key])
    return verb_cl

In [139]:
def count_clauses(parsed_text):
    verb_cl = count_clauses_every_sent(parsed_text)
    num_cl = 0
    #print(verb_cl)
    for key, value in verb_cl.items():
        num_cl = num_cl + value
    return num_cl

In [140]:
#count_clauses(parsed_text)

### 6. Количество T-юнитов

In [141]:
def find_subjects(sentence):
    lst_str = sentence.split('\n')
    maybe_depends = []
    for every_str in lst_str:
        finding = re.search('PRON|NOUN', every_str)
        if '_\t_\t_\t_\t_' not in every_str:
            order, token, head, rel_type, pos = parsing_things(every_str)
            if finding is not None:
                maybe_depends.append(rel_type)
    return maybe_depends

In [142]:
def count_tunits_every_sent(parsed_text):
    verb_cl = count_clauses_every_sent(parsed_text)
    sent_lst = re.findall('(1\t.+?)\n\n', parsed_text, re.DOTALL)
    for key, value in verb_cl.items():
        subjects = find_subjects(sent_lst[key-1])
        acl_relcl = subjects.count('acl:relcl')
        acl = subjects.count('acl')
        advcl = subjects.count('advcl')
        depends = acl_relcl + acl + advcl
        verb_cl[key] = verb_cl[key]-depends
    return verb_cl

In [143]:
def count_tunits(parsed_text):
    verb_cl = count_tunits_every_sent(parsed_text)
    num_t = 0
    #print(verb_cl)
    for key, value in verb_cl.items():
        num_t = num_t + value
    return num_t

In [144]:
#count_tunits(parsed_text)

### 7. Количество сложных T-юнитов

In [145]:
def count_complex_tunit(parsed_text):
    return count_clauses(parsed_text)-count_tunits(parsed_text)

In [146]:
#count_complex_tunit(parsed_text)

### 8. Количество сочинительных фраз

#### Функция возвращает все сочинительные союзы (их вершины): ключи - номера предложений, значения - массив из вершин союзов.

In [147]:
def find_all_coord(parsed_text):
    sent_lst = re.findall('(1\t.+?)\n\n', parsed_text, re.DOTALL)
    all_num_sent = count_sent(parsed_text)
    cp = {}
    for sent in range(1, all_num_sent+1):
        cp[sent] = []
    num_sent = 1
    for every_sent in sent_lst:
        lst_str = every_sent.split('\n')
        for every_str in lst_str:
            if '\tcc\t' in every_str:
                order, token, head, rel_type, pos = parsing_things(every_str)
                cp[int(num_sent)].append(head)
        num_sent += 1
    return cp

#### Функция возвращает cловарь: ключи - номера предложений, значение - количество сочинительных фраз.

In [148]:
def final_coord(parsed_text):
    cp = find_all_coord(parsed_text)
    sent_lst = re.findall('(1\t.+?)\n\n', parsed_text, re.DOTALL)
    all_num_sent = count_sent(parsed_text)
    num_sent = 1
    cp_final = {}
    for sent in range(1, all_num_sent+1):
        cp_final[sent] = 0
    for every_sent in sent_lst:
        #print('НОМЕР ПРЕДЛОЖЕНИЯ: ' + str(num_sent))
        for every_coord_cp in cp[num_sent]:
            #print('ВЕРШИНА СОЮЗА: ' + every_coord_cp)
            finding2 = re.search('('+every_coord_cp+'\t.+?\tconj.+?)\n', every_sent)
            if finding2 is None:
                continue
            else:
                order2, token2, head2, rel_type2, pos2 = parsing_things(finding2.group(0))
                #print('2 СВЯЗУЮЩИЙ: ' + token2 + pos2)
                finding1 = re.search('('+head2+'\t.+?\t[a-z]+.+?)\n', every_sent)
                order1, token1, head1, rel_type1, pos1 = parsing_things(finding1.group(0))
                #print('1 СВЯЗУЮЩИЙ: ' + token1+pos1)
                if pos2 == pos1:
                    cp_final[num_sent] += 1 
        num_sent += 1
    return cp_final

In [149]:
def count_coord(parsed_text):
    cp_final = final_coord(parsed_text)
    num_cp = 0
    for key, value in cp_final.items():
        num_cp = num_cp + value
    return num_cp

In [150]:
#count_coord(parsed_text)

### 9. Количество сложных именных групп

Complex nominals comprise (i) nouns plus adjective, possessive, prepositional phrase, relative clause, participle, or appositive, (ii) nominal clauses, and (iii) gerunds and infinitives in subject position (Cooper 1976)

#### Possessive

In [151]:
def find_possesive(parsed_text):
    sent_lst = re.findall('(1\t.+?)\n\n', parsed_text, re.DOTALL)
    all_num_sent = count_sent(parsed_text)
    poss = {}
    for sent in range(1, all_num_sent+1):
        poss[sent] = 0
    num_sent = 1
    for every_sent in sent_lst:
        lst_str = every_sent.split('\n')
        for every_str in lst_str:
            if 'nmod' in every_str:# включая nmod:poss
                order, token, head, rel_type, pos = parsing_things(every_str)
                poss[num_sent] = poss[num_sent] + 1
        num_sent += 1
    return poss

#### Prepositional phrase

In [152]:
def find_prep(parsed_text):
    sent_lst = re.findall('(1\t.+?)\n\n', parsed_text, re.DOTALL)
    all_num_sent = count_sent(parsed_text)
    poss = {}
    for sent in range(1, all_num_sent+1):
        poss[sent] = 0
    num_sent = 1
    for every_sent in sent_lst:
        lst_str = every_sent.split('\n')
        for every_str in lst_str:
            if 'ADP' in every_str:
                order, token, head, rel_type, pos = parsing_things(every_str)
                poss[num_sent] = poss[num_sent] + 1
        num_sent += 1
    return poss

#### Nouns plus adjective

In [153]:
def find_nouns(parsed_text):
    sent_lst = re.findall('(1\t.+?)\n\n', parsed_text, re.DOTALL)
    all_num_sent = count_sent(parsed_text)
    vp = {}
    for sent in range(1, all_num_sent+1):
        vp[sent] = []
    num_sent = 1
    for every_sent in sent_lst:
        lst_str = every_sent.split('\n')
        for every_str in lst_str:
            if 'NOUN' in every_str:
                order, token, head, rel_type, pos = parsing_things(every_str)
                vp[num_sent].append(order)
        num_sent += 1
    return vp

In [154]:
def find_adjs(parsed_text):
    sent_lst = re.findall('(1\t.+?)\n\n', parsed_text, re.DOTALL)
    all_num_sent = count_sent(parsed_text)
    vp = {}
    for sent in range(1, all_num_sent+1):
        vp[sent] = []
    num_sent = 1
    for every_sent in sent_lst:
        lst_str = every_sent.split('\n')
        for every_str in lst_str:
            if 'ADJ' in every_str:
                order, token, head, rel_type, pos = parsing_things(every_str)
                vp[num_sent].append(head)
        num_sent += 1
    return vp

In [155]:
def adj_noun(parsed_text):
    adjs = find_adjs(parsed_text)
    nouns = find_nouns(parsed_text)
    adj_noun = {}
    for key in adjs:
        adj_noun[key] = len([i for i, j in zip(adjs[key], nouns[key]) if i == j])
    return adj_noun

#### Gerunds and infinitives in subject position

In [156]:
def find_ger_inf(parsed_text):
    sent_lst = re.findall('(1\t.+?)\n\n', parsed_text, re.DOTALL)
    all_num_sent = count_sent(parsed_text)
    poss = {}
    for sent in range(1, all_num_sent+1):
        poss[sent] = 0
    num_sent = 1
    for every_sent in sent_lst:
        lst_str = every_sent.split('\n')
        for every_str in lst_str:
            if 'VerbForm=Ger' in every_str or 'VerbForm=Inf' in every_str and 'xcomp' not in every_str:
                order, token, head, rel_type, pos = parsing_things(every_str)
                poss[num_sent] = poss[num_sent] + 1
        num_sent += 1
    return poss

#### Nouns plus participle

In [157]:
def find_parts(parsed_text):
    sent_lst = re.findall('(1\t.+?)\n\n', parsed_text, re.DOTALL)
    all_num_sent = count_sent(parsed_text)
    vp = {}
    for sent in range(1, all_num_sent+1):
        vp[sent] = []
    num_sent = 1
    for every_sent in sent_lst:
        lst_str = every_sent.split('\n')
        for every_str in lst_str:
            if 'VerbForm=Part' in every_str:
                order, token, head, rel_type, pos = parsing_things(every_str)
                vp[num_sent].append(head)
        num_sent += 1
    return vp

In [158]:
def parts_noun(parsed_text):
    parts = find_parts(parsed_text)
    nouns = find_nouns(parsed_text)
    adj_noun = {}
    for key in parts:
        adj_noun[key] = len([i for i, j in zip(parts[key], nouns[key]) if i == j])
    return adj_noun

In [159]:
def count(d):
    num = 0
    for key, value in d.items():
        num = num + value
    return num

In [160]:
def count_np(parsed_text):
    poss = count(find_possesive(parsed_text))
    prep_ph = count(find_prep(parsed_text))
    adj_n = count(adj_noun(parsed_text))
    ger_inf = count(find_ger_inf(parsed_text))
    part_n = count(parts_noun(parsed_text))
    return poss, prep_ph, adj_n, ger_inf, part_n

In [162]:
#count_np(parsed_text)

### 10. Количество глагольных групп

In [163]:
def find_verbs(parsed_text):
    sent_lst = re.findall('(1\t.+?)\n\n', parsed_text, re.DOTALL)
    all_num_sent = count_sent(parsed_text)
    vp = {}
    for sent in range(1, all_num_sent+1):
        vp[sent] = []
    num_sent = 1
    for every_sent in sent_lst:
        lst_str = every_sent.split('\n')
        for every_str in lst_str:
            if 'VERB' in every_str or 'AUX' in every_str:
                order, token, head, rel_type, pos = parsing_things(every_str)
                vp[num_sent].append(order)
        num_sent += 1
    return vp

In [164]:
def find_vp(parsed_text):
    vp = find_verbs(parsed_text)
    sent_lst = re.findall('(1\t.+?)\n\n', parsed_text, re.DOTALL)
    all_num_sent = count_sent(parsed_text)
    num_sent = 1
    vp_final = {}
    for sent in range(1, all_num_sent+1):
        vp_final[sent] = 0
    for every_sent in sent_lst:
        #print('НОМЕР ПРЕДЛОЖЕНИЯ: ' + str(num_sent))
        for every_vp in vp[num_sent]:
            #print(every_vp)
            finding_dep = re.findall('\t'+every_vp+'\t.+?\t', every_sent)
            finding_dep_2 = []
            for every_finding_dep in finding_dep:
                find_del = re.search('mark|nsubj|punct', every_finding_dep)
                if find_del is None:
                    finding_dep_2.append(every_finding_dep)
            if finding_dep_2 != []:
                vp_final[num_sent] += 1 
        num_sent += 1
    return vp_final

In [165]:
def count_vp(parsed_text):
    vp_final = find_vp(parsed_text)
    num_vp = 0
    for key, value in vp_final.items():
        num_vp = num_vp + value
    return num_vp

In [167]:
#count_vp(parsed_text)

### 11. Синтаксическая схожесть (части речи, леммы): среднее

In [168]:
def levenshtein(seq1, seq2):  
    size_x = len(seq1) + 1
    size_y = len(seq2) + 1
    matrix = np.zeros ((size_x, size_y))
    for x in range(size_x):
        matrix [x, 0] = x
    for y in range(size_y):
        matrix [0, y] = y

    for x in range(1, size_x):
        for y in range(1, size_y):
            if seq1[x-1] == seq2[y-1]:
                matrix [x,y] = min(
                    matrix[x-1, y] + 1,
                    matrix[x-1, y-1],
                    matrix[x, y-1] + 1
                )
            else:
                matrix [x,y] = min(
                    matrix[x-1,y] + 1,
                    matrix[x-1,y-1] + 1,
                    matrix[x,y-1] + 1
                )
    return (matrix[size_x - 1, size_y - 1])

In [169]:
def pos_lemma(parsed_text):
    sent_lst = re.findall('(1\t.+?)\n\n', parsed_text, re.DOTALL)
    d = {}
    for x in range(1, len(sent_lst)+1):
        d[x] = [[], []]
    #print(d[1][0])
    i = 1
    for sent in sent_lst:
        lines = sent.split('\n')
        for line in lines:
            #print(line)
            pos = re.search('.+?\t.+?\t.+?\t(.+?)\t', line)
            lemma = re.search('.+?\t.+?\t(.+?)\t', line)
            if pos != None and lemma != None:
                d[i][0].append(pos.group(1))
                d[i][1].append(lemma.group(1))
            #print(pos)
        i += 1
    return d

In [170]:
def simularity(parsed_text):
    d = pos_lemma(parsed_text)
    #print(len(d))
    dd = {}
    for x in range(1, len(d)+1):
        dd[x] = [[], []]
    i = 1
    for key in d:
        for key2 in d:
            #print(levenshtein(d[key][0], d[key2][0]))
            if i != key2:
                dd[i][0].append(levenshtein(d[key][0], d[key2][0]))
                dd[i][1].append(levenshtein(d[key][1], d[key2][1]))
        i += 1
    #print(dd)
    for every in dd:
        dd[every][0] = mean(dd[every][0])
        dd[every][1] = mean(dd[every][1])
    return dd

In [171]:
def simularity2(parsed_text):
    d = pos_lemma(parsed_text)
    #print(len(d))
    dd = {}
    for x in range(1, len(d)+1):
        dd[x] = [[], []]
    i = 1
    #print(dd)
    for key in d:
        #print(key)
        if i + 1 <= len(d):
            dd[i][0].append(levenshtein(d[key][0], d[key+1][0]))
            dd[i][1].append(levenshtein(d[key][1], d[key+1][1]))
        i += 1
    #print(dd)
    return dd

In [172]:
def pos_sim_mean(parsed_text):
    sim = simularity(parsed_text)
    pos_min = []
    for sent in sim:
        pos_min.append(sim[sent][0])
    return round(mean(pos_min), 2) 

In [173]:
def lemma_sim_mean(parsed_text):
    sim = simularity(parsed_text)
    lemma_min = []
    #print(sim)
    for sent in sim:
        lemma_min.append(sim[sent][1])
    return round(mean(lemma_min), 2) 

In [174]:
def pos_sim_mean2(parsed_text):
    sim = simularity2(parsed_text)
    pos_min = []
    for sent in sim:
        try:
            #print(sim[sent][0][0])
            pos_min.append(sim[sent][0][0])
        except:
            break
    return round(mean(pos_min), 2)

In [175]:
def lemma_sim_mean2(parsed_text):
    sim = simularity2(parsed_text)
    lemma_min = []
    for sent in sim:
        try:
            #print(sim[sent][1][0])
            lemma_min.append(sim[sent][1][0])
        except:
            break
    return round(mean(lemma_min), 2)

In [176]:
#lemma_sim_mean2(parsed_text)

### 12. Среднее количество токенов перед корнем предложения

In [177]:
def tokens_befor_root(parsed_text):
    sent_lst = re.findall('(1\t.+?)\n\n', parsed_text, re.DOTALL)
    length = []
    for sent in sent_lst:
        lines = sent.split('\n')
        i = 0
        for line in lines:
            rel_type = re.search('.+?\t.+?\t.+?\t.+?\t.+?\t.+?\t.+?\t(.+?)\t', line)
            if rel_type is not None:
                if rel_type.group(1) == 'root':
                    break
            i += 1
        length.append(i)
    return round(mean(length), 2)

In [178]:
#tokens_befor_root(parsed_text)

### 13. Средняя длина предложения

In [179]:
def mean_len_sent(parsed_text):
    sent_lst = re.findall('(1\t.+?)\n\n', parsed_text, re.DOTALL)
    length = []
    for sent in sent_lst:
        lines = sent.split('\n')
        i = 0
        for line in lines:
            pos = re.search('.+?\t.+?\t.+?\t(.+?)\t', line)
            if pos is not None:
                if pos.group(1) != 'PUNCT':
                    i += 1
        length.append(i)
    return round(mean(length), 2)

In [180]:
#mean_len_sent(parsed_text)

### 13. NOUN + INF

In [190]:
def find_nouns(parsed_text):
    sent_lst = re.findall('(1\t.+?)\n\n', parsed_text, re.DOTALL)
    all_num_sent = count_sent(parsed_text)
    vp = {}
    for sent in range(1, all_num_sent+1):
        vp[sent] = []
    num_sent = 1
    for every_sent in sent_lst:
        lst_str = every_sent.split('\n')
        for every_str in lst_str:
            if 'NOUN' in every_str:
                order, token, head, rel_type, pos = parsing_things(every_str)
                vp[num_sent].append(order)
        num_sent += 1
    return vp

In [191]:
find_nouns(parsed_text)

{1: ['3', '7', '9', '17', '28', '33'],
 2: ['3', '6'],
 3: ['9', '11'],
 4: ['11', '18', '25'],
 5: ['5', '13', '28', '35', '36'],
 6: ['8', '11', '20', '27', '31', '32'],
 7: ['6', '10', '12'],
 8: ['3', '5', '6', '8', '18', '25', '27', '30'],
 9: ['2', '5', '8', '10', '12'],
 10: ['5', '6', '10', '18', '22', '28', '31', '35', '44'],
 11: ['4', '15', '19', '20', '24', '27', '32', '34']}

In [194]:
def find_inf(parsed_text):
    sent_lst = re.findall('(1\t.+?)\n\n', parsed_text, re.DOTALL)
    nouns_d = find_nouns(parsed_text)
    k = 0
    i = 0
    for every_sent in nouns_d:
        for every_num in nouns_d[every_sent]:
            num_next = int(every_num) + 1
            finding = re.search ('\n' + str(num_next) + '\t.+?\tto\t', sent_lst[i])
            if finding != None:
                k += 1
        i += 1
    return k

In [193]:
intfind_inf(parsed_text)/count_sent(parsed_text)

1


TypeError: unsupported operand type(s) for /: 'NoneType' and 'int'

In [189]:
parsed_text = get_parsed_text('english-partut-ud-2.0-170801.udpipe', '/Users/irene/Desktop/Курсовая/esseys/AAl_13_1.txt')

In [188]:
#parsed_text