# Критерии syntactic complexity

### 1. Количество слов

In [2]:
from model import Model
import re
import copy
import numpy as np
from statistics import mean

#### Функция, которая добавляет пробелы после .?!

In [3]:
def space(string):
    """Normalise spacing around sentence and clause punctuation.

    A space is inserted after every ``.``, ``?`` or ``!`` that directly
    follows an ASCII letter or a space; ``": "`` and ``"; "`` get a space on
    both sides of the mark; finally runs of spaces collapse to a single one.

    Args:
        string: Raw text.

    Returns:
        The text with the spacing rules above applied.
    """
    # Raw strings: '\.' and '\?' inside a plain literal are invalid escape
    # sequences and emit a warning on current Python versions.
    string = re.sub(r'([a-zA-Z]| )([.?!])', r'\1\2 ', string)
    string = re.sub(': ', ' : ', string)
    string = re.sub('; ', ' ; ', string)
    string = re.sub('  +', ' ', string)
    return string

#### Функция, которая парсит текст udpipe. На выходе получается строка с форматом 'conllu'.

In [4]:
def get_parsed_text(model_name, text_path):
    """Read a text file, clean it and return its parse in 'conllu' format.

    The text is spell-corrected with ``check_spelling`` and re-spaced with
    ``space`` before being tokenised, tagged and parsed by the project
    ``Model`` wrapper (presumably UDPipe -- confirm against ``model.py``).

    Args:
        model_name: Argument passed straight to ``Model``.
        text_path: Path of the text file to parse.

    Returns:
        Whatever ``model.write(sentences, "conllu")`` returns.
    """
    model = Model(model_name)
    # Context manager so the file handle is closed (the original leaked it).
    with open(text_path, 'r') as source:
        text = source.read()
    text = check_spelling(text)
    text = space(text)
    sentences = model.tokenize(text)
    for s in sentences:
        model.tag(s)
        model.parse(s)
    output = model.write(sentences, "conllu")
    return output

#### Функция, которая считает токены в тексте. Не считаются знаки препинания.

In [5]:
def count_tokens(parsed_text):
    """Count the token lines of a CoNLL-U string, skipping punctuation.

    A line is counted when it is non-empty, does not start with ``#`` and
    does not contain the substring ``PUNCT`` anywhere.
    """
    return sum(
        1
        for line in parsed_text.split('\n')
        if line and not line.startswith('#') and 'PUNCT' not in line
    )

In [6]:
#count_tokens(parsed_text)

### 2. Глубина дерева

#### Функция, которая считает глубину дерева.

In [7]:
def order_head(parsed_sent):
    """Turn the token lines of one sentence into ``(id, head, form)`` triples.

    Every ``|`` in a line is first replaced by ``$``.  Lines containing five
    consecutive ``_`` columns are skipped.  ``id`` and ``head`` are returned
    as ints, ``form`` as the text of the second column.

    Raises:
        AttributeError: if a processed line does not match the expected
            patterns (``re.search`` returns ``None``).
    """
    triples = []
    for line in parsed_sent.split('\n'):
        line = line.replace('|', '$')
        if '\t_\t_\t_\t_\t_' in line:
            continue
        token_id = re.search('([0-9]+)\t', line).group(1)
        # Greedy prefix: picks the last tab-preceded digit run on the line.
        parent_id = re.search('.+\t.+\t([0-9]+)', line).group(1)
        form = re.search('^[0-9]+\t(.+?)\t', line).group(1)
        triples.append((int(token_id), int(parent_id), form))
    return triples

In [8]:
def find_root(order_head_lst):
    """Return the root entry of a sentence.

    Args:
        order_head_lst: Sequence of ``(id, head, form)`` triples.

    Returns:
        The last triple whose head is ``0``.

    Raises:
        ValueError: if no triple has head ``0`` (the original fell through
            to an ``UnboundLocalError`` on the ``return``).
    """
    root = None
    for entry in order_head_lst:
        if entry[1] == 0:
            root = entry
    if root is None:
        raise ValueError('no root token (head == 0) found in sentence')
    return root

In [9]:
def root_children(parsed_sent):
    """Start one dependency chain per direct child of the root.

    Returns:
        ``(chains, tokens)`` where ``tokens`` is the output of
        ``order_head`` and each chain is ``[root_id, child_id]``.
    """
    tokens = order_head(parsed_sent)
    root_id = find_root(tokens)[0]
    chains = [[root_id, tok[0]] for tok in tokens if tok[1] == root_id]
    return chains, tokens

In [120]:
def chains_heads(chains, order_head_lst):
    """Extend every chain down the tree until each one ends with 'stop'.

    ``chains`` is a list of lists of token ids (as built by
    ``root_children``); it is modified in place and also returned.  One pass
    extends each chain that was present at call time by one level; the
    function then calls itself until every chain ends with ``'stop'``.

    Args:
        chains: List of chains to extend (mutated).
        order_head_lst: ``(id, head, form)`` triples of the sentence.

    Returns:
        The same ``chains`` list, each chain terminated by ``'stop'``.
    """
    # Only the chains present at entry are processed in this pass; chains
    # appended below are seen by the loop but skipped through the `i` guard.
    length_chains = len(chains)
    i = 0
    for chain in chains:
        if i < length_chains:
            heads = []
            if 'stop' not in chain:
                # Collect the ids of all tokens whose head is the chain's tip.
                for order_head in order_head_lst:
                    if chain[-1] == order_head[1]:
                        heads.append(order_head[0])
                if heads == [] and 'stop' not in chain:
                    # Leaf reached: mark the chain as finished.
                    chain.append('stop')
                else:
                    ind_head = 0
                    for head in heads:
                        # For the 2nd+ child `chain` already carries the first
                        # child, so dropping the last item restores the prefix.
                        new_chain = copy.copy(chain)[:-1]
                        if ind_head == 0:
                            # First child continues the existing chain.
                            chain.append(head)
                            ind_head += 1
                        else:
                            # Every further child forks a new chain.
                            new_chain.append(head)
                            chains.append(new_chain)
        i += 1
    # Repeat until no chain is left open.
    while all(item[-1] == 'stop' for item in chains) is False:
        chains_heads(chains, order_head_lst)
    return chains

In [121]:
def count_max_depth_for_one_sent(sent):
    """Return the depth of the deepest chain below the root of one sentence.

    A finished chain is ``[root, ..., leaf, 'stop']``, so its depth is its
    length minus two.  Returns ``None`` when the root has no children.
    """
    chains, tokens = root_children(sent)
    finished = chains_heads(chains, tokens)
    if not finished:
        return None
    return max(len(chain) - 2 for chain in finished)

In [127]:
def count_depths_for_one_text(parsed_text):
    """Return the maximum tree depth of every sentence that has one.

    Sentences for which ``count_max_depth_for_one_sent`` yields ``None``
    are left out of the result.
    """
    sentences = re.findall('(1\t.+?)\n\n', parsed_text, re.DOTALL)
    depths = (count_max_depth_for_one_sent(sentence) for sentence in sentences)
    return [depth for depth in depths if depth is not None]

In [128]:
def av_depth_for_one_text(parsed_text):
    """Return the mean per-sentence tree depth, rounded to two decimals."""
    return round(np.mean(count_depths_for_one_text(parsed_text)), 2)

In [129]:
def max_depth_for_one_text(parsed_text):
    """Return the largest per-sentence tree depth, rounded to two decimals."""
    return round(np.max(count_depths_for_one_text(parsed_text)), 2)

In [130]:
def min_depth_for_one_text(parsed_text):
    """Return the smallest per-sentence tree depth, rounded to two decimals."""
    return round(np.min(count_depths_for_one_text(parsed_text)), 2)

In [131]:
# Load one essay from a hard-coded local path and parse it.
# NOTE(review): `ParserUDpipe` is imported only further down the file (from
# the `parsing` module), so this cell depends on notebook execution order.
with open('/Users/irene/Desktop/Диплом/new_data/116.txt', 'r') as file:
    text = file.read()
parser = ParserUDpipe(text)
# Presumably CoNLL-U text, as consumed by the functions above -- confirm.
parsed_text = parser.parsing()

In [132]:
#parsed_text

In [133]:
av_depth_for_one_text(parsed_text)

3.78

In [17]:
#max_depth_for_one_text(parsed_text)

In [18]:
#min_depth_for_one_text(parsed_text)

### 3. Количество зависимых клауз: acl, acl:relcl, advcl

In [19]:
def count_dependent_sent(parsed_text):
    """Count dependent-clause relations sentence by sentence.

    Returns:
        A dict whose keys are 1-based sentence numbers and whose values are
        ``[number of acl, number of acl:relcl, number of advcl]``.
    """
    sentences = re.findall('(1\t.+?)\n\n', parsed_text, re.DOTALL)
    return {
        number: [
            len(re.findall('\t(acl)\t', sentence)),
            len(re.findall('\t(acl:relcl)\t', sentence)),
            len(re.findall('\t(advcl)\t', sentence)),
        ]
        for number, sentence in enumerate(sentences, start=1)
    }

In [20]:
def count_dependent_sent_text(parsed_text):
    """Sum the per-sentence dependent-clause counts over the whole text.

    Returns:
        The tuple ``(acl, acl:relcl, advcl)`` of text-level totals.
    """
    totals = [0, 0, 0]
    for counts in count_dependent_sent(parsed_text).values():
        for position in range(3):
            totals[position] = totals[position] + counts[position]
    return tuple(totals)

In [21]:
def count_acl(parsed_text):
    """Return the number of ``acl`` relations in the text."""
    acl_total, _, _ = count_dependent_sent_text(parsed_text)
    return acl_total

In [22]:
def count_acl_relcl(parsed_text):
    """Return the number of ``acl:relcl`` relations in the text."""
    _, relcl_total, _ = count_dependent_sent_text(parsed_text)
    return relcl_total

In [23]:
def count_advcl(parsed_text):
    """Return the number of ``advcl`` relations in the text."""
    _, _, advcl_total = count_dependent_sent_text(parsed_text)
    return advcl_total

In [24]:
#count_acl(parsed_text)

In [25]:
#count_acl_relcl(parsed_text)

In [26]:
#count_advcl(parsed_text)

### 4. Количество предложений

In [27]:
def count_sent(parsed_text):
    """Return the number of sentences found in a CoNLL-U string.

    A sentence is a span that starts at ``1<TAB>`` and runs up to the next
    blank line.
    """
    return sum(1 for _ in re.finditer('(1\t.+?)\n\n', parsed_text, re.DOTALL))

In [28]:
#count_sent(parsed_text)

### 5. Количество клауз

In [29]:
def parsing_things(string):
    """Extract the main fields of one CoNLL-U token line.

    Fields are read by column position, counted from the first
    ``<digits><TAB>`` found in ``string``.  The original looked for the
    first tab-delimited digit run instead, so a numeric word form or lemma
    (e.g. ``2010``) was returned as both the head and the relation type.

    Args:
        string: A token line; a trailing newline is tolerated.

    Returns:
        The tuple ``(order, token, head, rel_type, pos)``, all strings.

    Raises:
        ValueError: if no token id is found, fewer than eight columns
            follow it, or the head column is not a number.
    """
    start = re.search('[0-9]+\t', string)
    if start is None:
        raise ValueError('no token id found in line: %r' % (string,))
    columns = string[start.start():].split('\t')
    if len(columns) < 8 or re.fullmatch('[0-9]+', columns[6]) is None:
        raise ValueError('malformed CoNLL-U token line: %r' % (string,))
    order = columns[0]
    token = columns[1]
    pos = columns[3]
    head = columns[6]
    rel_type = columns[7]
    return order, token, head, rel_type, pos

#### Функция, которая возвращает словарь, где ключи - номера предложений, а значения - количество клауз.

***There was a woman next door, and she was a singer.*** - 2 T-units, 2 clauses

***There was a woman next door who was a singer.*** - 1 T-unit, 2 clauses

***But while they were trying they killed a whale and used the oil for the lamps.*** - 2 clauses

In [30]:
def count_clauses_every_sent(parsed_text):
    """Count clauses per sentence from finite verbs.

    Every line containing ``VerbForm=Fin`` whose relation type is not
    ``conj`` counts as one clause.  A sentence without any such line is
    still given a count of 1.

    Returns:
        A dict mapping 1-based sentence numbers to clause counts.
    """
    sentences = re.findall('(1\t.+?)\n\n', parsed_text, re.DOTALL)
    clauses_per_sentence = {}
    for number, sentence in enumerate(sentences, start=1):
        finite_verbs = 0
        for line in sentence.split('\n'):
            if 'VerbForm=Fin' not in line:
                continue
            rel_type = parsing_things(line)[3]
            # NOTE(review): the earlier version also required
            # `head not in <list of [order, head] pairs>`; a string never
            # equals a list, so that test was always true and is omitted.
            if rel_type != 'conj':
                finite_verbs += 1
        clauses_per_sentence[number] = finite_verbs if finite_verbs else 1
    return clauses_per_sentence

In [31]:
def count_clauses(parsed_text):
    """Return the total number of clauses in the text."""
    return sum(count_clauses_every_sent(parsed_text).values())

In [32]:
#count_clauses(parsed_text)

### 6. Количество T-юнитов

In [33]:
def find_subjects(sentence):
    """Collect the relation types of lines mentioning ``PRON`` or ``NOUN``.

    Lines containing five consecutive ``_`` columns are skipped; every other
    line is parsed with ``parsing_things`` whether or not it is kept.

    Returns:
        A list of relation-type strings, in line order.
    """
    relations = []
    for line in sentence.split('\n'):
        if '_\t_\t_\t_\t_' in line:
            continue
        rel_type = parsing_things(line)[3]
        if re.search('PRON|NOUN', line) is not None:
            relations.append(rel_type)
    return relations

In [34]:
def count_tunits_every_sent(parsed_text):
    """Count T-units per sentence.

    For each sentence the clause count is reduced by the number of
    ``acl:relcl``, ``acl`` and ``advcl`` labels returned by
    ``find_subjects``.

    Returns:
        A dict mapping 1-based sentence numbers to T-unit counts.
    """
    clause_counts = count_clauses_every_sent(parsed_text)
    sentences = re.findall('(1\t.+?)\n\n', parsed_text, re.DOTALL)
    dependent_labels = ('acl:relcl', 'acl', 'advcl')
    tunits = {}
    for number, n_clauses in clause_counts.items():
        relations = find_subjects(sentences[number - 1])
        n_dependent = sum(relations.count(label) for label in dependent_labels)
        tunits[number] = n_clauses - n_dependent
    return tunits

In [35]:
def count_tunits(parsed_text):
    """Return the total number of T-units in the text."""
    return sum(count_tunits_every_sent(parsed_text).values())

In [36]:
#count_tunits(parsed_text)

### 7. Количество сложных T-юнитов

In [37]:
def count_complex_tunit(parsed_text):
    """Return the clause total minus the T-unit total of the text."""
    n_clauses = count_clauses(parsed_text)
    n_tunits = count_tunits(parsed_text)
    return n_clauses - n_tunits

In [38]:
#count_complex_tunit(parsed_text)

### 8. Количество сочинительных фраз

#### Функция возвращает все сочинительные союзы (их вершины): ключи - номера предложений, значения - массив из вершин союзов.

In [39]:
def find_all_coord(parsed_text):
    """Collect the heads of all ``cc`` tokens, sentence by sentence.

    Returns:
        A dict mapping 1-based sentence numbers to the list of head ids
        (strings) of the lines containing ``<TAB>cc<TAB>``.
    """
    sentences = re.findall('(1\t.+?)\n\n', parsed_text, re.DOTALL)
    heads_by_sentence = {}
    for number, sentence in enumerate(sentences, start=1):
        heads_by_sentence[number] = [
            parsing_things(line)[2]
            for line in sentence.split('\n')
            if '\tcc\t' in line
        ]
    return heads_by_sentence

#### Функция возвращает cловарь: ключи - номера предложений, значение - количество сочинительных фраз.

In [40]:
def final_coord(parsed_text):
    """Count coordinate phrases per sentence.

    For every ``cc`` head reported by ``find_all_coord`` the sentence is
    searched for a ``conj`` line starting with that id, then for the line
    starting with that line's own head.  The pair is counted when both
    lines carry the same part of speech.

    Returns:
        A dict mapping 1-based sentence numbers to coordinate-phrase counts.
    """
    cp = find_all_coord(parsed_text)
    sent_lst = re.findall('(1\t.+?)\n\n', parsed_text, re.DOTALL)
    cp_final = {}
    for num_sent, every_sent in enumerate(sent_lst, start=1):
        cp_final[num_sent] = 0
        for every_coord_cp in cp[num_sent]:
            # Second conjunct: the line with this id and a `conj` relation.
            finding2 = re.search('('+every_coord_cp+'\t.+?\tconj.+?)\n', every_sent)
            if finding2 is None:
                continue
            order2, token2, head2, rel_type2, pos2 = parsing_things(finding2.group(0))
            # First conjunct: the line whose id is the head of the second.
            finding1 = re.search('('+head2+'\t.+?\t[a-z]+.+?)\n', every_sent)
            if finding1 is None:
                # Both patterns need a trailing newline, which the last line
                # of a sentence lacks; skip instead of failing on `.group`.
                continue
            order1, token1, head1, rel_type1, pos1 = parsing_things(finding1.group(0))
            if pos2 == pos1:
                cp_final[num_sent] += 1
    return cp_final

In [41]:
def count_coord(parsed_text):
    """Return the total number of coordinate phrases in the text."""
    return sum(final_coord(parsed_text).values())

In [42]:
#count_coord(parsed_text)

### 9. Количество сложных именных групп

Complex nominals comprise (i) nouns plus adjective, possessive, prepositional phrase, relative clause, participle, or appositive, (ii) nominal clauses, and (iii) gerunds and infinitives in subject position (Cooper 1976)

#### Possessive

In [43]:
def find_possesive(parsed_text):
    """Count, per sentence, the lines containing ``nmod``.

    The substring test also matches ``nmod:poss``.

    Returns:
        A dict mapping 1-based sentence numbers to counts.
    """
    sentences = re.findall('(1\t.+?)\n\n', parsed_text, re.DOTALL)
    counts = {}
    for number, sentence in enumerate(sentences, start=1):
        hits = 0
        for line in sentence.split('\n'):
            if 'nmod' in line:
                # Result unused; kept so a malformed line still fails here.
                parsing_things(line)
                hits += 1
        counts[number] = hits
    return counts

#### Prepositional phrase

In [44]:
def find_prep(parsed_text):
    """Count, per sentence, the lines containing ``ADP``.

    Returns:
        A dict mapping 1-based sentence numbers to counts.
    """
    sentences = re.findall('(1\t.+?)\n\n', parsed_text, re.DOTALL)
    counts = {}
    for number, sentence in enumerate(sentences, start=1):
        hits = 0
        for line in sentence.split('\n'):
            if 'ADP' in line:
                # Result unused; kept so a malformed line still fails here.
                parsing_things(line)
                hits += 1
        counts[number] = hits
    return counts

#### Nouns plus adjective

In [45]:
def find_nouns(parsed_text):
    """Collect, per sentence, the token ids of lines containing ``NOUN``.

    Returns:
        A dict mapping 1-based sentence numbers to lists of id strings.
    """
    sentences = re.findall('(1\t.+?)\n\n', parsed_text, re.DOTALL)
    noun_ids = {}
    for number, sentence in enumerate(sentences, start=1):
        noun_ids[number] = [
            parsing_things(line)[0]
            for line in sentence.split('\n')
            if 'NOUN' in line
        ]
    return noun_ids

In [46]:
def find_adjs(parsed_text):
    """Collect, per sentence, the head ids of lines containing ``ADJ``.

    Returns:
        A dict mapping 1-based sentence numbers to lists of head-id strings.
    """
    sentences = re.findall('(1\t.+?)\n\n', parsed_text, re.DOTALL)
    adjective_heads = {}
    for number, sentence in enumerate(sentences, start=1):
        adjective_heads[number] = [
            parsing_things(line)[2]
            for line in sentence.split('\n')
            if 'ADJ' in line
        ]
    return adjective_heads

In [47]:
def adj_noun(parsed_text):
    """Count, per sentence, the adjectives whose head is a noun.

    The original zipped the adjective-head list with the noun-id list and
    compared them position by position, so an adjective was only counted
    when its noun sat at the same index in both lists.  Here each adjective
    head is looked up in the set of noun ids of the same sentence.

    Returns:
        A dict mapping 1-based sentence numbers to counts.
    """
    adjs = find_adjs(parsed_text)
    nouns = find_nouns(parsed_text)
    pairs = {}
    for key in adjs:
        noun_ids = set(nouns[key])
        pairs[key] = sum(1 for head in adjs[key] if head in noun_ids)
    return pairs

#### Gerunds and infinitives in subject position

In [48]:
def find_ger_inf(parsed_text):
    """Count, per sentence, gerund lines and non-``xcomp`` infinitive lines.

    A line is counted when it contains ``VerbForm=Ger``, or when it contains
    ``VerbForm=Inf`` and does not contain ``xcomp``.

    Returns:
        A dict mapping 1-based sentence numbers to counts.
    """
    sentences = re.findall('(1\t.+?)\n\n', parsed_text, re.DOTALL)
    counts = {}
    for number, sentence in enumerate(sentences, start=1):
        hits = 0
        for line in sentence.split('\n'):
            is_gerund = 'VerbForm=Ger' in line
            # NOTE(review): `and` binds tighter than `or`, so the `xcomp`
            # exclusion applies to infinitives only -- confirm this is meant.
            is_free_infinitive = 'VerbForm=Inf' in line and 'xcomp' not in line
            if is_gerund or is_free_infinitive:
                # Result unused; kept so a malformed line still fails here.
                parsing_things(line)
                hits += 1
        counts[number] = hits
    return counts

#### Nouns plus participle

In [49]:
def find_parts(parsed_text):
    """Collect, per sentence, the head ids of lines with ``VerbForm=Part``.

    Returns:
        A dict mapping 1-based sentence numbers to lists of head-id strings.
    """
    sentences = re.findall('(1\t.+?)\n\n', parsed_text, re.DOTALL)
    participle_heads = {}
    for number, sentence in enumerate(sentences, start=1):
        participle_heads[number] = [
            parsing_things(line)[2]
            for line in sentence.split('\n')
            if 'VerbForm=Part' in line
        ]
    return participle_heads

In [50]:
def parts_noun(parsed_text):
    """Count, per sentence, the participles whose head is a noun.

    The original zipped the participle-head list with the noun-id list and
    compared them position by position, which only matched when both
    happened to share an index.  Here each participle head is looked up in
    the set of noun ids of the same sentence.

    Returns:
        A dict mapping 1-based sentence numbers to counts.
    """
    parts = find_parts(parsed_text)
    nouns = find_nouns(parsed_text)
    pairs = {}
    for key in parts:
        noun_ids = set(nouns[key])
        pairs[key] = sum(1 for head in parts[key] if head in noun_ids)
    return pairs

In [51]:
def count(d):
    """Return the sum of all values of the dict ``d`` (0 when empty)."""
    return sum(d.values())

In [52]:
def count_np(parsed_text):
    """Return the text-level totals of the five complex-nominal counters.

    Returns:
        The tuple ``(possessive, prepositional, adjective + noun,
        gerund/infinitive, participle + noun)``.
    """
    collectors = (find_possesive, find_prep, adj_noun, find_ger_inf, parts_noun)
    return tuple(count(collector(parsed_text)) for collector in collectors)

In [53]:
#count_np(parsed_text)

### 10. Количество глагольных групп

In [54]:
def find_verbs(parsed_text):
    """Collect, per sentence, the token ids of lines with ``VERB`` or ``AUX``.

    Returns:
        A dict mapping 1-based sentence numbers to lists of id strings.
    """
    sentences = re.findall('(1\t.+?)\n\n', parsed_text, re.DOTALL)
    verb_ids = {}
    for number, sentence in enumerate(sentences, start=1):
        verb_ids[number] = [
            parsing_things(line)[0]
            for line in sentence.split('\n')
            if 'VERB' in line or 'AUX' in line
        ]
    return verb_ids

In [55]:
def find_vp(parsed_text):
    """Count verb phrases per sentence.

    A verb (as listed by ``find_verbs``) counts when the sentence contains
    at least one ``<TAB>id<TAB>relation<TAB>`` fragment for its id whose
    text does not include ``mark``, ``nsubj`` or ``punct``.

    Returns:
        A dict mapping 1-based sentence numbers to verb-phrase counts.
    """
    verbs_by_sentence = find_verbs(parsed_text)
    sentences = re.findall('(1\t.+?)\n\n', parsed_text, re.DOTALL)
    phrases = {}
    for number, sentence in enumerate(sentences, start=1):
        n_phrases = 0
        for verb_id in verbs_by_sentence[number]:
            dependents = re.findall('\t'+verb_id+'\t.+?\t', sentence)
            has_content_dependent = any(
                re.search('mark|nsubj|punct', dependent) is None
                for dependent in dependents
            )
            if has_content_dependent:
                n_phrases += 1
        phrases[number] = n_phrases
    return phrases

In [56]:
def count_vp(parsed_text):
    """Return the total number of verb phrases in the text."""
    return sum(find_vp(parsed_text).values())

In [57]:
#count_vp(parsed_text)

### 11. Синтаксическая схожесть (части речи, леммы): среднее

In [58]:
def levenshtein(seq1, seq2):
    """Return the Levenshtein edit distance between two sequences.

    Insertions, deletions and substitutions each cost 1.  The sequences only
    need to support ``len`` and indexing; items are compared with ``==``.

    Returns:
        The distance as the bottom-right cell of a NumPy float matrix.
    """
    rows = len(seq1) + 1
    cols = len(seq2) + 1
    dist = np.zeros((rows, cols))
    # First column / first row: distance from or to the empty prefix.
    dist[:, 0] = np.arange(rows)
    dist[0, :] = np.arange(cols)

    for r in range(1, rows):
        for c in range(1, cols):
            substitution = 0 if seq1[r - 1] == seq2[c - 1] else 1
            dist[r, c] = min(
                dist[r - 1, c] + 1,
                dist[r - 1, c - 1] + substitution,
                dist[r, c - 1] + 1,
            )
    return dist[rows - 1, cols - 1]

In [59]:
def pos_lemma(parsed_text):
    """Extract part-of-speech tags and lemmas for every sentence.

    Returns:
        A dict mapping 1-based sentence numbers to ``[tags, lemmas]``, two
        parallel lists built from the fourth and third columns of each line.
    """
    sentences = re.findall('(1\t.+?)\n\n', parsed_text, re.DOTALL)
    table = {}
    for number, sentence in enumerate(sentences, start=1):
        tags = []
        lemmas = []
        for line in sentence.split('\n'):
            tag_match = re.search('.+?\t.+?\t.+?\t(.+?)\t', line)
            lemma_match = re.search('.+?\t.+?\t(.+?)\t', line)
            if tag_match is None or lemma_match is None:
                continue
            tags.append(tag_match.group(1))
            lemmas.append(lemma_match.group(1))
        table[number] = [tags, lemmas]
    return table

In [60]:
def simularity(parsed_text):
    """Average edit distance of every sentence to all other sentences.

    Returns:
        A dict mapping 1-based sentence numbers to
        ``[mean POS distance, mean lemma distance]``.

    Raises:
        statistics.StatisticsError: if the text has a single sentence, as
            there is then nothing to average.
    """
    per_sentence = pos_lemma(parsed_text)
    averaged = {}
    for this_no, (this_tags, this_lemmas) in per_sentence.items():
        tag_distances = []
        lemma_distances = []
        for other_no, (other_tags, other_lemmas) in per_sentence.items():
            if other_no == this_no:
                continue
            tag_distances.append(levenshtein(this_tags, other_tags))
            lemma_distances.append(levenshtein(this_lemmas, other_lemmas))
        averaged[this_no] = [mean(tag_distances), mean(lemma_distances)]
    return averaged

In [61]:
def simularity2(parsed_text):
    """Edit distance of every sentence to the sentence that follows it.

    Returns:
        A dict mapping 1-based sentence numbers to
        ``[[POS distance], [lemma distance]]``; both inner lists are empty
        for the last sentence.
    """
    per_sentence = pos_lemma(parsed_text)
    total = len(per_sentence)
    neighbours = {}
    for number in per_sentence:
        tag_distance = []
        lemma_distance = []
        if number < total:
            current, following = per_sentence[number], per_sentence[number + 1]
            tag_distance.append(levenshtein(current[0], following[0]))
            lemma_distance.append(levenshtein(current[1], following[1]))
        neighbours[number] = [tag_distance, lemma_distance]
    return neighbours

In [62]:
def pos_sim_mean(parsed_text):
    """Return the mean of the per-sentence mean POS distances (2 decimals)."""
    per_sentence = simularity(parsed_text)
    return round(mean([pair[0] for pair in per_sentence.values()]), 2)

In [63]:
def lemma_sim_mean(parsed_text):
    """Return the mean of the per-sentence mean lemma distances (2 decimals)."""
    per_sentence = simularity(parsed_text)
    return round(mean([pair[1] for pair in per_sentence.values()]), 2)

In [64]:
def pos_sim_mean2(parsed_text):
    """Return the mean POS distance between neighbouring sentences.

    Distances are taken from ``simularity2``; collection stops at the first
    sentence without a distance (the last one has no successor).

    Returns:
        The mean, rounded to two decimals.

    Raises:
        statistics.StatisticsError: if no distance was collected, e.g. for
            a single-sentence text.
    """
    sim = simularity2(parsed_text)
    pos_min = []
    for sent in sim:
        distances = sim[sent][0]
        # Explicit emptiness test instead of a bare `except:` around the
        # index, which also hid unrelated errors.
        if not distances:
            break
        pos_min.append(distances[0])
    return round(mean(pos_min), 2)

In [65]:
def lemma_sim_mean2(parsed_text):
    """Return the mean lemma distance between neighbouring sentences.

    Distances are taken from ``simularity2``; collection stops at the first
    sentence without a distance (the last one has no successor).

    Returns:
        The mean, rounded to two decimals.

    Raises:
        statistics.StatisticsError: if no distance was collected, e.g. for
            a single-sentence text.
    """
    sim = simularity2(parsed_text)
    lemma_min = []
    for sent in sim:
        distances = sim[sent][1]
        # Explicit emptiness test instead of a bare `except:` around the
        # index, which also hid unrelated errors.
        if not distances:
            break
        lemma_min.append(distances[0])
    return round(mean(lemma_min), 2)

In [66]:
#lemma_sim_mean2(parsed_text)

### 12. Среднее количество токенов перед корнем предложения

In [67]:
def tokens_befor_root(parsed_text):
    """Return the mean number of lines preceding the root in each sentence.

    For a sentence in which no line has ``root`` in the eighth column, the
    total number of lines is used instead.

    Returns:
        The mean over all sentences, rounded to two decimals.
    """
    sentences = re.findall('(1\t.+?)\n\n', parsed_text, re.DOTALL)
    offsets = []
    for sentence in sentences:
        lines = sentence.split('\n')
        position = len(lines)
        for index, line in enumerate(lines):
            relation = re.search('.+?\t.+?\t.+?\t.+?\t.+?\t.+?\t.+?\t(.+?)\t', line)
            if relation is not None and relation.group(1) == 'root':
                position = index
                break
        offsets.append(position)
    return round(mean(offsets), 2)

In [68]:
#tokens_befor_root(parsed_text)

### 13. Средняя длина предложения

In [69]:
def mean_len_sent(parsed_text):
    """Return the mean sentence length in tokens, punctuation excluded.

    A line counts when its fourth column can be read and is not ``PUNCT``.

    Returns:
        The mean over all sentences, rounded to two decimals.
    """
    sentences = re.findall('(1\t.+?)\n\n', parsed_text, re.DOTALL)
    lengths = []
    for sentence in sentences:
        tag_matches = (
            re.search('.+?\t.+?\t.+?\t(.+?)\t', line)
            for line in sentence.split('\n')
        )
        lengths.append(sum(
            1
            for tag in tag_matches
            if tag is not None and tag.group(1) != 'PUNCT'
        ))
    return round(mean(lengths), 2)

In [70]:
#mean_len_sent(parsed_text)

### 14. NOUN + INF

In [71]:
def find_nouns(parsed_text):
    """Collect, per sentence, the token ids of lines containing ``NOUN``.

    This re-declares the function of the same name defined earlier in the
    file, with the same behaviour.

    Returns:
        A dict mapping 1-based sentence numbers to lists of id strings.
    """
    sentences = re.findall('(1\t.+?)\n\n', parsed_text, re.DOTALL)
    noun_ids = {}
    for number, sentence in enumerate(sentences, start=1):
        noun_ids[number] = [
            parsing_things(line)[0]
            for line in sentence.split('\n')
            if 'NOUN' in line
        ]
    return noun_ids

In [75]:
#find_nouns(parsed_text)

In [73]:
def find_inf(parsed_text):
    """Count nouns that are directly followed by a ``to`` token.

    For every noun id reported by ``find_nouns`` the sentence is searched
    for a line starting with the next id that has a ``to`` column.

    Returns:
        The number of such nouns in the whole text.
    """
    sentences = re.findall('(1\t.+?)\n\n', parsed_text, re.DOTALL)
    nouns_by_sentence = find_nouns(parsed_text)
    hits = 0
    for sentence, noun_ids in zip(sentences, nouns_by_sentence.values()):
        for noun_id in noun_ids:
            following_id = str(int(noun_id) + 1)
            if re.search('\n' + following_id + '\t.+?\tto\t', sentence) is not None:
                hits += 1
    return hits

In [76]:
# Import the project parser and run it on a one-sentence smoke-test input.
from parsing import ParserUDpipe
parser = ParserUDpipe("They go.")
# Rebinds the notebook-wide `parsed_text` used by the commented-out checks.
parsed_text = parser.parsing()

In [77]:
conllu

'# newdoc\n# newpar\n# sent_id = 1\n# text = They go.\n1\tThey\tthey\tPRON\tPE\tNumber=Plur|Person=3|PronType=Prs\t2\tnsubj\t_\t_\n2\tgo\tgo\tVERB\tV\tMood=Ind|Number=Plur|Tense=Pres|VerbForm=Fin\t0\troot\t_\tSpaceAfter=No\n3\t.\t.\tPUNCT\tFS\t_\t2\tpunct\t_\tSpaceAfter=No\n\n'

In [79]:
import enchant
import enchant.checker
from enchant.checker.CmdLineChecker import CmdLineChecker
from tqdm import tqdm_notebook as tqdm
def check_spelling(text):
    """Replace every misspelt word with the checker's first suggestion.

    Uses the ``en_GB`` dictionary of ``enchant.checker.SpellChecker``.
    Words for which the checker offers no suggestion are left unchanged
    (the original indexed ``[0]`` unconditionally and raised ``IndexError``
    on an empty suggestion list).

    Args:
        text: The text to correct.

    Returns:
        The corrected text.
    """
    chkr = enchant.checker.SpellChecker("en_GB")
    chkr.set_text(text)
    for err in chkr:
        suggestions = err.suggest()
        if not suggestions:
            continue
        err.replace(suggestions[0])
    return chkr.get_text()

In [85]:
import pandas as pd
# Read the ';'-separated results table from a hard-coded local path.
df = pd.read_csv('/Users/irene/Desktop/Диплом/code/result_criteria/result_criteria.csv', delimiter=';')
# One text-file path per value of the 'Essay' column.
paths = ['/Users/irene/Desktop/Диплом/new_data/'+str(x)+'.txt' for x in list(df['Essay'])]
#list(df['Essay'])

In [136]:
# Build one comma-separated row of syntactic-complexity measures per essay.
# Each row lists the ratio measures first, then the raw counts, and ends with
# a trailing comma; no header row is produced.
# NOTE(review): the ratios divide by num_cl, num_tu and num_sents without a
# zero check -- an essay with no sentences or no T-units raises
# ZeroDivisionError.
all_str = ''
for path in tqdm(paths):
    with open(path, 'r') as file:
        text = file.read()
    text = check_spelling(text)
    # Line breaks are flattened to spaces before parsing.
    text = text.replace('\n', ' ')
    parser = ParserUDpipe(text)
    parsed_text = parser.parsing()
    # +++
    num_tokens = count_tokens(parsed_text)# number of tokens
    # +++
    av_depth = av_depth_for_one_text(parsed_text)# average depth of tree
    # +++
    min_depth = min_depth_for_one_text(parsed_text)# minimal depth of tree
    # +++
    max_depth = max_depth_for_one_text(parsed_text)# maximal depth of tree
    # +++
    num_acl = count_acl(parsed_text)# number of acls
    num_acl_relcl = count_acl_relcl(parsed_text)# number of acl:relcls
    num_advcl = count_advcl(parsed_text)# number of advcls
    num_sents = count_sent(parsed_text)# number of sentences
    num_cl = count_clauses(parsed_text)# number of clauses
    num_tu = count_tunits(parsed_text)# number of T-units
    # +++
    num_ctu = count_complex_tunit(parsed_text)# number of complex T-units
    # +++
    num_coord = count_coord(parsed_text)# number of coordinate phrases
    num_np = count_np(parsed_text)# 5-tuple of complex-nominal counts: possessive constructions, prepositional phrases,
                                    # adj + nouns, gerund + inf, part + nouns
    num_vp = count_vp(parsed_text)# number of vps
    # +++
    mean_l_sim = lemma_sim_mean(parsed_text)# mean Levenshtein distance (lemmas), all sentence pairs
    # +++
    mean_p_sim = pos_sim_mean(parsed_text)# mean Levenshtein distance (pos), all sentence pairs
    mean_l_sim_nei = lemma_sim_mean2(parsed_text)# mean Levenshtein distance (lemmas), neighbouring sentences
    # +++
    mean_p_sim_nei = pos_sim_mean2(parsed_text)# mean Levenshtein distance (pos), neighbouring sentences
    # +++
    mean_tokens_root = tokens_befor_root(parsed_text)# mean number of tokens before root
    # +++
    n_inf = find_inf(parsed_text)# number of nouns directly followed by "to"

#--COMPLEX MEASURES--#
    mean_length_s = mean_len_sent(parsed_text)# mean length of sentences
    mean_length_c = num_tokens/num_cl# tokens per clause
    c_s = num_cl/num_sents# clauses per sentence
    c_t = num_cl/num_tu# clauses per T-unit
    acl_t = num_acl/num_tu
    acl_relcl_t = num_acl_relcl/num_tu
    advcl_t = num_advcl/num_tu
    acl_cl = num_acl/num_cl
    acl_relcl_cl = num_acl_relcl/num_cl
    advcl_cl = num_advcl/num_cl
    coord_cl = num_coord/num_cl
    t_s = num_tu/num_sents# T-units per sentence
    poss_s = num_np[0]/num_sents
    prep_s = num_np[1]/num_sents
    adj_n_s = num_np[2]/num_sents
    ger_inf_s = num_np[3]/num_sents
    part_n_s = num_np[4]/num_sents
    n_inf_s = n_inf/num_sents
    vp_s = num_vp/num_sents

    # Raw counts; written after the ratio measures, rounded to 4 decimals.
    simple_measures = [num_tokens, num_acl, num_acl_relcl, num_advcl, num_sents, num_cl, num_tu, num_ctu,
                       num_coord, num_np[0], num_np[1], num_np[2], num_np[3], num_np[4], n_inf, num_vp,
                       min_depth, max_depth, sum(num_np)]

    # Averages and ratios; written first, rounded to 3 decimals.
    measures = [av_depth, mean_l_sim, mean_p_sim, mean_l_sim_nei, mean_p_sim_nei,
                mean_length_s, mean_length_c, c_s, c_t,
                acl_t, acl_relcl_t, advcl_t, acl_cl, acl_relcl_cl, advcl_cl,
                coord_cl, t_s, poss_s, prep_s, adj_n_s, ger_inf_s, part_n_s, n_inf_s,
                vp_s, mean_tokens_root]
    str_one_esse = ''
    for measure in measures:
        str_one_esse += str(round(measure, 3)) + ','
    for m in simple_measures:
        str_one_esse += str(round(m, 4)) + ','
    all_str += str_one_esse + '\n'

A Jupyter Widget

In [89]:
print(path)

/Users/irene/Desktop/Диплом/new_data/116.txt


In [137]:
# Write all accumulated rows to 'syn_criteria.csv' in the working directory,
# overwriting any existing file.
with open('syn_criteria.csv', 'w') as file:
    file.write(all_str)