# Критерии syntactic complexity

### 1. Количество слов

In [210]:
from model import Model
import re
import copy
import numpy as np

#### Функция, которая добавляет пробелы после .?!

In [170]:
def space(string):
    """Insert a space after sentence-final punctuation and squeeze space runs.

    Every ``.``, ``?`` or ``!`` that directly follows an ASCII letter or a
    space gets one space appended after it. Punctuation that follows any other
    character (for example the dot in ``3.5``) is left untouched. Afterwards
    every run of two or more spaces is collapsed into a single space.

    Args:
        string: Raw text.

    Returns:
        The normalised text. A trailing space remains when the text ends with
        one of the handled punctuation marks.
    """
    # Raw strings keep the regex escapes away from Python's own string-escape
    # processing; '\.' in a normal literal is an invalid escape sequence.
    string = re.sub(r'([a-zA-Z]| )([\.\?!])', r'\1\2 ', string)
    string = re.sub(r'  +', ' ', string)
    return string

#### Функция, которая парсит текст с помощью UDPipe. На выходе получается строка в формате CoNLL-U ('conllu').

In [171]:
def get_parsed_text(model_name, text_path):
    """Tokenize, tag and parse a text file and return the result as CoNLL-U.

    The file content is first normalised with ``space`` and then passed
    through the ``tokenize``, ``tag`` and ``parse`` steps of ``Model``.

    Args:
        model_name: Value handed to ``Model(...)``; presumably the path of a
            UDPipe model file (verify against the ``model`` module).
        text_path: Path of the text file to read.

    Returns:
        Whatever ``model.write(sentences, "conllu")`` returns; the rest of
        this file treats it as a CoNLL-U string.
    """
    model = Model(model_name)
    # The context manager guarantees the file handle is closed, even when
    # reading fails.
    with open(text_path, 'r') as text_file:
        text = space(text_file.read())
    sentences = model.tokenize(text)
    for s in sentences:
        model.tag(s)
        model.parse(s)
    return model.write(sentences, "conllu")

#### Функция, которая считает токены в тексте. Не считаются знаки препинания.

In [172]:
def count_tokens(parsed_text):
    """Count the words of a CoNLL-U text, ignoring punctuation.

    Only lines whose first column is a plain integer ID are considered. That
    skips comment lines (``# ...``), the blank lines separating sentences,
    multiword-token ranges (``1-2``) and empty nodes (``1.1``), so nothing is
    counted twice and separators are not mistaken for tokens. A word is
    counted unless its UPOS column (the fourth one) is exactly ``PUNCT``.

    Args:
        parsed_text: Text in CoNLL-U format.

    Returns:
        The number of non-punctuation words as an int.
    """
    num_tokens = 0
    for line in parsed_text.split('\n'):
        fields = line.split('\t')
        # Blank lines, comments, ranges and empty nodes have no integer ID.
        if not fields[0].isdecimal():
            continue
        # Look at the UPOS column only, so a word whose form or lemma happens
        # to contain the text "PUNCT" is still counted.
        upos = fields[3] if len(fields) > 3 else ''
        if upos != 'PUNCT':
            num_tokens += 1
    return num_tokens

In [173]:
# Parse the sample text once; `parsed_text` is reused by the demo cells below.
# NOTE(review): both paths are machine-specific and must be adjusted locally.
parsed_text = get_parsed_text('english-partut-ud-2.0-170801.udpipe', '/Users/irene/Downloads/exam2014/AAl_1_1.txt')

#### Функция, которая считает глубину дерева.

In [174]:
def order_head(parsed_sent):
    """Extract ``(id, head, form)`` triples from one CoNLL-U sentence.

    Columns are read by position (ID is column 1, FORM column 2, HEAD
    column 7) instead of by pattern matching, so numeric values in later
    columns such as DEPS or MISC cannot be mistaken for the head. Lines
    without a plain integer ID (blank lines, ``#`` comments, multiword-token
    ranges, empty nodes) are skipped.

    Args:
        parsed_sent: The lines of a single sentence in CoNLL-U format,
            separated by ``\\n``.

    Returns:
        A list of ``(order, head, form)`` tuples in input order, where
        ``order`` and ``head`` are ints and ``form`` is the word form.

    Raises:
        ValueError: If a word line has fewer than seven columns or its HEAD
            column is not an integer.
    """
    order_head_lst = []
    for line in parsed_sent.split('\n'):
        fields = line.split('\t')
        if not fields[0].isdecimal():
            continue
        if len(fields) < 7:
            raise ValueError('malformed CoNLL-U word line: %r' % line)
        # Kept for backward compatibility: '|' in the form has always been
        # returned as '$' by this function.
        form = fields[1].replace('|', '$')
        order_head_lst.append((int(fields[0]), int(fields[6]), form))
    return order_head_lst

In [175]:
def find_root(order_head_lst):
    """Return the root entry of a sentence, i.e. the one whose head is 0.

    Args:
        order_head_lst: Sequence of entries whose second item is the head
            index, such as the tuples produced by ``order_head``.

    Returns:
        The matching entry. If several entries have head 0, the last one is
        returned.

    Raises:
        ValueError: If no entry has head 0.
    """
    root = None
    for every_order_head in order_head_lst:
        if every_order_head[1] == 0:
            root = every_order_head
    if root is None:
        # Fail with a clear message instead of an UnboundLocalError.
        raise ValueError('no root token (head == 0) found in the sentence')
    return root

In [176]:
def root_children(parsed_sent):
    """Build the initial dependency chains that start at the sentence root.

    Args:
        parsed_sent: One sentence in CoNLL-U format.

    Returns:
        A pair ``(chains, order_head_lst)``: ``chains`` holds one
        ``[root_id, child_id]`` list per direct dependent of the root, and
        ``order_head_lst`` is the token list returned by ``order_head``.
    """
    tokens = order_head(parsed_sent)
    root_id = find_root(tokens)[0]
    starting_chains = [
        [root_id, token[0]] for token in tokens if token[1] == root_id
    ]
    return starting_chains, tokens

In [200]:
def chains_heads(chains, order_head_lst):
    """Extend root-to-node chains until each one ends at a leaf.

    Each chain is a list of token ids that starts at the root. On every pass a
    chain is extended by the first dependent of its last token; each further
    dependent gets its own chain, appended to ``chains``. A chain whose last
    token has no dependents is closed with the marker ``'stop'``. Passes are
    repeated until every chain is closed.

    Args:
        chains: List of chains to extend. It is modified in place.
        order_head_lst: Sequence of entries whose first item is the token id
            and whose second item is the id of its head.

    Returns:
        The same ``chains`` list, with every chain ending in ``'stop'``.
    """
    # Index dependents by head once instead of rescanning all tokens for
    # every chain on every pass.
    children = {}
    for entry in order_head_lst:
        children.setdefault(entry[1], []).append(entry[0])

    while not all(chain[-1] == 'stop' for chain in chains):
        # Iterate over a snapshot: branches created during this pass are
        # extended on the next pass only.
        for chain in list(chains):
            if 'stop' in chain:
                continue
            dependents = children.get(chain[-1], [])
            if not dependents:
                chain.append('stop')
                continue
            prefix = list(chain)
            chain.append(dependents[0])
            for dependent in dependents[1:]:
                chains.append(prefix + [dependent])
    return chains

In [204]:
def count_max_depth_for_one_sent(sent):
    """Return the depth of the dependency tree of one sentence.

    The depth is the largest number of edges between the root and any token.
    Every finished chain has the form ``[root, ..., leaf, 'stop']``, so its
    edge count is its length minus two.

    Args:
        sent: One sentence in CoNLL-U format.

    Returns:
        The maximum depth as an int; 0 when the root has no dependents.
    """
    chains, order_head_lst = root_children(sent)
    chains = chains_heads(chains, order_head_lst)
    # default=0 covers a sentence that consists of the root only, where no
    # chain exists and a plain max() would raise ValueError.
    return max((len(chain) - 2 for chain in chains), default=0)

In [213]:
def count_depths_for_one_text(parsed_text):
    """Compute the maximum tree depth of every sentence in a parsed text.

    Args:
        parsed_text: Text in CoNLL-U format. A sentence is taken to be the
            span from a ``1<TAB>`` up to the next blank line.

    Returns:
        A list with one depth per sentence, in text order.
    """
    sentences = re.findall('(1\t.+?)\n\n', parsed_text, re.DOTALL)
    return [count_max_depth_for_one_sent(sentence) for sentence in sentences]

In [217]:
def av_depth_for_one_text(parsed_text):
    """Return the mean of the per-sentence maximum tree depths of a text.

    Args:
        parsed_text: Text in CoNLL-U format.

    Returns:
        The result of ``np.mean`` over the per-sentence depths.
    """
    return np.mean(count_depths_for_one_text(parsed_text))

In [220]:
def max_depth_for_one_text(parsed_text):
    """Return the largest per-sentence maximum tree depth of a text.

    Args:
        parsed_text: Text in CoNLL-U format.

    Returns:
        The result of ``np.max`` over the per-sentence depths.
    """
    return np.max(count_depths_for_one_text(parsed_text))

In [221]:
def min_depth_for_one_text(parsed_text):
    """Return the smallest per-sentence maximum tree depth of a text.

    Args:
        parsed_text: Text in CoNLL-U format.

    Returns:
        The result of ``np.min`` over the per-sentence depths.
    """
    return np.min(count_depths_for_one_text(parsed_text))

In [218]:
# Demo: maximum tree depth of each sentence in the parsed sample text.
count_depths_for_one_text(parsed_text)

[6, 5, 4, 6, 5, 3, 3, 4, 5, 3, 4]

In [222]:
# Demo: mean of the per-sentence maximum depths for the sample text.
av_depth_for_one_text(parsed_text)

4.3636363636363633

In [223]:
# Demo: largest per-sentence maximum depth in the sample text.
max_depth_for_one_text(parsed_text)

6

In [224]:
# Demo: smallest per-sentence maximum depth in the sample text.
min_depth_for_one_text(parsed_text)

3

#### Количество зависимых клауз: acl, acl:relcl, advcl

In [240]:
def count_dependent_sent(parsed_text):
    """Count dependent-clause relations in every sentence of a parsed text.

    Args:
        parsed_text: Text in CoNLL-U format. A sentence is taken to be the
            span from a ``1<TAB>`` up to the next blank line.

    Returns:
        A dict keyed by the 1-based sentence number. Each value is the list
        ``[number of acl, number of acl:relcl, number of advcl]``, counting
        tab-delimited fields that are exactly equal to the relation name.
    """
    relation_patterns = ('\t(acl)\t', '\t(acl:relcl)\t', '\t(advcl)\t')
    sentences = re.findall('(1\t.+?)\n\n', parsed_text, re.DOTALL)
    return {
        number: [len(re.findall(pattern, sentence)) for pattern in relation_patterns]
        for number, sentence in enumerate(sentences, start=1)
    }

In [241]:
# Demo: per-sentence counts of [acl, acl:relcl, advcl] for the sample text.
count_dependent_sent(parsed_text)

{1: [2, 0, 0],
 2: [0, 0, 1],
 3: [0, 0, 0],
 4: [0, 1, 0],
 5: [1, 0, 0],
 6: [0, 0, 0],
 7: [0, 0, 0],
 8: [0, 0, 1],
 9: [1, 0, 0],
 10: [0, 0, 0],
 11: [0, 0, 1]}