This program makes valence models for the relevant verbs on Geometry.
Verbs were collected from the 'getting verbs' program.
Valence models are saved into txt files.

In [3]:
import pymorphy2 as pm2 
pmm = pm2.MorphAnalyzer() 
from pymystem3 import Mystem
m = Mystem()
import re
import os
import json
import copy

In [2]:
from gensim.utils import tokenize
from gensim.summarization.textcleaner import split_sentences
from nltk import sent_tokenize
from nltk.tokenize import word_tokenize, wordpunct_tokenize



In [421]:
with open("only relevant verbs.json", "r", encoding = 'utf-8') as t:
    good_verbs = json.load(t)

In [None]:
def get_sentences(text): ##getting simple sentences from text
    good_sentences = []
    sentences = []
    regSubS = re.compile('[А-Яа-я ]+')
    for line in text:
        if line != ' \n':
            curr_sents = list(split_sentences(line))
            for i in curr_sents:
                sentences.append(i)
                
    for sentence in sentences:
        if len(sentence) > 10:
            for subsent in re.findall(regSubS, sentence):
                if len(subsent.split()) > 1:
                    good_sentences.append(subsent)
    return good_sentences

In [406]:
## getting good sentences
with open('all geometry.txt', 'r', encoding='utf-8') as t: 
    text = t.readlines()

good_sentences = get_sentences(text)

In [407]:
len(good_sentences)

88766

In [384]:
def get_lemmas(sent): ## turning sentence into the list of its lemmas
    sent = re.sub(r'[^\w\s]','', sent) 
    sent = re.sub(r'\d', '', sent) 
    sent = re.sub(r'[A-Za-z]', '', sent)
    lem_sent = [pmm.normal_forms(x)[0] for x in sent.split()]
    return lem_sent

In [434]:
def get_contexts(): ## getting a dict of the verbs with their contexts
    contexts = {}
    for verb in good_verbs:
        if verb != 'объесть':
            contexts[verb] = []
    for sent in good_sentences:
        for verb in good_verbs:
            if verb != 'объесть':
                lem_sent = get_lemmas(sent)
                if verb in lem_sent:
                    contexts[verb].append(sent)
    return contexts

In [None]:
contexts = get_contexts()

In [436]:
len(contexts)

71

In [4]:
def save_to_json(title, variable): ## saving a variable into json a file. title - name of the json file
    with open(title, "w", encoding = 'utf-8') as t:
        json.dump(variable, t)

In [5]:
def get_from_json(title): ## getting a variable from a json file. title - name of the json file
    with open(title, "r", encoding = 'utf-8') as t:
        variable = json.load(t)
    return variable

In [437]:
## saving verbs and their contexts into json file 
save_to_json("contexts.json", contexts)

In [438]:
## getting verbs and their contexts from the json file
contexts = get_from_json("contexts.json")

In [439]:
def get_current_words_for_trans_v(sent): ## getting transitive verbs and their subjects and objects from sentence
    curr_words = []
    ex_pos = None
    prtf_gr = None
    for word in sent.split():
        nomn = False
        w_gr = str(pmm.tag(word)[0])
        pos = w_gr.split()[0].split(',')[0]
        if prtf_gr == None:
            if pos == 'PREP':
                curr_words.append((pmm.normal_forms(word)[0], pos))
            elif pos == 'NOUN':
                case = w_gr.split()[1].split(',')[1]
                if case == 'gent': 
                    if ex_pos == 'PREP':
                        curr_words.append((pmm.normal_forms(word)[0], pos, case))
                else:   
                    if case == 'nomn':
                        if nomn == False:
                            nomn = True
                            curr_words.append((pmm.normal_forms(word)[0], pos, case))
                        else:
                            case = 'accs'
                            curr_words.append((pmm.normal_forms(word)[0], pos, case))
                    else:
                        curr_words.append((pmm.normal_forms(word)[0], pos, case))    
            elif pos == 'PRTF':
                prtf_gr = w_gr.split()[1]
        else:
            if pos == 'NOUN':
                if len(prtf_gr.split(',')) == 3:
                    noun_gr = w_gr.split()[0].split(',')[2] + ',' + w_gr.split()[1]
                else:
                    noun_gr = w_gr.split()[1]
                if noun_gr == prtf_gr:
                    prtf_gr = None
                    case = w_gr.split()[1].split(',')[1]
                    if case == 'gent': 
                        if ex_pos == 'PREP':
                            curr_words.append((pmm.normal_forms(word)[0], pos, case))
                    else:   
                        if case == 'nomn':
                            if nomn == False:
                                nomn = True
                                curr_words.append((pmm.normal_forms(word)[0], pos, case))
                            else:
                                case = 'accs'
                                curr_words.append((pmm.normal_forms(word)[0], pos, case))
                        else:
                            curr_words.append((pmm.normal_forms(word)[0], pos, case)) 
        if pos != 'ADJF' and pos != 'PRTF':
            ex_pos = pos
        else:
            ex_pos = ex_pos
    return curr_words

In [440]:
def get_current_words_for_intr_v(sent): ## getting intransitive verbs and their subjects and objects from sentence
    curr_words = []
    ex_pos = None
    prtf_gr = None
    for word in sent.split():
        w_gr = str(pmm.tag(word)[0])
        pos = w_gr.split()[0].split(',')[0]
        if prtf_gr == None:
            if pos == 'PREP':
                curr_words.append((pmm.normal_forms(word)[0], pos))
            elif pos == 'NOUN':            
                case = w_gr.split()[1].split(',')[1]
                if case != 'accs':
                    if case == 'gent':
                        if ex_pos == 'PREP':
                            curr_words.append((pmm.normal_forms(word)[0], pos, case)) 
                    else:
                        curr_words.append((pmm.normal_forms(word)[0], pos, case)) 
            elif pos == 'PRTF':
                prtf_gr = w_gr.split()[1]               
        else:
            if pos == 'NOUN':
                if len(prtf_gr.split(',')) == 3:
                    noun_gr = w_gr.split()[0].split(',')[2] + ',' + w_gr.split()[1]
                else:
                    noun_gr = w_gr.split()[1]
                if noun_gr == prtf_gr:
                    prtf_gr = None 
                    case = w_gr.split()[1].split(',')[1]
                    if case != 'accs':
                        if case == 'gent':  
                            if ex_pos == 'PREP':
                                curr_words.append((pmm.normal_forms(word)[0], pos, case)) 
                        else:
                            curr_words.append((pmm.normal_forms(word)[0], pos, case))
        if pos != 'ADJF' and pos != 'PRTF':
            ex_pos = pos
        else:
            ex_pos = ex_pos
    return curr_words

In [485]:
def get_valencies(contexts): ## getting all verbs and their subjects and objects for the whole text
    model = {}
    for key, value in contexts.items():
        curr_model = []
        v_gr = str(pmm.tag(key)[0])
        is_tran = v_gr.split()[0].split(',')[2]
        if is_tran == 'tran':
            for sent in value:
                curr_words = get_current_words_for_trans_v(sent)
                curr_model.append(curr_words)
        elif is_tran == 'intr':
            for sent in value:
                curr_words = get_current_words_for_intr_v(sent)
                curr_model.append(curr_words)
        model[key] = curr_model
    return model

In [1]:
def getting_computed_valencies(contexts): 
    ## collects all verbs and their valencies with number of uses in text into one great dict of dicts
    model = get_valencies(contexts)
    computed_models = {}
    for key, value in model.items():
        verb_model = {}
        for sent in range(len(value)):
            for idx, word in enumerate(value[sent]):
                if word[1] == 'PREP':
                    if idx < (len(value[sent]) - 1):
                        if value[sent][idx+1][1] == 'NOUN':
                            n_info = value[sent][idx+1]
                            if n_info[2] == 'nomn':
                                n_info[2] = 'accs'
                            if word[0] not in verb_model:
                                verb_model[word[0] + ' + ' + n_info[2]] = {n_info[0] : 1}
                            else:
                                if n_info[0] in verb_model[word[0] + ' + ' + n_info[2]]:
                                    verb_model[word[0] + ' + ' + n_info[2]][n_info[0]] += 1
                                else:
                                    verb_model[word[0] + ' + ' + n_info[2]][n_info[0]] = 1
                if word[1] == 'NOUN':
                    if idx != 0:
                        if value[sent][idx-1][1] != 'PREP':
                            if word[2] == 'nomn':
                                if 'подлежащее, nomn' not in verb_model:
                                    verb_model['подлежащее, nomn'] = {word[0] : 1}
                                else:
                                    if word[0] in verb_model['подлежащее, nomn']:
                                        verb_model['подлежащее, nomn'][word[0]] += 1
                                    else:
                                        verb_model['подлежащее, nomn'][word[0]] = 1
                            else:
                                v_gr = str(pmm.tag(key)[0])
                                is_tran = v_gr.split()[0].split(',')[2]
                                if is_tran == 'tran':
                                    if word[2] == 'accs': 
                                        if 'прямое дополнение, accs' not in verb_model:
                                            verb_model['прямое дополнение, accs'] = {word[0] : 1}
                                        else:
                                            if word[0] in verb_model['прямое дополнение, accs']:
                                                verb_model['прямое дополнение, accs'][word[0]] += 1
                                            else:
                                                verb_model['прямое дополнение, accs'][word[0]] = 1
                                    else: 
                                        title = 'непрямое дополнение, ' + word[2]
                                        if title not in verb_model:
                                            verb_model[title] = {word[0] : 1}
                                        else:
                                            if word[0] in verb_model[title]:
                                                verb_model[title][word[0]] += 1
                                            else:
                                                verb_model[title][word[0]] = 1
                                else: 
                                    title = 'непрямое дополнение, ' + word[2]
                                    if title not in verb_model:
                                        verb_model[title] = {word[0] : 1}
                                    else:
                                        if word[0] in verb_model[title]:
                                            verb_model[title][word[0]] += 1
                                        else:
                                            verb_model[title][word[0]] = 1
        computed_models[key] = verb_model
        
    return(computed_models)

In [None]:
#computed_models = getting_computed_valencies(contexts)

In [488]:
#save_to_json('computed models.json', computed_models) 

In [6]:
computed_models = get_from_json('computed models.json')

In [7]:
import csv

def csv_dict_writer(path, fieldnames, data): ## writes a CSV file using DictWriter
    with open(path, "w", newline='') as out_file:
        writer = csv.DictWriter(out_file, delimiter=';', fieldnames=fieldnames)
        writer.writeheader()
        for row in data:
            writer.writerow(row)

In [8]:
def save_valencies(computed_models):
    for verb, parse in computed_models.items():
        fieldnames = []
        data = []
        last_data = []
        if parse:
            for obj, exmpls in parse.items():
                fieldnames.append(obj)
                fieldnames.append('количество примеров для колонки "' + obj + '"')
                
                obj_data = []
                numb_data = []
                for word, numb in exmpls.items():
                    obj_data.append(word)
                    numb_data.append(str(numb))
                data.append(obj_data)
                data.append(numb_data)
                elements = [] 
                for arr in data: 
                    if len(arr) > 1:
                        element = '\n'.join(arr)
                    else:
                        element = arr[0]
                    elements.append(element)
                inner_dict = dict(zip(fieldnames, elements))
            last_data.append(inner_dict)
        csv_dict_writer('valencies/csv/Модель управления глагола ' + verb + '.csv', fieldnames, last_data)

In [9]:
save_valencies(computed_models)

In [35]:
def txt_save_valences(computed_models):
    for verb, parse in computed_models.items():
        if parse:
            with open('valencies/txt/Модель управления глагола ' + verb + '.txt', 'a', encoding = 'utf-8') as v:
                v.write('Модель управления глагола ' + verb + ':\n')
                for obj, exmpls in parse.items():
                    v.write(obj + ': ' )
                    for exmpl, value in exmpls.items():
                        v.write(exmpl + ' = ' + str(value) + '; ')
                    v.write('\n')

In [36]:
txt_save_valences(computed_models)