This program extracts the list of relevant verbs for texts on geometry using the TF-IDF metric.
The geometry texts were collected by the 'getting corpus' program.
The document collection was gathered by the 'getting collection' program.
The verbs are saved to JSON, TXT and CSV files.

In [33]:
import pymorphy2 as pm2 
pmm = pm2.MorphAnalyzer() 
from pymystem3 import Mystem
m = Mystem()
import re
import os
import json

In [34]:
def pmm_get_all_verbs(text):
    """Count the verb lemmas that occur in *text*.

    Punctuation, digits and Latin letters are stripped first; the remaining
    whitespace-separated tokens are tagged with the module-level ``pmm``
    analyzer. A token counts as a verb when the leading item of its first
    tag string is ``'VERB'``, and it is counted under its first normal form.

    Returns a dict mapping lemma -> number of occurrences.
    """
    cleaned = text
    for pattern in (r'[^\w\s]', r'\d', r'[A-Za-z]'):
        cleaned = re.sub(pattern, '', cleaned)

    counts = {}
    for token in cleaned.split():
        first_tag = str(pmm.tag(token)[0])
        # the tag string starts with comma-separated grammemes; take the first
        leading_grammeme = first_tag.split()[0].split(',')[0]
        if leading_grammeme != 'VERB':
            continue
        lemma = pmm.normal_forms(token)[0]
        counts[lemma] = counts.get(lemma, 0) + 1
    return counts

In [40]:
## read the whole geometry corpus (the TEST text!) and count its verbs
with open('all geometry.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

pmm_verbs = pmm_get_all_verbs(text)

In [37]:
def save_to_json(title, variable):
    """Serialize *variable* as JSON into the file named *title*.

    The file is written as UTF-8 and non-ASCII characters (e.g. the Cyrillic
    verbs) are stored as-is rather than as ``\\uXXXX`` escapes, so the file
    stays human-readable. ``json.load`` returns the same data either way.
    """
    with open(title, "w", encoding='utf-8') as t:
        json.dump(variable, t, ensure_ascii=False)

In [38]:
def get_from_json(title):
    """Load and return the object stored in the JSON file named *title*."""
    with open(title, mode="r", encoding='utf-8') as source:
        return json.load(source)

In [41]:
## saving the verb counts (lemma -> occurrences) into verbs.json
save_to_json("verbs.json", pmm_verbs)

In [42]:
## loading the verb counts back from verbs.json
verbs = get_from_json("verbs.json")

In [44]:
def get_tfs(verbs):
    """Return ``(verb, tf)`` pairs for every verb in *verbs*.

    *verbs* maps each verb to its occurrence count; the term frequency is
    that count divided by the sum of all counts. Pairs follow the mapping's
    iteration order.
    """
    total = sum(verbs.values())
    return [(verb, count / total) for verb, count in verbs.items()]

In [45]:
def get_lemmas(article):
    """Turn *article* into the list of its lemmas.

    Punctuation, digits and Latin letters are removed, the rest is split on
    whitespace, and each token is replaced by the first normal form reported
    by the module-level ``pmm`` analyzer.
    """
    stripped = article
    for pattern in (r'[^\w\s]', r'\d', r'[A-Za-z]'):
        stripped = re.sub(pattern, '', stripped)

    lemmas = []
    for token in stripped.split():
        lemmas.append(pmm.normal_forms(token)[0])
    return lemmas

In [46]:
def get_idfs(verbs, folder):
    """Return ``(verb, idf)`` pairs for every verb in *verbs*.

    *folder* holds the document collection. Every regular file found under
    it (subfolders included) counts towards the total number of documents.
    Each file except ``'all geometry.txt'`` is lemmatized with
    ``get_lemmas``, and a verb's document count is incremented once per file
    whose lemmas contain it. Document counts start at 1, so the division
    below can never be by zero.

    The score is the plain ratio ``total_files / document_count`` (no
    logarithm is applied). Pairs follow the iteration order of *verbs*.
    """
    # Gather the files once so that the total and the files actually read
    # come from the same walk; joining with `root` (not `folder`) keeps the
    # paths valid for files that live in subfolders.
    paths = []
    for root, _dirs, files in os.walk(folder):
        for name in files:
            path = os.path.join(root, name)
            if os.path.isfile(path):
                paths.append(path)
    f_total = len(paths)

    v_presence = {verb: 1 for verb in verbs}
    for path in paths:
        if os.path.basename(path) == 'all geometry.txt':
            continue
        with open(path, 'r', encoding='utf-8') as a:
            article = a.read()
        # a set makes each membership test O(1) instead of a list scan
        lemmas = set(get_lemmas(article))
        for verb in verbs:
            if verb in lemmas:
                v_presence[verb] += 1

    return [(verb, f_total / count) for verb, count in v_presence.items()]

In [47]:
def get_tf_idf(verbs, folder='test collection'):
    """Return ``(verb, tf-idf)`` pairs for every verb in *verbs*.

    *verbs* maps each verb to its occurrence count. *folder* is the document
    collection passed on to ``get_idfs``; it defaults to the
    ``'test collection'`` folder that was previously hard-coded.

    Each verb's term frequency from ``get_tfs`` is multiplied by its score
    from ``get_idfs``. The two are matched by verb rather than by list
    position, so the result does not depend on both helpers emitting their
    pairs in the same order.
    """
    tfs = get_tfs(verbs)
    idf_by_verb = dict(get_idfs(verbs, folder))
    return [(verb, tf * idf_by_verb[verb]) for verb, tf in tfs]

In [52]:
tf_idfs = get_tf_idf(verbs) ## compute the (verb, tf-idf) pairs for all collected verbs

In [64]:
## saving the tf-idf scores of all verbs into a json file
save_to_json("tf-idf.json", tf_idfs)

In [65]:
## loading the tf-idf scores back from the json file
tf_idfs = get_from_json("tf-idf.json")

In [51]:
## display the tf-idf scores
tf_idfs

[['служить', 0.2959935466523259],
 ['способствовать', 0.012046248991664427],
 ['считаться', 0.060231244958322126],
 ['являться', 0.4000569413030068],
 ['подтверждать', 0.0022945236174598906],
 ['характеризовать', 0.006883570852379672],
 ['владеть', 0.003441785426189836],
 ['продвигать', 0.003441785426189836],
 ['оказываться', 0.06453347674105943],
 ['обладать', 0.3441785426189836],
 ['иметь', 0.32972304382898626],
 ['определяться', 0.21166980371067493],
 ['убеждать', 0.003441785426189836],
 ['относиться', 0.04072779420991306],
 ['происходить', 0.026387021600788742],
 ['измерять', 0.05162678139284754],
 ['означать', 0.36482925517612264],
 ['быть', 0.058510352245227215],
 ['находиться', 0.07675181500403334],
 ['требовать', 0.015832212960473248],
 ['свидетельствовать', 0.006883570852379672],
 ['привести', 0.06410325356278569],
 ['стать', 0.0016303194124057117],
 ['внести', 0.0024092497983328855],
 ['носить', 0.006883570852379672],
 ['становиться', 0.0189298198440441],
 ['подтвердить', 0.0

In [70]:
## making a csv table containing verbs and their tf-idfs
## the file is opened with "w" so that re-running this cell rewrites the table
## instead of appending a second header and duplicate rows to the old one
with open("verbs and tf-idf.csv", "w", encoding='utf-8') as table:
    table.write('verbs;tf-idf\n')
    for verb, score in tf_idfs:
        table.write(f'{verb};{score}\n')

In [60]:
def get_good_verbs(scores=None, threshold=0.16):
    """Collect the verbs with a high score (the relevant verbs).

    *scores* is a sequence of ``(verb, tf-idf)`` pairs; when omitted, the
    module-level ``tf_idfs`` is used, as before. A verb is kept when its
    score is strictly greater than *threshold*. The default 0.16 is the
    cut-off that was previously hard-coded (chosen on the TEST data).

    Returns the kept verbs as a list, in the order of *scores*.
    """
    if scores is None:
        scores = tf_idfs
    return [pair[0] for pair in scores if pair[1] > threshold]

In [67]:
## write the relevant verbs to a txt file, one verb per line
good_verbs = get_good_verbs()
with open('only relevant verbs.txt', 'w', encoding='utf-8') as out:
    out.writelines(verb + '\n' for verb in good_verbs)

In [68]:
## saving the relevant verbs into a json file
save_to_json("only relevant verbs.json", good_verbs)

In [69]:
## loading the relevant verbs back from the json file
good_verbs = get_from_json("only relevant verbs.json")

In [32]:
## display the relevant verbs
good_verbs

['служить',
 'способствовать',
 'вселять',
 'связывать',
 'владеть',
 'продвигать',
 'обладать',
 'определяться',
 'убеждать',
 'создавать',
 'измерять',
 'означать',
 'обусловливать',
 'решать',
 'доходить',
 'позволять',
 'свидетельствовать',
 'вычислять',
 'сохраняться',
 'поражать',
 'приводить',
 'накоплять',
 'становиться',
 'вносить',
 'базироваться',
 'выражаться',
 'состоять',
 'совершать',
 'основываться',
 'осуществлять',
 'начинать',
 'исчерпывать',
 'изучаться',
 'лежать',
 'располагать',
 'изображать']