# Word2vec novels

Векторные романы: код для рерайтинга произведений русской литературы, развлекательного проекта [Бориса Орехова](http://nevmenandr.net/bo.php)

[Статья на Хабре](https://habr.com/ru/post/326380/)

In [1]:
import os
import re
import gensim
import pymorphy2

Slow version of gensim.models.doc2vec is being used


In [2]:
# Load the pre-trained word vectors (binary word2vec format); the vocabulary
# keys used throughout this notebook have the form '<lemma>_<POS>'.
model = gensim.models.KeyedVectors.load_word2vec_format("ruwikiruscorpora_0_300_20.bin.gz", binary=True)
# NOTE(review): presumably normalises the vectors in place to save memory;
# confirm against the installed gensim version (the call is deprecated in newer releases).
model.init_sims(replace=True)

In [3]:
# Morphological analyser used below to parse word forms and to inflect replacements.
morph = pymorphy2.MorphAnalyzer()
# Splits a space-delimited token into three groups: whatever precedes the first
# run of Cyrillic letters/hyphens, that run itself, and whatever follows it.
punct = re.compile('^(.*?)([а-яА-ЯёЁ-]+)(.*?)$')
# Matches a word written entirely in capital Cyrillic letters. 'Ё' is listed
# explicitly because it lies outside the 'А'-'Я' code-point range; without it
# an all-caps word containing 'Ё' is not recognised as all-caps.
capit = re.compile('^[А-ЯЁ]+$')

In [10]:
# Directory holding the source texts, and the names of the entries found in it.
pth_source = 'books_before/'
lst = os.listdir(pth_source)

# Directory the rewritten texts are written to.
pth_result = 'books_after/'

In [5]:
# Maps a pymorphy2 part-of-speech tag (key) to the part-of-speech suffix used
# in the word2vec vocabulary (value). Only words whose POS is a key of this
# mapping are candidates for replacement.
cotags = dict(
    ADJF='ADJ',
    ADJS='ADJ',
    ADVB='ADV',
    COMP='ADV',
    GRND='VERB',
    INFN='VERB',
    NOUN='NOUN',
    PRED='ADV',
    PRTF='ADJ',
    PRTS='VERB',
    VERB='VERB',
)

In [6]:
# Capital Cyrillic letters 'А'..'Я' (code points 1040..1071) followed by 'Ё',
# which sits outside that contiguous range and is therefore added separately.
capit_letters = list(map(chr, range(ord('А'), ord('Я') + 1)))
capit_letters.append('Ё')

#### todo

* ~~capitalize~~
* ~~pos detection~~
* ~~caching of word2vec queries~~
* ~~1st form extraction from pymorphy2 parse of word2vec response~~
* ~~pos matching for most similar words~~
* ~~names detection and excluding from the process~~
* ~~agreement in gender~~
* ~~yo-fication~~
* ~~voice~~

In [7]:
def search_neighbour(word, pos, gend='masc'):
    """Return the nearest acceptable word2vec neighbour of a lemma, or None.

    The key ``word + '_' + cotags[pos]`` is looked up in ``model`` and its 20
    most similar entries are examined in order. The first entry that passes
    every filter below is returned without its part-of-speech suffix.

    Filters:
        * entries whose lemma contains '::' are skipped;
        * the entry's part-of-speech suffix must equal ``cotags[pos]``;
        * for ``pos == 'NOUN'`` some pymorphy2 parse of the entry must have
          the entry itself as normal form and ``gend`` as gender;
        * for verbs the entry must end in 'ся' exactly when ``word`` does.

    Args:
        word: lemma to look up; 'ё' is replaced by 'е' first.
        pos: pymorphy2 part-of-speech tag; must be a key of ``cotags``.
        gend: gender required of a noun replacement (ignored for other POS).

    Returns:
        The replacement lemma, or None when the key is not in the model or
        no neighbour passes the filters.
    """
    word = word.replace('ё', 'е')
    target_pos = cotags[pos]
    lex = word + '_' + target_pos
    if lex not in model:
        return None

    # NOTE(review): only the 'ся' ending is checked, so a verb whose reflexive
    # ending is 'сь' is treated as non-reflexive.
    reflexive = word[-2:] == 'ся'

    for nei in model.most_similar([lex], topn=20):
        # Split on the LAST underscore only: a plain split('_') with
        # two-target unpacking raises ValueError for keys that contain no
        # underscore or more than one.
        lex_n, sep, ps_n = nei[0].rpartition('_')
        if not sep or '::' in lex_n:
            continue
        if ps_n != target_pos:
            continue
        if pos == 'NOUN':
            for ana in morph.parse(lex_n):
                if ana.normal_form == lex_n and ana.tag.gender == gend:
                    return lex_n
        elif target_pos == 'VERB':
            if (lex_n[-2:] == 'ся') == reflexive:
                return lex_n
        else:
            return lex_n
    return None

In [8]:
def flection(lex_neighb, tags):
    """Inflect the lemma ``lex_neighb`` into the form described by ``tags``.

    ``tags`` is converted to its string form and reduced to a set of
    grammemes that is then passed to pymorphy2's ``inflect``:

    * a comma-led token followed by a space is dropped (the comma is kept);
    * the substrings 'impf,' and 'Impe neut' are removed;
    * a space between a capital letter and one of plur/masc/femn/neut/inan
      is turned into a comma;
    * of any remaining 'a b' pair only 'b' is kept.

    Args:
        lex_neighb: lemma of the replacement word.
        tags: tag of the word being replaced (anything whose ``str()`` is a
            comma/space separated grammeme list).

    Returns:
        The inflected word form, or None when no parse of ``lex_neighb`` with
        that normal form can be inflected, or when ``inflect`` raises (the
        offending grammeme set is printed in that case).
    """
    tags = str(tags)
    tags = re.sub(r',[AGQSPMa-z-]+? ', ',', tags)
    tags = tags.replace("impf,", "")
    tags = re.sub(r'([A-Z]) (plur|masc|femn|neut|inan)', r'\1,\2', tags)
    tags = tags.replace("Impe neut", "")

    tags_clean = []
    for t in tags.split(','):
        if not t:
            continue
        if ' ' in t:
            # Keep only the part after the space.
            _, t = t.split(' ')
        tags_clean.append(t)
    tags = frozenset(tags_clean)

    for ana in morph.parse(lex_neighb):
        # Only parses that treat lex_neighb itself as the lemma are usable.
        if ana.normal_form != lex_neighb:
            continue
        try:
            flect = ana.inflect(tags)
        except Exception:
            # Best effort: report the grammeme set that could not be applied.
            # A bare `except:` here would also swallow KeyboardInterrupt.
            print(tags)
            return None
        if flect:
            return flect.word
    return None

In [104]:
# Exploratory check: show the 20 nearest neighbours of 'холодный_ADJ' in the model.
model.most_similar(['холодный_ADJ'], topn=20)

[('теплый_ADJ', 0.6912842392921448),
 ('холод_NOUN', 0.6222048997879028),
 ('прохладный_ADJ', 0.6178681254386902),
 ('влажный_ADJ', 0.6009092926979065),
 ('жаркий_ADJ', 0.5878441333770752),
 ('горячий_ADJ', 0.5811808109283447),
 ('ледяной_ADJ', 0.5738131999969482),
 ('сухой_ADJ', 0.5681436657905579),
 ('сырой_ADJ', 0.5447033643722534),
 ('холодно_ADV', 0.5389620661735535),
 ('студеный_ADJ', 0.5354037880897522),
 ('морозный_ADJ', 0.5059537887573242),
 ('мокрый_ADJ', 0.49825477600097656),
 ('холодить_VERB', 0.49614548683166504),
 ('стыть_VERB', 0.49345287680625916),
 ('промозглый_ADJ', 0.49259310960769653),
 ('остыть_VERB', 0.4923018217086792),
 ('мягкий_ADJ', 0.4906276762485504),
 ('темный_ADJ', 0.48891666531562805),
 ('согревать_VERB', 0.4849362075328827)]

In [11]:
# Rewrite every '*_JOF.txt' file in pth_source word by word: each eligible word
# is replaced by its nearest word2vec neighbour, put into the same grammatical
# form. Two files are written to pth_result per source file: the full text
# ('3.0_' prefix) and its first 20 lines ('3.0_Sample' prefix).

# Cache of neighbour look-ups: (lemma, pymorphy2 POS) -> replacement lemma or None.
cash_neighb = {}

for fl in lst:
    if not fl.endswith('_JOF.txt'):
        continue
    print(fl)
    # `with` guarantees all three files are closed even if processing fails.
    with open(pth_source + fl, 'r', encoding='utf-8') as f, \
            open(pth_result + '3.0_' + fl, 'w', encoding='utf-8') as fw, \
            open(pth_result + '3.0_Sample' + fl, 'w', encoding='utf-8') as fs:
        for i, line in enumerate(f, 1):
            new_line = []
            for word in line.strip().split(' '):
                struct = punct.findall(word)
                if not struct:
                    # No Cyrillic word inside the token: keep it verbatim.
                    new_line.append(word)
                    continue
                prefix, wordform, suffix = struct[0]
                if capit.search(wordform):
                    # All-caps words are left untouched.
                    new_line.append(word)
                    continue
                capit_flag = wordform[0] in capit_letters

                parse_result = morph.parse(wordform)[0]
                tag = parse_result.tag
                if 'Name' in tag or 'Patr' in tag:
                    # First names and patronymics are excluded from rewriting.
                    new_line.append(word)
                    continue
                if parse_result.normal_form == 'глава':
                    new_line.append(word)
                    continue

                pos_tag = tag.POS
                if pos_tag not in cotags:
                    new_line.append(word)
                    continue

                lex = parse_result.normal_form
                if (lex, pos_tag) not in cash_neighb:
                    if pos_tag == 'NOUN':
                        cash_neighb[(lex, pos_tag)] = search_neighbour(lex, pos_tag, gend=tag.gender)
                    else:
                        cash_neighb[(lex, pos_tag)] = search_neighbour(lex, pos_tag)
                lex_neighb = cash_neighb[(lex, pos_tag)]
                if not lex_neighb:
                    new_line.append(word)
                    continue

                # The lemma can be inserted as is only when the original word
                # is itself in the lemma's form. For adjectives that means
                # masculine nominative singular; feminine and neuter forms
                # must go through flection() to keep gender agreement.
                nomn_sing = tag.case == 'nomn' and tag.number == 'sing'
                if (pos_tag in ('INFN', 'ADVB', 'COMP', 'PRED')
                        or (pos_tag == 'NOUN' and nomn_sing)
                        or (pos_tag == 'ADJF' and nomn_sing and tag.gender == 'masc')):
                    word_to_replace = lex_neighb
                else:
                    word_to_replace = flection(lex_neighb, tag)

                if not word_to_replace:
                    # Inflection failed: fall back to the original token.
                    new_line.append(word)
                    continue
                if capit_flag:
                    word_to_replace = word_to_replace.capitalize()
                new_line.append(prefix + word_to_replace + suffix)

            line_replace = ' '.join(new_line)
            if i < 21:
                fs.write(line_replace + '\n')
            fw.write(line_replace + '\n')

EugeneOnegin_JOF.txt
FathersAndSons_JOF.txt
WarAndPeace_JOF.txt
MasterAndMargarita_JOF.txt
CrimeAndPunishment_JOF.txt


## Анна Каренина
### Отдельно по частям речи

In [19]:
# Switch the source directory to 'AK/' and list its entries;
# pth_result is left unchanged by this cell.
pth_source = 'AK/'
lst = os.listdir(pth_source)

In [20]:
def appending(new_line, word):
    """Append ``word`` to every list stored in ``new_line``.

    Args:
        new_line: mapping whose values are lists; modified in place.
        word: token to append to each list.

    Returns:
        The same ``new_line`` object, for call sites that reassign it.
    """
    for tokens in new_line.values():
        tokens.append(word)
    return new_line

In [25]:
# Rewrite every file in pth_source into three tab-separated files in
# pth_result (original line <TAB> rewritten line):
#   <name>.tsv       - all eligible words replaced;
#   NOUN_<name>.tsv  - only words whose word2vec POS is NOUN or ADJ replaced;
#   VERB_<name>.tsv  - only words whose word2vec POS is VERB replaced.

# Cache of neighbour look-ups: (lemma, pymorphy2 POS) -> replacement lemma or None.
cash_neighb = {}

# Single-category column that receives a replacement, keyed by the word2vec
# POS of the replaced word. Adverbs are replaced in the 'ALL' column only.
column_by_pos = {'NOUN': 'NOUN', 'ADJ': 'NOUN', 'VERB': 'VERB', 'ADV': None}

for fl in lst:
    print(fl)
    flc = fl.replace('.txt', '.tsv')
    # `with` guarantees all four files are closed even if processing fails.
    with open(pth_source + fl, 'r', encoding='utf-8') as f, \
            open(pth_result + flc, 'w', encoding='utf-8') as fw, \
            open(pth_result + 'NOUN_' + flc, 'w', encoding='utf-8') as fs, \
            open(pth_result + 'VERB_' + flc, 'w', encoding='utf-8') as fv:
        for fh in (fw, fs, fv):
            fh.write('Оригинал\tПотенциальный текст\n')
        for line in f:
            new_line = {'ALL': [], 'NOUN': [], 'VERB': []}
            # Tabs are the column separator of the output, so drop them here.
            line = line.strip().replace('\t', ' ')
            for word in line.split(' '):
                struct = punct.findall(word)
                if not struct:
                    # No Cyrillic word inside the token: keep it verbatim.
                    appending(new_line, word)
                    continue
                prefix, wordform, suffix = struct[0]
                if capit.search(wordform):
                    # All-caps words are left untouched.
                    appending(new_line, word)
                    continue
                capit_flag = wordform[0] in capit_letters

                parse_result = morph.parse(wordform)[0]
                tag = parse_result.tag
                if 'Name' in tag or 'Patr' in tag:
                    # First names and patronymics are excluded from rewriting.
                    appending(new_line, word)
                    continue
                if parse_result.normal_form == 'глава':
                    appending(new_line, word)
                    continue

                pos_tag = tag.POS
                if pos_tag not in cotags:
                    appending(new_line, word)
                    continue

                lex = parse_result.normal_form
                if (lex, pos_tag) not in cash_neighb:
                    if pos_tag == 'NOUN':
                        cash_neighb[(lex, pos_tag)] = search_neighbour(lex, pos_tag, gend=tag.gender)
                    else:
                        cash_neighb[(lex, pos_tag)] = search_neighbour(lex, pos_tag)
                lex_neighb = cash_neighb[(lex, pos_tag)]
                if not lex_neighb:
                    appending(new_line, word)
                    continue

                # The lemma can be inserted as is only when the original word
                # is itself in the lemma's form. For adjectives that means
                # masculine nominative singular; feminine and neuter forms
                # must go through flection() to keep gender agreement.
                nomn_sing = tag.case == 'nomn' and tag.number == 'sing'
                if (pos_tag in ('INFN', 'ADVB', 'COMP', 'PRED')
                        or (pos_tag == 'NOUN' and nomn_sing)
                        or (pos_tag == 'ADJF' and nomn_sing and tag.gender == 'masc')):
                    word_to_replace = lex_neighb
                else:
                    word_to_replace = flection(lex_neighb, tag)

                if not word_to_replace:
                    # Inflection failed: fall back to the original token.
                    appending(new_line, word)
                    continue
                if capit_flag:
                    word_to_replace = word_to_replace.capitalize()

                replaced = prefix + word_to_replace + suffix
                target = column_by_pos[cotags[pos_tag]]
                for column, tokens in new_line.items():
                    tokens.append(replaced if column in ('ALL', target) else word)

            fw.write(line + '\t' + ' '.join(new_line['ALL']) + '\n')
            fs.write(line + '\t' + ' '.join(new_line['NOUN']) + '\n')
            fv.write(line + '\t' + ' '.join(new_line['VERB']) + '\n')

AnnaKarenina.txt


## Произведения школьной программы

In [9]:
# Rewrite every '*.txt' file in 'vector-school/' word by word: each eligible
# word is replaced by its nearest word2vec neighbour, put into the same
# grammatical form. Two files are written to 'vector-school-after/' per source
# file: the full text ('vector_' prefix) and its first 20 lines ('Sample' prefix).
pth_source = 'vector-school/'
lst = os.listdir(pth_source)

pth_result = 'vector-school-after/'

# Cache of neighbour look-ups: (lemma, pymorphy2 POS) -> replacement lemma or None.
cash_neighb = {}

for fl in lst:
    if not fl.endswith('.txt'):
        continue
    print(fl)
    # `with` guarantees all three files are closed even if processing fails.
    with open(pth_source + fl, 'r', encoding='utf-8') as f, \
            open(pth_result + 'vector_' + fl, 'w', encoding='utf-8') as fw, \
            open(pth_result + 'Sample' + fl, 'w', encoding='utf-8') as fs:
        for i, line in enumerate(f, 1):
            new_line = []
            for word in line.strip().split(' '):
                struct = punct.findall(word)
                if not struct:
                    # No Cyrillic word inside the token: keep it verbatim.
                    new_line.append(word)
                    continue
                prefix, wordform, suffix = struct[0]
                if capit.search(wordform):
                    # All-caps words are left untouched.
                    new_line.append(word)
                    continue
                capit_flag = wordform[0] in capit_letters

                parse_result = morph.parse(wordform)[0]
                tag = parse_result.tag
                if 'Name' in tag or 'Patr' in tag:
                    # First names and patronymics are excluded from rewriting.
                    new_line.append(word)
                    continue
                if parse_result.normal_form == 'глава':
                    new_line.append(word)
                    continue

                pos_tag = tag.POS
                if pos_tag not in cotags:
                    new_line.append(word)
                    continue

                lex = parse_result.normal_form
                if (lex, pos_tag) not in cash_neighb:
                    if pos_tag == 'NOUN':
                        cash_neighb[(lex, pos_tag)] = search_neighbour(lex, pos_tag, gend=tag.gender)
                    else:
                        cash_neighb[(lex, pos_tag)] = search_neighbour(lex, pos_tag)
                lex_neighb = cash_neighb[(lex, pos_tag)]
                if not lex_neighb:
                    new_line.append(word)
                    continue

                # The lemma can be inserted as is only when the original word
                # is itself in the lemma's form. For adjectives that means
                # masculine nominative singular; feminine and neuter forms
                # must go through flection() to keep gender agreement.
                nomn_sing = tag.case == 'nomn' and tag.number == 'sing'
                if (pos_tag in ('INFN', 'ADVB', 'COMP', 'PRED')
                        or (pos_tag == 'NOUN' and nomn_sing)
                        or (pos_tag == 'ADJF' and nomn_sing and tag.gender == 'masc')):
                    word_to_replace = lex_neighb
                else:
                    word_to_replace = flection(lex_neighb, tag)

                if not word_to_replace:
                    # Inflection failed: fall back to the original token.
                    new_line.append(word)
                    continue
                if capit_flag:
                    word_to_replace = word_to_replace.capitalize()
                new_line.append(prefix + word_to_replace + suffix)

            line_replace = ' '.join(new_line)
            if i < 21:
                fs.write(line_replace + '\n')
            fw.write(line_replace + '\n')

StaruhaIzergil.txt
VishnevyjSad.txt
BednajaLiza.txt
PovestiBelkina.txt
AlenkijCvetochek.txt
PervajaLyubov.txt
GerojNashegoVremeni.txt
KapitanskayaDochka.txt
Nos.txt
Revizor.txt
Nedorosl.txt
Asya.txt
MocartISaljeri.txt
Oblomov.txt
TriSestry.txt
PalataNomer6.txt
GoreOtUma.txt
Mcyri.txt
Shinel.txt
KonekGorbunok.txt
Chajka.txt
SkazkaOCareSaltane.txt
