In [2]:
# Reload edited project modules automatically before each cell executes.
%load_ext autoreload
%autoreload 2
# Registers the `%lprun` line-profiling magic used further down in this notebook.
%load_ext line_profiler

In [1]:
import os
import re
import bz2
import csv
import json
import sys
import copy
import string
import gensim
import numpy as np
import collections
from pymystem3 import Mystem

from scipy.spatial.distance import cosine
from nltk.tokenize import word_tokenize

from utils import  Word2vecProcessor, PoemTemplateLoader
from tqdm import tqdm_notebook

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Directory that holds the notebook's data files.
DATASETS_PATH = 'data'
# Embedding helper from the project's `utils` module, constructed from the gzipped model file.
# Presumably the vectors are loaded at construction time — confirm in utils.
word2vec = Word2vecProcessor(os.path.join(DATASETS_PATH, 'web_upos_cbow_300_20_2017.bin.gz'))

loaded lemmas 1418957


In [3]:
# Template source from the project's `utils` module, constructed from the classic-poems JSON file;
# generate_poem asks it for a random template of a given poet.
template_loader = PoemTemplateLoader(os.path.join(DATASETS_PATH, 'classic_poems.json'))

In [4]:
# Build the lookup tables used by generate_poem from the stress/POS dictionary.
# Each non-blank line is expected to look like "<word>,<field>,...,<field>" with at
# least two fields after the word; everything after the first comma is the form key.
new_forms = {}   # form key -> list of words that share this key
word_forms = {}  # word -> form key (the last line seen for a word wins)
counts = {}      # word -> set of distinct accent descriptors seen for the word
# The encoding is given explicitly so that decoding of the Cyrillic data does not
# depend on the locale of the machine running the notebook.
with open(os.path.join(DATASETS_PATH, 'word_stress_pos_full2.txt'), encoding='utf-8') as forms_file:
    for line in forms_file:
        line = line.strip()
        if not line:
            # A blank line (e.g. a trailing newline) would otherwise break the unpacking below.
            continue
        word, key = line.split(',', 1)
        pk = key.split(',')
        # The last two fields together are treated as the accent descriptor.
        accent = pk[-2] + ',' + pk[-1]
        # Drop the 'inan'/'anim' markers so that words differing only in them share one key.
        key = key.replace('inan', '').replace('anim', '')
        counts.setdefault(word, set()).add(accent)
        word_forms[word] = key
        new_forms.setdefault(key, []).append(word)

# Words listed with more than one accent descriptor are excluded from replacement candidates.
skipwords = {w for w, c in counts.items() if len(c) > 1}

def sound_distance(word1, word2, suffix_len=3):
    """Count the positions at which the last `suffix_len` characters of two words differ.

    Words shorter than `suffix_len` are left-padded with spaces, so the comparison is
    always aligned at the end of the words.

    Args:
        word1: first word.
        word2: second word.
        suffix_len: number of trailing characters to compare.

    Returns:
        Number of mismatching positions, between 0 and `suffix_len`. A non-positive
        `suffix_len` compares nothing and yields 0.
    """
    if suffix_len <= 0:
        # `s[-0:]` is the whole string, so without this guard a zero-length suffix
        # would compare the words from their beginnings instead of comparing nothing.
        return 0
    suffix1 = word1[-suffix_len:].rjust(suffix_len)
    suffix2 = word2[-suffix_len:].rjust(suffix_len)
    return sum(ch1 != ch2 for ch1, ch2 in zip(suffix1, suffix2))

def _template_usable(template):
    """Tell whether a template (a list of token lists) may be used as a poem skeleton.

    A template is rejected when any of its lines has fewer than two tokens or when
    its first token starts with an ASCII letter.
    """
    if any(len(line) < 2 for line in template):
        return False
    # Compare only the first character: testing the whole token with `in` is a
    # substring test against the alphabet and lets most Latin words slip through.
    return template[0][0][:1] not in string.ascii_letters


def generate_poem(seed_vec, poet_id):
    """Generate a poem by rewriting a random template of the given poet.

    Every eligible word of the template is replaced with a word that has the same
    form key (see `word_forms` / `new_forms`) and is closest to `seed_vec` according
    to `word2vec.distance`. For a line-final word (optionally followed by one
    punctuation token) only candidates with the minimal `sound_distance` to the
    original word are considered. Each replacement word is used at most once per poem.

    Args:
        seed_vec: topic vector handed to `word2vec.distance`.
        poet_id: identifier handed to `template_loader.get_random_template`.

    Returns:
        The poem as a single string: one capitalized line per template line, joined
        with newlines, with punctuation attached to the preceding word.
    """
    skip = {'мой', 'мою', 'моя', 'мое', 'мне', 'моё', 'мной', 'тот', 'для', 'вот', 'все'}

    # Pick a template based on a random poem from the corpus; retry until it is usable.
    template = template_loader.get_random_template(poet_id)
    while not _template_usable(template):
        template = template_loader.get_random_template(poet_id)

    poem = copy.deepcopy(template)
    used = set()

    # Replace the words of the template with words that are closer to the topic.
    for line in poem:
        llen = len(line) - 1
        for ti, token in enumerate(line):
            if not token.isalpha():
                continue

            word = token.lower()
            if len(word) < 3 or word[:3] == 'как' or word in skip:
                continue
            if word not in word_forms:
                continue
            form = word_forms[word]

            # Replacement candidates: words with the same form key, each paired with
            # its phonetic distance to the original word.
            candidate_phonetic_distances = [
                (replacement_word, sound_distance(replacement_word, word))
                for replacement_word in new_forms[form]
                if replacement_word not in skipwords
            ]
            if not candidate_phonetic_distances:
                continue

            if ti == llen or (ti == llen - 1 and line[ti + 1] in ',.?!-:;'):
                # Line-final word: keep only the phonetically closest candidates.
                min_phonetic_distance = min(d for w, d in candidate_phonetic_distances)
                replacement_candidates = [w for w, d in candidate_phonetic_distances
                                          if d == min_phonetic_distance and w not in used]
            else:
                replacement_candidates = [w for w, d in candidate_phonetic_distances
                                          if w not in used]

            # Among the candidates take the word closest to the topic.
            word2vec_distances = [
                (replacement_word, word2vec.distance(seed_vec, word2vec.word_vector(replacement_word)))
                for replacement_word in replacement_candidates
            ]
            if not word2vec_distances:
                continue
            word2vec_distances.sort(key=lambda pair: pair[1])
            new_word = word2vec_distances[0][0]

            if token != new_word:
                line[ti] = new_word
                used.add(new_word)

    # Assemble the resulting poem from the tokens.
    generated_poem = '\n'.join(' '.join(line).capitalize() for line in poem)
    generated_poem = generated_poem.replace(' ,', ',').replace(' .', '.')\
        .replace(' ?', '?').replace(' !', '!').replace(' -', '-').replace(' :', ':').replace(' ;', ';')
    return generated_poem

The history saving thread hit an unexpected error (OperationalError('database or disk is full',)).History will not be written to the database.


In [5]:
# Rebinds the global that generate_poem consults when filtering candidates: with an empty
# list nothing is excluded, so the multi-accent skip set built earlier no longer applies.
skipwords=[]

In [6]:
def poem_ok(poem):
    """Accept a poem only if it has 3 to 8 lines and no line is longer than 120 characters."""
    rows = poem.split('\n')
    if not 3 <= len(rows) <= 8:
        return False
    return all(len(row) <= 120 for row in rows)

In [7]:
# Poet identifiers the demo loop samples from; the chosen one is passed on as `poet_id`.
pids = ['pushkin', 'esenin', 'mayakovskij', 'blok', 'tyutchev']

In [8]:
# Line-profile a single generate_poem call.
# NOTE(review): the recorded run failed because the `%lprun` magic was not registered, and no
# earlier visible cell defines `seed_vec` or `poet_id` at notebook level — both must exist first.
%lprun -f generate_poem generate_poem(seed_vec, poet_id)

UsageError: Line magic function `%lprun` not found.


In [9]:
%%time
# Inputs for the demo: the free-text topic passed to best_poem, and an initial poet id
# (the demo loop below reassigns `poet_id` on every iteration).
seed = 'грузинский тост'
poet_id = 'tyutchev'

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.96 µs


In [13]:
import kenlm
import time
import multiprocessing

# KenLM language model that best_poem uses to score candidate poems.
model = kenlm.Model('data/lm.bin')


def worker(arg):
    """Generate one poem from a `(seed_vec, poet_id)` pair.

    The pool's map functions hand each task a single argument, so both values
    travel together in one tuple.
    """
    return generate_poem(arg[0], arg[1])


# Create the pool only after `worker` is defined: with the fork start method the
# child processes receive a snapshot of this namespace at creation time and resolve
# the task function by name, so a pool created earlier cannot find `worker` on a
# fresh kernel.
pool = multiprocessing.Pool(processes=4)

def best_poem(seed, poet_id, n_candidates=40, time_budget=4.0):
    """Generate several poems in the worker pool and return the best-scoring one.

    Each accepted candidate gets two scores: the language-model score of its
    lower-cased, punctuation-stripped text divided by the number of words, and the
    `word2vec.distance` between the seed text and the poem. The language-model
    scores are divided by their sum, added to the distances, and the poem with the
    smallest total wins. (Presumably the KenLM scores are negative log-probabilities,
    in which case a better score gives a smaller share — confirm against KenLM docs.)

    Args:
        seed: topic text; converted to a vector with `word2vec.text_vector`.
        poet_id: poet identifier forwarded to the poem generator.
        n_candidates: number of generation tasks submitted to the pool.
        time_budget: seconds after which no further candidates are collected.

    Returns:
        The text of the selected poem.

    Raises:
        ValueError: if no candidate passed `poem_ok`.
    """
    seed_vec = word2vec.text_vector(seed)
    seen = set()
    poems = []
    probs = []
    dists = []
    started = time.monotonic()
    tasks = [(seed_vec, poet_id) for _ in range(n_candidates)]
    for candidate in pool.imap_unordered(worker, tasks):
        if poem_ok(candidate) and candidate not in seen:
            text = candidate.replace('\n', ' ')
            for mark in ',.?!:;':
                text = text.replace(mark, '')
            text = text.lower()
            n_words = len(text.split())
            # A text without words cannot be normalised by its length; skip it.
            if n_words:
                probs.append(model.score(text, bos=True, eos=True) / n_words)
                dists.append(word2vec.distance(seed_vec, word2vec.text_vector(candidate)))
                poems.append(candidate)
                seen.add(candidate)
        # Checked for every candidate, so that rejected poems cannot bypass the budget.
        if time.monotonic() - started > time_budget:
            break
    print('poems', len(poems))
    if not poems:
        raise ValueError('no acceptable poem candidates were generated')
    probs = np.asarray(probs)
    probs /= np.sum(probs)
    dists = np.asarray(dists)
    return poems[np.argmin(probs + dists)]

In [16]:
%%time
# Demo: pick a random poet, generate the best poem for `seed`, and print it.
for i in tqdm_notebook(range(10)):
    poet_id = np.random.choice(pids)
    print(poet_id)
    p = best_poem(seed, poet_id)
    if not poem_ok(p):
        print('---BAD---')
        break
    print(p)
    # NOTE(review): unconditional break — only the first of the 10 iterations ever runs.
    break

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

blok
poems 21
Здравица, покойная калмычка!
Мне застолен каждый чей намек,
Хмельная грузинская шипучка
Всеми тостами грузинских строк!
Все вина как сливовицы сала,
Все слова как дружеская шаль!
Тостом откушанного бокала
Лезвее хмельную, остря в даль

CPU times: user 77.1 ms, sys: 63 µs, total: 77.2 ms
Wall time: 4.16 s
