In [None]:
# Mount Google Drive at /content/drive (Colab only); the data files read
# later in this notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install ru-accent-poet
!pip install pandas

from ru_accent_poet import accent_line
import pandas
from tqdm import tqdm
tqdm.pandas()
import re

!pip install pymystem3
from pymystem3 import Mystem
ms = Mystem()

from sklearn.model_selection import train_test_split

In [None]:
# Smoke test of the accentuation tool on a sample sentence.
accent_line('Это инструмент для разметки ударений')
# Expected result (an apostrophe follows each stressed vowel):
# Э'то инструме'нт для разме'тки ударе'ний

"Э'то инструме'нт для разме'тки ударе'ний"

In [None]:
# Load the verse-line table: tab-separated, with '&' as the quote character.
# No index_col is passed, so rows keep pandas' default integer index.
verse_data = pandas.read_csv('/content/drive/MyDrive/diplom/syllab-tonic-lines.tsv',sep='\t', quotechar='&')

In [None]:
# Load df_alg.csv with its first column as the index ('&' is the quote character).
# This frame is what gets split into train/test below.
df = pandas.read_csv('/content/drive/MyDrive/diplom/df_alg.csv', index_col=0, quotechar='&')

In [None]:
# Load sum_data.csv with its first column as the index ('&' is the quote character).
# is_ict() filters this frame by syllable features to estimate a probability.
sum_data = pandas.read_csv('/content/drive/MyDrive/diplom/sum_data.csv', index_col=0, quotechar='&')

In [None]:
# Hold out 1000 rows of df as the test set; the fixed random_state makes the
# split reproducible between runs.
train, test = train_test_split(df, test_size=1000, random_state=156)

In [None]:
# Idealized meter templates: one character per syllable position, '1' where the
# meter expects a stress and '0' elsewhere. Each foot is repeated 100 times so
# a template is longer than any line it is compared with.
choree, iamb = '10' * 100, '01' * 100
dactyl, amphibrach, anapaest = '100' * 100, '010' * 100, '001' * 100

meters = [choree, iamb, dactyl, amphibrach, anapaest]
# Label prefixes for the templates above, in the same order
# (choree, iamb, dactyl, amphibrach, anapaest).
meters_name = ['Х', 'Я', 'Д', 'Аф', 'Ан']

In [None]:
# The literal below is two code points: a letter followed by the accent mark
# used in the corpus annotation. Keep only the mark (index 1); meter_match()
# strips it from lines before analysing them.
# NOTE(review): presumably U+0300 COMBINING GRAVE ACCENT — confirm.
s = 'о̀'
ict = s[1]
print(ict)

̀


In [None]:
def verse_scheme_base(line):
    """Turn a verse line into a string of syllable marks.

    The line is first passed through ``accent_line``. Every vowel letter that
    is immediately followed by an apostrophe becomes ``'1'``, every remaining
    vowel letter becomes ``'0'``, and all other characters are removed.

    Returns:
        str: the resulting sequence of ``'0'``/``'1'`` characters.
    """
    scheme = accent_line(line)
    substitutions = (
        (r'[аяоёуюыиэеАЯОЁУЮЫИЭЕ]\'', '1'),  # vowel + apostrophe -> 1
        (r'[аяоёуюыиэеАЯОЁУЮЫИЭЕ]', '0'),  # any vowel still left -> 0
        (r'[^01]', ''),  # drop everything that is not a digit mark
    )
    for pattern, replacement in substitutions:
        scheme = re.sub(pattern, replacement, scheme)
    return scheme

In [None]:
# Distinct line_id values that occur in the test split.
test_lines_num = test['line_id'].unique()

In [None]:
def meter_match(line, ans, function_name='verse_scheme_base'):
    """Check whether the best-fitting meter for ``line`` agrees with ``ans``.

    The annotation accent mark (``ict``) is stripped from the line, the line
    is converted to a syllable scheme by the selected function, and the scheme
    is compared position by position with every template in ``meters``. The
    template with the fewest mismatches wins; on a tie the one listed first
    in ``meters`` is taken.

    Args:
        line: verse line text.
        ans: gold label; it matches when it starts with the winning entry of
            ``meters_name``.
        function_name: ``'verse_scheme_base'`` or ``'verse_scheme_mono'``.

    Returns:
        bool: True if ``ans`` starts with the name of the best-fitting meter.

    Raises:
        ValueError: if ``function_name`` is not one of the two supported names
            (previously this surfaced as an UnboundLocalError on ``rhythm``).
    """
    line = re.sub(ict, '', line)
    if function_name == 'verse_scheme_base':
        rhythm = verse_scheme_base(line)
    elif function_name == 'verse_scheme_mono':
        rhythm = verse_scheme_mono(line)
    else:
        raise ValueError(
            "function_name must be 'verse_scheme_base' or 'verse_scheme_mono', "
            "got %r" % (function_name,)
        )

    # Templates are whole repetitions of a foot, so indexing modulo their
    # length continues the pattern instead of raising IndexError on a scheme
    # that is longer than the template.
    scores = [
        sum(1 for pos, mark in enumerate(rhythm) if mark != meter[pos % len(meter)])
        for meter in meters
    ]
    best_idx = scores.index(min(scores))  # first minimum == lowest index on ties
    return ans.startswith(meters_name[best_idx])

In [None]:
# One-column frame of the unique test line ids; the line text and gold label
# columns are added to it afterwards.
lines_test = pandas.DataFrame(test_lines_num, columns=['line_id'])

In [None]:
# Attach the verse text and the gold label for every test line id. A single
# label-based lookup per column replaces the row-by-row apply + zip.
# .to_numpy() drops verse_data's index so values are assigned positionally.
# NOTE(review): the 'meter' column is filled from verse_data['clausula'];
# presumably that label encodes meter, foot count and clausula — confirm.
lines_test['line'] = verse_data.loc[lines_test['line_id'], 'line'].to_numpy()
lines_test['meter'] = verse_data.loc[lines_test['line_id'], 'clausula'].to_numpy()

100%|██████████| 999/999 [00:00<00:00, 12090.40it/s]


In [None]:
# Inspect the assembled test frame: line_id, line text and gold label.
lines_test

Unnamed: 0,line_id,line,meter
0,1189335,"За то̀, что ду̀х твой твѐрд, как ка̀мень,",Я4ж
1,1315753,Руна̀ золото̀го и го̀лого тѐла.,Аф4ж
2,1536129,"Шлѝ на дно̀, на дно̀, на дно̀..",Х4м
3,161091,Вѝдел со̀н Мушкѐт:,Х3м
4,1516377,"там, в партѐре!»",Ан1ж
...,...,...,...
994,31499,Что̀ зима̀ ведѐт к веснѐ?,Х4м
995,2364502,"Напра̀сно! ты̀ была̀ в объя̀тия̀х моѝх,",Я6м
996,444457,Типѝчный бы̀л лиха̀ч.,Я3м
997,329978,Он смо̀трит в по̀желтѐвшиѐ очкѝ...,Я5м


In [None]:
# Score every test line with the default scheme function (verse_scheme_base).
# The recorded run took about four minutes for 999 lines.
lines_test['meter_match'] = lines_test.progress_apply(lambda x: meter_match(x.line, x.meter), axis=1)

100%|██████████| 999/999 [03:57<00:00,  4.22it/s]


In [None]:
# Number of lines whose predicted meter matched the label (True) or not (False).
print(lines_test.groupby(by=['meter_match'])['meter_match'].count())

meter_match
False    202
True     797
Name: meter_match, dtype: int64


In [None]:
# Accuracy of the baseline scheme: share of True values in the column.
# Computed from the data so it cannot drift from hand-copied counts.
round(float(lines_test['meter_match'].mean()), 2)

0.8

Точность определения метра без учёта односложных слов (моносиллабов): 0.8 (797 из 999 строк).

In [None]:
# Russian vowel letters in both cases; the helpers below treat each vowel
# letter as one syllable.
vowels = list('аяоёуюыиэеАЯОЁУЮЫИЭЕ')


def get_syll_type(word):
    """Return 'open' if ``word`` ends with a vowel letter, else 'closed'."""
    return 'open' if word[-1] in vowels else 'closed'


def count_vowels(word):
    """Return how many characters of ``word`` are vowel letters."""
    return sum(1 for letter in word if letter in vowels)


def count_syll(words, position=-1):
    """Count the vowel letters in the tokens that come before ``position``.

    ``words`` is a sequence of mappings with a 'text' entry. The texts of
    ``words[:position]`` are concatenated and their vowels counted, so with
    the default ``position=-1`` the last token is left out.
    """
    preceding_text = ''.join(token['text'] for token in words[:position])
    return count_vowels(preceding_text)


def get_syll_onset(word):
    """Return 'open' if ``word`` starts with a vowel letter, else 'closed'."""
    return 'open' if word[0] in vowels else 'closed'

In [None]:
# Quick check: the word starts with a consonant, so the onset is 'closed'.
get_syll_onset('за')

'closed'

In [None]:
def mystem_pos(word):
    """Return the part-of-speech tag of an analysed token, or 'NA'.

    ``word`` is a mapping as produced by ``Mystem.analyze``. When it has a
    non-empty 'analysis' list, the tag is the part of the first analysis'
    'gr' string that precedes the first '=' and, within that, the first ','.
    Otherwise 'NA' is returned.
    """
    if 'analysis' not in word or len(word['analysis']) == 0:
        return 'NA'
    grammar = word['analysis'][0]['gr']
    before_equals = grammar.partition('=')[0]
    return before_equals.partition(',')[0]

In [None]:
def neigh_data(words, position):
    """Describe the tokens two places left and right of ``words[position]``.

    The offset of 2 presumably skips the separator token that sits between
    two words in the analysed token list — TODO confirm against Mystem output.

    Args:
        words: sequence of mappings with a 'text' entry.
        position: index of the current token in ``words``.

    Returns:
        tuple: ``(left_neigh, left_syll, right_onset)`` where
            * ``left_neigh`` is True when the left token has exactly one vowel
              (False when ``position <= 1``),
            * ``left_syll`` is ``get_syll_type`` of the left token
              (None when ``position <= 1``),
            * ``right_onset`` is ``get_syll_onset`` of the right token
              (None when that lookup raises IndexError).
    """
    if position > 1:
        left = words[position - 2]['text']
        left_neigh = count_vowels(left) == 1
        left_syll = get_syll_type(left)
    else:
        left_neigh = False
        left_syll = None

    # The original also computed a `right_neigh` flag here that was never
    # returned or used; it is dropped. The IndexError handler still covers
    # both a missing token and an empty token text, exactly as before.
    try:
        right_onset = get_syll_onset(words[position + 2]['text'])
    except IndexError:
        right_onset = None

    return left_neigh, left_syll, right_onset

In [None]:
def is_ict(POS_ms, left_syll, syll_onset, syll_type, right_onset, left_neigh, last_syll, second_last_syll):
    """Estimate how likely a monosyllable with the given features carries an ictus.

    Fixed values are returned for the last (0.95) and second-to-last (0.07)
    syllable of a line. Otherwise ``sum_data`` is filtered on the six feature
    columns and the result depends on the number of matching rows:

    * none: 0.5;
    * one: 1.0 if that row's 'ict' value is True, else 0.0;
    * two: the 'word' values are read as (count without ictus, count with
      ictus) in row order and the rounded share of the second is returned.
      NOTE(review): this relies on the False row preceding the True row in
      sum_data — confirm.

    Any other number of matching rows makes the two-value unpacking raise
    ValueError.
    """
    if last_syll == True:
        return 0.95
    if second_last_syll == True:
        return 0.07

    feature_mask = (
        (sum_data['POS_ms'] == POS_ms) &
        (sum_data['left_syll'] == left_syll) &
        (sum_data['syll_onset'] == syll_onset) &
        (sum_data['syll_type'] == syll_type) &
        (sum_data['right_onset'] == right_onset) &
        (sum_data['left_neigh'] == left_neigh)
    )
    matches = sum_data[feature_mask]
    n_matches = len(matches)

    if n_matches == 0:
        return 0.5
    if n_matches == 1:
        return 1.0 if matches['ict'].values[0] == True else 0.0

    without_ict, with_ict = matches['word'].values
    return round(with_ict/(with_ict + without_ict), 2)

In [None]:
# NOTE(review): ms is already created in the import cell at the top of the
# notebook; this cell creates a fresh Mystem instance under the same name.
ms = Mystem()

In [None]:
def verse_scheme_mono(line):
    """Build the syllable scheme of ``line`` with extra marks on monosyllables.

    Every token of the Mystem analysis that contains exactly one vowel is
    described by its part of speech, its own onset/ending type, its
    neighbours and its distance from the end of the line. If ``is_ict``
    returns more than 0.5 for that description, the token's syllable index is
    remembered. The base scheme from ``verse_scheme_base`` is then taken and
    the remembered positions are set to ``'1'``.

    Returns:
        list[str]: one ``'0'``/``'1'`` entry per syllable (a list, not the
        string returned by ``verse_scheme_base``).
    """
    words = ms.analyze(line)
    # Loop-invariant, so computed once. With count_syll's default position=-1
    # the last token is not counted.
    total_syll = count_syll(words)

    # Syllable indices of the monosyllabic words that should receive an ictus.
    mono_ict_num = []
    for i, word_data in enumerate(words):
        if count_vowels(word_data['text']) != 1:
            continue

        word = word_data['text'].lower()
        POS_ms = mystem_pos(word_data)

        syll_onset = get_syll_onset(word)
        syll_type = get_syll_type(word)
        left_neigh, left_syll, right_onset = neigh_data(words, i)

        # Number of syllables before this word == its 0-based syllable index.
        syll_num = count_syll(words, i)
        num_reversed = total_syll - syll_num
        last_syll = (num_reversed == 1)
        second_last_syll = (num_reversed == 2)

        if is_ict(POS_ms, left_syll, syll_onset, syll_type, right_onset, left_neigh, last_syll, second_last_syll) > 0.5:
            mono_ict_num.append(syll_num)

    # Reuse the shared accent -> 0/1 conversion instead of repeating its
    # three regex substitutions here.
    scheme = list(verse_scheme_base(line))
    for i in mono_ict_num:
        scheme[i] = '1'

    return scheme

In [None]:
# Score every test line again, this time with the monosyllable-aware scheme
# (verse_scheme_mono). The recorded run took about four minutes for 999 lines.
lines_test['meter_match_mono'] = lines_test.progress_apply(lambda x: meter_match(x.line, x.meter, function_name='verse_scheme_mono'), axis=1)

100%|██████████| 999/999 [04:03<00:00,  4.10it/s]


In [None]:
# Number of lines whose meter was matched (True) or missed (False) when the
# monosyllable-aware scheme is used.
print(lines_test.groupby(by=['meter_match_mono'])['meter_match_mono'].count())

meter_match_mono
False     82
True     917
Name: meter_match_mono, dtype: int64


In [None]:
# Accuracy of the monosyllable-aware scheme: share of True values in the column.
# Computed from the data so it cannot drift from hand-copied counts.
round(float(lines_test['meter_match_mono'].mean()), 2)

0.92

## (Старое)

In [None]:
# Spot-check the first five test lines: print the accent-stripped line with
# its label, then the scheme produced by verse_scheme_mono.
# NOTE(review): the saved output shows two lists per line, so it appears to
# come from an earlier version of this cell — re-run to refresh.
for i in test_lines_num[:5]:
    line = verse_data.loc[i, 'line']
    ans = verse_data.loc[i, 'clausula']
    line = re.sub(ict, '', line)  # drop the annotation accent marks
    print(line, ans)
    rhythm = verse_scheme_mono(line)
    print(rhythm)

За то, что дух твой тверд, как камень, Я4ж
['0', '0', '0', '0', '0', '0', '0', '1', '0']
['0', '1', '0', '1', '0', '1', '0', '1', '0']
Руна золотого и голого тела. Аф4ж
['0', '1', '0', '0', '1', '0', '0', '1', '0', '0', '1', '0']
['0', '1', '0', '0', '1', '0', '0', '1', '0', '0', '1', '0']
Шли на дно, на дно, на дно.. Х4м
['0', '0', '0', '0', '0', '0', '0']
['0', '0', '1', '0', '1', '0', '1']
Видел сон Мушкет: Х3м
['1', '0', '0', '0', '1']
['1', '0', '1', '0', '1']
там, в партере!» Ан1ж
['0', '0', '1', '0']
['0', '0', '1', '0']
