## Библиотеки

In [None]:
# Mount Google Drive (Colab only) so that the paths under /content/drive used
# later in this notebook for the dataset and the output CSVs are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install pandas
import pandas

import time
import json
import re
!pip install tqdm
from tqdm import tqdm
from os import listdir

In [None]:
!pip install pymorphy3
from pymorphy3 import MorphAnalyzer
# Shared pymorphy3 analyzer; pymorphy_pos() reads this module-level object.
morph = MorphAnalyzer()

!pip install pymystem3
from pymystem3 import Mystem
# Shared Mystem instance; the extraction loop calls ms.lemmatize() and ms.analyze().
ms = Mystem()

## Оригинальный датасет

In [None]:
# Load the verse lines from Google Drive: tab-separated, with '&' as the quote character.
verse_data = pandas.read_csv(
    '/content/drive/MyDrive/diplom/syllab-tonic-lines.tsv',
    sep='\t',
    quotechar='&',
)

In [None]:
# Read the same file name from the current working directory; this rebinds
# `verse_data` loaded by the previous cell.
# NOTE(review): presumably the local (non-Colab) variant — run only one of the two loading cells.
verse_data = pandas.read_csv(
    'syllab-tonic-lines.tsv',
    sep='\t',
    quotechar='&',
)

In [None]:
# Preview the loaded dataset (columns: clausula, line).
verse_data

Unnamed: 0,clausula,line
0,Я5ж,За о̀пустѐвший сто̀л я вно̀вь садѝлся.
1,Я5м,"Тоску̀я, ду̀мал, ду̀мал о̀б одно̀м."
2,Я5ж,"В твоѐ окно̀ пото̀к черво̀нцев лѝлся,"
3,Я5м,ложѝлся на̀ пол зо̀лоты̀м пятно̀м...
4,Я5ж,"Каза̀лось мнѐ, что ты̀ придѐшь из са̀да"
...,...,...
2440702,Д3м,Нѐ услажда̀ют очѐй;
2440703,Д4д,"Ѝх нищета̀, их терпѐнье безмѐрное"
2440704,Д3м,То̀лько доса̀ду родѝт...
2440705,Д4д,"Что̀ же ты лю̀бишь, дитя̀ маловѐрное,"


## Препроцессинг

### Вспомогательные функции

In [None]:
# Russian vowel letters, lower case followed by upper case, one character per item.
vowels = list('аяоёуюыиэеАЯОЁУЮЫИЭЕ')


def has_vowels(word):
    """Tell whether `word` contains at least one vowel letter.

    Each character of `word` is checked against the module-level `vowels`
    list; the result is False for an empty string.
    """
    return any(letter in vowels for letter in word)


def count_vowels(word):
    """Return how many characters of `word` are vowel letters.

    Characters are compared with the module-level `vowels` list; every
    occurrence counts, so repeated vowels are counted repeatedly.
    """
    return sum(1 for letter in word if letter in vowels)

In [None]:
s = 'о̀'
# The stress ("ictus") marker is the second code point of the sample string `s`,
# i.e. the combining character that follows the base vowel.
# NOTE(review): presumably U+0300 COMBINING GRAVE ACCENT — this relies on the
# literal in `s` being stored in decomposed form; confirm before editing it.
ict = s[1]
print(ict)

̀


In [None]:
# Sanity check: list rows whose 'line' value is missing (the saved output shows none).
verse_data[verse_data['line'].isna()]

Unnamed: 0,clausula,line


In [None]:
def clean_punct(word):
    """Return `word` with every non-word character removed.

    "Non-word" is the regex class ``\\W``: punctuation and whitespace, and also
    combining marks such as the stress accent, so an accented word comes back
    without its accent.
    """
    non_word = re.compile(r'\W')
    return non_word.sub('', word)

def clean_sent(sent):
    """Collapse punctuation runs that touch a space into a single space.

    Two passes are made: first a run of punctuation/whitespace followed by a
    space, then a space followed by such a run; each match becomes one space.
    Punctuation that is not adjacent to a space (for example a final full stop
    at the very end of the string) is left untouched.

    Combining diacritics (U+0300-U+036F, which includes the grave accent used
    as the stress mark) are deliberately NOT treated as punctuation. A plain
    ``\\W`` matches them, which used to strip the stress mark from any word
    ending in a stressed vowel when it was followed by a space or punctuation.

    Parameters
    ----------
    sent : str
        A line of text, possibly carrying stress marks.

    Returns
    -------
    str
        The cleaned line with stress marks preserved.
    """
    # Non-word characters except combining diacritical marks.
    junk = r'[^\w\u0300-\u036f]+'
    sent = re.sub(junk + ' ', ' ', sent)
    sent = re.sub(' ' + junk, ' ', sent)
    return sent

In [None]:
clean_punct('пла̀чь:')

'плачь'

In [None]:
clean_sent('И -- нѐт')

'И нѐт'

### Фонетическая структура

In [None]:
def syll_type(word):
    """Classify `word` by its final character.

    Returns 'open' when the last character is in the module-level `vowels`
    list and 'closed' otherwise. `word` must be non-empty, since the last
    character is read with ``word[-1]``.
    """
    ends_in_vowel = word[-1] in vowels
    return 'open' if ends_in_vowel else 'closed'

### Морфологическая разметка

In [None]:
def pymorphy_pos(word):
    """Return the POS attribute of the first parse that `morph` gives for `word`.

    Uses the module-level `morph` analyzer; only the first element of
    ``morph.parse(word)`` is consulted.
    """
    first_parse = morph.parse(word)[0]
    return first_parse.tag.POS

In [None]:
def mystem_pos(word):
    """Extract the part-of-speech label from one analysed token.

    `word` is a mapping such as the items the extraction loop takes from
    ``ms.analyze(...)``. When it has no 'analysis' key, or the analysis list
    is empty, the string 'NA' is returned. Otherwise the 'gr' string of the
    first analysis is cut at the first '=' and then at the first ','; the
    leading piece is returned.
    """
    if 'analysis' not in word or len(word['analysis']) == 0:
        return 'NA'

    grammar = word['analysis'][0]['gr']
    before_equals = grammar.partition('=')[0]
    return before_equals.partition(',')[0]

## Создание таблицы

In [None]:
# Empty table with the output schema: one row per monosyllabic word occurrence.
mono_df = pandas.DataFrame(
    columns=[
        'word', 'ict', 'line_id', 'position',
        'clausula', 'POS_pm', 'POS_ms', 'syll_type',
    ]
)

In [None]:
# Display the (still empty) table to check its column layout.
mono_df

Unnamed: 0,word,ict,line_id,position,clausula,POS_pm,POS_ms,syll_type


In [None]:
# mono_df = pandas.read_csv('/content/drive/MyDrive/diplom/mono_df.csv', index_col=0, quotechar='&')
#mono_df = mono_df.drop([142707, 142708])

In [None]:
# Inspect the row at position 536982 — the offset from which the chunked
# processing loop starts (`a = 536982`).
verse_data.iloc[[536982]]

Unnamed: 0,clausula,line
536982,Аф4м,В себѐ заключа̀ют безмѐрный тала̀нт.


In [None]:
# Create a fresh Mystem instance, replacing the one built in the libraries section.
# NOTE(review): presumably re-run to restart mystem before the long loop — confirm or drop this cell.
ms = Mystem()

In [None]:
# Extract every one-vowel (monosyllabic) word from the verse lines, chunk by
# chunk, and write each chunk to its own CSV file on Drive.
CHUNK_SIZE = 10000
MONO_COLUMNS = ['word', 'ict', 'line_id', 'position', 'clausula', 'POS_pm', 'POS_ms', 'syll_type']

j = -1            # running row label; keeps increasing across chunks
error_lines = []  # indices of lines whose processing raised an exception

# Position of the first line to process. Output files are named after the
# chunk's start offset, so changing this value changes the file names.
a = 536982

# Process up to the end of the loaded dataset (2440707 lines in the original run).
total_lines = len(verse_data)

while a < total_lines:
    # Rows are collected in plain lists and turned into a DataFrame once per
    # chunk; growing a DataFrame with `.loc[j] = ...` copies it on every insert.
    rows = []
    row_labels = []

    for index, row in tqdm(verse_data.iloc[a:a + CHUNK_SIZE].iterrows()):
        try:
            # The accented line is lemmatized and the accent-stripped line is
            # analysed; tokens of both results are matched by position `i`.
            # NOTE(review): assumes both calls tokenize the line identically — confirm.
            ict_words = ms.lemmatize(row['line'])
            words = ms.analyze(row['line'].replace(ict, ''))

            for i, token in enumerate(words):
                if count_vowels(token['text']) != 1:
                    continue

                word = token['text'].lower()

                POS_pm = pymorphy_pos(word)
                POS_ms = mystem_pos(token)
                syll = syll_type(word)

                j += 1

                # True when the stress mark occurs in the token at the same position.
                stressed = ict in ict_words[i]
                rows.append([word, stressed, index, i, row['clausula'], POS_pm, POS_ms, syll])
                row_labels.append(j)

        except Exception as ex:
            # Best effort: report the failing line, remember its index, keep going.
            template = "Ошибка {0} в строчке номер {1}. Arguments:\n{2!r}"
            message = template.format(type(ex).__name__, index, ex.args)
            print(message)
            error_lines.append(index)

    mono_df = pandas.DataFrame(rows, columns=MONO_COLUMNS, index=row_labels)
    mono_df.to_csv(f'/content/drive/MyDrive/diplom/mono/mono_df_{a}.csv', quotechar='&')
    time.sleep(10)
    a += CHUNK_SIZE

10000it [00:54, 183.78it/s]
10000it [00:50, 196.64it/s]
10000it [00:51, 192.77it/s]
10000it [00:50, 197.59it/s]
10000it [00:51, 193.24it/s]
10000it [00:52, 188.93it/s]
10000it [00:58, 171.29it/s]
10000it [01:05, 153.09it/s]
10000it [00:43, 230.64it/s]
10000it [00:43, 227.83it/s]
10000it [00:45, 218.08it/s]
10000it [00:46, 215.60it/s]
10000it [00:38, 260.26it/s]
3725it [00:15, 237.43it/s]


In [None]:
folder = '/content/drive/MyDrive/diplom/mono'


def _chunk_sort_key(filename):
    """Sort key ordering chunk files by the number that precedes '.csv' in their name.

    Names without such a number are placed after all numbered ones, ordered by name.
    """
    match = re.search(r'(\d+)\.csv$', filename)
    if match:
        return (0, int(match.group(1)), filename)
    return (1, 0, filename)


# listdir() returns names in arbitrary order; sort them so the merged table
# has a deterministic row order that follows the chunk offsets.
chunk_files = sorted(listdir(folder), key=_chunk_sort_key)

# Read every chunk first and concatenate once: calling concat inside the loop
# re-copies the accumulated table for each file.
chunks = [
    pandas.read_csv(f'{folder}/{filename}', index_col=0, quotechar='&')
    for filename in tqdm(chunk_files)
]

# ignore_index=True discards the per-chunk row labels and renumbers from 0.
mono_df_all = pandas.concat(chunks, ignore_index=True, verify_integrity=True, sort=False)

100%|██████████| 194/194 [00:46<00:00,  4.17it/s]


In [None]:
# Preview the merged table of monosyllabic words.
mono_df_all

Unnamed: 0,word,ict,line_id,position,clausula,POS_pm,POS_ms,syll_type
0,за,False,0,0,Я5ж,PREP,PR,open
1,стол,True,0,4,Я5ж,NOUN,S,closed
2,я,False,0,6,Я5ж,NPRO,SPRO,open
3,вновь,True,0,8,Я5ж,ADVB,ADV,closed
4,об,True,1,6,Я5м,PREP,PR,closed
...,...,...,...,...,...,...,...,...
3694755,же,False,2440705,2,Д4д,PRCL,PART,open
3694756,ты,False,2440705,4,Д4д,NPRO,SPRO,open
3694757,где,True,2440706,0,Д3м,ADVB,ADVPRO,open
3694758,же,False,2440706,2,Д3м,PRCL,PART,open


In [None]:
# Save the merged table to Drive, using '&' as the quote character like the chunk files.
mono_df_all.to_csv(
    '/content/drive/MyDrive/diplom/mono_df_all.csv',
    quotechar='&',
)