In [1]:
import os
import re
import subprocess

from tqdm.auto import tqdm
#from tqdm import tqdm

In [2]:
# Russian vowel letters in both cases; check_line counts them to estimate line length.
vowlist = ['А', 'Е', 'Ё', 'И', 'О', 'У', 'Ы', 'Э', 'Ю', 'Я', 'а', 'е', 'ё', 'и', 'о', 'у', 'ы', 'э', 'ю', 'я']

# --- patterns used while preparing text for lemmatization ---
# Captures everything between 'author>' and '<review num' (DOTALL: may span lines).
poempattern = re.compile(r'author>(.+?)<review num', re.MULTILINE | re.DOTALL)
# Runs of whitespace and/or literal '|' characters
# (inside a character class the pipe is a plain character, not alternation).
tspattern = re.compile(r'[\t|\s]+')
# A complete HTML anchor, opening tag through closing tag (shortest match).
linkspattern = re.compile(r'<a href.+?</a>')

# --- patterns used while cleaning lemmatized text ---
# Anything that is not a word character, a digit, a tab or a hyphen.
notworddigittabhyph = re.compile(r'[^\w0-9\t-]')
underscore = re.compile(r'_')
# A run of one or more hyphens. The quantifier must be greedy: a lazy '-+?'
# matches a single hyphen at a time, which turns sub('-', ...) into a no-op.
multiplehyphens = re.compile(r'-+')
righthyphen = re.compile(r' -')   # hyphen preceded by a space
lefthyphen = re.compile(r'- ')    # hyphen followed by a space
hyphen = re.compile(r' - ')       # free-standing hyphen
multiplespace = re.compile(r' +')

# Alphabets used by check_line to classify characters.
cyr_non_ru = list('іґєїўІҐЄЇЎ')   # Cyrillic letters absent from the Russian alphabet
eng_alph = list('abcdefghijklmnopqrstuvwxyz')
ru_alph = list('абвгдеёжзийклмнопрстуфхцчшщьъыэюя')

In [3]:
def check_line(line, vowel_limit=17):
    """Validate one line of a poem and normalise its punctuation.

    Args:
        line: a single line of text (without a line break).
        vowel_limit: a line containing this many Russian vowels or more is
            considered too long and is rejected.

    Returns:
        ``None`` when the line is rejected, i.e. when it
          * contains a Cyrillic letter listed in ``cyr_non_ru``,
          * contains ``vowel_limit`` or more letters from ``vowlist``, or
          * contains Latin letters but no Russian letters at all.
        Otherwise the line with every run of characters other than word
        characters, ``’``, spaces and hyphens replaced by a single ``.``, and
        every run of hyphens collapsed to one ``-``. An empty line gives ``''``.
    """
    vowel_count = 0
    rus_count = 0
    latin_count = 0
    for symbol in line:
        if symbol in cyr_non_ru:
            return None
        lowered = symbol.lower()
        if lowered in eng_alph:
            latin_count += 1
        elif symbol in vowlist:
            # A vowel is a Russian letter too, so it also counts towards rus_count.
            rus_count += 1
            vowel_count += 1
            # Reject as soon as the limit is reached, regardless of what
            # (if anything) follows the last vowel.
            if vowel_count >= vowel_limit:
                return None
        elif lowered in ru_alph:
            rus_count += 1

    # Latin text with no Russian letters at all is not a Russian line.
    if latin_count > 0 and rus_count == 0:
        return None

    line = re.sub(r'[^\w\d’ -]+', '.', line)
    return re.sub(r'-+', '-', line)

In [4]:
# lemmatized text -> final cleaned output
def poemclean(lemmatized, outputfname):
    """Clean a lemmatized text file line by line and write the result.

    Args:
        lemmatized: path of the UTF-8 input file.
        outputfname: path of the UTF-8 output file (created or overwritten).

    Each input line goes through the module-level patterns in a fixed order,
    is stripped of leading/trailing spaces and written followed by a newline.
    """
    with open(lemmatized, 'r', encoding='utf-8') as src:
        raw_lines = src.read().splitlines()

    # (pattern, replacement) pairs; the order matters because later steps
    # rely on what earlier ones leave behind.
    steps = (
        (notworddigittabhyph, ' '),  # punctuation and other symbols -> space
        (underscore, ''),            # drop underscores
        (multiplehyphens, '-'),
        (righthyphen, ' '),          # hyphen preceded by a space
        (lefthyphen, ' '),           # hyphen followed by a space
        (multiplespace, ' '),        # squeeze runs of spaces
    )

    with open(outputfname, 'w+', encoding='utf-8') as sink:
        for raw in raw_lines:
            cooked = raw
            for pattern, replacement in steps:
                cooked = pattern.sub(replacement, cooked)
            sink.write(cooked.strip(' ') + '\n')

In [11]:
# Walk <yearpath>/<month>/<day>/<poem files>, drop lines and poems that fail the
# filters, and write one poem per line to 'filtered-<year>-<month>.txt' as
# year <TAB> month <TAB> day <TAB> poem text.
yearpath = r'D:\stihi\2018'

# Poems with more accepted lines than this are skipped.
MAX_POEM_LINES = 100

# Horizontal whitespace (everything in \s except '\n'); squeezing it also
# removes tabs, which keeps the tab-separated output well-formed.
inline_whitespace = re.compile(r'[ \f\r\t\v]+')
# NOTE(review): the trailing lazy '.+?' consumes exactly one character, so this
# removes only '[a] href' plus the character after it — confirm that is intended.
href_remnant = re.compile(r'a? href.+?')
# A bare URL up to and including the space that follows it. '\S+' (instead of a
# greedy '.+') stops at the end of the URL; '.+ ' would swallow everything up to
# the LAST space of the flattened poem, i.e. the rest of the poem.
bare_url = re.compile(r'http\S+ ')


def _strip_links(text):
    """Remove HTML anchors, leftover 'href' fragments and bare URLs from text."""
    text = linkspattern.sub('', text)
    text = href_remnant.sub('', text)
    return bare_url.sub(' ', text)


def _flatten_poem(poem):
    """Return the poem's accepted lines joined into one line, or None to skip it.

    A line is accepted when check_line returns a non-empty string for it. The
    poem is skipped when more than MAX_POEM_LINES lines are accepted, or when
    the normalised accepted lines contain no letter once links are removed.
    """
    text = inline_whitespace.sub(' ', poem).strip(' ')

    raw_lines = []      # accepted lines as they appear in the file
    checked_lines = []  # the same lines after check_line's normalisation
    for line in text.splitlines():
        checked = check_line(line)
        if checked:
            raw_lines.append(line)
            checked_lines.append(checked)

    if len(raw_lines) > MAX_POEM_LINES:
        return None

    # The normalised text only decides whether the poem has any letters left;
    # hyphen/underscore/space clean-up cannot change that, so it is not applied.
    shortlines = _strip_links(''.join(item + ' ' for item in checked_lines))
    if not any(char.isalpha() for char in shortlines):
        return None

    whole = ''.join(item + ' ' for item in raw_lines)
    return _strip_links(whole).strip(' ')


# Month folders are the sub-directories of yearpath with a two-character name.
monthpaths = [root for root, _, _ in os.walk(yearpath) if len(root) == len(yearpath) + 3]
for monthpath in monthpaths:
    month = os.path.basename(monthpath)
    year = os.path.basename(os.path.dirname(monthpath))
    # The first entry yielded by os.walk is monthpath itself; the rest are day folders.
    daypaths = [root for root, _, _ in os.walk(monthpath)][1:]
    forlem = f'filtered-{year}-{month}.txt'
    with open(forlem, 'w', encoding='utf-8') as output:
        for path in daypaths:
            day = os.path.basename(path)
            for root, _, files in os.walk(path):
                for poemfile in tqdm(files):
                    with open(os.path.join(root, poemfile), 'r', encoding='utf-8') as f:
                        poem = f.read()
                    # NOTE: per the original inline note, files from before 2018
                    # need the poem body extracted first:
                    #   poem = poempattern.search(poem).group(1)
                    whole = _flatten_poem(poem)
                    if whole is not None:
                        output.write(f'{year}\t{month}\t{day}\t{whole}\n')

  0%|          | 0/9470 [00:00<?, ?it/s]

  0%|          | 0/10970 [00:00<?, ?it/s]

  0%|          | 0/11285 [00:00<?, ?it/s]

  0%|          | 0/12076 [00:00<?, ?it/s]

  0%|          | 0/12240 [00:00<?, ?it/s]

  0%|          | 0/12443 [00:00<?, ?it/s]

  0%|          | 0/12363 [00:00<?, ?it/s]

  0%|          | 0/12840 [00:00<?, ?it/s]

  0%|          | 0/12491 [00:00<?, ?it/s]

  0%|          | 0/12812 [00:00<?, ?it/s]

  0%|          | 0/12293 [00:00<?, ?it/s]

  0%|          | 0/12112 [00:00<?, ?it/s]

  0%|          | 0/12514 [00:00<?, ?it/s]

  0%|          | 0/12754 [00:00<?, ?it/s]

  0%|          | 0/12642 [00:00<?, ?it/s]

  0%|          | 0/12442 [00:00<?, ?it/s]

  0%|          | 0/12608 [00:00<?, ?it/s]

  0%|          | 0/7709 [00:00<?, ?it/s]

  0%|          | 0/12588 [00:00<?, ?it/s]

  0%|          | 0/12906 [00:00<?, ?it/s]

  0%|          | 0/13298 [00:00<?, ?it/s]

  0%|          | 0/12993 [00:00<?, ?it/s]

  0%|          | 0/12869 [00:00<?, ?it/s]

  0%|          | 0/13078 [00:00<?, ?it/s]

  0%|          | 0/13369 [00:00<?, ?it/s]

  0%|          | 0/12586 [00:00<?, ?it/s]

  0%|          | 0/12928 [00:00<?, ?it/s]

  0%|          | 0/13507 [00:00<?, ?it/s]

  0%|          | 0/13077 [00:00<?, ?it/s]

  0%|          | 0/12565 [00:00<?, ?it/s]

  0%|          | 0/13119 [00:00<?, ?it/s]

  0%|          | 0/12366 [00:00<?, ?it/s]

  0%|          | 0/12347 [00:00<?, ?it/s]

  0%|          | 0/12835 [00:00<?, ?it/s]

  0%|          | 0/13248 [00:00<?, ?it/s]

  0%|          | 0/12696 [00:00<?, ?it/s]

  0%|          | 0/12453 [00:00<?, ?it/s]

  0%|          | 0/12424 [00:00<?, ?it/s]

  0%|          | 0/12394 [00:00<?, ?it/s]

  0%|          | 0/12462 [00:00<?, ?it/s]

  0%|          | 0/12582 [00:00<?, ?it/s]

  0%|          | 0/13000 [00:00<?, ?it/s]

  0%|          | 0/12557 [00:00<?, ?it/s]

  0%|          | 0/12447 [00:00<?, ?it/s]

  0%|          | 0/12039 [00:00<?, ?it/s]

  0%|          | 0/12126 [00:00<?, ?it/s]

  0%|          | 0/11697 [00:00<?, ?it/s]

  0%|          | 0/12108 [00:00<?, ?it/s]

  0%|          | 0/12901 [00:00<?, ?it/s]

  0%|          | 0/12337 [00:00<?, ?it/s]

  0%|          | 0/11978 [00:00<?, ?it/s]

  0%|          | 0/11504 [00:00<?, ?it/s]

  0%|          | 0/11877 [00:00<?, ?it/s]

  0%|          | 0/11970 [00:00<?, ?it/s]

  0%|          | 0/12304 [00:00<?, ?it/s]

  0%|          | 0/12778 [00:00<?, ?it/s]

  0%|          | 0/12227 [00:00<?, ?it/s]

  0%|          | 0/12283 [00:00<?, ?it/s]

  0%|          | 0/12375 [00:00<?, ?it/s]

  0%|          | 0/12397 [00:00<?, ?it/s]

  0%|          | 0/11943 [00:00<?, ?it/s]

  0%|          | 0/12678 [00:00<?, ?it/s]

  0%|          | 0/13285 [00:00<?, ?it/s]

  0%|          | 0/12416 [00:00<?, ?it/s]

  0%|          | 0/12183 [00:00<?, ?it/s]

  0%|          | 0/12595 [00:00<?, ?it/s]

  0%|          | 0/11874 [00:00<?, ?it/s]

  0%|          | 0/12083 [00:00<?, ?it/s]

  0%|          | 0/12349 [00:00<?, ?it/s]

  0%|          | 0/13082 [00:00<?, ?it/s]

  0%|          | 0/11928 [00:00<?, ?it/s]

  0%|          | 0/12584 [00:00<?, ?it/s]

  0%|          | 0/11896 [00:00<?, ?it/s]

  0%|          | 0/11895 [00:00<?, ?it/s]

  0%|          | 0/11878 [00:00<?, ?it/s]

  0%|          | 0/11886 [00:00<?, ?it/s]

  0%|          | 0/11907 [00:00<?, ?it/s]

  0%|          | 0/11948 [00:00<?, ?it/s]

  0%|          | 0/12041 [00:00<?, ?it/s]

  0%|          | 0/12520 [00:00<?, ?it/s]

  0%|          | 0/12152 [00:00<?, ?it/s]

  0%|          | 0/12253 [00:00<?, ?it/s]

  0%|          | 0/11905 [00:00<?, ?it/s]

  0%|          | 0/12444 [00:00<?, ?it/s]

  0%|          | 0/12178 [00:00<?, ?it/s]

  0%|          | 0/12434 [00:00<?, ?it/s]

  0%|          | 0/12138 [00:00<?, ?it/s]

  0%|          | 0/12109 [00:00<?, ?it/s]

  0%|          | 0/11885 [00:00<?, ?it/s]

  0%|          | 0/12020 [00:00<?, ?it/s]

  0%|          | 0/12801 [00:00<?, ?it/s]

  0%|          | 0/12108 [00:00<?, ?it/s]

  0%|          | 0/11713 [00:00<?, ?it/s]

  0%|          | 0/11494 [00:00<?, ?it/s]

  0%|          | 0/11536 [00:00<?, ?it/s]

  0%|          | 0/11251 [00:00<?, ?it/s]

  0%|          | 0/11387 [00:00<?, ?it/s]

  0%|          | 0/10954 [00:00<?, ?it/s]

  0%|          | 0/11168 [00:00<?, ?it/s]

  0%|          | 0/11031 [00:00<?, ?it/s]

  0%|          | 0/11023 [00:00<?, ?it/s]

  0%|          | 0/11434 [00:00<?, ?it/s]

  0%|          | 0/11138 [00:00<?, ?it/s]

  0%|          | 0/10739 [00:00<?, ?it/s]

  0%|          | 0/9880 [00:00<?, ?it/s]

  0%|          | 0/11063 [00:00<?, ?it/s]

  0%|          | 0/11139 [00:00<?, ?it/s]

  0%|          | 0/11192 [00:00<?, ?it/s]

  0%|          | 0/11166 [00:00<?, ?it/s]

  0%|          | 0/11302 [00:00<?, ?it/s]

  0%|          | 0/11170 [00:00<?, ?it/s]

  0%|          | 0/11623 [00:00<?, ?it/s]

  0%|          | 0/11316 [00:00<?, ?it/s]

  0%|          | 0/11167 [00:00<?, ?it/s]

  0%|          | 0/10855 [00:00<?, ?it/s]

  0%|          | 0/10952 [00:00<?, ?it/s]

  0%|          | 0/10718 [00:00<?, ?it/s]

  0%|          | 0/10534 [00:00<?, ?it/s]

  0%|          | 0/9974 [00:00<?, ?it/s]

  0%|          | 0/9967 [00:00<?, ?it/s]

  0%|          | 0/9939 [00:00<?, ?it/s]

  0%|          | 0/10056 [00:00<?, ?it/s]

  0%|          | 0/10068 [00:00<?, ?it/s]

  0%|          | 0/10380 [00:00<?, ?it/s]

  0%|          | 0/10137 [00:00<?, ?it/s]

  0%|          | 0/10207 [00:00<?, ?it/s]

  0%|          | 0/10364 [00:00<?, ?it/s]

  0%|          | 0/10483 [00:00<?, ?it/s]

  0%|          | 0/9527 [00:00<?, ?it/s]

  0%|          | 0/9920 [00:00<?, ?it/s]

  0%|          | 0/10118 [00:00<?, ?it/s]

  0%|          | 0/9481 [00:00<?, ?it/s]

  0%|          | 0/9213 [00:00<?, ?it/s]

  0%|          | 0/9767 [00:00<?, ?it/s]

  0%|          | 0/10090 [00:00<?, ?it/s]

  0%|          | 0/9861 [00:00<?, ?it/s]

  0%|          | 0/9873 [00:00<?, ?it/s]

  0%|          | 0/9811 [00:00<?, ?it/s]

  0%|          | 0/9653 [00:00<?, ?it/s]

  0%|          | 0/9772 [00:00<?, ?it/s]

  0%|          | 0/10149 [00:00<?, ?it/s]

  0%|          | 0/9752 [00:00<?, ?it/s]

  0%|          | 0/9834 [00:00<?, ?it/s]

  0%|          | 0/9819 [00:00<?, ?it/s]

  0%|          | 0/9731 [00:00<?, ?it/s]

  0%|          | 0/9427 [00:00<?, ?it/s]

  0%|          | 0/9403 [00:00<?, ?it/s]

  0%|          | 0/9956 [00:00<?, ?it/s]

  0%|          | 0/9961 [00:00<?, ?it/s]

  0%|          | 0/9838 [00:00<?, ?it/s]

  0%|          | 0/9833 [00:00<?, ?it/s]

  0%|          | 0/10053 [00:00<?, ?it/s]

  0%|          | 0/9438 [00:00<?, ?it/s]

  0%|          | 0/9650 [00:00<?, ?it/s]

  0%|          | 0/9728 [00:00<?, ?it/s]

  0%|          | 0/10016 [00:00<?, ?it/s]

  0%|          | 0/9964 [00:00<?, ?it/s]

  0%|          | 0/9649 [00:00<?, ?it/s]

  0%|          | 0/9556 [00:00<?, ?it/s]

  0%|          | 0/9105 [00:00<?, ?it/s]

  0%|          | 0/9260 [00:00<?, ?it/s]

  0%|          | 0/9106 [00:00<?, ?it/s]

  0%|          | 0/9287 [00:00<?, ?it/s]

  0%|          | 0/9495 [00:00<?, ?it/s]

  0%|          | 0/9370 [00:00<?, ?it/s]

  0%|          | 0/9002 [00:00<?, ?it/s]

  0%|          | 0/8778 [00:00<?, ?it/s]

  0%|          | 0/8925 [00:00<?, ?it/s]

  0%|          | 0/9138 [00:00<?, ?it/s]

  0%|          | 0/9357 [00:00<?, ?it/s]

  0%|          | 0/9189 [00:00<?, ?it/s]

  0%|          | 0/9517 [00:00<?, ?it/s]

  0%|          | 0/9445 [00:00<?, ?it/s]

  0%|          | 0/8750 [00:00<?, ?it/s]

  0%|          | 0/9203 [00:00<?, ?it/s]

  0%|          | 0/9420 [00:00<?, ?it/s]

  0%|          | 0/9453 [00:00<?, ?it/s]

  0%|          | 0/9169 [00:00<?, ?it/s]

  0%|          | 0/9073 [00:00<?, ?it/s]

  0%|          | 0/9045 [00:00<?, ?it/s]

  0%|          | 0/9205 [00:00<?, ?it/s]

  0%|          | 0/8979 [00:00<?, ?it/s]

  0%|          | 0/9418 [00:00<?, ?it/s]

  0%|          | 0/9181 [00:00<?, ?it/s]

  0%|          | 0/9360 [00:00<?, ?it/s]

  0%|          | 0/9326 [00:00<?, ?it/s]

  0%|          | 0/9137 [00:00<?, ?it/s]

  0%|          | 0/8547 [00:00<?, ?it/s]

  0%|          | 0/8864 [00:00<?, ?it/s]

  0%|          | 0/9303 [00:00<?, ?it/s]

  0%|          | 0/9113 [00:00<?, ?it/s]

  0%|          | 0/9093 [00:00<?, ?it/s]

  0%|          | 0/8920 [00:00<?, ?it/s]

  0%|          | 0/9178 [00:00<?, ?it/s]

  0%|          | 0/8586 [00:00<?, ?it/s]

  0%|          | 0/8668 [00:00<?, ?it/s]

  0%|          | 0/9259 [00:00<?, ?it/s]

  0%|          | 0/9044 [00:00<?, ?it/s]

  0%|          | 0/9230 [00:00<?, ?it/s]

  0%|          | 0/9389 [00:00<?, ?it/s]

  0%|          | 0/9159 [00:00<?, ?it/s]

  0%|          | 0/8920 [00:00<?, ?it/s]

  0%|          | 0/8974 [00:00<?, ?it/s]

  0%|          | 0/9590 [00:00<?, ?it/s]

  0%|          | 0/9280 [00:00<?, ?it/s]

  0%|          | 0/9361 [00:00<?, ?it/s]

  0%|          | 0/9146 [00:00<?, ?it/s]

  0%|          | 0/9041 [00:00<?, ?it/s]

  0%|          | 0/8735 [00:00<?, ?it/s]

  0%|          | 0/8789 [00:00<?, ?it/s]

  0%|          | 0/9194 [00:00<?, ?it/s]

  0%|          | 0/9256 [00:00<?, ?it/s]

  0%|          | 0/9228 [00:00<?, ?it/s]

  0%|          | 0/9319 [00:00<?, ?it/s]

  0%|          | 0/9392 [00:00<?, ?it/s]

  0%|          | 0/8916 [00:00<?, ?it/s]

  0%|          | 0/9022 [00:00<?, ?it/s]

  0%|          | 0/9209 [00:00<?, ?it/s]

  0%|          | 0/9369 [00:00<?, ?it/s]

  0%|          | 0/9201 [00:00<?, ?it/s]

  0%|          | 0/9215 [00:00<?, ?it/s]

  0%|          | 0/9163 [00:00<?, ?it/s]

  0%|          | 0/9010 [00:00<?, ?it/s]

  0%|          | 0/9178 [00:00<?, ?it/s]

  0%|          | 0/9400 [00:00<?, ?it/s]

  0%|          | 0/9169 [00:00<?, ?it/s]

  0%|          | 0/9653 [00:00<?, ?it/s]

  0%|          | 0/9130 [00:00<?, ?it/s]

  0%|          | 0/8842 [00:00<?, ?it/s]

  0%|          | 0/8671 [00:00<?, ?it/s]

  0%|          | 0/8967 [00:00<?, ?it/s]

  0%|          | 0/9634 [00:00<?, ?it/s]

  0%|          | 0/9798 [00:00<?, ?it/s]

  0%|          | 0/9920 [00:00<?, ?it/s]

  0%|          | 0/9666 [00:00<?, ?it/s]

  0%|          | 0/9577 [00:00<?, ?it/s]

  0%|          | 0/8615 [00:00<?, ?it/s]

  0%|          | 0/9090 [00:00<?, ?it/s]

  0%|          | 0/9539 [00:00<?, ?it/s]

  0%|          | 0/9231 [00:00<?, ?it/s]

  0%|          | 0/9547 [00:00<?, ?it/s]

  0%|          | 0/9696 [00:00<?, ?it/s]

  0%|          | 0/9759 [00:00<?, ?it/s]

  0%|          | 0/9282 [00:00<?, ?it/s]

  0%|          | 0/9117 [00:00<?, ?it/s]

  0%|          | 0/9166 [00:00<?, ?it/s]

  0%|          | 0/9039 [00:00<?, ?it/s]

  0%|          | 0/9320 [00:00<?, ?it/s]

  0%|          | 0/9318 [00:00<?, ?it/s]

  0%|          | 0/9381 [00:00<?, ?it/s]

  0%|          | 0/9275 [00:00<?, ?it/s]

  0%|          | 0/9421 [00:00<?, ?it/s]

  0%|          | 0/9144 [00:00<?, ?it/s]

  0%|          | 0/9872 [00:00<?, ?it/s]

  0%|          | 0/9485 [00:00<?, ?it/s]

  0%|          | 0/9977 [00:00<?, ?it/s]

  0%|          | 0/9540 [00:00<?, ?it/s]

  0%|          | 0/9077 [00:00<?, ?it/s]

  0%|          | 0/9846 [00:00<?, ?it/s]

  0%|          | 0/9787 [00:00<?, ?it/s]

  0%|          | 0/9823 [00:00<?, ?it/s]

  0%|          | 0/9893 [00:00<?, ?it/s]

  0%|          | 0/9500 [00:00<?, ?it/s]

  0%|          | 0/9867 [00:00<?, ?it/s]

  0%|          | 0/9491 [00:00<?, ?it/s]

  0%|          | 0/10033 [00:00<?, ?it/s]

  0%|          | 0/10186 [00:00<?, ?it/s]

  0%|          | 0/10156 [00:00<?, ?it/s]

  0%|          | 0/9764 [00:00<?, ?it/s]

  0%|          | 0/10015 [00:00<?, ?it/s]

  0%|          | 0/9955 [00:00<?, ?it/s]

  0%|          | 0/9549 [00:00<?, ?it/s]

  0%|          | 0/10274 [00:00<?, ?it/s]

  0%|          | 0/10225 [00:00<?, ?it/s]

  0%|          | 0/9727 [00:00<?, ?it/s]

  0%|          | 0/9890 [00:00<?, ?it/s]

  0%|          | 0/10096 [00:00<?, ?it/s]

  0%|          | 0/9891 [00:00<?, ?it/s]

  0%|          | 0/9721 [00:00<?, ?it/s]

  0%|          | 0/10275 [00:00<?, ?it/s]

  0%|          | 0/10105 [00:00<?, ?it/s]

  0%|          | 0/9935 [00:00<?, ?it/s]

  0%|          | 0/10001 [00:00<?, ?it/s]

  0%|          | 0/9603 [00:00<?, ?it/s]

  0%|          | 0/9853 [00:00<?, ?it/s]

  0%|          | 0/9954 [00:00<?, ?it/s]

  0%|          | 0/10168 [00:00<?, ?it/s]

  0%|          | 0/10257 [00:00<?, ?it/s]

  0%|          | 0/10167 [00:00<?, ?it/s]

  0%|          | 0/9594 [00:00<?, ?it/s]

  0%|          | 0/9989 [00:00<?, ?it/s]

  0%|          | 0/9807 [00:00<?, ?it/s]

  0%|          | 0/9924 [00:00<?, ?it/s]

  0%|          | 0/10579 [00:00<?, ?it/s]

  0%|          | 0/10386 [00:00<?, ?it/s]

  0%|          | 0/10553 [00:00<?, ?it/s]

  0%|          | 0/10026 [00:00<?, ?it/s]

  0%|          | 0/10503 [00:00<?, ?it/s]

  0%|          | 0/10478 [00:00<?, ?it/s]

  0%|          | 0/10093 [00:00<?, ?it/s]

  0%|          | 0/10314 [00:00<?, ?it/s]

  0%|          | 0/10273 [00:00<?, ?it/s]

  0%|          | 0/10106 [00:00<?, ?it/s]

  0%|          | 0/10234 [00:00<?, ?it/s]

  0%|          | 0/10085 [00:00<?, ?it/s]

  0%|          | 0/10154 [00:00<?, ?it/s]

  0%|          | 0/9957 [00:00<?, ?it/s]

  0%|          | 0/10170 [00:00<?, ?it/s]

  0%|          | 0/10361 [00:00<?, ?it/s]

  0%|          | 0/10252 [00:00<?, ?it/s]

  0%|          | 0/10521 [00:00<?, ?it/s]

  0%|          | 0/10339 [00:00<?, ?it/s]

  0%|          | 0/10223 [00:00<?, ?it/s]

  0%|          | 0/10379 [00:00<?, ?it/s]

  0%|          | 0/10726 [00:00<?, ?it/s]

  0%|          | 0/10501 [00:00<?, ?it/s]

  0%|          | 0/10277 [00:00<?, ?it/s]

  0%|          | 0/10100 [00:00<?, ?it/s]

  0%|          | 0/10327 [00:00<?, ?it/s]

  0%|          | 0/10041 [00:00<?, ?it/s]

  0%|          | 0/10297 [00:00<?, ?it/s]

  0%|          | 0/10847 [00:00<?, ?it/s]

  0%|          | 0/10728 [00:00<?, ?it/s]

  0%|          | 0/10527 [00:00<?, ?it/s]

  0%|          | 0/10273 [00:00<?, ?it/s]

  0%|          | 0/10371 [00:00<?, ?it/s]

  0%|          | 0/10353 [00:00<?, ?it/s]

  0%|          | 0/10610 [00:00<?, ?it/s]

  0%|          | 0/10745 [00:00<?, ?it/s]

  0%|          | 0/10599 [00:00<?, ?it/s]

  0%|          | 0/10883 [00:00<?, ?it/s]

  0%|          | 0/10834 [00:00<?, ?it/s]

  0%|          | 0/10310 [00:00<?, ?it/s]

  0%|          | 0/10440 [00:00<?, ?it/s]

  0%|          | 0/11104 [00:00<?, ?it/s]

  0%|          | 0/10868 [00:00<?, ?it/s]

  0%|          | 0/10487 [00:00<?, ?it/s]

  0%|          | 0/10447 [00:00<?, ?it/s]

  0%|          | 0/10791 [00:00<?, ?it/s]

  0%|          | 0/10293 [00:00<?, ?it/s]

  0%|          | 0/10253 [00:00<?, ?it/s]

  0%|          | 0/10460 [00:00<?, ?it/s]

  0%|          | 0/10642 [00:00<?, ?it/s]

  0%|          | 0/10561 [00:00<?, ?it/s]

  0%|          | 0/10225 [00:00<?, ?it/s]

  0%|          | 0/10125 [00:00<?, ?it/s]

  0%|          | 0/10194 [00:00<?, ?it/s]

  0%|          | 0/10123 [00:00<?, ?it/s]

  0%|          | 0/10345 [00:00<?, ?it/s]

  0%|          | 0/10598 [00:00<?, ?it/s]

  0%|          | 0/10436 [00:00<?, ?it/s]

  0%|          | 0/10234 [00:00<?, ?it/s]

  0%|          | 0/9948 [00:00<?, ?it/s]

  0%|          | 0/9906 [00:00<?, ?it/s]

  0%|          | 0/10316 [00:00<?, ?it/s]

  0%|          | 0/10051 [00:00<?, ?it/s]

  0%|          | 0/10348 [00:00<?, ?it/s]

  0%|          | 0/10158 [00:00<?, ?it/s]

  0%|          | 0/10498 [00:00<?, ?it/s]

  0%|          | 0/10022 [00:00<?, ?it/s]

  0%|          | 0/9993 [00:00<?, ?it/s]

  0%|          | 0/9757 [00:00<?, ?it/s]

  0%|          | 0/9957 [00:00<?, ?it/s]

  0%|          | 0/9909 [00:00<?, ?it/s]

  0%|          | 0/8902 [00:00<?, ?it/s]

In [None]:
# Lemmatize every 'filtered-<year>-<month>.txt' file in the working directory
# with mystem and post-process the result into '<year>-<month>.tsv'.
FILTERED_PREFIX = 'filtered-'

# Only the working directory is scanned: that is where the filtered files are
# written, and bare file names would be wrong for files found in sub-directories.
forlemtxts = sorted(
    fname for fname in os.listdir('.')
    if fname.startswith(FILTERED_PREFIX) and os.path.isfile(fname)
)
for filename in forlemtxts:
    # Drop the whole prefix, e.g. 'filtered-2018-01.txt' -> '2018-01.txt'.
    lemmatized = filename[len(FILTERED_PREFIX):]
    # check=True stops on a mystem failure instead of cleaning a missing or stale file.
    subprocess.run(['mystem.exe', filename, lemmatized, '-c', '-l', '-d'], check=True)
    poemclean(lemmatized, os.path.splitext(lemmatized)[0] + '.tsv')