In [7]:
#!pip install razdel



You should consider upgrading via the 'c:\anaconda\python.exe -m pip install --upgrade pip' command.


In [1]:
import spacy
from razdel import tokenize, sentenize
import os
import pickle
import numpy as np
import gensim
import urllib.request
from gensim.models import Word2Vec, KeyedVectors
from gensim.models.callbacks import CallbackAny2Vec
import operator
import pymorphy2
import re
import nltk
from tqdm.auto import tqdm
from typing import List


# Russian stop-word list from NLTK; `preprocess` drops every token whose
# normal form appears in it.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to have been
# downloaded beforehand (nltk.download) — confirm for fresh environments.
stopwords = nltk.corpus.stopwords.words("russian")
# Single shared pymorphy2 analyzer, created once here and reused by
# `preprocess` for every token.
morph = pymorphy2.MorphAnalyzer()

In [2]:
#  Lookup table used by `preprocess`: it is indexed with `token_p.tag.POS`
#  and the value becomes the suffix of the emitted `lemma_TAG` string.
#  Keys appear to be pymorphy2 (OpenCorpora) tags, values UD-style tags.
#  NOTE(review): 'Apro', 'NUMB' and 'PNCT' do not look like values that
#  `tag.POS` returns, and 'PART': 'PRCL' looks like a reversed duplicate of
#  'PRCL': 'PART' — kept as-is, confirm whether they are needed.
pos_tag_ud = {
    #  adjectives (full and short forms)
    **dict.fromkeys(('ADJF', 'ADJS'), 'ADJ'),
    #  adverbs, comparatives, predicatives
    **dict.fromkeys(('ADVB', 'COMP', 'PRED'), 'ADV'),
    #  every verbal form collapses into VERB
    **dict.fromkeys(('VERB', 'GRND', 'INFN', 'PRTF', 'PRTS'), 'VERB'),
    #  nouns and pronouns
    'NOUN': 'NOUN',
    'NPRO': 'PRON',
    #  numerals
    **dict.fromkeys(('NUMR', 'NUMB'), 'NUM'),
    #  everything else, one tag each
    'Apro': 'DET',
    'CONJ': 'CCONJ',
    'INTJ': 'INTJ',
    'PART': 'PRCL',
    'PNCT': 'PUNCT',
    'PRCL': 'PART',
    'PREP': 'ADP',
}


def preprocess(sentence: str) -> list:
    """Convert one sentence into a list of ``lemma_POS`` strings.

    The sentence is lower-cased and tokenized with ``razdel.tokenize``.  Only
    tokens consisting entirely of Cyrillic letters are kept; any token that
    contains another character (digit, Latin letter, punctuation, hyphen) is
    discarded.  Each kept token is analysed with the module-level ``morph``
    analyzer (first parse only), its part of speech is translated through
    ``pos_tag_ud`` and the token is emitted as ``f'{normal_form}_{pos}'``
    unless its normal form is listed in ``stopwords``.

    Args:
        sentence: Raw sentence text.

    Returns:
        ``lemma_POS`` strings in token order; an empty list when no token
        survives the filters.
    """
    #  get tokens of russian words
    words = [
        tok.text
        for tok in tokenize(sentence.lower())
        if not re.search(r'[^а-яА-ЯёЁ]', tok.text)
    ]

    #  pos-tagging
    tagged = []
    for word in words:
        parsed = morph.parse(word)[0]
        lemma = parsed.normal_form

        # stop words are dropped, so there is no need to tag them at all
        if lemma in stopwords:
            continue

        if 'UNKN' in parsed.tag:
            pos = 'UNKN'
        else:
            # tag.POS is None for tags without a part of speech, and a POS
            # may be missing from the table; reuse the 'UNKN' placeholder
            # instead of raising KeyError in the middle of a long run.
            pos = pos_tag_ud.get(parsed.tag.POS, 'UNKN')

        tagged.append(f'{lemma}_{pos}')
    return tagged


def get_all_tokens(document_name: str) -> List[List]:
    """Read a UTF-8 text file and return its sentences as token lists.

    The file content is split into sentences with ``razdel.sentenize`` and
    every sentence is passed through ``preprocess``.  A ``tqdm`` progress bar
    tracks the sentence loop.

    Args:
        document_name: Path of the text file to read.

    Returns:
        One list per sentence, each holding whatever ``preprocess`` returned
        for that sentence (possibly an empty list).
    """
    #  load the whole document into memory
    with open(document_name, 'r', encoding='utf-8') as source:
        text = source.read()

    #  split into sentences first so the progress bar knows the total
    sentences = [chunk.text for chunk in sentenize(text)]

    #  pos-tagged tokens, one list per sentence
    return [preprocess(sentence) for sentence in tqdm(sentences)]

In [3]:
def preprocess_files(periods: list, side: str) -> None:
    """Preprocess every source file of each period into one corpus file.

    For each ``period`` the files in ``./data/{period}/{side}/`` are run
    through ``get_all_tokens`` and all resulting sentences are written to
    ``./processed/{side}_{period}.txt``, one sentence per line with tokens
    separated by single spaces.

    Args:
        periods: Period labels (e.g. years as strings) naming the
            sub-directories of ``./data/``.
        side: Name of the sub-directory inside each period; the original
            annotation lists ``'loyal'`` and ``'opp'`` as the intended values.

    Raises:
        FileNotFoundError: If ``./data/{period}/{side}/`` does not exist.
    """
    # Create the output directory here so the function does not depend on
    # another cell having been executed first.
    os.makedirs('./processed/', exist_ok=True)

    for period in periods:
        print(f'Processing files for year {period}')
        all_period_tokens = []
        dir_path = f'./data/{period}/{side}/'

        # os.listdir returns entries in arbitrary order; sorting makes the
        # output file reproducible between runs and machines.
        for filename in sorted(os.listdir(dir_path)):
            file_path = os.path.join(dir_path, filename)
            # Skip sub-directories (e.g. Jupyter's .ipynb_checkpoints), which
            # cannot be opened as text files.
            if not os.path.isfile(file_path):
                continue
            #  list of lists for every sentence
            all_period_tokens.extend(get_all_tokens(file_path))

        #  write all processed sentences in one file
        processed_filename = f'./processed/{side}_{period}.txt'
        with open(processed_filename, 'w', encoding='utf-8') as file:
            file.writelines(
                ' '.join(sentence) + '\n' for sentence in all_period_tokens
            )
             

In [5]:
#  periods to process; preprocess_files reads ./data/<period>/<side>/ for each
periods = ['2015', '2016', '2017', '2018', '2019']

#  directory for processed files; exist_ok keeps the cell safe to re-run
os.makedirs('./processed/', exist_ok=True)

preprocess_files(periods, side='loyal')

Processing files for year 2015


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=17511.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=905107.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=135972.0), HTML(value='')))


Processing files for year 2016


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=18122.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=132922.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=62733.0), HTML(value='')))


Processing files for year 2017


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=12883.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=208279.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=582474.0), HTML(value='')))


Processing files for year 2018


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=12445.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=445826.0), HTML(value='')))


Processing files for year 2019


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=60528.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=89704.0), HTML(value='')))


