In [1]:
import pandas as pd
import os
from collections import Counter
from lxml import etree
from tqdm import tqdm_notebook, tqdm
import gensim
from gensim.models import Word2Vec
import re
import xml.etree.cElementTree as ET
from pytorch_pretrained_bert import BertTokenizer, BertModel, BasicTokenizer
from xml.dom import minidom
import sklearn
from sklearn import metrics
import operator
import pymorphy2
import json
import copy
import nltk
import pickle
morph = pymorphy2.MorphAnalyzer()

In [2]:
"""
Loading dictionary with all monosemous nouns
"""
# NOTE: unpickling can execute arbitrary code, so this file must come from a trusted source.
with open(r'monosemous_words.pkl', 'rb') as pickle_file:
    mono_dict_all = pickle.load(pickle_file)

# Processing of a Corpus

In [None]:
"""
Opening file with raw corpus
"""
path_to_all_texts = r'Taiga_1billion\Taiga_1billion\taiga_all_proza_ru.txt'

# The whole raw corpus is read into memory as one string.
with open(path_to_all_texts, 'r', encoding='utf8') as raw_corpus_file:
    corpora = raw_corpus_file.read()

In [None]:
"""
Loading Stopwords
"""


# Read the stop-word file line by line (presumably one stop word per line).
with open(r'stopwords.txt', 'r', encoding='utf8') as stopword_file:
    stopwords = stopword_file.readlines()

# Skip blank lines and remove the newline characters around every remaining entry.
stopwords_rus = [line.strip('\n') for line in stopwords if line != '\n']


In [None]:
def preprocessing(text_file, path_to_save):
    """
    Split raw text into sentences, clean and lemmatize them, and save the result.

    Every sentence has all characters other than Cyrillic/Latin letters, hyphens
    and apostrophes replaced by spaces, is lower-cased, filtered against
    ``stopwords_rus`` and lemmatized with ``morph`` (first parse of each token).
    Each processed sentence is appended to ``path_to_save`` as one line of
    space-separated lemmas.

    Parameters
    ----------
    text_file : str
        Raw corpus text.
    path_to_save : str
        File the lemmatized sentences are appended to. It is opened in 'a'
        mode, so calling the function again adds to the existing content
        instead of replacing it.

    Returns
    -------
    list of list of str
        The lemmas of every sentence, in the order the sentences were found.
    """
    # 'Ё' and 'ё' lie outside the А-Я / а-я code-point ranges, so both have to be
    # listed explicitly; otherwise words containing them are split apart.
    non_word_chars = re.compile("[^А-Яа-яЁёA-Za-z-']+")
    # Build the lookup set once: membership tests on a list are linear per token.
    stopword_set = set(stopwords_rus)

    corpus = []

    sentences = nltk.sent_tokenize(text_file, language="russian")

    print('Number of Sentences: ', len(sentences))

    with open(path_to_save, 'a', encoding='utf8') as f:
        for sent in tqdm_notebook(sentences):
            brief_cleaning = non_word_chars.sub(' ', sent)
            lowered = (w.lower() for w in brief_cleaning.split())
            tokens = [w for w in lowered if w not in stopword_set]
            txt = [morph.parse(token)[0].normal_form.strip(' ') for token in tokens]

            f.write(' '.join(txt) + '\n')

            corpus.append(txt)

    return corpus

In [None]:
# Raw string literal: '\T' and '\p' are not valid escape sequences, so a plain
# literal raises an invalid-escape warning on current Python versions.
documents = preprocessing(corpora, r'Taiga_1billion\Taiga_1billion\proza_ru_preproc_split.txt')

In [None]:
"""
Saving the preprocessed corpus
"""
# NOTE(review): absolute, user-specific path; consider a path relative to the data directory.
path_to_preprocessed_corpus = r'C:/Users/Ангелина/Python_codes/processed_corpora_taiga_proza_ru.txt'

# One sentence per line. Without the newline the last word of a sentence is fused
# with the first word of the next one.
# The text is also kept in `processed_corpus`: the multiword-expression cells below
# search and rewrite a variable of that name, and no other visible cell defines it.
processed_corpus = ''.join(' '.join(doc) + '\n' for doc in documents)

with open(path_to_preprocessed_corpus, 'w', encoding='utf8') as f:
    f.write(processed_corpus)

## Processing multiword expressions in the corpus
### Whitespace inside frequent multiword expressions is replaced with the symbol "#"

In [None]:
"""
Creating dictionaries with words from RuWordNet Thesaurus
"""
senses_file_name = r'Synsets_xml\senses.N.xml'
synsets_file_name = r'Synsets_xml\synsets.N.xml'

senses_dict = {}  # sense id -> lower-cased name
synset_dict = {}  # lower-cased name -> id of the synset it belongs to

doc_N = etree.parse(senses_file_name)
root = doc_N.getroot()
for sense in root:
    sense_name = sense.attrib['name'].lower()
    # Only names present in the monosemous dictionary are kept
    if sense_name not in mono_dict_all:
        continue
    senses_dict[sense.attrib['id']] = sense_name
    synset_dict[sense_name] = sense.attrib['synset_id']

In [None]:
synset_words = {}  # synset id -> lower-cased names of all its entries
senses_file_name_N = r'Synsets_xml\senses.N.xml'
senses_file_name_V = r'Synsets_xml\senses.V.xml'
senses_file_name_A = r'Synsets_xml\senses.A.xml'

# Merge the entries of all three sense files into a single mapping
for senses_file in (senses_file_name_N, senses_file_name_V, senses_file_name_A):

    doc_N = etree.parse(senses_file)
    root = doc_N.getroot()
    for sense in root:
        # setdefault creates the list the first time a synset id is seen
        synset_words.setdefault(sense.attrib['synset_id'], []).append(sense.attrib['name'].lower())
    

In [None]:
"""
Creating a list with multiword keywords
"""

# A key counts as multiword when splitting it on whitespace yields more than one part
multiword_keys = [
    expression
    for expression in tqdm_notebook(synset_dict.keys())
    if len(expression.split()) > 1
]

In [None]:
"""
Getting lemmas from multiword keywords
"""

normal_multiword_keys = []
for expression in multiword_keys:
    # Lemmatize each part, taking the first parse returned by the analyzer
    lemmatized_parts = (morph.parse(part)[0].normal_form for part in expression.split())
    # Hyphens and commas inside a lemma are turned into spaces ...
    spaced = ' '.join(re.sub(r'[-,]+', ' ', lemma) for lemma in lemmatized_parts)
    # ... and every run of spaces is collapsed into a single one
    normal_multiword_keys.append(re.sub('( )+', ' ', spaced))

In [None]:
"""
Counting the number of occurrences of multiword expressions
"""
path_to_save_counts = r'Taiga_1billion\Taiga_1billion\multiword_expressions_counts.txt'
search_results = []  # one list of matches per expression that occurs in the corpus


def _save_search_results(results, path):
    """Overwrite ``path`` with every match stored in ``results``, one per line."""
    with open(path, 'w', encoding='utf8') as f:
        for matches in results:
            for match in matches:
                f.write(match + '\n')


# `processed_corpus` must already hold the preprocessed corpus as a single string.
for s, key in enumerate(tqdm_notebook(normal_multiword_keys)):

    # Checkpoint the matches collected so far every 500 expressions
    if s % 500 == 0:
        _save_search_results(search_results, path_to_save_counts)

    # The expression has to be delimited by a space, a newline or the corpus
    # boundary. Lookarounds are used so the delimiters are not consumed: two
    # occurrences sharing one separator are both counted, and the saved matches
    # contain the bare expression only. (The earlier character class `[ |\n]`
    # also accepted a literal '|', which was presumably unintended.)
    # re.escape keeps regex metacharacters inside a key from being interpreted.
    multiword_pat = re.compile(r'(?<![^ \n])' + re.escape(key.strip('\n')) + r'(?![^ \n])')

    found_res = multiword_pat.findall(processed_corpus)

    if found_res:
        search_results.append(found_res)

_save_search_results(search_results, path_to_save_counts)

# Converting counts to dictionary: expression -> number of occurrences
global_counter = {}

for matches in search_results:
    global_counter[matches[0].strip()] = len(matches)
# (expression, count) pairs, most frequent first
sorted_x = sorted(global_counter.items(), key=operator.itemgetter(1), reverse=True)

In [None]:
"""
Replacing whitespace inside frequent multiword expressions with the #-sign
"""

path_to_new_corpus = r'Taiga_1billion\Taiga_1billion\proza_ru_split_processed_multiwords.txt'


# Expressions are handled in the order of `sorted_x`. Once an expression has been
# rewritten with '#', a later expression that contains it with spaces no longer matches.
for num, key in enumerate(tqdm_notebook(sorted_x)):

    # Checkpoint the partially rewritten corpus every 50 expressions
    if num % 50 == 0:
        with open(path_to_new_corpus, 'w', encoding='utf8') as f:
            f.write(processed_corpus)

    # Build the replacement: hyphens/commas -> space, collapse spaces, then ' ' -> '#'
    keyword = key[0]

    keyword = re.sub(r'[-,]+', ' ', keyword)
    keyword = re.sub('( )+', ' ', keyword)
    keyword = keyword.replace(' ', '#')

    # The expression has to be delimited by a space, a newline or the corpus
    # boundary. Lookarounds leave the delimiters unconsumed, so two occurrences
    # that share one separator are both replaced. re.escape keeps regex
    # metacharacters inside the expression from being interpreted.
    multiword_pat = re.compile(r'(?<![^ \n])' + re.escape(key[0]) + r'(?![^ \n])')

    # A callable replacement inserts `keyword` verbatim (no template processing)
    processed_corpus = multiword_pat.sub(lambda _match: keyword, processed_corpus)

with open(path_to_new_corpus, 'w', encoding='utf8') as f:
    f.write(processed_corpus)