# Training word2vec

## Corpus parsing

In [1]:
# import requests
# from tqdm import tqdm
# from time import sleep
# from lxml import etree, html

In [2]:
# links = []

# for topic in [33, 37, 10, 13, 36, 12, 34, 39]:
#     for letter in 'абвгдезиклмнпростуфхцшщчуэюя':
#         for page_num in range(30):
#             r = requests.get(f'https://www.krugosvet.ru/taxonomy/term/{topic}/{letter}?page={page_num}')
#             if r.status_code != 200:
#                 continue
#             page = html.fromstring(r.text)
#             hrefs = page.xpath("//div[@class='article-teaser']/a/@href")
#             links += ['https://www.krugosvet.ru'+href for href in hrefs]
            
#     print(f'After topic: {topic}, links collected: {len(links)}.')

In [3]:
# texts = []

# for link in tqdm(links):
#     r = requests.get(link)
#     if r.status_code != 200:
#         continue
#     page = html.fromstring(r.text)
#     text = ''.join(page.xpath("//div[@id='article-content']/div[@class='body']//text()"))
#     texts.append(text)

In [4]:
# with open('corpus_hum.txt', 'w') as f:
#   for text in texts:
#       if len(text) > 10:
#           f.write(text + '\n')

## Corpus reading

In [5]:
from typing import List, Tuple, Dict

import re
import nltk
import numpy as np
from string import punctuation
from nltk import word_tokenize
from nltk.corpus import stopwords
from pymorphy2 import MorphAnalyzer

morph = MorphAnalyzer()
punctuation += '«»—…“”*№–'

stopwords = set(stopwords.words('russian'))

In [6]:
class Lemmatizer:
    def __init__(self):
        self.morph = MorphAnalyzer()
        self.cache = {}
    
    def lemmatize(self, token: str) -> str:
        if token in self.cache:
            return self.cache[token]
        norm = self.morph.parse(token)[0].normal_form
        self.cache[token] = norm
        return norm

lemmatizer = Lemmatizer()

def normalize(text: str) -> List[str]:
    output = []
    for token in word_tokenize(text.lower()):
        token = token.strip(punctuation)
        if not token:
            continue
            
        lemma = lemmatizer.lemmatize(token)
        if lemma in stopwords:
            continue
            
        output.append(lemma)
    return output

In [3]:
with open('corpus_hum.txt', 'r') as f:
    corpus = f.read().splitlines()

corpus_norm = [normalize(text) for text in corpus]
corpus_norm = [text for text in corpus_norm if text]

In [4]:
corpus_norm[0][:10]

['абай',
 'василий',
 'васо',
 'иван',
 '1900–2001',
 'русский',
 'лингвист',
 'родиться',
 '2',
 '15']

In [5]:
len(corpus_norm)

115075

## Training the word2vec model

In [6]:
# from gensim.models import Word2Vec

# w2v = Word2Vec(corpus_norm, size=300, sg=1, negative=20, workers=4)

In [7]:
# w2v.save('w2v/word2vec_hum.model')

In [8]:
from gensim.models import Word2Vec

w2v = Word2Vec.load('w2v/word2vec_hum.model')

# Downstream task: paraphrases

## Corpus parsing

In [7]:
def read_corpus(path: str) -> Dict[int, str]:
    with open(path, 'r', encoding='utf-8') as f:
        xml = f.read()
    corpus = {}
    text_pattern = re.compile(r'(?<=<value name="text">).*?(?=</value>)', re.DOTALL)
    id_pattern = re.compile(r'(?<=<value name="id">).*?(?=</value>)', re.DOTALL)
    sentence_pattern = re.compile(r'(?<=<sentence>).*?(?=</sentence>)', re.DOTALL)
    for sentence in re.findall(sentence_pattern, xml):
        corpus[int(re.search(id_pattern, sentence).group())] = \
            re.search(text_pattern, sentence).group()
    return corpus

def read_markup(path: str) -> List[Tuple[int]]:
    with open(path, 'r', encoding='utf-8') as f:
        xml = f.read()
    markup = []
    id_1_pattern = re.compile(r'(?<=<value name="id_1">).*?(?=</value>)', re.DOTALL)
    id_2_pattern = re.compile(r'(?<=<value name="id_2">).*?(?=</value>)', re.DOTALL)
    class_pattern = re.compile(r'(?<=<value name="class">).*?(?=</value>)', re.DOTALL)
    paraphrase_pattern = re.compile(r'(?<=<paraphrase>).*?(?=</paraphrase>)', re.DOTALL)
    for paraphrase in re.findall(paraphrase_pattern, xml):
        markup.append((int(re.search(id_1_pattern, paraphrase).group()),
                       int(re.search(id_2_pattern, paraphrase).group()),
                       int(re.search(class_pattern, paraphrase).group())))
    return markup

In [8]:
corpus = read_corpus('paraphraser/corpus.xml')
markup = read_markup('paraphraser/paraphrases.xml')

In [11]:
vocab = set()
corpus_norm = {}

for key, value in corpus.items():
    tokens = normalize(value)
    corpus_norm[key] = tokens
    for token in tokens:
        vocab.add(token)

In [12]:
len(vocab)

10509

In [13]:
oov = np.mean([w2v.wv[token] for token in vocab if token in w2v.wv], axis=0)
oov.shape

(300,)

In [14]:
# from sklearn.feature_extraction.text import TfidfVectorizer

# def dummy(x):
#     return x

# tfidf_vectorizer = TfidfVectorizer(analyzer='word', preprocessor=dummy,
#                                    tokenizer=dummy, ngram_range=(1, 1),
#                                    max_df = 1.0, min_df = 0)
# tfidf_vectorizer.fit(corpus_norm.values())

# token2idf = {}
# for i, token in enumerate(tfidf_vectorizer.get_feature_names()):
#     token2idf[token] = tfidf_vectorizer.idf_[i]

In [9]:
# from scipy.special import softmax

def vectorize(corpus: List[List[str]]) -> np.ndarray:
    output = []
    
    for sentence in corpus:
        if not sentence:
            continue
            
        idfs = []
        output.append([])
        for token in sentence:
            # idfs.append(token2idf[token])
            embedding = w2v.wv[token] if token in w2v.wv else oov
            output[-1].append(embedding)
        output[-1] = np.average(output[-1], axis=0) #, weights=softmax(idfs))
        
    return np.vstack(output)

P. S. I decided to not use the tf-idf weights after all because it made the f-score worse for some reason.

In [16]:
X = np.hstack((
    vectorize([corpus_norm[x[0]] for x in markup]),
    vectorize([corpus_norm[x[1]] for x in markup])
))
X.shape

(7227, 600)

In [17]:
y = np.array([x[2] for x in markup])
y.shape

(7227,)

## Logistic regression

In [18]:
import warnings
warnings.filterwarnings('ignore')

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

logreg = LogisticRegression()
scores = cross_val_score(logreg, X, y, cv=10, scoring='f1_micro')

In [19]:
np.mean(scores)

0.4374850983363685

# External pretrained embeddings for paraphrase identification

In [10]:
from gensim.models import KeyedVectors

w2v = KeyedVectors.load_word2vec_format(
    'news_upos_skipgram_300_5_2019/model.bin', binary=True
)

In [11]:
mapping = {}

with open('ru-rnc.map.txt', 'r') as f:
    for line in f:
        ms, ud = line.strip('\n').split()
        mapping[ms] = ud

In [12]:
mapping

{'A': 'ADJ',
 'ADV': 'ADV',
 'ADVPRO': 'ADV',
 'ANUM': 'ADJ',
 'APRO': 'DET',
 'COM': 'ADJ',
 'CONJ': 'SCONJ',
 'INTJ': 'INTJ',
 'NONLEX': 'X',
 'NUM': 'NUM',
 'PART': 'PART',
 'PR': 'ADP',
 'S': 'NOUN',
 'SPRO': 'PRON',
 'UNKN': 'X',
 'V': 'VERB'}

In [13]:
from pymystem3 import Mystem
m = Mystem()

def normalize_mystem(text: str) -> List[str]:
    tokens = []
    norm_words = m.analyze(text)
    
    for norm_word in norm_words:
        if 'analysis' not in norm_word or not norm_word['analysis']:
            continue
        lemma = norm_word['analysis'][0]['lex'].lower().strip()
        if lemma in stopwords:
            continue
        pos = norm_word['analysis'][0]['gr'].split(',')[0].split('=')[0].strip()
        pos = mapping[pos]
        tokens.append(lemma + '_' + pos)
        
    return tokens

In [14]:
vocab = set()
corpus_norm = {}

for key, value in corpus.items():
    tokens = normalize_mystem(value)
    corpus_norm[key] = tokens
    for token in tokens:
        vocab.add(token)

In [15]:
len(vocab)

9211

In [16]:
import warnings
warnings.filterwarnings('ignore')

oov = np.mean([w2v.wv[token] for token in vocab if token in w2v.wv], axis=0)
oov.shape

(300,)

In [17]:
X = np.hstack((
    vectorize([corpus_norm[x[0]] for x in markup]),
    vectorize([corpus_norm[x[1]] for x in markup])
))
X.shape

(7227, 600)

In [18]:
y = np.array([x[2] for x in markup])
y.shape

(7227,)

In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

logreg = LogisticRegression()
scores = cross_val_score(logreg, X, y, cv=10, scoring='f1_micro')

In [20]:
np.mean(scores)

0.4236451618490908

We can see from the results that the heavier embeddings precomputed on a large corpus work, surprisingly, a little worse on this downstream task, but the difference is not dramatic. The embeddings are trained on news data, and in the downstream task, the examples are drawn from news as well, while our own embeddings are trained on humanitarian articles. It is difficult to say why our embeddings perform better on this task compared to the external embeddings — perhaps it has something to do with the choice of the hyperparameters?

Nevertheless, it is clear that a difficult semantic task like this one should be solved with more complex architectures: neural networks. Logistic regression on top of the means of word embeddings just isn't good enough.