# Training fastText

## Corpus parsing

In [1]:
# import requests
# from tqdm import tqdm
# from time import sleep
# from lxml import etree, html

In [2]:
# links = []

# for topic in [33, 37, 10, 13, 36, 12, 34, 39]:
#     for letter in 'абвгдезиклмнпростуфхцшщчуэюя':
#         for page_num in range(30):
#             r = requests.get(f'https://www.krugosvet.ru/taxonomy/term/{topic}/{letter}?page={page_num}')
#             if r.status_code != 200:
#                 continue
#             page = html.fromstring(r.text)
#             hrefs = page.xpath("//div[@class='article-teaser']/a/@href")
#             links += ['https://www.krugosvet.ru'+href for href in hrefs]
            
#     print(f'After topic: {topic}, links collected: {len(links)}.')

In [3]:
# texts = []

# for link in tqdm(links):
#     r = requests.get(link)
#     if r.status_code != 200:
#         continue
#     page = html.fromstring(r.text)
#     text = ''.join(page.xpath("//div[@id='article-content']/div[@class='body']//text()"))
#     texts.append(text)

In [4]:
# with open('corpus_hum.txt', 'w') as f:
#   for text in texts:
#       if len(text) > 10:
#           f.write(text + '\n')

## Corpus reading

In [5]:
from typing import List, Tuple, Dict

import re
import nltk
import numpy as np
from string import punctuation
from nltk import word_tokenize
from nltk.corpus import stopwords
from pymorphy2 import MorphAnalyzer

# Module-level pymorphy2 analyzer.
# NOTE(review): not referenced by the cells visible in this notebook
# (Lemmatizer creates its own analyzer) — confirm it is still needed.
morph = MorphAnalyzer()
# Extend the ASCII punctuation imported from `string` with extra quote, dash,
# ellipsis and symbol characters; used by normalize() to strip tokens.
punctuation += '«»—…“”*№–'

# NOTE: this rebinds `stopwords` from the imported NLTK corpus reader to a
# plain set of Russian stop words; `stopwords.words` is unavailable afterwards.
stopwords = set(stopwords.words('russian'))

In [6]:
class Lemmatizer:
    """Memoizing wrapper around a pymorphy2 ``MorphAnalyzer``.

    Every distinct token is analysed at most once; the resulting normal form
    is stored in ``self.cache`` and reused on subsequent calls.
    """

    def __init__(self):
        self.morph = MorphAnalyzer()
        self.cache = {}

    def lemmatize(self, token: str) -> str:
        """Return the normal form of ``token`` taken from its first parse.

        Args:
            token: a single word form.

        Returns:
            The ``normal_form`` of the first parse returned by the analyzer.
        """
        try:
            return self.cache[token]
        except KeyError:
            pass

        lemma = self.morph.parse(token)[0].normal_form
        self.cache[token] = lemma
        return lemma

# Shared Lemmatizer instance used by normalize(); its cache persists across calls.
lemmatizer = Lemmatizer()

def normalize(text: str) -> List[str]:
    """Tokenize, clean and lemmatize ``text``.

    The text is lower-cased and tokenized, each token is stripped of leading
    and trailing characters found in ``punctuation``, tokens that become empty
    are dropped, the rest are lemmatized, and lemmas present in ``stopwords``
    are removed.

    Args:
        text: raw input text.

    Returns:
        The remaining lemmas in their original order.
    """
    stripped = (raw.strip(punctuation) for raw in word_tokenize(text.lower()))
    lemmas = (lemmatizer.lemmatize(tok) for tok in stripped if tok)
    return [lemma for lemma in lemmas if lemma not in stopwords]

In [7]:
# with open('corpus_hum.txt', 'r') as f:
#     corpus = f.read().splitlines()

# corpus_norm = [normalize(text) for text in corpus]
# corpus_norm = [text for text in corpus_norm if text]

In [8]:
# corpus_norm[0][:10]

In [9]:
# len(corpus_norm)

## Training the fastText model

In [10]:
# import warnings
# warnings.filterwarnings('ignore')

# from gensim.models import FastText

# ft = FastText(corpus_norm, size=300, sg=1, negative=20, workers=4)
# ft2 = FastText(corpus_norm, size=300, sg=1, negative=5, workers=4)
# ft3 = FastText(corpus_norm, size=300, hs=1, workers=4)

In [11]:
# ft.save('ft/fasttext_hum.model')
# ft2.save('ft/fasttext_hum_2.model')
# ft3.save('ft/fasttext_hum_3.model')

# Downstream task: paraphrases

## Loading the trained models

In [12]:
from gensim.models import Word2Vec, FastText

# Load the previously saved models from disk: one word2vec model and three
# fastText models. According to the commented-out training cell above, the
# fastText variants differ in training objective: `ft` (negative=20),
# `ft2` (negative=5) and `ft3` (hs=1).
w2v = Word2Vec.load('w2v/word2vec_hum.model')
ft = FastText.load('ft/fasttext_hum.model')
ft2 = FastText.load('ft/fasttext_hum_2.model')
ft3 = FastText.load('ft/fasttext_hum_3.model')

In [13]:
from gensim.models import KeyedVectors

# Vectors loaded from a binary word2vec-format file. Later cells query this
# model with the `lemma_POS` tokens produced by normalize_mystem().
w2v2 = KeyedVectors.load_word2vec_format(
    'news_upos_skipgram_300_5_2019/model.bin', binary=True
)

## Paraphrase corpus parsing

In [14]:
def read_corpus(path: str) -> Dict[int, str]:
    """Read the sentences of a paraphrase corpus XML file.

    Every ``<sentence>...</sentence>`` element is expected to contain a
    ``<value name="id">`` and a ``<value name="text">`` child.

    Args:
        path: path to the UTF-8 encoded XML file.

    Returns:
        Mapping from sentence id to sentence text, with XML character
        references (``&amp;``, ``&quot;``, ``&#171;`` ...) decoded.

    Raises:
        ValueError: if a sentence lacks an id or a text value, or the id is
            not an integer.
    """
    # Local import keeps the name `html` out of the notebook namespace.
    from html import unescape

    with open(path, 'r', encoding='utf-8') as f:
        xml = f.read()

    sentence_pattern = re.compile(r'(?<=<sentence>).*?(?=</sentence>)', re.DOTALL)
    id_pattern = re.compile(r'(?<=<value name="id">).*?(?=</value>)', re.DOTALL)
    text_pattern = re.compile(r'(?<=<value name="text">).*?(?=</value>)', re.DOTALL)

    corpus = {}
    for sentence in sentence_pattern.findall(xml):
        id_match = id_pattern.search(sentence)
        text_match = text_pattern.search(sentence)
        if id_match is None or text_match is None:
            # Fail with a descriptive error instead of AttributeError on None.
            raise ValueError(
                f'<sentence> element without id or text value: {sentence[:80]!r}'
            )
        # The raw markup still contains escaped entities; decode them so the
        # tokenizer does not see fragments such as "&quot;".
        corpus[int(id_match.group())] = unescape(text_match.group())
    return corpus

def read_markup(path: str) -> List[Tuple[int, int, int]]:
    """Read paraphrase annotations from an XML file.

    Every ``<paraphrase>...</paraphrase>`` element is expected to contain
    ``<value name="id_1">``, ``<value name="id_2">`` and
    ``<value name="class">`` children holding integers.

    Args:
        path: path to the UTF-8 encoded XML file.

    Returns:
        A list of ``(id_1, id_2, class)`` triples in document order.

    Raises:
        ValueError: if a required value is missing or is not an integer.
    """
    def int_field(pattern, block: str, name: str) -> int:
        # Fail with a descriptive error instead of AttributeError on None.
        match = pattern.search(block)
        if match is None:
            raise ValueError(
                f'<paraphrase> element without "{name}" value: {block[:80]!r}'
            )
        return int(match.group())

    with open(path, 'r', encoding='utf-8') as f:
        xml = f.read()

    paraphrase_pattern = re.compile(r'(?<=<paraphrase>).*?(?=</paraphrase>)', re.DOTALL)
    id_1_pattern = re.compile(r'(?<=<value name="id_1">).*?(?=</value>)', re.DOTALL)
    id_2_pattern = re.compile(r'(?<=<value name="id_2">).*?(?=</value>)', re.DOTALL)
    class_pattern = re.compile(r'(?<=<value name="class">).*?(?=</value>)', re.DOTALL)

    return [
        (
            int_field(id_1_pattern, paraphrase, 'id_1'),
            int_field(id_2_pattern, paraphrase, 'id_2'),
            int_field(class_pattern, paraphrase, 'class'),
        )
        for paraphrase in paraphrase_pattern.findall(xml)
    ]

In [15]:
# corpus: sentence id -> sentence text; markup: (id_1, id_2, class) triples
# that refer to sentence ids.
corpus = read_corpus('paraphraser/corpus.xml')
markup = read_markup('paraphraser/paraphrases.xml')

In [16]:
# Lemmatize every sentence once (sentence id -> list of lemmas) ...
corpus_norm = {key: normalize(value) for key, value in corpus.items()}

# ... and collect the set of distinct lemmas seen across the whole corpus.
vocab = set()
for tokens in corpus_norm.values():
    vocab.update(tokens)

In [17]:
len(vocab)

10508

In [18]:
# Tag conversion table read from a two-column, whitespace-separated file:
# source POS tag -> target POS tag.
mapping = {}

# Explicit encoding, consistent with the other file reads in this notebook,
# so the result does not depend on the platform's locale.
with open('ru-rnc.map.txt', 'r', encoding='utf-8') as f:
    for line in f:
        fields = line.split()
        if not fields:
            # Tolerate blank lines (e.g. a trailing empty line) instead of
            # crashing on tuple unpacking.
            continue
        ms, ud = fields
        mapping[ms] = ud

In [19]:
mapping

{'A': 'ADJ',
 'ADV': 'ADV',
 'ADVPRO': 'ADV',
 'ANUM': 'ADJ',
 'APRO': 'DET',
 'COM': 'ADJ',
 'CONJ': 'SCONJ',
 'INTJ': 'INTJ',
 'NONLEX': 'X',
 'NUM': 'NUM',
 'PART': 'PART',
 'PR': 'ADP',
 'S': 'NOUN',
 'SPRO': 'PRON',
 'UNKN': 'X',
 'V': 'VERB'}

In [20]:
from pymystem3 import Mystem
# Shared Mystem analyzer instance used by normalize_mystem().
m = Mystem()

def normalize_mystem(text: str) -> List[str]:
    """Lemmatize and POS-tag ``text`` with Mystem, producing ``lemma_POS`` tokens.

    Items returned by ``m.analyze`` that have no ``'analysis'`` key are
    skipped. Items with an empty analysis keep their surface text as the
    lemma and get the tag ``UNKN``. Otherwise the first analysis supplies the
    lower-cased lemma and the leading tag of its grammar string. Lemmas found
    in ``stopwords`` are dropped, and the tag is converted through
    ``mapping``.

    Args:
        text: raw input text.

    Returns:
        Tokens of the form ``lemma_POS`` in their original order.
    """
    tokens = []

    for item in m.analyze(text):
        if 'analysis' not in item:
            continue

        analyses = item['analysis']
        if analyses:
            best = analyses[0]
            lemma = best['lex'].lower().strip()
            # The grammar string starts with the POS tag, followed by ',' or '='.
            pos = best['gr'].split(',')[0].split('=')[0].strip()
        else:
            lemma = item['text']
            pos = 'UNKN'

        if lemma in stopwords:
            continue

        # Fall back to 'X' for tags missing from the mapping file instead of
        # aborting the whole corpus pass with a KeyError.
        tokens.append(lemma + '_' + mapping.get(pos, 'X'))

    return tokens

In [21]:
# Mystem-based normalization of every sentence (sentence id -> lemma_POS tokens) ...
corpus_ms = {key: normalize_mystem(value) for key, value in corpus.items()}

# ... and the set of distinct lemma_POS tokens seen across the corpus.
vocab_ms = set()
for tokens in corpus_ms.values():
    vocab_ms.update(tokens)

In [22]:
len(vocab_ms)

9671

In [23]:
def _mean_known_vector(model, tokens) -> np.ndarray:
    """Average the embeddings of all ``tokens`` that ``model`` knows.

    Args:
        model: either a full model exposing its vectors as ``model.wv`` or a
            bare ``KeyedVectors``-like object supporting ``in`` and ``[]``.
        tokens: iterable of candidate tokens.

    Returns:
        The element-wise mean of the embeddings of the known tokens.

    Raises:
        ValueError: if none of the tokens is present in the model (the mean
            would otherwise silently become NaN).
    """
    # Bare KeyedVectors only expose `.wv` as a deprecated alias in old gensim
    # releases, so fall back to the object itself when it is missing.
    vectors = getattr(model, 'wv', model)
    known = [vectors[token] for token in tokens if token in vectors]
    if not known:
        raise ValueError('none of the given tokens are present in the model')
    return np.mean(known, axis=0)


# Stand-in vectors for out-of-vocabulary tokens: the mean embedding of the
# corpus vocabulary under each model.
w2v_oov = _mean_known_vector(w2v, vocab)
w2v2_oov = _mean_known_vector(w2v2, vocab_ms)

ft_oov = _mean_known_vector(ft, vocab)
ft2_oov = _mean_known_vector(ft2, vocab)
ft3_oov = _mean_known_vector(ft3, vocab)

  


In [24]:
def vectorize(corpus: List[List[str]], model, oov: np.ndarray) -> np.ndarray:
    """Turn tokenized sentences into averaged embedding vectors.

    Args:
        corpus: list of sentences, each a list of tokens.
        model: either a full model exposing its vectors as ``model.wv`` or a
            bare ``KeyedVectors``-like object supporting ``in`` and ``[]``.
        oov: vector used for tokens the model does not contain.

    Returns:
        A 2-D array with one row per NON-EMPTY sentence, in input order.
        Empty sentences are skipped, so the result can have fewer rows than
        ``corpus`` has sentences.

    Raises:
        ValueError: from ``np.vstack`` if every sentence is empty.
    """
    # Bare KeyedVectors only expose `.wv` as a deprecated alias in old gensim
    # releases, so fall back to the object itself when it is missing.
    vectors = getattr(model, 'wv', model)

    rows = []
    for sentence in corpus:
        if not sentence:
            continue
        embeddings = [vectors[token] if token in vectors else oov
                      for token in sentence]
        rows.append(np.average(embeddings, axis=0))

    return np.vstack(rows)

In [25]:
# Target vector: the class label is the third element of every markup triple.
y = np.array([triple[2] for triple in markup])
y.shape

(7227,)

## Matrix decomposition embeddings

In [26]:
from sklearn.feature_extraction.text import CountVectorizer

def dummy(x):
    """Identity function: return the argument unchanged."""
    return x

# The documents in corpus_norm are already lists of lemmas, so both the
# preprocessor and the tokenizer are replaced by the identity function.
cv = CountVectorizer(max_features=100000, preprocessor=dummy, tokenizer=dummy)
X_cv = cv.fit_transform(corpus_norm.values())

In [27]:
X_cv.shape

(12062, 10508)

In [28]:
from sklearn.decomposition import TruncatedSVD, NMF

# Fixed seed so the factorizations (and every score derived from them) are
# reproducible across kernel restarts.
SEED = 42

nmf = NMF(300, random_state=SEED)
nmf.fit(X_cv)

svd = TruncatedSVD(300, random_state=SEED)
svd.fit(X_cv)

# Column index <-> term lookup tables, taken straight from the fitted
# vectorizer's vocabulary (term -> column index in X_cv).
word2id = {word: int(idx) for word, idx in cv.vocabulary_.items()}
id2word = {idx: word for word, idx in word2id.items()}

# components_ has shape (n_components, n_terms); transpose it to get one
# 300-dimensional vector per term. Each name matches the model it comes from.
id2vec_nmf = nmf.components_.T
id2vec_svd = svd.components_.T

In [29]:
id2vec_svd.shape

(10508, 300)

In [30]:
id2vec_nmf.shape

(10508, 300)

## Similarity feature matrix creation

In [31]:
import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics.pairwise import cosine_similarity

def _cosine(vec_a: np.ndarray, vec_b: np.ndarray) -> float:
    """Cosine similarity of two 1-D vectors as a plain float."""
    # cosine_similarity returns a (1, 1) array; extract the single element
    # explicitly, since newer NumPy releases deprecate implicit conversion of
    # such arrays when they are assigned into a scalar slot.
    return float(cosine_similarity(vec_a.reshape(1, -1), vec_b.reshape(1, -1))[0, 0])


def _mean_term_vector(tokens, id2vec: np.ndarray) -> np.ndarray:
    """Average the rows of ``id2vec`` that correspond to ``tokens``."""
    return np.mean([id2vec[word2id[word]] for word in tokens], axis=0)


# Column layout of X (one cosine similarity per vectorization method):
#   0: w2v   1: w2v2 (Mystem tokens)   2: NMF   3: SVD   4: ft   5: ft2   6: ft3
EMBEDDING_COLUMNS = (
    (0, w2v, w2v_oov),
    (4, ft, ft_oov),
    (5, ft2, ft2_oov),
    (6, ft3, ft3_oov),
)
DECOMPOSITION_COLUMNS = (
    (2, id2vec_nmf),
    (3, id2vec_svd),
)

X = np.zeros((y.shape[0], 7))

for i, (sent1_id, sent2_id, _) in enumerate(markup):
    sent1, sent2 = corpus_norm[sent1_id], corpus_norm[sent2_id]

    # Embedding models that operate on the pymorphy2-lemmatized tokens.
    for col, model, oov in EMBEDDING_COLUMNS:
        vec1, vec2 = vectorize([sent1, sent2], model, oov)
        X[i, col] = _cosine(vec1, vec2)

    # Term vectors obtained from the count-matrix decompositions.
    for col, id2vec in DECOMPOSITION_COLUMNS:
        X[i, col] = _cosine(_mean_term_vector(sent1, id2vec),
                            _mean_term_vector(sent2, id2vec))

    # The w2v2 vectors are queried with the lemma_POS tokens from Mystem.
    sent1_ms, sent2_ms = corpus_ms[sent1_id], corpus_ms[sent2_id]
    vec1, vec2 = vectorize([sent1_ms, sent2_ms], w2v2, w2v2_oov)
    X[i, 1] = _cosine(vec1, vec2)

## Logistic regression #1

In [32]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Feature set #1: the four shared similarity columns (0-3) plus column 4,
# the similarity computed with the first fastText model.
logreg = LogisticRegression()
ft_columns = [0, 1, 2, 3, 4]
scores = cross_val_score(logreg, X[:, ft_columns], y, cv=10, scoring='f1_micro')

In [33]:
np.mean(scores)

0.5512249568469924

## Logistic regression #2

In [34]:
# Feature set #2: the four shared similarity columns (0-3) plus column 5,
# the similarity computed with the second fastText model.
ft2_columns = [0, 1, 2, 3, 5]
scores = cross_val_score(logreg, X[:, ft2_columns], y, cv=10, scoring='f1_micro')
scores.mean()

0.5508094427874269

## Logistic regression #3

In [35]:
# Feature set #3: the four shared similarity columns (0-3) plus column 6,
# the similarity computed with the third fastText model.
ft3_columns = [0, 1, 2, 3, 6]
scores = cross_val_score(logreg, X[:, ft3_columns], y, cv=10, scoring='f1_micro')
scores.mean()

0.570184730316728

We can conclude that using cosine similarities between the same pair of texts, each computed with one of 5 different vectorization algorithms, as features leads to a better F1 score than using concatenated means of word vectors. Of the three fastText models we trained, the third one, which uses hierarchical softmax, performs best (micro-F1 ≈ 0.570, compared with ≈ 0.551 for the other two). The two models trained with negative sampling perform roughly the same.