In [1]:
import adagram

import re
import numpy as np
from string import punctuation
from pymorphy2 import MorphAnalyzer

from typing import List, Tuple, Dict, Iterator

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

import warnings
warnings.filterwarnings('ignore')

punctuation += '«»—…“”*№–'
stopwords = set(stopwords.words('russian'))

# Corpus reading and preprocessing

In [2]:
class Lemmatizer:
    def __init__(self):
        self.morph = MorphAnalyzer()
        self.cache = {}
    
    def lemmatize(self, token):
        if token in self.cache:
            return self.cache[token]
        norm = self.morph.parse(token)[0].normal_form
        self.cache[token] = norm
        return norm

lemmatizer = Lemmatizer()

In [3]:
def normalize(text: str) -> List[str]:
    output = []
    for token in word_tokenize(text.lower()):
        token = token.strip(punctuation)
        if not token:
            continue
            
        lemma = lemmatizer.lemmatize(token)
        if lemma in stopwords:
            continue
            
        output.append(lemma)
    
    return output

In [4]:
def read_corpus(path: str) -> Dict[int, str]:
    with open(path, 'r', encoding='utf-8') as f:
        xml = f.read()
    corpus = {}
    text_pattern = re.compile(r'(?<=<value name="text">).*?(?=</value>)', re.DOTALL)
    id_pattern = re.compile(r'(?<=<value name="id">).*?(?=</value>)', re.DOTALL)
    sentence_pattern = re.compile(r'(?<=<sentence>).*?(?=</sentence>)', re.DOTALL)
    for sentence in re.findall(sentence_pattern, xml):
        corpus[int(re.search(id_pattern, sentence).group())] = \
            re.search(text_pattern, sentence).group()
    return corpus

def read_markup(path: str) -> List[Tuple[int]]:
    with open(path, 'r', encoding='utf-8') as f:
        xml = f.read()
    markup = []
    id_1_pattern = re.compile(r'(?<=<value name="id_1">).*?(?=</value>)', re.DOTALL)
    id_2_pattern = re.compile(r'(?<=<value name="id_2">).*?(?=</value>)', re.DOTALL)
    class_pattern = re.compile(r'(?<=<value name="class">).*?(?=</value>)', re.DOTALL)
    paraphrase_pattern = re.compile(r'(?<=<paraphrase>).*?(?=</paraphrase>)', re.DOTALL)
    for paraphrase in re.findall(paraphrase_pattern, xml):
        markup.append((int(re.search(id_1_pattern, paraphrase).group()),
                       int(re.search(id_2_pattern, paraphrase).group()),
                       int(re.search(class_pattern, paraphrase).group())))
    return markup

In [5]:
corpus = read_corpus('paraphraser/corpus.xml')
markup = read_markup('paraphraser/paraphrases.xml')

In [6]:
corpus_norm = {}

for key, value in corpus.items():
    tokens = normalize(value)
    corpus_norm[key] = tokens

# Adagram text vectorization

In [7]:
vm = adagram.VectorModel.load('out.pkl')

In [8]:
def get_token_embedding(token: str, context: List[str]) -> np.ndarray:
    try:
        return vm.sense_vector(token, vm.disambiguate(token, context).argmax())
    except KeyError:
        return None

In [9]:
def get_windows(tokens: List[str], window_size: int = 5) -> Iterator[Tuple[str, List[str]]]:
    return (
        (token, tokens[max(0, i-window_size):i] + tokens[i+1:i+window_size+1])
        for i, token in enumerate(tokens)
    )

In [10]:
def vectorize_sentence(tokens: List[str]) -> np.ndarray:
    token_vectors = []
    
    for token, context in get_windows(tokens):
        token_embedding = get_token_embedding(token, context)
        
        if token_embedding is not None:
            token_vectors.append(token_embedding)
    
    return np.mean(token_vectors, axis=0) if token_vectors else np.zeros(100)

In [11]:
X, y = [], []

for idx1, idx2, label in markup:
    X.append(np.hstack(
        (vectorize_sentence(corpus_norm[idx1]), vectorize_sentence(corpus_norm[idx2]))
    ))
    y.append(label)

X = np.vstack(X)
y = np.array(y)

print(X.shape, y.shape, sep='\n')

(7227, 200)
(7227,)


# Logistic Regression

In [12]:
logreg = LogisticRegression()
scores = cross_val_score(logreg, X, y, cv=10, scoring='f1_micro')

In [13]:
np.mean(scores)

0.4219869725065538

It would appear that concatenating averaged Adagram vectors doesn't work quite so well. Tweaking the window size parameter doesn't seem to help, either.