#### Для Colab

In [None]:
# Fetch the NLTK resources used further down: the stop-word lists, WordNet
# (backing WordNetLemmatizer) and the Punkt sentence-tokenizer models.
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
%pip install emoji

Collecting emoji
  Downloading emoji-2.12.1-py3-none-any.whl (431 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/431.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━[0m [32m399.4/431.4 kB[0m [31m12.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m431.4/431.4 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: emoji
Successfully installed emoji-2.12.1


#### Подключение библиотек

In [None]:
import pandas as pd
import numpy as np

from time import time
from tqdm import tqdm
from gensim.models import Word2Vec
from emoji import is_emoji
from string import punctuation
from multiprocessing import cpu_count
from functools import lru_cache

from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from nltk.tokenize import sent_tokenize

from gensim.models.word2vec import Word2Vec

from sklearn.cluster import MiniBatchKMeans
from sklearn.model_selection import train_test_split

import torch
from torch import nn
from torch.nn.parameter import Parameter

## Обработка датасета

In [None]:
# Load the raw reviews; the first CSV column is used as the index.
df = pd.read_csv('train_reviews.csv', index_col=0)
df.head()

Unnamed: 0,Reviews
0,"Я ждал, когда пройду 80% курса, чтобы написать..."
1,"Пока что трудно для понимания, так как я абсол..."
2,Очень интересный курс. 4 звезды из 5 поставила...
3,"Понравилась структура курса, заострение вниман..."
4,Неплохой курс для начала изучения С/С++. Лекто...


In [None]:
# Remove rows with missing values (mutates `df` in place).
df.dropna(inplace=True)

In [None]:
def rem_emoji(s: str):
    """
    Strip every character that `is_emoji` recognises from a string.

    The check is done one character at a time, so only single-code-point
    emoji are removed; all other characters are kept in their original order.

    :param s: text to clean
    :return: the cleaned string, or ``np.nan`` when nothing is left
        (so that the row can be removed later with ``dropna``)
    """
    # Single filtering pass; rebuilding the string on every removed character
    # (as slicing does) is quadratic in the length of the text.
    cleaned = ''.join(ch for ch in s if not is_emoji(ch))
    if not cleaned:
        return np.nan
    return cleaned

# NOTE: this rebinds `df` from a DataFrame to the cleaned 'Reviews' Series,
# so the cell cannot be re-run without re-loading the CSV first.
df = df['Reviews'].apply(rem_emoji)

In [None]:
# Reviews that consisted only of emoji were turned into NaN by rem_emoji;
# drop those, then drop exact duplicate reviews (both in place).
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

In [None]:
# Shared text-processing helpers: a lower-casing tokenizer, a WordNet
# lemmatizer and the set of Russian stop words.
# NOTE(review): WordNetLemmatizer is backed by the English WordNet, so it
# presumably leaves Russian tokens unchanged -- confirm whether a Russian
# morphological analyser is needed here.
tokenizer = TweetTokenizer(preserve_case=False)
lemmatizer = WordNetLemmatizer()
stops = set(stopwords.words("russian"))

In [None]:
@lru_cache(maxsize=None)
def lemmatize(w: str):
    """
    Return the lemma of a single word using the module-level `lemmatizer`.

    Results are memoised without a size limit (``maxsize=None``), so every
    distinct word is passed to the underlying lemmatizer only once.

    NOTE(review): `lemmatizer` is a WordNetLemmatizer, which is built on the
    English WordNet; Russian words are presumably returned unchanged --
    confirm whether that is intended.

    :param w: the word to lemmatize
    :return: whatever ``lemmatizer.lemmatize(w)`` returns for that word
    """
    return lemmatizer.lemmatize(w)

In [None]:
# Turn every review into preprocessed sentences: split into sentences,
# tokenize, drop stop words and non-alphabetic tokens, lemmatize, and store
# each sentence as one space-joined string.
newdf = []

for text in tqdm(df):
    # The reviews are in Russian, so use the Russian Punkt model instead of
    # the default English one (which mis-handles Russian abbreviations).
    sentences = sent_tokenize(text, language="russian")

    for sentence in sentences:
        tokens = tokenizer.tokenize(sentence)

        # keep purely alphabetic tokens that are not stop words
        # (this also discards numbers, punctuation and mixed tokens)
        lemmas = [lemmatize(word) for word in tokens
                  if word not in stops and word.isalpha()]

        newdf.append(" ".join(lemmas))

# From here on `df` is a plain list of preprocessed sentence strings.
df = newdf.copy()

100%|██████████| 4091/4091 [00:06<00:00, 637.36it/s] 


In [None]:
def read_data_batches(data, batch_size: int=50, minlength: int=5):
    """
    Lazily group whitespace-tokenized lines into batches.

    :param data: iterable of strings, one sentence per item
    :param batch_size: number of sentences after which a batch is emitted
    :param minlength: sentences with fewer tokens than this are skipped
    :return: generator of lists of token lists; the final batch may be
        shorter than ``batch_size``
    """
    pending = []

    for raw in data:
        tokens = raw.strip().split()

        # too short to be useful -- ignore
        if len(tokens) < minlength:
            continue

        pending.append(tokens)
        if len(pending) >= batch_size:
            yield pending
            pending = []

    # flush whatever is left over
    if pending:
        yield pending

In [None]:
def text2vectors(text: list, w2v_model, maxlen: int, vocabulary):
    """
    Map a tokenized sentence to a list of word vectors, zero-padded to `maxlen`.

    Tokens that are missing from ``w2v_model.wv`` (or from `vocabulary`, when
    one is given) are skipped. The result is NOT truncated: a sentence with
    more than `maxlen` known tokens yields more than `maxlen` vectors.

    :param text: list of tokens
    :param w2v_model: object exposing ``wv`` (lookup by token) and ``vector_size``
    :param maxlen: minimum length of the returned list
    :param vocabulary: optional container restricting the allowed tokens, or None
    :return: list of vectors followed by zero vectors used as padding
    """
    vectors = [w2v_model.wv[token] for token in text
               if token in w2v_model.wv and (vocabulary is None or token in vocabulary)]

    missing = maxlen - len(vectors)
    if missing > 0:
        # one shared zero vector repeated for every padding slot
        vectors.extend([np.zeros(w2v_model.vector_size)] * missing)

    return vectors

In [None]:
# Word2Vec expects every sentence as a list of tokens.
data = [sentence.split() for sentence in df]

# min_count=0 keeps every word of the corpus in the vocabulary.
w2v_model = Word2Vec(
    data,
    vector_size=200,
    window=10,
    min_count=0,
    workers=cpu_count(),
)

In [None]:
def read_data_tensors(series, w2v_model,
                      batch_size=50, vocabulary=None,
                      maxlen=100, min_sent_length=5):
    """
    Yield batches of sentences as float32 arrays together with their tokens.

    :param series: iterable of sentence strings
    :param w2v_model: word2vec model used to look up the word vectors
    :param batch_size: number of sentences per batch
    :param vocabulary: optional container restricting the allowed tokens
    :param maxlen: every sentence is padded / cut to exactly this many vectors
    :param min_sent_length: sentences with fewer tokens are skipped
    :return: generator of ``(array, texts)`` pairs, where ``array`` stacks the
        per-sentence matrices along axis 0 and ``texts`` holds the token lists
    """
    for sentences in read_data_batches(series, batch_size, min_sent_length):
        # text2vectors only pads, so cut to maxlen here
        matrices = [
            np.asarray(text2vectors(tokens, w2v_model, maxlen, vocabulary)[:maxlen],
                       dtype=np.float32)
            for tokens in sentences
        ]

        yield np.stack(matrices, axis=0), list(sentences)

In [None]:
def vector2text(w2v_model: "Word2Vec", vector, skip_padding: bool = True):
    """
    Decode a sequence of word vectors back into text.

    Each vector is replaced by its single nearest word according to
    ``w2v_model.wv.most_similar``.

    :param w2v_model: model whose ``wv`` is used for the nearest-word lookup
    :param vector: iterable of word vectors (e.g. one padded sentence matrix)
    :param skip_padding: when True (default), all-zero vectors are ignored;
        they are padding and would otherwise decode into a long run of one
        arbitrary word
    :return: the decoded words joined by single spaces
    """
    words = []

    for vec in vector:
        if skip_padding and not np.any(vec):
            continue
        # only the best match is needed, so do not rank more than one
        nearest = w2v_model.wv.most_similar(positive=[vec], topn=1)
        words.append(nearest[0][0])

    return ' '.join(words)

In [None]:
def get_centroids(w2v_model, aspects_count):
    """
        Runs mini-batch K-means over every word vector of the model and
        returns the cluster centroids scaled to unit L2 norm; intended for
        initializing the ABAE aspects matrix

        :param w2v_model: model whose ``wv`` provides the word vectors
        :param aspects_count: number of clusters (= aspects)
        :return: array with one L2-normalized centroid per row
    """

    # one row per vocabulary entry
    word_vectors = np.array([w2v_model.wv[key] for key in w2v_model.wv.key_to_index])

    kmeans = MiniBatchKMeans(n_clusters=aspects_count, verbose=0, n_init=100)
    kmeans.fit(word_vectors)
    centers = kmeans.cluster_centers_

    # L2 normalization, row by row
    lengths = np.linalg.norm(centers, axis=-1, keepdims=True)

    return centers / lengths

## Построение модели

### Слои

In [None]:
class SelfAttention(nn.Module):
    """
    Attention over the word vectors of a sentence.

    Every word is scored against the mean word vector of its sentence through
    a learned square matrix ``M``; the scores are turned into weights with a
    softmax over the words.
    """

    def __init__(self, wv_dim: int, maxlen: int):
        """
        :param wv_dim: word vector size
        :param maxlen: max sentence length (size of the batch's 2nd dimension);
            stored for ``extra_repr`` only
        """
        super().__init__()
        self.wv_dim = wv_dim
        self.maxlen = maxlen

        # learned (wv, wv) transformation
        transform = torch.empty(wv_dim, wv_dim)
        nn.init.kaiming_uniform_(transform)
        self.M = Parameter(transform)

        # softmax used to attend to the word vectors
        self.attention_softmax = nn.Softmax(dim=-1)

    def forward(self, input_embeddings):
        """
        :param input_embeddings: tensor of shape (b, maxlen, wv)
        :return: attention weights of shape (b, maxlen), each row summing to 1
        """
        # sentence average as a column vector: (b, wv, 1)
        sentence_mean = input_embeddings.mean(dim=1).unsqueeze(2)

        # (wv, wv) x (b, wv, 1) -> (b, wv, 1)
        transformed_mean = torch.matmul(self.M, sentence_mean)

        # (b, maxlen, wv) x (b, wv, 1) -> (b, maxlen, 1) -> (b, maxlen)
        scores = torch.matmul(input_embeddings, transformed_mean).squeeze(2)

        return self.attention_softmax(scores)

    def extra_repr(self):
        return f'wv_dim={self.wv_dim}, maxlen={self.maxlen}'

In [None]:
class ABAE(nn.Module):
    """
        The model described in the paper ``An Unsupervised Neural Attention Model for Aspect Extraction''
        by He, Ruidan and  Lee, Wee Sun  and  Ng, Hwee Tou  and  Dahlmeier, Daniel, ACL2017
        https://aclweb.org/anthology/papers/P/P17/P17-1036/

        A sentence is encoded as an attention-weighted sum of its word vectors,
        mapped to a distribution over aspects, and reconstructed as a linear
        combination of aspect embeddings. ``forward`` returns the per-sentence
        max-margin reconstruction loss plus an orthogonality penalty on the
        aspect matrix.
    """

    def __init__(self, wv_dim: int = 200, asp_count: int = 5,
                 ortho_reg: float = 0.1, maxlen: int = 201, init_aspects_matrix=None):
        """
        Initializing the model

        :param wv_dim: word vector size
        :param asp_count: number of aspects
        :param ortho_reg: coefficient for tuning the ortho-regularizer's influence
        :param maxlen: sentence max length taken into account
        :param init_aspects_matrix: None or init. matrix for aspects,
            of shape (asp_count, wv_dim)
        :raises ValueError: if ``init_aspects_matrix`` has a different shape
        """
        super().__init__()
        self.wv_dim = wv_dim
        self.asp_count = asp_count
        self.ortho = ortho_reg
        self.maxlen = maxlen

        self.attention = SelfAttention(wv_dim, maxlen)
        self.linear_transform = nn.Linear(self.wv_dim, self.asp_count)
        self.softmax_aspects = nn.Softmax(dim=-1)
        self.aspects_embeddings = Parameter(torch.empty(size=(wv_dim, asp_count)))

        if init_aspects_matrix is None:
            nn.init.xavier_uniform_(self.aspects_embeddings)
        else:
            init = torch.as_tensor(init_aspects_matrix, dtype=self.aspects_embeddings.dtype)
            if tuple(init.shape) != (asp_count, wv_dim):
                raise ValueError(
                    "init_aspects_matrix must have shape (%d, %d), got %s"
                    % (asp_count, wv_dim, tuple(init.shape)))
            # Copy into the existing parameter instead of pinning it to a
            # fixed device; the caller moves the whole module with .to(...).
            with torch.no_grad():
                self.aspects_embeddings.copy_(init.t())

    def get_aspects_importances(self, text_embeddings):
        """
            Encodes a batch of sentences.

            :param text_embeddings: tensor of shape (batch, sentence, wv_dim)
            :return: tuple of
                attention weights (batch, sentence),
                aspects importances (batch, asp_count),
                attention-weighted sentence embeddings (batch, wv_dim)
        """

        # compute attention scores, looking at text embeddings average
        attention_weights = self.attention(text_embeddings)

        # multiplying text embeddings by attention scores -- and summing
        # (matmul: we sum every word embedding's coordinate with attention weights)
        # squeeze(1) removes only the helper axis, so a batch of one sentence
        # keeps its batch dimension
        weighted_text_emb = torch.matmul(attention_weights.unsqueeze(1),  # (batch, 1, sentence)
                                         text_embeddings  # (batch, sentence, wv_dim)
                                         ).squeeze(1)

        # encoding with a simple feed-forward layer (wv_dim) -> (aspects_count)
        raw_importances = self.linear_transform(weighted_text_emb)

        # computing 'aspects distribution in a sentence'
        aspects_importances = self.softmax_aspects(raw_importances)

        return attention_weights, aspects_importances, weighted_text_emb

    def forward(self, text_embeddings, negative_samples_texts):
        """
            :param text_embeddings: tensor of shape (batch, sentence, wv_dim)
            :param negative_samples_texts: tensor of shape
                (batch, negatives, sentence, wv_dim)
            :return: loss tensor of shape (batch, 1)
        """
        # negative samples are averaged over their words -> (batch, negatives, wv_dim)
        averaged_negative_samples = torch.mean(negative_samples_texts, dim=2)

        # encoding: words embeddings -> sentence embedding, aspects importances
        _, aspects_importances, weighted_text_emb = self.get_aspects_importances(text_embeddings)

        # decoding: aspects embeddings matrix, aspects_importances -> recovered sentence embedding
        # (wv_dim, asp) x (batch, asp, 1) -> (batch, wv_dim, 1) -> (batch, wv_dim)
        recovered_emb = torch.matmul(self.aspects_embeddings, aspects_importances.unsqueeze(2)).squeeze(2)

        # loss
        reconstruction_triplet_loss = ABAE._reconstruction_loss(weighted_text_emb,
                                                                recovered_emb,
                                                                averaged_negative_samples)
        # hinge: negative values are clipped to zero
        max_margin = torch \
            .max(reconstruction_triplet_loss, torch.zeros_like(reconstruction_triplet_loss)) \
            .unsqueeze(dim=-1)

        return self.ortho * self._ortho_regularizer() + max_margin

    @staticmethod
    def _reconstruction_loss(text_emb, recovered_emb, averaged_negative_emb):
        """
            Un-clipped max-margin term ``sum_i (1 - r.z + r.n_i)`` per sentence,
            where ``z`` is the sentence embedding, ``r`` its reconstruction and
            ``n_i`` the averaged negative samples.

            :param text_emb: (batch, wv_dim)
            :param recovered_emb: (batch, wv_dim)
            :param averaged_negative_emb: (batch, negatives, wv_dim)
            :return: tensor of shape (batch,)
        """
        recovered_col = recovered_emb.unsqueeze(2)  # (batch, wv_dim, 1)

        # (batch, 1, wv_dim) x (batch, wv_dim, 1) -> (batch, 1)
        positive_dot_products = torch.matmul(text_emb.unsqueeze(1), recovered_col).squeeze(-1)
        # (batch, negatives, wv_dim) x (batch, wv_dim, 1) -> (batch, negatives)
        negative_dot_products = torch.matmul(averaged_negative_emb, recovered_col).squeeze(-1)

        # the negative term has to be part of the margin, otherwise the
        # negative samples have no influence on training at all
        reconstruction_triplet_loss = torch.sum(1 - positive_dot_products + negative_dot_products, dim=1)

        return reconstruction_triplet_loss

    def _ortho_regularizer(self):
        """
            Norm of ``T^t T - I`` for the aspect matrix ``T``; zero when the
            aspect embeddings are orthonormal.
        """
        # build the identity next to the parameter, whatever device it lives on
        identity = torch.eye(self.asp_count,
                             device=self.aspects_embeddings.device,
                             dtype=self.aspects_embeddings.dtype)
        return torch.norm(
            torch.matmul(self.aspects_embeddings.t(), self.aspects_embeddings) - identity)

    def get_aspect_words(self, w2v_model: "Word2Vec", topn=15):
        """
            For every aspect, looks up the words closest to its embedding.

            :param w2v_model: model whose ``wv`` is queried with ``most_similar``
            :param topn: number of words returned per aspect
            :return: list with one ``most_similar`` result (a list of
                ``(word, similarity)`` pairs) per aspect
        """
        # getting aspects embeddings, one aspect per row
        aspects = self.aspects_embeddings.detach().cpu().numpy().T

        return [w2v_model.wv.most_similar(positive=[aspect], topn=topn)
                for aspect in aspects]

### Обучение

In [None]:
# Training configuration.
batch_size = 50      # sentences per optimisation step
epochs = 100         # passes over the whole corpus
# length (in tokens) of the longest preprocessed sentence
max_len = max(len(sentence.split()) for sentence in df)
neg = 20             # negative samples drawn per sentence
log_progress_steps = 1
aspects_number = 5   # number of aspects the model learns

In [None]:
# Use the GPU when one is available and fall back to the CPU otherwise.
# (The fallback only helps if the model itself does not pin tensors to CUDA.)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

wv_dim = w2v_model.vector_size
# The model outputs its own per-sentence loss, which is regressed towards 0.
y = torch.zeros((batch_size, 1), device=device)

# One seed word per aspect; its word vector initializes that aspect's embedding.
seed_words = ['задания', 'теория', 'преподаватель', 'технологии', 'актуальность']

model = ABAE(wv_dim=wv_dim,
             asp_count=aspects_number,
             maxlen=max_len,
             init_aspects_matrix=np.array([w2v_model.wv[word] for word in seed_words]),
             ortho_reg=1).to(device)

criterion = nn.MSELoss(reduction="sum")

optimizer = torch.optim.Adam(model.parameters())

for t in range(epochs):
    print("Epoch %d/%d" % (t + 1, epochs))
    data_iterator = read_data_tensors(df, w2v_model, batch_size=batch_size, maxlen=max_len)

    # running totals, so that the reported loss covers the whole epoch
    # rather than just the last batch
    epoch_loss = 0.0
    batches_seen = 0

    for x, texts in data_iterator:
        if x.shape[0] < batch_size:  # pad with 0 if smaller than batch size
            x = np.pad(x, ((0, batch_size - x.shape[0]), (0, 0), (0, 0)))

        x = torch.from_numpy(x).to(device)

        # extracting bad samples from the very same batch; not sure if this is OK, so todo
        # result: (batch, neg, sentence, wv_dim); already on `device` because x is
        negative_samples = torch.stack(
            [x[torch.randperm(x.shape[0])[:neg]] for _ in range(batch_size)])

        # prediction
        y_pred = model(x, negative_samples)

        # error computation
        loss = criterion(y_pred, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        batches_seen += 1

    for i, aspect in enumerate(model.get_aspect_words(w2v_model)):
        print("[%d] %s" % (i + 1, ' '.join([word[0] for word in aspect])))

    # mean batch loss of this epoch (0 when no batch was produced)
    print("Loss: %.4f" % (epoch_loss / max(batches_seen, 1)))

Epoch 1/100
[1] понимаю ответа одной статистику ещё комментариев лекции которая проходил начал
[2] комментариев ответа вывод удовольствие одной интересный понимаю изложение тестов статистику
[3] прикладной бусти вывод куски силы тимур комьюнити статьи полученных ответить
[4] достоен усвояемый огненный молчу насыщенный добавите политика жалеть уурс невозможного
[5] актуальность завершен re поздних подтянула дмитрию покороче ирина интереснейших темном
Loss: 67362.5781
Epoch 2/100
[1] комментариев ответа понимаю статистику одной которая проходил удовольствие изложение желаю
[2] комьюнити вступление начинающего собой науке случайных душ графиков искал понимает
[3] обьясняют неизвестных интерфейсов устройства раскрытия порядку г задумываться каждое старания
[4] усвояемый политика молчу уурс достоен оторванность жалеть приложенные усталость сверхподробный
[5] актуальность re завершен поздних дмитрию интереснейших подтянула планку покороче повторишь
Loss: 35014.0117
Epoch 3/100
[1] комментари

KeyboardInterrupt: 

In [None]:
# Show the words nearest to each learned aspect embedding.
model.get_aspect_words(w2v_model)

['переменные', 'позновательный', 'крутейшие', 'политика', 'актуальность']

In [None]:
# Shape of the word-vector matrix of the trained Word2Vec model.
w2v_model.wv.vectors.shape

(12155, 200)

In [None]:
# Token count of the longest preprocessed sentence (same expression as `max_len`).
max([len(s.split()) for s in df])

166

In [None]:
# Decode one sentence of the last training batch back into words.
# NOTE: `x` is the variable left over from the training loop above.
vector2text(w2v_model, x[5].cpu().detach().numpy())

'александр михаил спасибо курс вашу работу вообще научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научными научн

In [None]:
# Run the encoder on the single sentence x[5] (the 5:6 slice passes a batch
# of one) and move every returned tensor to CPU / NumPy for inspection.
attention_weights, aspects_importances, weighted_text_emb = [a.cpu().detach().numpy() for a in model.get_aspects_importances(x[5:6])]
attention_weights

array([[8.2009927e-07, 4.5371456e-07, 2.2484665e-04, 9.8814535e-01,
        1.3403793e-06, 1.5802808e-04, 1.1401154e-02, 4.2676632e-07,
        4.2676632e-07, 4.2676632e-07, 4.2676632e-07, 4.2676632e-07,
        4.2676632e-07, 4.2676632e-07, 4.2676632e-07, 4.2676632e-07,
        4.2676632e-07, 4.2676632e-07, 4.2676632e-07, 4.2676632e-07,
        4.2676632e-07, 4.2676632e-07, 4.2676632e-07, 4.2676632e-07,
        4.2676632e-07, 4.2676632e-07, 4.2676632e-07, 4.2676632e-07,
        4.2676632e-07, 4.2676632e-07, 4.2676632e-07, 4.2676632e-07,
        4.2676632e-07, 4.2676632e-07, 4.2676632e-07, 4.2676632e-07,
        4.2676632e-07, 4.2676632e-07, 4.2676632e-07, 4.2676632e-07,
        4.2676632e-07, 4.2676632e-07, 4.2676632e-07, 4.2676632e-07,
        4.2676632e-07, 4.2676632e-07, 4.2676632e-07, 4.2676632e-07,
        4.2676632e-07, 4.2676632e-07, 4.2676632e-07, 4.2676632e-07,
        4.2676632e-07, 4.2676632e-07, 4.2676632e-07, 4.2676632e-07,
        4.2676632e-07, 4.2676632e-07, 4.2676632e