# Применяем word2vec на практике

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 3

import warnings
warnings.filterwarnings('ignore')

**gensim** не зря считают библиотекой "с человеческим лицом" для topic modeling и vector semantics.

Простой интерфейс и ПОРАЗИТЕЛЬНО высокая скорость обучения *word2vec*

In [2]:
import gensim
import numpy as np  
from gensim.models import Word2Vec

def train_word2vec(prefix, sentences, num_features=300,
                   min_word_count=5, num_workers=4,
                   context=10, downsampling=1e-3, save=True, sg=1):
    """Train a gensim ``Word2Vec`` model and optionally save it to disk.

    Args:
        prefix: first part of the file name used when the model is saved.
        sentences: training corpus, passed to ``Word2Vec`` unchanged.
        num_features: forwarded as ``size``.
        min_word_count: forwarded as ``min_count``.
        num_workers: forwarded as ``workers``.
        context: forwarded as ``window``.
        downsampling: forwarded as ``sample``.
        save: when true, the model is written to
            ``<prefix>_<num_features>features_<min_word_count>minwords_<context>context``.
        sg: forwarded as ``sg``.

    Returns:
        The trained model, after ``init_sims(replace=True)`` has been applied.
    """
    print("Training Word2Vec model...")

    hyperparams = dict(
        workers=num_workers,
        size=num_features,
        min_count=min_word_count,
        window=context,
        sample=downsampling,
        seed=1,
        sg=sg,
    )
    model = Word2Vec(sentences, **hyperparams)

    # make the model smaller in RAM
    model.init_sims(replace=True)

    if not save:
        return model

    name_parts = [
        prefix, "_",
        str(num_features), "features_",
        str(min_word_count), "minwords_",
        str(context), "context",
    ]
    model_name = "".join(name_parts)
    model.save(model_name)
    print("Model", model_name, "saved")

    return model

----
## Обучаем word2vec на наших текстах


In [3]:
import urllib
import urllib.request
import re
from bs4 import BeautifulSoup

print("Downloading")

# Tolstoy, HTML edition (presumably "War and Peace", going by the `wp_` prefix).
# The response is used as a context manager so the HTTP connection is closed
# as soon as the page has been parsed.
with urllib.request.urlopen("https://www.gutenberg.org/files/2600/2600-h/2600-h.htm") as wp_response:
    print("Parsing")
    # No parser is named, so BeautifulSoup chooses one itself.
    soup = BeautifulSoup(wp_response)

print("Cleaning")
# Keep only the text found inside <body>.
wp_txt = soup.find('body').get_text()

print("Downloading")
# Second book, plain-text edition; the payload is decoded as UTF-8.
with urllib.request.urlopen("http://www.gutenberg.org/files/1399/1399-0.txt") as ak_response:
    ak_txt = ak_response.read().decode("utf-8")


# Both books joined into one training corpus.
txt = wp_txt + " " + ak_txt
"Done"

Альтернативные данные

In [4]:
import re
def _read_text_file(path):
    """Return the whole content of ``path``; the file handle is closed afterwards."""
    with open(path) as handle:
        return handle.read()


# Alternative corpus: three local text files joined with single spaces.
txt = " ".join(_read_text_file(name) for name in ("file1.txt", "file2.txt", "file3.txt"))

txt[:100]

'\nThe Project Gutenberg EBook of Anna Karenina, by Leo Tolstoy\n\nThis eBook is for the use of anyone a'

In [5]:
def prepare_sentences(txt, word_threshold=2, stage_train=True, unk_token="UNK"):
    """Split raw text into lowercase word lists and build a vocabulary index.

    The text is whitespace-normalised and lowercased, split into sentences on
    runs of ``!``, ``?`` and ``.``, and each sentence is split into words on
    non-word characters. Every run of digits inside a word becomes ``NUM``.

    Args:
        txt: raw text.
        word_threshold: with ``stage_train`` set, words occurring at most this
            many times are replaced by ``unk_token``.
        stage_train: whether to count words and replace the rare ones.
        unk_token: replacement for rare words. The default is upper-case, so it
            cannot collide with the lowercased corpus words.

    Returns:
        ``(word2index, index2word, clean_sentences)``. Indices start at 1 and
        follow first appearance. ``clean_sentences`` is a list of word lists;
        sentences without any word are kept as empty lists.
    """
    # Local import: no visible cell of the notebook imports Counter.
    from collections import Counter

    # collapse every whitespace run (line breaks included) and lowercase
    txt = re.sub(r"\s+", " ", txt).lower()

    # keep only "words", sentence by sentence
    sentences = re.split(r"[!?.]+", txt)
    clean_sentences = [
        [re.sub(r"\d+", "NUM", w) for w in re.split(r"\W+", s) if w]
        for s in sentences
    ]

    if stage_train:
        counter = Counter(w for s in clean_sentences for w in s)
        rare = {w: c for w, c in counter.items() if c <= word_threshold}

        print("Filtered out word types :", len(rare))
        print("Filtered out words count:", sum(rare.values()))

        # drop the rare words, putting the special tag in their place
        clean_sentences = [
            [unk_token if w in rare else w for w in s]
            for s in clean_sentences
        ]

    word2index = {}
    index2word = {}
    next_index = 1  # index 0 is never assigned

    for s in clean_sentences:
        for w in s:
            if w not in word2index:
                word2index[w] = next_index
                index2word[next_index] = w
                next_index += 1

    return word2index, index2word, clean_sentences

In [6]:
# Tokenise the corpus without the rare-word replacement step.
word2index, index2word, clean_sentences = prepare_sentences(stage_train=False, txt=txt)

# Number of sentences and the first sentence rendered as text.
(len(clean_sentences), " ".join(clean_sentences[0]))

(42638,
 'the project gutenberg ebook of anna karenina by leo tolstoy this ebook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever')

In [7]:
# Train 45-dimensional vectors on the tokenised sentences (window 7, words seen
# fewer than 10 times are dropped). The prefix already ends with "_" and
# train_word2vec inserts another one, hence the "tolstoy__..." saved file name.
w2v_model = train_word2vec(sentences=clean_sentences, 
                           prefix="tolstoy_", 
                           context=7, downsampling=0.0001,
                           min_word_count=10,
                           num_features=45)

Training Word2Vec model...
Model tolstoy__45features_10minwords_7context saved


In [8]:
# Nearest neighbours of "anna". The query goes through the keyed vectors
# (`.wv`), as the other cells do; the model-level method is deprecated in gensim.
w2v_model.wv.most_similar("anna")

[('pdvlovna', 0.9271911382675171),
 ('countess', 0.9256846308708191),
 ('mikhdylovna', 0.9233927726745605),
 ('cess', 0.9135480523109436),
 ('natasha', 0.9118959307670593),
 ('mary', 0.9103429317474365),
 ('princess', 0.9097075462341309),
 ('daughter', 0.9034814238548279),
 ('sister', 0.9023302793502808),
 ('prin', 0.9015126824378967)]

# Посмотрим, как можно оценивать качество
Отличный источник, горячо рекомендуется
https://github.com/EloiZ/embedding_evaluation

Надо склонировать репозиторий, загрузить датасеты с помощью
`download_benchmarks.py`

In [9]:
import csv

# Save the vectors as CSV: one row per vocabulary word, the word first and its
# vector components after it.
# NOTE(review): the file name mentions 100 features / 3 minwords / 5 context,
# while the visible training call used num_features=45, min_word_count=10,
# context=7. The name is left as is because a later cell reads this path.
# newline="" is what the csv module requires of files handed to csv.writer;
# plain "w" is enough since the file is never read back through this handle.
with open("tolstoy__100features_3minwords_5context.csv", "w", newline="") as wf:
    writer = csv.writer(wf)
    for word in w2v_model.wv.vocab:
        writer.writerow([word, *w2v_model.wv[word]])

In [10]:
import os

os.environ["EMBEDDING_EVALUATION_DATA_PATH"] = "embedding_evaluation/data/"

import embedding_evaluation
from embedding_evaluation.evaluate import Evaluation
from embedding_evaluation.load_embedding import load_embedding_textfile

def eval_word_vectors(path):
    """Evaluate the embeddings stored in the text file ``path``.

    Args:
        path: location of the embeddings text file.

    Returns:
        Whatever ``Evaluation.evaluate`` returns for the loaded embeddings.
    """
    # Embeddings come back as a dictionary {word: embed}, embed being a 1-d numpy array.
    word_vectors = load_embedding_textfile(textfile_path=path)

    # Load and process the evaluation benchmarks.
    benchmark_suite = Evaluation()

    return benchmark_suite.evaluate(word_vectors)

In [11]:
tolstoy = eval_word_vectors("tolstoy__100features_3minwords_5context.csv")

tolstoy

{'concreteness': 0.31872708165611363,
 'similarity': {'men': {'all_entities': 0.4217564105042674,
   'entity_subset': 0},
  'sem_sim': {'all_entities': 0.18682128976041365, 'entity_subset': 0},
  'simlex': {'all_entities': 0.17514171209850715, 'entity_subset': 0},
  'usf': {'all_entities': 0.15724184655559115, 'entity_subset': 0},
  'vis_sim': {'all_entities': 0.1769497942246964, 'entity_subset': 0},
  'ws353': {'all_entities': 0.16772503095065688, 'entity_subset': 0}}}

## Задание

Здесь везде -- more is better. Попробуйте настроить модель так, чтобы similarity выросла.

Помогают ли советы?

----

## Сравним с гугловскими векторами

https://code.google.com/archive/p/word2vec/


`wget -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"`

и распаковать (но лучше взять готовую модель у меня)

In [12]:
import gensim
from gensim.models import KeyedVectors

# загружаться может долго
# w2v_ggl = KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin", binary=True)
# filtered_w2v_ggl = KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin", binary=True)

In [38]:
# import gensim
# from  gensim.models import KeyedVectors
# filtered_w2v_ggl = KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.txt", binary=False)

In [104]:
from  gensim.models import KeyedVectors

filtered_w2v_ggl = KeyedVectors.load("filtered-GoogleNews-vectors-negative300.bin")

In [107]:
# Evaluate the GoogleNews vectors on the same benchmarks.
# NOTE(review): no visible cell writes "GoogleNews-vectors-negative300.csv";
# presumably it has to be exported beforehand — confirm.
gnews = eval_word_vectors("GoogleNews-vectors-negative300.csv")

### Значения метрик в читаемом виде

In [108]:
def _print_metric_pair(reference, other, depth=0):
    """Print two equally structured, possibly nested, metric dicts side by side.

    Keys are taken from ``reference``; ``other`` must contain the same keys.
    A nested dict prints its key on its own line and is then walked
    recursively; any other value is printed next to its counterpart from
    ``other``. The top level is printed flush left, deeper levels are
    prefixed with 4, 6, 8, ... spaces.
    """
    lead = () if depth == 0 else ("  " * (depth + 1),)

    for name, ref_value in reference.items():
        if isinstance(ref_value, dict):
            print(*lead, name)
            _print_metric_pair(ref_value, other[name], depth + 1)
        else:
            print(*lead, name, ref_value, other[name])


# GoogleNews value first, Tolstoy value second.
_print_metric_pair(gnews, tolstoy)
                

concreteness 0.5719772752714409 0.31872708165611363
similarity
     usf
       all_entities 0.3603241806749815 0.15724184655559115
       entity_subset 0 0
     sem_sim
       all_entities 0.675366985636233 0.18682128976041365
       entity_subset 0 0
     ws353
       all_entities 0.6873764284967581 0.16772503095065688
       entity_subset 0 0
     men
       all_entities 0.7447273593551881 0.4217564105042674
       entity_subset 0 0
     vis_sim
       all_entities 0.5934008833294928 0.1769497942246964
       entity_subset 0 0
     simlex
       all_entities 0.3471553307624142 0.17514171209850715
       entity_subset 0 0


## Word2Vec по-русски

#### Нормализуем тексты

In [128]:
import re
import pymorphy2 
from pymorphy2.tokenizers import *
from functools import lru_cache
from tqdm import tqdm

LEMMATIZER = pymorphy2.MorphAnalyzer()

# Pre-compiled patterns for text normalisation. Raw strings keep the
# backslashes from being read as (invalid) string escapes.
tags = re.compile(r"<[^>]*>")        # anything between "<" and the next ">"
html_codes = re.compile(r"&\w+;")    # "&", word characters, ";" (e.g. &amp;)
nums = re.compile(r"\d+")            # a run of digits
nonalpha = re.compile(r"\W+", re.U)  # a run of non-word characters

def remove_html(txt):
    """Return ``txt`` with every match of ``tags`` and then ``html_codes`` replaced by a space."""
    without_tags = tags.sub(" ", txt)
    return html_codes.sub(" ", without_tags)

def replace_nums(txt):
    """Return ``txt`` with every match of ``nums`` replaced by the literal ``<num>``."""
    placeholder = "<num>"
    return nums.sub(placeholder, txt)

def tokenize(text):
    """Tokenise ``text`` with ``simple_word_tokenize`` and filter the tokens.

    A token is dropped when ``nonalpha`` matches at its beginning, i.e. when it
    starts with a non-word character.
    """
    kept = []
    for token in simple_word_tokenize(text):
        if nonalpha.match(token):
            continue
        kept.append(token)
    return kept


@lru_cache(maxsize=1000000)
def lemmatize(word):
    """Return ``(normal_form, tag)`` of the first parse ``LEMMATIZER`` gives for ``word``.

    Results are memoised per word (up to 1,000,000 entries).
    """
    first_parse = LEMMATIZER.parse(word)[0]
    return (first_parse.normal_form, first_parse.tag)


def lemmatize_text(split_text):
    """Lemmatise a sequence of tokens and join the lemmas into one string.

    Args:
        split_text: iterable of tokens.

    Returns:
        The lemmas separated by spaces, with every whitespace run collapsed
        to a single space.
    """
    lemmas = (lemmatize(token)[0] for token in split_text)
    # raw string: "\s" is not a valid escape in an ordinary string literal
    return re.sub(r"\s+", " ", " ".join(lemmas))

In [3]:
import zipfile
import urllib.request

print("Downloading")

# zipped Tolstoy
url = "https://aldebaran.ru/author/tolstoyi_lev/kniga_anna_karenina1878_ru/download.html.zip"
# Context manager: the HTTP connection is closed once the archive is in memory.
with urllib.request.urlopen(url) as response:
    ak_ru_zip = response.read()

with open("karenina.zip", "wb") as wf:
    wf.write(ak_ru_zip)

with zipfile.ZipFile("karenina.zip","r") as zip_ref:
    zip_ref.extractall("karenina_ru")

print("Parsing")

# The extracted page is read as windows-1251; the handle is closed after reading.
with open("karenina_ru/Tolstoyi_L._Anna_KareninaI.html", encoding="windows-1251") as html_file:
    html = html_file.read()

Downloading
Parsing


In [109]:
# Parse the extracted HTML page; no parser is named, so BeautifulSoup chooses one itself.
soup = BeautifulSoup(html)
print("Cleaning")
# Keep only the text found inside <body>.
ak_ru_txt = soup.find('body').get_text()
# Preview of the first 200 characters.
ak_ru_txt[:200]

### Разбиваем текст на предложения и лемматизируем

In [131]:
import sklearn.feature_extraction.text
import scipy.sparse as sp
import nltk
from tqdm import tqdm_notebook
import pickle

# Punkt sentence splitter.
# NOTE(review): this is the English model, yet it is applied to the Russian
# text below — check whether a Russian Punkt model is available and intended.
sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

def prepare_sentences(text):
    """Turn raw text into a list of lemmatised sentences.

    HTML is removed and digit runs are replaced first. The result is split
    into sentences by ``sentence_tokenizer``; each sentence is tokenised,
    lemmatised and split on single spaces.

    NOTE: this shares its name with the English-corpus helper defined earlier
    in the notebook, so running this cell replaces that function.

    Args:
        text: raw text.

    Returns:
        A list with one list of lemma strings per sentence.
    """
    normalized = replace_nums(remove_html(text))

    # every sentence of the text, as a list of lemmas
    sentences_ru = []
    for sentence in sentence_tokenizer.tokenize(normalized):
        lemma_line = lemmatize_text(tokenize(sentence))
        sentences_ru.append(lemma_line.split(" "))

    return sentences_ru

In [13]:
ak_sentences = prepare_sentences(ak_ru_txt)
ak_sentences[:1]

Ну такое

### Запускаем обучение

In [14]:
w2v_ak = train_word2vec("karenina_", ak_sentences, num_features=45, context=5, min_word_count=2)

In [15]:
# Nearest neighbours of "смерть". The query goes through the keyed vectors
# (`.wv`), as the other cells do; the model-level method is deprecated in gensim.
w2v_ak.wv.similar_by_word("смерть")

### Кластеризация word2vec-ов
способ снизить размерность и объединить синонимы в одну фичу

In [16]:
# Vocabulary words in a fixed order, and their vectors stacked in that same order.
words = list(w2v_ak.wv.vocab)
w2v_matrix = np.array([w2v_ak.wv[word] for word in words])
w2v_matrix.shape

In [111]:
from sklearn.cluster import MiniBatchKMeans

# Number of clusters requested from k-means.
CLUSTERS = 200

# Mini-batch k-means over the word vectors, with a fixed random_state.
clusterer = MiniBatchKMeans(n_clusters=CLUSTERS, verbose=0, init_size=500, random_state=124, batch_size=10000)
# One cluster label per row of w2v_matrix.
labels = clusterer.fit_predict(w2v_matrix)
labels

In [17]:
# one (initially empty) bucket per cluster label
label2words = {label: [] for label in labels}
words2label = {}

# `words` and `labels` are parallel sequences; zip pairs them up without a
# hand-maintained index variable leaking into the notebook namespace
for word, label in zip(words, labels):
    label2words[label].append(word)
    words2label[word] = label

# print the clusters
for label, members in label2words.items():
    print(label)
    print(" ".join(members))

# IMDB: задачка для word2vec

In [112]:
w2v_ggl = gensim.models.KeyedVectors.load("filtered-GoogleNews-vectors-negative300.bin")

In [43]:
import pandas as pd
from bs4 import BeautifulSoup
import re 
import nltk
from nltk.corpus import stopwords

# Tab-separated IMDB files with a header row. quoting=3 is csv.QUOTE_NONE:
# quote characters inside the reviews are treated as ordinary text.
train = pd.read_csv('imdb_data/labeledTrainData.tsv', header=0, delimiter="\t", quoting=3)
test = pd.read_csv('imdb_data/testData.tsv', header=0, delimiter="\t", quoting=3)
unlabeled_train = pd.read_csv('imdb_data/unlabeledTrainData.tsv', header=0,  delimiter="\t", quoting=3)

In [113]:
def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, built once and then reused."""
    cached = getattr(_english_stopwords, "_cache", None)
    if cached is None:
        cached = frozenset(stopwords.words("english"))
        _english_stopwords._cache = cached
    return cached


def review_to_words(raw_review, remove_stops=True):
    """Convert a raw (HTML) review into a list of lowercase words.

    Markup is stripped with BeautifulSoup, every character other than a-z/A-Z
    becomes a space, and the text is lowercased and split on whitespace.

    Args:
        raw_review: review text, possibly containing HTML.
        remove_stops: when true, English stop words are dropped.

    Returns:
        List of words in their original order.
    """
    review_text = BeautifulSoup(raw_review).get_text()
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    words = letters_only.lower().split()

    if not remove_stops:
        return words

    # The stop-word set is cached: this function runs once per sentence, and
    # rebuilding the set each time repeats the same work.
    stops = _english_stopwords()
    return [w for w in words if w not in stops]

In [115]:
def review_to_sentences(review, tokenizer, remove_stopwords=False):
    """Split a review into sentences and convert each one into a word list.

    Args:
        review: raw review text; it is stripped before splitting.
        tokenizer: object whose ``tokenize`` method returns the sentences.
        remove_stopwords: forwarded to ``review_to_words``.

    Returns:
        One word list per non-empty sentence.
    """
    return [
        review_to_words(raw_sentence, remove_stopwords)
        for raw_sentence in tokenizer.tokenize(review.strip())
        if len(raw_sentence) > 0
    ]


from tqdm import tqdm, tqdm_notebook

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
all_sentences_available = []

# Labelled reviews first, then the unlabelled ones; each gets its own progress bar.
for review_column in (train["review"], unlabeled_train["review"]):
    for review in tqdm_notebook(review_column):
        review_sentences = review_to_sentences(review, tokenizer, remove_stopwords=True)
        all_sentences_available.extend(review_sentences)

len(all_sentences_available)

In [46]:
# import pickle
# with open("all_sentences.bin", "wb") as wf:
#     pickle.dump(all_sentences_available, wf)

In [49]:
# Reload the tokenised sentences from the cache; the file is closed after loading.
# NOTE(review): pickle.load can execute arbitrary code — only load a file you created yourself.
with open("all_sentences.bin", "rb") as rf:
    all_sentences_available = pickle.load(rf)

In [30]:
w2v_imdb = train_word2vec("imdb", all_sentences_available)
w2v_imdb

Training Word2Vec model...
Model imdb_300features_5minwords_10context saved


<gensim.models.word2vec.Word2Vec at 0x7ffa31637438>

In [32]:
# with open("w2v_imdb", "wb") as wf:
#     pickle.dump(w2v_imdb, wf)

In [57]:
# Reload the pickled IMDB model; the file is closed after loading.
# NOTE(review): pickle.load can execute arbitrary code — only load a file you created yourself.
with open("w2v_imdb", "rb") as rf:
    w2v_imdb = pickle.load(rf)

### Начинаем готовить датасет для классификации по средним векторам

In [59]:
def avg_vector_by_review(words, model):
    """Average the vectors of the words that ``model`` knows.

    Args:
        words: iterable of tokens.
        model: object exposing ``vector_size`` and ``wv``, where ``wv``
            supports ``in`` and ``[]`` lookups by word.

    Returns:
        A float32 array of length ``model.vector_size`` holding the mean
        vector of the known words, or all zeros when none of the words is
        known. Dividing by a zero count would yield NaNs instead.
    """
    accumulator = np.zeros((model.vector_size,), dtype="float32")
    found_count = 0.

    for word in words:
        if word in model.wv:
            found_count += 1
            accumulator += model.wv[word]

    if found_count == 0:
        # nothing to average: return the zero vector rather than 0/0
        return accumulator

    return accumulator / found_count

In [61]:
def avg_vectors_for_dataset(reviews, model):
    """Build a matrix holding one averaged word vector per review.

    Each review goes through ``review_to_words`` (with its defaults) and
    ``avg_vector_by_review``; progress is shown with ``tqdm_notebook``.

    Returns:
        ``np.matrix`` with one row per review.
    """
    rows = []
    for review in tqdm_notebook(reviews):
        rows.append(avg_vector_by_review(review_to_words(review), model))
    return np.matrix(rows)

In [62]:
train_vecs = avg_vectors_for_dataset(train["review"], w2v_ggl)
test_vecs = avg_vectors_for_dataset(test["review"], w2v_ggl)

train_vecs.shape, test_vecs.shape




 17%|█▋        | 4198/25000 [00:27<02:16, 152.02it/s]

((25000, 300), (25000, 300))

### Будем предсказывать на СРЕДНИХ векторных представлениях слов

In [63]:
import sklearn
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.model_selection import train_test_split

def main_train(model, param_grid, train_vecs, y, test_vecs):
    """Tune ``model`` by grid search, check it on a hold-out, then predict the test set.

    Args:
        model: estimator to tune.
        param_grid: parameter grid handed to ``GridSearchCV``.
        train_vecs: feature matrix of the labelled data.
        y: labels matching the rows of ``train_vecs``. They are used for the
            split and for the final fit, so the function no longer depends on
            a global ``train`` frame.
        test_vecs: feature matrix to predict.

    Returns:
        Predictions for ``test_vecs`` from the best estimator refitted on all
        of ``train_vecs``.
    """
    # grid search over the hyperparameters; by default the cross-validation is StratifiedKFold
    clf = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=7, verbose=1)

    # set a hold-out aside to make sure
    # that the k-fold estimates can be trusted
    X_train, X_ho, y_train, y_ho = train_test_split(train_vecs,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42)
    clf.fit(X_train, y_train)

    print("Best score:", clf.best_score_)
    print("Best params:", clf.best_params_)

    # take the estimator with the best parameters found
    model = clf.best_estimator_

    # train on everything except the hold-out
    model = model.fit(X_train, y_train)

    # quality on the hold-out:
    # it should be close to the grid-search estimate
    print("Holdout score:", model.score(X_ho, y_ho), "-- is it close to the validation score?")

    # train on the whole labelled set
    model.fit(train_vecs, y)
    result = model.predict(test_vecs)

    return result

In [64]:
from sklearn.linear_model import SGDClassifier

# Linear classifier trained with SGD.
model = SGDClassifier(n_jobs=7)

# the corresponding numeric hyperparameters still need to be searched as well!
penalties = [ "l1", "l2", "elasticnet"]
losses = [ "modified_huber", "log", "huber", "hinge" ]

# the parameters to search over
param_grid = { "penalty": penalties, "average": [True, False], "loss": losses  }

# grid search, hold-out check and test-set predictions
result = main_train(model, param_grid, train_vecs, train.sentiment, test_vecs)

# write the Kaggle submission (quoting=3 is csv.QUOTE_NONE)
output = pd.DataFrame(data={"id":test["id"], "sentiment":result})
output.to_csv("sgd-submission_wordvec_avg.csv", index=False, quoting=3)

Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:    3.2s
[Parallel(n_jobs=7)]: Done  72 out of  72 | elapsed:    6.2s finished


Best score: 0.8514
Best params: {'average': True, 'penalty': 'l2', 'loss': 'modified_huber'}
Holdout score: 0.8554 -- is it close to the validation score?


### А теперь давайте использовать word2vec-и по-умному

На основе многоканальной архитектуры Yoon Kim

https://github.com/castorini/Castor/tree/master/kim_cnn

In [184]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class KimCNN(nn.Module):
    """Convolutional text classifier template: embedding, three convolutions, max-pooling, linear layer.

    NOTE: the `???` placeholders are left unfilled, so this class is not valid
    Python until they are replaced; they look like an exercise for the reader.

    Args:
        vectors: pretrained embedding matrix; it is loaded with freeze=False.
        dropout_rate: probability passed to nn.Dropout.
        input_channel: input channel count of every convolution.
        output_channel: output channel count of every convolution.
        target_class: size of the final linear layer's output.
        maxlen: accepted but not used in the code shown here.
    """
    
    def __init__(self, vectors, dropout_rate, input_channel, output_channel, target_class, maxlen):
        
        super(KimCNN, self).__init__()
        
        # number of convolutions with different window sizes
        Ks = 3 
        
        self.non_static_embed = nn.Embedding.from_pretrained(vectors, freeze=False)

        # the three convolutions; kernel size and padding are still placeholders
        self.conv1 = nn.Conv1d(input_channel, output_channel, ???, padding=(???, 0))
        self.conv2 = nn.Conv1d(input_channel, output_channel, ???, padding=(???, 0))
        self.conv3 = nn.Conv1d(input_channel, output_channel, ???, padding=(???, 0))

        self.dropout = nn.Dropout(dropout_rate)
        # the pooled outputs of the Ks convolutions are concatenated before this layer
        self.fc1 = nn.Linear(Ks * output_channel, target_class)

    def forward(self, x):
        
        # look up the embeddings of the input indices
        non_static_input = self.non_static_embed(x)
        
        # debugging output: shape after the embedding lookup
        print()
        print(non_static_input.shape)
        
        # (batch, channel_input, sent_len, embed_dim)
        x = non_static_input.unsqueeze(???) 
        
        print(x.shape)
        
        # apply the convolutions to the data and pass the result through ReLU
        x = [F.relu(self.conv1(x)), #.squeeze(???), 
             F.relu(self.conv2(x)), #.squeeze(???), 
             F.relu(self.conv3(x))] #.squeeze(???)]
        
        # (batch, channel_output, ~=sent_len) * Ks
        x = [F.max_pool1d(i, i.size(???)).squeeze(???) for i in x] # max-over-time pooling
        
        # (batch, channel_output) * Ks
        x = torch.cat(x, 1) 
        
        print(x.shape)
        x = self.dropout(x)
        
        print(x.shape)
        
        # (batch, target_size)
        # raw output of the linear layer; no softmax / log-softmax is applied here
        logit = self.fc1(x) 
        
        return logit

In [186]:
def word_ids_by_review(words, model, word2id, max_len, pad_id=-1):
    """Encode a tokenised review as a fixed-length array of word ids.

    Args:
        words: list of tokens; only the first ``max_len`` are used.
        model: object whose ``wv`` attribute supports ``in`` by word.
        word2id: mapping from word to integer id.
        max_len: length of the returned array.
        pad_id: id used for unknown words and for padding. The default -1
            keeps the previous behaviour, but -1 is not a valid index for
            ``nn.Embedding``: pass the index of a dedicated padding row when
            the ids feed an embedding layer.

    Returns:
        Numpy array of ``max_len`` ids: known words mapped through
        ``word2id``, everything else set to ``pad_id``.

    Raises:
        KeyError: a word is present in ``model.wv`` but missing from ``word2id``.
    """
    known = model.wv
    ids = [
        word2id[word] if word in known else pad_id
        for word in words[:max_len]
    ]

    # right-pad short reviews up to max_len
    ids.extend([pad_id] * (max_len - len(ids)))

    return np.array(ids)

### Строим матрицу векторов

In [187]:
embeddings = []

word2ix = {}

# Row k of the matrix holds the vector of the k-th vocabulary word and
# word2ix maps the word back to k. The loop variable is not called `id`,
# which would shadow the builtin for the rest of the kernel session.
for word_ix, w in enumerate(filtered_w2v_ggl.wv.vocab):
    embeddings.append(filtered_w2v_ggl.wv[w])
    word2ix[w] = word_ix

embeddings = np.array(embeddings)

embeddings.shape

(37065, 300)

In [1]:
import torch
from torch import optim

losses = []
# KimCNN.forward returns the raw output of its last linear layer (logits),
# whereas nn.NLLLoss expects log-probabilities. CrossEntropyLoss applies the
# log-softmax itself.
loss_function = nn.CrossEntropyLoss()
# size of the vocabulary behind the embedding matrix handed to the CNN
vocab_size = len(word2ix)

EMBEDDING_DIM = 300
HIDDEN_DIM = 300
MAX_LEN = 50

# target_class is the number of output logits. The targets reach the loss as
# class indices, so one logit per class is needed; 2 assumes `train.sentiment`
# holds the labels 0 and 1 — TODO confirm.
cnn_model = KimCNN(dropout_rate=0.4,
                   input_channel=1,
                   output_channel=1,
                   target_class=2,
                   vectors=torch.tensor(embeddings),
                   maxlen=MAX_LEN).cuda()

optimizer = optim.Adam(cnn_model.parameters(), lr=0.001)

cnn_model, optimizer

In [189]:
from tqdm import tqdm_notebook
import numpy as np

EPOCHS = 2

def train_routine(model, loss_function, batches, epochs=30):
    """Train ``model`` on ``batches`` for ``epochs`` passes.

    The module-level ``optimizer`` performs the updates and the module-level
    ``losses`` list collects the summed loss of every epoch. All tensors are
    moved to CUDA.

    Args:
        model: network called on a LongTensor batch of word ids.
        loss_function: callable ``(model_output, targets) -> loss``.
        batches: sequence of ``(features_batch, target_batch)`` pairs.
        epochs: number of passes over ``batches``. This value is used; the
            global ``EPOCHS`` is not consulted.

    Returns:
        ``(model, losses)``.
    """
    for epoch in range(epochs):

        total_loss = 0
        count = 0

        for features_batch, target_batch in tqdm_notebook(batches):

            word_vectors_reviews = torch.tensor(features_batch, dtype=torch.long).cuda()
            targets = torch.tensor(target_batch, dtype=torch.long).cuda()

            # gradients must be reset unless accumulation is wanted
            model.zero_grad()

            # forward pass
            log_probs = model(word_vectors_reviews)

            # loss value
            loss = loss_function(log_probs, targets)

            # backward pass and parameter update
            loss.backward()
            optimizer.step()

            # plain Python number
            total_loss += loss.item()
            count += 1

        # max(..., 1) avoids a division by zero when `batches` is empty
        print("E", epoch + 1, "\tNLL\t", total_loss / max(count, 1))

        losses.append(total_loss)

    return model, losses

In [190]:
def chunks(l0, l1, n):
    """Cut two equally long sequences into parallel chunks of at most ``n`` items.

    Args:
        l0, l1: sequences of the same length (checked with an assert).
        n: chunk size.

    Returns:
        ``(chunks_of_l0, chunks_of_l1)``: two lists in which the k-th chunks
        cover the same index range.
    """
    assert len(l0) == len(l1)

    starts = tqdm_notebook(range(0, len(l0), n))
    pairs = [(l0[start:start + n], l1[start:start + n]) for start in starts]

    coll0 = [first for first, _ in pairs]
    coll1 = [second for _, second in pairs]

    return coll0, coll1

In [191]:
word_ids_by_review(review_to_words("hello"), filtered_w2v_ggl, word2ix, max_len=MAX_LEN)

array([32326,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
          -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
          -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
          -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
          -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
          -1,    -1,    -1,    -1,    -1])

In [192]:
data = [word_ids_by_review(review_to_words(txt), filtered_w2v_ggl, word2ix, MAX_LEN) for txt in train["review"]]

In [193]:
len(data), len(train)

(25000, 25000)

In [194]:
batched_texts, batched_targets = chunks(data, list(train.sentiment), n=10)

len(batched_texts), len(batched_targets)

(2500, 2500)

In [195]:
# batched_texts[0]

In [197]:
# Pair every batch of encoded reviews with its batch of targets.
batches = list(zip(batched_texts, batched_targets))

# NOTE(review): the recorded run of this cell ends in a CUDA device-side assert.
# Likely candidates are the -1 ids produced by word_ids_by_review being fed to
# nn.Embedding, and class-index targets exceeding the model's output size — confirm.
train_routine(cnn_model, loss_function, batches, epochs=1)

RuntimeError: cuda runtime error (59) : device-side assert triggered at /pytorch/aten/src/THC/generic/THCTensorCopy.c:20