In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords, wordnet
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import re

nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...


True

In [2]:
tweets_df = pd.read_csv('/kaggle/input/customer-support-on-twitter/twcs/twcs.csv')
querries = tweets_df.text.tolist()

In [3]:
def flat_map(f, xs):
    result = []
    for x in xs:
        result.extend(f(x))
    return result

In [4]:
def pos(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None

In [5]:
def lemmatize(token_tags):
    result = []
    for token, tag in token_tags:
        pos_tag = pos(tag)
        if pos_tag:
            result.append(lemmatizer.lemmatize(token.lower(), pos_tag))
    return result

def rm_stop_words(words):
    return list(filter(lambda x: x is not None and not x.startswith("@"), words))

In [20]:
lemmatizer = WordNetLemmatizer()
tokens = []
for text_part in tqdm(querries[:150000]):
    tokens += list(map(rm_stop_words, 
                          map(lemmatize, 
                              map(lambda sentence: nltk.pos_tag(word_tokenize(sentence), lang='rus'), 
                                       sent_tokenize(text_part)))))

  0%|          | 0/150000 [00:00<?, ?it/s]


LookupError: 
**********************************************************************
  Resource [93maveraged_perceptron_tagger_ru[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('averaged_perceptron_tagger_ru')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtaggers/averaged_perceptron_tagger_ru/averaged_perceptron_tagger_ru.pickle[0m

  Searched in:
    - '/root/nltk_data'
    - '/opt/conda/nltk_data'
    - '/opt/conda/share/nltk_data'
    - '/opt/conda/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [7]:
good_pattern = re.compile("[a-zA-Z0-9']+", flags=re.UNICODE)

In [8]:
filtered = list(filter(lambda x: len(x) > 1, tokens))

In [9]:
tokens = list(map(lambda sent: list(filter(lambda word: 
                                      bool(good_pattern.fullmatch(word)), sent)), filtered))

In [10]:
vocabulary_set = set(flat_map(lambda x: x, tokens))

In [11]:
idx2word = dict(enumerate(vocabulary_set))
word2idx = {v: k for k, v in idx2word.items()}

In [12]:
def skipgram_dataset(tokens, window_size=3):
    dataset = []
    for sentence in tokens:
        encoded_sentence = list(map(lambda x: word2idx[x], sentence))
        for token_pos, token in enumerate(encoded_sentence[window_size:-window_size], start=window_size):
            context = encoded_sentence[token_pos - window_size:token_pos] + encoded_sentence[token_pos + 1:token_pos + window_size + 1]
            dataset += [(torch.tensor(token), torch.tensor(ctx_token)) for ctx_token in context]
    return dataset

def cbow_dataset(tokens, window_size=3):
    dataset = []
    for sentence in tokens:
        encoded_sentence = list(map(lambda x: word2idx[x], sentence))
        for token_pos, token in enumerate(encoded_sentence[window_size:-window_size], start=window_size):
            context = encoded_sentence[token_pos - window_size:token_pos] + encoded_sentence[token_pos + 1:token_pos + window_size + 1]
            dataset.append((torch.tensor(context), torch.tensor(token)))
    return dataset

In [13]:
class Word2VecModel(nn.Module):
    def __init__(self, vocab_size, dim):
            super().__init__()
            self.encoder = nn.Embedding(vocab_size, dim)
            weight_init_bound = 0.5 / dim
            self.encoder.weight.data.uniform_(-weight_init_bound, weight_init_bound)
            self.classifier = nn.Linear(dim, vocab_size)
            self.vocab_size = vocab_size
    
    def forward(self, x):
        raise Exception("unimplemented")
            
class SkipGramModel(Word2VecModel):
    def forward(self, x):
        return self.classifier(self.encoder(x))

class CBOWModel(Word2VecModel):
    def forward(self, x):
        encoded = self.encoder(x)
        mean_context_vec = encoded.mean(dim=1)
        return self.classifier(mean_context_vec)

In [14]:
def train(model, dataloader, criterion, optimizer, epochs):
    for epoch in range(epochs):
        epoch_loss, cnt = 0, 0
        for x, y in tqdm(dataloader):
            optimizer.zero_grad()
            x, y = x.cuda(), y.cuda()
            preds = model(x)
            loss = criterion(preds, y)
            epoch_loss += loss.item()
            cnt += 1
            loss.backward()
            optimizer.step()
        print(f"Epoch {epoch}: mean_loss = {epoch_loss / cnt}")
    return model

In [16]:
loader = DataLoader(skipgram_dataset(tokens), batch_size=64, shuffle=True)
w2v = SkipGramModel(len(vocabulary_set), 100).cuda()
optimizer = optim.AdamW(w2v.parameters(), lr=5e-4)
criterion = nn.CrossEntropyLoss()
trained_skipgram = train(w2v, loader, criterion, optimizer, 10)

100%|██████████| 44443/44443 [02:21<00:00, 313.58it/s]


Epoch 0: mean_loss = 7.186765206213396


100%|██████████| 44443/44443 [02:22<00:00, 312.94it/s]


Epoch 1: mean_loss = 6.905425988514417


100%|██████████| 44443/44443 [02:21<00:00, 313.53it/s]


Epoch 2: mean_loss = 6.813994961537243


100%|██████████| 44443/44443 [02:22<00:00, 312.66it/s]


Epoch 3: mean_loss = 6.754774928068816


100%|██████████| 44443/44443 [02:22<00:00, 312.88it/s]


Epoch 4: mean_loss = 6.718189181027896


100%|██████████| 44443/44443 [02:21<00:00, 313.25it/s]


Epoch 5: mean_loss = 6.694250118281022


100%|██████████| 44443/44443 [02:21<00:00, 313.59it/s]


Epoch 6: mean_loss = 6.677204001001628


100%|██████████| 44443/44443 [02:21<00:00, 313.60it/s]


Epoch 7: mean_loss = 6.664848562101593


100%|██████████| 44443/44443 [02:21<00:00, 313.54it/s]


Epoch 8: mean_loss = 6.655264619891174


100%|██████████| 44443/44443 [02:21<00:00, 313.35it/s]

Epoch 9: mean_loss = 6.646360891428555





In [17]:
loader = DataLoader(cbow_dataset(tokens), batch_size=64, shuffle=True)
w2v = CBOWModel(len(vocabulary_set), 100).cuda()
optimizer = optim.AdamW(w2v.parameters(), lr=8e-4)
criterion = nn.CrossEntropyLoss()
trained_cbow = train(w2v, loader, criterion, optimizer, 15)

100%|██████████| 7408/7408 [00:24<00:00, 305.61it/s]


Epoch 0: mean_loss = 7.223738310275274


100%|██████████| 7408/7408 [00:23<00:00, 308.93it/s]


Epoch 1: mean_loss = 6.527001478180257


100%|██████████| 7408/7408 [00:24<00:00, 307.72it/s]


Epoch 2: mean_loss = 6.20174361318028


100%|██████████| 7408/7408 [00:24<00:00, 308.26it/s]


Epoch 3: mean_loss = 5.970465596177923


100%|██████████| 7408/7408 [00:24<00:00, 307.69it/s]


Epoch 4: mean_loss = 5.78387161812566


100%|██████████| 7408/7408 [00:23<00:00, 308.81it/s]


Epoch 5: mean_loss = 5.624774537706993


100%|██████████| 7408/7408 [00:23<00:00, 308.68it/s]


Epoch 6: mean_loss = 5.485456250656243


100%|██████████| 7408/7408 [00:24<00:00, 308.23it/s]


Epoch 7: mean_loss = 5.360456347594251


100%|██████████| 7408/7408 [00:24<00:00, 307.36it/s]


Epoch 8: mean_loss = 5.249206174784551


100%|██████████| 7408/7408 [00:24<00:00, 304.39it/s]


Epoch 9: mean_loss = 5.147958979718629


100%|██████████| 7408/7408 [00:24<00:00, 308.20it/s]


Epoch 10: mean_loss = 5.056805751332711


100%|██████████| 7408/7408 [00:24<00:00, 308.36it/s]


Epoch 11: mean_loss = 4.973473553233991


100%|██████████| 7408/7408 [00:24<00:00, 306.99it/s]


Epoch 12: mean_loss = 4.89821871230077


100%|██████████| 7408/7408 [00:24<00:00, 307.31it/s]


Epoch 13: mean_loss = 4.829506225546029


100%|██████████| 7408/7408 [00:24<00:00, 308.41it/s]

Epoch 14: mean_loss = 4.766646905413972





In [18]:
def negative_sampling(bag_of_words, n_negatives=10):
    import random
    from collections import Counter
    frequencies = dict(Counter(map(lambda x: word2idx[x], bag_of_words)))
    total = sum(frequencies.values())
    scores = np.asarray(list(frequencies.values())) ** (3 / 4)
    neg_indices = np.asarray(list(frequencies.keys()))
    scores = scores / scores.sum()
    
    def criterion(preds, targets):
        random_neg_matrix = np.random.choice(neg_indices, (len(preds), n_negatives), replace=False, p=scores)
        all_negatives = torch.from_numpy(random_neg_matrix).long().cuda()
        loss = F.logsigmoid(preds.gather(1, targets.view(-1, 1))) + F.logsigmoid(-preds.gather(1, all_negatives)).sum(dim=1)
        return -loss.mean()
    return criterion

In [19]:
loader = DataLoader(skipgram_dataset(tokens), batch_size=64, shuffle=True)
w2v = SkipGramModel(len(vocabulary_set), 100).cuda()
optimizer = optim.AdamW(w2v.parameters(), lr=1e-3)
criterion = negative_sampling(flat_map(lambda x: x, tokens))
trained_skipgram_neg_sample = train(w2v, loader, criterion, optimizer, 10)

100%|██████████| 44443/44443 [03:20<00:00, 222.10it/s]


Epoch 0: mean_loss = 2.8267150283586706


100%|██████████| 44443/44443 [03:19<00:00, 222.33it/s]


Epoch 1: mean_loss = 2.521597244819304


100%|██████████| 44443/44443 [03:19<00:00, 222.25it/s]


Epoch 2: mean_loss = 2.461357638896688


100%|██████████| 44443/44443 [03:19<00:00, 222.30it/s]


Epoch 3: mean_loss = 2.4271082727609152


100%|██████████| 44443/44443 [03:20<00:00, 221.79it/s]


Epoch 4: mean_loss = 2.4067710697727356


100%|██████████| 44443/44443 [03:20<00:00, 221.84it/s]


Epoch 5: mean_loss = 2.3939527981572524


100%|██████████| 44443/44443 [03:20<00:00, 222.15it/s]


Epoch 6: mean_loss = 2.3850940919369297


100%|██████████| 44443/44443 [03:19<00:00, 222.24it/s]


Epoch 7: mean_loss = 2.378828193595648


100%|██████████| 44443/44443 [03:20<00:00, 221.83it/s]


Epoch 8: mean_loss = 2.3744361764296786


100%|██████████| 44443/44443 [03:19<00:00, 222.22it/s]

Epoch 9: mean_loss = 2.371357434977695





In [21]:
def get_top_k_neares_words(center_vector, model: Word2VecModel, top_k=10):
    metric = lambda x, y: (x * y).sum()
    
    context_embeds = model.classifier.weight.detach().cpu().numpy()
    norms = np.linalg.norm(context_embeds, axis=1)
    context_embeds = context_embeds / norms[:, None]
    
    word_embed = center_vector / np.linalg.norm(center_vector)
    
    idx_and_scores = list(sorted(enumerate(map(lambda candidate: metric(word_embed, candidate), context_embeds)), key=lambda x: -x[1]))
    return list(map(lambda x: (idx2word[x[0]], x[1]), idx_and_scores[:top_k]))

In [22]:
def vector_supplier(model: Word2VecModel, word2idx: dict):
    space = model.encoder.weight.detach().cpu().numpy()
    def get_vector_for(name: str) -> np.array:
        idx = word2idx[name]
        return space[idx]
    return get_vector_for

In [23]:
vecs = vector_supplier(trained_skipgram, word2idx)
get_top_k_neares_words(vecs('daughter'), trained_skipgram)

[('daughter', 0.34646615),
 ('birthday', 0.28236315),
 ('christmas', 0.25895885),
 ('glad', 0.22095025),
 ('give', 0.21278794),
 ('kid', 0.2125443),
 ('old', 0.20815909),
 ('gift', 0.2015148),
 ('year', 0.19457708),
 ('home', 0.18746592)]

In [24]:
vecs = vector_supplier(trained_cbow, word2idx)
get_top_k_neares_words(vecs('daughter'), trained_cbow)

[('birthday', 0.45944268),
 ('daughter', 0.40525854),
 ('christmas', 0.39061636),
 ('pair', 0.3562856),
 ('walmart', 0.3271615),
 ('sister', 0.3187337),
 ('halloween', 0.2927508),
 ('xmas', 0.2860467),
 ('turkey', 0.28597766),
 ('drink', 0.28179526)]

In [25]:
vecs = vector_supplier(trained_skipgram_neg_sample, word2idx)
get_top_k_neares_words(vecs('daughter'), trained_skipgram_neg_sample)

[('daughter', 0.172947),
 ('birthday', 0.16577537),
 ('christmas', 0.16504355),
 ('kid', 0.10893779),
 ('have', 0.096102834),
 ('s', 0.081161775),
 ('son', 0.07468467),
 ('be', 0.0742749),
 ('movie', 0.0740296),
 ('get', 0.07395441)]