In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import re
from torchtext.vocab import build_vocab_from_iterator
from collections import defaultdict
from nltk.corpus import stopwords
import nltk

# Fetch the NLTK stop-word corpus that stopwords.words(...) reads later in this notebook.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
def seed_everything(seed: int):
    """Seed every random number generator this notebook can draw from.

    Seeds Python's ``random`` module, NumPy's legacy global RNG and PyTorch
    (CPU plus every CUDA device), and switches cuDNN to deterministic mode
    with autotuning disabled.

    Args:
        seed: Integer seed applied to all generators.
    """
    # Local import: the notebook's import cell does not bring in `random`.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seeds all CUDA devices, not just the current one; it is a no-op without CUDA.
    torch.cuda.manual_seed_all(seed)
    # Trade some speed for run-to-run reproducible cuDNN kernels.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Fixed seed, applied once before any data loading or model creation.
seed = 3407
seed_everything(seed)

# English and Russian NLTK stop-word lists; parse_lemma drops tokens found in either.
stop_en = stopwords.words('english')
stop_ru = stopwords.words('russian')

In [None]:
# Load the training CSV.
# NOTE(review): hard-coded absolute path (looks like a Colab Google Drive mount) —
# adjust when running in another environment.
filename = "/content/drive/MyDrive/ASR/lemma/lemma_train.csv"
df = pd.read_csv(filename)

In [None]:
# Keep only the "lemma" column; `df` is rebound, so the other columns are no longer reachable through it.
columns = ["lemma"]
df = df[columns]

In [None]:
# Compiled once instead of on every call.
_EMOJI_RE = re.compile(
    "(?:["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U00002500-\U00002BEF"
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2-\U0001F251"  # very broad range, kept exactly as in the original pattern
    "\U0001f926-\U0001f937"
    "\U00010000-\U0010ffff"  # every supplementary-plane code point
    "\u2640-\u2642"
    "\u2600-\u2B55"
    "\u200d"  # zero-width joiner
    "\u23cf"
    "\u23e9"
    "\u231a"
    "\ufe0f"  # variation selector-16
    "\u3030"
    "\u2063"  # invisible separator
    "\u200b"  # zero-width space
    "\ufeff"  # byte-order mark / zero-width no-break space
    "]"
    # The same two invisible characters when they arrive as literal escaped
    # text (backslash + "u200d" / "u2063"). parse_lemma in this notebook
    # strips other escapes that arrive that way (e.g. "\\ufeff"), so the
    # input presumably carries repr()-style escapes.
    r"|\\u200d|\\u2063"
    ")+",
    flags=re.UNICODE,
)


def remove_emojis(data):
    """Replace each run of emoji / pictograph / invisible characters with one space.

    The previous pattern contained ``"\\200d"`` and ``"\\2063"``. Python reads
    those as the octal escapes ``\\200`` and ``\\206`` followed by the plain
    characters ``d`` and ``3``, so every Latin "d" and every digit "3" was
    being replaced by a space. The intended targets (U+200D and U+2063) are
    now matched explicitly, both as real characters and as literal escaped
    text. The two C1 control characters the octal escapes also matched
    (U+0080, U+0086) are no longer stripped.

    Args:
        data: Input string.

    Returns:
        ``data`` with every maximal run of matched characters replaced by a
        single space. Surrounding whitespace is not trimmed.
    """
    return _EMOJI_RE.sub(' ', data)

In [None]:
def parse_lemma(lemmas):
    """Turn stringified lemma lists into cleaned token lists.

    Each element of ``lemmas`` is expected to be text containing
    single-quoted tokens (e.g. ``"['word', 'other']"``). Every quoted token
    is stripped of quotes, emojis and a fixed set of symbols / escaped code
    points, then kept only if it is longer than two characters and is not an
    English or Russian stop word.

    Args:
        lemmas: Iterable of strings. Non-string entries (for example NaN from
            an empty CSV cell) produce an empty token list instead of raising,
            so the output stays aligned with the input.

    Returns:
        A list with one list of token strings per input element.
    """
    # Build the stop-word lookup once: a set makes each membership test O(1)
    # instead of scanning two lists for every token.
    stop_words = set(stop_en) | set(stop_ru)

    output = []
    for lemma in lemmas:
        if not isinstance(lemma, str):
            output.append([])
            continue

        tokens = re.findall(r"'[^']+'", lemma)
        sent = []
        for token in tokens:
            # The replace chains below are kept verbatim: their order matters
            # and several of the replaced characters are invisible.
            # NOTE(review): "\\u206" is removed before "\\u2066" / "\\u2069",
            # so those two later replacements will practically never match and
            # a stray "6" / "9" can be left behind. Left unchanged so the
            # tokenisation (and vocabulary indices) stay the same.
            token = token.replace("'", "").replace("«", "").replace("»", "")
            token = remove_emojis(token).replace("\\ufeff", "").replace("\\u200b", "").replace("\\u200", "").replace("\\U0001faf1", "").replace("\\U0001faf6", "").replace("\\U0001faf2", "").replace("\\U0001fac5", "").replace("®", "").replace("\\u206", "").replace("\\u2066", "").replace("\\u2069", "").replace(" ", "").replace("\\U0001faa9", "").replace("\\U0001f979", "").replace("\\U0001fab8", "").replace("\\uf8ff", "").replace("•", "").replace("מאייר", "").replace("אִיוּר", "").replace("\\U0001fae7", "").replace("\\U0001f979", "").replace("\\U0001fae2", "").replace("—", "").strip()
            token = token.replace("⏱", "").replace("⏰", "").replace("⏭", "").replace("⌛", "").replace("∆", "").replace("↗", "").replace("↖", "").replace("⃣₽", "").replace("⃣", "").replace("?", "").replace("!", "").replace("\\U0001fae", "")
            token = token.replace("‼", "").replace("₽", "").replace("⁉", "").replace("…", "").replace("ᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠ", "")
            # len(token) > 2 already implies the token is non-empty.
            if len(token) > 2 and token not in stop_words:
                sent.append(token)
        # Second emoji pass on the joined sentence; split() re-tokenises on
        # any whitespace that pass introduced.
        output.append(remove_emojis(" ".join(sent)).strip().split())
    return output

In [None]:
# One cleaned token list per row of the "lemma" column.
tokenized_corpus = parse_lemma(df["lemma"].to_list())

In [None]:
# Build the token -> index vocabulary from the tokenised corpus.
# "<unk>" is the only special token; min_freq=1 keeps every token that occurs at least once.
vocab = build_vocab_from_iterator(
    tokenized_corpus,
    specials=["<unk>"],
    min_freq=1,
)
# Tokens missing from the vocabulary map to the "<unk>" index instead of raising.
vocab.set_default_index(vocab["<unk>"])

In [None]:
# Vocabulary size, including the "<unk>" special token.
len(vocab)

223989

In [None]:
# Build skip-gram training pairs: every token is paired with each neighbour
# that lies at most `window_size` positions to its left or right.
window_size = 2
idx_pairs = []
for sentence in tokenized_corpus:
    # The original guard `word in word` was always true, so its `else 0`
    # branch never ran. A plain lookup is equivalent: unknown tokens already
    # resolve to the vocabulary's default ("<unk>") index.
    indices = [vocab[word] for word in sentence]
    for center_word_pos, center_word_idx in enumerate(indices):
        # Clamp the window to the sentence bounds instead of testing each offset.
        first_pos = max(0, center_word_pos - window_size)
        last_pos = min(len(indices), center_word_pos + window_size + 1)
        for context_word_pos in range(first_pos, last_pos):
            if context_word_pos == center_word_pos:
                continue
            idx_pairs.append((center_word_idx, indices[context_word_pos]))

# Explicit dtype and shape so an empty corpus still yields a (0, 2) integer tensor.
dataset_pairs = torch.tensor(idx_pairs, dtype=torch.long).reshape(-1, 2)
dataset_pairs[:5]

tensor([[ 9629, 12878],
        [ 9629, 15783],
        [12878,  9629],
        [12878, 15783],
        [12878, 14509]])

In [None]:
class SkipgramDataset(Dataset):
    """Map-style dataset over precomputed skip-gram index pairs.

    ``dataset_pairs`` is any indexable, sized container whose elements
    unpack into exactly two values: (center word, context word).
    """

    def __init__(self, dataset_pairs):
        # Only a reference is stored; the pairs are not copied.
        self.pairs = dataset_pairs

    def __len__(self):
        """Number of (center, context) pairs."""
        return len(self.pairs)

    def __getitem__(self, item):
        """Return the pair at position ``item`` as a ``(center, context)`` tuple."""
        center, context = self.pairs[item]
        return center, context

# Wrap the index pairs and iterate them in shuffled mini-batches of 1024 pairs.
dataset = SkipgramDataset(dataset_pairs)
dataloader = DataLoader(dataset=dataset, shuffle=True, batch_size=1024)

In [None]:
class SkipGram(nn.Module):
    """
    Skip-Gram word-embedding model, following the paper
    https://arxiv.org/abs/1301.3781

    An embedding table (vocab_size x embed_dim) feeds a linear layer that
    projects back to vocab_size scores, one per candidate context word.
    """
    def __init__(self, vocab_size: int, embed_dim: int):
        super().__init__()
        # Layers are created in the same order as before (embeddings, then
        # linear) so seeded initialisation draws the same random numbers.
        self.embeddings = nn.Embedding(vocab_size, embed_dim)
        self.linear = nn.Linear(embed_dim, vocab_size)

    def forward(self, inputs_):
        """Return raw (unnormalised) vocabulary scores for the given word indices."""
        return self.linear(self.embeddings(inputs_))

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
embed_dim = 100
lr = 0.001
EPOCHS = 27
# Number of epochs the model already held in the kernel has been trained for.
# Training resumes from here when `model` and `opt` already exist.
start_epoch = 21

# This cell used to rely on `model` and `opt` left over from an earlier run
# (their constructors were commented out), so it raised NameError on a fresh
# kernel. Create them only when they are missing, and then train from epoch 0.
if "model" not in globals() or "opt" not in globals():
    model = SkipGram(len(vocab), embed_dim=embed_dim).to(device)
    opt = optim.Adam(model.parameters(), lr=lr)
    start_epoch = 0

criterion = nn.CrossEntropyLoss()

# Mean training loss per epoch, in epoch order.
losses = []
model.train()
for epoch in range(start_epoch, EPOCHS):
    batch_losses = []
    # `centers` / `contexts` replace the old names `input` / `output`,
    # which shadowed the builtin `input`.
    for centers, contexts in dataloader:
        centers = centers.to(device)
        contexts = contexts.to(device)

        opt.zero_grad()
        preds = model(centers)
        loss = criterion(preds, contexts)
        loss.backward()
        opt.step()

        batch_losses.append(loss.item())
    loss_acc = np.mean(batch_losses)
    losses.append(loss_acc)
    print(f"Epoch: {epoch + 1}/{EPOCHS} | Loss :{loss_acc}")

Epoch: 22/27 | Loss :6.031286530675858
Epoch: 23/27 | Loss :6.0079461943875305
Epoch: 24/27 | Loss :5.98644970888413
Epoch: 25/27 | Loss :5.966037571918648
Epoch: 26/27 | Loss :5.947593403004641
Epoch: 27/27 | Loss :5.930054538741963


In [None]:
# Save only the model weights (state_dict) to a hard-coded absolute path.
# NOTE(review): the trailing numbers are presumably the final losses of the earlier
# checkpoints word2vec_4.pt and word2vec_5.pt — confirm.
torch.save(model.state_dict(), "/content/drive/MyDrive/ASR/word2vec/word2vec_6.pt") # 4.pt: 6.2211307811476555 5: 6.056594998895131

In [None]:
# '⃣₽'.replace("⏱", "").replace("⏰", "").replace("⏭", "").replace("⌛", "").replace("∆", "").replace("↗", "").replace("↖", "").replace("⃣₽", "").replace("⃣", "").replace("?", "").replace("!", "").replace("\\U0001fae", "")

''

In [None]:
import json

# Persist the token -> index mapping so embedding rows can be matched to words later.
# json.dump returns None, so the old `data = ...` binding has been dropped.
with open("/content/drive/MyDrive/ASR/word2vec/word2vec_2.json", "w") as f:
    json.dump(vocab.get_stoi(), f, indent=4)


In [None]:
# Embedding row of the query word. The row index is looked up by word
# (it was previously the hard-coded value 17085) so it follows the vocabulary
# if that is rebuilt.
target = model.embeddings.weight[vocab["матушка"]]

In [None]:
# Index of the query word in the current vocabulary (a missing word would return the "<unk>" index).
vocab["матушка"]

17085

In [None]:
# Shape of the embedding matrix: (number of rows, embedding dimension).
# NOTE(review): the recorded output shows 225160 rows, while the recorded len(vocab)
# earlier in this notebook is 223989 — confirm the model in memory was built from the
# same vocabulary that is used for look-ups here.
model.embeddings._parameters["weight"].shape

torch.Size([225160, 100])

In [None]:
def closest(target, vec):
    """Euclidean distance between ``target`` and ``vec``.

    The previous version reduced the difference to a single scalar and then
    returned ``argmin()`` of that scalar, which is always 0, so every
    candidate looked equally close. The distance itself is returned now, and
    callers can rank candidates with it (smaller means closer).

    Args:
        target: Tensor whose last dimension is the embedding dimension.
        vec: One embedding (1-D) or a batch of embeddings (..., dim);
            must be broadcastable against ``target``.

    Returns:
        A 0-dim tensor for a single ``vec``, or one distance per row for a
        batch. The result is detached from the autograd graph.
    """
    # Reduce over the embedding dimension only, so batches keep one value per row.
    dists = torch.sqrt(((target - vec) ** 2).sum(dim=-1))
    return dists.detach()

In [None]:
# Score every embedding row against `target`, in row order, so that position i
# of `closest_vec` corresponds to row i of the embedding matrix.
embedding_rows = model.embeddings._parameters["weight"]
closest_vec = [
    closest(target, embedding_rows[row])
    for row in range(embedding_rows.shape[0])
]

In [None]:
# The 20 smallest entries of closest_vec and their positions; positions are embedding-row
# indices because closest_vec was filled in row order.
torch.topk(torch.tensor(closest_vec), k=20, largest=False)

torch.return_types.topk(
values=tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
indices=tensor([18, 10,  4, 16, 12,  8, 19,  1, 11,  5, 15, 13,  7, 17,  3,  9,  0,  2,
         6, 14]))

In [None]:
# Reverse mapping (index -> token), built by inverting the vocabulary's token -> index dict.
idx2word = {v: k for k, v in vocab.get_stoi().items()}