# Word Embedding
https://www.youtube.com/watch?v=5PL0TmQhItY

https://towardsdatascience.com/implementing-word2vec-in-pytorch-skip-gram-model-e6bae040d2fb

In [1]:
!pip install torch nltk pandas seaborn numpy



In [2]:
import nltk
import torch
import itertools
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [3]:
from google.colab import drive
drive.mount('/content/gdrive')
file_name = "gdrive/MyDrive/tweets.csv"

Mounted at /content/gdrive


In [4]:
from nltk.stem import WordNetLemmatizer
stopwords = nltk.corpus.stopwords.words('english')
  
lemmatizer = WordNetLemmatizer()

In [5]:
tweets_df = pd.read_csv(file_name,  header=None, encoding='ISO-8859-1', names=["target", "ids", "date", "flag", "user", "text"])

In [6]:
tweets_df.head()

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [7]:
sentences = [nltk.sent_tokenize(tweet) for tweet in tweets_df["text"].iloc[:10000]]
sentences = list(itertools.chain(*sentences))

def cleanup_text(sentences):
  clean_sentences = []
  for sentence in sentences:
    clean_sentence = []
    for token in sentence.split(" "):
      if token in stopwords or not token.isalpha():
        continue
      clean_sentence.append(lemmatizer.lemmatize(token))
    clean_sentences.append(clean_sentence)
  return clean_sentences


# simplify and clean sentences
sentences = cleanup_text(sentences)

In [8]:
vocabulary = []
for sentence in sentences:
  for token in sentence:
    if token not in vocabulary:
      vocabulary.append(token)

word2idx = {w: idx for (idx, w) in enumerate(vocabulary)}
idx2word = {idx: w for (idx, w) in enumerate(vocabulary)}

vocabulary_size = len(vocabulary)

In [75]:
window_size = 2

ngrams = [nltk.ngrams(sentence, window_size) for sentence in sentences if len(sentence) > window_size]
ngrams = list(itertools.chain(*ngrams))

def get_ngrams_idx(ngrams):
  """
  Function to map ngram of variable length to idx
  """
  ngrams_idx = []
  for ngram in ngrams:
    word_idx = []
    for word in ngram:
      word_idx.append(word2idx[word])
    ngrams_idx.append(tuple(word_idx))
  return ngrams_idx


idx_pairs = get_ngrams_idx(ngrams)

# Word2Vec implementation

In [76]:
from torch.autograd import Variable
import torch.nn.functional as F

def get_input_layer(word_idx):
    x = torch.zeros(vocabulary_size).float()
    x[word_idx] = 1.0
    return x

embedding_dims = 5
W1 = Variable(torch.randn(embedding_dims, vocabulary_size).float(), requires_grad=True)
W2 = Variable(torch.randn(vocabulary_size, embedding_dims).float(), requires_grad=True)
num_epochs = 100
learning_rate = 0.001

for epo in range(num_epochs):
    loss_val = 0
    for data, target in idx_pairs:
        x = Variable(get_input_layer(data)).float()
        y_true = Variable(torch.from_numpy(np.array([target])).long())

        z1 = torch.matmul(W1, x)
        z2 = torch.matmul(W2, z1)
    
        log_softmax = F.log_softmax(z2, dim=0)

        loss = F.nll_loss(log_softmax.view(1,-1), y_true)
        #loss_val += loss.data[0]
        loss_val += loss.item()
        loss.backward()
        W1.data -= learning_rate * W1.grad.data
        W2.data -= learning_rate * W2.grad.data

        W1.grad.data.zero_()
        W2.grad.data.zero_()
    if epo % 10 == 0:    
        print(f'Loss at epo {epo}: {loss_val/len(idx_pairs)}')

Loss at epo 0: 11.479687444705428
Loss at epo 10: 10.469998354652333


KeyboardInterrupt: ignored

# Matrix Decomposition

# Glove

# Keras embedding layer