<a href="https://colab.research.google.com/github/mmsamiei/lets-pytorch/blob/master/ferdosi_word2vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import matplotlib.pyplot as plt

In [0]:
from google.colab import drive
drive.mount('/content/gdrive')

In [3]:
%cd gdrive/My\ Drive/
%cd datasets/
%cd ./Ferdosi

/content/gdrive/My Drive
/content/gdrive/My Drive/datasets
/content/gdrive/My Drive/datasets/Ferdosi


In [0]:
beyts = []
with open('ferdosi.txt', 'r') as fp:
  for line in fp:
    line = line.replace(',','')
    temp = line.rstrip("\n").split()
    beyts.append(temp)

In [0]:
word_arr = []
for beyt in beyts:
  for word in beyt:
    if word not in word_arr:
      word_arr.append(word)

In [6]:
len(word_arr)

17658

In [0]:
word_dict = {word:index for index,word in enumerate(word_arr)}

In [0]:
# Word2Vec Parameter
batch_size = 32
embedding_size = 25
voc_size = len(word_arr)
window_size = 5

In [33]:
def make_pairs(beyts, window_size):
  pairs = []
  for beyt in beyts:
    for i in range(len(beyt)-window_size):
      for j in range(1,window_size+1):
        pair_one = [beyt[i],beyt[i+j]]
        pair_two = [beyt[i+j],beyt[i]]
        pairs.append(pair_one)
        pairs.append(pair_two)
  return pairs

make_pairs(beyts[:1],2)

[['به', 'نام'],
 ['نام', 'به'],
 ['به', 'خداوند'],
 ['خداوند', 'به'],
 ['نام', 'خداوند'],
 ['خداوند', 'نام'],
 ['نام', 'جان'],
 ['جان', 'نام'],
 ['خداوند', 'جان'],
 ['جان', 'خداوند'],
 ['خداوند', 'و'],
 ['و', 'خداوند'],
 ['جان', 'و'],
 ['و', 'جان'],
 ['جان', 'خرد'],
 ['خرد', 'جان'],
 ['و', 'خرد'],
 ['خرد', 'و'],
 ['و', 'کزین'],
 ['کزین', 'و'],
 ['خرد', 'کزین'],
 ['کزین', 'خرد'],
 ['خرد', 'برتر'],
 ['برتر', 'خرد'],
 ['کزین', 'برتر'],
 ['برتر', 'کزین'],
 ['کزین', 'اندیشه'],
 ['اندیشه', 'کزین'],
 ['برتر', 'اندیشه'],
 ['اندیشه', 'برتر'],
 ['برتر', 'برنگذرد'],
 ['برنگذرد', 'برتر']]

In [0]:
def to_num(pairs):
  num_pairs = []
  for pair in pairs:
    num_pair = [word_dict[pair[0]], word_dict[pair[1]]]
    num_pairs.append(num_pair)
  return num_pairs

In [0]:
pairs = make_pairs(beyts, window_size)
num_pairs = to_num(pairs)

In [36]:
len(num_pairs)

3204730

In [0]:
x = np.array(num_pairs, dtype=float).T[0]
y = np.array(num_pairs, dtype=int).T[1]

In [0]:
x = torch.tensor(x.reshape((-1,1)))
y = torch.tensor(y.reshape((-1)))

In [0]:
from torch.utils.data import TensorDataset
train_ds = TensorDataset(x, y)

In [0]:
from torch.utils.data import DataLoader

train_dl = DataLoader(train_ds, batch_size=batch_size, drop_last=True)

In [0]:
def make_one_hot(x_train, y_train):
  inputs = torch.zeros((batch_size, voc_size))
  
  labels = torch.LongTensor(y_train)
  
  for i in range(batch_size):
    inputs[i][int(x_train[i])] = 1
  return inputs, labels

In [0]:
class Word2Vec(nn.Module):
    def __init__(self):
        super().__init__()
        self.lin1 = nn.Linear(voc_size, embedding_size)
        self.lin2 = nn.Linear(embedding_size, voc_size)
        
    def forward(self, xb):
        temp = self.lin1(xb)
        return(self.lin2(temp))

In [43]:
print(torch.cuda.torch.cuda.is_available())

True


In [0]:
dev = torch.device(
    "cuda") if torch.cuda.is_available() else torch.device("cpu")

In [0]:
model = Word2Vec().to(dev)

In [0]:
from torch import optim
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

In [52]:
for epoch in range(5):
  for xb,yb in train_dl:
    #xb = xb.to(dev)
    #yb = yb.to(dev)
    inputs, labels = make_one_hot(xb, yb)
    inputs = inputs.to(dev)
    labels = labels.to(dev)
    optimizer.zero_grad()
    output = model(inputs)
    loss = criterion(output, labels)
    loss.backward()
    optimizer.step()
  print("epoch {epoch}: loss is {loss}".format(epoch = epoch+1, loss = loss.item()))

epoch 1: loss is 5.044978141784668
epoch 2: loss is 4.9276652336120605
epoch 3: loss is 4.910666465759277
epoch 4: loss is 4.944161415100098
epoch 5: loss is 4.955267906188965


In [73]:
for name, param in model.named_parameters():
    if param.requires_grad:
        weights = param.data.cpu().numpy()
        print(name, weights.shape)

lin1.weight (25, 17658)
lin1.bias (25,)
lin2.weight (17658, 25)
lin2.bias (17658,)


In [65]:
!pip install tensorboardX
from tensorboardX import SummaryWriter

Collecting tensorboardX
[?25l  Downloading https://files.pythonhosted.org/packages/5c/76/89dd44458eb976347e5a6e75eb79fecf8facd46c1ce259bad54e0044ea35/tensorboardX-1.6-py2.py3-none-any.whl (129kB)
[K     |████████████████████████████████| 133kB 3.4MB/s 
Installing collected packages: tensorboardX
Successfully installed tensorboardX-1.6


In [120]:
model_param = list(model.parameters())
embedding = model_param[0]
embedding = embedding.t()
#embedding = embedding[:100]
embedding.shape

torch.Size([17658, 25])

In [0]:
meta = word_arr

In [122]:
len(meta)

17658

In [0]:
import keyword
writer = SummaryWriter()
# while len(meta)<100:
#   meta = meta+keyword.kwlist # get some strings
  
# meta = meta[:100]
# for i, v in enumerate(meta):
#   meta[i] = v+word_arr[i]

# label_img = torch.rand(100, 3, 10, 32)
# for i in range(100):
#     label_img[i]*=i/100.0

#writer.add_embedding(torch.randn(100, 5), metadata=meta, label_img=label_img)
#writer.add_embedding(torch.randn(100, 5), label_img=label_img)
writer.add_embedding(embedding, metadata=meta)


In [0]:
!rm -r runs