<a href="https://colab.research.google.com/github/mind-matrix/research-8th-sem-project/blob/main/Tokenization_and_Word_Embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install --quiet bert_embedding

[K     |████████████████████████████████| 13.8MB 325kB/s 
[K     |████████████████████████████████| 215kB 57.7MB/s 
[K     |████████████████████████████████| 29.6MB 145kB/s 
[?25h  Building wheel for gluonnlp (setup.py) ... [?25l[?25hdone
[31mERROR: xarray 0.15.1 has requirement numpy>=1.15, but you'll have numpy 1.14.6 which is incompatible.[0m
[31mERROR: umap-learn 0.5.1 has requirement numpy>=1.17, but you'll have numpy 1.14.6 which is incompatible.[0m
[31mERROR: tifffile 2021.3.4 has requirement numpy>=1.15.1, but you'll have numpy 1.14.6 which is incompatible.[0m
[31mERROR: tensorflow 2.4.1 has requirement numpy~=1.19.2, but you'll have numpy 1.14.6 which is incompatible.[0m
[31mERROR: spacy 2.2.4 has requirement numpy>=1.15.0, but you'll have numpy 1.14.6 which is incompatible.[0m
[31mERROR: seaborn 0.11.1 has requirement numpy>=1.15, but you'll have numpy 1.14.6 which is incompatible.[0m
[31mERROR: pyerfa 1.7.2 has requirement numpy>=1.16, but you'll have nump

In [3]:
from nltk.tokenize import sent_tokenize, word_tokenize
import warnings

warnings.filterwarnings(action = 'ignore') # Fix for unnecessary warnings in output

import gensim
from gensim.models import Word2Vec
import numpy as np
from scipy import spatial
from sklearn.manifold import TSNE
from bert_embedding import BertEmbedding
import nltk

In [4]:
# Download NLTK's 'punkt' package (tokenizer data, per the download log below);
# presumably what sent_tokenize / word_tokenize rely on -- confirm against the NLTK docs.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [5]:
# Sample passage that the cells below use both as the Word2Vec training corpus
# and as the input to every embeddings() call.
text = (
  "Wikipedia is a free, multilingual open-collaborative online encyclopedia "
  "created and maintained by a community of volunteer contributors using a "
  "wiki-based editing system. "
  "Wikipedia is the largest general reference work on the Internet, and one "
  "of the 15 most popular websites as ranked by Alexa; in 2021, it was ranked "
  "as the 13th most visited."
)

In [6]:
class Word2VecModel:
  """Wrapper around a trained Word2Vec model.

  Provides nearest-neighbour queries, per-word vector lookup and
  per-sentence embedding matrices. The wrapped model must expose its
  vectors as `model.wv`.
  """

  def __init__(self, model):
    """Keep a reference to the trained model."""
    self.model = model

  def most_similar(self, word, top_n=1):
    """Return the `top_n` vocabulary entries closest to `word`.

    The query goes through `model.wv` because the model-level
    `most_similar` is only a deprecated alias for it in gensim. The return
    value is whatever `wv.most_similar` produces (gensim yields
    `(word, similarity)` pairs).

    Raises:
      KeyError: propagated from the lookup when `word` is unknown.
    """
    return self.model.wv.most_similar(positive=[word], topn=top_n)

  def wv(self, word):
    """Return the vector for `word`, matched case-insensitively.

    Lookup order:
      1. the lower-cased word itself;
      2. its nearest neighbour, if the model can resolve one;
      3. an all-zero float32 vector of length `model.wv.vector_size`.

    Only KeyError is treated as "word unknown"; any other error propagates
    instead of being swallowed.
    """
    word = word.lower()
    keyed_vectors = self.model.wv
    try:
      return keyed_vectors[word]
    except KeyError:
      pass
    try:
      neighbours = self.most_similar(word)
    except KeyError:
      # The similarity query needs the word in the vocabulary as well.
      neighbours = []
    if neighbours:
      # most_similar yields (word, score) pairs; index by the word only.
      return keyed_vectors[neighbours[0][0]]
    return np.zeros(keyed_vectors.vector_size, dtype=np.float32)

  @staticmethod
  def _stack_sentences(matrices):
    """Combine per-sentence matrices into a single array.

    Matrices that all share one shape are stacked into a regular array.
    Otherwise a 1-D object array holding each matrix is built explicitly,
    because `np.array` on ragged input raises on current NumPy releases.
    """
    if len({matrix.shape for matrix in matrices}) <= 1:
      return np.array(matrices)
    ragged = np.empty(len(matrices), dtype=object)
    for index, matrix in enumerate(matrices):
      ragged[index] = matrix
    return ragged

  def embeddings(self, sentences):
    """Embed every token of every sentence in `sentences`.

    `sentences` is split with `sent_tokenize`, each sentence with
    `word_tokenize`, and every token is looked up through `wv`.

    Returns:
      An array with one (n_tokens, vector_size) matrix per sentence; it is
      an object array when the sentences differ in token count.
    """
    per_sentence = []
    for sentence in sent_tokenize(sentences):
      vectors = [np.array(self.wv(token)) for token in word_tokenize(sentence)]
      per_sentence.append(np.array(vectors))
    return self._stack_sentences(per_sentence)

def word2vecModel(text, cbow=True, min_count=1, size=100, window=5, sg=1):
  """Train a Word2Vec model on `text` and wrap it in a Word2VecModel.

  The text is split into sentences and lower-cased tokens, which form the
  training corpus.

  Args:
    text: raw text to train on.
    cbow: when True the model is trained with CBOW (`sg=0`) regardless of
      `sg`; when False the `sg` argument selects the algorithm.
    min_count: passed through to `gensim.models.Word2Vec`.
    size: passed through to `gensim.models.Word2Vec` (vector length).
    window: passed through to `gensim.models.Word2Vec`.
    sg: training-algorithm flag used when `cbow` is False; the default of 1
      selects skip-gram.

  Returns:
    A Word2VecModel wrapping the trained model.
  """
  corpus = [
    [token.lower() for token in word_tokenize(sentence)]
    for sentence in sent_tokenize(text)
  ]
  # Previously both branches built the same default (CBOW) model and `sg` was
  # never forwarded, so cbow=False silently trained CBOW as well.
  algorithm = 0 if cbow else sg
  model = gensim.models.Word2Vec(
    corpus, min_count=min_count, size=size, window=window, sg=algorithm
  )
  return Word2VecModel(model)

In [7]:
# Fit a model on the sample text with cbow=True, embed that same text, and
# report the shape of each sentence's embedding matrix.
word2vec_cbow_model = word2vecModel(text, cbow=True)
print('Word2Vec CBOW Embeddings Shape:')
embeddings = word2vec_cbow_model.embeddings(text)
for sentence_matrix in embeddings:
  print(sentence_matrix.shape)

Word2Vec CBOW Embeddings Shape:
(24, 100)
(36, 100)


In [8]:
# Same walk-through as the CBOW cell, but requesting cbow=False.
word2vec_sg_model = word2vecModel(text, cbow=False)
print('Word2Vec SkipGram Embeddings Shape:')
embeddings = word2vec_sg_model.embeddings(text)
for sentence_matrix in embeddings:
  print(sentence_matrix.shape)

Word2Vec SkipGram Embeddings Shape:
(24, 100)
(36, 100)


In [9]:
class GloveModel:
  """Lookup helper over a word -> vector dictionary of pretrained embeddings."""

  def __init__(self, embeddings_dict):
    """Keep a reference to the word -> vector mapping."""
    self.embeddings_dict = embeddings_dict

  def most_similar(self, word, top_n=1):
    """Return the `top_n` known words whose vectors are closest to `word`'s.

    Closeness is Euclidean distance between the stored vectors. The query
    word itself is left out of the result.

    Raises:
      KeyError: if `word` has no entry in the dictionary.
    """
    # The original referred to an undefined `embedding`; resolve it here.
    embedding = self.embeddings_dict[word]
    candidates = (key for key in self.embeddings_dict if key != word)
    ranked = sorted(
      candidates,
      key=lambda key: spatial.distance.euclidean(self.embeddings_dict[key], embedding),
    )
    return ranked[:top_n]

  def wv(self, word):
    """Return the vector for `word`, matched case-insensitively.

    A word without an entry has no vector to compare against, so a
    nearest-neighbour fallback is impossible; an all-zero vector shaped
    like the stored vectors is returned instead.

    Raises:
      KeyError: if the word is unknown and the dictionary is empty, since
        the vector shape cannot be determined.
    """
    word = word.lower()
    if word in self.embeddings_dict:
      return self.embeddings_dict[word]
    reference = next(iter(self.embeddings_dict.values()), None)
    if reference is None:
      raise KeyError(word)
    return np.zeros_like(reference)

  @staticmethod
  def _stack_sentences(matrices):
    """Combine per-sentence matrices into a single array.

    Matrices that all share one shape are stacked into a regular array.
    Otherwise a 1-D object array holding each matrix is built explicitly,
    because `np.array` on ragged input raises on current NumPy releases.
    """
    if len({matrix.shape for matrix in matrices}) <= 1:
      return np.array(matrices)
    ragged = np.empty(len(matrices), dtype=object)
    for index, matrix in enumerate(matrices):
      ragged[index] = matrix
    return ragged

  def embeddings(self, sentences):
    """Embed every token of every sentence in `sentences`.

    `sentences` is split with `sent_tokenize`, each sentence with
    `word_tokenize`, and every token is looked up through `wv`.

    Returns:
      An array with one (n_tokens, vector_size) matrix per sentence; it is
      an object array when the sentences differ in token count.
    """
    per_sentence = []
    for sentence in sent_tokenize(sentences):
      vectors = [np.array(self.wv(token)) for token in word_tokenize(sentence)]
      per_sentence.append(np.array(vectors))
    return self._stack_sentences(per_sentence)

def gloveModel(pretrained_embeddings_txt_file):
  """Load word vectors from a text file and wrap them in a GloveModel.

  Each non-empty line is split on whitespace: the first field is the word
  (stored lower-cased) and the remaining fields are parsed as a float32
  vector. If two lines lower-case to the same word, the later one wins.

  Args:
    pretrained_embeddings_txt_file: path to the UTF-8 encoded vectors file.

  Returns:
    A GloveModel over the loaded word -> vector dictionary.
  """
  embeddings_dict = {}
  with open(pretrained_embeddings_txt_file, 'r', encoding="utf-8") as f:
    for line in f:
      values = line.split()
      if not values:
        # Blank (or whitespace-only) line: nothing to parse, and indexing
        # values[0] would raise IndexError.
        continue
      word = values[0].lower()
      embeddings_dict[word] = np.asarray(values[1:], "float32")
  return GloveModel(embeddings_dict)

In [10]:
# glove_model = gloveModel("path/to/glove_vectors.txt")  # gloveModel requires the path to a pretrained vectors .txt file
# print("GloVe Embeddings Shape: ")
# embeddings = glove_model.embeddings(text)
# for sent in embeddings:
#   print(sent.shape)

In [11]:
class BertModel:
  """Adapter that turns a callable BERT embedder into per-sentence arrays."""

  def __init__(self, bert):
    """Keep the embedder; it is called with a list of sentence strings."""
    self.bert = bert

  def embeddings(self, sentences):
    """Return one numpy array per sentence found in `sentences`.

    The text is split with `sent_tokenize` and handed to the embedder; the
    element at index 1 of each returned item is converted to an array.
    """
    per_sentence = self.bert(sent_tokenize(sentences))
    return [np.array(item[1]) for item in per_sentence]

def bertModel():
  """Build a BertEmbedding with its default arguments and wrap it in a BertModel."""
  return BertModel(BertEmbedding())

In [12]:
# Build the BERT wrapper (the log below shows vocabulary and weights being
# downloaded at this step), embed the sample text, and report the shape of
# each sentence's embedding matrix.
bert_model = bertModel()
print("BERT Embeddings Shape: ")
embeddings = bert_model.embeddings(text)
for sentence_matrix in embeddings:
  print(sentence_matrix.shape)

Vocab file is not found. Downloading.
Downloading /root/.mxnet/models/book_corpus_wiki_en_uncased-a6607397.zip from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/vocab/book_corpus_wiki_en_uncased-a6607397.zip...
Downloading /root/.mxnet/models/bert_12_768_12_book_corpus_wiki_en_uncased-75cc780f.zip from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/bert_12_768_12_book_corpus_wiki_en_uncased-75cc780f.zip...
BERT Embeddings Shape: 
(21, 768)
(23, 768)
