In [1]:
from gensim.models import Word2Vec
import pandas as pd
import numpy as np
import spacy
import re

# Load the corpus into a DataFrame; later cells read and overwrite its `text` column.
# NOTE(review): absolute, machine-specific Windows path — the notebook will not run on
# another machine without editing this line.
dataset= pd.read_csv('E://JupyterNotebook//New folder//NLP//nlp-getting-started//cleaned_dataset.csv')

In [100]:
def remove_punctuations(data):
    """Return `data` with every character that is neither a word character
    (letters, digits, underscore) nor whitespace removed."""
    return re.sub(r'[^\w\s]', r'', data)

def remove_html(data):
    """Return `data` with every `<...>` span (shortest match) deleted."""
    return re.sub(r'<.*?>', r'', data)

def remove_url(data):
    """Return `data` with URLs removed.

    A URL is any run of non-whitespace characters starting with `http://`,
    `https://` or `www.`. The previous pattern only recognised `https://`,
    so plain `http://` links were left in the text.
    """
    url_clean = re.compile(r"https?://\S+|www\.\S+")
    return url_clean.sub(r'', data)

def remove_emoji(data):
    """Return `data` with emoji / pictographic symbols removed.

    The previous character class contained the range U+24C2..U+1F251, which
    spans most of the Basic Multilingual Plane (CJK, Hangul, ...) and therefore
    deleted ordinary non-Latin text. It is replaced by the specific blocks
    listed below.

    For backward compatibility this function also strips URLs, as it always
    did; the URL pattern now accepts `http://` as well as `https://`.
    """
    emoji_clean = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
        "\U00002600-\U000026FF"  # miscellaneous symbols
        "\U00002702-\U000027B0"  # dingbats
        "\U000024C2"             # circled latin capital M
        "\U0000FE0F"             # variation selector-16 (emoji presentation)
        "\U0001F200-\U0001F251"  # enclosed ideographic supplement
        "]+",
        flags=re.UNICODE,
    )
    data = emoji_clean.sub(r'', data)
    # Historical side effect kept so existing callers see no regression.
    url_clean = re.compile(r"https?://\S+|www\.\S+")
    return url_clean.sub(r'', data)

def remove_abb(data):
    """Return `data` with stand-alone `x.y` tokens (one letter, a dot, one
    letter, bounded by word boundaries) deleted."""
    return re.sub(r"\b[a-zA-Z]\.[a-zA-Z]\b", r'', data)

In [101]:
def clean(dataset):
    """Clean the `text` column of `dataset` in place and return the frame.

    Steps, in order: strip HTML tags, strip URLs, drop any remaining
    whitespace-separated token starting with 'http', strip emoji, strip
    `x.y` abbreviations, and finally strip punctuation.

    Punctuation removal runs last on purpose: the URL and abbreviation
    patterns rely on ':', '/' and '.', so removing punctuation first (as this
    function used to) made them unable to match.

    Args:
        dataset: DataFrame with a string column named 'text'.

    Returns:
        The same `dataset` object, with 'text' overwritten. Previously nothing
        was returned; callers that ignore the return value are unaffected.
    """
    text = dataset['text']
    text = text.apply(remove_html)
    text = text.apply(remove_url)
    # Safety net for URL-like tokens the regex did not catch.
    text = text.apply(
        lambda z: ' '.join(word for word in z.split() if not word.startswith('http')))
    text = text.apply(remove_emoji)
    text = text.apply(remove_abb)
    text = text.apply(remove_punctuations)
    dataset['text'] = text
    # The former trailing `docs_clean = [...]` line was removed: it read a
    # `docs` name that is not defined in this function and discarded its result.
    return dataset

In [102]:
# Lemmatise the corpus with spaCy, treating "amp" as an extra stop word.
load_model = spacy.load("en_core_web_md")
load_model.Defaults.stop_words.add("amp")
docs = load_model.pipe(dataset.text)

# Keep the lower-cased lemma of every token that is not a stop word,
# punctuation, or number-like.
docs_clean = [
    [w.lemma_.lower() for w in doc if not (w.is_stop or w.is_punct or w.like_num)]
    for doc in docs
]

# Store each document back as a single space-joined string.
dataset['text'] = [" ".join(lemmas) for lemmas in docs_clean]

In [103]:
import io
import re
import string
import tqdm
import tensorflow as tf
from tensorflow.keras import layers


In [6]:
# Load the TensorBoard notebook extension
%load_ext tensorboard

In [104]:
# Seed passed to the negative sampler so candidate draws are repeatable.
SEED = 42
# Lets tf.data choose the prefetch buffer size at runtime.
AUTOTUNE = tf.data.AUTOTUNE

In [8]:
# Toy sentence used to walk through the skip-gram pipeline on a tiny example.
sentence = "The wide road shimmered in the hot sun"
tokens = [word.lower() for word in sentence.split()]

In [9]:
# Show the lower-cased, whitespace-split tokens of the toy sentence.
tokens

['the', 'wide', 'road', 'shimmered', 'in', 'the', 'hot', 'sun']

In [10]:
# Assign each distinct token an integer id in order of first appearance.
# Id 0 is reserved for the padding token, so real tokens start at 1.
vocab = {'<pad>': 0}
index = 1
for token in tokens:
  if token in vocab:
    continue
  vocab[token] = index
  index += 1
vocab_size = len(vocab)
print(vocab)

{'<pad>': 0, 'the': 1, 'wide': 2, 'road': 3, 'shimmered': 4, 'in': 5, 'hot': 6, 'sun': 7}


In [11]:
# Reverse lookup: integer id -> token.
inverse_vocab = {idx: tok for tok, idx in vocab.items()}
print(inverse_vocab)

{0: '<pad>', 1: 'the', 2: 'wide', 3: 'road', 4: 'shimmered', 5: 'in', 6: 'hot', 7: 'sun'}


In [12]:
# Encode the toy sentence as a list of vocabulary ids.
example_sequence = [vocab[tok] for tok in tokens]
print(example_sequence)

[1, 2, 3, 4, 5, 1, 6, 7]


In [13]:
# Context window radius used for skip-gram generation; also reused later when
# building the full training set.
window_size = 2
# negative_samples=0: only positive (target, context) pairs are requested here;
# the labels returned alongside them are discarded.
positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
      example_sequence ,
      vocabulary_size = vocab_size,
      window_size = window_size,
      negative_samples = 0)
print(len(positive_skip_grams))

26


In [14]:
# Print the first five positive pairs as ids and as the words they stand for.
for target, context in positive_skip_grams[:5]:
  target_token, context_token = inverse_vocab[target], inverse_vocab[context]
  print(f"({target}, {context}): ({target_token}, {context_token})")

(3, 1): (road, the)
(5, 3): (in, road)
(6, 5): (hot, in)
(5, 4): (in, shimmered)
(7, 1): (sun, the)


In [15]:
# Negative sampling for a single skip-gram pair: take the first positive pair.
target_word, context_word = positive_skip_grams[0]
num_ns = 4  # number of negative samples to draw for this positive pair
# Reshape the true context id to (1, 1) before passing it as `true_classes`.
context_class = tf.reshape(tf.constant(context_word, dtype="int64"), (1, 1))


In [16]:
# Draw `num_ns` candidate ids; the two expected-count outputs are discarded.
negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
    true_classes = context_class,  # class that should be sampled as 'positive'
    num_true = 1,  # each positive skip-gram has 1 positive context class
    num_sampled = num_ns,  # number of negative context words to sample
    unique=True,  # all the negative samples should be unique
    range_max=vocab_size,  # sample ids from the half-open range [0, vocab_size)
    seed=SEED,  # seed for reproducibility
    name="negative_sampling"  # name of this operation
)

In [17]:
# Show the sampled ids, then the words they map to.
print(negative_sampling_candidates)
print([inverse_vocab[idx.numpy()] for idx in negative_sampling_candidates])

tf.Tensor([2 1 4 3], shape=(4,), dtype=int64)
['wide', 'the', 'shimmered', 'road']


In [18]:
# Display the sampled candidate tensor before an axis is added to it.
negative_sampling_candidates

<tf.Tensor: shape=(4,), dtype=int64, numpy=array([2, 1, 4, 3], dtype=int64)>

In [19]:
# Add a trailing axis, (num_ns,) -> (num_ns, 1), so the candidates can be
# concatenated with the (1, 1) `context_class`.
# NOTE: not idempotent — re-running this cell adds another axis.
negative_sampling_candidates = tf.expand_dims(negative_sampling_candidates, 1)


In [20]:
# Concatenate the positive context word with the negative samples along axis 0;
# the positive id ends up in the first row.
context = tf.concat([context_class, negative_sampling_candidates], 0)

In [21]:
# Label the first context word as `1` (positive) followed by `num_ns` `0`s (negative).
label = tf.constant([1] + [0]*num_ns, dtype="int64")

# Squeeze away unit axes: `target_word` is a single id, so `target` becomes a
# scalar tensor; `context` goes from (num_ns+1, 1) to (num_ns+1,); `label` is
# already (num_ns+1,) and is unchanged.
target = tf.squeeze(target_word)
context = tf.squeeze(context)
label = tf.squeeze(label)

In [22]:
# Summarise the single training example built above: the target id and word,
# the context ids and words (positive first), and the matching labels.
print(f"target_index    : {target}")
print(f"target_word     : {inverse_vocab[target_word]}")
print(f"context_indices : {context}")
print(f"context_words   : {[inverse_vocab[c.numpy()] for c in context]}")
print(f"label           : {label}")

target_index    : 3
target_word     : road
context_indices : [1 2 1 4 3]
context_words   : ['the', 'wide', 'the', 'shimmered', 'road']
label           : [1 0 0 0 0]


In [23]:
# sampling_table[i] is the probability of sampling the i-th most common word
# in a dataset. make_sampling_table assumes word frequencies follow a
# Zipf-like distribution. Here a 10-entry table is built for illustration.
sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(size=10)
print(sampling_table)


[0.00315225 0.00315225 0.00547597 0.00741556 0.00912817 0.01068435
 0.01212381 0.01347162 0.01474487 0.0159558 ]


In [105]:
def generate_training_data(sequences, window_size, num_ns, vocab_size, seed):
  """Build skip-gram training examples with negative sampling.

  Args:
    sequences: iterable of int-encoded sentences.
    window_size: context window radius handed to `skipgrams`.
    num_ns: number of negative samples drawn per positive pair.
    vocab_size: vocabulary size; sizes the sampling table and bounds the
      negative sampler via `range_max`.
    seed: seed forwarded to the candidate sampler.

  Returns:
    Three parallel lists `(targets, contexts, labels)`: the target word id,
    a `(num_ns + 1, 1)` int64 tensor whose first row is the positive context
    id, and a `(num_ns + 1,)` int64 tensor `[1, 0, ..., 0]`.
  """
  targets, contexts, labels = [], [], []
  make_skipgrams = tf.keras.preprocessing.sequence.skipgrams

  # Sampling table for `vocab_size` tokens, passed to `skipgrams`.
  sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)

  # One positive entry followed by `num_ns` negatives.
  label_values = [1] + [0]*num_ns

  for sequence in tqdm.tqdm(sequences):
    # Positive (target, context) pairs only; negatives are sampled below.
    pairs, _ = make_skipgrams(
        sequence,
        vocabulary_size=vocab_size,
        sampling_table=sampling_table,
        window_size=window_size,
        negative_samples=0)

    for target_word, context_word in pairs:
      # True context id as a (1, 1) int64 tensor.
      context_class = tf.reshape(
          tf.constant(context_word, dtype="int64"), (1, 1))

      sampled, _, _ = tf.random.log_uniform_candidate_sampler(
          true_classes=context_class,
          num_true=1,
          num_sampled=num_ns,
          unique=True,
          range_max=vocab_size,
          seed=seed,
          name="negative_sampling")

      # (num_ns,) -> (num_ns, 1), then put the positive id in the first row.
      sampled = tf.expand_dims(sampled, 1)
      context = tf.concat([context_class, sampled], 0)

      targets.append(target_word)
      contexts.append(context)
      labels.append(tf.constant(label_values, dtype="int64"))

  return targets, contexts, labels

In [106]:
# Inspect the `text` column that the vocabulary is built from.
dataset.text

0                                          citizen
1                               proceed hear speak
2                                      speak speak
3                                          citizen
4                               resolve die famish
                           ...                    
32755                                  fast asleep
32756                                      antonio
32757                              noble sebastian
32758    thou let'st thy fortune sleep die wink'st
32759                         whiles thou art wake
Name: text, Length: 32760, dtype: object

In [127]:
# Restart the vocabulary from the first document of the corpus.
# Id 0 is reserved for padding, so real tokens start at 1.
tokens = dataset.text[0].lower().split()
vocab = {'<pad>': 0}
index = 1
for token in tokens:
  if token in vocab:
    continue
  vocab[token] = index
  index += 1
vocab_size = len(vocab)
print(vocab)

{'<pad>': 0, 'first': 1, 'citizen:': 2}


In [128]:
# Extend the vocabulary with every token in the corpus, continuing the ids
# from the current value of `index`.
for item in dataset.text:
    tokenz = item.lower().split()
    for token in tokenz:
        if token in vocab:
            continue
        vocab[token] = index
        index += 1

In [129]:
# Reverse lookup for the corpus vocabulary: integer id -> token.
inverse_vocab = {idx: tok for tok, idx in vocab.items()}

In [130]:
# Encode every document as a list of vocabulary ids.
dataset['text_tokenized'] = dataset['text'].apply(
    lambda text: [vocab[tok] for tok in text.lower().split()])

In [134]:
# Largest id plus one, so every id in `vocab` is a valid row index; this
# replaces the `vocab_size` computed from the first document only.
vocab_size = max(vocab.values())+ 1

In [136]:
# Build (target, context, label) examples for the whole corpus.
# NOTE(review): `window_size` is whatever the earlier toy-example cell set (2 there);
# num_ns=5 yields 6 context slots (1 positive + 5 negatives) per example.
targets , contexts , labels = generate_training_data(sequences=dataset['text_tokenized'], window_size=window_size, num_ns=5, vocab_size=vocab_size, seed=SEED)

100%|██████████| 32761/32761 [00:37<00:00, 875.31it/s] 


In [137]:
# Stack the per-example lists into NumPy arrays.
# NOTE: not idempotent — on a second run `contexts` is already 2-D and the
# `[:,:,0]` index fails; regenerate the lists before re-running this cell.
targets = np.array(targets)
# Each context is a (num_ns+1, 1) tensor; index 0 on the last axis drops that unit axis.
contexts = np.array(contexts)[:,:,0]
labels = np.array(labels)

In [138]:
from keras.layers import Embedding

In [139]:
class Word2Vec(tf.keras.Model):
    """Skip-gram model with separate target and context embedding tables.

    NOTE: this class reuses the name `Word2Vec` imported from gensim in the
    first cell; after this cell runs, `Word2Vec` refers to this Keras model.

    Args:
        vocab_size: number of rows in each embedding table.
        embedding_dim: width of each embedding vector.
    """

    def __init__(self , vocab_size , embedding_dim):
        super().__init__()
        # Layer names must be distinct so `get_layer('w2v_embedding')` is
        # unambiguous; both tables used to be called 'w2v_embedding'.
        # `input_length` is dropped: the old value for the context table (5)
        # did not match the 6 context slots used, and newer Keras versions
        # deprecate the argument.
        # `tf.keras.layers.Embedding` is used so the layers come from the same
        # Keras implementation as the `tf.keras.Model` base class.
        self.target_embedding = tf.keras.layers.Embedding(
            vocab_size , embedding_dim , name = "w2v_embedding")
        self.context_embedding = tf.keras.layers.Embedding(
            vocab_size , embedding_dim , name = "w2v_context_embedding")

    def call(self , pair):
        """Score each context candidate against its target word.

        Args:
            pair: `(target, context)`, where `target` holds one id per example
                (shape `(batch,)` or `(batch, 1)`) and `context` holds the
                candidate ids per example (shape `(batch, n_context)`).

        Returns:
            Tensor of shape `(batch, n_context)` with the dot product of the
            target embedding and each context embedding (logits).
        """
        target , context = pair
        # Accept (batch, 1) targets as well as (batch,); the einsum below
        # needs a rank-2 target embedding.
        if len(target.shape) == 2:
            target = tf.squeeze(target, axis=1)
        word_emb = self.target_embedding(target)        # (batch, embed)
        context_emb = self.context_embedding(context)   # (batch, n_context, embed)
        # Dot product of word_emb with each context embedding.
        dots = tf.einsum('be,bce->bc', word_emb, context_emb)
        return dots

In [140]:
# Instantiate the model and configure training. The model outputs raw dot
# products, hence `from_logits=True`.
embedding_dim = 128
model = Word2Vec(vocab_size, embedding_dim)
model.compile(
    optimizer="adam",
    loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

In [141]:
# Input pipeline for training.
# NOTE: `dataset` is rebound here from the pandas DataFrame to a tf.data
# pipeline, so earlier cells that use `dataset['text']` cannot be re-run
# afterwards without reloading the CSV.
BATCH_SIZE = 1024
BUFFER_SIZE = 10000
dataset = tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))
# cache() comes before shuffle(): a cache placed after shuffle replays the
# first epoch's order on every later epoch, which defeats per-epoch shuffling.
dataset = dataset.cache().shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
print(dataset)
dataset = dataset.prefetch(buffer_size=AUTOTUNE)
print(dataset)

<BatchDataset element_spec=((TensorSpec(shape=(1024,), dtype=tf.int32, name=None), TensorSpec(shape=(1024, 6), dtype=tf.int64, name=None)), TensorSpec(shape=(1024, 6), dtype=tf.int64, name=None))>
<PrefetchDataset element_spec=((TensorSpec(shape=(1024,), dtype=tf.int32, name=None), TensorSpec(shape=(1024, 6), dtype=tf.int64, name=None)), TensorSpec(shape=(1024, 6), dtype=tf.int64, name=None))>


In [143]:
# Train the embeddings for 10 epochs over the batched tf.data pipeline.
model.fit(dataset, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x10a1e4c4c70>

In [144]:
# Export the learned target embeddings: one tab-separated vector per line in
# vectors.tsv and the matching word per line in metadata.tsv.
# The layer is read through its attribute rather than by name, because both
# embedding layers of the model were created with the name 'w2v_embedding'.
weights = model.target_embedding.get_weights()[0]

# `with` closes both files even if a write fails part-way through.
with io.open('vectors.tsv', 'w', encoding='utf-8') as out_v, \
     io.open('metadata.tsv', 'w', encoding='utf-8') as out_m:
  # Use the id stored in `vocab` as the row index instead of the dict position,
  # so each word is paired with its own vector.
  for word, index in vocab.items():
    if index == 0:
      continue  # skip 0, it's padding.
    vec = weights[index]
    out_v.write('\t'.join([str(x) for x in vec]) + "\n")
    out_m.write(word + "\n")