### In this session we'll try to implement word2vec NN for arabic text.

In [1]:
import numpy as np
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
from urllib.request import urlretrieve
from os.path import isfile, isdir
from tqdm import tqdm
import zipfile
import time
from collections import Counter
import random
from math import sin
from bokeh.io import output_file, show , output_notebook
from bokeh.models import ColumnDataSource, HoverTool, LinearColorMapper
from bokeh.palettes import plasma
from bokeh.plotting import figure
import tensorflow as tf

In [2]:
import re
from collections import Counter

def preprocess(text):

    # Replace punctuation with tokens so we can use them in our model
    #text = text.replace('.', ' <PERIOD> ')
    text = text.replace(',', ' <COMMA> ')
    text = text.replace('"', ' <QUOTATION_MARK> ')
    text = text.replace(';', ' <SEMICOLON> ')
    text = text.replace('!', ' <EXCLAMATION_MARK> ')
    text = text.replace('?', ' <QUESTION_MARK> ')
    text = text.replace('(', ' <LEFT_PAREN> ')
    text = text.replace(')', ' <RIGHT_PAREN> ')
    text = text.replace('--', ' <HYPHENS> ')
    text = text.replace('?', ' <QUESTION_MARK> ')
    # text = text.replace('\n', ' <NEW_LINE> ')
    text = text.replace(':', ' <COLON> ')
    words = text.split()
    
    # Remove all words with  5 or fewer occurences
    word_counts = Counter(words)
    trimmed_words = [word for word in words if word_counts[word] >= 1]

    return trimmed_words

def get_batches(int_text, batch_size, seq_length):
    """
    Return batches of input and target
    :param int_text: Text with the words replaced by their ids
    :param batch_size: The size of batch
    :param seq_length: The length of sequence
    :return: A list where each item is a tuple of (batch of input, batch of target).
    """
    n_batches = int(len(int_text) / (batch_size * seq_length))

    # Drop the last few characters to make only full batches
    xdata = np.array(int_text[: n_batches * batch_size * seq_length])
    ydata = np.array(int_text[1: n_batches * batch_size * seq_length + 1])

    x_batches = np.split(xdata.reshape(batch_size, -1), n_batches, 1)
    y_batches = np.split(ydata.reshape(batch_size, -1), n_batches, 1)

    return list(zip(x_batches, y_batches))


def create_lookup_tables(words):
    """
    Create lookup tables for vocabulary
    :param words: Input list of words
    :return: A tuple of dicts.  The first dict....
    """
    word_counts = Counter(words)
    sorted_vocab = sorted(word_counts, key=word_counts.get, reverse=True)
    int_to_vocab = {ii: word for ii, word in enumerate(sorted_vocab)}
    vocab_to_int = {word: ii for ii, word in int_to_vocab.items()}

    return vocab_to_int, int_to_vocab

In [3]:
with open('./datasets/ara.txt') as f:
    text = f.read()

In [4]:
words = preprocess(text)

In [5]:
total_counts = Counter()

In [6]:
for i in range(len(words)):
    for word in words[i].split(" "):
        total_counts[word] += 1

In [7]:
total_counts.most_common()

[('من', 14529),
 ('في', 13376),
 ('إلى', 6927),
 ('الرب', 6527),
 ('على', 5831),
 ('لا', 4895),
 ('الله', 4608),
 ('كل', 4367),
 ('أن', 3625),
 ('الذي', 3487),
 ('ولا', 2991),
 ('ما', 2887),
 ('قد', 2723),
 ('إسرائيل', 2520),
 ('الذين', 2369),
 ('هذا', 2203),
 ('إن', 2082),
 ('الأرض', 2076),
 ('هو', 2025),
 ('يا', 2013),
 ('التي', 2012),
 ('كان', 1974),
 ('له', 1962),
 ('لم', 1948),
 ('عن', 1721),
 ('الملك', 1662),
 ('فقال', 1589),
 ('ثم', 1578),
 ('قال', 1569),
 ('و', 1558),
 ('أيضا', 1462),
 ('لأن', 1458),
 ('جميع', 1406),
 ('بيت', 1341),
 ('به', 1335),
 ('أنا', 1330),
 ('ذلك', 1324),
 ('ومن', 1285),
 ('لهم', 1281),
 ('بن', 1262),
 ('وقال', 1220),
 ('مع', 1180),
 ('وكان', 1160),
 ('بل', 1147),
 ('ملك', 1138),
 ('حتى', 1137),
 ('لأنه', 1132),
 ('اليوم', 1119),
 ('بني', 1114),
 ('أو', 1069),
 ('كما', 1051),
 ('الأصحاح', 1050),
 ('هذه', 1035),
 ('الشعب', 1027),
 ('عند', 1026),
 ('لكم', 1009),
 ('هكذا', 1003),
 ('لي', 993),
 ('يهوذا', 970),
 ('الى', 969),
 ('داود', 957),
 ('بعد', 953),
 

In [8]:
print("Total words: {}".format(len(words)))
print("Unique words: {}".format(len(set(words))))

Total words: 578413
Unique words: 74985


In [9]:
vocab_to_int, int_to_vocab = create_lookup_tables(words)
int_words = [vocab_to_int[word] for word in words]

In [10]:
int_to_vocab

{0: 'من',
 1: 'في',
 2: 'إلى',
 3: 'الرب',
 4: 'على',
 5: 'لا',
 6: 'الله',
 7: 'كل',
 8: 'أن',
 9: 'الذي',
 10: 'ولا',
 11: 'ما',
 12: 'قد',
 13: 'إسرائيل',
 14: 'الذين',
 15: 'هذا',
 16: 'إن',
 17: 'الأرض',
 18: 'هو',
 19: 'يا',
 20: 'التي',
 21: 'كان',
 22: 'له',
 23: 'لم',
 24: 'عن',
 25: 'الملك',
 26: 'فقال',
 27: 'ثم',
 28: 'قال',
 29: 'و',
 30: 'أيضا',
 31: 'لأن',
 32: 'جميع',
 33: 'بيت',
 34: 'به',
 35: 'أنا',
 36: 'ذلك',
 37: 'ومن',
 38: 'لهم',
 39: 'بن',
 40: 'وقال',
 41: 'مع',
 42: 'وكان',
 43: 'بل',
 44: 'ملك',
 45: 'حتى',
 46: 'لأنه',
 47: 'اليوم',
 48: 'بني',
 49: 'أو',
 50: 'كما',
 51: 'الأصحاح',
 52: 'هذه',
 53: 'الشعب',
 54: 'عند',
 55: 'لكم',
 56: 'هكذا',
 57: 'لي',
 58: 'يهوذا',
 59: 'الى',
 60: 'داود',
 61: 'بعد',
 62: 'ان',
 63: 'وكل',
 64: 'موسى',
 65: 'واحد',
 66: 'ولم',
 67: 'إلا',
 68: 'عليه',
 69: 'هناك',
 70: 'يكون',
 71: 'وما',
 72: 'رب',
 73: 'يسوع',
 74: 'أنت',
 75: 'أمام',
 76: 'إذا',
 77: 'وأما',
 78: 'قائلا',
 79: 'لك',
 80: 'يوم',
 81: 'وفي',
 82: 'أور

# Subsampling by Mikolov

In [11]:
threshold = 1e-3
word_counts = Counter(int_words)
total_count = len(int_words)
print(word_counts, total_count)

Counter({0: 14529, 1: 13376, 2: 6927, 3: 6527, 4: 5831, 5: 4895, 6: 4608, 7: 4367, 8: 3625, 9: 3487, 10: 2991, 11: 2887, 12: 2723, 13: 2520, 14: 2369, 15: 2203, 16: 2082, 17: 2076, 18: 2025, 19: 2013, 20: 2012, 21: 1974, 22: 1962, 23: 1948, 24: 1721, 25: 1662, 26: 1589, 27: 1578, 28: 1569, 29: 1558, 30: 1462, 31: 1458, 32: 1406, 33: 1341, 34: 1335, 35: 1330, 36: 1324, 37: 1285, 38: 1281, 39: 1262, 40: 1220, 41: 1180, 42: 1160, 43: 1147, 44: 1138, 45: 1137, 46: 1132, 47: 1119, 48: 1114, 49: 1069, 50: 1051, 51: 1050, 52: 1035, 53: 1027, 54: 1026, 55: 1009, 56: 1003, 57: 993, 58: 970, 59: 969, 60: 957, 61: 953, 62: 911, 63: 908, 64: 904, 65: 870, 66: 857, 67: 857, 68: 852, 69: 825, 70: 804, 71: 802, 72: 791, 73: 785, 74: 784, 75: 783, 76: 782, 77: 774, 78: 774, 79: 766, 80: 760, 81: 760, 82: 755, 83: 747, 84: 746, 85: 725, 86: 707, 87: 707, 88: 694, 89: 694, 90: 691, 91: 688, 92: 687, 93: 685, 94: 677, 95: 672, 96: 670, 97: 662, 98: 637, 99: 636, 100: 630, 101: 625, 102: 620, 103: 614, 10

In [12]:
freqs = {word: count/total_count for word, count in word_counts.items()}
p_drop = {word: 1 - np.sqrt(threshold/freqs[word]) for word in word_counts}
train_words = [word for word in int_words if random.random() < (1 - p_drop[word])]

In [13]:
len(train_words)

495072

In [14]:
def get_target(words, idx, window_size=5):
    ''' Get a list of words in a window around an index. '''
    R = np.random.randint(1, window_size+1)
    start = idx - R if (idx - R) > 0 else 0
    stop = idx + R
    target_words = set(words[start:idx] + words[idx+1:stop+1])
    
    return list(target_words)

In [15]:
def get_batches(words, batch_size, window_size=5):
    ''' Create a generator of word batches as a tuple (inputs, targets) '''
    
    n_batches = len(words)//batch_size
    
    # only full batches
    words = words[:n_batches*batch_size]
    
    for idx in range(0, len(words), batch_size):
        x, y = [], []
        batch = words[idx:idx+batch_size]
        for ii in range(len(batch)):
            batch_x = batch[ii]
            batch_y = get_target(batch, ii, window_size)
            y.extend(batch_y)
            x.extend([batch_x]*len(batch_y))
        yield x, y

In [16]:
train_graph = tf.Graph()
with train_graph.as_default():
    inputs = tf.placeholder(tf.int32, [None], name='inputs')
    labels = tf.placeholder(tf.int32, [None, None], name='labels')

In [17]:
n_vocab = len(int_to_vocab)
n_embedding = 300 # Number of embedding features 
with train_graph.as_default():
    embedding = tf.Variable(tf.random_uniform((n_vocab, n_embedding), -1, 1))
    embed = tf.nn.embedding_lookup(embedding, inputs)

In [18]:
# Number of negative labels to sample
n_sampled = 100
with train_graph.as_default():
    softmax_w = tf.Variable(tf.truncated_normal((n_vocab, n_embedding), stddev=0.1))
    softmax_b = tf.Variable(tf.zeros(n_vocab))
    
    # Calculate the loss using negative sampling
    loss = tf.nn.sampled_softmax_loss(softmax_w, softmax_b, 
                                      labels, embed,
                                      n_sampled, n_vocab)
    
    cost = tf.reduce_mean(loss)
    optimizer = tf.train.AdamOptimizer().minimize(cost)

Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See @{tf.nn.softmax_cross_entropy_with_logits_v2}.



In [19]:
with train_graph.as_default():
    ## From Thushan Ganegedara's implementation
    valid_size = 16 # Random set of words to evaluate similarity on.
    valid_window = 100
    # pick 8 samples from (0,100) and (1000,1100) each ranges. lower id implies more frequent 
    valid_examples = np.array(random.sample(range(valid_window), valid_size//2))
    valid_examples = np.append(valid_examples, 
                               random.sample(range(1000,1000+valid_window), valid_size//2))

    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
    
    # We use the cosine distance:
    norm = tf.sqrt(tf.reduce_sum(tf.square(embedding), 1, keep_dims=True))
    normalized_embedding = embedding / norm
    valid_embedding = tf.nn.embedding_lookup(normalized_embedding, valid_dataset)
    similarity = tf.matmul(valid_embedding, tf.transpose(normalized_embedding))

Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [20]:
# If the checkpoints directory doesn't exist:
!mkdir checkpoints

mkdir: cannot create directory ‘checkpoints’: File exists


In [21]:
%time
epochs = 40
batch_size = 1000
window_size = 3

with train_graph.as_default():
    saver = tf.train.Saver()

with tf.Session(graph=train_graph) as sess:
    iteration = 1
    loss = 0
    sess.run(tf.global_variables_initializer())

    for e in range(1, epochs+1):
        batches = get_batches(train_words, batch_size, window_size)
        start = time.time()
        for x, y in batches:
            
            feed = {inputs: x,
                    labels: np.array(y)[:, None]}
            train_loss, _ = sess.run([cost, optimizer], feed_dict=feed)
            
            loss += train_loss
            
            if iteration % 100 == 0: 
                end = time.time()
                print("Epoch {}/{}".format(e, epochs),
                      "Iteration: {}".format(iteration),
                      "Avg. Training loss: {:.4f}".format(loss/100),
                      "{:.4f} sec/batch".format((end-start)/100))
                loss = 0
                start = time.time()
            
            if iteration % 1000 == 0:
                # note that this is expensive (~20% slowdown if computed every 500 steps)
                sim = similarity.eval()
                for i in range(valid_size):
                    valid_word = int_to_vocab[valid_examples[i]]
                    top_k = 8 # number of nearest neighbors
                    nearest = (-sim[i, :]).argsort()[1:top_k+1]
                    log = 'Nearest to %s:' % valid_word
                    for k in range(top_k):
                        close_word = int_to_vocab[nearest[k]]
                        log = '%s %s,' % (log, close_word)
                    print(log)
            
            iteration += 1
    save_path = saver.save(sess, "checkpoints/text8.ckpt")
    embed_mat = sess.run(normalized_embedding)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 6.68 µs
Epoch 1/40 Iteration: 100 Avg. Training loss: 6.8189 0.4252 sec/batch
Epoch 1/40 Iteration: 200 Avg. Training loss: 6.8099 0.3933 sec/batch
Epoch 1/40 Iteration: 300 Avg. Training loss: 6.4845 0.4103 sec/batch
Epoch 1/40 Iteration: 400 Avg. Training loss: 6.7275 0.3957 sec/batch
Epoch 2/40 Iteration: 500 Avg. Training loss: 6.6563 0.0199 sec/batch
Epoch 2/40 Iteration: 600 Avg. Training loss: 6.1730 0.4567 sec/batch
Epoch 2/40 Iteration: 700 Avg. Training loss: 5.9828 0.4049 sec/batch
Epoch 2/40 Iteration: 800 Avg. Training loss: 5.6009 0.3937 sec/batch
Epoch 2/40 Iteration: 900 Avg. Training loss: 5.6614 0.4037 sec/batch
Epoch 3/40 Iteration: 1000 Avg. Training loss: 5.4919 0.0376 sec/batch
Nearest to أن: الساكنة, تدعوني, لحمنا, فطام, وانتظرونا, اخلاقا, ثدياها, الرب,
Nearest to اليوم: وارتعبت, سفر, اسطاعوا, وقدمت, وربطوا, تقرب, فسال, ذلك,
Nearest to إسرائيل: أسلمته, تؤمنوا, نحث, واللائي, أدركهم, وكل, الملتوي, ليمطر,
Neare

Epoch 10/40 Iteration: 4700 Avg. Training loss: 3.1203 0.4345 sec/batch
Epoch 10/40 Iteration: 4800 Avg. Training loss: 3.0042 0.4399 sec/batch
Epoch 10/40 Iteration: 4900 Avg. Training loss: 3.1709 0.4489 sec/batch
Epoch 11/40 Iteration: 5000 Avg. Training loss: 2.9698 0.2207 sec/batch
Nearest to أن: أشراطها, وانتظرونا, لحمنا, الأعظم, يشفوه, فطام, الساكنة, بنيته,
Nearest to اليوم: وارتعبت, اسطاعوا, سفر, تحلني, وقدمت, خاصتهم, وحجارته, والفأر,
Nearest to إسرائيل: نحث, والمشتاق, أبذلك, ليمطر, أدركهم, واصطنعتك, أسلمته, سيعطى,
Nearest to يقول: الحسنة, عقلك, ولبنيامين, ملتهبة, بأسرع, إمير, وحاصروها, أجعلنا,
Nearest to ولكن: طأطأ, ثورهم, نعطيه, عدت, ومنافع, بكرك, شعائري, أداه,
Nearest to عشر: والعشرين, أعوزكم, وعشر, والثلاثون, المجمرة, لينصرف, مصغ, السادس,
Nearest to إلا: خسرها, فليعطها, اخرجي, والمحروم, بآياتنا, شركاءكم, وأفرح, يعزونها,
Nearest to ومن: الصادق, فجاءت, يدرون, ونستفتحها, لينذر, ويهدي, السرياني, الشهير,
Nearest to ضرب: عسيئيل, والمرتعد, أليعام, فيحكم, كالغوغاء, ويلك, ليحلف, سخر

Nearest to أن: وانتظرونا, أشراطها, فيبني, يشفوه, سئل, الأعظم, لحمنا, رشيد,
Nearest to اليوم: وارتعبت, خاصتهم, وحجارته, تحلني, والفأر, سفر, اسطاعوا, تؤثر,
Nearest to إسرائيل: نحث, وأشتأول, أبذلك, والمشتاق, كناموس, للتسعة, للشبكتين, أدركهم,
Nearest to يقول: ولبنيامين, بأسرع, طرقكم, فأحضرها, استبدلوا, وعدوكم, وشجعه, ومعيشة,
Nearest to ولكن: بكرك, ومنافع, ليبلو, كمبشر, غاوين, ثورهم, اطرحوا, تخطه,
Nearest to عشر: والعشرين, وعشر, قائلالو, والثلاثون, السادس, أعوزكم, الحادي, للشهر,
Nearest to إلا: فليعطها, خسرها, اخرجي, والمحروم, سألتهم, ويمنيهم, منقلبا, وينطقون,
Nearest to ومن: ونستفتحها, الصادق, نصبوه, السرياني, حظياتك, بحسن, فجاءت, ويهدي,
Nearest to ضرب: سخرة, عسيئيل, والمرتعد, أليعام, الماكيريين, ظفر, ويتعد, بالبنين,
Nearest to شعبا: مطروحين, يدعونك, فليقاتل, مبعدون, تمييزا, تكهن, تحاربها, لرؤساء,
Nearest to فكانت: لجند, ورزقكم, الرابعة, الساكنين, داريوس, معتنفا, كار, يشقه,
Nearest to الفصح: فيخدمه, الباكورات, عيد, الفطير, ومشيريك, لعماسا, والمغنون, للبر,
Nearest to احد: عضوا, فيتقدس, المو

Epoch 27/40 Iteration: 13100 Avg. Training loss: 2.1484 0.4282 sec/batch
Epoch 27/40 Iteration: 13200 Avg. Training loss: 1.9835 0.4179 sec/batch
Epoch 27/40 Iteration: 13300 Avg. Training loss: 2.3312 0.4220 sec/batch
Epoch 28/40 Iteration: 13400 Avg. Training loss: 2.1542 0.1514 sec/batch
Epoch 28/40 Iteration: 13500 Avg. Training loss: 2.2070 0.4227 sec/batch
Epoch 28/40 Iteration: 13600 Avg. Training loss: 2.1343 0.4222 sec/batch
Epoch 28/40 Iteration: 13700 Avg. Training loss: 1.9364 0.4303 sec/batch
Epoch 28/40 Iteration: 13800 Avg. Training loss: 2.3699 0.4176 sec/batch
Epoch 29/40 Iteration: 13900 Avg. Training loss: 2.1061 0.1663 sec/batch
Epoch 29/40 Iteration: 14000 Avg. Training loss: 2.2036 0.4248 sec/batch
Nearest to أن: سئل, أشراطها, فيبني, يشفوه, وانتظرونا, كمولود, تحاكموا, لحمنا,
Nearest to اليوم: وارتعبت, والفأر, خاصتهم, تحلني, وحجارته, لتعملها, الاسكندر, تحبلين,
Nearest to إسرائيل: وأشتأول, نحث, إخوتهم, للشبكتين, أبذلك, بعبيد, أدركهم, وقتما,
Nearest to يقول: ولبنيامي

Epoch 35/40 Iteration: 17300 Avg. Training loss: 2.2404 0.4204 sec/batch
Epoch 36/40 Iteration: 17400 Avg. Training loss: 1.8948 0.3174 sec/batch
Epoch 36/40 Iteration: 17500 Avg. Training loss: 2.1270 0.4180 sec/batch
Epoch 36/40 Iteration: 17600 Avg. Training loss: 1.9047 0.4150 sec/batch
Epoch 36/40 Iteration: 17700 Avg. Training loss: 1.9446 0.4271 sec/batch
Epoch 36/40 Iteration: 17800 Avg. Training loss: 2.1847 0.4249 sec/batch
Epoch 37/40 Iteration: 17900 Avg. Training loss: 1.9033 0.3367 sec/batch
Epoch 37/40 Iteration: 18000 Avg. Training loss: 2.1027 0.4236 sec/batch
Nearest to أن: فيبني, يشفوه, سئل, أشراطها, أعني, كمولود, ثوداس, رشيد,
Nearest to اليوم: الاسكندر, صبي, والفأر, لتعملها, وارتعبت, وحجارته, تحبلين, تحلني,
Nearest to إسرائيل: بعبيد, نحث, وأشتأول, إخوتهم, أبذلك, للشبكتين, وتراءيا, أدركهم,
Nearest to يقول: ولبنيامين, زبوب, أشفيك, أعرفكن, بأسرع, طرقكم, فأحضرها, نصيبها,
Nearest to ولكن: بكرك, وطب, يستحقني, اطرحوا, ينموا, برأي, ليبلو, كمبشر,
Nearest to عشر: تلميذا, وطال

In [22]:
with train_graph.as_default():
    saver = tf.train.Saver()

with tf.Session(graph=train_graph) as sess:
    saver.restore(sess, tf.train.latest_checkpoint('checkpoints'))
    embed_mat = sess.run(embedding)

INFO:tensorflow:Restoring parameters from checkpoints/text8.ckpt


In [23]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from bokeh.plotting import figure, show, output_file
from bokeh.models import ColumnDataSource, Range1d, LabelSet, Label

In [24]:
%time
viz_words = 10000
tsne = TSNE()
embed_tsne = tsne.fit_transform(embed_mat[:viz_words, :])

output_notebook()
desc = []
for idx in range(viz_words):
    desc.append(int_to_vocab[idx])
    
list_x = list(embed_tsne[0:viz_words, 0])
list_y = list(embed_tsne[0:viz_words, 1])

source = ColumnDataSource(data=dict(x=list_x, y=list_y, desc=desc)) 
hover = HoverTool(tooltips=[
    ('desc', '@desc'),
])
mapper = LinearColorMapper(palette=plasma(256), low=min(list_y), high=max(list_y))

p = figure(plot_width=1000, plot_height=1000, tools=[hover,"pan,wheel_zoom,box_zoom,reset,resize"], 
           title="Word2Vec")
p.circle('x', 'y', size=10, source=source)

labels = LabelSet(x='x', y='y', text='desc', level='glyph',
              x_offset=5, y_offset=5, source=source, render_mode='canvas')

p.add_layout(labels)
show(p)


CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 22.4 µs
