# 0. Importing

In [44]:
import pdb
import pickle
import string

import time

import gensim
import matplotlib.pyplot as plt
import nltk
import numpy as np
import scipy
import sklearn
from gensim.models import KeyedVectors
from nltk.corpus import stopwords, twitter_samples
from nltk.tokenize import TweetTokenizer

from utils import (cosine_similarity, get_dict,
                   process_tweet)
from os import getcwd

In [45]:
filePath = f"{getcwd()}/../tmp2/"
nltk.data.path.append(filePath)

# 1. The word embeddings data for English and French words

## Data

In [46]:
en_embeddings_subset = pickle.load(open("en_embeddings.p", "rb"))
fr_embeddings_subset = pickle.load(open("fr_embeddings.p", "rb"))

In [47]:
en_fr_train = get_dict('en-fr.train.txt')
en_fr_test = get_dict('en-fr.test.txt')

## 1.1 Generate embedding and transform matrices

In [48]:
def get_matrices (en_fr, french_vecs, english_vecs):

    # list for x and y for en and fr emb
    X_l = list()
    Y_l = list()

    # extract the words 
    english_set = english_vecs.keys()
    french_set = french_vecs.keys()

    # the translation of the english words (french words)
    #french_words = set(en_fr.values())

    for en_word, fr_word in en_fr.items():

        if fr_word in french_set and en_word in english_set:

            en_vec = english_vecs[en_word]
            fr_vec = french_vecs[fr_word]

            X_l.append(en_vec)
            Y_l.append(fr_vec)
    
    X =  np.vstack(X_l)
    Y = np.vstack(Y_l)

    return X, Y

In [49]:
X_train, Y_train = get_matrices(en_fr_train, fr_embeddings_subset, en_embeddings_subset)

# 2. Translations

## 2.1 Translation as linear transformation of embeddings

step 1 : compute the loss

In [50]:
def compute_loss(X, Y, R):

    m = X.shape[0]

    diff = np.dot(X,R)-Y
    diff_squared = diff**2
    sum_diff_squared = np.sum(diff_squared)

    loss = sum_diff_squared/m

    return loss

Step 2: Computing the gradient of loss in respect to transform matrix R

In [51]:
def compute_gradient (X, Y, R):

    m = X.shape[0]

    gradient = np.dot(X.transpose(),np.dot(X,R)-Y)*(2/m)

    return gradient

Step 3: Finding the optimal R with gradient descent algorithm

In [52]:
def align_embeddings(X, Y, train_steps=100, learning_rate=0.0003):

    np.random.seed(129)

    R = np.random.rand(X.shape[1], X.shape[1])

    for i in range(train_steps):
        if i % 25 == 0:
            print(f"loss at iteration {i} is: {compute_loss(X, Y, R):.4f}")
        
        gradient = compute_gradient(X,Y,R)

        R -=  learning_rate * gradient
        
    return R

Calculate transformation matrix R

In [53]:
R_train = align_embeddings(X_train, Y_train, train_steps=400, learning_rate=0.8)

loss at iteration 0 is: 963.0146
loss at iteration 25 is: 97.8292
loss at iteration 50 is: 26.8329
loss at iteration 75 is: 9.7893
loss at iteration 100 is: 4.3776
loss at iteration 125 is: 2.3281
loss at iteration 150 is: 1.4480
loss at iteration 175 is: 1.0338
loss at iteration 200 is: 0.8251
loss at iteration 225 is: 0.7145
loss at iteration 250 is: 0.6534
loss at iteration 275 is: 0.6185
loss at iteration 300 is: 0.5981
loss at iteration 325 is: 0.5858
loss at iteration 350 is: 0.5782
loss at iteration 375 is: 0.5735


## 2.2 Testing the translation

k-Nearest neighbors algorithm

In [54]:
def nearest_neighbor(v, candidates, k=1):

    similarity_l = []

    
    for row in candidates:
        
        cos_similarity = cosine_similarity(v,row)

        
        similarity_l.append(cos_similarity)
        
    
    sorted_ids = np.argsort(similarity_l)

    # get the indices of the k most similar candidate vectors
    k_idx = sorted_ids[-k:]
    
    return k_idx

Test your translation and compute its accuracy

In [55]:
def test_vocabulary(X, Y, R):
    
    pred = np.dot(X,R)

    num_correct = 0

    for i in range(len(pred)):
        
        pred_idx = nearest_neighbor(pred[i],Y)

        # if the index of the nearest neighbor equals the row of i... \
        if pred_idx == i:
            
            num_correct += 1

    accuracy = num_correct / len(pred)

    return accuracy

In [56]:
X_val, Y_val = get_matrices(en_fr_test, fr_embeddings_subset, en_embeddings_subset)

In [57]:
acc = test_vocabulary(X_val, Y_val, R_train) 
print(f"accuracy on test set is {acc:.3f}")

accuracy on test set is 0.557


# 3. LSH and document search

## 3.1 Data

In [58]:
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')
all_tweets = all_positive_tweets + all_negative_tweets

## 3.2 Getting the document embeddings

Document embeddings

In [65]:
def get_document_embedding(tweet, en_embeddings): 
    
    doc_embedding = np.zeros(300)

    processed_doc = process_tweet(tweet)
    for word in processed_doc:

        doc_embedding+=en_embeddings.get(word,0)
    return doc_embedding

Store all document vectors into a dictionary

In [67]:
def get_document_vecs(all_docs, en_embeddings):
    
    ind2Doc_dict = {}
    document_vec_l = []

    for i, doc in enumerate(all_docs):

        doc_embedding = get_document_embedding(doc,en_embeddings)

        ind2Doc_dict[i] = doc_embedding
        document_vec_l.append(doc_embedding)


    document_vec_matrix = np.vstack(document_vec_l)

    return document_vec_matrix, ind2Doc_dict

In [68]:
document_vecs, ind2Tweet = get_document_vecs(all_tweets, en_embeddings_subset)

## 3.3 Looking up the tweets.

In [69]:
my_tweet = 'i am sad'
process_tweet(my_tweet)
tweet_embedding = get_document_embedding(my_tweet, en_embeddings_subset)

In [70]:
idx = np.argmax(cosine_similarity(document_vecs, tweet_embedding))
print(all_tweets[idx])

@hanbined sad pray for me :(((


## 3.4 Finding the most similar tweets with LSH

In [75]:
N_VECS = len(all_tweets)    
N_DIMS = len(ind2Tweet[1])
N_PLANES = 10
N_UNIVERSES = 25

## 3.5 Getting the hash number for a vector

In [76]:
np.random.seed(0)
planes_l = [np.random.normal(size=(N_DIMS, N_PLANES))
            for _ in range(N_UNIVERSES)]

In [72]:
def hash_value_of_vector(v, planes):

    dot_product = np.dot(v,planes)
    sign_of_dot_product = np.sign(dot_product)

    h = sign_of_dot_product>=0
    h = np.squeeze(h)

    hash_value = 0
    n_planes = planes.shape[1]

    for i in range(n_planes):
        
        hash_value += np.power(2,i)*h[i]

    hash_value = int(hash_value)

    return hash_value

## 3.6 Creating a hash table

In [78]:
def make_hash_table(vecs, planes):

    num_of_planes = planes.shape[1]
    num_buckets = 2**num_of_planes

    hash_table = {i:[] for i in range(num_buckets)}
    id_table = {i:[] for i in range(num_buckets)}

    for i, v in enumerate(vecs):
        
        h = hash_value_of_vector(v,planes)

        hash_table[h].append(v)
        id_table[h].append(i)

    return hash_table, id_table

In [85]:
planes = planes_l[0]  # get one 'universe' of planes to test the function
tmp_hash_table, tmp_id_table = make_hash_table(document_vecs, planes)


## 3.7 Creating all hash tables


In [115]:
hash_tables = []
id_tables = []
for universe_id in range(N_UNIVERSES): 
    print('working on hash universe #:', universe_id)
    planes = planes_l[universe_id]
    hash_table, id_table = make_hash_table(document_vecs, planes)
    hash_tables.append(hash_table)
    id_tables.append(id_table)

working on hash universe #: 0
working on hash universe #: 1
working on hash universe #: 2
working on hash universe #: 3
working on hash universe #: 4
working on hash universe #: 5
working on hash universe #: 6
working on hash universe #: 7
working on hash universe #: 8
working on hash universe #: 9
working on hash universe #: 10
working on hash universe #: 11
working on hash universe #: 12
working on hash universe #: 13
working on hash universe #: 14
working on hash universe #: 15
working on hash universe #: 16
working on hash universe #: 17
working on hash universe #: 18
working on hash universe #: 19
working on hash universe #: 20
working on hash universe #: 21
working on hash universe #: 22
working on hash universe #: 23
working on hash universe #: 24


## 3.8 Approximate K-NN

In [127]:
def approximate_knn(doc_id, v, planes_l, k=1, num_universes_to_use=N_UNIVERSES):

    assert num_universes_to_use <= N_UNIVERSES

    vecs_to_consider_l = list()
    ids_to_consider_l = list()
    ids_to_consider_set = set()

    for universe_id in range(num_universes_to_use):

        planes = planes_l[universe_id]

        hash_value = hash_value_of_vector(v, planes)
        

        # bring from the hash table the vectors that its hash value is
        # the hash value of the vector v and plans of the universe (universe_id)
        hash_table = hash_tables[universe_id]
        document_vectors_l = hash_table[hash_value]

        # bring from the id tables the id of the vector that its hash vlaue
        # is the hash value of the vector v and plans of the universe (universe_id)
        id_table = id_tables[universe_id]
        new_ids_to_consider = id_table[hash_value]


        if doc_id in new_ids_to_consider:
            new_ids_to_consider.remove(doc_id)
            print(f"removed doc_id {doc_id} of input vector from new_ids_to_search")

        
        for i, new_id in enumerate(new_ids_to_consider):

           
            if new_id not in ids_to_consider_set:
                
                document_vector_at_i = document_vectors_l[i]

                vecs_to_consider_l.append(document_vector_at_i)
                ids_to_consider_l.append(new_id)

                ids_to_consider_set.add(new_id)

    print("Fast considering %d vecs" % len(vecs_to_consider_l))

    vecs_to_consider_arr = np.array(vecs_to_consider_l)

    nearest_neighbor_idx_l = nearest_neighbor(v, vecs_to_consider_arr, k=k)
    print(nearest_neighbor_idx_l)
    print(ids_to_consider_l)
  
    nearest_neighbor_ids = [ids_to_consider_l[idx]
                            for idx in nearest_neighbor_idx_l]

    return nearest_neighbor_ids


In [128]:
doc_id = 0
doc_to_search = all_tweets[doc_id]
vec_to_search = document_vecs[doc_id]

In [129]:
nearest_neighbor_ids = approximate_knn(doc_id, vec_to_search, planes_l, k=3, num_universes_to_use=5)


removed doc_id 0 of input vector from new_ids_to_search
removed doc_id 0 of input vector from new_ids_to_search
removed doc_id 0 of input vector from new_ids_to_search
removed doc_id 0 of input vector from new_ids_to_search
removed doc_id 0 of input vector from new_ids_to_search
Fast considering 77 vecs
[26  8  0]
[51, 105, 154, 160, 195, 253, 1876, 2478, 701, 1205, 1300, 1581, 1681, 1685, 2714, 4149, 4157, 4232, 4753, 5684, 6821, 9239, 213, 339, 520, 1729, 2140, 2786, 3028, 3162, 3259, 3654, 4002, 4047, 5263, 5492, 5538, 5649, 5656, 5729, 7076, 9063, 9207, 9789, 9927, 207, 254, 1302, 1480, 1815, 2298, 2620, 2741, 3525, 3837, 4704, 4871, 5327, 5386, 5923, 6033, 6371, 6762, 7288, 7472, 7774, 7790, 7947, 8061, 8224, 8276, 8892, 9096, 9153, 9175, 9323, 9740]


In [130]:
print(f"Nearest neighbors for document {doc_id}")
print(f"Document contents: {doc_to_search}")
print("")

for neighbor_id in nearest_neighbor_ids:
    print(f"Nearest neighbor at document id {neighbor_id}")
    print(f"document contents: {all_tweets[neighbor_id]}")

Nearest neighbors for document 0
Document contents: #FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)

Nearest neighbor at document id 2140
document contents: @PopsRamjet come one, every now and then is not so bad :)
Nearest neighbor at document id 701
document contents: With the top cutie of Bohol :) https://t.co/Jh7F6U46UB
Nearest neighbor at document id 51
document contents: #FollowFriday @France_Espana @reglisse_menthe @CCI_inter for being top engaged members in my community this week :)
