In [1]:
import numpy as np
import torch
import scipy

from numpy.linalg import norm
import multiprocessing as mp
from functools import partial
import copy


def cos_dis(u,v):
    """Return the cosine distance between the vectors ``u`` and ``v``.

    The distance is one minus the dot product of the two vectors divided by
    the product of their norms (``numpy.linalg.norm``).
    """
    similarity = np.dot(u, v) / (norm(u) * norm(v))
    return 1.0 - similarity
                
def analogy(question_got, emb): 
    """Check whether ``emb`` answers a single analogy question correctly.

    The question text is split into words with ``sentence_to_wordlist``. The
    first three words build the query vector
    ``emb[word3] - emb[word1] + emb[word2]``; the fourth word is the expected
    answer. Every vocabulary word except the three given ones is scored with
    ``cos_dis`` against the query, and the closest one is the prediction.

    Parameters
    ----------
    question_got : str
        Raw text of one question (at least four words after splitting).
    emb : 2-D array-like
        Indexed as ``emb[i, :]``; row ``i`` is used as the vector of
        ``list_of_words[i]``.

    Returns
    -------
    bool
        True when the predicted word equals the fourth word of the question.

    Raises
    ------
    IndexError
        If the question has fewer than four words.
    ValueError
        If one of the first three words is not in ``list_of_words``.
    """
    question = sentence_to_wordlist(question_got)
    country1_emb = emb[list_of_words.index(question[0]), :]
    capital1_emb = emb[list_of_words.index(question[1]), :]
    country2_emb = emb[list_of_words.index(question[2]), :]
    target_emb = country2_emb - country1_emb + capital1_emb

    # The three words that form the query are never valid answers.
    excluded = question[:3]

    closest = -1
    closest_value = float("inf")

    # enumerate() yields the row index directly, which avoids a linear
    # list.index() lookup per candidate. It matches list.index() because
    # list_of_words holds unique words (it is built from a set).
    for idx, curr_word in enumerate(list_of_words):
        if curr_word in excluded:
            continue
        distance = cos_dis(emb[idx, :], target_emb)
        # Strict '<' keeps the first of several equally close candidates.
        if distance < closest_value:
            closest_value = distance
            closest = idx

    return (list_of_words[closest] == question[3])


def test_emb(questions, embedding):
    
    CPUs = mp.cpu_count()
    p = mp.Pool(CPUs)
    
    number_of_questions = len(list_of_questions)
        
    #compute in parallel
    analogy_emb=partial(analogy, emb=embedding)
    results = p.map(analogy_emb, questions)   

    return sum(results)/number_of_questions

def save_emb(V):
    """Write the embedding ``V`` to ``embedding.json`` as a JSON object.

    The object maps ``list_of_words[i]`` to row ``i`` of ``V`` (converted with
    ``.tolist()``) for ``i`` in ``range(number_of_words)``. An existing file
    with that name is overwritten.

    Parameters
    ----------
    V : 2-D array-like
        Indexed as ``V[i, :]``; each row must provide ``.tolist()``.
    """
    import json

    # Local names avoid shadowing the builtin ``dict`` and the ``json`` module.
    word_vectors = {
        list_of_words[i]: V[i, :].tolist() for i in range(number_of_words)
    }
    payload = json.dumps(word_vectors)
    # ``with`` closes the file even if the write raises.
    with open("embedding.json","w") as f:
        f.write(payload)

def sparse_scipy_to_sparse_torch(S):
    """Convert a SciPy sparse matrix into a ``torch`` sparse COO tensor.

    The matrix is first converted to COO format so that the row indices,
    column indices and values are read from the same representation and are
    guaranteed to have equal length (explicitly stored zeros are kept as
    stored entries rather than causing a length mismatch).

    Parameters
    ----------
    S : scipy sparse matrix
        Any SciPy sparse object providing ``tocoo()``.

    Returns
    -------
    torch.Tensor
        Sparse COO tensor of dtype ``torch.float64`` with the shape of ``S``.
    """
    coo = S.tocoo()
    # Stack into one (2, nnz) array first: building a tensor from a Python
    # list of ndarrays is slow. Sparse indices are stored as int64.
    indices = torch.tensor(np.vstack((coo.row, coo.col)), dtype=torch.long)
    # torch.tensor copies, so the result does not alias S's buffers.
    values = torch.tensor(coo.data)
    n_rows, n_cols = coo.shape
    return torch.sparse_coo_tensor(indices = indices, values = values, size=[int(n_rows), int(n_cols)], dtype=torch.float64)


In [2]:
#lower its rank

def condition(V, target):
    """Tell whether two neighbouring singular values of ``V`` are within a factor of 10.

    Takes the singular values of ``V`` from ``torch.svd`` and compares the
    ratio of entry ``target - 1`` to entry ``target`` against 10.

    Returns a 0-dim boolean tensor that is true when the ratio is below 10.
    """
    # compute_uv=False: only the singular values are used here.
    _, singular_values, _ = torch.svd(V, compute_uv=False)
    gap_ratio = singular_values[target-1] / singular_values[target]
    return gap_ratio < 10

def mix(V, steps, sigma, rank=None, cooc=None):
    """Run ``steps`` normalised descent steps on ``V`` with a singular-vector penalty.

    Each step combines two directions, each scaled to unit norm:

    * ``C = 4 * (V V^T V - cooc V)``;
    * ``PenGrad``, the outer product of the left and right singular vectors
      of ``V`` taken at column index ``rank``, weighted by ``sigma``.

    The combined direction is scaled to unit norm again and subtracted
    from ``V``.

    Parameters
    ----------
    V : torch.Tensor
        2-D CPU tensor (it is converted with ``.numpy()`` internally).
    steps : int
        Number of update steps; ``0`` returns ``V`` unchanged.
    sigma : float
        Weight of the penalty direction.
    rank : int, optional
        Column index of the singular vectors used for the penalty.
        Defaults to the notebook-level ``r``.
    cooc : torch.Tensor, optional
        Matrix multiplied with ``V`` via ``torch.mm``.
        Defaults to the notebook-level ``Cooc_t``.

    Returns
    -------
    torch.Tensor
        The updated matrix, same shape as ``V``.
    """
    # Fall back to the notebook globals so existing three-argument calls
    # behave as before.
    if rank is None:
        rank = r
    if cooc is None:
        cooc = Cooc_t

    for _ in range(steps):

        # Only the singular vectors are needed; the values are not used.
        U1, _, U2 = torch.svd(V)

        u_r = np.c_[U1[:, rank].numpy()]   # left singular vector as a column
        v_r = np.r_[[U2[:, rank].numpy()]] # right singular vector as a row
        PenGrad = torch.from_numpy(np.dot(u_r, v_r))

        first_term = torch.mm(V, torch.mm(V.T, V))
        C = 4*(first_term - torch.mm(cooc, V))

        gradient = C/np.linalg.norm(C) + sigma*PenGrad/np.linalg.norm(PenGrad)
        # Plain Python float keeps the product below a torch tensor
        # regardless of how NumPy scalars and tensors interoperate.
        step = 1.0/float(np.linalg.norm(gradient))
        V = V - step*(gradient)
    return V

#Entropy Penalised Word Embeddings
def EPWE(V, steps, sigma, gamma, target_rank):
    """Repeat ``mix`` on ``V`` until ``condition(V, target_rank)`` holds.

    After every round of ``mix`` the penalty weight ``sigma`` is multiplied
    by ``gamma``. The matrix for which ``condition`` first returns a true
    value is returned; if it holds for the input, ``V`` is returned as is.

    NOTE(review): the loop stops as soon as the singular-value ratio tested
    by ``condition`` is *below* 10 -- confirm this is the intended stopping
    direction. There is no iteration cap.
    """
    while True:
        if condition(V, target_rank):
            break
        V = mix(V, steps, sigma)
        sigma = sigma*gamma

    return V
    

def cut(V, r):
    """Return ``V`` rebuilt from only its first ``r`` singular triplets.

    The matrix is decomposed with ``np.linalg.svd`` (which returns singular
    values in descending order) and all singular values from index ``r`` on
    are dropped, giving a matrix of rank at most ``r`` with the shape of
    ``V``. Works for tall and wide inputs alike.

    Parameters
    ----------
    V : 2-D array-like
        Anything ``np.asarray`` accepts.
    r : int
        Number of singular values to keep. Values <= 0 give an all-zero
        matrix; values above ``min(V.shape)`` keep everything.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``V``.
    """
    A = np.asarray(V)
    # Thin SVD: L is (m, k), R is (k, n) with k = min(m, n). This avoids the
    # m x m factor of a full SVD and the shape mismatch it causes when m < n.
    L, d, R = np.linalg.svd(A, full_matrices=False)
    keep = max(0, min(int(r), len(d)))
    # Scale the kept left vectors by their singular values, then project back.
    return np.dot(L[:, :keep] * d[:keep], R[:keep, :])

In [3]:
from nltk.tokenize import sent_tokenize
import re

def sentence_to_wordlist(raw):
    """Split ``raw`` into a list of words made of ASCII letters only.

    Every character other than ``a-z`` / ``A-Z`` is replaced by a space and
    the result is split on whitespace, so digits and punctuation act as
    separators.
    """
    return re.sub("[^a-zA-Z]"," ",raw).split()

#choose question set
# Read both question files; the context managers close the handles as soon
# as the text has been read.
with open('Analogy/google-sem.txt','r') as analogy_sem:
    questions_sem = analogy_sem.read()
with open('Analogy/google-syn.txt','r') as analogy_syn:
    questions_syn = analogy_syn.read()

#extract all words in analogy questions
questions = questions_sem + questions_syn
questions = questions.lower()
questions_sem = questions_sem.lower()
questions_syn = questions_syn.lower()

list_of_questions = sent_tokenize(questions)
list_of_questions_sem = sent_tokenize(questions_sem)
list_of_questions_syn = sent_tokenize(questions_syn)

number_of_questions = len(list_of_questions)

# Sorting makes the word -> row-index mapping identical on every run; the
# iteration order of a set of strings is not stable across interpreter sessions.
list_of_words = sorted(set(sentence_to_wordlist(questions)))
number_of_words = len(list_of_words)
print("Questions number: ", number_of_questions, "; Words number: ", number_of_words)

Questions number:  19544 ; Words number:  905


In [4]:
# Width of the vectors stored in the embedding file loaded below.
dimension = 50

import json
# Parse the embedding file straight from the open handle.
with open('Embeddings/glove-50.json', 'r') as data_file:
    glove_emb = json.load(data_file)

# Target rank used by the rank-reduction steps.
r = 30

In [5]:
#form a corresponding submatrix of corpus representations
# Row i of W holds the vector of list_of_words[i]. enumerate() supplies the
# row index directly instead of a linear list.index() lookup per word
# (list_of_words has no duplicates, so both give the same index).
W = np.zeros((number_of_words, dimension))
for row, word in enumerate(list_of_words):
    W[row, :] = glove_emb[word]

#reconstruct empirical cooc
# Pairwise dot products of the rows of W, computed in one matrix product.
# Only the strictly lower triangle is kept and mirrored, so the result is
# exactly symmetric and the diagonal stays zero.
# NOTE(review): the diagonal is left at zero -- confirm that self
# co-occurrence is meant to be excluded.
Cooc_lower = np.tril(np.dot(W, W.T), k=-1)
Cooc = Cooc_lower + Cooc_lower.T


Cooc_sp = (scipy.sparse.coo_matrix(Cooc))
Cooc_t = sparse_scipy_to_sparse_torch(Cooc_sp)

In [7]:
#find the best sigma by random search
# Fixed seed so the sampled sigmas (and hence the selected one) are the same
# on every Restart & Run All.
np.random.seed(0)

V = torch.from_numpy(W)

steps = 100
n_trials = 20   # number of random sigma candidates
n_eval = 300    # number of semantic questions used to score a candidate
eval_questions = list_of_questions_sem[:n_eval]

# Baseline: sigma = 1.
# NOTE(review): the baseline is scored on the un-truncated mix() output while
# every candidate below is scored after cut(..., r) -- confirm that this
# asymmetry is intended.
sigma_best = 1
V_best = mix(V, steps, sigma_best)
# The factor len(list_of_questions_sem)/n_eval multiplies every score by the
# same positive constant, so it does not affect which candidate wins.
bound = test_emb(eval_questions, V_best)*len(list_of_questions_sem)/n_eval

for i in range(n_trials):
    sigma_curr = np.random.uniform(0.1, 10000)#sigma_best+1
    V_curr = mix(V, steps, sigma_curr)
    curr_bound = test_emb(eval_questions, cut(V_curr, r))*len(list_of_questions_sem)/n_eval
    
    # Keep the candidate only when it strictly beats the best score so far.
    if (curr_bound > bound):
        sigma_best = sigma_curr
        bound = curr_bound
        V_best = V_curr
                
print("best sigma: ", sigma_best)        

best sigma:  1


In [8]:
# The rank-r truncation of W does not depend on the question set, so it is
# computed once and reused for both reports.
W_pca = cut(W, r)

# Same three embeddings evaluated on each question set, in the same order.
for header, question_set in (
    ("Results for Semantic questions:", list_of_questions_sem),
    ("Results for Syntactic questions:", list_of_questions_syn),
):
    print(header)

    print("EP emb: ", test_emb(question_set, V_best))
    print("PCA emb: ", test_emb(question_set, W_pca))
    print("Original emb: ", test_emb(question_set, W))




Results for Semantic questions:
EP emb:  0.23045435939418749
PCA emb:  0.21561604584527222
Original emb:  0.23239869013507983
Results for Syntactic questions:
EP emb:  0.35468686041751946
PCA emb:  0.3286430618092509
Original emb:  0.35934302087597214


In [None]:
#compare with a random projected embedding
import math

def RandProj(start_dim, target_dim):
    """Draw a random ``(start_dim, target_dim)`` projection matrix.

    Entries are sampled with ``np.random.normal(0, 1, ...)`` and the whole
    matrix is divided by ``sqrt(target_dim)``.
    """
    scale = math.sqrt(target_dim)
    gaussian = np.random.normal(0, 1, (start_dim, target_dim))
    return gaussian/scale

def RandProjEmb(V, target_dim):
    """Project the rows of ``V`` to ``target_dim`` columns with a random matrix.

    The projection matrix comes from ``RandProj``; its row count is the
    length of the first row of ``V``.
    """
    source_dim = len(V[0, :])
    projection = RandProj(source_dim, target_dim)
    return np.dot(V, projection)


# Project to the same dimension as the target rank ``r`` so the baseline
# keeps matching the rank-reduced embeddings if ``r`` is changed.
ProjEmb = RandProjEmb(W, r)

print("Results for Semantic questions:")
print("Random Projected Embedding performance: ", test_emb(list_of_questions_sem, ProjEmb))
