# Exercise 3: Advanced Information Retrieval Solutions

##  Question 1 - Latent Semantic Indexing

###  Question 1.a

In [1]:
# import Python matrix operations library
import numpy as np

# Term-document matrix M with the values given in the exercise
# (11 rows, 4 columns; the columns are used as documents d1..d4 below).
M = np.array([
    [1, 1, 1, 1],
    [0, 1, 1, 1],
    [1, 0, 0, 0],
    [0, 1, 0, 0],
    [1, 0, 0, 0],
    [1, 0, 1, 2],
    [1, 1, 1, 1],
    [1, 1, 1, 0],
    [1, 0, 0, 0],
    [0, 2, 1, 1],
    [0, 1, 1, 0],
])

# Reduced SVD: M = K . diag(S) . Dt
K, S, Dt = np.linalg.svd(M, full_matrices=False)

# Display K. Note: the values may differ from the reference output by a
# sign (+ or -), since singular vectors are only determined up to sign.
K

array([[-0.41291701, -0.12294407,  0.05933248, -0.03660797],
       [-0.3359611 ,  0.1962311 , -0.25246121,  0.11968319],
       [-0.07695592, -0.31917516,  0.31179369, -0.15629115],
       [-0.11909604,  0.2663899 ,  0.20432237, -0.52093504],
       [-0.07695592, -0.31917516,  0.31179369, -0.15629115],
       [-0.39922386, -0.49767812, -0.57172873,  0.04465203],
       [-0.41291701, -0.12294407,  0.05933248, -0.03660797],
       [-0.30751414, -0.01459992,  0.48607132,  0.40306708],
       [-0.07695592, -0.31917516,  0.31179369, -0.15629115],
       [-0.45505713,  0.462621  , -0.04813884, -0.40125186],
       [-0.23055822,  0.30457524,  0.17427762,  0.55935823]])

In [2]:
# Display S, the second value returned by np.linalg.svd(M) (the singular values).
S

array([4.78695453, 2.31848919, 1.762346  , 0.77705263])

In [3]:
# Display Dt, the third value returned by np.linalg.svd(M); its columns are
# used later as the document representations.
Dt

array([[-0.36838448, -0.57010731, -0.53356439, -0.50455879],
       [-0.74000417,  0.61762211,  0.0885323 , -0.25119473],
       [ 0.54948837,  0.36008671, -0.05294924, -0.75206148],
       [-0.12144645, -0.40479395,  0.83944473, -0.34165065]])




###  Question 1.b

In [4]:
# Rank-2 truncation: keep the first two singular values together with the
# matching columns of K and rows of Dt.
K_sel = K[:, :2]
S_sel = np.diag(S[:2])
Dt_sel = Dt[:2]

###  Question 1.c

In [5]:
# Query vector, one entry per row (term) of M.
q = np.array([0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1])

# Map the query q onto the document space D: q* = q^T . (K_sel . S_sel^-1)
mapper = K_sel.dot(np.linalg.inv(S_sel))
q_trans = q.dot(mapper)

# Inspect the transformed query.
q_trans

array([-0.22662409,  0.11624731])

###  Question 1.d

In [6]:
# compute cosine similarity.
    
import math

# Function for computing cosine similarity.
def cosine_similarity(v1, v2):
    """Return the cosine of the angle between vectors v1 and v2.

    Parameters:
        v1, v2: indexable sequences of numbers. The first len(v1)
            components of v2 are paired with the components of v1.

    Returns:
        dot(v1, v2) / sqrt(|v1|^2 * |v2|^2), or 0.0 when either vector has
        zero norm (including empty input), where the cosine is undefined.

    Raises:
        IndexError: if v2 has fewer components than v1.
    """
    sumxx, sumxy, sumyy = 0, 0, 0
    for i, x in enumerate(v1):
        y = v2[i]
        sumxx += x * x
        sumyy += y * y
        sumxy += x * y
    norm = math.sqrt(sumxx * sumyy)
    # Guard against dividing by zero for all-zero or empty vectors.
    if norm == 0:
        return 0.0
    return sumxy * 1.0 / norm


# Column j of Dt_sel is the representation of document j+1 in the new
# concept space.
d1, d2, d3, d4 = (Dt_sel[:, j] for j in range(4))

# Cosine similarity of each document with the transformed query.
sim_d1, sim_d2, sim_d3, sim_d4 = (
    cosine_similarity(doc, q_trans) for doc in (d1, d2, d3, d4)
)

# Print the similarities.
print("d1: {}  d2: {}  d3: {} d4 {} \n".format(sim_d1, sim_d2, sim_d3, sim_d4))



d1: -0.012057913278690218  d2: 0.9388827727147443  d3: 0.9524776244205609 d4 0.5931086268074783 




#### Ordering of documents:  D3 > D2 > D4 > D1




###  Question 1.e

The ordering of the remaining documents does not change even if d3 is dropped. Recall that all the documents in the term-document matrix can be considered as vectors in the $m$-dimensional vector space $\mathbb{R}^m$, where $m$ is the number of terms. Since d3 has a similar magnitude and direction to d4 and d2, dropping d3 does not substantially alter the term space (K) or the document space (D) of the SVD.



In [7]:
# Term-document matrix restricted to three documents (used as d1, d2, d4).
Mn = np.array([
    [1, 1, 1],
    [0, 1, 1],
    [1, 0, 0],
    [0, 1, 0],
    [1, 0, 0],
    [1, 0, 2],
    [1, 1, 1],
    [1, 1, 0],
    [1, 0, 0],
    [0, 2, 1],
    [0, 1, 0],
])

# Reduced SVD of the smaller matrix.
K, S, Dt = np.linalg.svd(Mn, full_matrices=False)

# LSI: keep the first two singular values and the matching vectors.
K_sel = K[:, :2]
S_sel = np.diag(S[:2])
Dt_sel = Dt[:2]

# Project the query as q^T . K_sel . S_sel^-1 and read the documents off
# the columns of Dt_sel.
q = np.array([0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1])
q_trans = q.dot(K_sel).dot(np.linalg.inv(S_sel))
d1, d2, d4 = (Dt_sel[:, j] for j in range(3))

# Cosine similarity of each document with the transformed query.
sim_d1, sim_d2, sim_d4 = (
    cosine_similarity(doc, q_trans) for doc in (d1, d2, d4)
)

# Print the similarities.
print("d1: {}  d2: {}  d4 {} \n".format(sim_d1, sim_d2, sim_d4))

d1: 0.10125520472871084  d2: 0.9475215481378308  d4 0.6873076021729543 



To modify the term and document spaces, we should change d3 so that it points in a different direction from the other vectors. For example, d3 = (0, 0, 1, 1, 2, 1, 0, 0, 2, 0, 2) changes the document ordering to d2 > d4 > d1 > d3.

In [8]:
# Term-document matrix whose third column is the modified document
# d3 = (0, 0, 1, 1, 2, 1, 0, 0, 2, 0, 2).
Mn = np.array([
    [1, 1, 0, 1],
    [0, 1, 0, 1],
    [1, 0, 1, 0],
    [0, 1, 1, 0],
    [1, 0, 2, 0],
    [1, 0, 1, 2],
    [1, 1, 0, 1],
    [1, 1, 0, 0],
    [1, 0, 2, 0],
    [0, 2, 0, 1],
    [0, 1, 2, 0],
])

# Reduced SVD of the modified matrix.
K, S, Dt = np.linalg.svd(Mn, full_matrices=False)

# LSI: keep the first two singular values and the matching vectors.
K_sel = K[:, :2]
S_sel = np.diag(S[:2])
Dt_sel = Dt[:2]

# Project the query as q^T . K_sel . S_sel^-1 and read the documents off
# the columns of Dt_sel.
q = np.array([0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1])
q_trans = q.dot(K_sel).dot(np.linalg.inv(S_sel))
d1, d2, d3, d4 = (Dt_sel[:, j] for j in range(4))

# Cosine similarity of each document with the transformed query.
sim_d1, sim_d2, sim_d3, sim_d4 = (
    cosine_similarity(doc, q_trans) for doc in (d1, d2, d3, d4)
)

# Print the similarities.
print("d1: {}  d2: {} d3: {}  d4 {} \n".format(sim_d1, sim_d2, sim_d3, sim_d4))

d1: 0.8574859998903472  d2: 0.9083433060184866 d3: 0.3170877874947522  d4 0.9049563233586899 



### Algebraic Interpretation:  

Recall that the matrix M transforms a unit ball into an ellipsoid, and in LSI we keep only the directions with the strongest distortion. Intuitively, if we linearly combine $d_2$ and $d_4$ with a coefficient of 0.5 each, we obtain a vector that is not very dissimilar from $d_3$ (i.e., the norm is almost the same, and the direction overlaps on many components). Therefore, it is not surprising that (in this specific example) removing $d_3$ did not lead to a different ranking. Bear in mind that, with slightly different numbers, this might no longer be the case.
In a real-world scenario with LSI (i.e., millions of documents), removing just a few documents rarely changes the ranking dramatically, because the documents that remain will, with high probability, contain the same concepts as the removed ones. That is to say, the resulting ellipsoid will not change substantially.


###  Question 1.f

Use the code from the previous exercise:


In [9]:
from nltk.stem import PorterStemmer, WordNetLemmatizer
import nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import string
from nltk.corpus import stopwords
import math
from collections import Counter
# Download the NLTK 'stopwords' package; stopwords.words('english') is used
# below when the vocabulary is built.
nltk.download('stopwords')

# Single Porter stemmer instance used by tokenize().
stemmer = PorterStemmer()

def tokenize(text):
    """Remove punctuation from text, tokenize it and stem every token.

    Returns a single string: the lower-cased, stemmed tokens joined by one
    space each (callers split it again to obtain the token list).
    """
    # Drop every character listed in string.punctuation in one pass.
    cleaned = text.translate(str.maketrans("", "", string.punctuation))
    return " ".join(
        stemmer.stem(token.lower()) for token in nltk.word_tokenize(cleaned)
    )

# Read a list of documents from a file. Each line in the file is a document.
with open("bread.txt") as f:
    content = f.readlines()
original_documents = [x.strip() for x in content]
# tokenize() returns a space-joined string, so split it back into tokens.
documents = [tokenize(d).split() for d in original_documents]

# Create the vocabulary: the distinct stemmed tokens of all documents that
# are not English stop words, sorted so that vector positions are stable.
# The stop-word list is loaded once into a set rather than once per
# vocabulary word, which also makes each membership test O(1).
english_stopwords = set(stopwords.words('english'))
vocabulary = sorted(
    {word for document in documents for word in document} - english_stopwords
)

def idf_values(vocabulary, documents):
    """Return a dict mapping every vocabulary term to its IDF value.

    The IDF of a term is the natural logarithm of N / df, where N is the
    number of documents and df the number of documents containing the term.
    """
    total = len(documents)

    def document_frequency(term):
        # Number of documents in which the term occurs.
        return sum(1 for document in documents if term in document)

    return {
        term: math.log(total / document_frequency(term), math.e)
        for term in vocabulary
    }

def vectorize(document, vocabulary, idf):
    """Return the normalised TF-IDF vector of a document.

    Parameters:
        document: iterable of tokens.
        vocabulary: sequence of terms; component i of the result belongs to
            vocabulary[i].
        idf: mapping from term to its IDF value.

    Returns:
        A list with one value per vocabulary term:
        idf[term] * count(term) / (count of the most frequent token).
        An empty document yields an all-zero vector instead of failing.
    """
    counts = Counter(document)
    if not counts:
        # No tokens at all (e.g. a blank line in the corpus file): there is
        # no maximum count to normalise by.
        return [0.0] * len(vocabulary)
    max_count = max(counts.values())
    return [idf[term] * counts[term] / max_count for term in vocabulary]

# Compute IDF values and vectors
idf = idf_values(vocabulary, documents)
# One vector per document, in the same order as `documents`; component i of
# each vector corresponds to vocabulary[i].
document_vectors = [vectorize(s, vocabulary, idf) for s in documents]

# Display the sorted vocabulary.
vocabulary






[nltk_data] Downloading package stopwords to /home/lucia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['art',
 'bake',
 'best',
 'book',
 'bread',
 'cake',
 'comput',
 'french',
 'london',
 'numer',
 'pastri',
 'pie',
 'quantiti',
 'recip',
 'scientif',
 'smith',
 'without']

In [10]:
# document_vectors has one row per document; transpose it to obtain the
# term-document matrix.
M = np.array(document_vectors).T


# Run LSI, keeping the first three singular values.
K, S, Dt = np.linalg.svd(M, full_matrices=False)
K_sel = K[:, 0:3]
S_sel = np.diag(S)[0:3, 0:3]
Dt_sel = Dt[0:3, :]


# Build the query vector: all zeros except for the query term.
q = np.array([0] * len(vocabulary))

# The query word "baking" appears in the vocabulary as the stem 'bake'.
# Look its position up by name rather than hard-coding the index, so the
# query stays correct if the vocabulary changes.
q[vocabulary.index('bake')] = 1
q_trans = np.dot(np.dot(q, K_sel), np.linalg.inv(S_sel))


# Now extract representations of documents in the new concept space.
d1 = Dt_sel[:, 0]
d2 = Dt_sel[:, 1]
d3 = Dt_sel[:, 2]
d4 = Dt_sel[:, 3]
d5 = Dt_sel[:, 4]


# Compute cosine similarity of every document with the transformed query.
sim_d1 = cosine_similarity(d1, q_trans)
sim_d2 = cosine_similarity(d2, q_trans)
sim_d3 = cosine_similarity(d3, q_trans)
sim_d4 = cosine_similarity(d4, q_trans)
sim_d5 = cosine_similarity(d5, q_trans)


# Print the similarities.
print("d1: {}  d2: {} d3: {}  d4 {} d5 {} \n".format(sim_d1, sim_d2, sim_d3, sim_d4, sim_d5))

d1: 0.9980518678772611  d2: -0.6577609566355869 d3: -0.00232887505296717  d4 0.7231078789682557 d5 -0.6551062911361925 



## Question 2 - Word Embeddings

###  Question 2.a

See the attached images: EPFL_embeddings_1/2/3. Groups of similar terms are marked with red rectangles. Examples include French prepositions, English prepositions, scientists/researchers, and EPFL.


### Question 2.b

The code for finding the closest terms is here:

In [12]:
import sys
import codecs
import numpy as np


def load_embeddings(file_name):
    """Load word vectors from a UTF-8 text embedding file.

    The first line of the file is skipped (presumably the header of a
    fastText .vec file). Every other non-blank line must have the form
    ``<term> <v1> <v2> ...`` with single spaces as separators.

    Parameters:
        file_name: path of the embedding file.

    Returns:
        (wv, vocabulary): wv is the array np.loadtxt builds from the vector
        parts of the lines; vocabulary is a tuple of the terms in file order,
        so vocabulary[i] belongs to wv[i].

    Raises:
        ValueError: if a line has a term but no vector, or if the file
            contains no embedding lines at all.
    """
    terms = []
    vectors = []
    # Built-in open() with an explicit encoding replaces the legacy
    # codecs.open() and lets the file be read line by line.
    with open(file_name, 'r', encoding='utf-8') as f_in:
        next(f_in, None)  # skip the first (header) line
        for line in f_in:
            parts = line.strip().split(' ', 1)
            if parts == ['']:
                # Blank line, e.g. an extra newline at the end of the file.
                continue
            if len(parts) != 2:
                raise ValueError(
                    "Malformed embedding line (no vector): {!r}".format(line))
            terms.append(parts[0])
            vectors.append(parts[1])
    if not terms:
        raise ValueError("No embeddings found in {!r}".format(file_name))
    wv = np.loadtxt(vectors)
    vocabulary = tuple(terms)
    return wv, vocabulary


# Replace the path based on your own machine.
# NOTE: this rebinds `vocabulary`, which the earlier cells used for the
# bread.txt vocabulary, to the terms of the embedding file.
word_embeddings, vocabulary = load_embeddings('fastText-0.1.0/model_epfldocs.vec')

def find_most_similar(input_term, word_embeddings, vocabulary, num_terms=3):
    """Return the num_terms vocabulary entries closest to input_term.

    word_embeddings[i] is taken as the embedding of vocabulary[i]. Every
    term (including input_term itself) is scored by cosine similarity with
    the embedding of input_term.

    Returns a list of [term, similarity] pairs ordered by decreasing
    similarity, or the string "Term not in the vocabulary" when input_term
    has no embedding.
    """
    embedding_of = {
        term: word_embeddings[idx] for idx, term in enumerate(vocabulary)
    }

    try:
        query_embedding = embedding_of[input_term]
    except KeyError:
        return "Term not in the vocabulary"

    scored = [
        [term, cosine_similarity(query_embedding, vector)]
        for term, vector in embedding_of.items()
    ]
    # Negated score as key: highest similarity first, ties keep their order.
    scored.sort(key=lambda pair: -1 * pair[1])
    return scored[:num_terms]

In [13]:
# Five terms closest to 'la' by cosine similarity; the query term itself is
# part of the result.
find_most_similar('la', word_embeddings, vocabulary, num_terms=5)

[['la', 1.0],
 ['pour', 0.9995999844122386],
 ['les', 0.9994841513525654],
 ['sur', 0.9994440687398943],
 ['faire', 0.9994296343122635]]

In [14]:
# Five terms closest to 'EPFL' by cosine similarity; the query term itself
# is part of the result.
find_most_similar('EPFL', word_embeddings, vocabulary, num_terms=5)

[['EPFL', 1.0],
 ['@EPFL', 0.9998930356734843],
 ['#EPFL', 0.9998703686783511],
 ['@CHUVLausanne', 0.9998271224201326],
 ['Lausanne', 0.9998068187511153]]

In [15]:
# Five terms closest to the hashtag '#robot' by cosine similarity; the query
# term itself is part of the result.
find_most_similar('#robot', word_embeddings, vocabulary, num_terms=5)

[['#robot', 1.0],
 ['#robots', 0.9999717434996156],
 ['#robotics', 0.9999529527060248],
 ['robot', 0.999944298108067],
 ['#Robotics', 0.9998920366384801]]

In [16]:
# Five terms closest to 'this' by cosine similarity; the query term itself
# is part of the result.
find_most_similar('this', word_embeddings, vocabulary, num_terms=5)

[['this', 1.0],
 ['these', 0.9998446875714924],
 ['that', 0.9998405424688396],
 ['there', 0.9998339926559185],
 ['their', 0.9998318984301732]]

### Question 2.c

Left as exercise.

###  Question 2.d

See the attached images: Full_embeddings_1/2. There are many different clusters of meaningful concepts such as people names, countries, cities, diseases, websites, vehicles etc. Multiple different patterns can be found in the visualization. 