<a href="https://colab.research.google.com/github/placid-brain/50.045-IR-Project-2022/blob/main/LSA_with_eval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive (Colab only) so the recipe CSV below can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import nltk
# Fetch the NLTK resources used below: the stop-word corpus and the tokenizer models.
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import copy
# English stop words kept as a set; queries are filtered against it in LSA.search.
sw = set(stopwords.words('english'))
import pandas as pd
import time

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Load only the first 1000 rows of the recipe CSV from the mounted Drive.
# NOTE(review): the relative path assumes the working directory is /content — confirm.
df1 = pd.read_csv('drive/My Drive/IR/Info Retrieval Group Project/recipes_w_search_terms.csv', nrows=1000)

def function(ini_list):
    """Parse a stringified list of ingredient names into a list of clean strings.

    The CSV stores each ingredient list as text such as "['olive oil', 'salt']".
    The outer brackets are removed, the text is split on ", ", and the quote
    characters wrapping each item are stripped.

    The previous implementation reassigned the loop variable and threw the
    result away, so every item kept its quotes. Only the *surrounding* quotes
    are stripped here, so apostrophes inside a name ("baker's yeast") survive.

    Args:
        ini_list: string representation of a list, e.g. "['a', 'b']".

    Returns:
        List of ingredient strings; an empty list for "[]" or an empty string.
    """
    inner = ini_list.strip('][')
    if not inner:
        # "[]" used to produce [''] — a single empty ingredient.
        return []
    return [item.strip("'\"") for item in inner.split(', ')]

# Replace the stringified ingredient lists with parsed Python lists.
df1['ingredients'] = df1['ingredients'].apply(function)

# Working frame: just the columns needed for retrieval and for displaying results.
df2 = df1[['id', 'name', 'ingredients', 'steps']]

# One space-joined ingredient string per recipe; each string is one "document".
ingredients = df2['ingredients'].tolist()
new_lst = [' '.join(recipe_ingredients) for recipe_ingredients in ingredients]

# TF-IDF weighting. fit_transform returns recipes x terms, so the transpose
# is a terms x recipes matrix.
tfvec = TfidfVectorizer()
tdf = tfvec.fit_transform(new_lst).T

# Thin singular value decomposition of the dense terms x recipes matrix:
#   dc -> terms x concepts, cs -> singular values, tc -> concepts x recipes.
dc, cs, tc = np.linalg.svd(tdf.toarray(), full_matrices=False)




In [None]:
class LSA:
    """Latent-semantic-analysis search over the recipe collection.

    Uses module-level objects built earlier in the notebook:
      * ``tfvec`` - the fitted TfidfVectorizer,
      * ``dc``, ``cs``, ``tc`` - the three factors returned by ``np.linalg.svd``
        for the terms x recipes TF-IDF matrix (terms x concepts, singular
        values, concepts x recipes),
      * ``sw`` - the stop-word set, and ``word_tokenize`` from NLTK.

    Args:
        n_concepts: optional number of leading concepts to keep. ``None``
            (the default) uses every concept, which is the previous behaviour.
    """

    def __init__(self, n_concepts=None):
        if n_concepts is not None and n_concepts < 1:
            raise ValueError("n_concepts must be a positive integer or None")
        self.n_concepts = n_concepts

    def _concept_count(self):
        """Return how many leading concepts to use for projection and scoring."""
        available = len(cs)
        if self.n_concepts is None:
            return available
        return min(int(self.n_concepts), available)

    def query_vector_gen(self, q_list):
        """Project query strings into concept space.

        Args:
            q_list: list of query strings (already lower-cased / filtered).

        Returns:
            Array of shape (len(q_list), concepts used) equal to
            ``tfidf(q) @ dc @ diag(cs)^-1`` restricted to the concepts in use.
        """
        k = self._concept_count()
        # Query as a dense TF-IDF row vector over the fitted vocabulary.
        q = tfvec.transform(q_list).toarray()
        # Map term weights onto concepts.
        q_dc = np.matmul(q, np.asarray(dc)[:, :k])
        # Scale by reciprocal singular values. Element-wise scaling equals
        # multiplying by inv(diag(cs)) but does not raise on a zero singular
        # value; such concepts simply get weight 0.
        singular = np.asarray(cs, dtype=float)[:k]
        inv_cs = np.zeros_like(singular)
        usable = singular != 0
        inv_cs[usable] = 1.0 / singular[usable]
        return q_dc * inv_cs

    def search(self, query, top_k=5):
        """Return the indices of the best-matching recipes for ``query``.

        Args:
            query: free-text query; it is lower-cased, tokenized and stripped
                of stop words before being vectorized.
            top_k: maximum number of results to return (default 5).

        Returns:
            List of at most ``top_k`` distinct integer column positions of
            ``tc`` (i.e. recipe positions), highest score first. Ties are
            broken by ascending position.
        """
        query_tokens = word_tokenize(query.lower())
        kept = [w for w in query_tokens if w not in sw]
        query_vec = self.query_vector_gen([" ".join(kept)])

        # Score every recipe: dot product of the query's concept vector with
        # each column of tc. Iterating over len(tc) (the concept count) would
        # skip recipes whenever there are fewer concepts than recipes.
        concept_by_doc = np.asarray(tc)[: query_vec.shape[1], :]
        scores = np.matmul(query_vec, concept_by_doc).ravel()

        # argsort yields each position exactly once, so tied scores can no
        # longer produce repeated ids as the sorted()+index() lookup did.
        ranked = np.argsort(-scores, kind="stable")[:top_k]
        return [int(doc) for doc in ranked]

  

In [None]:

lsa = LSA()
start = time.time()
query = 'red onions extra virgin olive oil balsamic vinegar'
print(query)
# Print the retrieved recipe names as a list numbered from 1.
for rank, doc_idx in enumerate(lsa.search(query), start=1):
  print("{}. {}".format(rank, df2["name"][doc_idx]))
end = time.time()

red onions extra virgin olive oil balsamic vinegar
1. Balsamic Glazed Onions
2. The Best Soft and Chewy Sugar Cookies You Will Ever Make!!
3. Grandmother's Cheesecake
4. Egyptian Red Lentils&rsquo; Fattah
5. Beef Stew With Dried Cherries


In [None]:
# Elapsed seconds for the demo query above; the interval also covers printing the results.
print("Time taken: ", end-start)

Time taken:  0.28398966789245605


## Evaluation

In [None]:
# Manually judged relevant document ids for each evaluation query.
#
# NOTE(review): "red onions extra virgin olive oil balsamic vinegar" used to be
# listed twice. Python keeps only the last value for a repeated key, so the
# earlier [0, 15, 1] entry never took effect and has been removed; the value
# below is the one that was actually in use. Confirm which judgment was intended.
# NOTE(review): every judged id is below 20 while the model indexes 1000
# recipes — confirm these ids refer to the same indexing as lsa.search().
relevance_dict = {
    "red onions extra virgin olive oil balsamic vinegar": [2, 13, 14, 15],

    'butter mushrooms flour port wine': [0, 15, 1],

    'vegetable oil unsweetened applesauce eggs vanilla nuts': [9, 7, 4, 8],

    'olive oil onion garlic cloves': [17, 10, 6, 14],

    'red lentil onion tomatoes salt black pepper': [14, 1, 19]
}

In [None]:
# Run every evaluation query once and keep the retrieved ids, keyed by query text.
output_docs_dict = dict()
for test_query in relevance_dict:
  retrieved_ids = lsa.search(test_query)
  output_docs_dict[test_query] = retrieved_ids

print(output_docs_dict)

{'red onions extra virgin olive oil balsamic vinegar': [980, 821, 961, 994, 4], 'butter mushrooms flour port wine': [982, 104, 894, 268, 297], 'vegetable oil unsweetened applesauce eggs vanilla nuts': [989, 358, 249, 380, 369], 'olive oil onion garlic cloves': [74, 566, 683, 398, 864], 'red lentil onion tomatoes salt black pepper': [994, 725, 838, 521, 946]}


In [None]:
# Relevance scores
rs = []
for query in relevance_dict.keys():
  for rr_list in output_docs_dict.values():
    rr = [0]*len(rr_list)
    for idx in range(len(rr_list)):
      if rr_list[idx] in relevance_dict[query]: rr[idx] = 1
  rs.append(rr)
print((rs))

### MRR

In [None]:
def mean_reciprocal_rank(rs):
  """Mean over queries of 1 / (rank of the first non-zero relevance flag).

  rs: iterable of per-query relevance lists in rank order. A query with no
  non-zero entry contributes 0.
  """
  reciprocal_ranks = []
  for r in rs:
    hit_positions = np.asarray(r).nonzero()[0]
    if hit_positions.size:
      # Positions are 0-based, ranks are 1-based.
      reciprocal_ranks.append(1. / (hit_positions[0] + 1))
    else:
      reciprocal_ranks.append(0.)
  return np.mean(reciprocal_ranks)

# Report MRR over the per-query relevance flags collected in `rs`.
print('Mean reciprocal rank: ', mean_reciprocal_rank(rs))

Mean reciprocal rank:  0.0


### Precision and Recall

In [None]:
def pres(query, rr_list):
    """Precision of a retrieved list for one evaluation query.

    Args:
        query: key into the module-level ``relevance_dict``.
        rr_list: document ids retrieved by the model for ``query``.

    Returns:
        Number of distinct retrieved ids judged relevant, divided by the
        number of retrieved ids; 0.0 when nothing was retrieved. The divisor
        was previously hard-coded to 5, which is only right for exactly five
        results.
    """
    if not rr_list:
        return 0.0
    rr_val = len(set(relevance_dict[query]) & set(rr_list))
    return rr_val / len(rr_list)
    
def rec(query,rr_list):
    """Recall of a retrieved list for one evaluation query.

    Args:
        query: key into the module-level ``relevance_dict``.
        rr_list: document ids retrieved by the model for ``query``.

    Returns:
        Fraction of the distinct judged-relevant ids that appear in
        ``rr_list``; 0.0 when the query has no judged-relevant ids (this used
        to raise ZeroDivisionError).
    """
    relevant = set(relevance_dict[query])
    if not relevant:
        return 0.0
    return len(relevant & set(rr_list)) / len(relevant)

# Per-query precision and recall, in relevance_dict order.
pres_list = list()
rec_list = list()

for query in relevance_dict:
  # Re-run the search for this query and score its results.
  rr_list = lsa.search(query)
  pres_list.append(pres(query, rr_list))
  rec_list.append(rec(query, rr_list))

# Macro-averages over the evaluation queries.
avg_pres = np.mean(pres_list)
avg_rec = np.mean(rec_list)

print("Average precision: ",avg_pres)
print("Average recall: ",avg_rec)

### MAP

In [None]:
import numpy as np

def precision_at_k(r, k):
    """Fraction of the first ``k`` relevance scores that are non-zero.

    Args:
        r: relevance scores in rank order.
        k: cut-off rank, must be >= 1.

    Raises:
        ValueError: if ``r`` holds fewer than ``k`` scores.
    """
    assert k >= 1
    head = np.asarray(r)[:k]
    if head.size != k:
        raise ValueError('Relevance score length < k')
    # Mean of the boolean mask is the share of relevant items in the top k.
    return np.mean(head != 0)

def average_precision(r):
    """Mean of precision@k taken at every rank k that holds a non-zero score.

    Args:
        r: relevance scores in rank order.

    Returns:
        The average of those precisions, or 0. when no score is non-zero.
    """
    hits = np.asarray(r) != 0
    precisions = []
    for pos in range(hits.size):
        if hits[pos]:
            # pos is 0-based; the cut-off rank is pos + 1.
            precisions.append(precision_at_k(hits, pos + 1))
    if precisions:
        return np.mean(precisions)
    return 0.

def mean_average_precision(rs):
    """Mean of ``average_precision`` over an iterable of per-query relevance lists."""
    per_query = []
    for r in rs:
        per_query.append(average_precision(r))
    return np.mean(per_query)

# Report MAP over the per-query relevance flags collected in `rs`.
print('Mean average precision: ', mean_average_precision(rs))

### NDCG

In [None]:
def dcg_at_k(r, k, method=0):
    """Discounted cumulative gain of the first ``k`` relevance scores.

    The first score is taken undiscounted; the score at rank i (i >= 2) is
    divided by log2(i).

    Args:
        r: relevance scores in rank order.
        k: number of leading scores to consider.
        method: accepted for interface compatibility; it is not used by this
            implementation.

    Returns:
        The DCG as a float, or 0. when there are no scores to consider.
    """
    # np.asfarray was removed in NumPy 2.0; asarray with a float dtype is the
    # documented replacement and gives the same array.
    r = np.asarray(r, dtype=float)[:k]
    if r.size:
        return r[0] + np.sum(r[1:] / np.log2(np.arange(2, r.size + 1)))
    return 0.

def ndcg_at_k(r, k, method=0):
    """DCG of ``r`` at ``k`` divided by the DCG of ``r`` sorted in descending order.

    Args:
        r: relevance scores in rank order.
        k: number of leading scores to consider.
        method: forwarded unchanged to ``dcg_at_k``.

    Returns:
        The normalized score, or 0. when the ideal DCG is zero.
    """
    ideal_order = sorted(r, reverse=True)
    ideal_dcg = dcg_at_k(ideal_order, k, method)
    if ideal_dcg:
        return dcg_at_k(r, k, method) / ideal_dcg
    return 0.