In [12]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/alfa/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [13]:
from collections import defaultdict
from array import array
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import math
import numpy as np
import collections
import time 
import json
import re
from numpy import linalg as la

In [14]:
#This function will clean our text from data that is not important so that has no weight 
def clean_text(tweet):
    stemmer = PorterStemmer()
    stop_words = set(stopwords.words("english"))
    
    tweet = tweet.lower() # Transform in lowercase

    tweet = re.sub(r'@[a-zA-Z]+', '', tweet) # Here we remove the mentions in the tweet ex: @canodep
    tweet = re.sub(r"\B#([a-z0-9]{2,})(?![~!@#$%^&*()=+_`\-\|\/'\[\]\{\}]|[?.,]*\w)", '', tweet) # Here we remove the hashtags, because we will treat it later
    tweet = re.sub(r'[^\w\s]', '', tweet) # Here we remove punctuation marks
    tweet = re.sub(r'http\S+', '',tweet) # Remove http and https
    tweet = tweet.split() # Tokenize the text to get a list of terms

    tweet = [word for word in tweet if word not in stop_words]  # eliminate the stopwords
    tweet = [stemmer.stem(word) for word in tweet] # Perform stemming 
    return tweet
    


In [15]:
docs_path = 'data/tw_hurricane_data.json'
tweets_title = 'data/tweet_document_ids_map.csv'

tweets_id_title = {}

with open(tweets_title) as fp:
    lines = fp.readlines()


for l in lines:
    l = l.strip().split("\t")
    tweets_id_title[int(l[1])] =  l[0]


tweets = []
lines = []

for line in open(docs_path, 'r'):
    lines.append(line)
    #media = json.loads(line).get('entities').get('media')
    tweets.append({
        'id' : int(json.loads(line).get('id')),
        'title' : tweets_id_title[int(json.loads(line).get('id'))],
        'text': clean_text(json.loads(line).get('full_text')),
        'username' : json.loads(line).get('user').get('screen_name'),
        'date' : json.loads(line).get('created_at'),
        'hashtag' : list(map(lambda hashtag:  hashtag.get('text'),  json.loads(line).get('entities').get('hashtags'))),
        'like' : json.loads(line).get('favorite_count'),
        'rt' : json.loads(line).get('retweet_count'),
        'URL' : 'https://twitter.com/' + json.loads(line).get('user').get('screen_name') + "/status/" + str(json.loads(line).get('id'))
    }) 


In [16]:
def create_index(tweets):
    """
    Implement the inverted index
    
    Argument:
    lines -- collection of Wikipedia articles
    
    Returns:
    index - the inverted index (implemented through a Python dictionary) containing terms as keys and the corresponding
    list of documents where these keys appears in (and the positions) as values.
    """
    index = defaultdict(list)    

    for i in range(len(tweets)):
        terms = tweets[i].get("text")
        tweet_id = tweets[i].get('id')

        tweet_title = tweets_id_title[tweet_id]

        current_tweet_index = {}

        for position, term in enumerate(terms):
            try:
                # if the term is already in the index for the current page (current_page_index)
                # append the position to the corresponding list

                ## START CODE
                current_tweet_index[term][1].append(position)  
            except:
                # Add the new term as dict key and initialize the array of positions and add the position
                current_tweet_index[term] = [tweet_title, array('I', [position])] #'I' indicates unsigned int (int in Python)

        #merge the current page index with the main index
        for term, posting_page in current_tweet_index.items():
            index[term].append(posting_page)
        ## END CODE                    

    return index

In [17]:
index = create_index(tweets)

print("Index results for the term 'researcher': {}\n".format(index['researcher']))
print("First 10 Index results for the term 'research': \n{}".format(index['research'][:10]))

Index results for the term 'researcher': []

First 10 Index results for the term 'research': 
[['doc_1', array('I', [0, 1])], ['doc_220', array('I', [8])], ['doc_405', array('I', [8])], ['doc_1354', array('I', [1])], ['doc_1612', array('I', [2])], ['doc_2026', array('I', [6])], ['doc_2600', array('I', [9])], ['doc_2748', array('I', [11])], ['doc_3132', array('I', [2])], ['doc_3307', array('I', [5])]]


In [18]:
def search(query, index):
    """
    The output is the list of documents that contain any of the query terms. 
    So, we will get the list of documents for each query term, and take the union of them.
    """
    query = clean_text(query)
    docs = set()
    for term in query:
        try:
            # store in term_docs the ids of the docs that contain "term"
            term_docs = [posting[0] for posting in index[term]]
            # docs = docs Union term_docs
            docs |= set(term_docs)
        except:
            #term is not in index
            pass
    docs = list(docs)
    return docs

In [19]:
#print("Insert your query (i.e.: Computer Science):\n")
#query = input()
#docs = search(query, index)
#top = 10

#print("\n======================\nSample of {} results out of {} for the searched query:\n".format(top, len(docs)))

#for d_id in docs[:top]:
#    print("page_id= {} - page_title: {}".format(list(tweets_id_title.keys())[list(tweets_id_title.values()).index(d_id)], d_id))



query = "My house floods"
docs = search(query, index)
top = 10

print("\n======================\nSample of {} results out of {} for the searched query:\n".format(top, len(docs)))

for d_id in docs[:top]:
    print("page_id= {} - page_title: {}".format(list(tweets_id_title.keys())[list(tweets_id_title.values()).index(d_id)], d_id))



query = "I am scared to death, a hurricane is coming to my city"
docs = search(query, index)
top = 10

print("\n======================\nSample of {} results out of {} for the searched query:\n".format(top, len(docs)))

for d_id in docs[:top]:
    print("page_id= {} - page_title: {}".format(list(tweets_id_title.keys())[list(tweets_id_title.values()).index(d_id)], d_id))

query = "Landfall in South Carolina"
docs = search(query, index)
top = 10

print("\n======================\nSample of {} results out of {} for the searched query:\n".format(top, len(docs)))

for d_id in docs[:top]:
    print("page_id= {} - page_title: {}".format(list(tweets_id_title.keys())[list(tweets_id_title.values()).index(d_id)], d_id))

query = "Help and recovery during the hurricane disaster"
docs = search(query, index)
top = 10

print("\n======================\nSample of {} results out of {} for the searched query:\n".format(top, len(docs)))

for d_id in docs[:top]:
    print("page_id= {} - page_title: {}".format(list(tweets_id_title.keys())[list(tweets_id_title.values()).index(d_id)], d_id))


query = "Floodings in South Carolina"
docs = search(query, index)
top = 10

print("\n======================\nSample of {} results out of {} for the searched query:\n".format(top, len(docs)))

for d_id in docs[:top]:
    print("page_id= {} - page_title: {}".format(list(tweets_id_title.keys())[list(tweets_id_title.values()).index(d_id)], d_id))
    


Sample of 10 results out of 342 for the searched query:

page_id= 1575877018607964160 - page_title: doc_2397
page_id= 1575862653888729089 - page_title: doc_3544
page_id= 1575865666522914816 - page_title: doc_3288
page_id= 1575861762959630336 - page_title: doc_3606
page_id= 1575876590994485253 - page_title: doc_2433
page_id= 1575875350973730816 - page_title: doc_2535
page_id= 1575868580398342144 - page_title: doc_3071
page_id= 1575888829482795025 - page_title: doc_1827
page_id= 1575875563247792128 - page_title: doc_2517
page_id= 1575874815432744960 - page_title: doc_2573

Sample of 10 results out of 819 for the searched query:

page_id= 1575877018607964160 - page_title: doc_2397
page_id= 1575864531007725568 - page_title: doc_3384
page_id= 1575865977182437376 - page_title: doc_3266
page_id= 1575901764091772929 - page_title: doc_1286
page_id= 1575917821560864782 - page_title: doc_25
page_id= 1575859337167249411 - page_title: doc_3761
page_id= 1575917131564097536 - page_title: doc_73
page

In [20]:
#print("Insert your query (i.e.: Computer Science):\n")
#query = input()
#docs = search(query, index)
#top = 10

#print("\n======================\nSample of {} results out of {} for the searched query:\n".format(top, len(docs)))

#for d_id in docs[:top]:
#    print("page_id= {} - page_title: {}".format(list(tweets_id_title.keys())[list(tweets_id_title.values()).index(d_id)], d_id))



query = "My house floods"
docs = search(query, index)
top = 10

print("\n======================\nSample of {} results out of {} for the searched query:\n".format(top, len(docs)))

for d_id in docs[:top]:
    print("page_id= {} - page_title: {}".format(list(tweets_id_title.keys())[list(tweets_id_title.values()).index(d_id)], d_id))



query = "I am scared to death, a hurricane is coming to my city"
docs = search(query, index)
top = 10

print("\n======================\nSample of {} results out of {} for the searched query:\n".format(top, len(docs)))

for d_id in docs[:top]:
    print("page_id= {} - page_title: {}".format(list(tweets_id_title.keys())[list(tweets_id_title.values()).index(d_id)], d_id))

query = "Landfall in South Carolina"
docs = search(query, index)
top = 10

print("\n======================\nSample of {} results out of {} for the searched query:\n".format(top, len(docs)))

for d_id in docs[:top]:
    print("page_id= {} - page_title: {}".format(list(tweets_id_title.keys())[list(tweets_id_title.values()).index(d_id)], d_id))

query = "Help and recovery during the hurricane disaster"
docs = search(query, index)
top = 10

print("\n======================\nSample of {} results out of {} for the searched query:\n".format(top, len(docs)))

for d_id in docs[:top]:
    print("page_id= {} - page_title: {}".format(list(tweets_id_title.keys())[list(tweets_id_title.values()).index(d_id)], d_id))


query = "Floodings in South Carolina"
docs = search(query, index)
top = 10

print("\n======================\nSample of {} results out of {} for the searched query:\n".format(top, len(docs)))

for d_id in docs[:top]:
    print("page_id= {} - page_title: {}".format(list(tweets_id_title.keys())[list(tweets_id_title.values()).index(d_id)], d_id))
    


Sample of 10 results out of 342 for the searched query:

page_id= 1575877018607964160 - page_title: doc_2397
page_id= 1575862653888729089 - page_title: doc_3544
page_id= 1575865666522914816 - page_title: doc_3288
page_id= 1575861762959630336 - page_title: doc_3606
page_id= 1575876590994485253 - page_title: doc_2433
page_id= 1575875350973730816 - page_title: doc_2535
page_id= 1575868580398342144 - page_title: doc_3071
page_id= 1575888829482795025 - page_title: doc_1827
page_id= 1575875563247792128 - page_title: doc_2517
page_id= 1575874815432744960 - page_title: doc_2573

Sample of 10 results out of 819 for the searched query:

page_id= 1575877018607964160 - page_title: doc_2397
page_id= 1575864531007725568 - page_title: doc_3384
page_id= 1575865977182437376 - page_title: doc_3266
page_id= 1575901764091772929 - page_title: doc_1286
page_id= 1575917821560864782 - page_title: doc_25
page_id= 1575859337167249411 - page_title: doc_3761
page_id= 1575917131564097536 - page_title: doc_73
page

In [21]:
def create_index_tfidf(tweets, num_docs):
    index = defaultdict(list)
    tf = defaultdict(list)  # term frequencies of terms in documents (documents in the same order as in the main index)
    df = defaultdict(int)  # document frequencies of terms in the corpus
    title_index = defaultdict(str)
    idf = defaultdict(float)

    for tweet in tweets:
        tweet_id = tweet['id']
        terms = tweet['text']
        title = tweet['title']
        title_index[tweet_id] = title

        current_page_index = {}

        for position, term in enumerate(terms):
            try:
                current_page_index[term][1].append(position)
            except:
                current_page_index[term] = [tweet_id, array('I', [position])]
        norm = 0
        for term, posting in current_page_index.items():
            norm += len(posting[1]) ** 2
        norm = math.sqrt(norm)

        for term, posting in current_page_index.items():
            tf[term].append((tweet_id, np.round(len(posting[1])) / norm, 4))
            df[term] = 1

        for term, posting in current_page_index.items():
            index[term].append(posting)

        for term in df:
            idf[term] = np.round(np.log(float(num_docs / df[term])), 4)

    return index, tf, df, idf, title_index


In [22]:
start_time = time.time()
num_documents = len(tweets)
index, tf, df, idf, title_index = create_index_tfidf(tweets, num_documents)
print("Total time to create the index: {} seconds" .format(np.round(time.time() - start_time, 2)))

Total time to create the index: 97.5 seconds


In [23]:
def search_tf_idf(query, index):
    terms = clean_text(query)
    docs = set()
    for term in terms:
        try:
            term_docs= [posting[0] for posting in index[term]]
            docs |= set(term_docs)
        except:
            pass
    docs = list(docs)
    return rank_documents(terms, docs, index, idf, tf, title_index)

In [24]:
def rank_documents(terms, docs, index, idf, tf, title_index):
    doc_vectors = defaultdict(lambda: [0] * len(terms))
    query_vector = [0] * len(terms)

    query_terms_count = collections.Counter(terms)

    query_norm = la.norm(list(query_terms_count.values()))

    for termIndex, term in enumerate(terms):
        if term not in index:
            continue

        query_vector[termIndex] = query_terms_count[term] / query_norm * idf[term]

        for doc_index, (doc, postings) in enumerate(index[term]):
            if doc in docs:
                doc_vectors[doc][termIndex] = idf[term] * tf[term][doc_index][1]

    doc_scores = [[np.dot(curDocVec, query_vector), doc] for doc, curDocVec in doc_vectors.items()]

    doc_scores.sort(reverse=True)
    result_docs = [doc for score, doc in doc_scores]

    if len(result_docs) == 0:
        print("No documents found for the given query!")
        query = input()
        result_docs = search_tf_idf(query, index)

    return doc_scores


In [25]:
print("Insert your query (i.e.: Computer Science):\n")
query = input()
ranked_docs = search_tf_idf(query, index)
top = 10

print("\n======================\nTop {} results out of {} for the searched query:\n".format(top, len(ranked_docs)))
for d_id in ranked_docs[:top]:
        print("tweet_id= {} - page_title: {}, doc_score: {}".format(d_id[1], tweets_id_title[d_id[1]], d_id[0]))


Insert your query (i.e.: Computer Science):


Top 10 results out of 385 for the searched query:

tweet_id= 1575897763736330242 - page_title: doc_1458, doc_score: 52.00068178162995
tweet_id= 1575912174131630080 - page_title: doc_477, doc_score: 48.642183776379206
tweet_id= 1575863229548752897 - page_title: doc_3495, doc_score: 48.642183776379206
tweet_id= 1575900431573651456 - page_title: doc_1374, doc_score: 45.86029066666667
tweet_id= 1575892486484017155 - page_title: doc_1679, doc_score: 45.86029066666667
tweet_id= 1575869807953592324 - page_title: doc_2964, doc_score: 41.48219370490119
tweet_id= 1575901596680359938 - page_title: doc_1301, doc_score: 39.71617674227173
tweet_id= 1575884103056080898 - page_title: doc_2027, doc_score: 39.71617674227173
tweet_id= 1575856919931105280 - page_title: doc_3952, doc_score: 39.71617674227173
tweet_id= 1575862439358631936 - page_title: doc_3560, doc_score: 38.15806834761907


# Evaluation with Rank-Based Metrics


In [26]:
import numpy as np
import pandas as pd

In [27]:
search_results = pd.read_csv("data/evaluation_gt.csv")
search_results.head()

Unnamed: 0,doc,query_id,label
0,doc_12,1,1
1,doc_9,1,1
2,doc_18,1,1
3,doc_45,1,1
4,doc_501,1,1


In [28]:
print_result = search_results["label"].unique()
print("The ground truth of our dataset is composed of {} Relevance Levels: {}".format(len(print_result), sorted(print_result)))

The ground truth of our dataset is composed of 2 Relevance Levels: [0, 1]


In [29]:
search_results["is_relevant"] = search_results["label"].apply(lambda y: 1 if y >= 1 else 0)
search_results.head()

doc_scores = search_tf_idf("Landfall in South Carolina", index)
result_docs = []
for score, doc in doc_scores:
    if(doc in tweets_id_title.keys()):
        result_docs.append([tweets_id_title[doc], score])

results_df = pd.DataFrame(result_docs, columns=["doc_title", "predicted_relevance"])
search_results = pd.merge(search_results, results_df ,left_on='doc',right_on='doc_title', how='inner')


In [30]:
def precision_at_k(doc_score, y_score, k=10): #binary relevance, predicted relevance, k for a given query
    """
    Parameters
    ----------
    doc_score: Ground truth (true relevance labels).
    y_score: Predicted scores.
    k : number of doc to consider.

    Returns
    -------
    precision @k : float

    """

    order = np.argsort(y_score)[::-1] #we get the ranking of the documents accoirding to the predicted score/ use np.argsort and [::1] to obtain the list of indexes of the predicted score sorted in descending order.
    doc_score = np.take(doc_score, order[:k]) # align the binary relevance to the corresponding document / use the indexes of point 1 to sort the actual relevance label of the documents (hint: np.take).
    relevant = sum(doc_score == 1) #get number of relevant documents

    return float(relevant) / k #calculae precision at k, which is the number of relevant documents trieved at k

In [31]:
#P@5= 3/5
current_query = 1
current_query_res = search_results[search_results["query_id"] == current_query]
k = 5
print("==> Precision@{}: {}\n".format(k, precision_at_k(current_query_res["is_relevant"], current_query_res["predicted_relevance"], k)))
print("\nCheck on the dataset sorted by score:\n")
#current_query_res.sort_values("doc_score", ascending=False).head(k)
current_query_res.sort_values("predicted_relevance", ascending=False).head(k)

==> Precision@5: 1.0


Check on the dataset sorted by score:



  doc_score = np.take(doc_score, order[:k]) # align the binary relevance to the corresponding document / use the indexes of point 1 to sort the actual relevance label of the documents (hint: np.take).


Unnamed: 0,doc,query_id,label,is_relevant,doc_title,predicted_relevance
6,doc_82,1,1,1,doc_82,39.716177
4,doc_501,1,1,1,doc_501,34.395218
9,doc_165,1,1,1,doc_165,28.083578
7,doc_100,1,1,1,doc_100,22.125327
2,doc_18,1,1,1,doc_18,21.753446


In [32]:
k = 3
print("==> Precision@{}: {}\n".format(k, precision_at_k(current_query_res["is_relevant"], current_query_res["predicted_relevance"], k)))

k = 10
print("==> Precision@{}: {}\n".format(k, precision_at_k(current_query_res["is_relevant"], current_query_res["predicted_relevance"], k)))



==> Precision@3: 1.0

==> Precision@10: 1.0



  doc_score = np.take(doc_score, order[:k]) # align the binary relevance to the corresponding document / use the indexes of point 1 to sort the actual relevance label of the documents (hint: np.take).


#### Average Precision@K - AP@K


In [33]:
def avg_precision_at_k(doc_score, y_score, k=10): #binary relevance, predicted relevance, k for a given query
    """
    Parameters
    ----------
    doc_score: Ground truth (true relevance labels).
    y_score: Predicted scores.
    k : number of doc to consider.

    Returns
    -------
    average precision @k : float
    """
    gtp = np.sum(doc_score == 1) #Total number of gt positives
    order = np.argsort(y_score)[::-1] #same as for precision
    doc_score = np.take(doc_score, order[:k]) #same as for precision
    ## if all documents are not relevant
    if gtp == 0:
        return 0
    n_relevant_at_i = 0
    prec_at_i = 0
    for i in range(len(doc_score)):
        if doc_score[i] == 1: #only add the P@k when the doc is relevant
            n_relevant_at_i += 1
            prec_at_i += n_relevant_at_i / (i + 1) #calculate P@K (#docs relevant at k/k)
    return prec_at_i / gtp #return ap

In [34]:
avg_precision_at_k(np.array(current_query_res["is_relevant"]), np.array(current_query_res["predicted_relevance"]), 10)

1.0

In [35]:
# Check with 'average_precision_score' of 'sklearn' library

from sklearn.metrics import average_precision_score

k = 150
temp = current_query_res.sort_values("predicted_relevance", ascending=False).head(k)
average_precision_score(np.array(temp["is_relevant"]), np.array(temp["predicted_relevance"][:k]))

  average_precision_score(np.array(temp["is_relevant"]), np.array(temp["predicted_relevance"][:k]))


1.0

In [36]:
doc_score = np.array([1, 1, 0, 1, 0, 0, 1])
y_scores = np.array([7, 6, 5, 4, 3, 2, 1])
assert (average_precision_score(doc_score, y_scores) == avg_precision_at_k(doc_score, y_scores, 10))


In [37]:
avg_precision_at_k(np.array(current_query_res["is_relevant"]), np.array(current_query_res["predicted_relevance"]), 10)

1.0

In [38]:
current_query_res.sort_values("predicted_relevance", ascending=False).head(10)


Unnamed: 0,doc,query_id,label,is_relevant,doc_title,predicted_relevance
6,doc_82,1,1,1,doc_82,39.716177
4,doc_501,1,1,1,doc_501,34.395218
9,doc_165,1,1,1,doc_165,28.083578
7,doc_100,1,1,1,doc_100,22.125327
2,doc_18,1,1,1,doc_18,21.753446
0,doc_12,1,1,1,doc_12,20.139777
1,doc_9,1,1,1,doc_9,16.214061
5,doc_52,1,1,1,doc_52,11.465073
3,doc_45,1,1,1,doc_45,11.015286
8,doc_122,1,1,1,doc_122,7.375109


In [39]:
np.sum(current_query_res["is_relevant"])

10

In [40]:
(1 + (2 / 2) + (3 / 5) + (4 / 7) + (5 / 8) + (6 / 9)) / np.sum(current_query_res["is_relevant"])

0.4463095238095239

#### Mean Average Precision (mAP)

In [41]:
def map_at_k(search_res, k=10): #receives all the search esults dataframe containing all the queries and the results and relevances
    """
    Parameters
    ----------
    search_res: search results dataset containing:
        query_id: query id.
        doc_id: document id.
        predicted_relevance: relevance predicted through LightGBM.
        doc_score: actual score of the document for the query (ground truth).

    Returns
    -------
    mean average precision @ k : float
    """
    avp = []
    for q in search_res["query_id"].unique():  # loop over all query ids
        curr_data = search_res[search_res["query_id"] == q]  # select data for current query (get a slice of the dataframe keeping only the data related to the current query)
        avp.append(avg_precision_at_k(np.array(curr_data["is_relevant"]), 
                   np.array(curr_data["predicted_relevance"]), k))  #append average precision for current query
    return np.sum(avp) / len(avp), avp  # return mean average precision

In [42]:
map_k, avp = map_at_k(search_results, 10)
map_k

1.0

#### Mean Reciprocal Rank (MRR)

In [43]:
def rr_at_k(doc_score, y_score, k=10):
    """
    Parameters
    ----------
    doc_score: Ground truth (true relevance labels).
    y_score: Predicted scores.
    k : number of doc to consider.

    Returns
    -------
    Reciprocal Rank for qurrent query
    """

    order = np.argsort(y_score)[::-1]  # get the list of indexes of the predicted score sorted in descending order. As before
    doc_score = np.take(doc_score, order[
                             :k])  # sort the actual relevance label of the documents based on predicted score(hint: np.take) and take first k. As before
    if np.sum(doc_score) == 0:  # if there are not relevant doument return 0
        return 0
    return 1 / (np.argmax(doc_score == 1) + 1)  # hint: to get the position of the first relevant document use "np.argmax" (+1 because the idex starts from 0)


In [44]:
doc_score = np.array([0, 1, 0, 1, 1])
score = np.array([0.9, 0.5, 0.6, 0.7, 0.2])
rr_at_k(doc_score, score, 5)

0.5

##### Test

In [45]:
current_query = 8
current_query_res = search_results[search_results["query_id"] == current_query]
current_query_res.sort_values("predicted_relevance", ascending=False).head(10)

Unnamed: 0,doc,query_id,label,is_relevant,doc_title,predicted_relevance


In [46]:
labels = np.array(search_results[search_results['query_id'] == 8]["is_relevant"])
scores = np.array(search_results[search_results['query_id'] == 8]["predicted_relevance"])
np.round(rr_at_k(labels, scores, 10), 4)


0

In [47]:
mrr = {}
for k in [3, 5, 10]:
    RRs = []
    for q in search_results['query_id'].unique():  # loop over all query ids, get rrs for each query at each k
        labels = np.array(search_results[search_results['query_id'] == q]["is_relevant"])  # get labels for current query
        scores = np.array(search_results[search_results['query_id'] == q]["predicted_relevance"])  # get predicted score for current query
        RRs.append(rr_at_k(labels, scores, k))  # append RR for current query
    mrr[k] = np.round(float(sum(RRs) / len(RRs)), 4)  # Mean RR at current k

In [48]:
mrr

{3: 1.0, 5: 1.0, 10: 1.0}

#### NDCG - Normalized Discounted Cumulative Gain

In [49]:
def dcg_at_k(doc_score, y_score, k=10): #doc_scire are the labels (ground truth) and y_score are the system scores
    order = np.argsort(y_score)[::-1]  # get the list of indexes of the predicted score sorted in descending order.
    doc_score = np.take(doc_score, order[:k])  # sort the actual relevance label of the documents based on predicted score(hint: np.take) and take first k.
    gain = 2 ** doc_score - 1  # First we calculate the upper part of the formula which is the CG (use formula 7 above) (notice it is based on the ground truth relevance)
    discounts = np.log2(np.arange(len(doc_score)) + 2)  # Compute denominator (np.arrange creates a list of numbers betweeen 0 and len(doc_score)-1), then the + 2 addresses the fact that the numbers start from 0
    return np.sum(gain / discounts)  #return dcg@k


def ndcg_at_k(doc_score, y_score, k=10):
    dcg_max = dcg_at_k(doc_score, doc_score, k) #ideal dcg
    #print(dcg_max)
    if not dcg_max:
        return 0
    return np.round(dcg_at_k(doc_score, y_score, k) / dcg_max, 4)

In [52]:
query_id = 0
k = 10
labels = np.array(search_results[search_results['query_id'] == query_id]["label"])
scores = np.array(search_results[search_results['query_id'] == query_id]["predicted_relevance"])
ndcg_k = np.round(ndcg_at_k(labels, scores, k), 4)
print("ndcg@{} for query with query_id={}: {}".format(k, query_id, ndcg_k))


ndcg@10 for query with query_id=0: 0


In [54]:
ndcgs = []
k = 10
for q in search_results['query_id'].unique(): # loop over all query ids
    labels = np.array(search_results[search_results['query_id'] == q]["label"]) ## get labels for current query
    scores = np.array(search_results[search_results['query_id'] == q]["predicted_relevance"]) # get predicted score for current query
    ndcgs.append(np.round(ndcg_at_k(labels, scores, k), 4)) # append NDCG for current query (round is just about decimals)

avg_ndcg = np.round(float(sum(ndcgs) / len(ndcgs)), 4)
print("Average ndcg@{}: {}".format(k, avg_ndcg))


Average ndcg@10: 1.0
