In [10]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models.doc2vec import Doc2Vec
from tqdm import tqdm
from bs4 import BeautifulSoup as bs
import sys
import os
import numpy as np
import json
import random
import re
import pytrec_eval

In [11]:
def get_query_map(dataset,cluster,rankings_dir="Rankings"):
    """Load one ranking file into a ``{query_id: {doc_id: score}}`` mapping.

    The file name is derived from the two selectors:

    * ``dataset == "ap"`` picks the ``*_Bert_AP.txt`` file; any other value
      picks the ``*_Bert_ZF.txt`` file.
    * ``cluster == "kmeans"`` picks the ``K-Means_*`` file; any other value
      picks the ``Clarans_*`` file.

    Only lines made of exactly three whitespace-separated fields
    (query number, document id, score) are used; all other lines are skipped.
    Query numbers are normalised through ``int`` so that ``"051"`` and
    ``"51"`` end up under the same key ``"51"``. When a document is listed
    more than once for a query, the last score read wins.

    Args:
        dataset: Dataset selector (see above).
        cluster: Clustering selector (see above).
        rankings_dir: Directory that holds the ranking files. Defaults to
            ``"Rankings"``, relative to the current working directory.

    Returns:
        dict mapping query id (str) to a dict of document id (str) -> score
        (float).

    Raises:
        OSError: if the selected file cannot be opened.
        ValueError: if a three-field line has a non-integer query number or a
            non-numeric score.
    """
    prefix = "K-Means" if cluster == "kmeans" else "Clarans"
    suffix = "AP" if dataset == "ap" else "ZF"
    filename = prefix + "_Bert_" + suffix + ".txt"

    query_map = {}
    with open(os.path.join(rankings_dir, filename), "r") as handle:
        # Iterate the file directly instead of materialising it several times.
        for line in handle:
            fields = line.split()
            if len(fields) != 3:
                continue
            qno, doc, score = fields
            key = str(int(qno))
            # Convert before touching the mapping so a bad score never leaves
            # an empty entry behind.
            value = float(score)
            query_map.setdefault(key, {})[doc] = value

    return query_map

In [12]:
# Ranking for the AP collection produced with the k-means clustering.
run = get_query_map(dataset="ap", cluster="kmeans")

In [13]:
def loadScores(path="trec12-news.tsv"):
    """Read relevance judgements into a ``{query_id: {doc_id: 1}}`` mapping.

    Every line is split on each single space or tab character. Field 0 is
    taken as the query id and field 2 as the document id. Blank lines and
    lines with fewer than three fields are skipped instead of raising
    ``IndexError``.

    NOTE(review): every listed (query, document) pair is stored with
    relevance 1; no other column of the file is read. Confirm the file only
    lists relevant documents.

    Args:
        path: Judgement file to read. Defaults to ``"trec12-news.tsv"`` in the
            current working directory.

    Returns:
        dict mapping query id (str) to a dict of document id (str) -> 1.

    Raises:
        OSError: if the file cannot be opened.
    """
    true_scores = {}
    with open(path) as f:
        for line in f.read().splitlines():
            if not line.strip():
                # Blank or whitespace-only line: nothing to record.
                continue
            words = re.split(" |\t", line)
            if len(words) < 3:
                # No document-id column on this line.
                continue
            true_scores.setdefault(words[0], {})[words[2]] = 1
    return true_scores

In [14]:
# Relevance judgements as a {query_id: {doc_id: 1}} mapping.
qrels = loadScores()

In [15]:
def getQrels(qrels, filename="qrels.txt"):
    """Write relevance judgements to a text file, one pair per line.

    Each line has the form ``"<qno> 0 <doc> <rel>"``, where ``<rel>`` is the
    label stored in ``qrels[qno][doc]`` converted with ``int``. Previously the
    last column was always the literal ``1`` regardless of the stored label.
    Despite its name, this function returns nothing; it only writes the file.

    Args:
        qrels: Mapping of query id -> {document id -> relevance label}.
        filename: Output path. Defaults to ``"qrels.txt"`` in the current
            working directory. An existing file is overwritten.

    Raises:
        OSError: if the file cannot be opened for writing.
        TypeError, ValueError: if a relevance label cannot be converted with
            ``int``.
    """
    with open(filename, "w") as o:
        for qno, judged in qrels.items():
            for doc, rel in judged.items():
                o.write("{} 0 {} {}\n".format(qno, doc, int(rel)))

In [16]:
# Write the loaded judgements to qrels.txt in the working directory.
getQrels(qrels)

In [17]:
def get_trec_eval(query_map,name):
    """Write a ranking to ``<name>.txt``, one scored document per line.

    Each line has the form ``"<qno> Q0 <doc> <rank> <score> runidl"``.
    Within a query, documents are written in descending score order and
    ``<rank>`` counts up from 1. Documents with equal scores keep the order
    they have in ``query_map``. Previously every line carried rank ``1``.
    Despite its name, this function evaluates nothing and returns nothing;
    it only writes the file.

    NOTE(review): the trailing run tag is the literal ``runidl``; it may have
    been meant as ``runid1``. Left untouched.

    Args:
        query_map: Mapping of query id -> {document id -> score}. Query ids
            must be convertible with ``int``.
        name: Output path without the ``.txt`` extension. An existing file is
            overwritten.

    Raises:
        OSError: if the file cannot be opened for writing.
        ValueError: if a query id is not an integer literal.
    """
    filename = name +".txt"
    with open(filename,"w") as o:
        for qno, scored in query_map.items():
            # sorted() is stable, so tied scores keep their insertion order.
            ranked = sorted(scored.items(), key=lambda pair: pair[1], reverse=True)
            for rank, (doc, score) in enumerate(ranked, start=1):
                o.write("{} Q0 {} {} {} runidl\n".format(int(qno), doc, rank, score))

In [18]:
# Export every (dataset, clustering) ranking to its own text file.
# The order matches the original four calls.
for _dataset, _cluster, _run_name in (
    ("ap", "kmeans", "Trec_AP_Bert_Kmeans"),
    ("ap", "clarans", "Trec_AP_Bert_Clarans"),
    ("zf", "kmeans", "Trec_ZF_Bert_Kmeans"),
    ("zf", "clarans", "Trec_ZF_Bert_Clarans"),
):
    get_trec_eval(get_query_map(_dataset, _cluster), _run_name)

In [19]:
# One shared evaluator built from the loaded qrels, asked for every measure
# listed in pytrec_eval.supported_measures.
evaluator = pytrec_eval.RelevanceEvaluator(
    qrels, pytrec_eval.supported_measures)

In [20]:
def get_all_rankings(dataset,cluster):
    """Evaluate one stored ranking and return the result as JSON text.

    The ranking is loaded with ``get_query_map`` and scored with the
    module-level ``evaluator``. Any ``dataset`` other than ``"ap"`` is treated
    as ``"zf"``, and any ``cluster`` other than ``"kmeans"`` is treated as
    ``"clarans"``.

    Args:
        dataset: Dataset selector.
        cluster: Clustering selector.

    Returns:
        str: ``evaluator.evaluate(...)`` serialised with
        ``json.dumps(..., indent=1)``.
    """
    # Reduce both selectors to the literal values handed to the loader.
    dataset_key = "ap" if dataset == "ap" else "zf"
    cluster_key = "kmeans" if cluster == "kmeans" else "clarans"

    metrics = evaluator.evaluate(get_query_map(dataset_key, cluster_key))
    return json.dumps(metrics, indent=1)

In [21]:
# print() renders the JSON text with real line breaks instead of an escaped repr.
# NOTE(review): the recorded output shows num_rel_ret 0 for query 51 -
# confirm the run's document ids match those in the qrels.
print(get_all_rankings(dataset="ap", cluster="kmeans"))

{
 "51": {
  "runid": 0.0,
  "num_q": 1.0,
  "num_ret": 99.0,
  "num_rel": 137.0,
  "num_rel_ret": 0.0,
  "map": 0.0,
  "gm_map": -11.512925464970229,
  "Rprec": 0.0,
  "bpref": 0.0,
  "recip_rank": 0.0,
  "iprec_at_recall_0.00": 0.0,
  "iprec_at_recall_0.10": 0.0,
  "iprec_at_recall_0.20": 0.0,
  "iprec_at_recall_0.30": 0.0,
  "iprec_at_recall_0.40": 0.0,
  "iprec_at_recall_0.50": 0.0,
  "iprec_at_recall_0.60": 0.0,
  "iprec_at_recall_0.70": 0.0,
  "iprec_at_recall_0.80": 0.0,
  "iprec_at_recall_0.90": 0.0,
  "iprec_at_recall_1.00": 0.0,
  "P_5": 0.0,
  "P_10": 0.0,
  "P_15": 0.0,
  "P_20": 0.0,
  "P_30": 0.0,
  "P_100": 0.0,
  "P_200": 0.0,
  "P_500": 0.0,
  "P_1000": 0.0,
  "relstring": 0.0,
  "recall_5": 0.0,
  "recall_10": 0.0,
  "recall_15": 0.0,
  "recall_20": 0.0,
  "recall_30": 0.0,
  "recall_100": 0.0,
  "recall_200": 0.0,
  "recall_500": 0.0,
  "recall_1000": 0.0,
  "infAP": 0.0,
  "gm_bpref": -11.512925464970229,
  "Rprec_mult_0.20": 0.0,
  "Rprec_mult_0.40": 0.0,
  "Rprec_

In [175]:
# Per-query metrics for the AP collection with the CLARANS clustering.
get_all_rankings(dataset="ap", cluster="clarans")

'{\n "51": {\n  "runid": 0.0,\n  "num_q": 1.0,\n  "num_ret": 99.0,\n  "num_rel": 137.0,\n  "num_rel_ret": 0.0,\n  "map": 0.0,\n  "gm_map": -11.512925464970229,\n  "Rprec": 0.0,\n  "bpref": 0.0,\n  "recip_rank": 0.0,\n  "iprec_at_recall_0.00": 0.0,\n  "iprec_at_recall_0.10": 0.0,\n  "iprec_at_recall_0.20": 0.0,\n  "iprec_at_recall_0.30": 0.0,\n  "iprec_at_recall_0.40": 0.0,\n  "iprec_at_recall_0.50": 0.0,\n  "iprec_at_recall_0.60": 0.0,\n  "iprec_at_recall_0.70": 0.0,\n  "iprec_at_recall_0.80": 0.0,\n  "iprec_at_recall_0.90": 0.0,\n  "iprec_at_recall_1.00": 0.0,\n  "P_5": 0.0,\n  "P_10": 0.0,\n  "P_15": 0.0,\n  "P_20": 0.0,\n  "P_30": 0.0,\n  "P_100": 0.0,\n  "P_200": 0.0,\n  "P_500": 0.0,\n  "P_1000": 0.0,\n  "relstring": 0.0,\n  "recall_5": 0.0,\n  "recall_10": 0.0,\n  "recall_15": 0.0,\n  "recall_20": 0.0,\n  "recall_30": 0.0,\n  "recall_100": 0.0,\n  "recall_200": 0.0,\n  "recall_500": 0.0,\n  "recall_1000": 0.0,\n  "infAP": 0.0,\n  "gm_bpref": -11.512925464970229,\n  "Rprec_mult_0.

In [176]:
# Per-query metrics for the ZF collection with the k-means clustering.
# NOTE(review): scored with the same `evaluator` as the AP runs - confirm its
# qrels cover this collection.
get_all_rankings(dataset="zf", cluster="kmeans")

'{\n "51": {\n  "runid": 0.0,\n  "num_q": 1.0,\n  "num_ret": 99.0,\n  "num_rel": 137.0,\n  "num_rel_ret": 0.0,\n  "map": 0.0,\n  "gm_map": -11.512925464970229,\n  "Rprec": 0.0,\n  "bpref": 0.0,\n  "recip_rank": 0.0,\n  "iprec_at_recall_0.00": 0.0,\n  "iprec_at_recall_0.10": 0.0,\n  "iprec_at_recall_0.20": 0.0,\n  "iprec_at_recall_0.30": 0.0,\n  "iprec_at_recall_0.40": 0.0,\n  "iprec_at_recall_0.50": 0.0,\n  "iprec_at_recall_0.60": 0.0,\n  "iprec_at_recall_0.70": 0.0,\n  "iprec_at_recall_0.80": 0.0,\n  "iprec_at_recall_0.90": 0.0,\n  "iprec_at_recall_1.00": 0.0,\n  "P_5": 0.0,\n  "P_10": 0.0,\n  "P_15": 0.0,\n  "P_20": 0.0,\n  "P_30": 0.0,\n  "P_100": 0.0,\n  "P_200": 0.0,\n  "P_500": 0.0,\n  "P_1000": 0.0,\n  "relstring": 0.0,\n  "recall_5": 0.0,\n  "recall_10": 0.0,\n  "recall_15": 0.0,\n  "recall_20": 0.0,\n  "recall_30": 0.0,\n  "recall_100": 0.0,\n  "recall_200": 0.0,\n  "recall_500": 0.0,\n  "recall_1000": 0.0,\n  "infAP": 0.0,\n  "gm_bpref": -11.512925464970229,\n  "Rprec_mult_0.

In [177]:
# Per-query metrics for the ZF collection with the CLARANS clustering.
# NOTE(review): scored with the same `evaluator` as the AP runs - confirm its
# qrels cover this collection.
get_all_rankings(dataset="zf", cluster="clarans")

'{\n "51": {\n  "runid": 0.0,\n  "num_q": 1.0,\n  "num_ret": 12.0,\n  "num_rel": 137.0,\n  "num_rel_ret": 0.0,\n  "map": 0.0,\n  "gm_map": -11.512925464970229,\n  "Rprec": 0.0,\n  "bpref": 0.0,\n  "recip_rank": 0.0,\n  "iprec_at_recall_0.00": 0.0,\n  "iprec_at_recall_0.10": 0.0,\n  "iprec_at_recall_0.20": 0.0,\n  "iprec_at_recall_0.30": 0.0,\n  "iprec_at_recall_0.40": 0.0,\n  "iprec_at_recall_0.50": 0.0,\n  "iprec_at_recall_0.60": 0.0,\n  "iprec_at_recall_0.70": 0.0,\n  "iprec_at_recall_0.80": 0.0,\n  "iprec_at_recall_0.90": 0.0,\n  "iprec_at_recall_1.00": 0.0,\n  "P_5": 0.0,\n  "P_10": 0.0,\n  "P_15": 0.0,\n  "P_20": 0.0,\n  "P_30": 0.0,\n  "P_100": 0.0,\n  "P_200": 0.0,\n  "P_500": 0.0,\n  "P_1000": 0.0,\n  "relstring": 0.0,\n  "recall_5": 0.0,\n  "recall_10": 0.0,\n  "recall_15": 0.0,\n  "recall_20": 0.0,\n  "recall_30": 0.0,\n  "recall_100": 0.0,\n  "recall_200": 0.0,\n  "recall_500": 0.0,\n  "recall_1000": 0.0,\n  "infAP": 0.0,\n  "gm_bpref": -11.512925464970229,\n  "Rprec_mult_0.