In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Root of the local Anserini checkout. Set the ANSERINI_ROOT environment
# variable to run this notebook on another machine; the default keeps the
# original location, so existing setups behave exactly as before.
anserini_root = os.environ.get("ANSERINI_ROOT", "/home/lo/Documents/Scriptie/anserini").rstrip("/")

# Setup the extraction of QET (query execution time).
# The directory names keep their trailing "/" because they may be concatenated
# directly into file paths.
qet_dir = anserini_root + "/results/search_logs/"  # search logs, one sub-directory per model
que_dir = anserini_root + "/topics/"               # topic (query) files
slr_dir = "slr/"
bm25_dir = "bm25/"
den_dir = "dense/"
ziph_dir = "zipf/"

# Setup the extraction of QL (query length): topic file per model, relative to que_dir
word_que = "04.testset_num_query_lower"
slr_que = "04.testset_num_query_lower_glove_stop_lucene_remove_unk_max_len_1500.tsv_reprs.tsv"
den_que = "04.testset_num_query_lower_simulated_dense.tsv"
ziph_que = "zipfian_robust04_slr_topics.tsv"

def get_QET_dict(log_dir, base_dir=None):
    """Collect the query execution times found in every log file of a directory.

    Only lines containing "thread-" are considered. The text after the last "-"
    on such a line (minus its first character) is split on single spaces; when
    that yields exactly two fields they are taken as ``<query id> <time>``, and
    the time is the integer in front of the first "m" of the second field.

    Parameters
    ----------
    log_dir : str
        Sub-directory holding the log files, relative to ``base_dir``.
    base_dir : str, optional
        Directory containing ``log_dir``. Defaults to the module-level ``qet_dir``.

    Returns
    -------
    dict
        Maps each query id (str) to the list of its measured times (int), in
        the order the measurements were read.

    Raises
    ------
    ValueError
        If a matching line carries a time that is not an integer.
    """
    if base_dir is None:
        base_dir = qet_dir
    folder = os.path.join(base_dir, log_dir)

    QET_dict = {}

    # Sorted so the read order (and the printed listing) is reproducible;
    # os.listdir returns entries in arbitrary order.
    for file in sorted(os.listdir(folder)):
        path = os.path.join(folder, file)
        if not os.path.isfile(path):
            # Sub-directories (e.g. editor checkpoints) cannot be opened as logs.
            continue
        print(path)
        # The context manager closes the handle even if parsing fails.
        with open(path, "r") as log:
            for line in log:
                if "thread-" not in line:
                    continue
                fields = line.split("-")[-1][1:].split(" ")
                if len(fields) != 2:
                    continue
                qid, raw_time = fields
                # If no "m" is present, find() returns -1 and the slice drops the
                # last character (normally the newline), as the original code did.
                QET_dict.setdefault(qid, []).append(int(raw_time[:raw_time.find("m")]))

    return QET_dict

def get_slr_query_len(file, slr=False, base_dir=None):
    """Compute the length of every query in a tab-separated topic file.

    Each non-empty line must look like ``<query id>\\t<query>``, where the query
    is a sequence of tokens separated by single spaces.

    Parameters
    ----------
    file : str
        Topic file name, relative to ``base_dir``.
    slr : bool, optional
        When True, tokens equal to "0.0" or "0.0000000" are not counted, so the
        length is the number of remaining tokens. When False (default), every
        token is counted.
    base_dir : str, optional
        Directory containing ``file``. Defaults to the module-level ``que_dir``.

    Returns
    -------
    dict
        Maps each query id (str) to its length (int).

    Raises
    ------
    ValueError
        If a non-empty line contains no tab separator.
    """
    if base_dir is None:
        base_dir = que_dir

    zero_tokens = ("0.0", "0.0000000")
    QL_dict = {}

    with open(os.path.join(base_dir, file), "r") as topics:
        for line in topics:
            # Drop the line terminator first: otherwise the last token reads
            # "0.0\n", fails the zero comparison and is counted as active.
            line = line.rstrip("\r\n")
            if not line:
                continue
            qid, que = line.split("\t", 1)
            tokens = que.split(" ")
            if slr:
                QL_dict[qid] = sum(1 for t in tokens if t not in zero_tokens)
            else:
                QL_dict[qid] = len(tokens)

    return QL_dict

def best_fit(X, Y):
    """Fit the least-squares line ``y = a + b*x`` through the points (X, Y).

    The fitted equation is also printed with two decimals.

    Parameters
    ----------
    X, Y : sequence of numbers
        Coordinates of the points; both must have the same, non-zero length.

    Returns
    -------
    tuple
        ``(a, b)``: intercept and slope of the fitted line.

    Raises
    ------
    ValueError
        If the inputs are empty, differ in length, or all x values are equal
        (the slope is undefined in that case).
    """
    n = len(X)
    if n == 0 or n != len(Y):
        # zip() would silently truncate to the shorter input while the means
        # still use the full sequences, which yields a wrong line.
        raise ValueError("best_fit needs two non-empty sequences of equal length")

    xbar = sum(X) / n
    ybar = sum(Y) / n

    # Centered sums are algebraically equal to sum(x*y) - n*xbar*ybar and
    # sum(x^2) - n*xbar^2, but avoid subtracting two huge, nearly equal numbers.
    numer = sum((xi - xbar) * (yi - ybar) for xi, yi in zip(X, Y))
    denum = sum((xi - xbar) ** 2 for xi in X)

    if denum == 0:
        raise ValueError("best_fit is undefined when all x values are identical")

    b = numer / denum
    a = ybar - b * xbar

    print('best fit line:\ny = {:.2f} + {:.2f}x'.format(a, b))

    return a, b

In [3]:
# Load, per model, the query execution times (QET) from the search logs and the
# query lengths (QL) from the matching topic file.
# get_QET_dict prints the path of every log file it reads.
# With slr=True, tokens equal to "0.0" / "0.0000000" are left out of the length.
slr_QETs = get_QET_dict(slr_dir)
slr_QLs = get_slr_query_len(slr_que, slr=True)
ziph_QETs = get_QET_dict(ziph_dir)
ziph_QLs = get_slr_query_len(ziph_que, slr=True)
bm25_QETs = get_QET_dict(bm25_dir)
# BM25 uses the default slr=False, so every token of the query is counted.
bm25_QLs = get_slr_query_len(word_que)
den_QETs = get_QET_dict(den_dir)
den_QLs = get_slr_query_len(den_que, slr=True)

/home/lo/Documents/Scriptie/anserini/results/search_logs/slr/robust04_slr_searchlog_qets_02.txt
/home/lo/Documents/Scriptie/anserini/results/search_logs/slr/robust04_slr_searchlog_qets_01.txt
/home/lo/Documents/Scriptie/anserini/results/search_logs/slr/robust04_slr_searchlog_qets_03.txt
/home/lo/Documents/Scriptie/anserini/results/search_logs/zipf/robust01_zipf_searchlog_qets_01.txt
/home/lo/Documents/Scriptie/anserini/results/search_logs/zipf/robust01_zipf_searchlog_qets_02.txt
/home/lo/Documents/Scriptie/anserini/results/search_logs/zipf/robust01_zipf_searchlog_qets_04.txt
/home/lo/Documents/Scriptie/anserini/results/search_logs/zipf/robust01_zipf_searchlog_qets_03.txt
/home/lo/Documents/Scriptie/anserini/results/search_logs/bm25/robust04_bm25_searchlog_qets_01.txt
/home/lo/Documents/Scriptie/anserini/results/search_logs/bm25/robust04_bm25_searchlog_qets_02.txt
/home/lo/Documents/Scriptie/anserini/results/search_logs/bm25/robust04_bm25_searchlog_qets_03.txt
/home/lo/Documents/Scripti

In [4]:
def plot_QET_QL(QET, QS, factor=1000, xlabel="SLR active dim."):
    """Scatter-plot the average execution time of each query against its length.

    A degree-1 polynomial fit is drawn on top of the scatter plot, the figure is
    shown, and the line computed by ``best_fit`` is returned.

    Parameters
    ----------
    QET : dict
        Maps a query id to the list of its measured execution times.
    QS : dict
        Maps a query id to its length (the x coordinate).
    factor : number, optional
        Divisor applied to the average time before plotting. The y-axis label
        says milliseconds, which presumably assumes microsecond input with the
        default factor of 1000 -- TODO confirm the unit of the logged times.
    xlabel : str, optional
        Label of the x-axis. The default keeps the original SLR wording; pass
        another label when plotting a different model.

    Returns
    -------
    tuple
        ``(a, b)``: intercept and slope as returned by ``best_fit``.
    """
    x_values = []
    y_values = []
    missing = []

    for key, times in QET.items():
        if key not in QS:
            # A timed query without a known length cannot be placed on the x-axis.
            missing.append(key)
            continue
        x_values.append(QS[key])
        y_values.append(np.average(times) / factor)

    if missing:
        print("skipped {} queries without a known length: {}".format(len(missing), missing))

    # Sorted distinct x positions at which the fitted line is evaluated.
    fit_x = np.unique(x_values)
    fit_line = np.poly1d(np.polyfit(x_values, y_values, 1))

    plt.scatter(x_values, y_values)
    plt.plot(fit_x, fit_line(fit_x))
    plt.xlabel(xlabel)
    plt.ylabel("avg. QET in milliseconds")
    plt.show()
    return best_fit(x_values, y_values)

def print_QET_stat(QET, model_name="no-model"):
    """Print mean and standard deviation of the per-query average execution times.

    The times are treated as microseconds and reported in the largest unit
    (minutes, seconds, milliseconds) in which the mean exceeds 1. Means of at
    most 1000 are reported in microseconds instead of being silently dropped.

    Parameters
    ----------
    QET : dict
        Maps a query id to the list of its measured execution times.
    model_name : str, optional
        Tag printed in square brackets in front of the statistics.
    """
    # One average per query, so every query weighs the same regardless of how
    # many times it was measured.
    values = [np.average(times) for times in QET.values()]
    prefix = "[" + model_name + "] \t QET(avg, std): ("

    if not values:
        # np.average of an empty list is NaN; report the empty input explicitly.
        print(prefix + "n/a) n=0")
        return

    avg = np.average(values)
    std = np.std(values)

    # (divisor from microseconds, unit suffix), largest unit first.
    divisor, unit = 1, "us"
    for candidate_divisor, candidate_unit in ((60000000, "min"), (1000000, "s"), (1000, "ms")):
        if avg / candidate_divisor > 1:
            divisor, unit = candidate_divisor, candidate_unit
            break

    print(prefix + str(avg / divisor) + unit + ", " + str(std / divisor) + unit + ")"
          + " n=" + str(len(values)))
        
def print_QL_stat(QL, model_name="no-model"):
    """Print the mean and standard deviation of the query lengths in ``QL``.

    ``QL`` maps a query id to its length; ``model_name`` is the tag shown in
    square brackets in front of the statistics. Lengths are suffixed with "t".
    """
    lengths = [np.average(length) for length in QL.values()]

    mean_length = np.average(lengths)
    spread = np.std(lengths)

    pieces = ["[", model_name, "] \t QL(avg, std): (", str(mean_length), "t, ", str(spread), "t)"]
    print("".join(pieces))
    
# Scatter plots of execution time against query length, currently disabled.
# plot_QET_QL(slr_QETs, slr_QLs)
# plot_QET_QL(bm25_QETs, bm25_QLs)
# plot_QET_QL(den_QETs, den_QLs)

# Mean and standard deviation of the per-query average execution time, per model.
print_QET_stat(slr_QETs, "slr")
print_QET_stat(ziph_QETs, "slr_ziph")
print_QET_stat(bm25_QETs, "bm25")
print_QET_stat(den_QETs, "dense")

# Mean and standard deviation of the query length, per model.
print_QL_stat(slr_QLs, "slr")
print_QL_stat(ziph_QLs, "slr_ziphian")
print_QL_stat(bm25_QLs, "bm25")
print_QL_stat(den_QLs, "dense")



[slr] 	 QET(avg, std): (28.981505113333338s, 7.379136121647301s) n=250
[slr_ziph] 	 QET(avg, std): (3.5794327088353417s, 0.23158692212561793s) n=249
[bm25] 	 QET(avg, std): (9.120031999999998ms, 7.418485160243551ms) n=250
[dense] 	 QET(avg, std): (3.5756492701466667min, 0.07425655223650961min) n=250
[slr] 	 QL(avg, std): (404.568t, 77.93831776475548t)
[slr_ziphian] 	 QL(avg, std): (390.52208835341366t, 19.197151783601534t)
[bm25] 	 QL(avg, std): (2.936t, 0.9736036154411095t)
[dense] 	 QL(avg, std): (5000.0t, 0.0t)
