In [1]:
import os
import pandas as pd
import scipy.stats as stats

In [2]:
# Store all documents that are available in qrel files for each topic in a dictionary


def get_qrel_dict(qrel_path):
    with open(qrel_path) as q:
        qrel_line = q.readlines()

    qrel_dict = {}
    for line in qrel_line:
        line = line.split(' ')
        topic = line[0]
        if line[3].strip() == '1':
            if not topic in qrel_dict:
                qrel_dict[topic] = [line[2].strip()]
            else:
                qrel_dict[topic].append(line[2].strip())
    return qrel_dict

In [3]:
# Calculate the precision@k score for each-system-each-topic

def precision_at_k(df, topic, dict, k):
    count = 0
    df = df[:k]
    for doc in df['doc']:
        if doc in dict[topic]:
            count += 1
    return count/k

In [4]:
# Calculate the average precision for each topic in each system

def get_avg_precision(topic_df, topic, dict):
    precisions = []
    avg_precision = 0.0
    count = 0
    for i, doc in enumerate(topic_df['doc']):
        if doc in dict[topic]:
            count += 1
            precision = count / (i+1)
            precisions.append(precision)

    if precisions:
        avg_precision = sum(precisions)/len(precisions)
    return avg_precision

In [5]:
# Check whether every topic in a file have at least 100 documents

def check_100(df, topic_list):
    for topic in topic_list:
        df_topic = df[df['topic'] == topic]
        if len(df_topic) < 100:
            return False
    return True

In [6]:
# Calculate the average overall percision@10 score and MAP score

def get_scores(files, dict):

    df_precision_10 = pd.DataFrame()
    df_map = pd.DataFrame()

    for file in files:
        # print(file)
        with open(f'input/{file}') as f:
            lines = f.readlines()

        df = pd.DataFrame({"topic": [],
                            "doc": [],
                            "ranking": [],
                            "score": [],
                            })
        topic = list()
        doc = list()
        ranking = list()
        score = list()

        for line in lines:
            topic.append(int(line.strip().split("\t")[0]))
            doc.append(line.strip().split("\t")[2])
            ranking.append(int(line.strip().split("\t")[3]))
            score .append(float(line.strip().split("\t")[4]))

        df['topic'] = topic
        df['doc'] = doc
        df['ranking'] = ranking
        df['score'] = score

        topic_list = sorted(set(topic))

        # Check if every file have topic form 401 to 450 and every topic have 100 documents
        if (all(int(num) in topic_list for num in range(401, 451))) and check_100(df, topic_list):
            for topic in topic_list:
                df_topic = df[df['topic'] == topic]
                df_topic = df_topic.sort_values("score", ascending=False)
                # df_top10 = df_topic[:10]  # Select Top 10 for precision@10
                df_100 = df_topic[:100]  # Avg Precision for depth = 100
                precision_10 = precision_at_k(df_topic, str(topic), dict, 10)
                avg_precision = get_avg_precision(df_100, str(topic), dict)
                df_precision_10.loc[topic, file] = precision_10
                df_map.loc[topic, file] = avg_precision
                # Mean percision@10 score in the average column
            df_precision_10.loc['Mean', file] = df_precision_10[file].mean()
            df_map.loc['Mean', file] = df_map[file].mean()
        else:
            print(file, 'do not have enough data')

    return df_precision_10, df_map

In [7]:
def sig_test (df, files):
    sig_df = pd.DataFrame(columns=files, index=files)
    for row in sig_df:
        for file in files:
            if sig_df[row][file] != 'nan':
                t_stats, p_value = stats.ttest_ind(a=df[row], b=df[file] ,equal_var=True)
                sig_df[row][file] = p_value
    return sig_df

In [8]:
# Input files path
input_files = os.listdir(os.path.join(os.getcwd(), 'input'))

In [9]:
# Qrel path
qrel = os.path.join(os.getcwd(), "qrels.trec8.adhoc")
qrel_dict = get_qrel_dict(qrel)

In [10]:
# Dataframe to store all percision@10 scores (index=topic, column=system)
precision10_df, map_df = get_scores(input_files, qrel_dict)
precision_sig = sig_test(precision10_df,input_files)
maps_sig = sig_test(map_df, input_files)

In [11]:
print('----------Mean percision@10 scores----------')
print(precision10_df.loc['Mean'])

----------Mean percision@10 scores----------
input.acsys8alo2    0.474
input.apl8ctd       0.450
input.att99atdc     0.540
input.att99ate      0.476
input.CL99XTopt     0.692
input.Flab8as       0.486
input.fub99td       0.510
input.GE8ATDN1      0.502
input.GE8MTD2       0.602
input.ibms99c       0.470
input.INQ602        0.472
input.mds08a2       0.416
input.Mer8Adtd4     0.486
input.nttd8alx      0.476
input.orcl99man     0.722
Name: Mean, dtype: float64


In [12]:
print('----------MAP scores----------')
print(map_df.loc['Mean'])

----------MAP scores----------
input.acsys8alo2    0.455366
input.apl8ctd       0.425485
input.att99atdc     0.487752
input.att99ate      0.439029
input.CL99XTopt     0.602349
input.Flab8as       0.454345
input.fub99td       0.492832
input.GE8ATDN1      0.492043
input.GE8MTD2       0.562690
input.ibms99c       0.460350
input.INQ602        0.445300
input.mds08a2       0.394284
input.Mer8Adtd4     0.471727
input.nttd8alx      0.462806
input.orcl99man     0.648299
Name: Mean, dtype: float64


In [13]:
print('----------Significant Testing (Precision@10)----------')
print(precision_sig)

----------Significant Testing (Precision@10)----------
                 input.acsys8alo2 input.apl8ctd input.att99atdc   
input.acsys8alo2              1.0      0.702648        0.298648  \
input.apl8ctd            0.702648           1.0        0.176844   
input.att99atdc          0.298648      0.176844             1.0   
input.att99ate           0.974992      0.697148        0.342276   
input.CL99XTopt          0.000172      0.000088         0.01246   
input.Flab8as            0.847643       0.58286        0.413779   
input.fub99td            0.555426       0.35042        0.642489   
input.GE8ATDN1           0.627662      0.394659        0.536895   
input.GE8MTD2            0.034818      0.017623        0.330942   
input.ibms99c            0.946338      0.749503         0.26881   
input.INQ602             0.972527      0.719925        0.272894   
input.mds08a2            0.333498       0.58972        0.053016   
input.Mer8Adtd4          0.839775      0.565212        0.392464   
input.n

In [14]:
print('----------Significant Testing (MAPS@100)----------')
print(maps_sig)

----------Significant Testing (MAPS@100)----------
                 input.acsys8alo2 input.apl8ctd input.att99atdc   
input.acsys8alo2              1.0      0.538043         0.50944  \
input.apl8ctd            0.538043           1.0         0.22501   
input.att99atdc           0.50944       0.22501             1.0   
input.att99ate           0.747863      0.797698         0.36204   
input.CL99XTopt          0.001182      0.000234        0.016321   
input.Flab8as            0.983138       0.56732        0.512493   
input.fub99td            0.456839      0.199928        0.923418   
input.GE8ATDN1            0.41453      0.160156        0.928357   
input.GE8MTD2            0.022976      0.005801        0.131015   
input.ibms99c             0.91673      0.484529        0.586608   
input.INQ602             0.828452      0.683915        0.389178   
input.mds08a2            0.183275      0.515572        0.056187   
input.Mer8Adtd4            0.7254      0.344066        0.745296   
input.nttd8