In [71]:
def __get_documents(path):
    documents = {}
    doc_ids = []

    with open(path) as f:
        content = f.readlines()
        for line in content:
            values = line.split("\t", 1)
            id = values[0]
            text = values[1]
            documents[id] = text
            doc_ids.append(id)
    return documents, doc_ids

def __get_queries(path):
    queries = {}
    query_ids = []

    with open(path) as f:
        content = f.readlines()
        for line in content:
            values = line.split("\t", 1)
            id = values[0]
            text = values[1]
            queries[id] = text
            query_ids.append(id)
    return queries, query_ids

def __get_ratings(path):
    ratings = {}

    with open(path) as f:
        content = f.readlines()
        for line in content:
            values = line.split("\t")
            query = values[0]
            text = values[2]
            rating = float(values[3])

            if query in ratings.keys():
                ratings[query][text] = rating
            else:
                ratings[query] = {text: rating}

    return ratings

### Merge queries data 

In [74]:
import sys
import csv

csv.field_size_limit(sys.maxsize)

path_queries_train = "./all_queries/train.all.queries"
path_queries_dev = "./all_queries/dev.all.queries"
path_queries_merged = "./all_queries/merged.all.queries"

queries_train, query_ids = __get_queries(path_queries_train)

print("Amount of queries in train: \t"+str(len(queries_train)))
        
queries_dev, query_ids = __get_queries(path_queries_dev)
        
print("Amount of queries in dev: \t"+str(len(queries_dev)))

queries_train.update(queries_dev)

print("Amount of queries: \t"+str(len(queries_train)))

with open(path_queries_merged, mode='w') as merge_file:
    for key in queries_train.keys():
        merge_file.write(key+"\t"+queries_train[key])
        
queries_merged, query_ids = __get_queries(path_queries_merged)

print("Amount of queries merged: \t"+str(len(queries_merged)))


Amount of queries in train: 	2594
Amount of queries in dev: 	325
Amount of queries: 	2919
Amount of queries merged: 	2919


### Merge docs data

In [75]:
import sys
import csv

csv.field_size_limit(sys.maxsize)

path_docs_train = "./all_docs/train.docs"
path_docs_dev = "./all_docs/dev.docs"
path_docs_merged = "./all_docs/merged.docs"

docs_train, doc_ids = __get_documents(path_docs_train)

print("Amount of docs in train: \t"+str(len(docs_train)))
        
docs_dev, doc_ids = __get_documents(path_docs_dev)
        
print("Amount of docs in dev: \t"+str(len(docs_dev))+"\n")

docs_train.update(docs_dev)

print("Amount of docs: \t"+str(len(docs_train)))

with open(path_docs_merged, mode='w') as merge_file:
    for key in docs_train.keys():
        merge_file.write(key+"\t"+docs_train[key])
        
docs_merged, query_ids = __get_documents(path_docs_merged)

print("Amount of docs merged: \t"+str(len(docs_merged)))


Amount of docs in train: 	3612
Amount of docs in dev: 	3193

Amount of docs: 	3626
Amount of docs merged: 	3625


### Merge ratings data

In [78]:
import sys
import csv

csv.field_size_limit(sys.maxsize)

path_ratings_train = "./all_qrels/3-2-1/train.3-2-1.qrel"
path_ratings_dev = "./all_qrels/3-2-1/dev.3-2-1.qrel"
path_ratings_merged = "./all_qrels/3-2-1/merged.3-2-1.qrel"

ratings_train = __get_ratings(path_ratings_train)

print("Amount of ratings in train: \t"+str(len(ratings_train)))
        
ratings_dev = __get_ratings(path_ratings_dev)
        
print("Amount of ratings in dev: \t"+str(len(ratings_dev)))

ratings_train.update(ratings_dev)

print("Amount of ratings: \t"+str(len(ratings_train)))

with open(path_ratings_merged, mode='w') as merge_file:
    for query in ratings_train.keys():
        for doc in ratings_train[query].keys():
            merge_file.write(query + "\t" + "0" + "\t" + doc + "\t" + str(ratings_train[query][doc])+"\n")
        
ratings_merged = __get_ratings(path_ratings_merged)

print("Amount of ratings merged: \t"+str(len(ratings_merged)))

Amount of ratings in train: 	2594
Amount of ratings in dev: 	325
Amount of ratings: 	2919
Amount of ratings merged: 	2919
