In [760]:
import numpy as np
import xml.etree.ElementTree as ET
import pandas as pd
import sys

In [761]:
def get_topics(tree):
    topics = []
    root = tree.getroot()
    
    for child in root:
        topics.append(child)
    
    return topics

In [762]:
def get_file_line_length(f):
    with open(f) as f:
        for i, l in enumerate(f):
            pass
    return i + 1

In [763]:
# Input: topic, max number of keywords
# Output: set of strings of keywords

def get_topic_keywords_set(topic, n=5):
    f = ['\n', '、', '。', '，']
    keywords_dict = {}
    keywords = list(topic[1].text) + list(topic[2].text) + list(topic[3].text) + list(topic[4].text)
    keywords = [x for x in keywords if x not in f]

    for k in keywords:
        if str(k) in keywords_dict:
            keywords_dict[str(k)] += 1
        else:
            keywords_dict[str(k)] = 1
        
    res = sorted(keywords_dict.items(), key=lambda item: item[1])
    res = res[len(res)-n:]
    res = [x[0] for x in res]

    return res
        

In [764]:
# Input: set of keyword strings
# Output: set of keyword_id ints

def get_keywords_id_set(keywords, model_dir):
    kw_set = []
    f = open('.{}/vocab.all'.format(model_dir), 'r', encoding='utf-8')
    
    for l, w in enumerate(f.readlines()):
        w = w.strip()
        if str(w) in keywords:
            kw_set.append(l)
    
    f.close()
    
    return kw_set
        

In [765]:
# Input: keyword id
# Output: a tuple of (tf, set of doc_id strings)

def get_kw_docs_set(kw_id, model_dir):
    kw_id = str(kw_id)
    doc_set = set()
    f = open('.{}/inverted-file'.format(model_dir), 'r', encoding='utf-8')
    found = False
    a = -1
    tf = 0
    
    for i, line in enumerate(f):
        if found and i > a:
            found = False
            a = -1
        l = list(str(line).strip().split(' '))
        # word id metadata
        if len(l) == 3:
            if l[0] == kw_id or l[1] == kw_id:
                found = True
#             if l[1] == '-1':
                a = i+int(l[2])
                tf += int(l[2])
        elif len(l) == 2 and a != -1 and found:
            doc_set.add(l[0])
    
    f.close()
    
    return (tf, doc_set)

In [766]:
# Input: keyword id
# Output: a tuple of (tf, set of doc_id strings)

def get_wid_dict(model_dir):
    f = open('.{}/inverted-file'.format(model_dir), 'r', encoding='utf-8')
    res = {}
    curr = None
    
    for i, line in enumerate(f):
        
        l = list(str(line).strip().split(' '))
        
        if len(l) == 3:
            curr = int(l[0])
            if curr not in res:
                res[curr] = {}
        elif len(l) == 2:
            if int(l[0]) in res[curr]:
                res[curr][int(l[0])] += int(l[1])
            else:
                res[curr][int(l[0])] = int(l[1])
    
    f.close()
    
    return res

In [767]:
# Input: set of strings of keyword ids
# Output: list of sets of doc_id with keyword

def get_topic_kw_doc_set(topic_kw_set, model_dir):
    res = []
    
    for kw_id in topic_kw_set:
        print(kw_id)
        res.append(get_kw_docs_set(kw_id, model_dir))
        
    return res

In [768]:
def get_topics_tf_docs(topics_kw_sets, model_dir):
    res = []
    
    for t in topics_kw_sets:
        res.append(get_topic_kw_doc_set(t, model_dir))
        
    return res

In [769]:
# Input: topics
# Output: list of sets of topic keywords

def get_topics_keywords(topics):
    topics_keywords = []

    for t in topics:
        kw = get_topic_keywords_set(t)
        topics_keywords.append(kw)
        
    return topics_keywords

In [770]:
# Input: 
# Output: list of sets of keyword ids for each topic

def get_topics_kw_wordid_sets(topics_keywords, model_dir):
    topics_kw_sets = []

    for kw in topics_keywords:
        topics_kw_sets.append(get_keywords_id_set(kw, model_dir))
        
    return topics_kw_sets

In [771]:
def get_doc_vector_dict(keyword_ids, t_dict, model_dir):
    f = open('.{}/file-list'.format(model_dir), 'r', encoding='utf-8')
    res = {}
    
    for i, line in enumerate(f):
        # find tf : number of times this word occurs in the document i
        v = []
        for kw_id in list(keyword_ids):
            if i in t_dict[kw_id]:
                v.append(t_dict[kw_id][i])
            else:
                v.append(0)
    
        res[i] = norm_vec(v)
        
    return res
        

In [772]:
def get_doc_vectors_all_topics(keyword_id_set, t_dict):
    res = []
    
    for s in keyword_id_set:
        res.append(get_doc_vector_dict(s, t_dict, model_dir))
        
    return res

In [773]:
def norm_vec(vec):
    res = []
    m = max(vec)
    
    for v in vec:
        if m != 0:
            res.append(float(v)/m)
        else:
            res.append(0.0)
            
    return res

In [774]:
def get_query_vectors(kw_lists, idf_dict):
    res = []
    
    for l in kw_lists:
        v = []
        for kw in l:
            v.append(idf_dict[kw])
        res.append(v)
        
    return res

In [775]:
def get_scores(qv, dv):
    res = []
    
    for i in range(len(qv)):
        tmp = {}
        for k, v in dv[i].items():
            tmp[k] = np.dot(qv[i], np.array(v))
        res.append(tmp)

    return res

In [776]:
def get_n_docs(score_vectors, n=5):
    res = []
    
    for s in score_vectors:
        res.append(dict(sorted(s.items(), key=lambda item: item[1], reverse=True)[:n]))
    
    return res

In [777]:
def get_doc_name(doc_id, model_dir):
    f = '.{}/file-list'.format(model_dir)
    with open(f) as f:
        for i, l in enumerate(f):
            if i == int(doc_id):
                s = str(l).strip()
                s = s.split('/')[3].lower()
                return s
    return 'None'

In [778]:
def create_prediction(d, model_dir, ranked_list_name):
    res = {}
    
    for i in range(len(d)):
        tmp = []
        for k, v in d[i].items():
            tmp.append(get_doc_name(k, model_dir))
        new_key = '0'+'{}'.format(i+11)
        res[new_key] = " ".join(tmp)
    
    res = pd.DataFrame.from_dict(res, orient='index', columns=['retrieved_docs'])
    res.index.name = 'query_id'
    
    res.to_csv('./{}.csv'.format(ranked_list_name))        

In [779]:
def get_N(model_dir):
    f = '.{}/file-list'.format(model_dir)
    with open(f) as f:
        for i, l in enumerate(f):
            pass
    return i + 1

In [780]:
def get_idf(df, N):
    return np.log(N/df)

In [781]:
def execute(t_dict, r, query_file, ranked_list_name, model_dir, NTCIR_dir):
    cf_dict = dict([(x, sum(y.values())) for x, y in t_dict.items()])
    df_dict = dict([(x, len(y.values())) for x, y in t_dict.items()])
    N = get_N(model_dir)
    idf_dict = dict([(x, get_idf(df_dict[x], N)) for x, y in t_dict.items()])
    
    tree = ET.parse('.'+query_file)
    topics = get_topics(tree)
    
    topics_keywords = get_topics_keywords(topics)
    topics_kw_sets = get_topics_kw_wordid_sets(topics_keywords, model_dir)
    
    query_vectors = get_query_vectors(topics_kw_sets, idf_dict)
    query_vectors = np.array(query_vectors)

    doc_vectors = get_doc_vectors_all_topics(topics_kw_sets, t_dict, model_dir)
    score_vectors = get_scores(query_vectors, doc_vectors)
    rel_docs_scores = get_n_docs(score_vectors, 5)
    
    #     ROCCHIO FEEDBACK
    if r:
        Dr = get_dr(doc_vectors, rel_docs_scores)
        Dnr = get_dnr(doc_vectors, rel_docs_scores)
        query_vectors = rocchio_feedback(query_vectors, Dr, Dnr)
        doc_vectors = get_doc_vectors_all_topics(topics_kw_sets, t_dict, model_dir)
        score_vectors = get_scores(query_vectors, doc_vectors)
        rel_docs_scores = get_n_docs(score_vectors, 5)
    
    create_prediction(rel_docs_scores, model_dir, ranked_list_name)

In [782]:
def get_dr(doc_vectors, rel_docs_scores):
    res = []
    
    for i in range(len(rel_docs_scores)):
        tmp = []
        for k, v in rel_docs_scores[i].items():
            tmp.append(doc_vectors[i][k])
        res.append(tmp)

    return np.array(res)

In [783]:
def get_dnr(doc_vectors, rel_docs_scores):
    d_c = doc_vectors.copy()
    res = []
    
    for i in range(len(rel_docs_scores)):
        tmp = []
        for k, v in doc_vectors[i].items():
            if k not in rel_docs_scores[i]: 
                tmp.append(doc_vectors[i][k])
        res.append(tmp)

    return np.array(res)

In [784]:
def get_queries_v_sum(queries_vectors):
    res = []
    for qv in queries_vectors:
        ar = np.array(qv)
        s = np.add.reduce(ar)
        res.append(s)
        
    return np.array(res)

In [785]:
def rocchio_feedback(query_vectors, Dr, Dnr, a=1, b=0.75, g=0.15):
    res = []
    Dr_sum = get_queries_v_sum(Dr)
    Dnr_sum = get_queries_v_sum(Dnr)
    query_vectors = np.array(query_vectors)
    
    for i in range(len(query_vectors)):
        qm = a*query_vectors[i] + (b/len(Dr[i]))*Dr_sum[i] - (g/len(Dnr[i]))*Dnr_sum[i]
        res.append(qm)
        
    return res

In [786]:
argv = ['-r', '-i', '/queries/query-train.xml', '-o', 'ranked_list.csv', '-m', '/model', '-d', '/CIRB010']

In [787]:
r = False
query_file = None
ranked_list_name = None
model_dir = None
NTCIR_dir = None

for i in range(len(argv)-1):
    c = argv[i]
    
    if c == '-r':
        r = True
    elif c == '-i':
        query_file = argv[i+1]
    elif c == '-o':
        ranked_list_name = argv[i+1]
    elif c == '-m':
        model_dir = argv[i+1]
    elif c == '-d':
        NTCIR_dir = argv[i+1]

In [788]:
# word_id : (doc_id : occurrences)
if False:
    t_dict = get_wid_dict(model_dir)

In [789]:
execute(t_dict, r, query_file, ranked_list_name, model_dir, NTCIR_dir)

TypeError: get_doc_vector_dict() missing 1 required positional argument: 'model_dir'