In [386]:
import numpy as np
import xml.etree.ElementTree as ET
import pandas as pd

In [387]:
def get_topics(tree):
    """Return the direct children of the tree's root element as a list.

    `tree` must provide `getroot()`; the children are returned in the
    order the root yields them when iterated.
    """
    return [node for node in tree.getroot()]

In [388]:
def get_file_line_length(f):
    """Count the lines of the text file at path `f`.

    Returns 0 for an empty file. The earlier loop-based version raised
    UnboundLocalError in that case because its counter was never assigned.
    """
    # Use a separate name for the handle so the path argument is not shadowed.
    with open(f) as handle:
        return sum(1 for _ in handle)

In [389]:
def get_topic_keywords_set(topic, n=5):
    """Pick the `n` most frequent characters from a topic's text fields.

    Parameters
    ----------
    topic : indexable element
        Children 1 through 4 are read through their `.text` attribute.
        A child whose `.text` is None contributes no characters.
    n : int
        Maximum number of characters to return.

    Returns
    -------
    list of str
        Single characters ordered from least to most frequent; ties keep
        first-appearance order. Despite the function name this is a list.
        Fewer than `n` entries are returned when the text has fewer than
        `n` distinct characters.
    """
    separators = ['\n', '、', '。', '，']

    # `.text` is None for an empty element; treat that as an empty string.
    text = ''.join((topic[idx].text or '') for idx in range(1, 5))

    counts = {}
    for ch in text:
        if ch in separators:
            continue
        counts[ch] = counts.get(ch, 0) + 1

    ranked = sorted(counts.items(), key=lambda item: item[1])

    # Clamp the start index: a negative start would slice from the end and
    # silently drop entries when there are fewer than n distinct characters.
    start = max(len(ranked) - n, 0)
    return [char for char, _ in ranked[start:]]
        

In [390]:
def get_keywords_id_set(keywords, vocab_path='./model/vocab.all'):
    """Map keyword strings to their line numbers in the vocabulary file.

    Parameters
    ----------
    keywords : collection of str
        Keywords to look up; membership is tested with `in`.
    vocab_path : str
        UTF-8 text file with one vocabulary entry per line.

    Returns
    -------
    list of int
        Zero-based line numbers of the vocabulary lines whose stripped text
        is in `keywords`, in file order. Despite the function name this is
        a list.
    """
    # The context manager closes the file even if reading or decoding fails.
    with open(vocab_path, 'r', encoding='utf-8') as vocab:
        return [
            line_no
            for line_no, word in enumerate(vocab)
            if word.strip() in keywords
        ]
        

In [391]:
def get_kw_docs_set(kw_id, inverted_file='./model/inverted-file'):
    """Collect the documents listed under every entry that mentions `kw_id`.

    The file is read as lines of space-separated fields:

    * a three-field line is an entry header; its third field is the number
      of posting lines that follow it;
    * a two-field line is a posting whose first field is a document id.

    A header matches when either of its first two fields equals `kw_id`.

    Parameters
    ----------
    kw_id : int or str
        Keyword id; compared as a string against the header fields.
    inverted_file : str
        Path of the UTF-8 inverted file.

    Returns
    -------
    tuple (tf, doc_set)
        `tf` is the sum of the third header field over all matching headers
        (the number of posting lines they announce). `doc_set` is the set
        of document-id strings found on those posting lines.
    """
    kw_id = str(kw_id)
    doc_set = set()
    tf = 0
    found = False
    last_posting_line = -1  # index of the final posting line of the current match

    # The context manager closes the file even if parsing raises.
    with open(inverted_file, 'r', encoding='utf-8') as postings:
        for i, line in enumerate(postings):
            # Past the postings announced by the last matching header: stop collecting.
            if found and i > last_posting_line:
                found = False
                last_posting_line = -1

            fields = line.strip().split(' ')
            if len(fields) == 3:
                if fields[0] == kw_id or fields[1] == kw_id:
                    found = True
                    last_posting_line = i + int(fields[2])
                    tf += int(fields[2])
            elif len(fields) == 2 and last_posting_line != -1 and found:
                doc_set.add(fields[0])

    return (tf, doc_set)

In [392]:
def get_wid_dict(inverted_file='./model/inverted-file'):
    """Load the inverted file into a nested dict: word_id -> {doc_id: count}.

    The file is read as lines of space-separated fields. A three-field line
    starts a new entry keyed by its first field only, so entries that share
    a first field are merged. A two-field line `doc_id count` adds `count`
    to that document under the current entry.

    Parameters
    ----------
    inverted_file : str
        Path of the UTF-8 inverted file.

    Returns
    -------
    dict
        `{word_id (int): {doc_id (int): summed count (int)}}`.

    Raises
    ------
    KeyError
        If a two-field line appears before any three-field header line.
    """
    index = {}
    current_word = None

    # The context manager closes the file even if parsing raises.
    with open(inverted_file, 'r', encoding='utf-8') as postings:
        for line in postings:
            fields = line.strip().split(' ')
            if len(fields) == 3:
                current_word = int(fields[0])
                index.setdefault(current_word, {})
            elif len(fields) == 2:
                doc_id, count = int(fields[0]), int(fields[1])
                per_doc = index[current_word]
                per_doc[doc_id] = per_doc.get(doc_id, 0) + count

    return index

In [393]:
def get_topic_kw_doc_set(topic_kw_set):
    """Look up every keyword id of one topic with `get_kw_docs_set`.

    Each id is printed just before its lookup. Returns a list holding one
    `get_kw_docs_set` result per id, in iteration order.
    """
    lookups = []

    for keyword_id in topic_kw_set:
        print(keyword_id)
        lookups += [get_kw_docs_set(keyword_id)]

    return lookups

In [394]:
def get_topics_tf_docs(topics_kw_sets):
    """Apply `get_topic_kw_doc_set` to each topic's keyword ids.

    Returns a list with one result per topic, in input order.
    """
    return [get_topic_kw_doc_set(kw_ids) for kw_ids in topics_kw_sets]

In [395]:
def get_topics_keywords(topics):
    """Extract keywords for every topic.

    Calls `get_topic_keywords_set` with its default keyword limit on each
    topic and returns the results as a list, in input order.
    """
    return [get_topic_keywords_set(topic) for topic in topics]

In [396]:
def get_topics_kw_wordid_sets(topics_keywords):
    """Convert each topic's keywords to keyword ids.

    Calls `get_keywords_id_set` once per topic and returns the results as a
    list, in input order.
    """
    return [get_keywords_id_set(keywords) for keywords in topics_keywords]

In [397]:
def get_doc_vector_dict(keyword_ids, t_dict, file_list='./model/file-list'):
    """Build one normalised keyword-count vector per document.

    Parameters
    ----------
    keyword_ids : iterable
        Keyword ids; each must be a key of `t_dict`.
    t_dict : dict
        `{keyword_id: {doc_id: count}}`.
    file_list : str
        Path of the file list. Only its line count is used: line index `i`
        is taken as document id `i`.

    Returns
    -------
    dict
        `{doc_id: norm_vec(counts)}`, where `counts` holds the count of each
        keyword in that document (0 when absent), in `keyword_ids` order.

    Raises
    ------
    KeyError
        If a keyword id is missing from `t_dict`.
    """
    # Materialise once instead of rebuilding the list for every document.
    keyword_ids = list(keyword_ids)
    vectors = {}

    # The original never closed this handle; the context manager does.
    with open(file_list, 'r', encoding='utf-8') as listing:
        for doc_id, _ in enumerate(listing):
            counts = [t_dict[kw_id].get(doc_id, 0) for kw_id in keyword_ids]
            vectors[doc_id] = norm_vec(counts)

    return vectors
        

In [398]:
def get_doc_vectors_all_topics(keyword_id_set, t_dict):
    """Build the document-vector dict for every topic.

    Calls `get_doc_vector_dict` once per entry of `keyword_id_set` with the
    shared `t_dict`, and returns the results as a list in input order.
    """
    return [get_doc_vector_dict(kw_ids, t_dict) for kw_ids in keyword_id_set]

In [399]:
def norm_vec(vec):
    """Scale a vector by its largest element.

    Parameters
    ----------
    vec : iterable of numbers

    Returns
    -------
    list of float
        Each value divided by `max(vec)`. If the maximum is 0 the result is
        all zeros. An empty input gives an empty list (the earlier version
        raised ValueError from `max()` on an empty sequence).
    """
    values = list(vec)
    if not values:
        return []

    peak = max(values)
    # The zero check does not depend on the element, so do it once.
    if peak == 0:
        return [0.0 for _ in values]

    return [float(v) / peak for v in values]

In [400]:
def get_query_vectors(kw_lists, idf_dict):
    """Turn each list of keyword ids into a list of idf weights.

    Every id is looked up in `idf_dict`; a missing id raises KeyError.
    Returns one weight list per input list, preserving order.
    """
    return [[idf_dict[kw] for kw in kw_list] for kw_list in kw_lists]

In [401]:
def get_scores(qv, dv):
    """Score every document against its topic's query vector.

    `qv[i]` is the query vector of topic `i` and `dv[i]` maps document ids
    to vectors for that topic. Returns a list with one
    `{doc_id: dot(qv[i], vector)}` dict per topic.
    """
    scores = []

    for topic_idx, query in enumerate(qv):
        per_doc = {
            doc_id: np.dot(query, np.array(vector))
            for doc_id, vector in dv[topic_idx].items()
        }
        scores.append(per_doc)

    return scores

In [402]:
def get_n_docs(score_vectors, n=5):
    """Keep the `n` highest-scoring documents of every topic.

    Each element of `score_vectors` maps document ids to scores. The result
    holds one dict per topic with at most `n` entries, inserted from the
    highest score downwards.
    """
    top = []

    for scores in score_vectors:
        ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
        top.append(dict(ranked[:n]))

    return top

In [403]:
def get_doc_name(doc_id, f='./model/file-list'):
    """Return the lower-cased document name stored on line `doc_id` of a file list.

    Parameters
    ----------
    doc_id : int or str
        Zero-based line index; converted with `int()`.
    f : str
        Path of the file list. It is read as UTF-8, the encoding
        `get_doc_vector_dict` uses for the same file.

    Returns
    -------
    str
        The fourth '/'-separated component of the line, lower-cased, or the
        string 'None' when the file has no such line.

    Raises
    ------
    IndexError
        If the selected line has fewer than four '/'-separated components.
    """
    # Convert once rather than on every line of the scan.
    target = int(doc_id)

    # Use a separate handle name so the path parameter is not shadowed.
    with open(f, encoding='utf-8') as listing:
        for line_no, line in enumerate(listing):
            if line_no == target:
                return line.strip().split('/')[3].lower()

    return 'None'

In [404]:
def create_prediction(d):
    """Write the ranked documents of every query to './pred.csv'.

    `d[i]` maps the retrieved document ids of query `i` to scores; only the
    ids are used. Each id is resolved with `get_doc_name`, and the names
    are joined with single spaces. Query ids are '0' followed by `i + 11`.
    The CSV has index column 'query_id' and one column 'retrieved_docs'.
    """
    rows = {}

    for offset in range(len(d)):
        names = [get_doc_name(doc_id) for doc_id in d[offset]]
        query_id = '0' + '{}'.format(offset + 11)
        rows[query_id] = " ".join(names)

    frame = pd.DataFrame.from_dict(rows, orient='index', columns=['retrieved_docs'])
    frame.index.name = 'query_id'

    frame.to_csv('./pred.csv')
        

In [405]:
def get_N(f='./model/file-list'):
    """Return the number of lines in the file list at path `f`.

    The file is read as UTF-8, the encoding `get_doc_vector_dict` uses for
    the same file. Returns 0 for an empty file; the earlier loop-based
    version raised UnboundLocalError because its counter was never assigned.
    """
    # Use a separate handle name so the path parameter is not shadowed.
    with open(f, encoding='utf-8') as listing:
        return sum(1 for _ in listing)

In [409]:
def get_idf(df, N):
    """Return the natural logarithm of `N / df`.

    `df` is the divisor and `N` the dividend; the quotient is passed to
    `np.log`.
    """
    ratio = N / df
    return np.log(ratio)

In [412]:
# t_dict layout: word_id : (doc_id : occurrences), as returned by get_wid_dict().
# The `if False:` guard disables this cell: running it neither calls
# get_wid_dict() nor assigns t_dict.
# NOTE(review): presumably disabled to skip re-reading the inverted file while
# t_dict was already in kernel memory; on a fresh kernel t_dict has to be
# built before execute(t_dict) can be called — confirm and re-enable as needed.
if False:
    t_dict = get_wid_dict()

In [None]:
def execute(t_dict, r=False):
    """Run the retrieval pipeline end to end and write './pred.csv'.

    Steps: compute idf weights from `t_dict`, parse the training queries,
    extract keywords and their ids, build query and document vectors, score
    and keep the top 5 documents per query, then optionally apply one round
    of Rocchio feedback and re-rank before writing the predictions.

    Parameters
    ----------
    t_dict : dict
        `{word_id: {doc_id: occurrences}}`, as built by `get_wid_dict`.
    r : bool
        When true, the top-5 documents of the first ranking are used as the
        relevant set for Rocchio feedback and the documents are re-scored
        with the adjusted query vectors.

    Returns
    -------
    None
        The result is written by `create_prediction`.
    """
    # N must exist before the idf values are computed. It used to be assigned
    # after its first use, which raised UnboundLocalError on every call.
    N = get_N()
    df_dict = {word_id: len(postings) for word_id, postings in t_dict.items()}
    idf_dict = {word_id: get_idf(df, N) for word_id, df in df_dict.items()}

    tree = ET.parse('./queries/query-train.xml')
    topics = get_topics(tree)
    topics_keywords = get_topics_keywords(topics)
    topics_kw_sets = get_topics_kw_wordid_sets(topics_keywords)
    query_vectors = get_query_vectors(topics_kw_sets, idf_dict)
    query_vectors = np.array(query_vectors)

    # Initial ranking. Rocchio feedback needs these results, so they are
    # computed before the feedback step rather than after it.
    doc_vectors = get_doc_vectors_all_topics(topics_kw_sets, t_dict)
    score_vectors = get_scores(query_vectors, doc_vectors)
    rel_docs_scores = get_n_docs(score_vectors, 5)

    # ROCCHIO FEEDBACK: adjust the queries from the first ranking, then re-rank.
    if r:
        Dr = get_dr(doc_vectors, rel_docs_scores)
        Dnr = get_dnr(doc_vectors, rel_docs_scores)
        query_vectors = rocchio_feedback(query_vectors, Dr, Dnr)
        score_vectors = get_scores(query_vectors, doc_vectors)
        rel_docs_scores = get_n_docs(score_vectors, 5)

    create_prediction(rel_docs_scores)

In [413]:
# cf_dict = dict([(x, sum(y.values())) for x, y in t_dict.items()])

In [414]:
# df_dict = dict([(x, len(y.values())) for x, y in t_dict.items()])

In [415]:
# N = get_N()
# idf_dict = dict([(x, get_idf(df_dict[x], N)) for x, y in t_dict.items()])

In [416]:
# # Step 1
# tree = ET.parse('./queries/query-train.xml')
# topics = get_topics(tree)

In [417]:
# # Step 2
# topics_keywords = get_topics_keywords(topics)

In [418]:
# Step 3
# topics_kw_sets = get_topics_kw_wordid_sets(topics_keywords)

In [419]:
# query_vectors = get_query_vectors(topics_kw_sets, idf_dict)
# query_vectors = np.array(query_vectors)

In [421]:
# doc_vectors = get_doc_vectors_all_topics(topics_kw_sets, t_dict)

In [422]:
# score_vectors = get_scores(query_vectors, doc_vectors)

In [423]:
# rel_docs_scores = get_n_docs(score_vectors, 5)

In [424]:
# create_prediction(rel_docs_scores)

In [425]:
def get_dr(doc_vectors, rel_docs_scores):
    """Gather the vectors of the retrieved documents of every topic.

    For topic `i`, the keys of `rel_docs_scores[i]` select entries of
    `doc_vectors[i]`, in key order. The nested list is returned as a NumPy
    array.
    """
    picked = [
        [doc_vectors[topic_idx][doc_id] for doc_id in ranked]
        for topic_idx, ranked in enumerate(rel_docs_scores)
    ]
    return np.array(picked)

In [426]:
def get_dnr(doc_vectors, rel_docs_scores):
    """Gather the vectors of the documents that were NOT retrieved, per topic.

    For topic `i`, every entry of `doc_vectors[i]` whose document id is not
    a key of `rel_docs_scores[i]` is kept, in the dict's iteration order.
    The nested list is returned as a NumPy array.

    The earlier version also made an unused shallow copy of `doc_vectors`.
    That copy is removed: it did needless work and required the argument to
    provide `.copy()`.
    """
    rest = []

    for topic_idx, ranked in enumerate(rel_docs_scores):
        rest.append([
            vector
            for doc_id, vector in doc_vectors[topic_idx].items()
            if doc_id not in ranked
        ])

    return np.array(rest)

In [431]:
def get_queries_v_sum(queries_vectors):
    """Sum the vectors of every query along the first axis.

    Each element of `queries_vectors` is converted to a NumPy array and
    reduced with `np.add.reduce`. The per-query sums are returned stacked
    in a NumPy array.
    """
    totals = [np.add.reduce(np.array(vectors)) for vectors in queries_vectors]
    return np.array(totals)

In [461]:
def rocchio_feedback(query_vectors, Dr, Dnr, a=1, b=0.75, g=0.15):
    """Adjust every query vector with the Rocchio relevance-feedback formula.

    For query `i`:

        q' = a*q + (b/|Dr[i]|) * sum(Dr[i]) - (g/|Dnr[i]|) * sum(Dnr[i])

    Parameters
    ----------
    query_vectors : array-like
        One vector per query.
    Dr, Dnr : sequences
        `Dr[i]` and `Dnr[i]` hold the relevant and non-relevant document
        vectors of query `i`.
    a, b, g : numbers
        Weights of the original query, the relevant documents and the
        non-relevant documents.

    Returns
    -------
    list of numpy.ndarray
        The adjusted vector of every query, in input order.

    Notes
    -----
    A term whose document set is empty is skipped; dividing by the set size
    previously raised ZeroDivisionError. Sums are taken per query, so the
    per-query sets do not need to stack into one rectangular array.
    """
    query_vectors = np.array(query_vectors)
    updated = []

    for i in range(len(query_vectors)):
        qm = a * query_vectors[i]

        # Skip a term whose set is empty instead of dividing by zero.
        if len(Dr[i]) > 0:
            qm = qm + (b / len(Dr[i])) * np.add.reduce(np.array(Dr[i]))
        if len(Dnr[i]) > 0:
            qm = qm - (g / len(Dnr[i])) * np.add.reduce(np.array(Dnr[i]))

        updated.append(qm)

    return updated

In [462]:
# Split every topic's document vectors into those whose ids are keys of
# rel_docs_scores (Dr) and the remaining ones (Dnr).
# NOTE(review): in the visible cells doc_vectors and rel_docs_scores are only
# assigned inside execute() or in commented-out cells, so on a fresh kernel
# this cell would raise NameError — confirm where they are meant to come from.
Dr = get_dr(doc_vectors, rel_docs_scores)
Dnr = get_dnr(doc_vectors, rel_docs_scores)

In [463]:
# Apply Rocchio feedback with the default weights (a=1, b=0.75, g=0.15) and
# display the adjusted query vectors as the cell output.
# NOTE(review): in the visible cells query_vectors is only assigned inside
# execute() or in a commented-out cell — confirm it is defined before running.
rf = rocchio_feedback(query_vectors, Dr, Dnr)
rf

[array([0.83176125, 2.25629411, 4.16337349, 1.28125738, 1.28803409]),
 array([2.50750278, 2.71535418, 1.05449352, 3.46533002, 0.67509645]),
 array([0.82773274, 2.80275854, 0.86306955, 0.27789078, 2.68927761]),
 array([2.78425868, 2.10163288, 3.2241816 , 0.50296173, 4.43135641]),
 array([1.09182497, 7.65707563, 1.27200681, 7.25867988, 3.50191705]),
 array([1.20477518, 3.62509063, 2.90253552, 2.01570914, 1.40859662]),
 array([0.98919356, 3.20106276, 1.83474482, 0.80364288, 0.30535306]),
 array([1.21217908, 1.63060819, 1.84174696, 3.85440138, 3.55079197]),
 array([1.14274234, 0.5724579 , 0.30130011, 1.51513361, 1.25138636]),
 array([0.42860972, 3.50821697, 2.05089216, 3.55977762, 0.39689505])]