In [669]:
import numpy as np
import xml.etree.ElementTree as ET

In [670]:
# Parse the training-queries XML once; the resulting tree is the input of Step 1 below.
tree = ET.parse('./queries/query-train.xml')

In [671]:
def get_topics(tree):
    """Return the direct children of the tree's root element as a list.

    tree: a parsed tree exposing ``getroot()`` (e.g. an ``ElementTree``).

    The children are returned unchanged and in document order.
    """
    return list(tree.getroot())

In [672]:
# Input: topic, max number of keywords
# Output: set of strings of keywords

def get_topic_keywords_set(topic, n=1):
    """Return the (at most) ``n`` most frequent characters of a topic.

    topic: indexable element whose children at positions 1..4 carry the text
        to mine (read through ``.text``); the child at position 0 is ignored.
    n: maximum number of keywords to return. Values <= 0 yield an empty set.

    Returns a set of single-character strings. Newlines and the punctuation
    marks '、', '。', '，' are never counted. When several characters share
    the same count, the ones first seen later in the text are preferred
    (stable ascending sort, tail taken).
    """
    ignored = {'\n', '、', '。', '，'}
    counts = {}

    for idx in range(1, 5):
        # An empty XML element has ``text is None``; treat it as empty text
        # instead of crashing on ``list(None)``.
        for ch in topic[idx].text or '':
            if ch not in ignored:
                counts[ch] = counts.get(ch, 0) + 1

    if n <= 0:
        return set()

    # Ascending by count; ``sorted`` is stable, so ties keep first-seen order.
    ranked = sorted(counts.items(), key=lambda item: item[1])

    # ``ranked[-n:]`` clamps to the whole list when n exceeds its length.
    # The previous ``ranked[len(ranked) - n:]`` produced a negative start
    # index in that case and silently dropped keywords.
    return {ch for ch, _ in ranked[-n:]}
        

In [673]:
# Input: set of keyword strings
# Output: set of keyword_id ints

def get_keywords_id_set(keywords, vocab_path='./model/vocab.all'):
    """Map keyword strings to their ids in the vocabulary file.

    keywords: container of keyword strings (a set gives O(1) membership).
    vocab_path: UTF-8 text file with one vocabulary entry per line.

    Returns the set of 1-based line numbers whose stripped content is in
    ``keywords``.

    NOTE(review): ids are 1-based line numbers of the vocab file; confirm this
    matches the id convention used by the inverted file.
    """
    # ``with`` guarantees the handle is closed even if reading fails, and
    # iterating the file avoids loading the whole vocabulary into memory.
    with open(vocab_path, 'r', encoding='utf-8') as vocab_file:
        return {
            line_no
            for line_no, word in enumerate(vocab_file, 1)
            if word.strip() in keywords
        }
        

In [674]:
# Input: keyword id
# Output: a tuple of (tf, set of doc_id strings)

def get_kw_docs_set(kw_id, inverted_file_path='./model/inverted-file'):
    """Collect the documents listed under every index entry that mentions ``kw_id``.

    kw_id: keyword id (compared as a string against the file's fields).
    inverted_file_path: UTF-8 text file, split on single spaces. A line with
        three fields is treated as an entry header whose third field is the
        number of posting lines that follow; a line with two fields is treated
        as a posting whose first field is the document id.
        (Presumably ``id1 id2 N`` followed by N ``doc_id count`` lines --
        confirm against the model's format description.)

    An entry matches when either of its first two fields equals ``kw_id``.

    Returns ``(tf, doc_set)`` where ``doc_set`` is the set of document-id
    strings found under matching entries.

    NOTE(review): ``tf`` is the sum of the headers' third field (the number of
    posting lines) over all matching entries, not the sum of the per-document
    counts -- confirm this is the intended "term frequency".
    """
    kw_id = str(kw_id)
    doc_set = set()
    tf = 0
    # Index of the last posting line belonging to the most recent matching
    # entry; -1 means "not inside a matching entry".
    window_end = -1

    # The whole file must be scanned: the id may also appear as the second
    # field of entries anywhere in the file.
    with open(inverted_file_path, 'r', encoding='utf-8') as inverted_file:
        for i, line in enumerate(inverted_file):
            fields = line.strip().split(' ')
            if len(fields) == 3:
                if kw_id in fields[:2]:
                    n_postings = int(fields[2])
                    window_end = i + n_postings
                    tf += n_postings
            elif len(fields) == 2 and i <= window_end:
                doc_set.add(fields[0])

    return (tf, doc_set)

In [675]:
# Input: set of keyword id ints (one topic's keywords)
# Output: list of (tf, set of doc_id strings) tuples, one per keyword id

def get_topic_kw_doc_set(topic_kw_set):
    """Look up every keyword id of one topic in the inverted file.

    topic_kw_set: iterable of keyword ids.

    Returns a list with one ``get_kw_docs_set`` result per keyword id, in
    iteration order. Each id is printed before its lookup as a progress trace.
    """
    def lookup(keyword_id):
        print(keyword_id)
        return get_kw_docs_set(keyword_id)

    return [lookup(keyword_id) for keyword_id in topic_kw_set]

In [676]:
def get_topics_tf_docs(topics_kw_sets):
    """Run ``get_topic_kw_doc_set`` on each topic's keyword-id set.

    topics_kw_sets: iterable of keyword-id sets, one per topic.

    Returns a list (one item per topic) of the per-keyword lookup results.
    """
    return [get_topic_kw_doc_set(kw_ids) for kw_ids in topics_kw_sets]

In [677]:
# Input: topics
# Output: list of sets of topic keywords

def get_topics_keywords(topics):
    """Extract the keyword set of every topic.

    topics: iterable of topic elements.

    Returns a list holding, for each topic in order, the result of
    ``get_topic_keywords_set`` called with its default keyword limit.
    """
    return [get_topic_keywords_set(topic) for topic in topics]

In [678]:
# Input: list of sets of keyword strings (one set per topic)
# Output: list of sets of keyword ids for each topic

def get_topics_kw_wordid_sets(topics_keywords):
    """Translate each topic's keyword strings into keyword ids.

    topics_keywords: iterable of keyword-string sets, one per topic.

    Returns a list with one ``get_keywords_id_set`` result per topic, in order.
    """
    return [get_keywords_id_set(keyword_set) for keyword_set in topics_keywords]

In [None]:
def get_topics_tf_idf(tf_docs, N=None):
    """Compute the tf-idf weight of every keyword of every topic.

    tf_docs: list (one item per topic) of lists of ``(tf, doc_set)`` tuples,
        i.e. the structure returned by ``get_topics_tf_docs``.
    N: total number of documents in the collection. When omitted it is read
        once through ``get_N()``.

    Returns a list (one item per topic) of lists of weights, aligned with the
    tuples in ``tf_docs``. A keyword that matched no document gets weight 0.0,
    because its document frequency is zero and the idf is undefined.
    """
    if N is None:
        N = get_N()

    return [
        [
            get_tf_idf_w(tf, N, docs) if docs else 0.0
            for tf, docs in topic_entries
        ]
        for topic_entries in tf_docs
    ]

In [679]:
# def get_topic_kw_w(topic_kwid_set):
#     for kw_id in topic_kwid_set:
        

In [680]:
def get_df(kw_set):
    """Document frequency: the number of entries in ``kw_set``."""
    df = len(kw_set)
    return df

In [681]:
def get_idf(df, N):
    """Inverse document frequency ``log(N / df)`` (natural logarithm).

    df: number of documents containing the term.
    N: total number of documents in the collection.

    Returns 0.0 when ``df`` is 0: a term that appears in no document cannot
    contribute to a score, and ``N / df`` would otherwise raise
    ``ZeroDivisionError``.
    """
    if df == 0:
        return 0.0
    return np.log(N / df)

In [682]:
def get_tf_idf_w(tf, N, kw_set):
    """tf-idf weight of one term.

    tf: term frequency of the term.
    N: total number of documents in the collection.
    kw_set: collection whose size is used as the term's document frequency.

    Returns ``tf`` multiplied by the idf obtained from ``get_idf``.
    """
    df = get_df(kw_set)
    idf = get_idf(df, N)
    return tf * idf

In [683]:
def get_N(f='./model/file-list'):
    """Count the lines of the file at path ``f``.

    f: path of a text file. With the default path the count is used as the
        collection size N in the idf computation.

    Returns the number of lines, 0 for an empty file. The previous
    implementation returned the last loop index + 1, which left the index
    unbound and raised for an empty file.
    """
    # A separate name for the handle avoids shadowing the path parameter.
    with open(f) as file_list:
        return sum(1 for _ in file_list)

In [684]:
# Process:
# 1. Get topics of query -> ET elements
# 2. Get keywords of each topic -> list of sets of strings
# 3. Get keyword ids of each topic -> list of sets of ints
# 4. Get tf and the set of document ids for each keyword of each topic -> list of lists of (tf, set of doc_id strings)
# 5. For each topic for each keyword:
#     get df
#     get idf
#     get tf-idf weighting

In [685]:
# Step 1: collect the topic elements (the direct children of the XML root).
topics = get_topics(tree)

In [686]:
# Step 2: pick each topic's keywords -- its most frequent character(s); the default limit is one per topic.
topics_keywords = get_topics_keywords(topics)

In [687]:
# Step 3: map each topic's keywords to vocabulary ids (line numbers in ./model/vocab.all, counted from 1).
topics_kw_sets = get_topics_kw_wordid_sets(topics_keywords)

In [692]:
# Inspect the keyword-id set selected for each topic.
topics_kw_sets

[{6002},
 {7519},
 {7401},
 {9011},
 {6266},
 {10512},
 {9011},
 {6919},
 {11460},
 {6910}]

In [688]:
# Step 4: for every keyword id of every topic, scan the inverted file and collect
# (tf, set of doc ids). The file is re-read once per keyword; each id is printed as progress.
tf_docs = get_topics_tf_docs(topics_kw_sets)

6002
7519
7401
9011
6266
10512
9011
6919
11460
6910


In [691]:
# Inspect the (tf, doc-id set) tuples gathered for the second topic.
tf_docs[1]

[(17, {'13499', '1813', '24857', '26583', '26869', '32896'})]

In [631]:
# Sanity check: tf-idf weight of the first keyword of the first topic.
# The original cell read `res`, a variable left over from an earlier kernel
# session that no cell defines; use the Step 4 result so the notebook survives
# Restart & Run All. The output recorded below predates this change.
tf, s = tf_docs[0][0]
df = get_df(s)
N = get_N()

tf_idf = get_tf_idf_w(tf, N, s)
tf_idf

43635.19807351551