- Builds the features for nodes and edges
- 3 variations of node features, listed below
- Changed from OR-ing word encodings to adding word encodings

In [145]:
# Generate the OR of the one-hot features of each word in the title
import re
import pickle
from collections import OrderedDict
import numpy as np

In [146]:
INPUT_FILEPATH = '../data/animals-D3-small-30K-nodes40-edges202-max10-minout2-minin3.pkl'
# Which node-feature variant to generate:
#   0: title encoding only
#   1: title encoding plus the encodings of all links, summed into one vector ("soup")
#   2 (or any other value): title and link encodings concatenated, zero-padded
NODE_VARIATION = 2
# Each variant is written to its own output file; anything other than 0 or 1
# falls back to the "concat" file.
OUTPUT_FILEPATH = {
    0: '../data/animals-D3-small-30K-nodes40-edges202-max10-minout2-minin3_w_features_title.pkl',
    1: '../data/animals-D3-small-30K-nodes40-edges202-max10-minout2-minin3_w_features_soup.pkl',
}.get(
    NODE_VARIATION,
    '../data/animals-D3-small-30K-nodes40-edges202-max10-minout2-minin3_w_features_concat.pkl',
)

In [147]:
class Page:
    """A single page (graph node) together with its links and feature slots."""

    def __init__(self, text, links):
        """Store the page text and its out-links; every other field starts unset.

        Args:
            text: Text of the page.
            links: Out-links of the page.
        """
        # Attributes are created in a fixed order: text, links, in_links,
        # indx, feats, links_feats.
        self.text, self.links = text, links
        # Starts empty; a fresh list per instance.
        self.in_links = []
        # indx: position relative to the ordered dict holding the pages.
        # feats: node features, filled in by the add_node_feats_* functions.
        # links_feats: per-link features, filled in by add_edge_feats.
        self.indx = self.feats = self.links_feats = None

In [148]:
# Load the pickled wiki dictionary (title -> page object) from INPUT_FILEPATH
# and wrap it in an OrderedDict.
# NOTE(review): presumably the pickled values are Page instances, in which
# case Page must already be defined when this runs — confirm.
with open(INPUT_FILEPATH, mode='rb') as f:
    pages = OrderedDict(pickle.load(f))

In [149]:
def replace_chars(title):
    """Return *title* with every '(' and ')' character removed.

    Only the parenthesis characters themselves are dropped; the text between
    them is kept.

    Args:
        title: The title string to clean.

    Returns:
        The title without any parentheses.
    """
    return re.sub(r"[()]", '', title)

In [150]:
def create_dict(titles):
    """Collect the distinct words that occur in the given titles.

    Parentheses are stripped from each title (via replace_chars) before it is
    split on whitespace.

    Args:
        titles: Iterable of title strings.

    Returns:
        A set containing every word found in any title.
    """
    return {
        word
        for title in titles
        for word in replace_chars(title).split()
    }

In [151]:
# The page titles (the keys of `pages`) supply the vocabulary for the encodings.
titles = list(pages)
words_set = create_dict(titles)

In [152]:
def one_hot_encode(words_set):
    """Build a one-hot vector for every word in the vocabulary.

    Prints the vocabulary size as a side effect.

    Args:
        words_set: Collection of words (strings) making up the vocabulary.

    Returns:
        A dict mapping each word to its own integer NumPy array of length
        ``len(words_set)`` holding a single 1 at the word's index.
    """
    # Sort so each word always receives the same index: plain set iteration
    # order for strings can differ from one interpreter run to the next.
    vocabulary = sorted(words_set)
    num_words = len(vocabulary)
    print('Dict length: {}'.format(num_words))
    dic = {}
    for i, word in enumerate(vocabulary):
        # The builtin `int` replaces the `np.int` alias, which NumPy removed.
        encoding = np.zeros(shape=(num_words,), dtype=int)
        encoding[i] = 1
        dic[word] = encoding
    return dic

In [153]:
# Word -> one-hot vector lookup built from the title vocabulary.
encoding_dic = one_hot_encode(words_set)

Dict length: 42


In [154]:
def get_encoding(title, dic):
    """Encode a title as the element-wise OR of its words' one-hot vectors.

    Parentheses are stripped from the title (via replace_chars) and it is
    split on whitespace; the vectors of the resulting words are OR-ed
    together, so a word that occurs more than once still contributes a
    single 1 rather than a count.

    Args:
        title: Title string to encode.
        dic: Mapping from word to its one-hot NumPy vector; every vector is
            expected to have length ``len(dic)``.

    Returns:
        An integer NumPy array of length ``len(dic)`` containing 0s and 1s.

    Raises:
        KeyError: If the title contains a word that is not a key of *dic*.
    """
    # All zeros is the identity element for OR.
    # The builtin `int` replaces the `np.int` alias, which NumPy removed.
    title_encode = np.zeros(shape=(len(dic),), dtype=int)
    for word in replace_chars(title).split():
        # logical_or yields booleans; cast back to 0/1 integers.
        title_encode = np.logical_or(title_encode, dic[word]).astype(int)
    return title_encode

In [155]:
def add_node_feats_title_only(pages, dic):
    """Set each page's ``feats`` to the encoding of its own title.

    Args:
        pages: Mapping from title to page object; modified in place.
        dic: Word -> one-hot vector mapping passed through to get_encoding.
    """
    for title, page in pages.items():
        page.feats = get_encoding(title, dic)

In [156]:
def add_node_feats_soup(pages, dic):
    """Set each page's ``feats`` to its title encoding plus all link encodings.

    The encodings of the page's out-links are added onto the title encoding,
    so the resulting entries can exceed 1.

    Args:
        pages: Mapping from title to page object; modified in place.
        dic: Word -> one-hot vector mapping passed through to get_encoding.
    """
    for title, page in pages.items():
        # Start from the title's own encoding ...
        page.feats = get_encoding(title, dic)
        # ... then accumulate the encoding of every out-link on top of it.
        for link_title in page.links:
            page.feats += get_encoding(link_title, dic)

In [157]:
def get_max_num_links(pages):
    """Return the largest number of out-links held by any single page.

    Args:
        pages: Mapping whose values expose a sized ``links`` attribute.

    Returns:
        The maximum ``len(page.links)`` over all pages, or ``float('-inf')``
        when *pages* is empty.
    """
    link_counts = (len(page.links) for page in pages.values())
    return max(link_counts, default=float('-inf'))

In [158]:
def add_node_feats_concat(pages, dic):
    """Set each page's ``feats`` to its title and link encodings, concatenated.

    The title encoding comes first, followed by one encoding per out-link.
    Every vector is zero-padded on the right to a common length: one encoding
    for the title plus one for each link of the page with the most links.

    Args:
        pages: Mapping from title to page object; modified in place.
        dic: Word -> one-hot vector mapping passed through to get_encoding.
    """
    code_len = len(dic)
    # Room for the title plus as many links as the most-linked page has.
    padded_len = get_max_num_links(pages) * code_len + code_len
    for title, page in pages.items():
        names = [title] + list(page.links)
        packed = np.concatenate([get_encoding(name, dic) for name in names])
        # np.zeros defaults to a float array; unused trailing slots stay 0.
        padded = np.zeros(shape=(padded_len,))
        padded[:packed.shape[0]] = packed
        page.feats = padded
        # Sanity check: padding must not change the content.
        assert packed.sum() == padded.sum()

In [159]:
# Run the node-feature builder selected by NODE_VARIATION
# (0: title only, 1: soup, anything else: concat).
{
    0: add_node_feats_title_only,
    1: add_node_feats_soup,
}.get(NODE_VARIATION, add_node_feats_concat)(pages, encoding_dic)

In [160]:
def add_edge_feats(pages, dic):
    """Attach to every page a dict holding the encoding of each of its links.

    Args:
        pages: Mapping from title to page object; each page's ``links_feats``
            is replaced by a new ``{link: encoding}`` dict.
        dic: Word -> one-hot vector mapping passed through to get_encoding.
    """
    for page in pages.values():
        page.links_feats = {
            link: get_encoding(link, dic) for link in page.links
        }

In [161]:
# Fill in links_feats (link title -> encoding) on every page.
add_edge_feats(pages, encoding_dic)

In [162]:
def print_encodings(dic):
    """Print every entry of *dic* on its own line as ``key: value``.

    Args:
        dic: Mapping to display, e.g. the word -> encoding dictionary.
    """
    for word, encoding in dic.items():
        print('{}: {}'.format(word, encoding))
# print_encodings(encoding_dic)

In [163]:
# for title, obj in list(pages.items()):
#     for link in obj.links:
#         print('{}: {}'.format(link, obj.links_feats[link]))

In [164]:
# for title, obj in list(pages.items()):
#     print('{}: {}'.format(title, obj.feats))

In [165]:
# Persist the pages, now carrying node (feats) and edge (links_feats)
# features, to the output file chosen for the selected NODE_VARIATION.
with open(OUTPUT_FILEPATH, 'wb') as f:
    pickle.dump(pages, f)