In [1]:
import json
import jsonlines
import numpy as np
import pandas as pd
import pickle as pkl
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.functional as F
from functools import partial

In [2]:
# Raw in-links dump, one JSON record per line.
INLINKS_RAW_FILE_NAME = 'data/en_inlinks.json'

# Pickled intermediate frames (out-links and parsed in-links).
OUTLINKS_PROCESSED_FILE = 'data/en_outlinks.p'
INLINKS_PROCESSED_FILE = 'data/wiki_links_df.p'

# Pickled, fully pre-processed dataset.
FINAL_PROCESSED_FILE = 'data/en_all.p'

# Use the first GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
device

'cpu'

In [3]:
def get_wiki_links(file_name):
    """
    Load a JSON-lines dump of wiki pages into a DataFrame.

    @param file_name: path to a text file holding one JSON object per line.
    @return: DataFrame with the columns qid, title, rid, pid, inlinks and
        mid_level_categories (one row per non-blank line of the file).
    @raise KeyError: if a record lacks one of the expected keys.
    """
    # output column -> key in the raw record (only 'title' is renamed, from 'entitle')
    fields = {
        'qid': 'qid',
        'title': 'entitle',
        'rid': 'rid',
        'pid': 'pid',
        'inlinks': 'inlinks',
        'mid_level_categories': 'mid_level_categories',
    }

    rows = []
    # Read from the path that was passed in (the old code opened an undefined global FILE_NAME).
    with open(file_name) as file:
        for line in file:
            line = line.strip()
            if not line:
                # tolerate blank lines, e.g. a trailing newline at the end of the dump
                continue
            record = json.loads(line)
            rows.append({column: record[key] for column, key in fields.items()})

    # Passing the columns explicitly keeps the schema stable even for an empty file.
    return pd.DataFrame(rows, columns=list(fields))

In [4]:
# wiki_df = get_wiki_links(INLINKS_RAW_FILE_NAME)
# pkl.dump(wiki_df, open(INLINKS_PROCESSED_FILE, "wb"))

In [5]:
# load the dataframe from pickle file
# NOTE: unpickling can execute arbitrary code; only load files produced by this notebook.
# The context manager closes the file handle, which the bare open() call never did.
with open(INLINKS_PROCESSED_FILE, "rb") as inlinks_file:
    wiki_df = pkl.load(inlinks_file)

In [6]:
# Preview the first two rows of the loaded frame as a quick sanity check.
wiki_df.head(2)

Unnamed: 0,qid,title,rid,pid,inlinks,mid_level_categories
0,Q18022170,Edwin Brown (actor),853705188,43808614,"[[158976, 0], [187709, 0], [242465, 0], [36406...",[Culture.People]
1,Q28452584,A Merry Little Christmas (Matt Brouwer album),895288516,52174626,"[[14619462, 0], [27136451, 0], [35008514, 0], ...",[Culture.Music]


In [7]:
# https://en.wikipedia.org/wiki/Wikipedia:Namespace
# Namespaces currently in use on Wikipedia, keyed by numeric id.
# Subject namespaces use the even ids listed below.
subject_namespaces = dict([
    (0, 'article'),
    (2, 'user'),
    (4, 'wikipedia'),
    (6, 'file'),
    (8, 'media-wiki'),
    (10, 'template'),
    (12, 'help'),
    (14, 'category'),
    (100, 'portal'),
    (108, 'book'),
    (118, 'draft'),
    (710, 'timed-text'),
    (828, 'module'),
])

# Every talk namespace id is its subject namespace id + 1 and carries the same name.
talk_namespaces = {
    subject_id + 1: name for subject_id, name in subject_namespaces.items()
}

# ignore list (namespaces not used any longer)
ignore_namespaces = [446, 447, 2300, 2301, 2302, 2303]

In [8]:
def get_namespaces(tuples):
    """
    Collect the distinct second elements of the given pairs, in first-seen order.

    @param tuples: iterable of indexable pairs; element [1] is read as the namespace.
    @return: list of the distinct namespaces.
    """
    seen = []
    for pair in tuples:
        namespace = pair[1]
        if namespace in seen:
            continue
        seen.append(namespace)
    return seen

def expand_links(namespace, tuples):
    """
    Pick the first element of every pair whose second element equals the namespace.

    @param namespace: value compared against element [1] of each pair.
    @param tuples: iterable of indexable pairs.
    @return: list of the matching first elements, in input order.
    """
    return [pair[0] for pair in tuples if pair[1] == namespace]

In [9]:
# Record, per row, which namespaces its in-links belong to.
wiki_df['namespaces'] = wiki_df['inlinks'].apply(get_namespaces)

# Namespaces seen anywhere in the links, in first-seen order, skipping the ignore list.
unique_namespaces = []
for lst in wiki_df.namespaces:
    for item in lst:
        if item in ignore_namespaces or item in unique_namespaces:
            continue
        unique_namespaces.append(item)

# One new column per namespace: even ids are looked up in the subject table, odd ids in the talk table.
for item in unique_namespaces:
    is_subject = item % 2 == 0
    prefix = 'subject' if is_subject else 'talk'
    namespace = (subject_namespaces if is_subject else talk_namespaces)[item]
    column_name = '{}_{}'.format(prefix, namespace)
    wiki_df[column_name] = wiki_df['inlinks'].apply(lambda links: expand_links(item, links))

In [10]:
# prepare dataset for model
columns_to_model = ['qid', 'pid', 'mid_level_categories', 'subject_article']

# Keep only the modelling columns and rename the article-namespace links to pid_inlinks.
wiki_inlinks = wiki_df[columns_to_model].rename(
    columns={'subject_article': "pid_inlinks"}
)

In [11]:
# Load the pre-processed out-links frame.
# NOTE: unpickling can execute arbitrary code; only load files produced by this project.
# The context manager closes the file handle, which the bare open() call never did.
with open(OUTLINKS_PROCESSED_FILE, "rb") as outlinks_file:
    wiki_outlinks = pkl.load(outlinks_file)
wiki_outlinks = wiki_outlinks.rename(columns={'outlinks':"pid_outlinks"})

In [12]:
# Inner-join in-links and out-links on the Wikidata id, then drop the duplicate key column.
wiki_all = pd.merge(
    wiki_inlinks,
    wiki_outlinks,
    how='inner',
    left_on='qid',
    right_on='QID',
)
wiki_all = wiki_all.drop(columns=['QID'])

In [13]:
# Drop rows whose label list is empty; keep them aside for inspection.
mask = wiki_all.mid_level_categories.apply(lambda categories: len(categories) > 0)
wiki_missing_labels = wiki_all[~mask]
wiki_all = wiki_all[mask].reset_index(drop=True)
print("Entries left after filtering", wiki_all.shape, sep="\n")

print("Print Sample Entries from Wiki-Missing Label")
wiki_missing_labels.head(2)

Entries left after filtering
(99770, 5)
Print Sample Entries from Wiki-Missing Label


Unnamed: 0,qid,pid,mid_level_categories,pid_inlinks,pid_outlinks
111,Q6936272,51735433,[],[43730243],"[26773749, 39117688]"
132,Q7878232,35730129,[],[33943904],"[448609, 72243, 1098730, 48768, 1098730]"


In [14]:
# Keep only rows that have at least one inlink token AND at least one outlink token.
mask_1 = wiki_all.pid_inlinks.apply(lambda links: len(links) > 0)
mask_2 = wiki_all.pid_outlinks.apply(lambda links: len(links) > 0)
has_both = mask_1 & mask_2
wiki_missing_inlinks_and_outlinks = wiki_all[~has_both]
wiki_all = wiki_all[has_both].reset_index(drop=True)
print("Entries left after filtering", wiki_all.shape, sep="\n")
print("Print Sample Entries from Wiki-Missing Inlinks and Outlinks")
wiki_missing_inlinks_and_outlinks.head(2)

Entries left after filtering
(96873, 5)
Print Sample Entries from Wiki-Missing Inlinks and Outlinks


Unnamed: 0,qid,pid,mid_level_categories,pid_inlinks,pid_outlinks
54,Q1225863,32466306,[Geography.Europe],[],"[3463, 15940363]"
73,Q6400557,32924115,"[Culture.Philosophy and religion, Geography.Asia]",[],"[14533, 250724, 799384, 28849, 33101939, 79938..."


In [15]:
# Turn each row's list of categories into a multi-hot vector.
# The class order is available afterwards as mlb.classes_.
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
label_matrix = mlb.fit_transform(wiki_all.mid_level_categories)
# Store one row of the matrix per DataFrame row.
wiki_all["labels"] = list(label_matrix)
wiki_all.head(2)

Unnamed: 0,qid,pid,mid_level_categories,pid_inlinks,pid_outlinks,labels
0,Q18022170,43808614,[Culture.People],"[158976, 187709, 242465, 364061, 858518, 23705...","[31523, 35730609, 158976, 242465, 19101343]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
1,Q28452584,52174626,[Culture.Music],"[14619462, 27136451, 35008514, 30489911]","[14619462, 154403, 7572, 14619462, 27136451, 3...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ..."


In [18]:
# Cache the fully processed frame: reuse the pickle when it exists, otherwise create it from
# the frame built by the cells above. Delete the file to force a rebuild after changing the
# pre-processing.
import os

if os.path.exists(FINAL_PROCESSED_FILE):
    # NOTE: unpickling can execute arbitrary code; only load files produced by this notebook.
    with open(FINAL_PROCESSED_FILE, "rb") as processed_file:
        wiki_all = pkl.load(processed_file)
else:
    with open(FINAL_PROCESSED_FILE, "wb") as processed_file:
        pkl.dump(wiki_all, processed_file)

In [17]:
def train_validate_test_split(df, train_percent=.8, validate_percent=.1, seed=None):
    """
    Randomly partition the rows of df into train / validation / test frames.

    @param df: DataFrame to split; its index may hold any labels.
    @param train_percent: fraction of the rows that goes to the train split.
    @param validate_percent: fraction of the rows that goes to the validation split;
        whatever remains forms the test split.
    @param seed: value handed to np.random.seed (this re-seeds NumPy's global generator).
    @return: tuple (train, validate, test) of DataFrames that keep the original index labels.
    """
    np.random.seed(seed)
    m = len(df.index)
    # Shuffle row *positions*, not index labels: the result is consumed by .iloc, which is
    # purely positional. Permuting df.index only worked for a default 0..m-1 index.
    perm = np.random.permutation(m)
    train_end = int(train_percent * m)
    validate_end = int(validate_percent * m) + train_end
    train = df.iloc[perm[:train_end]]
    validate = df.iloc[perm[train_end:validate_end]]
    test = df.iloc[perm[validate_end:]]
    return train, validate, test

In [None]:
# train/val/test split (fixed seed), each part re-indexed from 0
wiki_train, wiki_valid, wiki_test = (
    part.reset_index(drop=True)
    for part in train_validate_test_split(wiki_all, seed=1)
)

In [None]:
# Building vocabulary
# Only the training split contributes tokens; every value found in its in-link and
# out-link lists is a candidate token.
vocab = list(set([y for x in list(wiki_train['pid_inlinks']) for y in x]))
vocab += list(set([y for x in list(wiki_train['pid_outlinks']) for y in x]))

#only get the unique tokens
# (a value can occur in both columns, so the concatenated list is de-duplicated again)
# NOTE(review): the order of `vocab` comes from set iteration order — keep this construction
# unchanged if the order in which tokens are later numbered matters.
vocab = list(set(vocab))
print("Vocab size is: {}".format(len(vocab)))

In [None]:
# Ids 0 and 1 are reserved for the padding and unknown tokens; vocabulary tokens follow
# in the order they appear in `vocab`.
word_to_index = {"<pad>": 0, "<unk>": 1}
for word in vocab:
    # setdefault leaves an already-known token untouched
    word_to_index.setdefault(word, len(word_to_index))

# Reverse lookup: index -> token.
index_to_word = {index: token for token, index in word_to_index.items()}

In [None]:
def tokenize_dataset(dataset, word_to_index):
    """
    Replace every link token of the dataset with its vocabulary index.

    @param dataset: object indexable by 'pid_inlinks' and 'pid_outlinks'; each column is
        iterated row by row and every row is an iterable of tokens.
    @param word_to_index: dict mapping token -> index; tokens missing from it are encoded
        with the index stored under '<unk>'.
    @return: tuple (encoded in-links, encoded out-links), each a list with one list of
        indices per row.
    """
    def encode_column(column_name):
        # Encode one column row by row, showing a progress bar.
        encoded_rows = []
        for tokens in tqdm(dataset[column_name]):
            encoded_rows.append([
                word_to_index[token] if token in word_to_index else word_to_index['<unk>']
                for token in tokens
            ])
        return encoded_rows

    inlinks_encoded = encode_column('pid_inlinks')
    outlinks_encoded = encode_column('pid_outlinks')
    return inlinks_encoded, outlinks_encoded

In [None]:
from torch.utils.data import Dataset, RandomSampler, SequentialSampler, DataLoader

class TensoredDataset(Dataset):
    """
    Dataset that stores, per sample, the in-link and out-link token ids, their lengths
    and the target vector as tensors.
    """
    def __init__(self, tokenized_inlinks, tokenized_outlinks ,targets):
        """
        @param tokenized_inlinks: sequence of in-link index lists, one per sample.
        @param tokenized_outlinks: sequence of out-link index lists, aligned with tokenized_inlinks.
        @param targets: sequence of target vectors, aligned with tokenized_inlinks.
        """
        self.inlink_tensors, self.outlink_tensors, self.target_tensors = [], [], []
        self.inlink_len, self.outlink_len = [], []

        for row, inlinks in enumerate(tokenized_inlinks):
            outlinks = tokenized_outlinks[row]
            self.inlink_tensors.append(torch.LongTensor(inlinks))
            self.outlink_tensors.append(torch.LongTensor(outlinks))
            self.target_tensors.append(torch.LongTensor(targets[row]))
            # lengths are kept as 1-element float tensors
            self.inlink_len.append(torch.FloatTensor([len(inlinks)]))
            self.outlink_len.append(torch.FloatTensor([len(outlinks)]))

    def __len__(self):
        """Number of samples."""
        return len(self.inlink_tensors)

    def __getitem__(self, idx):
        """Return (inlinks, inlink length, outlinks, outlink length, target) for sample idx."""
        return (
            self.inlink_tensors[idx],
            self.inlink_len[idx],
            self.outlink_tensors[idx],
            self.outlink_len[idx],
            self.target_tensors[idx],
        )

In [None]:
def pad_list_of_tensors(list_of_tensors, pad_token):
    """
    Right-pad every tensor to the longest length in the list and stack them row-wise.

    @param list_of_tensors: non-empty list of tensors; the size of their last dimension
        is treated as the length.
    @param pad_token: integer used to fill the missing positions.
    @return: tensor with one row per input tensor, each row as long as the longest input.
    """
    max_length = max(t.size(-1) for t in list_of_tensors)

    rows = []
    for t in list_of_tensors:
        missing = max_length - t.size(-1)
        filler = torch.LongTensor([[pad_token] * missing])
        # view the tensor as a single row and append the filler on its right
        rows.append(torch.cat([t.reshape(1, -1), filler], dim=-1))

    return torch.cat(rows, dim=0)

def pad_collate_fn(batch, word_to_index):
    """
    Collate a list of samples into padded batch tensors.

    @param batch: list of sample tuples laid out as
        (inlinks, inlink length, outlinks, outlink length, target).
    @param word_to_index: vocabulary dict; its '<pad>' entry is used as the padding id.
    @return: tuple (inlink_tensor, inlink_length_tensor, outlink_tensor,
        outlink_length_tensor, target_tensor).
    """
    def field(position):
        # Gather the element at `position` from every sample of the batch.
        return [sample[position] for sample in batch]

    pad_token = word_to_index['<pad>']

    # Link sequences differ in length, so they are padded; lengths and targets are stacked.
    inlink_tensor = pad_list_of_tensors(field(0), pad_token)
    inlink_length_tensor = torch.stack(field(1))
    outlink_tensor = pad_list_of_tensors(field(2), pad_token)
    outlink_length_tensor = torch.stack(field(3))
    target_tensor = torch.stack(field(4))

    return (
        inlink_tensor,
        inlink_length_tensor,
        outlink_tensor,
        outlink_length_tensor,
        target_tensor,
    )


In [None]:
# Encode all three splits with the same vocabulary (train first, then val, then test).
tokenized_splits = {
    split_name: tokenize_dataset(split_frame, word_to_index)
    for split_name, split_frame in (
        ("train", wiki_train),
        ("val", wiki_valid),
        ("test", wiki_test),
    )
}
wiki_tokenized_train_inlink, wiki_tokenized_train_outlink = tokenized_splits["train"]
wiki_tokenized_val_inlink, wiki_tokenized_val_outlink = tokenized_splits["val"]
wiki_tokenized_test_inlink, wiki_tokenized_test_outlink = tokenized_splits["test"]

In [None]:
# One TensoredDataset per split, keyed by split name.
wiki_tensor_dataset = {
    'train': TensoredDataset(
        wiki_tokenized_train_inlink, wiki_tokenized_train_outlink, list(wiki_train.labels)
    ),
    'val': TensoredDataset(
        wiki_tokenized_val_inlink, wiki_tokenized_val_outlink, list(wiki_valid.labels)
    ),
    'test': TensoredDataset(
        wiki_tokenized_test_inlink, wiki_tokenized_test_outlink, list(wiki_test.labels)
    ),
}

In [None]:
# Look at one training sample: (inlinks, inlink length, outlinks, outlink length, target).
wiki_tensor_dataset["train"][100]

In [None]:
# create dataloader
wiki_loaders = {}

batch_size = 32

# Every split is padded per batch by the same collate function.
collate = partial(pad_collate_fn, word_to_index=word_to_index)

for split, wiki_dataset in wiki_tensor_dataset.items():
    wiki_loaders[split] = DataLoader(
        wiki_dataset,
        batch_size=batch_size,
        # Reshuffle only the training data; validation and test are iterated in a fixed
        # order so evaluation passes are repeatable and predictions line up with the rows.
        shuffle=(split == 'train'),
        collate_fn=collate,
    )

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import OrderedDict

# c - num classifiers
# b - batch
# l - length (padded)

class BagOfWords(nn.Module):
    """
    BagOfWords encoder: averages the embeddings of two token sequences (both looked up in
    the same embedding table) and concatenates the two averages.
    """
    def __init__(self, vocab_size, dim_e, padding_idx=0):
        """
        @param vocab_size: size of the vocabulary.
        @param dim_e: size of the word embedding.
        @param padding_idx: index of the padding token; passed to nn.Embedding as padding_idx.
        """
        super(BagOfWords, self).__init__()
        # pay attention to padding_idx
        self.embed = nn.Embedding(vocab_size, dim_e, padding_idx=padding_idx)

    def _average(self, tokens, lengths):
        """Sum the embeddings over the sequence dimension and divide by the given lengths."""
        summed = torch.sum(self.embed(tokens), dim=-2)
        # Clamp to 1 so a zero length gives a zero vector instead of 0/0 = NaN.
        return summed / lengths.float().clamp(min=1)

    def forward(self, s1, l1, s2, l2):
        """
        Average the embeddings of each sequence and concatenate the two averages.

        @param s1: padded index matrix of the first sequence, (batch_size, max_length_1).
        @param l1: lengths of the first sequences; must broadcast against
            (batch_size, dim_e), e.g. shape (batch_size, 1).
        @param s2: padded index matrix of the second sequence, (batch_size, max_length_2).
        @param l2: lengths of the second sequences, same convention as l1.
        @return: tensor of size (batch_size, 2 * dim_e).
        """
        out1 = self._average(s1, l1)
        out2 = self._average(s2, l2)
        return torch.cat([out1, out2], dim=-1)

    
def FeedForward(in_features, mid_features, out_features=44, num_layers=1, activation=nn.ReLU(), dropout_rate=0.2):
    """
    Function that creates sequential model (nn.Module) with specified number of layers.
    If 1 layer, returns linear model.

    @param in_features: size of the input vector.
    @param mid_features: width of every hidden layer.
    @param out_features: size of the output vector.
    @param num_layers: number of linear layers; 1 returns a bare nn.Linear.
    @param activation: activation module placed in front of every linear layer but the first.
    @param dropout_rate: dropout probability; dropout is only inserted in front of the
        hidden-to-hidden linear layers, so it has no effect for num_layers <= 2.
    @return: nn.Linear (num_layers == 1) or nn.Sequential.
    """
    if num_layers == 1:
        return nn.Linear(in_features, out_features)

    layers = [nn.Linear(in_features, mid_features)]
    # Build a fresh Linear/Dropout per hidden block. Multiplying a list of modules repeats
    # the *same* instances, which silently tied the weights of all hidden layers.
    for _ in range(max(0, num_layers - 2)):
        layers += [activation, nn.Dropout(dropout_rate), nn.Linear(mid_features, mid_features)]
    layers += [activation, nn.Linear(mid_features, out_features)]
    return nn.Sequential(*layers)


class LinkModel(nn.Module):
    """
    Classifier over two token sequences: their averaged embeddings (BagOfWords) are fed
    through the output network built by FeedForward.
    """
    def __init__(self, options):
        """
        @param options: dict with the keys "VOCAB_SIZE", "dim_e", "mid_features",
            "num_classes", "num_layers", "dropout_rate" and "activation".
        """
        super().__init__()

        embedding_dim = options["dim_e"]
        self.layer_bag_of_words = BagOfWords(options["VOCAB_SIZE"], embedding_dim)

        # The encoder concatenates two averaged embeddings, hence twice the embedding size.
        head_kwargs = {
            "in_features": embedding_dim * 2,
            "mid_features": options["mid_features"],
            "out_features": options["num_classes"],
            "num_layers": options["num_layers"],
            "dropout_rate": options["dropout_rate"],
            "activation": options["activation"],
        }
        self.layer_out = FeedForward(**head_kwargs)

    def forward(self, s1, l1, s2, l2):
        """Encode both sequences and return the output of layer_out."""
        return self.layer_out(self.layer_bag_of_words(s1, l1, s2, l2))
    

In [None]:
# Hyper-parameters consumed by LinkModel.
options = dict(
    VOCAB_SIZE=len(index_to_word),    # includes the <pad> and <unk> entries
    dim_e=32,
    num_layers=2,
    num_classes=len(mlb.classes_),    # one output per binarized label
    mid_features=128,
    dropout_rate=0.15,
    activation=nn.ReLU(),
)
model = LinkModel(options)

model

In [None]:
# `device` is "cpu" whenever CUDA is unavailable, in which case .to(device) leaves the model as is.
model = model.to(device)

# Criterion and Optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)

In [None]:
import warnings
warnings.filterwarnings('ignore')

from utils import test_model

best_val_f1_micro = 0
num_epochs = 20
# Number of training batches between two progress reports / validation passes.
log_every = 300
num_train_batches = len(wiki_loaders["train"])

for epoch in range(num_epochs):
    runnin_loss = 0.0
    for i, (data_1, length_1, data_2, length_2, labels) in enumerate(wiki_loaders["train"]):
        # Re-enter training mode on every step.
        # NOTE(review): presumably test_model switches the model to eval mode — confirm in utils.
        model.train()
        inlink_batch = data_1.to(device)
        inlink_length_batch = length_1.to(device)
        outlink_batch = data_2.to(device)
        outlink_length_batch = length_2.to(device)
        # BCEWithLogitsLoss needs float targets
        label_batch = labels.float().to(device)

        optimizer.zero_grad()
        outputs = model(inlink_batch, inlink_length_batch, outlink_batch, outlink_length_batch)
        loss = criterion(outputs, label_batch)
        loss.backward()
        optimizer.step()

        runnin_loss += loss.item()
        #torch.nn.utils.clip_grad_norm(model.parameters(), 10)

        # report the running loss and validate every `log_every` iterations
        if i > 0 and i % log_every == 0:
            # i is zero-based: i + 1 batch losses have been accumulated so far
            print('Epoch: [{}/{}], Step: [{}/{}], Train_loss: {}'.format(
                epoch+1, num_epochs, i+1, num_train_batches, runnin_loss / (i + 1)))

            metrics_dict = test_model(wiki_loaders["val"], model, device=device)
            print("Precision macro: {}, Recall macro: {}, F1 macro: {} ".format(
                metrics_dict["precision_macro"], metrics_dict["recall_macro"], metrics_dict["f1_macro"]
            ))
            print("Precision micro: {}, Recall micro: {}, F1 micro: {} ".format(
                metrics_dict["precision_micro"], metrics_dict["recall_micro"], metrics_dict["f1_micro"]
            ))
            # keep track of the best validation score seen so far
            best_val_f1_micro = max(best_val_f1_micro, metrics_dict["f1_micro"])

In [None]:
# Results just based on in-links
# options = {
#     "VOCAB_SIZE": len(index_to_word),
#     "dim_e": 128,
#     "num_layers": 2,
#     "num_classes": len(mlb.classes_),
#     "mid_features": 256,
#     "dropout_rate": 0.15,
#     "activation": nn.LeakyReLU(negative_slope=0.01)
# }

# Precision macro: 0.5473753851993785, Recall macro: 0.32252769088069094, F1 macro: 0.3732075444511734 
# Precision micro: 0.5513415892672858, Recall micro: 0.5107857783089333, F1 micro: 0.5302894010360123 