In [1]:
import pandas as pd
import json
import re
import ast 
import mwparserfromhell
import pickle as pkl
from bz2 import BZ2File as bzopen

In [7]:
# File locations used by this notebook.
# Inputs:
#   WIKI_TEXT_FILE - wikitext sample, parsed line by line as JSON by getWikiOutlinks().
#   MAPPING_FILE   - bz2-compressed TSV read by getTitlePageIdDictionary().
WIKI_TEXT_FILE = "data/sample_1pct_wikitext.json"
MAPPING_FILE = "data/title_pid_afterredirect.tsv.bz2"
# Pickle files:
#   TEXT_OUTPUT_FILE_OUTLINKS - DataFrame of raw outlinks, before clean-up.
#   TEXT_OUTLINKS             - DataFrame after outlinks were mapped to page IDs.
TEXT_OUTPUT_FILE_OUTLINKS = "data/wiki_outlinks_df.p"
TEXT_OUTLINKS = "data/en_outlinks.p"

In [3]:
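# Unused alternative: load the title/page-ID mapping with pandas instead of getTitlePageIdDictionary().
# title_pageId_map = pd.read_csv('data/title_pid_afterredirect.tsv.bz2', delimiter="\t", header=None)
# title_pageId_map.columns= ['Title', 'PageId']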

In [4]:
## Data loading
def getTitlePageIdDictionary(fileName):
    """Build a title -> page-ID lookup from a bz2-compressed TSV file.

    Every line is expected to hold exactly two tab-separated fields. The first
    field becomes the key after its first character is lower-cased (via
    lowerFirstCharacter); the second field is stored unchanged as a string.
    When a key occurs more than once, the last occurrence wins.

    Lines that do not split into exactly two fields are printed and skipped.

    Args:
        fileName: Path to the bz2-compressed file; its content is decoded as UTF-8.

    Returns:
        dict mapping each normalized title to its page-ID string.
    """
    titlePageId = {}
    with bzopen(fileName, "r") as f:
        # Stream the archive line by line rather than holding the raw lines
        # and a decoded copy of the whole file in memory at once.
        for raw_line in f:
            # strip() also removes leading/trailing tabs, so a line whose first
            # or last field is empty collapses to one field and is reported below.
            line = raw_line.decode('utf8').strip()
            try:
                key, value = line.split('\t')
            except ValueError:
                # Wrong number of fields: report the offending line and move on.
                print(line)
                continue
            titlePageId[lowerFirstCharacter(key)] = value
    return titlePageId

# Build a DataFrame with one row per input record: its QID and its outgoing wikilinks.
def getWikiOutlinks(file_name):
    """Extract the wikilinks of every record in a JSON-lines file.

    Each non-blank line must be a JSON object with the keys 'QID' and
    'wikitext'. The wikitext is parsed with mwparserfromhell and the list
    returned by filter_wikilinks() is stored in its string form (it is turned
    back into a list later by toList).

    Args:
        file_name: Path to the JSON-lines input file, read as UTF-8.

    Returns:
        pandas.DataFrame with the columns 'QID' and 'outlinks'.
    """
    wiki_rows = []
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(file_name, encoding="utf-8") as file:
        for line in file:
            line = line.strip()
            if not line:
                # Tolerate blank lines instead of failing inside json.loads.
                continue
            record = json.loads(line)
            wikilinks = mwparserfromhell.parse(record['wikitext']).filter_wikilinks()
            wiki_rows.append({
                'QID': record['QID'],
                'outlinks': str(wikilinks),
            })
    # Explicit columns keep the schema stable even when the input has no rows.
    return pd.DataFrame(wiki_rows, columns=['QID', 'outlinks'])

## Data pre-processing and clean-up 
def lowerFirstCharacter(val):
    """Return str(val) with only its first character lower-cased.

    An empty string stays empty; everything after the first character is
    returned untouched.
    """
    text = str(val)
    # Slicing is safe on empty and one-character strings, so no length checks are needed.
    head, tail = text[:1], text[1:]
    return head.lower() + tail
    

def lowerFirstCharacterTokens(tokens):
    """Apply lowerFirstCharacter to every token and return the results as a new list."""
    return [lowerFirstCharacter(token) for token in tokens]

def removeOutlinksWithNamespaces(tokens):
    """Drop wikilinks whose target contains a colon (e.g. ``[[Category:X]]``).

    A link is discarded when a colon appears after ``[[`` and before the first
    ``|`` or ``]``, i.e. inside the link target. A colon that occurs only in
    the display label, as in ``[[Foo|Bar: baz]]``, is not treated as a
    namespace separator and the link is kept.

    Args:
        tokens: Iterable of wikilink strings such as ``"[[Title|label]]"``.

    Returns:
        list of the links that were kept, in their original order.
    """
    # [^\]|]* stops at the first '|' or ']' so the label is never inspected.
    namespaced_target = re.compile(r"\[\[[^\]|]*:")
    return [item for item in tokens if not namespaced_target.search(item)]

def getOutlinkTitle(tokens):
    """Reduce each wikilink string to its target title.

    For a piped link (``[[Title|label]]``) the text up to the first ``|`` is
    kept, with ``[`` and ``|`` removed. For any other item all ``[`` and ``]``
    characters are removed.

    Returns:
        list of titles, one per input token, in the same order.
    """
    piped_prefix = re.compile(r"(\[\[.*?\|)")
    titles = []
    for link in tokens:
        found = piped_prefix.search(link)
        if found is None:
            # No pipe: the whole link is the title once the brackets are gone.
            titles.append(link.replace("[", "").replace("]", ""))
            continue
        titles.append(found.group(1).replace("[", "").replace("|", ""))
    return titles

def replaceSpace(tokens):
    """Return a new list in which every space in every token is replaced by an underscore."""
    return [token.replace(" ", "_") for token in tokens]
    
def toList(sent):
    """Evaluate the string form of a Python literal with ast.literal_eval and return the value.

    In this notebook it turns the stringified outlinks column back into lists.
    """
    parsed = ast.literal_eval(sent)
    return parsed

def mapPageId(tokens, titleToPageId=None):
    """Translate titles into page IDs, using "" for titles that are not found.

    Args:
        tokens: Iterable of normalized titles.
        titleToPageId: Optional dict-like lookup from title to page ID. When
            omitted, the notebook-level ``titlePageIdMap`` is used, so existing
            one-argument calls such as ``Series.apply(mapPageId)`` keep working.

    Returns:
        list with one entry per token, in the same order: the looked-up value,
        or the empty string when the title is missing from the lookup.
    """
    if titleToPageId is None:
        # Fall back to the global built by getTitlePageIdDictionary().
        titleToPageId = titlePageIdMap
    return [titleToPageId.get(item, "") for item in tokens]

In [5]:
# Load the raw outlinks DataFrame from its pickle cache. If the cache file does
# not exist yet, rebuild it from WIKI_TEXT_FILE and write the cache, so the
# notebook also runs from a fresh checkout.
# NOTE: pickle.load can execute arbitrary code; only load files produced by this notebook.
try:
    with open(TEXT_OUTPUT_FILE_OUTLINKS, "rb") as cache_file:
        wiki_df = pkl.load(cache_file)
except FileNotFoundError:
    wiki_df = getWikiOutlinks(WIKI_TEXT_FILE)
    with open(TEXT_OUTPUT_FILE_OUTLINKS, "wb") as cache_file:
        pkl.dump(wiki_df, cache_file)

In [6]:
# Build the title -> page-ID lookup that mapPageId() reads as a global.
# Any line the loader cannot split into two tab-separated fields is printed as cell output.
titlePageIdMap = getTitlePageIdDictionary(MAPPING_FILE)

238775


In [7]:
# Turn each row's stringified link list into a list of normalized titles.
wiki_df["outlinks"] = (
    wiki_df["outlinks"]
    .apply(toList)                        # string form -> list of link strings
    .apply(removeOutlinksWithNamespaces)  # drop links that contain a colon
    .apply(getOutlinkTitle)               # keep only the link target
    .apply(replaceSpace)                  # spaces -> underscores
    .apply(lowerFirstCharacterTokens)     # lower-case the first character
)

In [8]:
# Replace every title with its page ID; titles missing from titlePageIdMap become "".
wiki_df["outlinks"] = wiki_df["outlinks"].apply(mapPageId)

Unnamed: 0,QID,outlinks
0,Q2000864,"[3637937, 228776, 804778, 3747, 1687318, 19261..."
1,Q1064113,"[18402, 701, 787824, 10634933, 3910102, 468072..."
2,Q6941060,"[37585, 1019536, 5058739, 18999, 37283486, 283..."
3,Q843920,"[4239514, 37589, 17730, 24096, 24096, 7500259,..."
4,Q178999,"[21120, 156998, 1306158, 859926, 2435371, 5634..."
...,...,...
103179,Q65030822,"[12067, 548981, , 7718199, 6643338, 54754514, ..."
103180,Q65030843,"[44298710, 27862, 44298710, 113362, 187504]"
103181,Q64969554,"[47863145, 1780651, 30944026, 559753, 775859, ..."
103182,Q21190193,"[14010835, 1427462, 2275, 63429, 15032, 7293, ..."


In [8]:
# Persist the mapped DataFrame, then read it back from disk so later cells use
# the reloaded copy. Context managers make sure the file is flushed and closed
# before it is reopened for reading.
with open(TEXT_OUTLINKS, "wb") as out_file:
    pkl.dump(wiki_df, out_file)
with open(TEXT_OUTLINKS, "rb") as in_file:
    wiki_df = pkl.load(in_file)

In [9]:
# Show the final DataFrame (QID plus mapped outlinks) as the cell output.
wiki_df

Unnamed: 0,QID,outlinks
0,Q2000864,"[3637937, 228776, 804778, 3747, 1687318, 19261..."
1,Q1064113,"[18402, 701, 787824, 10634933, 3910102, 468072..."
2,Q6941060,"[37585, 1019536, 5058739, 18999, 37283486, 283..."
3,Q843920,"[4239514, 37589, 17730, 24096, 24096, 7500259,..."
4,Q178999,"[21120, 156998, 1306158, 859926, 2435371, 5634..."
...,...,...
103179,Q65030822,"[12067, 548981, , 7718199, 6643338, 54754514, ..."
103180,Q65030843,"[44298710, 27862, 44298710, 113362, 187504]"
103181,Q64969554,"[47863145, 1780651, 30944026, 559753, 775859, ..."
103182,Q21190193,"[14010835, 1427462, 2275, 63429, 15032, 7293, ..."
