# This script explores the Wikimarks dataset and creates a pairwise version of it.

In [1]:
import json
import pandas as pd
import numpy as np
from collections import Counter
from data.DataProcessor import get_section_tree, get_distance
import itertools
import plotly.express as px

# Root directory of the local wikimarks data; the qrels path used below is appended to it.
wikimark_path = 'D://wikimarks_data//en-wiki-01012022'

In [2]:
train_hier_qrels_path = 'benchmarks//good-articles//good-articles.train//train.pages.cbor-hierarchical.qrels'

# train_qrels: page id -> {paragraph id: section label}.
# Each qrels line is split on single spaces; the first field is the section
# label (its part before the first '/' is the page id) and the third field
# is the paragraph id.
train_qrels = {}
with open(wikimark_path + '//' + train_hier_qrels_path, 'r') as qrels_file:
    for line in qrels_file:
        fields = line.split(' ')
        label = fields[0]
        page = label.split('/')[0]
        para = fields[2]
        train_qrels.setdefault(page, {})[para] = label

# train_section_counts: page id -> Counter of how many paragraphs carry each
# section label on that page.
train_section_counts = {
    page_id: Counter(para_labels.values())
    for page_id, para_labels in train_qrels.items()
}

In [3]:
def print_train_stats(train_qrels):
    """Print page, paragraph and section statistics for a qrels mapping.

    Args:
        train_qrels: dict mapping page id -> {paragraph id: section label}.

    The per-page section counts are derived from ``train_qrels`` itself (the
    number of distinct section labels on each page) rather than from a
    module-level global, so the function reports consistent numbers for any
    mapping passed in.

    For an empty mapping only the page and paragraph totals are printed,
    because mean/std/max/min are undefined there (``np.max`` and ``np.min``
    raise ``ValueError`` on empty input).
    """
    print('No of pages: %d' % len(train_qrels))
    num_paras = np.array([len(paras) for paras in train_qrels.values()])
    print('Total paras: %d' % np.sum(num_paras))
    if num_paras.size == 0:
        return
    print('Mean paras per page: %.2f, std: %.2f, max: %d, min: %d' % (
        np.mean(num_paras), np.std(num_paras), np.max(num_paras), np.min(num_paras)))
    # Distinct section labels per page, computed from the argument.
    num_sections = np.array([len(set(paras.values())) for paras in train_qrels.values()])
    print('Mean sections per page: %.2f, std: %.2f, max: %d, min: %d' % (
        np.mean(num_sections), np.std(num_sections), np.max(num_sections), np.min(num_sections)))
# Report the statistics for the training qrels loaded above.
print_train_stats(train_qrels)

No of pages: 17357
Total paras: 415035
Mean paras per page: 23.91, std: 19.53, max: 289, min: 3
Mean sections per page: 8.80, std: 6.17, max: 73, min: 2


In [4]:
# One section tree per page, built by get_section_tree from the page's
# distinct section labels.
train_qrels_section_trees = {}
for page, para_labels in train_qrels.items():
    sections = list(set(para_labels.values()))
    train_qrels_section_trees[page] = get_section_tree(sections)

# For every page, the get_distance value of each unordered paragraph pair,
# evaluated on the two paragraphs' section labels and the page's tree.
train_qrels_distances = {}
for page, para_labels in train_qrels.items():
    tree = train_qrels_section_trees[page]
    train_qrels_distances[page] = [
        get_distance(para_labels[first], para_labels[second], tree)
        for first, second in itertools.combinations(para_labels, 2)
    ]

In [7]:
# Plot a histogram of the pairwise distances of a single page.
pagelist = list(train_qrels.keys())
i = 3  # position in pagelist of the page to inspect; pagelist[i] is read again by the next cell
pagedist = np.array(train_qrels_distances[pagelist[i]])
fig = px.histogram(pagedist)
fig.show()

In [11]:
# Inspect the selected page: its id, number of paragraphs, number of
# unordered paragraph pairs, and the section label of every paragraph.
selected_page = pagelist[i]
print(selected_page)
paras = train_qrels[selected_page]
print(len(paras))
pairs = list(itertools.combinations(paras, 2))
print(len(pairs))
for section_label in paras.values():
    print(section_label)

enwiki:%C3%81lvaro%20Betancourt
13
78
enwiki:%C3%81lvaro%20Betancourt
enwiki:%C3%81lvaro%20Betancourt
enwiki:%C3%81lvaro%20Betancourt
enwiki:%C3%81lvaro%20Betancourt/Career/College
enwiki:%C3%81lvaro%20Betancourt/Career/Early%20career%20in%20Puerto%20Rico
enwiki:%C3%81lvaro%20Betancourt/Career/Early%20career%20in%20Puerto%20Rico
enwiki:%C3%81lvaro%20Betancourt/Career/Return%20from%20college
enwiki:%C3%81lvaro%20Betancourt/Career/Return%20from%20college
enwiki:%C3%81lvaro%20Betancourt/Early%20life
enwiki:%C3%81lvaro%20Betancourt/International%20career
enwiki:%C3%81lvaro%20Betancourt/International%20career
enwiki:%C3%81lvaro%20Betancourt/International%20career/Senior%20career
enwiki:%C3%81lvaro%20Betancourt/International%20career/Senior%20career
