# This notebook explores the Wikimarks dataset and creates a pairwise version of it.

In [15]:
import itertools
import json
import os
from collections import Counter

import numpy as np
import pandas as pd
import plotly.express as px

from data.DataProcessor import get_section_tree, get_distance

# Root directory of the Wikimarks dump. Set the WIKIMARK_PATH environment
# variable to point at a local copy; when it is unset the original
# machine-specific default below is used.
wikimark_path = os.environ.get('WIKIMARK_PATH', 'D://wikimarks_data//en-wiki-01012022')

In [16]:
def get_qrels(qrels_path):
    """Load a qrels file into a nested ``{page: {paragraph: label}}`` dict.

    Every non-blank line is split on whitespace. The first field is the
    section label, the third field is the paragraph id, and the page id is
    the part of the label before its first ``/``.

    Args:
        qrels_path: Path of the qrels text file to read.

    Returns:
        dict mapping each page id to a dict that maps paragraph id to the
        full section label of that paragraph. If a paragraph id occurs more
        than once for a page, the last line wins.

    Raises:
        IndexError: If a non-blank line has fewer than three fields.
    """
    qrels = {}
    with open(qrels_path, 'r') as f:
        for line in f:
            # Whitespace splitting tolerates tabs / repeated spaces and strips
            # the trailing newline, so a three-field line yields a clean id.
            fields = line.split()
            if not fields:
                # Blank (or whitespace-only) line, e.g. at the end of the file.
                continue
            label = fields[0]
            para = fields[2]
            page = label.split('/')[0]
            qrels.setdefault(page, {})[para] = label
    return qrels

In [17]:
# Load the hierarchical qrels of the good-articles benchmark: train split first, then test.
train_hier_qrels_path = wikimark_path + '//benchmarks//good-articles//good-articles.train//train.pages.cbor-hierarchical.qrels'
train_qrels = get_qrels(train_hier_qrels_path)
test_hier_qrels_path = wikimark_path + '//benchmarks//good-articles//good-articles.test//test.pages.cbor-hierarchical.qrels'
test_qrels = get_qrels(test_hier_qrels_path)

# For every training page, count how many paragraphs carry each section label.
train_section_counts = {}
for p in train_qrels:
    train_section_counts[p] = Counter(train_qrels[p].values())

In [18]:
def print_qrels_stats(qrels, section_counts):
    """Print page, paragraph and section statistics for one qrels split.

    Args:
        qrels: Mapping of page id to a sized collection of that page's
            paragraphs; only the sizes are used.
        section_counts: Mapping of page id to an iterable with one entry per
            distinct section of that page; only its length is used.

    Returns:
        None. Four summary lines are written to stdout.
    """
    print('No of pages: %d' % len(qrels))

    paras_per_page = np.array([len(page_paras) for page_paras in qrels.values()])
    print('Total paras: %d' % paras_per_page.sum())
    para_stats = (
        paras_per_page.mean(),
        paras_per_page.std(),
        paras_per_page.max(),
        paras_per_page.min(),
    )
    print('Mean paras per page: %.2f, std: %.2f, max: %d, min: %d' % para_stats)

    sections_per_page = np.array(
        [len(list(page_sections)) for page_sections in section_counts.values()]
    )
    section_stats = (
        sections_per_page.mean(),
        sections_per_page.std(),
        sections_per_page.max(),
        sections_per_page.min(),
    )
    print('Mean sections per page: %.2f, std: %.2f, max: %d, min: %d' % section_stats)
# Summarize the training split: page count, paragraphs per page, sections per page.
print_qrels_stats(train_qrels, train_section_counts)

No of pages: 17357
Total paras: 415035
Mean paras per page: 23.91, std: 19.53, max: 289, min: 3
Mean sections per page: 8.80, std: 6.17, max: 73, min: 2


In [19]:
def get_qrels_distances(qrels):
    """Compute, per page, the section distance of every unordered paragraph pair.

    Args:
        qrels: Mapping of page id to ``{paragraph id: section label}``.

    Returns:
        dict mapping each page id to a list holding one ``get_distance``
        result per paragraph pair, in ``itertools.combinations`` order.
    """
    # First pass: build one section tree per page from its distinct labels.
    section_trees = {
        page: get_section_tree(list(set(para_to_section.values())))
        for page, para_to_section in qrels.items()
    }

    # Second pass: score every paragraph pair against the page's tree.
    # NOTE(review): get_distance comes from data.DataProcessor; presumably it
    # is a tree distance between the two section labels -- confirm there.
    distances_by_page = {}
    for page, para_to_section in qrels.items():
        tree = section_trees[page]
        distances_by_page[page] = [
            get_distance(para_to_section[first], para_to_section[second], tree)
            for first, second in itertools.combinations(list(para_to_section), 2)
        ]
    return distances_by_page

In [20]:
# Compute the pairwise paragraph distances for every page of both splits.
train_qrels_distances = get_qrels_distances(train_qrels)
test_qrels_distances = get_qrels_distances(test_qrels)
pagelist = list(train_qrels.keys())
# Index of the example training page to inspect.
i = 3
pagedist = np.array(train_qrels_distances[pagelist[i]])
# Histogram of the pairwise distances of the selected page.
fig = px.histogram(pagedist)
fig.show()

In [21]:
# Show the page ids for which pairwise distances were computed.
train_qrels_distances.keys()



In [22]:
# Inspect the example page: its id, paragraph count, number of paragraph pairs,
# and the section label of every paragraph.
print(pagelist[i])
paras = train_qrels[pagelist[i]]
print(len(paras))
pairs = list(itertools.combinations(paras, 2))
print(len(pairs))
for p in paras:
    print(paras[p])

enwiki:%C3%81lvaro%20Betancourt
13
78
enwiki:%C3%81lvaro%20Betancourt
enwiki:%C3%81lvaro%20Betancourt
enwiki:%C3%81lvaro%20Betancourt
enwiki:%C3%81lvaro%20Betancourt/Career/College
enwiki:%C3%81lvaro%20Betancourt/Career/Early%20career%20in%20Puerto%20Rico
enwiki:%C3%81lvaro%20Betancourt/Career/Early%20career%20in%20Puerto%20Rico
enwiki:%C3%81lvaro%20Betancourt/Career/Return%20from%20college
enwiki:%C3%81lvaro%20Betancourt/Career/Return%20from%20college
enwiki:%C3%81lvaro%20Betancourt/Early%20life
enwiki:%C3%81lvaro%20Betancourt/International%20career
enwiki:%C3%81lvaro%20Betancourt/International%20career
enwiki:%C3%81lvaro%20Betancourt/International%20career/Senior%20career
enwiki:%C3%81lvaro%20Betancourt/International%20career/Senior%20career


In [23]:
# Display every unordered paragraph-id pair of the example page.
pairs

[('19457a85a41681219fd17ef5ffac32f5e8c25b69',
  '70e0457b7b4b1f97b6f0226be1f4f45bca704773'),
 ('19457a85a41681219fd17ef5ffac32f5e8c25b69',
  '83a5401831f28a9b1d6bd623adaad15f897c1d65'),
 ('19457a85a41681219fd17ef5ffac32f5e8c25b69',
  'e735c3940c7ffb81f9c30bac5e2070b48ee80f1c'),
 ('19457a85a41681219fd17ef5ffac32f5e8c25b69',
  '91de6efca0a74fbc5fa147c7c95025c08ad9f89e'),
 ('19457a85a41681219fd17ef5ffac32f5e8c25b69',
  'e72c1e7234ed9890b8f9fec6e18cac1ecc8343ed'),
 ('19457a85a41681219fd17ef5ffac32f5e8c25b69',
  'e15e77128b1643df857d0c22bf816c5fc32a5933'),
 ('19457a85a41681219fd17ef5ffac32f5e8c25b69',
  'f5cac04108f3bb04f115c88b104066e4a610a29a'),
 ('19457a85a41681219fd17ef5ffac32f5e8c25b69',
  'b9201bdfd6356e7489b698302199efde455f1d53'),
 ('19457a85a41681219fd17ef5ffac32f5e8c25b69',
  '2b7814821b7e13c3574f3bd79beea538bd225ba0'),
 ('19457a85a41681219fd17ef5ffac32f5e8c25b69',
  'cf402fb1667b094341f4c61842dd5f6691584e3f'),
 ('19457a85a41681219fd17ef5ffac32f5e8c25b69',
  '188f18d636b5ebe883e8e