In [1]:
from mwviews.api import PageviewsClient
import mwapi
import wikitools
import pandas as pd
import time


# Identifying string passed as `user_agent` to the mwapi session below; it is
# also handed to the wikitools pipeline calls later in the notebook.
my_agent = 'mwapi testing <p.gildersleve@lse.ac.uk>'

# Asynchronous mwapi session pointed at English Wikipedia.
async_session = mwapi.AsyncSession(
    'https://en.wikipedia.org',
    user_agent=my_agent,
    formatversion=2,
)


In [2]:
# Load the dated top-views export and keep every page title once
# (~1000 top articles), in order of first appearance.
toparts = pd.read_csv('data/topviews-2024_07_31.csv')
artlist = pd.unique(toparts['Page']).tolist()

In [3]:
links_data = await wikitools.pipeline_get_links('en.wikipedia', my_agent,
                                                titles=artlist,
                                                gl_args={'mode':['out', 'in'],
                                                        'update_maps':True})
links = links_data['links']
id_map = links_data['id_map']
redirect_map = links_data['redirect_map']
norm_map = links_data['norm_map']

# Get list of articles and edges (excluding redlinks, those with no page id)
articles = set([x['title'] for y in links['in'].values() for x in y] +
                [x['title'] for y in links['out'].values() for x in y if id_map[x['title']] != -1])
edges = set([(k, x['title']) for k, v in links['out'].items() for x in v if id_map[x['title']] != -1] +
            [(x['title'], k) for k, v in links['in'].items()for x in v])

print(len(links['in']), len(links['out']))
print(len(id_map), len(redirect_map), len(norm_map))
print(len(articles), len(edges))


Getting out-links
Getting in-links
('MediaWiki returned an error:', 'Could not decode as JSON:\n<!DOCTYPE html>\n<html lang="en">\n<meta charset="utf-8">\n<title>Wikimedia Error</title>\n<style>\n* { margin: 0; padding: 0; }\nbody { background: #fff; font: 15px/1.6 sans-serif; color: #333; }\n.content { margin: 7% auto 0; padding: 2em 1em 1em; max-width: 640px; }\n.footer { clear: both; margin-top: 14%; border-top: 1px solid #e5e5e5; background: #f9f')
0.00% complete
Trying again at n=0 with batchsize=100
Increasing batchsize to 200
982 982
2598674 97717 0
2589963 6183123


In [4]:
# One row per directed link. Note: despite its name, `edges_series` is a
# two-column DataFrame (source, target), not a Series.
edges_series = pd.DataFrame(data=list(edges), columns=['source', 'target'])
edges_series

Unnamed: 0,source,target
0,1908 in aviation,France
1,Basketball at the 2016 Summer Olympics – Men's...,United States
2,Clyde Everett Lassen,Vietnam War
3,Scatman John discography,YouTube
4,Olympic Games,Wheelchair handball
...,...,...
6183118,Crocodile (Black Mirror),Twitter
6183119,2018 US Open – Women's singles,Netherlands
6183120,Emanuel Celler,New York City
6183121,Namtsy,Russia


In [5]:
# Build an ordered copy of the seed list, hoping it makes article collection
# more efficient for subsequent API calls (not sure if / how much this helps).
#
# After the final reversal the order is: seeds that never appear as a link
# target first, then linked-to seeds from least to most in-links, so the most
# linked-to article is last.

# Number of edges pointing at each target title, most linked-to first.
top_indegree = edges_series['target'].value_counts()

# `top_indegree.index` can hold millions of titles, so test membership against
# a set rather than scanning the ~1000-item list once per title.
artlist_lookup = set(artlist)

ordered_artlist = ([x for x in top_indegree.index if x in artlist_lookup] +
                   [x for x in artlist if x not in top_indegree.index])[::-1]
ordered_artlist[-20:]

['New Zealand',
 'Soviet Union',
 'English language',
 'World War I',
 'Turkey',
 'Iran',
 'Switzerland',
 'Netherlands',
 'China',
 'New York City',
 'Russia',
 'Canada',
 'Japan',
 'Australia',
 'World War II',
 'India',
 'United Kingdom',
 'Germany',
 'France',
 'United States']

In [6]:
new_links_data = await wikitools.pipeline_get_links('en.wikipedia', my_agent,
                                                titles=ordered_artlist,
                                                gl_args={'mode':['out', 'in'],
                                                        'update_maps':True})
new_links = new_links_data['links']
new_id_map = new_links_data['id_map']
new_redirect_map = new_links_data['redirect_map']
new_norm_map = new_links_data['norm_map']

# Get list of articles and edges (excluding redlinks, those with no page id)
new_articles = set([x['title'] for y in new_links['in'].values() for x in y] +
                [x['title'] for y in new_links['out'].values() for x in y if new_id_map[x['title']] != -1])
new_edges = set([(k, x['title']) for k, v in new_links['out'].items() for x in v if new_id_map[x['title']] != -1] +
            [(x['title'], k) for k, v in new_links['in'].items()for x in v])

print(len(new_links['in']), len(new_links['out']))
print(len(new_id_map), len(new_redirect_map), len(new_norm_map))
print(len(new_articles), len(new_edges))

Getting out-links
Getting in-links
('MediaWiki returned an error:', 'Could not decode as JSON:\n<!DOCTYPE html>\n<html lang="en">\n<meta charset="utf-8">\n<title>Wikimedia Error</title>\n<style>\n* { margin: 0; padding: 0; }\nbody { background: #fff; font: 15px/1.6 sans-serif; color: #333; }\n.content { margin: 7% auto 0; padding: 2em 1em 1em; max-width: 640px; }\n.footer { clear: both; margin-top: 14%; border-top: 1px solid #e5e5e5; background: #f9f')
40.49% complete
Trying again at n=400 with batchsize=100
Increasing batchsize to 200
('MediaWiki returned an error:', 'Could not decode as JSON:\n<!DOCTYPE html>\n<html lang="en">\n<meta charset="utf-8">\n<title>Wikimedia Error</title>\n<style>\n* { margin: 0; padding: 0; }\nbody { background: #fff; font: 15px/1.6 sans-serif; color: #333; }\n.content { margin: 7% auto 0; padding: 2em 1em 1em; max-width: 640px; }\n.footer { clear: both; margin-top: 14%; border-top: 1px solid #e5e5e5; background: #f9f')
70.85% complete
Trying again at n=70