# Tutorial/Collection for using the different APIs



In [1]:
import os, sys
import datetime
import calendar
import time
import string
from urllib.parse import quote

import requests
import pandas as pd
print('ready')

ready


# MediawikiAPI

In [2]:
# Endpoint of the English Wikipedia MediaWiki API and the article the query examples look up.
api_url_base = "https://en.wikipedia.org/w/api.php"
page_title = "Ludwig Boltzmann"

#### search

In [3]:
# Full-text search (list=search) using the page title as the search string.
params = dict(
    action="query",
    list="search",
    srsearch=page_title,
    format="json",
)
response = requests.get(api_url_base, params=params).json()
# The hits sit under query -> search in the JSON reply.
result = response["query"]["search"]
print(result)

[{'ns': 0, 'title': 'Ludwig Boltzmann', 'pageid': 544255, 'size': 34008, 'wordcount': 3717, 'snippet': '<span class="searchmatch">Ludwig</span> Eduard <span class="searchmatch">Boltzmann</span> (German pronunciation: [ˈluːtvɪg ˈbɔlt͡sman]; February 20, 1844 – September 5, 1906) was an Austrian physicist and philosopher', 'timestamp': '2020-07-13T15:11:33Z'}, {'ns': 0, 'title': 'Boltzmann brain', 'pageid': 9638200, 'size': 23433, 'wordcount': 2685, 'snippet': 'response to <span class="searchmatch">Ludwig</span> <span class="searchmatch">Boltzmann</span>\'s early explanation for the low-entropy state of our universe. In this physics thought experiment, a <span class="searchmatch">Boltzmann</span> brain is a', 'timestamp': '2020-07-29T20:07:45Z'}, {'ns': 0, 'title': 'Boltzmann constant', 'pageid': 53702, 'size': 21643, 'wordcount': 2434, 'snippet': 'named after the Austrian scientist <span class="searchmatch">Ludwig</span> <span class="searchmatch">Boltzmann</span>. As part of the 2019 re

#### page-props

In [4]:
# Request the page properties (prop=pageprops) of the article.
params = dict(
    action="query",
    titles=page_title,
    prop="pageprops",
    format="json",
)
response = requests.get(api_url_base, params=params).json()
# The reply nests the per-page data under query -> pages.
result = response["query"]["pages"]
print(result)

{'544255': {'pageid': 544255, 'ns': 0, 'title': 'Ludwig Boltzmann', 'pageprops': {'defaultsort': 'Boltzmann, Ludwig', 'page_image_free': 'Boltzmann2.jpg', 'wikibase_item': 'Q84296'}}}


#### links

In [5]:
# Request the links (prop=links) of the article.
params = dict(
    action="query",
    titles=page_title,
    prop="links",
    format="json",
)
response = requests.get(api_url_base, params=params).json()
# The reply nests the per-page data under query -> pages.
result = response["query"]["pages"]
print(result)

{'544255': {'pageid': 544255, 'ns': 0, 'title': 'Ludwig Boltzmann', 'links': [{'ns': 0, 'title': 'Albert Einstein'}, {'ns': 0, 'title': 'Albert von Ettingshausen'}, {'ns': 0, 'title': 'Amedeo Avogadro'}, {'ns': 0, 'title': 'ArXiv (identifier)'}, {'ns': 0, 'title': 'Atom'}, {'ns': 0, 'title': 'Atomic mass'}, {'ns': 0, 'title': 'Atomic theory'}, {'ns': 0, 'title': 'Austria'}, {'ns': 0, 'title': 'Austria-Hungary'}, {'ns': 0, 'title': 'Austrian Academy of Sciences'}]}}


#### date-creation

In [6]:
## Creation date of a page = timestamp of its first revision.
## from: https://stackoverflow.com/questions/43898352/how-to-get-date-of-creation-of-wikipedia-page-by-api
params = dict(
    action="query",
    titles=page_title,
    prop="revisions",
    rvlimit=1,            # a single revision is requested ...
    rvprop="timestamp",   # ... of which only the timestamp is asked for
    rvdir="newer",        # revision listing direction passed to the API
    format="json",
)
response = requests.get(api_url_base, params=params).json()
result = response["query"]["pages"]
print(result)

{'544255': {'pageid': 544255, 'ns': 0, 'title': 'Ludwig Boltzmann', 'revisions': [{'timestamp': '2004-03-22T15:59:45Z'}]}}


## Edits and Pageviews

#### pageviews

In [7]:
def getViews(page,start,end,project):
    """
    Get the daily pageviews of one article as a DataFrame indexed by date.

    Uses this API: https://wikitech.wikimedia.org/wiki/Analytics/AQS/Pageviews

    page: str, article title as written (ex: 'Ludwig Boltzmann'). Spaces are
        replaced by underscores and the title is percent-encoded here, so
        pass the plain title, not a pre-encoded one.
    start: str start date YYYYMMDD (20200101)
    end: str end date YYYYMMDD (20200103)
    project: str, ex: en.wikipedia (project does not include .org)

    returns: DataFrame with a datetime index named 'timestamp' and a single
        column named after `page` holding the view counts. When the reply
        has no usable 'items' (missing key or empty list), a single row with
        0 views dated `start` is returned instead.
    """
    # The title is one path segment: encode everything (safe='') so that
    # characters such as '/', '?' or '%' cannot change the request path.
    article = quote(page.replace(' ', '_'), safe='')
    base_url = "https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/%s/all-access/all-agents/%s/daily/%s/%s" % (project,article,start,end)
    try:
        data = requests.get(base_url).json()['items']
        # An empty `data` list yields a frame without columns, so the column
        # selection raises KeyError as well and ends up in the fallback.
        df = pd.DataFrame(data)[['views','timestamp']].rename(columns={'views':page})
        # Timestamps in the reply are parsed as YYYYMMDDHH.
        df['timestamp'] = pd.to_datetime(df['timestamp'], format='%Y%m%d%H')
    except KeyError:
        ## no pageviews- we have to set 1 date with 0 counts
        # `start` is YYYYMMDD, so it is parsed with its own format here; only
        # the first 8 characters are used in case an hour suffix was passed.
        df = pd.DataFrame({
            page: [0],
            'timestamp': [pd.to_datetime(str(start)[:8], format='%Y%m%d')],
        })
    return df.set_index('timestamp')

In [8]:
# Daily views of the article in `page_title` for the first three days of 2020.
start, end = '20200101', '20200103'
df_views = getViews(page=page_title, start=start, end=end, project='en.wikipedia')
df_views

Unnamed: 0_level_0,Ludwig Boltzmann
timestamp,Unnamed: 1_level_1
2020-01-01,401
2020-01-02,397
2020-01-03,375


In [9]:
def getEdits(page,start,end,project):
    """
    Get the daily number of edits of one article as a DataFrame indexed by date.

    Queries the Wikimedia REST API `metrics/edits/per-page` endpoint (the
    request URL is printed on every call).

    page: str, article title as written (ex: 'Ludwig Boltzmann'). Spaces are
        replaced by underscores and the title is percent-encoded here, so
        pass the plain title, not a pre-encoded one.
    start: str start date YYYYMMDD (20200101)
    end: str end date YYYYMMDD (20200103)
    project: str, ex: en.wikipedia (project does not include .org)

    returns: DataFrame with a datetime index named 'timestamp' and a single
        column named after `page` holding the edit counts. When the reply
        has no usable results (missing key, empty 'items' or empty
        'results'), a single row with 0 edits dated `start` is returned.
    """
    # The title is one path segment: encode everything (safe='') so that
    # characters such as '/', '?' or '%' cannot change the request path.
    article = quote(page.replace(' ', '_'), safe='')
    base_url = "https://wikimedia.org/api/rest_v1/metrics/edits/per-page/%s/%s/all-editor-types/daily/%s/%s" % (project,article,start,end)
    print(base_url)
    try:
        data = requests.get(base_url).json()['items'][0]['results']
        # An empty `data` list yields a frame without columns, so the column
        # selection raises KeyError as well and ends up in the fallback.
        df = pd.DataFrame(data)[['edits','timestamp']].rename(columns={'edits':page})
        # Only the date part in front of the 'T' of each timestamp is kept.
        df['timestamp'] = pd.to_datetime(df['timestamp'].str.split('T').str[0], format='%Y-%m-%d')
    except (KeyError, IndexError):
        ## no edits- we have to set 1 date with 0 counts
        # `start` is YYYYMMDD (no dashes), so it is parsed with its own
        # format; IndexError covers a reply whose 'items' list is empty.
        df = pd.DataFrame({
            page: [0],
            'timestamp': [pd.to_datetime(str(start)[:8], format='%Y%m%d')],
        })
    return df.set_index('timestamp')

In [10]:
# Switch to a different article and request its daily edit counts for early July 2020.
page_title = 'Coronavirus'
start, end = '20200701', '20200710'
df_edits = getEdits(page=page_title, start=start, end=end, project='en.wikipedia')
df_edits
# https://wikimedia.org/api/rest_v1/metrics/edits/per-page/en.wikipedia/Coronavirus/all-editor-types/daily/20200701/20200710

https://wikimedia.org/api/rest_v1/metrics/edits/per-page/en.wikipedia/Coronavirus/all-editor-types/daily/20200701/20200710


Unnamed: 0_level_0,Coronavirus
timestamp,Unnamed: 1_level_1
2020-07-01,0
2020-07-02,0
2020-07-03,0
2020-07-04,0
2020-07-05,0
2020-07-06,2
2020-07-07,0
2020-07-08,0
2020-07-09,0


## Redirects

Retrieve all pages that redirect to a given page.

In [11]:
def getRedirects(page_title,wiki):
    '''
    Get all redirects (up to 500) for a given page.

    page_title: str, title of the page whose redirects are requested
    wiki: str, wiki name such as 'enwiki'; every 'wiki' substring is removed
        to build the host name (ex: 'enwiki' -> en.wikipedia.org)

    returns: the complete decoded JSON reply (dict) when it contains
        response['query']['pages']; an empty list when the request fails,
        the body is not valid JSON, or the reply lacks that structure.
    '''
    api_url_base = 'https://%s.wikipedia.org/w/api.php'%( wiki.replace('wiki','') )
    params = {
        "action": "query",
        "titles": page_title,
        "prop": "redirects",
        "format": "json",
        "rdlimit":500,
    }
    try:
        response = requests.get( api_url_base,params=params).json()
    except (requests.RequestException, ValueError):
        # Network/HTTP problems, or a body that cannot be decoded as JSON
        # (reported as a ValueError subclass by .json()).
        return []
    query = response.get('query') if isinstance(response, dict) else None
    if isinstance(query, dict) and 'pages' in query:
        return response
    # Same falsy value as the failure case above, so callers can rely on a
    # single `if not result` check.
    return []
project = 'enwiki'
page = 'Ludwig Boltzmann'
result = getRedirects(page,project)
# getRedirects hands back a falsy value instead of the response dict when the
# lookup failed; treat that as "no pages" rather than indexing into it.
pages = result['query']['pages'] if result else {}
# Take the first entry of `pages`; a page without a 'redirects' key (or an
# empty `pages`) gives an empty list.
redirects = next(iter(pages.values()), {}).get('redirects',[])
redirects

[{'pageid': 258974, 'ns': 0, 'title': 'Boltzmann'},
 {'pageid': 1753539, 'ns': 0, 'title': 'Ludwig Botzmann'},
 {'pageid': 1808494, 'ns': 0, 'title': 'Ludwig Eduard Boltzmann'},
 {'pageid': 2568913, 'ns': 0, 'title': 'L. Boltzmann'},
 {'pageid': 4889545, 'ns': 0, 'title': 'Ludwig boltzman'},
 {'pageid': 4889552, 'ns': 0, 'title': 'Ludwig Boltzman'},
 {'pageid': 6669251, 'ns': 0, 'title': 'Ludwig Bolzmann'},
 {'pageid': 41687770, 'ns': 0, 'title': 'Boltzmannian'}]