Wikimedia REST API documentation: https://wikimedia.org/api/rest_v1/#/




In [1]:
import pandas as pd
import time


## Getting pageview data

In [15]:
# Download aggregate daily pageview counts for every combination of access
# method, agent type and project, then save them as a single CSV.
# URL path layout: .../aggregate/{project}/{access}/{agent}/daily/{start}/{end}
base_html = "https://wikimedia.org/api/rest_v1/metrics/pageviews/aggregate/{w}/{x}/{y}/daily/20100101/20220101"
access_methods = ['desktop', 'mobile-app', 'mobile-web']
agent_types = ['user', 'spider', 'automated']
projects = ['commons.wikimedia.org', 'en.wikipedia.org']

# Same request order as a nested loop: access (outer), agent, project (inner).
combinations = [
    (access, agent, project)
    for access in access_methods
    for agent in agent_types
    for project in projects
]

frames = []
for access, agent, project in combinations:
    url = base_html.format(w=project, x=access, y=agent)
    # The response's 'items' column holds one dict per returned record;
    # expanding the list of dicts yields one DataFrame row per record.
    records = pd.read_json(url)['items'].values.tolist()
    frames.append(pd.DataFrame(records))

# Concatenate once at the end instead of re-copying the growing frame on
# every iteration.
pageviews = pd.concat(frames)
pageviews.to_csv("./data/pageviews_aggregate.csv", index = False)

In [5]:
# Download the list of top-viewed articles for every day from 2015 through
# 2021, for each project, and save all of them as a single CSV.
base_html = "https://wikimedia.org/api/rest_v1/metrics/pageviews/top/{w}/all-access/{cyear}/{cmo}/{cday}"

# All date parts are zero-padded strings so the URL is built consistently.
years = [str(year) for year in range(2015, 2022)]
months = [f"{month:02d}" for month in range(1, 13)]
days = [f"{day:02d}" for day in range(1, 32)]
projects = ['commons.wikimedia.org', 'en.wikipedia.org']

# NOTE: this grid also contains impossible calendar dates (e.g. 02/30) and
# may contain dates the API has no data for; those requests fail and are
# skipped below.
combinations = [
    (year, month, day, project)
    for year in years
    for month in months
    for day in days
    for project in projects
]

frames = []
skipped = []  # (url, error) for every request that produced no data
for year, month, day, project in combinations:
    # Pause between requests (presumably to stay within the API's rate limits).
    time.sleep(0.25)
    url = base_html.format(cyear=year, cmo=month, cday=day, w=project)
    try:
        frame = pd.json_normalize(
            pd.read_json(url)['items'],
            record_path=['articles'],
            meta=['project', 'access', 'year', 'month', 'day'],
        )
    except (OSError, ValueError, KeyError) as err:
        # OSError covers HTTP and connection failures (urllib's URLError and
        # HTTPError derive from it); ValueError/KeyError cover malformed or
        # unexpected payloads. Anything else, including KeyboardInterrupt,
        # propagates instead of being silently swallowed.
        skipped.append((url, err))
        continue
    frames.append(frame)

print(f"top pages: {len(frames)} requests succeeded, {len(skipped)} skipped")

# Collecting frames in a list means a failure of the very first request no
# longer leaves the result undefined for the rest of the run.
if not frames:
    raise RuntimeError("No top-pages data could be downloaded; nothing to save.")

top_pages = pd.concat(frames)
top_pages.reset_index().to_csv("./data/pageviews_top.csv", index = False)

## Unique devices count by day

In [17]:
# Fetch the daily unique-device series for en.wikipedia.org, once for the
# desktop site and once for the mobile site, and store both in one CSV.
base_html = "https://wikimedia.org/api/rest_v1/metrics/unique-devices/en.wikipedia.org/{x}/daily/20150101/20220101"

desktop_items = pd.read_json(base_html.format(x='desktop-site'))['items']
mobile_items = pd.read_json(base_html.format(x='mobile-site'))['items']

# Each 'items' entry is a dict; expand every dict into one DataFrame row and
# stack the desktop rows on top of the mobile rows.
devices = pd.concat([
    pd.DataFrame(desktop_items.tolist()),
    pd.DataFrame(mobile_items.tolist()),
])

# reset_index() keeps each row's per-request position as an 'index' column.
devices.reset_index().to_csv("./data/unique_devices.csv", index = False)

# Edited pages data

## New pages by day

In [18]:
# Download the daily count of newly created pages on en.wikipedia for every
# editor-type / page-type combination and save them as a single CSV.
base_html = "https://wikimedia.org/api/rest_v1/metrics/edited-pages/new/en.wikipedia/{x}/{y}/daily/20150101/20220101"

editor_types = ['anonymous', 'group-bot', 'name-bot', 'user']
page_types = ['content', 'non-content']

frames = []
for editor_type in editor_types:
    for page_type in page_types:
        response = pd.read_json(base_html.format(x=editor_type, y=page_type))
        # Flatten each item's nested 'results' list into rows, carrying the
        # item-level descriptors along as extra columns.
        frames.append(pd.json_normalize(
            response['items'],
            record_path=['results'],
            meta=['project', 'editor-type', 'page-type', 'granularity'],
        ))

# Concatenate once instead of re-copying a growing frame inside the loop.
new_pages = pd.concat(frames)
new_pages.reset_index().to_csv("./data/edited_pages_new.csv", index = False)

## Edited pages counts

In [19]:
# Download the daily count of edited pages on en.wikipedia (all activity
# levels) for every editor-type / page-type combination and save them as a
# single CSV.
base_html = "https://wikimedia.org/api/rest_v1/metrics/edited-pages/aggregate/en.wikipedia/{x}/{y}/all-activity-levels/daily/20150101/20220101"
editor_types = ['anonymous', 'group-bot', 'name-bot', 'user']
page_types = ['content', 'non-content']

frames = []
for editor_type in editor_types:
    for page_type in page_types:
        response = pd.read_json(base_html.format(x=editor_type, y=page_type))
        # Flatten each item's nested 'results' list into rows, carrying the
        # item-level descriptors along as extra columns.
        frames.append(pd.json_normalize(
            response['items'],
            record_path=['results'],
            meta=['project', 'editor-type', 'page-type', 'activity-level', 'granularity'],
        ))

# Concatenate once instead of re-copying a growing frame inside the loop.
edited_pages = pd.concat(frames)
edited_pages.reset_index().to_csv("./data/edited_pages.csv", index = False)

## Editors counts

In [20]:
# Download the daily number of editors on en.wikipedia for every
# editor-type / page-type / activity-level combination and save them as a
# single CSV.
base_html = "https://wikimedia.org/api/rest_v1/metrics/editors/aggregate/en.wikipedia/{x}/{y}/{z}/daily/20100101/20220101"
editor_types = ['anonymous', 'group-bot', 'name-bot', 'user']
page_types = ['content', 'non-content']
activity_levels = ['1..4-edits', '5..24-edits', '25..99-edits', '100..-edits']

# Same request order as a nested loop: editor type (outer), page type,
# activity level (inner).
combinations = [
    (editor_type, page_type, activity_level)
    for editor_type in editor_types
    for page_type in page_types
    for activity_level in activity_levels
]

frames = []
for editor_type, page_type, activity_level in combinations:
    response = pd.read_json(
        base_html.format(x=editor_type, y=page_type, z=activity_level)
    )
    # Flatten each item's nested 'results' list into rows, carrying the
    # item-level descriptors along as extra columns.
    frames.append(pd.json_normalize(
        response['items'],
        record_path=['results'],
        meta=['project', 'editor-type', 'page-type', 'activity-level', 'granularity'],
    ))

# Concatenate once instead of re-copying a growing frame inside the loop.
edit_counts = pd.concat(frames)
edit_counts.reset_index().to_csv("./data/editors.csv", index = False)
    

## Edits

In [21]:
# Download the daily number of edits on en.wikipedia for every
# editor-type / page-type combination and save them as a single CSV.
base_html = "https://wikimedia.org/api/rest_v1/metrics/edits/aggregate/en.wikipedia/{x}/{y}/daily/20150101/20220101"
editor_types = ['anonymous', 'group-bot', 'name-bot', 'user']
page_types = ['content', 'non-content']

frames = []
for editor_type in editor_types:
    for page_type in page_types:
        response = pd.read_json(base_html.format(x=editor_type, y=page_type))
        # Flatten each item's nested 'results' list into rows, carrying the
        # item-level descriptors along as extra columns.
        frames.append(pd.json_normalize(
            response['items'],
            record_path=['results'],
            meta=['project', 'editor-type', 'page-type', 'granularity'],
        ))

# Concatenate once instead of re-copying a growing frame inside the loop.
edits = pd.concat(frames)
edits.reset_index().to_csv("./data/edits.csv", index = False)

## New users

In [22]:
# Fetch the daily number of newly registered users on en.wikipedia and save
# it as a CSV.
base_html = "https://wikimedia.org/api/rest_v1/metrics/registered-users/new/en.wikipedia/daily/20100101/20220101"

response = pd.read_json(base_html)

# Flatten each item's nested 'results' list into rows, keeping the item-level
# 'project' and 'granularity' values as extra columns.
new_users = pd.json_normalize(
    response['items'],
    record_path=['results'],
    meta=['project', 'granularity'],
)

new_users.reset_index().to_csv("./data/registered_users.csv", index = False)

## Edit byte difference

In [23]:

# Download the daily net byte difference of edits on en.wikipedia for every
# editor-type / page-type combination and save them as a single CSV.
base_html = "https://wikimedia.org/api/rest_v1/metrics/bytes-difference/net/aggregate/en.wikipedia/{x}/{y}/daily/20150101/20220101"
editor_types = ['anonymous', 'group-bot', 'name-bot', 'user']
page_types = ['content', 'non-content']

frames = []
for editor_type in editor_types:
    for page_type in page_types:
        response = pd.read_json(base_html.format(x=editor_type, y=page_type))
        # Flatten each item's nested 'results' list into rows, carrying the
        # item-level descriptors along as extra columns.
        frames.append(pd.json_normalize(
            response['items'],
            record_path=['results'],
            meta=['project', 'editor-type', 'page-type', 'granularity'],
        ))

# Concatenate once instead of re-copying a growing frame inside the loop.
diff_sum = pd.concat(frames)
diff_sum.reset_index().to_csv("./data/bytes_difference_net.csv", index = False)

In [24]:
# Download the daily absolute byte difference of edits on en.wikipedia for
# every editor-type / page-type combination and save them as a single CSV.
base_html = "https://wikimedia.org/api/rest_v1/metrics/bytes-difference/absolute/aggregate/en.wikipedia/{x}/{y}/daily/20150101/20220101"
editor_types = ['anonymous', 'group-bot', 'name-bot', 'user']
page_types = ['content', 'non-content']

frames = []
for editor_type in editor_types:
    for page_type in page_types:
        response = pd.read_json(base_html.format(x=editor_type, y=page_type))
        # Flatten each item's nested 'results' list into rows, carrying the
        # item-level descriptors along as extra columns.
        frames.append(pd.json_normalize(
            response['items'],
            record_path=['results'],
            meta=['project', 'editor-type', 'page-type', 'granularity'],
        ))

# Concatenate once instead of re-copying a growing frame inside the loop.
diff_abs = pd.concat(frames)

diff_abs.reset_index().to_csv("./data/bytes_difference_absolute.csv", index = False)