In [2]:
import pandas as pd
import requests
import json
import matplotlib
%matplotlib inline

# (REST) APIs - Wikimedia Stats

In [3]:
# Request headers: ask the server for a JSON response.
headers = dict(accept="application/json")

# Daily aggregate pageviews for en.wikipedia.org, all access types and all
# agents, from 2018-01-01 through 2019-12-31.
url = (
    "https://wikimedia.org/api/rest_v1/metrics/pageviews/aggregate/"
    "en.wikipedia.org/all-access/all-agents/daily/20180101/20191231"
)

In [6]:
# Confirm that the request headers are held in a plain dict.
type(headers)

dict

In [9]:
# Serialize the headers dict to a JSON-formatted string.
json.dumps(headers)

'{"accept": "application/json"}'

In [15]:
# Send the Accept header built earlier (it was otherwise never used) and bound
# the wait so a stalled connection cannot hang the kernel indefinitely.
r = requests.get(url, headers=headers, timeout=30)

In [16]:
# HTTP status code of the response.
r.status_code

200

In [17]:
# Inspect the headers the server sent back with the response.
r.headers

{'cache-control': 's-maxage=86400, max-age=86400', 'content-type': 'application/json; charset=utf-8', 'x-request-id': 'd3f79caf-6553-4fad-acb1-7fda99499a62', 'server': 'restbase1023', 'date': 'Tue, 07 Apr 2020 07:58:49 GMT', 'access-control-allow-origin': '*', 'access-control-allow-methods': 'GET,HEAD', 'access-control-allow-headers': 'accept, content-type, content-length, cache-control, accept-language, api-user-agent, if-match, if-modified-since, if-none-match, dnt, accept-encoding', 'access-control-expose-headers': 'etag', 'x-content-type-options': 'nosniff', 'x-frame-options': 'SAMEORIGIN', 'referrer-policy': 'origin-when-cross-origin', 'x-xss-protection': '1; mode=block', 'content-security-policy': "default-src 'none'; frame-ancestors 'none'", 'x-content-security-policy': "default-src 'none'; frame-ancestors 'none'", 'x-webkit-csp': "default-src 'none'; frame-ancestors 'none'", 'Content-Encoding': 'gzip', 'Vary': 'Accept-Encoding', 'Age': '1448', 'X-Cache': 'cp3056 miss, cp3062 hi

In [19]:
# r.text exposes the response body as a str.
type(r.text)

str

In [21]:
# r.json() parses the body; check which Python type the parsed payload has.
type(r.json())

dict

In [23]:
# List the top-level keys of the parsed payload.
r.json().keys()

dict_keys(['items'])

In [26]:
# Keep only the records stored under the payload's "items" key.
wikistats = r.json()["items"]

In [34]:
# Peek at the first three records.
wikistats[:3]

[{'project': 'en.wikipedia',
  'access': 'all-access',
  'agent': 'all-agents',
  'granularity': 'daily',
  'timestamp': '2018010100',
  'views': 302828330},
 {'project': 'en.wikipedia',
  'access': 'all-access',
  'agent': 'all-agents',
  'granularity': 'daily',
  'timestamp': '2018010200',
  'views': 319485738},
 {'project': 'en.wikipedia',
  'access': 'all-access',
  'agent': 'all-agents',
  'granularity': 'daily',
  'timestamp': '2018010300',
  'views': 322019675}]

In [27]:
# Build a DataFrame from the list of records (one row per record).
df = pd.DataFrame(wikistats)
# Preview the first rows.
df.head()

Unnamed: 0,project,access,agent,granularity,timestamp,views
0,en.wikipedia,all-access,all-agents,daily,2018010100,302828330
1,en.wikipedia,all-access,all-agents,daily,2018010200,319485738
2,en.wikipedia,all-access,all-agents,daily,2018010300,322019675
3,en.wikipedia,all-access,all-agents,daily,2018010400,314645741
4,en.wikipedia,all-access,all-agents,daily,2018010500,310155899


In [50]:
def wiki_to_df(url, timeout=30):
    """Fetch a Wikimedia pageviews endpoint and return its items as a DataFrame.

    Parameters
    ----------
    url : str
        Full URL of the REST endpoint to query.
    timeout : float, optional
        Seconds ``requests`` waits on the server before giving up
        (default 30). Without it a stalled connection blocks forever.

    Returns
    -------
    pandas.DataFrame or None
        One row per entry of the response's ``"items"`` list, with the
        ``"timestamp"`` column parsed from ``YYYYMMDDHH`` strings into
        datetimes. ``None`` is returned (after printing ``"No success!"``)
        when the response status code is not 200.

    Raises
    ------
    requests.exceptions.RequestException
        Propagated from ``requests.get`` on connection errors or timeouts.
    """
    headers = {"accept": "application/json"}
    # Send the Accept header so JSON is requested explicitly.
    response = requests.get(url, headers=headers, timeout=timeout)

    if response.status_code != 200:
        print("No success!")
        # Explicit None keeps the failure contract visible to callers.
        return None

    wikistats = response.json()["items"]
    df = pd.DataFrame(wikistats)
    # Timestamps arrive as strings such as "2018010100" (year, month, day, hour).
    df["timestamp"] = pd.to_datetime(df["timestamp"], format="%Y%m%d%H")
    return df

In [31]:
# Check the dtype of each column.
df.dtypes

project        object
access         object
agent          object
granularity    object
timestamp      object
views           int64
dtype: object

In [45]:
# Drop the last two characters of each timestamp string, then let pandas
# infer the date format of what is left.
date_part = df["timestamp"].str[:-2]
pd.to_datetime(date_part)

0     2018-01-01
1     2018-01-02
2     2018-01-03
3     2018-01-04
4     2018-01-05
         ...    
725   2019-12-27
726   2019-12-28
727   2019-12-29
728   2019-12-30
729   2019-12-31
Name: timestamp, Length: 730, dtype: datetime64[ns]

In [47]:
# Parse the timestamp strings with an explicit year-month-day-hour format and
# overwrite the column with the parsed values.
df["timestamp"] = pd.to_datetime(df["timestamp"], format="%Y%m%d%H")

In [48]:
# Re-check the column dtypes after converting the timestamp column.
df.dtypes

project                object
access                 object
agent                  object
granularity            object
timestamp      datetime64[ns]
views                   int64
dtype: object

Get data on a more granular level:

The Wikimedia stats API offers more options. That is, pageviews can be broken down by
- (day)
- I. access type
- II. agent type

### I. by access type

In [51]:
# Same endpoint as the all-access query (kept below for comparison), with the
# access segment of the path set to "desktop".
#url =     "https://wikimedia.org/api/rest_v1/metrics/pageviews/aggregate/en.wikipedia.org/all-access/all-agents/daily/20180101/20191231"
url_acc = "https://wikimedia.org/api/rest_v1/metrics/pageviews/aggregate/en.wikipedia.org/desktop/all-agents/daily/20180101/20191231"
wiki_to_df(url_acc)

Unnamed: 0,project,access,agent,granularity,timestamp,views
0,en.wikipedia,desktop,all-agents,daily,2018-01-01,149207888
1,en.wikipedia,desktop,all-agents,daily,2018-01-02,176949374
2,en.wikipedia,desktop,all-agents,daily,2018-01-03,189221424
3,en.wikipedia,desktop,all-agents,daily,2018-01-04,181752882
4,en.wikipedia,desktop,all-agents,daily,2018-01-05,177118102
...,...,...,...,...,...,...
725,en.wikipedia,desktop,all-agents,daily,2019-12-27,134979884
726,en.wikipedia,desktop,all-agents,daily,2019-12-28,127967591
727,en.wikipedia,desktop,all-agents,daily,2019-12-29,130540209
728,en.wikipedia,desktop,all-agents,daily,2019-12-30,157783621


In [52]:
# Query the same date range with the access segment set to "mobile-app".
url_ma = "https://wikimedia.org/api/rest_v1/metrics/pageviews/aggregate/en.wikipedia.org/mobile-app/all-agents/daily/20180101/20191231"
wiki_to_df(url_ma)

Unnamed: 0,project,access,agent,granularity,timestamp,views
0,en.wikipedia,mobile-app,all-agents,daily,2018-01-01,5088222
1,en.wikipedia,mobile-app,all-agents,daily,2018-01-02,4432088
2,en.wikipedia,mobile-app,all-agents,daily,2018-01-03,3986792
3,en.wikipedia,mobile-app,all-agents,daily,2018-01-04,3943742
4,en.wikipedia,mobile-app,all-agents,daily,2018-01-05,3977314
...,...,...,...,...,...,...
725,en.wikipedia,mobile-app,all-agents,daily,2019-12-27,5270968
726,en.wikipedia,mobile-app,all-agents,daily,2019-12-28,5592373
727,en.wikipedia,mobile-app,all-agents,daily,2019-12-29,6018164
728,en.wikipedia,mobile-app,all-agents,daily,2019-12-30,5430936


In [53]:
# List the column names of the frame.
df.columns

Index(['project', 'access', 'agent', 'granularity', 'timestamp', 'views'], dtype='object')

In [57]:
access_types = ["desktop", "mobile-app", "mobile-web"]

# Fetch one frame per access type and concatenate once at the end. Growing the
# frame with pd.concat inside the loop re-copies all earlier rows on every
# pass, and seeding it with an empty object-dtype frame can degrade the column
# dtypes of the result.
frames = []
for at in access_types:
    endpoint = f"https://wikimedia.org/api/rest_v1/metrics/pageviews/aggregate/en.wikipedia.org/{at}/all-agents/daily/20180101/20191231"
    frame = wiki_to_df(endpoint)
    # wiki_to_df returns None when the request does not succeed; skip those.
    if frame is not None:
        frames.append(frame)

# Per-request row indices are kept as-is (no ignore_index). If every request
# failed, fall back to an empty frame with the expected columns.
wiki = pd.concat(frames) if frames else pd.DataFrame(columns=df.columns)

In [59]:
# Count the rows per access type in the combined frame.
wiki["access"].value_counts()

mobile-web    730
desktop       730
mobile-app    730
Name: access, dtype: int64

### II. by agent type

In [61]:
agent_types = ["user", "spider"]

# Fetch one frame per (access type, agent type) combination and concatenate
# once at the end, instead of re-copying the growing frame on every pass and
# seeding it with an empty object-dtype frame.
frames = []
for at in access_types:
    for agt in agent_types:
        endpoint = f"https://wikimedia.org/api/rest_v1/metrics/pageviews/aggregate/en.wikipedia.org/{at}/{agt}/daily/20180101/20191231"
        frame = wiki_to_df(endpoint)
        # wiki_to_df returns None when the request does not succeed; skip those.
        if frame is not None:
            frames.append(frame)

# Per-request row indices are kept as-is (no ignore_index). If every request
# failed, fall back to an empty frame with the expected columns.
wiki = pd.concat(frames) if frames else pd.DataFrame(columns=df.columns)



In [62]:
# Preview the first rows of the combined frame.
wiki.head()

Unnamed: 0,project,access,agent,granularity,timestamp,views
0,en.wikipedia,desktop,user,daily,2018-01-01,103246409
1,en.wikipedia,desktop,user,daily,2018-01-02,132232882
2,en.wikipedia,desktop,user,daily,2018-01-03,141986950
3,en.wikipedia,desktop,user,daily,2018-01-04,132734349
4,en.wikipedia,desktop,user,daily,2018-01-05,128986098


In [63]:
# Count the rows per access type; unequal counts mean some combinations
# returned fewer rows than others.
wiki["access"].value_counts()

mobile-web    1460
desktop       1460
mobile-app     941
Name: access, dtype: int64

In [64]:
# Count the rows per agent type.
wiki["agent"].value_counts()

user      2190
spider    1671
Name: agent, dtype: int64

In [65]:
# Count the rows for each (access, agent) pair to see which combination is short.
wiki.groupby(["access", "agent"]).size()

access      agent 
desktop     spider    730
            user      730
mobile-app  spider    211
            user      730
mobile-web  spider    730
            user      730
dtype: int64

In [66]:
# Select the rows whose access type is "mobile-app" and whose agent is "spider".
is_mobile_app = wiki["access"] == "mobile-app"
is_spider = wiki["agent"] == "spider"
wiki.loc[is_mobile_app & is_spider]

Unnamed: 0,project,access,agent,granularity,timestamp,views
0,en.wikipedia,mobile-app,spider,daily,2018-01-02,2
1,en.wikipedia,mobile-app,spider,daily,2018-01-07,9
2,en.wikipedia,mobile-app,spider,daily,2018-01-13,1
3,en.wikipedia,mobile-app,spider,daily,2018-01-15,2
4,en.wikipedia,mobile-app,spider,daily,2018-01-16,2
...,...,...,...,...,...,...
206,en.wikipedia,mobile-app,spider,daily,2019-12-27,387
207,en.wikipedia,mobile-app,spider,daily,2019-12-28,578
208,en.wikipedia,mobile-app,spider,daily,2019-12-29,599
209,en.wikipedia,mobile-app,spider,daily,2019-12-30,568
