In [2]:
import pandas as pd
import requests
import json
import matplotlib
%matplotlib inline

# (REST) APIs - Wikimedia Stats

In [None]:
# Aggregate pageviews endpoint for en.wikipedia.org: all access methods,
# all agent types, daily granularity, date range 20180101-20191231.
url = "https://wikimedia.org/api/rest_v1/metrics/pageviews/aggregate/en.wikipedia.org/all-access/all-agents/daily/20180101/20191231"
# Ask the API for a JSON response.
# NOTE(review): Wikimedia's User-Agent policy asks API clients to identify
# themselves; consider adding a descriptive "User-Agent" entry here - confirm.
headers = {"accept": "application/json"}

In [None]:
# Check the Python type of the headers object (defined as a dict literal).
type(headers)

In [None]:
# Show the same headers serialized as a JSON string.
json.dumps(headers)

In [None]:
# Send the GET request with the Accept header defined above (it was built
# but never passed before). The timeout keeps a stalled connection from
# hanging the notebook indefinitely.
r = requests.get(url, headers=headers, timeout=30)

In [None]:
# HTTP status of the response; wiki_to_df below treats 200 as success.
r.status_code

In [None]:
# Headers the server sent back with the response.
r.headers

In [None]:
# Type of the response body as exposed by r.text.
type(r.text)

In [None]:
# Type of the body after parsing it with r.json().
type(r.json())

In [None]:
# Top-level keys of the parsed payload; the "items" entry is used next.
r.json().keys()

In [None]:
# Keep only the "items" entry of the payload; it is turned into a DataFrame below.
wikistats = r.json()["items"]

In [None]:
# Peek at the first three entries.
wikistats[:3]

In [None]:
# Build a DataFrame from the items and preview the first rows.
df = pd.DataFrame(wikistats)
df.head()

In [None]:
def wiki_to_df(url):
    """Fetch a Wikimedia pageviews endpoint and return its items as a DataFrame.

    Parameters
    ----------
    url : str
        Full URL of the REST endpoint to query.

    Returns
    -------
    pandas.DataFrame or None
        One row per entry of the response's "items" list, with the
        "timestamp" column parsed as datetimes (format ``%Y%m%d%H``).
        Returns ``None`` (after printing "No success!") when the HTTP
        status code is not 200, so callers should check for ``None``.
    """
    headers = {"accept": "application/json"}
    # Actually send the Accept header (it used to be built but never passed),
    # and bound the wait so a stalled connection cannot hang the notebook.
    response = requests.get(url, headers=headers, timeout=30)
    if response.status_code != 200:
        print("No success!")
        return None

    df = pd.DataFrame(response.json()["items"])
    df["timestamp"] = pd.to_datetime(df["timestamp"], format="%Y%m%d%H")
    return df

In [None]:
# Column dtypes of df as built from the raw items.
df.dtypes

In [None]:
# Trial conversion, displayed only (df is not modified): drop the last two
# characters of each timestamp string and let pandas infer the format of
# what remains.
pd.to_datetime(df["timestamp"].str.slice(stop=-2))

In [None]:
# Overwrite the timestamp column with parsed datetimes, using the explicit
# format %Y%m%d%H (year, month, day, hour).
df["timestamp"] = pd.to_datetime(df["timestamp"], format="%Y%m%d%H")

In [None]:
# Re-check the dtypes after the timestamp conversion.
df.dtypes

Get data on a more granular level:

The Wikimedia stats API offers more options: pageviews can also be broken down by
- (day)
- I. access type
- II. agent type

### I. by access type

In [None]:
# Same endpoint as the all-access URL (kept commented out on the next line
# for comparison), with the access segment changed to "desktop".
#url =     "https://wikimedia.org/api/rest_v1/metrics/pageviews/aggregate/en.wikipedia.org/all-access/all-agents/daily/20180101/20191231"
url_acc = "https://wikimedia.org/api/rest_v1/metrics/pageviews/aggregate/en.wikipedia.org/desktop/all-agents/daily/20180101/20191231"
# Display the result; wiki_to_df returns None if the request does not succeed.
wiki_to_df(url_acc)

In [None]:
# Same query with the access segment set to "mobile-app".
url_ma = "https://wikimedia.org/api/rest_v1/metrics/pageviews/aggregate/en.wikipedia.org/mobile-app/all-agents/daily/20180101/20191231"
wiki_to_df(url_ma)

In [None]:
# Column names of the all-access frame.
df.columns

In [None]:
access_types = ["desktop", "mobile-app", "mobile-web"]

# Fetch one frame per access type, then concatenate once. Growing a frame
# with pd.concat inside the loop re-copies all accumulated rows on every
# iteration, and seeding it with an empty frame is deprecated in recent
# pandas (FutureWarning about empty entries in concat).
frames = []
for at in access_types:
    endpoint = f"https://wikimedia.org/api/rest_v1/metrics/pageviews/aggregate/en.wikipedia.org/{at}/all-agents/daily/20180101/20191231"
    frame = wiki_to_df(endpoint)
    if frame is not None:  # wiki_to_df returns None when the request fails
        frames.append(frame)

# ignore_index gives the combined frame a unique 0..n-1 index instead of
# repeating each sub-frame's labels. If every request failed, fall back to
# an empty frame with the expected columns, as the loop-based version did.
wiki = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame(columns=df.columns)

In [None]:
# Number of rows per access type in the combined frame.
wiki["access"].value_counts()

### II. by agent type

In [None]:
agent_types = ["user", "spider"]

# Fetch one frame per (access type, agent type) pair, then concatenate once.
# Growing a frame with pd.concat inside the loops re-copies all accumulated
# rows on every iteration, and seeding it with an empty frame is deprecated
# in recent pandas (FutureWarning about empty entries in concat).
frames = []
for at in access_types:
    for agt in agent_types:
        endpoint = f"https://wikimedia.org/api/rest_v1/metrics/pageviews/aggregate/en.wikipedia.org/{at}/{agt}/daily/20180101/20191231"
        frame = wiki_to_df(endpoint)
        if frame is not None:  # wiki_to_df returns None when the request fails
            frames.append(frame)

# ignore_index gives the combined frame a unique 0..n-1 index instead of
# repeating each sub-frame's labels. If every request failed, fall back to
# an empty frame with the expected columns, as the loop-based version did.
wiki = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame(columns=df.columns)


In [None]:
# Preview the combined access-by-agent frame.
wiki.head()

In [None]:
# Number of rows per access type.
wiki["access"].value_counts()

In [None]:
# Number of rows per agent type.
wiki["agent"].value_counts()

In [None]:
# Number of rows for each (access, agent) combination.
wiki.groupby(["access", "agent"]).size()

In [None]:
# Inspect the rows for the mobile-app / spider combination.
wiki.loc[(wiki["access"] == "mobile-app") & (wiki["agent"] == "spider")]

In [None]:
# export data
# df.to_pickle("data/total_pageviews.pkl")
# wiki.to_pickle("data/granular_pageviews.pkl")