# Get Page Views 
This notebook takes as input files containing the Covid-related pages per language (wiki_db) and retrieves all user pageviews (filtering out bots and other automated traffic) for those pages and their redirects. 

For example, given the article Covid-19 Pandemic in English Wikipedia, the script does the following:

* Gathers all the pages that [redirect](https://en.wikipedia.org/w/index.php?title=Special:WhatLinksHere/COVID-19_pandemic&hidetrans=1&hidelinks=1&limit=500) to that article (e.g. 2019-20 outbreak of novel coronavirus, Outbreak of novel coronavirus (2019-nCoV), etc.). 
* Then takes all the pageviews coming from users, filtering out bots and other automated traffic.
* And finally adds up the pageviews of the target and its redirects, day by day. 


## Get Pageviews with Redirects
This notebook uses the [Wikimedia Pageviews API](https://wikitech.wikimedia.org/wiki/Analytics/AQS/Pageviews) to get the views on articles related to Covid.

Note that we resolve all the redirects, meaning that for a view on page X, we take all the articles that redirect to X and add their views to the total views of X.



In [1]:
import requests
import pandas as pd
### Functions to get pageviews per page
### I'm not using the mwviews package for this, because it has problems with non-Latin characters


def getRedirects(page,project):
    '''
    Get the titles of all pages that redirect to a given page (up to 500).

    page: str (article's title)
    project: str, ex: en.wikipedia (project does not include .org)

    Returns a list of str with the redirect titles reported by the
    MediaWiki API. The list is empty when the API response has no
    'redirects' entry for the page.
    '''
    base_url = 'https://%s.org/w/api.php' % project
    # Hand the query to requests as a dict so the title gets percent-encoded.
    # Interpolating it into the URL string breaks titles that contain
    # characters such as '&', '#' or '+'.
    params = {
        'action': 'query',
        'titles': page,
        'prop': 'redirects',
        'rdlimit': 500,
        'format': 'json',
    }
    response = requests.get(base_url, params=params).json()
    # A single title is requested, so only the first page entry is relevant.
    page_info = list(response['query']['pages'].values())[0]
    return [p['title'] for p in page_info.get('redirects', [])]
    

def getViews(page,start,end,project,user_agent='user'):
    """
    Get daily pageviews for one article using this API
    https://wikitech.wikimedia.org/wiki/Analytics/AQS/Pageviews

    page: str (article name)
    start: str start date YYYYMMDD (20200101)
    end: str end date YYYYMMDD (20200103)
    project: str, ex: en.wikipedia (project does not include .org)
    user_agent: str, 'user','spider','automated','all-agents'

    Returns a DataFrame indexed by 'timestamp' (datetime) with a single
    column, named after page, holding the views.
    Raises whatever the request / JSON decoding raises, and KeyError when
    the response has no 'items' (callers treat any error as "no data").
    """
    # Local import: the notebook's import cell does not provide urllib.
    from urllib.parse import quote

    # The API documentation asks for spaces replaced by underscores and for a
    # URI-encoded title; safe='' also encodes '/', which would otherwise be
    # read as a path separator.
    article = quote(page.replace(' ', '_'), safe='')
    base_url = "https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/%s/all-access/%s/%s/daily/%s/%s" % (project,user_agent,article,start,end)
    items = requests.get(base_url).json()['items']
    # Keep the caller's original title (not the encoded one) as column name.
    df = pd.DataFrame(items)[['views','timestamp']].rename(columns={'views':page})
    # Timestamps come as YYYYMMDDHH strings.
    df['timestamp'] = pd.to_datetime(df['timestamp'], format='%Y%m%d%H')
    return df.set_index('timestamp')


def getViewsWithRedirects(page,start,end,project):
    """
    Get the pageviews of 'page' plus the pageviews of every page that
    redirects to it, summed day by day.

    page: str (article's title)
    start: str start date YYYYMMDD (20200101)
    end: str end date YYYYMMDD (20200103)
    project: str, ex: en.wikipedia (project does not include .org)

    Returns a DataFrame with one column, named after page, holding the
    summed views per row of the frame returned by getViewsMultiples.
    Errors raised by getViewsMultiples propagate to the caller.
    """
    try:
        #get all redirects to page
        redirects = getRedirects(page=page,project=project)
    except Exception:
        #if getRedirects gives an error, we assume that there are no pages redirecting to page
        redirects = []
    # The redirects must be added on the success path; extending the list
    # only inside the except branch would always add an empty list.
    pages = [page] + list(redirects)
    ## get page views for the target and all its redirects
    results = getViewsMultiples(pages=pages,start=start,end=end,project=project)
    #sum the views of all pages, row by row
    total = pd.DataFrame(results.sum(axis=1))
    total.rename(columns={0:page},inplace=True)
    return total



def getViewsMultiples(pages,start,end,project):
    """
    Get page views for a list of pages

    pages: list (list of article's titles) ex: ['Chile','Brasil','Argentina']
    start: str start date YYYYMMDD (20200101)
    end: str end date YYYYMMDD (20200103)
    project: str, ex: en.wikipedia (project does not include .org)

    Returns a DataFrame with one column per page for which getViews
    succeeded, concatenated side by side.
    Raises ValueError (from pd.concat) when no page returned any data.
    """
    results = []
    for page in pages:
        try:
            results.append(getViews(page,start,end,project))
        except Exception:
            # Best effort: a page whose request fails is left out of the result.
            # Catching Exception instead of using a bare except lets
            # KeyboardInterrupt / SystemExit stop a long-running download.
            continue
    return pd.concat(results,axis=1)

def getViewsMultiplesWithRedirects(pages,start,end,project):
    """
    Get page views for a list of pages considering the redirects for each page

    pages: list (list of article's titles) ex: ['Chile','Brasil','Argentina']
    start: str start date YYYYMMDD (20200101)
    end: str end date YYYYMMDD (20200103)
    project: str, ex: en.wikipedia (project does not include .org)

    Returns a DataFrame with one column per page for which
    getViewsWithRedirects succeeded, concatenated side by side.
    Raises ValueError (from pd.concat) when no page returned any data.
    """
    results = []
    for page in pages:
        try:
            results.append(getViewsWithRedirects(page,start,end,project))
        except Exception:
            # Best effort: a page that cannot be retrieved is left out of the
            # result. Catching Exception instead of using a bare except lets
            # KeyboardInterrupt / SystemExit stop a long-running download.
            continue
    return pd.concat(results,axis=1)

In [44]:
#Reading input data
import glob
# Collect every file matching the pattern, relative to the current working directory.
languages = glob.glob('wik*clean*csv')
# pages maps a language code to the DataFrame loaded from that language's CSV file.
pages = {}
for lang in languages:
    # The code is the two characters found between 10 and 8 positions before the
    # end of the file name -- presumably names end in '<xx>wiki.csv'; TODO confirm
    # against the real file names (longer language codes would be cut).
    langcode =lang[-10:-8]
    pages[langcode] = pd.read_csv(lang)


In [46]:
# views maps a language code to the DataFrame returned by
# getViewsMultiplesWithRedirects for that language's page list.
views = {}
for langcode,data in pages.items():
    # Print the language code being processed (progress indicator).
    print(langcode)
    # The input CSV is expected to provide the titles in a 'page_title' column.
    page_list = data['page_title'].tolist()
    # The project name is built from the language code, ex: zh.wikipedia
    views[langcode] = getViewsMultiplesWithRedirects(pages=page_list ,start='20200101',end='20200825',project='%s.wikipedia' % langcode)
    # Write one CSV per language as soon as its views are available.
    views[langcode].to_csv('Views_nobots_%s.csv' % langcode)

zh


In [47]:
import pickle
# Store the whole views dict (language code -> DataFrame) in a single pickle file.
# NOTE(review): the file name reads 'wikpedia' -- confirm whether readers of this
# file expect exactly this spelling before changing it.
with open('wikpedia_lists_views.pickle','wb') as f:
    pickle.dump(views,f)