### Data Extraction - Wikipedia Views
##### a) Imports 
Let's start by importing some packages that we'll need...

In [None]:
import pandas as pd
import time
import sys
import pickle
import copy

# File utilities
from pathlib import Path
from gzip import GzipFile
from bz2file import BZ2File
import wget

# parsing
import re
from urllib.parse import unquote
from typing import Dict, Set, List, Tuple

Now we'll download the pageview files...

In [None]:
# Which wiki to work with, and where raw downloads / processed data live.
wiki = "simple"
rawdatadir = ""  # "../rawdata/"
datadir = ""  # "../data/"

# Base URL of the latest dump for the selected wiki, and the dump files we
# want from it, keyed by a short label.
wikidump = f"https://dumps.wikimedia.org/{wiki}wiki/latest/"
filenames = {"article": f"{wiki}wiki-latest-pages-articles.xml.bz2"}

def download_pageviews(*, start_year, end_year, start_month=1, end_month=12):
    """Download monthly page view dumps that are not already on disk.

    Every year in ``start_year..end_year`` (inclusive) is visited, and within
    each year the months ``start_month..end_month`` (inclusive). Note that the
    same month window is applied to every year; it is not a contiguous range
    running from ``start_year``/``start_month`` to ``end_year``/``end_month``.

    Files are stored in the module-level ``rawdatadir``. When a month's view
    file is missing, the article dump named by ``wikidump`` and
    ``filenames['article']`` is fetched as well, unless it is already present.

    Args:
        start_year: First year to download.
        end_year: Last year to download (inclusive).
        start_month: First month (1-12) to download within each year.
        end_month: Last month (1-12, inclusive) to download within each year.
    """
    views = "https://dumps.wikimedia.org/other/pageview_complete/monthly/"
    # Build paths with pathlib: string concatenation with "/" turns an empty
    # rawdatadir into the filesystem root, so the existence check never matched.
    target_dir = Path(rawdatadir)

    for y in range(start_year, end_year + 1):
        year_str = str(y)
        for m in range(start_month, end_month + 1):
            month_str = str(m).zfill(2)
            print("\nDownloading views:", year_str + month_str)
            filename = "pageviews-" + year_str + month_str + "-user.bz2"
            url = views + year_str + "/" + year_str + "-" + month_str + "/" + filename

            if (target_dir / filename).exists():
                print("View file already downloaded")
                continue

            # wget saves a numbered duplicate when the target already exists,
            # so only fetch the article dump the first time it is needed.
            if not (target_dir / filenames['article']).exists():
                wget.download(wikidump + filenames['article'], rawdatadir)
            wget.download(url, out=rawdatadir)
# Fetch the page view dump for January 2016 only.
download_pageviews(
    start_year=2016,
    end_year=2016,
    start_month=1,
    end_month=1,
)

Moving on to pageviews data, a well-formed line of data has 6 elements separated by a space:
1. Domain
2. Title
3. Page ID
4. Platform
5. Pageviews
6. Hourly counts

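We don't need the hourly counts, but here is a quick explanation of the format: each letter is an hour of the day (A = hour 0, B = hour 1, ...) followed by the number of views in that hour, so "P1R3" means 1 view in hour 15 ("P1") and 3 views in hour 17 ("R3").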
Unfortunately, the data is messy, so we can't always rely on lines being well-formed. Here's a class that parses a line of pageview data. Note that titles need to be URL-decoded twice to recover non-ASCII characters (as in the `html_encoded` test case below).

In [None]:
class ViewData:
    """One parsed line of page view data.

    A well-formed line has six space-separated fields: domain, title, page id,
    platform, view count and hourly counts. Lines with more than six fields
    are treated as having spaces inside the title; the last four fields are
    still read from the right-hand end. Lines that cannot be parsed keep the
    defaults (title ``"None"``, pageid ``"-1"``, zero views) and are
    optionally reported to a log file.

    Attributes are documented here rather than inline because the class uses
    ``__slots__``:
        line:     the input line after double URL-decoding.
        log:      path of the error log, or None to disable logging.
        wikicode: the part of the domain before the first "." (e.g. "en").
        title:    page title, or "None" if the line was too short.
        pageid:   page id as a string ("-1" if the line was too short).
        views:    total views on this line.
        desktop:  views when the platform is "desktop", else 0.
        mobile:   views when the platform is anything else, else 0.
    """

    __slots__ = ("line", "log", "wikicode", "title", "pageid", "views", "desktop", "mobile")

    def __init__(self, line, log=None):
        """Decode and parse ``line``; append parse errors to ``log`` if given.

        Args:
            line: A raw line of page view data, as ``str`` or ``bytes``.
            log: Optional path of a file to which unparsable lines are appended.
        """
        # Lines read from a binary stream arrive as bytes; decode them here so
        # parsing does not depend on unquote() accepting bytes (Python 3.9+).
        if isinstance(line, bytes):
            line = line.decode("utf-8", errors="replace")
        # Titles are percent-encoded twice in the data, hence the double unquote.
        self.line = unquote(unquote(line))
        self.log = log
        self.wikicode = ""
        self.title = "None"
        self.pageid = "-1"
        self.views = 0
        self.desktop = 0
        self.mobile = 0
        self.parse()

    def __str__(self):
        """Return a compact ``wikicode:title(pageid) D<desktop> M<mobile>`` summary."""
        return f"{self.wikicode}:{self.title}({self.pageid}) D{self.desktop} M{self.mobile}"

    def log_error(self, code, output):
        """Append ``code<TAB>output`` as one line to the log file, if logging is on.

        Args:
            code: Short error code ("S" = too few fields, "L" = bad view count).
            output: The offending line.
        """
        if self.log:
            # Lines may contain non-ASCII titles, so fix the encoding explicitly.
            with open(self.log, "a", encoding="utf-8") as logfile:
                logfile.write(code + "\t" + output)
                # Keep one entry per line even when the input had no newline.
                if not output.endswith("\n"):
                    logfile.write("\n")

    def parse(self):
        """Populate the fields from ``self.line``, logging it if it is malformed."""
        split = self.line.split(" ")
        # Take the whole project code ("simple", not "si") from the domain.
        self.wikicode = split[0].split(".", 1)[0]

        if len(split) < 6:
            self.log_error("S", self.line)
            return

        # Index from the right so that a title containing spaces does not shift
        # the page id, platform and view count; whatever sits between the
        # domain and those fields is the title.
        self.title = " ".join(split[1:-4])
        self.pageid = split[-4]
        platform = split[-3]
        try:
            views = int(split[-2])
        except ValueError:
            self.log_error("L", self.line)
            return

        self.views = views
        if platform == "desktop":
            self.desktop = views
        else:
            self.mobile = views
                

Now, we'll validate that this parses some test cases as expected:

In [None]:
# Sample lines covering the shapes of data the parser has to cope with.
good_data = "en.wikipedia 2010_world_cup_matches 168079 desktop 2 B2"
missing_page_id = "en.wikipedia Algorithms null desktop 8 A2B1G1K1R1Z1[1"
html_encoded = "en.wikipedia %25C3%2596zel:Ara 41416740 desktop 4 P1R3"
non_article = "en.wikipedia Category:Belgian_wine 24687607 mobile-web 6 D2T2]1_1"
space_in_title = "en.wikipedia Deep belief network 41416740 desktop 1 C1"
missing_data = "en.wikipedia Concept_map"

# Parse every sample in turn and print its summary for a visual check.
for sample in (
    good_data,
    missing_page_id,
    html_encoded,
    non_article,
    space_in_title,
    missing_data,
):
    print(ViewData(sample))

We'll process the views data file. We'll only keep the data if it's a valid article (based on the pageid). This will throw away pages like the category page in the example above. We'll also check that the number of views is greater than zero. 

In [None]:
def process_pageviews(*, log = None, writemode = "a", pagefile = "pageviews.tsv.bz2", projectfile = "projectviews.tsv", 
                      wikis, start_year:int, end_year:int, start_month:int, end_month:int, rawdatadir:str, datadir:str):
    """Extract per-page and per-project view counts from monthly view dumps.

    For each requested month the dump ``pageviews-YYYYMM-user.bz2`` is read
    from ``rawdatadir`` once per wiki. Every line with a positive view count
    adds to that wiki's monthly total; lines whose page id is in the wiki's
    ``<code>wiki-title-pageid.pickle`` file are also written to ``pagefile``.

    Args:
        log: Optional path passed to ``ViewData`` for logging unparsable lines.
        writemode: "w" starts fresh output files with a header row; "a"
            appends to existing files without a header.
        pagefile: Name of the bz2-compressed per-page output file (columns
            wiki, year, month, pageid, desktop, mobile).
        projectfile: Name of the per-project output file (columns wiki, year,
            month, views).
        wikis: Iterable of domain prefixes to extract, e.g. "en.wikipedia".
        start_year, end_year: Inclusive range of years to process.
        start_month, end_month: Inclusive range of months processed within
            each year (the same window is applied to every year).
        rawdatadir: Directory prefix (with trailing separator) holding the
            dumps and the pickle files.
        datadir: Directory prefix (with trailing separator) for the outputs.
    """
    with BZ2File(datadir + pagefile, writemode) as outfile, open(datadir + projectfile, writemode) as sumfile:
        # Write the header once per output file, not once per month, so that
        # multi-month runs do not embed header rows in the data.
        if writemode == "w":
            outfile.write("wiki\tyear\tmonth\tpageid\tdesktop\tmobile\n".encode())
            sumfile.write("wiki\tyear\tmonth\tviews\n")

        for year in range(start_year, end_year + 1):
            for month in range(start_month, end_month + 1):
                period = str(year) + str(month).zfill(2)
                infile_name = rawdatadir + "pageviews-" + period + "-user.bz2"
                with BZ2File(infile_name) as infile:
                    print("Processing views:", period)
                    for wiki in wikis:
                        wikiviews = 0
                        # NOTE(review): pickle.load executes arbitrary code
                        # from the file; only use pickles generated locally.
                        with open(rawdatadir + wiki.split(".", 1)[0] + 'wiki-title-pageid.pickle', 'rb') as handle:
                            data = pickle.load(handle)
                        # Presumably a title -> page id mapping whose ids are
                        # strings, as ViewData.pageid is; verify against the
                        # code that writes the pickle.
                        pages = set(data.values())
                        # Match on raw bytes: avoids decoding every line of the
                        # dump and cannot fail on malformed UTF-8.
                        prefix = wiki.encode()
                        row_key = str(year) + "\t" + str(month)

                        infile.seek(0, 0)  # rescan the dump for each wiki
                        for line in infile:
                            if not line.startswith(prefix):
                                continue
                            viewdata = ViewData(line.decode("utf-8", errors="replace"), log)
                            if viewdata.views <= 0:
                                continue
                            wikiviews += viewdata.views
                            if viewdata.pageid and viewdata.pageid in pages:
                                key = viewdata.wikicode + "\t" + row_key
                                views = viewdata.pageid + "\t" + str(viewdata.desktop) + "\t" + str(viewdata.mobile)
                                outfile.write((key + "\t" + views + "\n").encode())
                        sumfile.write(wiki + "\t" + row_key + "\t" + str(wikiviews) + "\n")

Run it

In [None]:
# Build the January 2016 extracts for the three wikis of interest, starting
# fresh output files (writemode "w" also writes the header rows).
process_pageviews(
    writemode="w",
    wikis=("en.wikipedia", "fr.wikipedia", "simple.wikipedia"),
    start_year=2016,
    end_year=2016,
    start_month=1,
    end_month=1,
    rawdatadir="../rawdata/",
    datadir="../data/",
)

# Column types for the per-page output file.
column_types = {
    "wiki": "str",
    "year": "int16",
    "month": "int8",
    "pageid": "int32",
    "desktop": "int32",
    "mobile": "int32",
}

# Read only the first 100 rows of the per-page file as a sanity check.
views = pd.read_table(
    "../data/pageviews.tsv.bz2",
    dtype=column_types,
    keep_default_na=False,
    na_values=['_'],
    quoting=3,  # 3 is csv.QUOTE_NONE
    iterator=True,
).get_chunk(100)
print(views.head())

# Show the per-project totals (displayed as the cell's result).
pd.read_table("../data/projectviews.tsv")