# Summary

Filter sessions and write them to a text file on disk.

### Imports etc.

In [8]:
import os, sys
import calendar
import datetime
import glob
import gzip
import shutil
import subprocess
import time
import warnings

import numpy as np
import pandas as pd

import wmfdata.spark as wmfspark

import findspark
findspark.init('/usr/lib/spark2')
from pyspark.sql import SparkSession
from pyspark.sql import functions as F, types as T, Window

In [9]:
# Extra Spark settings handed to the session. Left empty here; the
# commented-out "regular" / "big" presets further down in this cell can be
# substituted when more resources are needed.
spark_config = {}

# Python interpreters for the PySpark driver and workers.
os.environ['PYSPARK_DRIVER_PYTHON'] = 'notebook'
os.environ['PYSPARK_PYTHON'] = '/usr/bin/python3.5'

spark = wmfspark.get_session(
    app_name='Pyspark notebook',
    extra_settings=spark_config
)
# Bare expression so the notebook displays the session object.
spark

## regular

# spark_config = {
#     "spark.driver.memory": "2g",
#     "spark.dynamicAllocation.maxExecutors": 64,
#     "spark.executor.memory": "8g",
#     "spark.executor.cores": 4,
#     "spark.sql.shuffle.partitions": 256
# }
# ## big
# spark_config = {
#     "spark.driver.memory": "4g",
#     "spark.dynamicAllocation.maxExecutors": 128,
#     "spark.executor.memory": "8g",
#     "spark.executor.cores": 4,
#     "spark.sql.shuffle.partitions": 512
# }


In [31]:
import requests

def blacklist_qids_to_pageids(blacklist_qids,wiki):
    '''
    Transform a list of Wikidata QIDs into page ids for a given wiki.

    Two API calls are made: Wikidata's `wbgetentities` resolves each QID to
    the title of its sitelink on `wiki`, then that wiki's `query` endpoint
    resolves the titles to page ids.

    Parameters
    ----------
    blacklist_qids : list of str
        QIDs, e.g. ['Q5296']. All ids are sent in a single request. The
        original note here read "<30!" -- presumably the list should hold
        fewer than 30 ids (TODO confirm against the API's per-request limit).
    wiki : str
        Wiki database name, e.g. 'enwiki'. The API host is built by removing
        the substring 'wiki' from it ('enwiki' -> 'en.wikipedia.org').

    Returns
    -------
    list of int
        Page ids that could be resolved. The lookup is best effort: on any
        failure a warning is emitted and whatever was resolved so far
        (possibly an empty list) is returned.
    '''
    # nothing to look up -> no network traffic at all
    if not blacklist_qids:
        return []

    ## get the page-titles
    api_url_base = 'https://wikidata.org/w/api.php'
    params = {
                'action':'wbgetentities',
                'props':'sitelinks/urls',
                'languages':'en',
                'format' : 'json',
                'sitefilter':wiki,
                'ids':'|'.join(blacklist_qids),
            }
    blacklist_titles = []
    try:
        response = requests.get(api_url_base, params=params, timeout=30).json()
        if 'entities' in response:
            for qid, qid_dict in response['entities'].items():
                title = qid_dict.get('sitelinks',{}).get(wiki,{}).get('title','').replace(' ','_')
                if len(title)>0:
                    blacklist_titles.append(title)
    except Exception as err:
        # best effort: report the problem instead of hiding it, but do not crash
        warnings.warn('could not resolve QIDs to titles for %s: %r' % (wiki, err))

    # without titles there is nothing to ask the wiki for
    if not blacklist_titles:
        return []

    ## get the page-ids
    api_url_base = 'https://%s.wikipedia.org/w/api.php'%( wiki.replace('wiki','') )
    params = {
        "action": "query",
        "titles": "|".join([str(h) for h in blacklist_titles]),
        "prop": "pageprops",
        "format": "json",
    }
    blacklist_pageids = []
    try:
        response = requests.get(api_url_base, params=params, timeout=30).json()
        if 'query' in response:
            if 'pages' in response['query']:
                for page in response['query']['pages']:
                    page_id = int(page)
                    # missing/invalid titles are keyed by negative placeholder
                    # ids; those are not real pages
                    if page_id > 0:
                        blacklist_pageids.append(page_id)
    except Exception as err:
        warnings.warn('could not resolve titles to page ids for %s: %r' % (wiki, err))
    return blacklist_pageids


## defining filter and maps
def parse_requests(requests):
    """
    Initial parsing of one session's requests.

    Drops every request whose 'ts' field is None, because without a
    timestamp the request cannot be placed in order.

    Parameters
    ----------
    requests : iterable of mappings
        Each item must support item access with the key 'ts'.

    Returns
    -------
    list
        The requests that carry a timestamp, in their original order.
    """
    return [r for r in requests if r['ts'] is not None]

def filter_consecutive_articles(requests):
    """
    Collapse runs of consecutive requests for the same article.

    Looking at the data, many sessions request the same article twice in a
    row. That is not useful for training, so each run of identical
    'page_title' values is reduced to its first request.

    Parameters
    ----------
    requests : sequence of mappings
        Each item must support item access with the key 'page_title'.

    Returns
    -------
    list
        Requests with consecutive duplicates removed; an empty input gives
        an empty list.
    """
    collapsed = []
    for r in requests:
        # keep r only if it starts a new run, i.e. its title differs from the
        # last request that was kept
        if not collapsed or r['page_title'] != collapsed[-1]['page_title']:
            collapsed.append(r)
    return collapsed

def filter_blacklist_pageid(requests, black_list=None):
    """
    Predicate for dropping sessions that touch a blacklisted page.

    Parameters
    ----------
    requests : iterable of mappings
        Each item must support item access with the key 'page_id'.
    black_list : collection of page ids, optional
        Page ids that disqualify a session. When omitted, the module-level
        `blacklist_pageids` is used (in this notebook that is the Main Page
        only), so the function can still be passed directly to `.filter`.

    Returns
    -------
    bool
        False if any request's page id is in the blacklist, True otherwise.
    """
    if black_list is None:
        black_list = blacklist_pageids
    return not any(r['page_id'] in black_list for r in requests)
   

def sessionize(requests, dt = 3600):
    """
    Break a request stream into sessions wherever the gap between two
    successive requests is larger than dt seconds.

    Parameters
    ----------
    requests : sequence of mappings
        Each item must support item access with the key 'ts'; subtracting
        two 'ts' values must give a datetime.timedelta. Gaps are measured
        between neighbours in the given order -- presumably the requests
        arrive sorted by time (TODO confirm against the upstream job).
    dt : int or float
        Maximum allowed gap in seconds. Default is 3600s = 1 hour
        [from Halfaker et al. 2015].

    Returns
    -------
    list of list
        The sessions, in order. A gap of exactly dt does not split. An empty
        input gives an empty list.
    """
    if not requests:
        return []

    max_gap = datetime.timedelta(seconds=dt)
    sessions = []
    session = [requests[0]]
    for r in requests[1:]:
        if r['ts'] - session[-1]['ts'] > max_gap:
            # gap too large: close the running session and start a new one
            sessions.append(session)
            session = [r]
        else:
            session.append(r)

    # the session still being built is never closed inside the loop
    sessions.append(session)
    return sessions


## Process data for multiple days

In [32]:
# day_start = datetime.date(2020,4,1)
# day_end = datetime.date(2020,5,1)
# date_array = \
#     (day_start + datetime.timedelta(days=x) for x in range(0, (day_end-day_start).days))
# wiki = 'simplewiki'

# Days to process: day_start inclusive, day_end exclusive.
day_start = datetime.date(2020,4,1)
day_end = datetime.date(2020,4,8)
# A list rather than a generator: a generator would be exhausted after the
# first pass, so re-running the processing cell would silently do nothing.
date_array = [
    day_start + datetime.timedelta(days=x)
    for x in range((day_end-day_start).days)
]
wiki = 'enwiki'

# Sessions containing any of these Wikidata items are dropped by
# filter_blacklist_pageid.
blacklist_qids = ['Q5296',]
blacklist_pageids = blacklist_qids_to_pageids(blacklist_qids,wiki)
print(blacklist_pageids)
if blacklist_qids and not blacklist_pageids:
    # the lookup is best effort; an empty result disables the blacklist filter
    print('WARNING: no page ids resolved for %s on %s'%(blacklist_qids,wiki))

dt = 3600      # maximum gap inside a session, in seconds (see sessionize)
nlen_min = 2   # minimum number of pages in a kept session
nlen_max = 30  # maximum number of pages in a kept session

[15580374]


In [33]:
PATH_OUT = '/home/mgerlach/REPOS/reader-embedding/output/sessions/'
## hdfs-storing
base_dir_hdfs = '/user/mgerlach/sessions'
## local storing
base_dir_local =  PATH_OUT

## concatenate the page ids of a session into a single space-separated string
to_str = lambda x: ' '.join([str(e['page_id']) for e in x])

for date_object in date_array:
    day_str = date_object.strftime("%Y-%m-%d")
    print(day_str)
    filename= '/user/mgerlach/webrequest/sessions_filtered_%s_%s.parquet'%(wiki,day_str)
    filename_save = 'sessions-filtered-pageid_%s_%s_dt%s_nmin%s_nmax%s'%(wiki,day_str,dt,nlen_min,nlen_max)

    output_hdfs_dir = os.path.join(base_dir_hdfs,filename_save)
    output_local_dir_tmp = os.path.join(base_dir_local,'tmp',filename_save)
    output_local_file = os.path.join(base_dir_local,filename_save)

    ## clear leftovers of an earlier run; -f keeps the exit status at 0 when
    ## there is nothing to delete, so check=True only trips on real failures
    subprocess.run(['hadoop','fs','-rm','-r','-f',output_hdfs_dir], check=True)
    shutil.rmtree(output_local_dir_tmp, ignore_errors=True)
    ## make sure the parent of the tmp-dir exists before copying into it
    os.makedirs(os.path.dirname(output_local_dir_tmp), exist_ok=True)

    ## load data
    ## NOTE: deliberately not named `requests` -- that would rebind the
    ## notebook-global `requests` module used by blacklist_qids_to_pageids
    sessions_rdd = spark.read.load(filename).rdd.map(lambda x: x['session'])

    ## keep only requests that have a page id and belong to the selected wiki
    sessions_rdd = sessions_rdd.map(
        lambda rs: [r for r in rs if r['page_id'] is not None and r['project'] == wiki]
    )

    (sessions_rdd
     .map(parse_requests)
     .filter(filter_blacklist_pageid) ## remove main_page
     .filter(lambda x: len(x)>=nlen_min) ## only sessions with at least length nlen_min
     .map(filter_consecutive_articles) ## remove consecutive calls to same article
     ## only sessions with at least length nlen_min; this was `>` before, which
     ## contradicted the comment and the other two length filters
     .filter(lambda x: len(x)>=nlen_min)
     .flatMap(lambda x: sessionize(x, dt = dt)) ## break sessions if interevent time is too large
     .filter(lambda x: len(x)>=nlen_min) ## only sessions with at least length nlen_min
     .filter(lambda x: len(x)<=nlen_max) ## only sessions with at most length nlen_max
     .map(to_str) ## concatenate session as single string
     ## write to hdfs
     .saveAsTextFile(output_hdfs_dir,compressionCodecClass = "org.apache.hadoop.io.compress.GzipCodec")
    )

    ## copy to local (set of tmp-dirs)
    subprocess.run(['hadoop','fs','-copyToLocal',output_hdfs_dir,output_local_dir_tmp], check=True)
    ## concatenate and unzip into single file
    with open(output_local_file,'wb') as f_out:
        for part in sorted(glob.glob(os.path.join(output_local_dir_tmp,'part-*'))):
            with gzip.open(part,'rb') as f_in:
                shutil.copyfileobj(f_in,f_out)
    ## remove set of tmp-dirs
    shutil.rmtree(output_local_dir_tmp)
    ## remove hadoop data
    subprocess.run(['hadoop','fs','-rm','-r','-f',output_hdfs_dir], check=True)

2020-04-01
2020-04-02
2020-04-03
2020-04-04
2020-04-05
2020-04-06
2020-04-07
