# Summary

filter and sessionize the reading-sessions extracted from webrequest
- map to qid or pageid and remove pageviews that do not have an id
    - if lang=='wikidata': keep pageviews from all projects which have a qid
    - if lang=='*wiki' (e.g. enwiki): keep pageviews from a single project which have a pageid.
- keep pageview only if it has a timestamp
- collapse pageviews if same page was viewed consecutively
- remove sessions which contain the main-page
- split sessions if the inter-event time is larger than 1 hour
- keep only sessions with >= 2 and <= 30 pageviews

Output is a file on disk 
    - PATH_OUT/reading-sessions-filtered_<LANG>_<DAY>
        - <DAY> = 2020-05-01
        - <LANG> = wikidata, enwiki, dewiki, ...
    - this is a textfile where each line is a session and pageviews in the session are separated by whitespace
        - if <LANG> == 'wikidata' a succession of qids: e.g. Q123 Q2347523 Q23452354 \n
        - if <LANG> == '*wiki' a succession of pageids: 212375 123621 12 123 \n
    
As input we need the parquet-files generated in reading-sessions_01-get-data-from-webrequest.
    
    
Required packages:
    - findspark
    - wmfdata

### Imports etc.

In [1]:
import os, sys
import datetime
import calendar
import time
import wmfdata.spark as wmfspark

import findspark
findspark.init('/usr/lib/spark2')
from pyspark.sql import SparkSession
from pyspark.sql import functions as F, types as T, Window
import wmfdata.spark as wmfspark

You are using wmfdata v1.0.1, but v1.0.2 is available.

To update, run `pip install --upgrade git+https://github.com/neilpquinn/wmfdata/wmfdata.git@release`.

To see the changes, refer to https://github.com/neilpquinn/wmfdata/blob/release/CHANGELOG.md


In [2]:
# Extra Spark settings handed to wmfspark.get_session below; left empty here.
spark_config = {}
# Example of explicit settings that can be used instead of the empty dict:
# spark_config = {
#     "spark.driver.memory": "2g",
#     "spark.dynamicAllocation.maxExecutors": 64,
#     "spark.executor.memory": "8g",
#     "spark.executor.cores": 4,
#     "spark.sql.shuffle.partitions": 256
# }
# Environment variables set before the session is created
# (presumably read by PySpark to select the Python executables -- TODO confirm).
os.environ['PYSPARK_DRIVER_PYTHON'] = 'notebook'
os.environ['PYSPARK_PYTHON'] = '/usr/bin/python3.5'

# Create the Spark session through wmfdata, passing the settings defined above.
spark = wmfspark.get_session(
    app_name='Pyspark notebook', 
    extra_settings=spark_config
)
# Bare expression: as the last statement of the notebook cell it displays the session object.
spark

In [3]:
## defining filter and maps
def parse_requests(requests):
    """
    Do some initial parsing of a session:
    - drop pageviews without timestamp (we don't know where they belong
      in the order of the session)

    :param requests: iterable of pageviews; each one must support r['ts']
    :return: new list with the pageviews whose 'ts' is not None,
        in their original order
    """
    # identity test (`is not None`) is the idiomatic None check and cannot be
    # fooled by objects that overload ==
    return [r for r in requests if r['ts'] is not None]

def filter_consecutive_articles(requests):
    """
    Looking at the data, there are a lot of
    sessions with the same article
    requested 2 times in a row. This
    does not make sense for training, so
    lets collapse them into 1 request.
    We compare page-ids and not:
    - page-titles due to redirects (page-title keeps the redirect-from, page-id resolves redirect)
    - qids to capture possible language switching.

    :param requests: iterable of pageviews; each one must support r['page_id']
    :return: new list in which every run of consecutive pageviews with the
        same 'page_id' is reduced to the first pageview of that run;
        an empty input gives an empty list
    """
    clean_rs = []
    for r in requests:
        # skip r if it repeats the page of the last pageview we kept;
        # the `clean_rs and` guard makes the first pageview always pass
        if clean_rs and r['page_id'] == clean_rs[-1]['page_id']:
            continue
        clean_rs.append(r)
    return clean_rs

def filter_blacklist_qid(requests):
    """
    Predicate deciding whether a session is kept.

    A session is dropped (False) as soon as one of its pageviews
    has a qid from the blacklist; otherwise it is kept (True).
    The blacklist currently holds only the Main Page.
    """
    black_list = {'Q5296'}
    hits_blacklist = any(pageview['qid'] in black_list for pageview in requests)
    return not hits_blacklist
   

def sessionize(requests, dt = 3600):
    """
    Break request stream whenever
    there is a gap larger than dt [secs] in requests.
    default is 3600s=1hour [from Halfaker et al. 2015]

    :param requests: list of pageviews; the difference of two r['ts'] values
        must be comparable with a datetime.timedelta.
        NOTE(review): assumes the pageviews are already ordered by 'ts' --
        nothing here sorts them, confirm against the upstream extraction.
    :param dt: largest gap in seconds that still keeps two consecutive
        pageviews in the same session (a gap of exactly dt does not split)
    :return: list of sessions, each a non-empty list of pageviews;
        an empty input gives an empty list
    """
    # an empty stream has no sessions (previously this raised IndexError)
    if not requests:
        return []

    # loop-invariant, so build the threshold only once
    max_gap = datetime.timedelta(seconds=dt)

    sessions = []
    session = [requests[0]]
    for r in requests[1:]:
        # gap is measured against the last pageview of the current session
        if r['ts'] - session[-1]['ts'] > max_gap:
            sessions.append(session)
            session = [r]
        else:
            session.append(r)

    # the session still being built is never empty at this point
    sessions.append(session)
    return sessions


## Process data multiple days

In [4]:
## select time interval (daily); day_start is included, day_end is excluded
day_start = datetime.date(2020,4,1)
day_end = datetime.date(2020,4,8)
date_array = \
    (day_start + datetime.timedelta(days=x) for x in range(0, (day_end-day_start).days))

## select language
# lang = 'wikidata' ## wikidata aggregates all wiki (qids)
lang = 'enwiki' ## individual language keeps only one project (pageids)

dt = 3600 ## cutoff [secs] for splitting sessions (interevent time between 2 pageviews)
nlen_min = 2 ## min length of session
nlen_max = 30 ## max length of session

## folder to the parquet-files from the extracted webrequest data
PATH_IN = '/user/mgerlach/webrequest/'
## folder where to save the filtered reading sessions
PATH_OUT = '/home/mgerlach/REPOS/reader-embedding/output/reading-sessions-filtered/'

## hdfs-storing, some temporary files which will be deleted later
base_dir_hdfs = '/user/mgerlach/sessions'
## local storing
base_dir_local = PATH_OUT
## make sure the local tmp-folder exists before anything is copied into it
os.makedirs(os.path.join(base_dir_local,'tmp'), exist_ok=True)

for date_object in date_array:
    t1 = time.time()
    day_str = date_object.strftime("%Y-%m-%d")
    print(lang, day_str)
    filename = os.path.join(PATH_IN,'reading-sessions-webrequest_%s.parquet'%(day_str))
    ## note the order in the name of the output: <LANG>_<DAY>
    filename_save = 'reading-sessions-filtered_%s_%s'%(lang,day_str)

    output_hdfs_dir = os.path.join(base_dir_hdfs,filename_save)
    ## best effort: remove leftovers of an earlier run
    ## (presumably because saveAsTextFile does not overwrite an existing folder)
    os.system('hadoop fs -rm -r %s'%output_hdfs_dir)
    output_local_dir_tmp = os.path.join(base_dir_local,'tmp',filename_save)
    output_local_file = os.path.join(base_dir_local,filename_save)

    ## load data; each row holds one session (list of pageviews)
    requests = spark.read.load(filename).rdd.map(lambda x: x['session'])

    ## keep only pageviews from a language
    if lang == 'wikidata':
        ## all projects, but only pageviews that could be mapped to a qid
        requests = requests.map(lambda rs: [r for r in rs if r['qid'] is not None])
        to_str = lambda x: ' '.join([str(e['qid']) for e in x])
    else:
        ## a single project, only pageviews which have a page_id
        requests = requests.map(lambda rs: [r for r in rs if r['page_id'] is not None and r['project'] == lang])
        to_str = lambda x: ' '.join([str(e['page_id']) for e in x])

    (requests
     .map(parse_requests)
     .filter(filter_blacklist_qid) ## remove main_page
     ## only sessions with at least length nlen_min
     ## (this also guarantees non-empty input for the next step)
     .filter(lambda x: len(x)>=nlen_min)
     .map(filter_consecutive_articles) ## remove consecutive calls to same article
     ## only sessions with at least length nlen_min; this was a strict '>' before,
     ## which wrongly dropped sessions with exactly nlen_min pageviews
     .filter(lambda x: len(x)>=nlen_min)
     .flatMap(lambda x: sessionize(x, dt = dt)) ## break sessions if interevent time is too large
     .filter(lambda x: len(x)>=nlen_min) ## only sessions with at least length nlen_min
     .filter(lambda x: len(x)<=nlen_max) ## only sessions with at most length nlen_max
     .map(to_str) ## concatenate session as single string
     ## write to hdfs
     .saveAsTextFile(output_hdfs_dir,compressionCodecClass = "org.apache.hadoop.io.compress.GzipCodec")
    )

    ## copy to local (set of tmp-dirs)
    os.system('hadoop fs -copyToLocal %s %s'%(output_hdfs_dir,output_local_dir_tmp))
    ## concatenate and unzip into single file
    os.system('cat %s/* | gunzip > %s'%(output_local_dir_tmp,output_local_file))
    ## remove set of tmp-dirs
    os.system('rm -rf %s'%output_local_dir_tmp)
    ## remove hadoop data
    os.system('hadoop fs -rm -r %s'%output_hdfs_dir)

    t2 = time.time()
    print('done in [s]: %.2f'%(t2-t1))
print('finished')

2020-04-01
done in [s]: 168.04
2020-04-02
done in [s]: 168.87
2020-04-03
done in [s]: 154.69
2020-04-04
done in [s]: 145.23
2020-04-05
done in [s]: 158.93
2020-04-06
done in [s]: 154.14
2020-04-07
done in [s]: 133.10
finished
