# Summary

Filter sessions and write them to a text file on disk.

### Imports etc.

In [5]:
import os, sys
import numpy as np
import datetime
import calendar
import time
import pandas as pd
import wmfdata.spark as wmfspark

import findspark
findspark.init('/usr/lib/spark2')
from pyspark.sql import SparkSession
from pyspark.sql import functions as F, types as T, Window

In [6]:
## Extra Spark settings handed to the session. Left empty here; to tune
## resources, assign one of the commented-out presets further down in this
## cell to spark_config *before* get_session is called.
spark_config = {}

## Python executables picked up by PySpark for the driver and the workers.
os.environ['PYSPARK_DRIVER_PYTHON'] = 'notebook'
os.environ['PYSPARK_PYTHON'] = '/usr/bin/python3.5'

spark = wmfspark.get_session(
    app_name='Pyspark notebook',
    extra_settings=spark_config
)
## bare expression: display the session object as the cell output
spark

## regular

# spark_config = {
#     "spark.driver.memory": "2g",
#     "spark.dynamicAllocation.maxExecutors": 64,
#     "spark.executor.memory": "8g",
#     "spark.executor.cores": 4,
#     "spark.sql.shuffle.partitions": 256
# }
# ## big
# spark_config = {
#     "spark.driver.memory": "4g",
#     "spark.dynamicAllocation.maxExecutors": 128,
#     "spark.executor.memory": "8g",
#     "spark.executor.cores": 4,
#     "spark.sql.shuffle.partitions": 512
# }


In [7]:
## defining filter and maps
def parse_requests(requests):
    """
    Do some initial parsing of the requests of one session:
    - drop requests without a timestamp (we cannot know where they
      belong in the order of the session)

    requests: iterable of request records that support r['ts']
    returns:  new list holding the requests whose 'ts' is not None,
              in their original order (the input is not modified)
    """
    ## `is not None` is an identity test; `== None` can be answered
    ## arbitrarily by the compared object's __eq__.
    return [r for r in requests if r['ts'] is not None]

def filter_consecutive_articles(requests):
    """
    Looking at the data, there are a lot of sessions with the same
    article requested several times in a row. This does not make sense
    for training, so each run of consecutive requests with the same
    'page_title' is collapsed into its first request.

    requests: iterable of request records that support r['page_title']
    returns:  new list with consecutive duplicates removed; an empty
              input yields an empty list (instead of an IndexError)
    """
    clean_rs = []
    for r in requests:
        ## keep r if it is the very first request or if its title differs
        ## from the title of the last request we kept
        if not clean_rs or r['page_title'] != clean_rs[-1]['page_title']:
            clean_rs.append(r)
    return clean_rs

def filter_blacklist_qid(requests, black_list=frozenset(['Q5296'])):
    """
    Predicate for dropping sessions that touch a blacklisted article.

    requests:   iterable of request records that support r['qid']
    black_list: collection of qids that disqualify a session. The
                default contains only 'Q5296' (the Main Page). The
                default is an immutable frozenset, so it is safe to
                share between calls.
    returns:    False if any request's qid is in black_list (the session
                should be dropped), True otherwise (including for an
                empty session)
    """
    return not any(r['qid'] in black_list for r in requests)
   

def sessionize(requests, dt = 3600):
    """
    Break a request stream whenever there is a gap larger than
    dt [secs] between two consecutive requests.
    default is 3600s=1hour [from Halfaker et al. 2015]

    requests: list of request records whose r['ts'] values can be
              subtracted to give a datetime.timedelta
              NOTE(review): the gap test only makes sense if requests
              are ordered by 'ts' -- confirm that upstream sorts them.
    dt:       maximum gap in seconds that keeps two consecutive requests
              in the same session (a gap of exactly dt does not split)
    returns:  list of sessions, each a non-empty list of requests; an
              empty input yields an empty list (instead of an IndexError)
    """
    if len(requests) == 0:
        return []

    ## loop-invariant threshold, built once
    max_gap = datetime.timedelta(seconds=dt)

    sessions = []
    session = [requests[0]]
    for r in requests[1:]:
        ## compare against the last request of the session being built
        if r['ts'] - session[-1]['ts'] > max_gap:
            sessions.append(session)
            session = [r]
        else:
            session.append(r)

    ## flush the session that was still open when the stream ended
    sessions.append(session)
    return sessions


## Process data for multiple days

In [4]:
## Date range to process: day_start is the first day, day_end is exclusive.
day_start = datetime.date(2020,4,27)
day_end = datetime.date(2020,5,4)
date_array = [
    day_start + datetime.timedelta(days=x) for x in range((day_end-day_start).days)
]

dt = 3600 ## max. gap [secs] between two requests of the same session
nlen_min = 2 ## min. number of requests per session
nlen_max = 30 ## max. number of requests per session
lang = 'simplewiki'
# lang = 'wikidata'


PATH_OUT = '/home/mgerlach/REPOS/reader-embedding/output/sessions/'

## hdfs-storing
base_dir_hdfs = '/user/mgerlach/sessions'
## local storing
base_dir_local =  PATH_OUT


def _run_checked(cmd):
    """Run cmd through the shell and raise RuntimeError on a non-zero exit status."""
    status = os.system(cmd)
    if status != 0:
        raise RuntimeError('command failed (status %s): %s'%(status,cmd))


## session -> one line of space-separated qids (identical for every lang)
to_str = lambda x: ' '.join([str(e['qid']) for e in x])

for date_object in date_array:
    day_str = date_object.strftime("%Y-%m-%d")#str(datetime.date(year,month,day))
    print(day_str)
    filename= '/user/mgerlach/webrequest/sessions_filtered_qids_%s.parquet'%(day_str)
    filename_save = 'sessions-filtered-qid_%s_%s_dt%s_nmin%s_nmax%s'%(lang,day_str,dt,nlen_min,nlen_max)

    output_hdfs_dir = os.path.join(base_dir_hdfs,filename_save)
    output_local_dir_tmp = os.path.join(base_dir_local,'tmp',filename_save)
    output_local_file = os.path.join(base_dir_local,filename_save)

    ## best effort: remove output of a previous run (fails harmlessly if absent)
    os.system('hadoop fs -rm -r %s'%output_hdfs_dir)

    ## load data
    requests = spark.read.load(filename).rdd.map(lambda x: x['session'])

    ## keep only pageviews from a language
    if lang == 'wikidata':
        ## all projects: only require that the pageview has a qid
        requests = requests.map(lambda rs: [r for r in rs if r['qid'] is not None])
    else:
        requests = requests.map(lambda rs: [r for r in rs if r['qid'] is not None and r['project'] == lang])

    (requests
     .map(parse_requests)
     .filter(filter_blacklist_qid) ## drop sessions containing a blacklisted article (main_page)
     .filter(lambda x: len(x)>=nlen_min) ## only sessions with at least length nlen_min
     .map(filter_consecutive_articles) ## remove consecutive calls to same article
     ## was `>`, which dropped sessions of exactly nlen_min articles although
     ## the same check before and after uses `>=`
     .filter(lambda x: len(x)>=nlen_min) ## only sessions with at least length nlen_min
     .flatMap(lambda x: sessionize(x, dt = dt)) ## break sessions if interevent time is too large
     .filter(lambda x: len(x)>=nlen_min) ## only sessions with at least length nlen_min
     .filter(lambda x: len(x)<=nlen_max) ## only sessions with at most length nlen_max
     .map(to_str) ## concatenate session as single string
     ## write to hdfs
     .saveAsTextFile(output_hdfs_dir,compressionCodecClass = "org.apache.hadoop.io.compress.GzipCodec")
    )

    ## make sure the parent of the tmp-dir exists and that no stale tmp-dir
    ## from an interrupted run gets mixed into the copy
    os.makedirs(os.path.dirname(output_local_dir_tmp), exist_ok=True)
    os.system('rm -rf %s'%output_local_dir_tmp)
    ## copy to local (set of tmp-dirs); checked, because the hdfs copy is
    ## deleted below and a silent failure here would lose the day's result
    _run_checked('hadoop fs -copyToLocal %s %s'%(output_hdfs_dir,output_local_dir_tmp))
    ## concatenate and unzip into single file
    _run_checked('cat %s/* | gunzip > %s'%(output_local_dir_tmp,output_local_file))
    ## remove set of tmp-dirs
    os.system('rm -rf %s'%output_local_dir_tmp)
    ## remove hadoop data
    os.system('hadoop fs -rm -r %s'%output_hdfs_dir)

2020-04-27
2020-04-28
2020-04-29
2020-04-30
2020-05-01
2020-05-02
2020-05-03


KeyboardInterrupt: 