# Summary

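Get reading sessions from the webrequest logs, one day at a time (all wikipedias; the code below is currently restricted to a single wiki via the `wiki` variable).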

Filtering steps:
- take all pageviews from wikipedias for a given day
- remove bots by keeping only agent_type = user
- remove app traffic (keep only desktop and mobile web)
- remove all sessions that involve an edit attempt
- remove all sessions with more than 100 pageviews (to avoid bot traffic)
- keep only pageviews to the main namespace
- join wikidata-items
- aggregate so that each row of the dataframe is one session
- save to parquet

### Imports etc.

In [1]:
import os, sys
import numpy as np
import datetime
import calendar
import time
import pandas as pd
import string
import random

import findspark
findspark.init('/usr/lib/spark2')
from pyspark.sql import SparkSession
from pyspark.sql import functions as F, types as T, Window
import wmfdata.spark as wmfspark


In [2]:
# Extra Spark settings handed to the session below; an empty dict passes no
# overrides.
spark_config = {}

# Interpreter settings read by PySpark: the driver-side Python and the Python
# binary used to run PySpark code (see the Spark configuration docs).
os.environ['PYSPARK_DRIVER_PYTHON'] = 'notebook'
os.environ['PYSPARK_PYTHON'] = '/usr/bin/python3.5'

# Create (or fetch) the Spark session through the wmfdata helper.
spark = wmfspark.get_session(
    app_name='Pyspark notebook',
    extra_settings=spark_config
)
# Bare expression so the notebook displays the session object.
spark

## regular (alternative spark_config presets, kept for reference)

# spark_config = {
#     "spark.driver.memory": "2g",
#     "spark.dynamicAllocation.maxExecutors": 64,
#     "spark.executor.memory": "8g",
#     "spark.executor.cores": 4,
#     "spark.sql.shuffle.partitions": 256
# }
# ## big
# spark_config = {
#     "spark.driver.memory": "4g",
#     "spark.dynamicAllocation.maxExecutors": 128,
#     "spark.executor.memory": "8g",
#     "spark.executor.cores": 4,
#     "spark.sql.shuffle.partitions": 512
# }


## Load data

In [4]:
## WEBREQUEST
# Dataframe over the `wmf.webrequest` table; every query below starts from it.
WEBREQUEST_TABLE = 'wmf.webrequest'
df_webreq = spark.read.table(WEBREQUEST_TABLE)
# df_webreq.printSchema()

In [5]:
## Run parameters for the multi-day extraction

# Wiki database name, e.g. 'enwiki'.
wiki = 'enwiki'

# Bounds (inclusive) on the number of pageviews per user and day.
n_p_min = 1
n_p_max = 100

# Date range: day_start is included, day_end is excluded.
day_start = datetime.date(2020, 4, 1)
day_end = datetime.date(2020, 4, 8)
n_days = (day_end - day_start).days

# Single-use generator yielding one datetime.date per day in the range.
date_array = (
    day_start + datetime.timedelta(days=offset) for offset in range(n_days)
)

# Window grouping all rows of one user_hash within one (year, month, day).
partition_cols = ['user_hash', 'year', 'month', 'day']
w = Window.partitionBy(*[F.col(name) for name in partition_cols])

# Draw the salt from the OS entropy source: the default `random` generator is
# predictable and not meant for security-sensitive values such as this salt.
secure_rng = random.SystemRandom()
salt_alphabet = string.ascii_letters + string.digits

# For every day in date_array: build one row per user_hash ("session") from the
# webrequest logs of that day and write the result to a per-day parquet path.
for date_object in date_array:
    day_str = date_object.strftime("%Y-%m-%d")
    year = date_object.year
    month = date_object.month
    day = date_object.day
    print(day_str)

    ## we hash the client-ip and the user-agent aka 'fingerprinting'
    ## we add a salt (8-16 random alphanumeric characters); it is re-drawn on
    ## every iteration, so hashes from different days do not match
    salt = ''.join(
        secure_rng.choice(salt_alphabet)
        for _ in range(secure_rng.randint(8, 16))
    )
    user_hash = F.sha2(
        F.concat(F.col('client_ip'), F.lit('-'), F.col('user_agent'), F.lit(salt)),
        512
    )  ## only client and user
    # clear salt so not accidentally retained; the value is already embedded in
    # the column expression, and clearing it here (instead of after the query
    # is assembled) also covers the case where a later step raises
    salt = None
    # NOTE(review): concat() yields NULL if client_ip or user_agent is NULL, so
    # such rows would share a NULL user_hash -- confirm these fields are never
    # NULL in webrequest.

    t1 = time.time()
    df_agg = (
        df_webreq

        ## select time partition
        .where(F.col('year') == year)
        .where(F.col('month') == month)
        .where(F.col('day') == day)
#         .where( F.col('hour')==1 ) ## for testing reduce # partitions

        ## select wiki project: family "wikipedia" and the language code,
        ## i.e. `wiki` without its trailing 'wiki' ('enwiki' -> 'en')
        .where(F.col('normalized_host.project_family') == "wikipedia")
        .where(F.col('normalized_host.project') == wiki[:-4])

        ## hash of user-id as new column
        .withColumn('user_hash', user_hash)

        ## agent-type user to filter spiders
        ## https://meta.wikimedia.org/wiki/Research:Page_view/Tags#Spider
        .where(F.col('agent_type') == "user")

        ## user: desktop/mobile/mobile app; isaac filters != mobile app
        .where(F.col('access_method') != "mobile app")

        ## not clear why; present in all cases I saw before.
        .where(F.col('webrequest_source') == 'text')

        ## filter users who edited:
        ## first keep only pageviews and requests that look like edit attempts
        .where(
            (F.col('is_pageview') == 1) |
            (F.col('uri_query').contains('action=edit')) |
            (F.col('uri_query').contains('action=visualeditor')) |
            (F.col('uri_query').contains('&intestactions=edit&intestactionsdetail=full&uiprop=options'))
        )

        ##### mark edit attempts (is_pageview==0), flag every row of a user/day
        ##### that has at least one, and drop those users for that day
        .withColumn('edit_attempt', F.when(F.col('is_pageview') == 0, 1).otherwise(0))
        .withColumn('edit_attempt_session', F.max(F.col('edit_attempt')).over(w))
        .where(F.col('edit_attempt_session') == 0)

        ## only requests marked as pageviews
        .where(F.col('is_pageview') == 1)

        ## number of pageview requests per user and day between n_p_min and n_p_max
        ## (counted before the namespace filter below, so it covers all namespaces)
        .withColumn('n_p_by_user', F.sum(F.col('is_pageview').cast("long")).over(w))
        .where(F.col('n_p_by_user') >= n_p_min)
        .where(F.col('n_p_by_user') <= n_p_max)

        ## only main namespace
        .where(F.col('namespace_id') == 0)

        ## one output row per user: a few per-user attributes plus the list of
        ## pageviews; array_sort orders the structs by their first field (ts)
        .groupby('user_hash')
        .agg(
            F.first(F.col('access_method')).alias('access_method'),
            F.first(F.col('geocoded_data')).alias('geocoded_data'),
            F.first(F.col('n_p_by_user')).alias('session_length'),
            F.array_sort(
                F.collect_list(
                    F.struct(
                        F.col('ts'),
                        F.col('page_id'),
                        F.col('pageview_info.page_title').alias('page_title'),
                        F.concat(F.col('normalized_host.project'), F.lit('wiki')).alias('project'),
                    )
                )
            ).alias('session')
        )
    )

    # Writing triggers the computation; an existing output for the same
    # wiki/day is overwritten.
    filename_save = '/user/mgerlach/webrequest/sessions_filtered_%s_%s.parquet'%(wiki,day_str)
    df_agg.write.mode('overwrite').parquet(filename_save)
    t2 = time.time()
    print('done in [s]: %.2f'%(t2-t1))

2020-04-01
done in [s]: 758.05
2020-04-02
done in [s]: 693.12
2020-04-03
done in [s]: 680.85
2020-04-04
done in [s]: 717.71
2020-04-05
done in [s]: 927.17
2020-04-06
done in [s]: 1137.29
2020-04-07
done in [s]: 1076.73
