# Querying the `pageview_hourly` table

This notebook runs an example query against the `wmf.pageview_hourly` table. Table documentation: https://wikitech.wikimedia.org/wiki/Analytics/Data_Lake/Traffic/Pageview_hourly

## Create spark session

In [1]:
import os, sys

import findspark
findspark.init('/usr/lib/spark2')
from pyspark.sql import SparkSession
from pyspark.sql import functions as F, types as T, Window
import wmfdata.spark as wmfspark

# Build the Spark session through wmfdata's helper.
# Extra Spark properties can be added to `spark_config` as key/value pairs;
# the dict is passed to the session, so anything added here takes effect.
# It is empty for now (presumably equivalent to omitting the argument — TODO confirm).
spark_config = {}
spark = wmfspark.get_session(
    app_name='Pyspark notebook | MGerlach (WMF)',
    type='regular',
    extra_settings=spark_config,
)
# Last expression of the cell: display the session object.
spark

You are using wmfdata v1.3.2, but v1.3.3 is available.

To update, run `pip install --upgrade git+https://github.com/wikimedia/wmfdata-python.git@release --ignore-installed`.

To see the changes, refer to https://github.com/wikimedia/wmfdata-python/blob/release/CHANGELOG.md
PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.


## Query

In [2]:
# Time window: a single hour (2020-10-05, hour 2).
is_target_hour = (
    (F.col("year") == 2020)
    & (F.col("month") == 10)
    & (F.col("day") == 5)
    & (F.col("hour") == 2)
)

# Keep user traffic only (drops bots and automated agents) and exclude the
# mobile app, leaving desktop and mobile web.
is_user_web_traffic = (
    (F.col("agent_type") == "user")
    & (F.col("access_method") != "mobile app")
)

# English Wikipedia, namespace 0, rows that carry a page id.
is_enwiki_article = (
    (F.col("project") == "en.wikipedia")
    & (F.col("namespace_id") == 0)
    & F.col("page_id").isNotNull()
)

# Restrict to a single country.
is_from_canada = F.col("country") == "Canada"

df = (
    spark.read.table("wmf.pageview_hourly")
    .where(is_target_hour & is_user_web_traffic & is_enwiki_article & is_from_canada)
    # NOTE(review): count() tallies matching rows per access method, not a sum
    # of views — confirm rows are what should be counted here.
    .groupBy("access_method")
    .count()
)
df.show(truncate=False)


+-------------+------+
|access_method|count |
+-------------+------+
|desktop      |278664|
|mobile web   |479680|
+-------------+------+

