In [1]:
from datetime import datetime

from pyspark.sql import SQLContext
from pyspark.sql.functions import lit, col, udf
from pyspark.sql import types

In [2]:
spark = SparkSession.builder.appName('Funnel Analysis').config('spark.executor.cores','4').getOrCreate()

sqlCtx = SQLContext(spark)

### Load tables

In [53]:
users = spark.read.parquet("spark_jobs/datalake/users-2020-10-16/")
step4_fact = spark.read.parquet("spark_jobs/warehouse/step4_fact_table-2020-10-16/")
step5_fact = spark.read.parquet("spark_jobs/warehouse/step5_fact_table-2020-10-16/")
step6_fact = spark.read.parquet("spark_jobs/warehouse/step6_fact_table-2020-10-16/")

users.createOrReplaceTempView("users")
step4_fact.createOrReplaceTempView("step4_fact")
step5_fact.createOrReplaceTempView("step5_fact")
step6_fact.createOrReplaceTempView("step6_fact")

In [47]:
users.show(5)

+-------+----------+-------+------+--------+
|user_id|      date| device|   sex| channel|
+-------+----------+-------+------+--------+
| 450007|2015-02-28|Desktop|Female|  search|
| 756838|2015-01-13|Desktop|  Male|paid_ads|
| 568983|2015-04-09|Desktop|  Male|paid_ads|
| 190794|2015-02-18|Desktop|Female|  search|
| 537909|2015-01-15|Desktop|  Male|paid_ads|
+-------+----------+-------+------+--------+
only showing top 5 rows



In [48]:
step5_fact.show(5)

+-------+-------+
|user_id|visited|
+-------+-------+
| 838832|      1|
| 231324|      1|
|  13830|      1|
| 838723|      1|
| 205344|      1|
+-------+-------+
only showing top 5 rows



In [49]:
step6_fact.show(5)

+-------+-------+
|user_id|visited|
+-------+-------+
|  13830|      1|
| 559850|      1|
| 638114|      1|
| 581956|      1|
| 337704|      1|
+-------+-------+
only showing top 5 rows



## Page visit trend for step 6 by sex, channel and month

In [25]:
spark.sql(
    """
    SELECT SUM(visited) AS total_visits, sex, channel, MONTH(users.date) AS month
    FROM step6_fact
    INNER JOIN users
    ON step6_fact.user_id = users.user_id
    GROUP BY month, sex, channel
    ORDER BY month
    """
).show()

+------------+------+--------+-----+
|total_visits|   sex| channel|month|
+------------+------+--------+-----+
|          71|Female|paid_ads|    1|
|          50|  Male|paid_ads|    1|
|          30|  Male|  search|    1|
|          38|Female|  search|    1|
|          45|  Male|paid_ads|    2|
|          34|Female|  search|    2|
|          52|Female|paid_ads|    2|
|          42|  Male|  search|    2|
|           6|  Male|  search|    3|
|          12|  Male|paid_ads|    3|
|          15|Female|  search|    3|
|          11|Female|paid_ads|    3|
|          17|Female|paid_ads|    4|
|           3|Female|  search|    4|
|          13|  Male|  search|    4|
|          13|  Male|paid_ads|    4|
+------------+------+--------+-----+



## Percentage drop-off from step 5 to step 6 by sex and channel

In [44]:
spark.sql(
    """
    SELECT SUM(step5_fact.visited) AS total_step5_visits, SUM(step6_fact.visited) AS total_step6_visits,
    users.sex, users.channel,
    (100 - (SUM(step6_fact.visited) / SUM(step5_fact.visited)) * 100) AS perc_dropoff
    FROM step5_fact
    LEFT JOIN step6_fact
    ON step5_fact.user_id = step6_fact.user_id
    INNER JOIN users
    ON step5_fact.user_id = users.user_id
    GROUP BY users.sex, users.channel
    """
).show()

+------------------+------------------+------+--------+-----------------+
|total_step5_visits|total_step6_visits|   sex| channel|     perc_dropoff|
+------------------+------------------+------+--------+-----------------+
|              1170|                91|  Male|  search|92.22222222222223|
|              1251|                90|Female|  search|92.80575539568345|
|              1849|               151|Female|paid_ads| 91.8334234721471|
|              1760|               120|  Male|paid_ads|93.18181818181819|
+------------------+------------------+------+--------+-----------------+



## Percentage drop-off from step 4 to step 5 by sex, channel and month

In [67]:
spark.sql(
    """
    SELECT SUM(step4_fact.visited) AS total_step4_visits, SUM(step5_fact.visited) AS total_step5_visits,
    users.sex, users.channel, MONTH(users.date) AS month,
    (100 - (SUM(step5_fact.visited) / SUM(step4_fact.visited)) * 100) AS perc_dropoff
    FROM step4_fact
    LEFT JOIN step5_fact
    ON step4_fact.user_id = step5_fact.user_id
    INNER JOIN users
    ON step4_fact.user_id = users.user_id
    GROUP BY users.sex, users.channel, month
    ORDER BY month ASC
    """
).show()

+------------------+------------------+------+--------+-----+-----------------+
|total_step4_visits|total_step5_visits|   sex| channel|month|     perc_dropoff|
+------------------+------------------+------+--------+-----+-----------------+
|              4169|               736|Female|paid_ads|    1|82.34588630366994|
|              2744|               507|Female|  search|    1|81.52332361516035|
|              4013|               697|  Male|paid_ads|    1|82.63144779466734|
|              2628|               450|  Male|  search|    1|82.87671232876713|
|              4132|               745|Female|paid_ads|    2|81.96999031945789|
|              2688|               490|Female|  search|    2|81.77083333333334|
|              2771|               494|  Male|  search|    2|82.17250090220136|
|              4096|               683|  Male|paid_ads|    2|    83.3251953125|
|              1695|               113|  Male|  search|    3|93.33333333333333|
|              2728|               199|F