In [29]:
from datetime import datetime

from pyspark.sql import SQLContext
from pyspark.sql.functions import lit, col, udf
from pyspark.sql import types

In [100]:
sqlCtx = SQLContext(spark)

In [57]:
users = spark.read.parquet("spark_jobs/output/users-2020-10-16/")
step6 = spark.read.parquet("spark_jobs/output/step6-2020-10-16")

In [58]:
users.createOrReplaceTempView("users")
step6.createOrReplaceTempView("step6")

In [59]:
func =  udf(lambda x: datetime.strptime(x, '%Y-%m-%d'), types.DateType())

In [60]:
users = users.withColumn('date', func(col('date')))

In [61]:
users.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- date: date (nullable = true)
 |-- device: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- channel: string (nullable = true)



In [62]:
print(users.head())
print(step6.head())

Row(user_id=450007, date=datetime.date(2015, 2, 28), device='Desktop', sex='Female', channel='search')
Row(user_id=123100, page='step6')


In [63]:
step6_fact = spark.sql("""
SELECT users.user_id FROM
users, step6
WHERE users.user_id = step6.user_id
""")

In [64]:
step6_fact = step6_fact.withColumn("visited", lit(1))

In [65]:
step6_fact.createOrReplaceTempView("step6_fact")

### Visit trends by month

In [98]:
spark.sql("""
SELECT SUM(step6_fact.visited) AS visits, users.device, users.sex, users.channel, MONTH(users.date) AS month
FROM step6_fact
INNER JOIN users
ON step6_fact.user_id = users.user_id
AND channel = 'search'
GROUP BY users.device, users.sex, users.channel, month
ORDER BY visits DESC
""").show()

+------+-------+------+-------+-----+
|visits| device|   sex|channel|month|
+------+-------+------+-------+-----+
|    30| Mobile|  Male| search|    2|
|    26| Mobile|Female| search|    2|
|    24| Mobile|Female| search|    1|
|    19| Mobile|  Male| search|    1|
|    14|Desktop|Female| search|    1|
|    12|Desktop|  Male| search|    2|
|    11| Mobile|Female| search|    3|
|    11|Desktop|  Male| search|    1|
|     9|Desktop|  Male| search|    4|
|     8|Desktop|Female| search|    2|
|     4| Mobile|  Male| search|    3|
|     4|Desktop|Female| search|    3|
|     4| Mobile|  Male| search|    4|
|     2|Desktop|Female| search|    4|
|     2|Desktop|  Male| search|    3|
|     1| Mobile|Female| search|    4|
+------+-------+------+-------+-----+



In [103]:
step6_fact.write.format("jdbc").options(
        url="jdbc:sqlite:/Users/aleemr/powerhouse/interviews/jerry-coding-challenge/data/test.sqlite",
        driver="org.sqlite.JDBC",
        dbtable="step6_fact"
    ).save()