# Load Modules

In [1]:
import os
os.chdir("../")

In [2]:
from pyspark.sql import SparkSession
import pandas as pd
import json
from functools import partial

In [3]:
from pyspark.sql.functions import udf, lit, col

In [4]:
# spark_session = SparkSession.builder.appName("Test").getOrCreate()
# spark_session = (
#     SparkSession.builder.appName("Python Spark SQL Hive integration example")
#     .config("spark.executor.cores", 8)
#     .config("spark.task.cpus", 8)
#     .config("spark.cores.max", 24)
#     .config("spark.driver.extraClassPath", "../jars/postgresql-42.3.3.jar")
#     .getOrCreate()
# )
# spark_session = (
#     SparkSession.builder.appName("Python Spark SQL Hive integration example")
#     .config("spark.executor.cores", 16)
#     .config("spark.task.cpus", 2)
#     .config("spark.cores.max", 24)
#     .config("spark.executor.memory", "8g")
#     .config("spark.executor.instance", 16)
#     .config("spark.driver.memory", "8g")
#     # .config("spark.driver.extraClassPath", "../jars/postgresql-42.3.3.jar")
#     .getOrCreate()
# )
spark_session = (
    SparkSession.builder.appName("Python Spark SQL Hive integration example")
    .config("spark.executor.memory", "8g")
    .config("spark.executor.instance", 4)
    .config("spark.driver.memory", "8g")
    .getOrCreate()
)


In [5]:
import tempfile
import pathlib

In [6]:
if not os.path.exists("./tmp"):
    os.makedirs("./tmp")

tempfile.tempdir = "./tmp/"
dump_dir = tempfile.TemporaryDirectory()
    


In [7]:
# dump_dir = pathlib.Path("./tmp")

### Make Utils

In [8]:
from utils.json_utils import parse_json
from schemas.users import USERS_FIELD_DATA_SCHEMA
from schemas.transactions import TRANSACTIONS_FIELD_DATA_SCHEMA

In [9]:
from utils.health import measure_time

In [10]:
USERS_FIELD_DATA_SCHEMA

{'id': {'path': ['id'], 'dtype': 'string'},
 'user_last_activity': {'path': ['user_last_activity', '_seconds'],
  'dtype': 'int'},
 'user_signup_date': {'path': ['user_signup_date', '_seconds'],
  'dtype': 'int'},
 'rating': {'path': ['rating', 'stars'], 'dtype': 'string'}}

In [11]:
# def expand_fields(x, schema):
#     data = json.loads(x["data"])
#     data = parse_json(data, schema)
#     for i in schema:
#         x[i] = data[i]
#     return x

In [12]:
def jsonify_field(x, schema=None):
    data = json.loads(x)
    data = parse_json(data, schema)
    return data

In [13]:
users_data_field = partial(jsonify_field, schema=USERS_FIELD_DATA_SCHEMA)
transactions_data_field = partial(jsonify_field, schema=TRANSACTIONS_FIELD_DATA_SCHEMA)

# Extract

In [14]:
analytics = spark_session.read.csv("datasets/analytics.csv", quote="\"", escape="\"", header=True)
analytics.createOrReplaceTempView("analytics")

In [15]:
transactions = spark_session.read.csv("datasets/transactions.csv", quote="\"", escape="\"", header=True)
transactions.createOrReplaceTempView("transactions")

In [16]:
users = spark_session.read.csv("datasets/users.csv", quote="\"", escape="\"", header=True)
users.createOrReplaceTempView("users")

In [18]:
users.collect()[0]

Row(_c0='0', timestamp='2021-10-01 01:21:26.953627+00:00', event_id='cf0f4246-66b7-4fdd-b279-016ae5091545-0', document_name='projects/hisaab-7e8b4/databases/(default)/documents/_users/DrcOaVjp8FWTic6okQAYx2quxrU2', operation='UPDATE', document_id='DrcOaVjp8FWTic6okQAYx2quxrU2', data='{"business_card": {"alternate_mobile_no": "03049445761", "business_name": "ALI HAIDER / FATHER\'S Home Business ", "coordinates": {"lat": 31.1108119, "lng": 72.80649559999999}, "location": "Chak # 45 GB,  Tehsil Samundri District FSD", "mobile_no": "03246820975", "name": "Ishtiaq Mehmood "}, "business_name": "ALI HAIDER / FATHER\'S Home Business ", "businesss_type": "General Store", "cashbook_current_balance": -80, "contextID": "d035dbe3-cee3-4e59-b964-4899b7d63a4d", "current_location": {"latitude": 31.1062476, "longitude": 72.7949253}, "fcm_token": "eNgv3-itRlm4FZZ901FtmY:APA91bG_HZObrNXbJJkn7ILDLlkLctTxW4eUHFxOILL42BX9Xb_uWxk5RXdHeSCNg7RXhf3drLAeVDIH4lqfOOwmUQz_8oprvPssvB0KiPXGPw7yDGww6hyOGJpQDyZFWimFc3l

In [46]:
users = users.withColumn("data", udf(users_data_field)(col("data")))
transactions = transactions.withColumn("data", udf(transactions_data_field)(col("data")))

In [48]:
transactions.show()

+---+--------------------+--------------------+--------------------+---------+--------------------+--------------------+
|_c0|           timestamp|            event_id|       document_name|operation|         document_id|                data|
+---+--------------------+--------------------+--------------------+---------+--------------------+--------------------+
|  0|2020-11-25 03:26:...|8f11bd01-d465-493...|projects/hisaab-7...|   CREATE|QgNGgxEvYNx6kmTdMM9o|{note=17.11, amou...|
|  1|2020-11-25 04:15:...|ad7c8ad8-e48c-443...|projects/hisaab-7...|   CREATE|HCJ1trNIpJmBGDiC5L3U|{note=Sabon, amou...|
|  2|2021-12-21 19:24:...|8a0b3966-b31b-434...|projects/hisaab-7...|   CREATE|a3878e83-ddea-45e...|{note=Jazz Monthl...|
|  3|2021-12-21 19:37:...|d84c9101-0bcf-49a...|projects/hisaab-7...|   CREATE|29101f24-6a35-42c...|{note=, amount=40...|
|  4|2020-11-24 13:29:...|93a5ae97-85b1-4aa...|projects/hisaab-7...|   CREATE|cCgOCm4wT5FxfpWblRhh|{note=Dr sent, am...|
|  5|2020-11-24 14:06:...|9c1a5d

In [64]:
spark_session.sql("select * from users join transactions on users.event_id=transactions.event_id").show()

+---+---------+--------+-------------+---------+-----------+----+---+---------+--------+-------------+---------+-----------+----+
|_c0|timestamp|event_id|document_name|operation|document_id|data|_c0|timestamp|event_id|document_name|operation|document_id|data|
+---+---------+--------+-------------+---------+-----------+----+---+---------+--------+-------------+---------+-----------+----+
+---+---------+--------+-------------+---------+-----------+----+---+---------+--------+-------------+---------+-----------+----+



In [57]:
users.filter(users.document_id.contains("QgNGgxEvYNx6kmTdMM9o")).show()

+---+---------+--------+-------------+---------+-----------+----+
|_c0|timestamp|event_id|document_name|operation|document_id|data|
+---+---------+--------+-------------+---------+-----------+----+
+---+---------+--------+-------------+---------+-----------+----+



In [73]:
users.show(3)

+---+--------------------+--------------------+--------------------+---------+--------------------+--------------------+
|_c0|           timestamp|            event_id|       document_name|operation|         document_id|                data|
+---+--------------------+--------------------+--------------------+---------+--------------------+--------------------+
|  0|2021-10-01 01:21:...|cf0f4246-66b7-4fd...|projects/hisaab-7...|   UPDATE|DrcOaVjp8FWTic6ok...|{rating=4, user_l...|
|  1|2021-08-13 15:21:...|2ccb8df6-fe72-4bc...|projects/hisaab-7...|   UPDATE|DrcOaVjp8FWTic6ok...|{rating=4, user_l...|
|  2|2021-07-03 15:38:...|4c4cdb6e-aee8-46a...|projects/hisaab-7...|   UPDATE|DrcOaVjp8FWTic6ok...|{rating=4, user_l...|
+---+--------------------+--------------------+--------------------+---------+--------------------+--------------------+
only showing top 3 rows



In [81]:
users_schema = {"_c0": str, "timestamp": str, "event_id": str, "document_name": str, "operation": str, "document_id": str, "data": {"id": str, "user_la"}}

In [77]:
spark_session.createDataFrame(users.toPandas(), schema=users_schema).show()

+---+--------------------+--------------------+--------------------+---------+--------------------+--------------------+
|_c0|           timestamp|            event_id|       document_name|operation|         document_id|                data|
+---+--------------------+--------------------+--------------------+---------+--------------------+--------------------+
|  0|2021-10-01 01:21:...|cf0f4246-66b7-4fd...|projects/hisaab-7...|   UPDATE|DrcOaVjp8FWTic6ok...|{rating=4, user_l...|
|  1|2021-08-13 15:21:...|2ccb8df6-fe72-4bc...|projects/hisaab-7...|   UPDATE|DrcOaVjp8FWTic6ok...|{rating=4, user_l...|
|  2|2021-07-03 15:38:...|4c4cdb6e-aee8-46a...|projects/hisaab-7...|   UPDATE|DrcOaVjp8FWTic6ok...|{rating=4, user_l...|
|  3|2020-12-13 06:45:...|79ed83bd-e2f6-412...|projects/hisaab-7...|   UPDATE|DrcOaVjp8FWTic6ok...|{rating=null, use...|
|  4|2021-08-11 17:38:...|c53137da-9c78-483...|projects/hisaab-7...|   UPDATE|DrcOaVjp8FWTic6ok...|{rating=4, user_l...|
|  5|2020-11-30 15:42:...|bfd039

In [65]:
analytics.show()

+---+----------+--------------------+--------------------+--------------------+--------------------+-------------------+----------+---------------+----------+-----------+
|_c0|event_date|     event_timestamp|          event_name|             user_id|      user_pseudo_id|       device_model|android_os|device_language|city_geoIp|app_version|
+---+----------+--------------------+--------------------+--------------------+--------------------+-------------------+----------+---------------+----------+-----------+
|  0|2021-09-01|2021-09-01 04:37:...|       session_start|229cb688-c853-461...|2f74ce381d6e3fa4e...|      mobile S1 Pro| Android 9|          en-us|   Karachi|     2.26.0|
|  1|2021-09-01|2021-09-01 12:52:...|   Click_GivePayment|3bddd824-6deb-42a...|dfa278c52ad12e690...|mobile Reno5 Pro 5G|Android 11|          en-au|   Karachi|     2.26.0|
|  2|2021-09-01|2021-08-31 19:29:...|Click_Recievepayment|3bddd824-6deb-42a...|dfa278c52ad12e690...|mobile Reno5 Pro 5G|Android 11|          en-a

### Apply transformation

In [19]:
users = users.apply(users_data_field, axis=1).drop(columns=["data"])

In [20]:
transactions = transactions.apply(transactions_data_field, axis=1).drop(columns=["data"])

In [21]:
users = users.to_spark()

In [None]:
users.collect()

# Transform

In [None]:
users.head()

In [None]:
transactions.head()

In [None]:
analytics.head()

In [None]:
all_user_ids = analytics.to_pandas().user_pseudo_id

In [None]:
"ba5e4473-d825-44d3-872b-78ff3061ad8d" in all_user_ids

### Merge

In [None]:
users = users.to_spark()

In [None]:
transactions = transactions.to_spark()

In [None]:
users.createTempView("users")

In [None]:
spark_session.sql("select event_id from users limit 10").collect()

In [None]:
users.select("event_id").collect()

## Tests