# Load Modules

In [1]:
import os
# Step one directory up so the relative paths used later in the notebook
# (e.g. "datasets/..." and "./tmp") resolve from the parent directory.
# NOTE(review): the target is relative to the current working directory, so
# re-running this cell in the same kernel moves up one more level each time.
os.chdir("../")

In [2]:
from pyspark.sql import SparkSession
import pandas as pd
import json
from functools import partial

In [3]:
from pyspark.sql.functions import udf, lit, col

In [4]:
# spark_session = SparkSession.builder.appName("Test").getOrCreate()
# spark_session = (
#     SparkSession.builder.appName("Python Spark SQL Hive integration example")
#     .config("spark.executor.cores", 8)
#     .config("spark.task.cpus", 8)
#     .config("spark.cores.max", 24)
#     .config("spark.driver.extraClassPath", "../jars/postgresql-42.3.3.jar")
#     .getOrCreate()
# )
# spark_session = (
#     SparkSession.builder.appName("Python Spark SQL Hive integration example")
#     .config("spark.executor.cores", 16)
#     .config("spark.task.cpus", 2)
#     .config("spark.cores.max", 24)
#     .config("spark.executor.memory", "8g")
#     .config("spark.executor.instance", 16)
#     .config("spark.driver.memory", "8g")
#     # .config("spark.driver.extraClassPath", "../jars/postgresql-42.3.3.jar")
#     .getOrCreate()
# )
# Build (or reuse) the Spark session shared by every cell below.
spark_session = (
    SparkSession.builder.appName("Python Spark SQL Hive integration example")
    .config("spark.executor.memory", "8g")
    # The property name is "spark.executor.instances" (plural). The singular
    # spelling is not a Spark setting, so the value was silently ignored.
    .config("spark.executor.instances", 4)
    .config("spark.driver.memory", "8g")
    .getOrCreate()
)


In [5]:
import tempfile
import pathlib

In [6]:
# Keep scratch output under ./tmp (relative to the working directory).
# exist_ok=True makes the cell safe to re-run and removes the gap between
# checking for the directory and creating it.
os.makedirs("./tmp", exist_ok=True)

# Point tempfile's default location at ./tmp and create a scratch directory there.
tempfile.tempdir = "./tmp/"
dump_dir = tempfile.TemporaryDirectory()
    


In [7]:
# dump_dir = pathlib.Path("./tmp")

### Make Utils

In [8]:
from utils.json_utils import parse_json
from schemas.users import USERS_FIELD_DATA_SCHEMA
from schemas.transactions import TRANSACTIONS_FIELD_DATA_SCHEMA

In [9]:
from utils.health import measure_time

In [21]:
# def expand_fields(x, schema):
#     data = json.loads(x["data"])
#     data = parse_json(data, schema)
#     for i in schema:
#         x[i] = data[i]
#     return x

In [30]:
def expand_fields(x, schema=None):
    """Decode a JSON document and pass it through ``parse_json``.

    Args:
        x: JSON text to decode, or ``None`` (for example a null cell handed
            to the function when it is used as a Spark UDF).
        schema: Forwarded unchanged as the second argument of ``parse_json``.

    Returns:
        The value returned by ``parse_json`` for the decoded document, or
        ``None`` when ``x`` is ``None``.
    """
    # A null input has nothing to decode; json.loads(None) would raise TypeError.
    if x is None:
        return None
    return parse_json(json.loads(x), schema)

In [31]:
# Bind each dataset's schema to expand_fields so the resulting callables only
# need the raw JSON string as their argument.
users_data_field = partial(expand_fields, schema=USERS_FIELD_DATA_SCHEMA)
transactions_data_field = partial(expand_fields, schema=TRANSACTIONS_FIELD_DATA_SCHEMA)

# Extract

In [12]:
# Load the analytics export; a double quote serves as both the quote and the
# escape character, and the first line supplies the column names.
analytics = spark_session.read.csv(
    "datasets/analytics.csv",
    header=True,
    quote="\"",
    escape="\"",
)

In [13]:
# Load the transactions export; a double quote serves as both the quote and the
# escape character, and the first line supplies the column names.
transactions = spark_session.read.csv(
    "datasets/transactions.csv",
    header=True,
    quote="\"",
    escape="\"",
)

In [14]:
# Load the users export; a double quote serves as both the quote and the
# escape character, and the first line supplies the column names.
users = spark_session.read.csv(
    "datasets/users.csv",
    header=True,
    quote="\"",
    escape="\"",
)

In [15]:
# Number of distinct document ids present in the users export.
distinct_document_ids = users.select("document_id").distinct()
distinct_document_ids.count()

50

In [16]:
# Preview ten document names. take(10) only fetches the rows that are shown,
# instead of collecting the whole column to the driver and slicing it there.
users.select("document_name").take(10)

[Row(document_name='projects/hisaab-7e8b4/databases/(default)/documents/_users/DrcOaVjp8FWTic6okQAYx2quxrU2'),
 Row(document_name='projects/hisaab-7e8b4/databases/(default)/documents/_users/DrcOaVjp8FWTic6okQAYx2quxrU2'),
 Row(document_name='projects/hisaab-7e8b4/databases/(default)/documents/_users/DrcOaVjp8FWTic6okQAYx2quxrU2'),
 Row(document_name='projects/hisaab-7e8b4/databases/(default)/documents/_users/DrcOaVjp8FWTic6okQAYx2quxrU2'),
 Row(document_name='projects/hisaab-7e8b4/databases/(default)/documents/_users/DrcOaVjp8FWTic6okQAYx2quxrU2'),
 Row(document_name='projects/hisaab-7e8b4/databases/(default)/documents/_users/DrcOaVjp8FWTic6okQAYx2quxrU2'),
 Row(document_name='projects/hisaab-7e8b4/databases/(default)/documents/_users/DrcOaVjp8FWTic6okQAYx2quxrU2'),
 Row(document_name='projects/hisaab-7e8b4/databases/(default)/documents/_users/DrcOaVjp8FWTic6okQAYx2quxrU2'),
 Row(document_name='projects/hisaab-7e8b4/databases/(default)/documents/_users/DrcOaVjp8FWTic6okQAYx2quxrU2'),
 

In [72]:
def increase_val(x):
    """Decode the JSON text ``x`` and return the resulting Python object.

    Args:
        x: A JSON document as ``str``, ``bytes`` or ``bytearray``.

    Returns:
        The decoded Python value (dict, list, str, number, bool or ``None``).
    """
    return json.loads(x)

In [32]:
# Parse the JSON held in "data" with the users parser and preview the result.
# No returnType is passed to udf(), so it falls back to its default string
# type; the "new" column therefore shows the parsed dict rendered as text.
parse_users_data = udf(users_data_field)
users.withColumn("new", parse_users_data(col("data"))).show()

+---+--------------------+--------------------+--------------------+---------+--------------------+--------------------+--------------------+
|_c0|           timestamp|            event_id|       document_name|operation|         document_id|                data|                 new|
+---+--------------------+--------------------+--------------------+---------+--------------------+--------------------+--------------------+
|  0|2021-10-01 01:21:...|cf0f4246-66b7-4fd...|projects/hisaab-7...|   UPDATE|DrcOaVjp8FWTic6ok...|{"business_card":...|{rating=4, user_l...|
|  1|2021-08-13 15:21:...|2ccb8df6-fe72-4bc...|projects/hisaab-7...|   UPDATE|DrcOaVjp8FWTic6ok...|{"business_card":...|{rating=4, user_l...|
|  2|2021-07-03 15:38:...|4c4cdb6e-aee8-46a...|projects/hisaab-7...|   UPDATE|DrcOaVjp8FWTic6ok...|{"business_card":...|{rating=4, user_l...|
|  3|2020-12-13 06:45:...|79ed83bd-e2f6-412...|projects/hisaab-7...|   UPDATE|DrcOaVjp8FWTic6ok...|{"business_name":...|{rating=null, use...|
|  4|2

In [33]:
# reduce(lambda x, y: x.withColumn(y, lit(.4)), [users, "a", "b", "c"]).show()

In [22]:
# users = users.to_pandas_on_spark()

# analytics = analytics.to_pandas_on_spark()

# transactions = transactions.to_pandas_on_spark()

### Apply transformation

In [19]:
# Expand the JSON "data" column row by row with the users parser, then drop
# the raw column.
# NOTE(review): apply(..., axis=1) and drop(columns=...) are pandas-style calls,
# while `users` is read with spark_session.read.csv and the
# to_pandas_on_spark() conversion is commented out; expand_fields also calls
# json.loads directly on its argument. Confirm this cell still matches the
# current helpers before relying on it.
users = users.apply(users_data_field, axis=1).drop(columns=["data"])

In [20]:
# Expand the JSON "data" column row by row with the transactions parser, then
# drop the raw column.
# NOTE(review): apply(..., axis=1) and drop(columns=...) are pandas-style calls,
# while `transactions` is read with spark_session.read.csv and the
# to_pandas_on_spark() conversion is commented out; expand_fields also calls
# json.loads directly on its argument. Confirm this cell still matches the
# current helpers before relying on it.
transactions = transactions.apply(transactions_data_field, axis=1).drop(columns=["data"])

In [21]:
# Rebind `users` to the result of to_spark().
# NOTE(review): to_spark() is a pandas-on-Spark method, so `users` is presumably
# expected to be a pandas-on-Spark frame at this point — confirm.
users = users.to_spark()

In [22]:
# Write the document ids as parquet inside the temporary dump directory.
# os.path.join supplies the path separator; plain string concatenation would
# produce a sibling path ("<tmpdir>users133") that lives outside dump_dir and
# is not removed when the temporary directory is cleaned up.
users.select("document_id").write.format("parquet").save(
    os.path.join(dump_dir.name, "users133")
)

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/home/mamun/miniconda3/envs/cbook/lib/python3.9/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/home/mamun/miniconda3/envs/cbook/lib/python3.9/site-packages/py4j/clientserver.py", line 475, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/home/mamun/miniconda3/envs/cbook/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [None]:
# Bring every row of `users` back to the driver and display the result.
users.collect()

# Transform

In [None]:
# Peek at the beginning of `users` as a quick sanity check.
users.head()

In [None]:
# Peek at the beginning of `transactions` as a quick sanity check.
transactions.head()

In [None]:
# Peek at the beginning of `analytics` as a quick sanity check.
analytics.head()

In [None]:
# Pull the analytics user ids into a local Series.
# NOTE(review): to_pandas() is the pandas-on-Spark spelling; confirm that
# `analytics` is such a frame here (a plain Spark DataFrame exposes toPandas()).
all_user_ids = analytics.to_pandas().user_pseudo_id

# `in` applied to a pandas Series tests the index labels, not the values, so
# check the values explicitly.
all_user_ids.isin(["ba5e4473-d825-44d3-872b-78ff3061ad8d"]).any()

### Merge

In [None]:
# Rebind `users` to the result of to_spark() before registering it as a view.
# NOTE(review): the same conversion is already applied to `users` in an earlier
# cell; running both in sequence would call to_spark() on the result of the
# first call — confirm which of the two cells is intended to stay.
users = users.to_spark()

In [None]:
# Rebind `transactions` to the result of to_spark().
# NOTE(review): to_spark() is a pandas-on-Spark method, so `transactions` is
# presumably expected to be a pandas-on-Spark frame at this point — confirm.
transactions = transactions.to_spark()

In [None]:
# Register `users` as the SQL view "users". createOrReplaceTempView keeps the
# cell re-runnable; createTempView raises once the view already exists.
users.createOrReplaceTempView("users")

In [None]:
# Query the "users" view through Spark SQL and fetch ten event ids.
first_event_ids = spark_session.sql("select event_id from users limit 10")
first_event_ids.collect()

In [None]:
# Fetch the event_id column for every row of `users` through the DataFrame API.
users.select("event_id").collect()

## Tests