<h2> Imports & Configuration </h2>

In [1]:
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')
import time

In [2]:
from pyspark.sql.types import IntegerType
import pyspark.sql.functions as F
from pyspark.sql import SparkSession

In [3]:
import cml.data_v1 as cmldata

# Sample in-code customization of spark configurations
#from pyspark import SparkContext
#SparkContext.setSystemProperty('spark.executor.cores', '1')
#SparkContext.setSystemProperty('spark.executor.memory', '2g')

CONNECTION_NAME = "go01-aw-dl"
conn = cmldata.get_connection(CONNECTION_NAME)
spark = conn.get_spark_session()

sc = spark.sparkContext
sc.setLogLevel("ERROR")

Setting spark.hadoop.yarn.resourcemanager.principal to pauldefusco


Spark Application Id:spark-d127af18deb249469b673141f483edd8


In [4]:
# spark.conf.set("spark.sql.shuffle.partitions", "3")
spark.conf.set("spark.sql.adaptive.enabled", "false")

# Join Skews

In [5]:
STORAGE = "s3a://go01-demo/datalake/pdefusco"

#s3a://go01-demo/datalake/pdefusco/transactions/

transactions_file = "/transactions/*"
customer_file = "/pii/piiData.csv"

df_transactions = spark.read.parquet(STORAGE + transactions_file)
df_customers = spark.read.option("header",True).csv(STORAGE + customer_file)

                                                                                

In [6]:
print(df_transactions.count())
print(df_customers.count())

                                                                                

1100000




200000


                                                                                

In [7]:
df_transactions.printSchema()
df_transactions.show(5, False)

root
 |-- credit_card_number: string (nullable = true)
 |-- credit_card_provider: string (nullable = true)
 |-- transaction_type: string (nullable = true)
 |-- event_ts: timestamp (nullable = true)
 |-- longitude: float (nullable = true)
 |-- latitude: float (nullable = true)
 |-- transaction_currency: string (nullable = true)
 |-- transaction_amount: decimal(10,0) (nullable = true)

+------------------+--------------------+----------------+-------------------+---------+--------+--------------------+------------------+
|credit_card_number|credit_card_provider|transaction_type|event_ts           |longitude|latitude|transaction_currency|transaction_amount|
+------------------+--------------------+----------------+-------------------+---------+--------+--------------------+------------------+
|12345678901234    |JCB 16 digit        |purchase        |2023-10-18 23:03:00|-79.0    |43.3963 |EUR                 |23913             |
|12345678901234    |American Express    |purchase        |202

                                                                                

In [8]:
df_customers.printSchema()
df_customers.show(5, False)

root
 |-- name: string (nullable = true)
 |-- address: string (nullable = true)
 |-- email: string (nullable = true)
 |-- aba_routing: string (nullable = true)
 |-- bank_country: string (nullable = true)
 |-- account_no: string (nullable = true)
 |-- int_account_no: string (nullable = true)
 |-- swift11: string (nullable = true)
 |-- credit_card_number: string (nullable = true)

+----------------+-----------------+----------------------+-----------+------------+------------------+----------------------+-----------+------------------+
|name            |address          |email                 |aba_routing|bank_country|account_no        |int_account_no        |swift11    |credit_card_number|
+----------------+-----------------+----------------------+-----------+------------+------------------+----------------------+-----------+------------------+
|Alejandra Hall  |6596 Evans Points|null                  |null       |null        |null              |null                  |null       |null  

In [9]:
(
    df_transactions
    .groupBy("credit_card_number")
    .agg(F.countDistinct("credit_card_number").alias("ccn"))
    .orderBy(F.desc("ccn"))
    .show(5, False)
)



+------------------+---+
|credit_card_number|ccn|
+------------------+---+
|2045971357        |1  |
|2045971362        |1  |
|2045971531        |1  |
|2045971643        |1  |
|2045971765        |1  |
+------------------+---+
only showing top 5 rows



                                                                                

In [10]:
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)

In [11]:
df_txn_details = (
    df_transactions.join(
        df_customers,
        on="credit_card_number",
        how="inner"
    )
)

In [12]:
start_time = time.time()
df_txn_details.count()
print(f"time taken: {time.time() - start_time}")



time taken: 7.09859561920166


                                                                                

# Using AQE

In [13]:
spark.conf.set("spark.sql.adaptive.enabled", "true")
spark.conf.set("spark.sql.adaptive.skewedJoin.enabled", "true")

In [15]:
df_txn_details = (
    df_transactions.join(
        df_customers,
        on="credit_card_number",
        how="inner"
    )
)

In [16]:
start_time = time.time()
df_txn_details.count()
print(f"time taken: {time.time() - start_time}")

[Stage 18:>                                                         (0 + 1) / 1]

time taken: 7.173134803771973


                                                                                

# Using Broadcast Joins

In [17]:
# 10MB = 10485760 Bytes
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", 10485760)

In [18]:
df_txn_details = (
    df_transactions.join(
        F.broadcast(df_customers),
        on="credit_card_number",
        how="inner"
    )
)

In [19]:
start_time = time.time()
df_txn_details.count()
print(f"time taken: {time.time() - start_time}")



time taken: 6.47882866859436


                                                                                

In [18]:
# spark.stop()