In [1]:
# /home/labuser/Documents/Level2_Day2/

In [2]:
# Load the Lib
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr, monotonically_increasing_id

In [3]:
# Create the notebook's Spark session, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("OptimizedJoinsInBanking")
    .getOrCreate()
)

In [4]:
# Load the Datasets
# Both CSV files are read with the same reader options: `header` and
# `inferSchema` are enabled for accounts and transactions alike.

df_account = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/home/labuser/Documents/Level2_Day2/accounts.csv")
)
df_transactions = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/home/labuser/Documents/Level2_Day2/transactions.csv")
)

# Show Schema (accounts first, then transactions)
for loaded_frame in (df_account, df_transactions):
    loaded_frame.printSchema()

root
 |-- account_id: string (nullable = true)
 |-- account_type: string (nullable = true)
 |-- balance: double (nullable = true)
 |-- customer_name: string (nullable = true)

root
 |-- transaction_id: string (nullable = true)
 |-- account_id: string (nullable = true)
 |-- amount: double (nullable = true)
 |-- transaction_type: string (nullable = true)
 |-- timestamp: string (nullable = true)



In [6]:
from pyspark.sql.functions import broadcast

# Inner-join transactions to accounts on account_id, wrapping the accounts
# side in a broadcast() hint so it is the side that gets broadcast.
df_optimized_join = df_transactions.join(
    broadcast(df_account),
    on="account_id",
    how="inner",
)
df_optimized_join.show()

# Why?
# Broadcasting the small dataset (accounts) to every executor avoids shuffling
# the much larger transactions dataset, which speeds up the join.

+----------+--------------+-------+----------------+----------+------------+--------+-------------+
|account_id|transaction_id| amount|transaction_type| timestamp|account_type| balance|customer_name|
+----------+--------------+-------+----------------+----------+------------+--------+-------------+
|  ACC_2878|         TXN_1|4785.79|          Credit|2024-06-06|    Checking|18350.21|Customer_2878|
|  ACC_1290|         TXN_2|3225.22|           Debit|2024-04-26|    Business| 4384.69|Customer_1290|
|  ACC_5816|         TXN_3| 925.49|           Debit|2024-01-30|    Business|15402.89|Customer_5816|
|  ACC_8174|         TXN_4|3362.12|          Credit|2024-06-12|     Savings|10835.49|Customer_8174|
|  ACC_5299|         TXN_5|2087.93|           Debit|2024-05-06|    Business|31829.53|Customer_5299|
|   ACC_711|         TXN_6|1984.28|          Credit|2024-10-31|     Savings|10962.96| Customer_711|
|  ACC_7214|         TXN_7|2609.88|          Credit|2024-10-18|     Savings|29965.25|Customer_7214|


In [9]:
# Skewed Join - Optimization (Salting)
from pyspark.sql.functions import lit

# Number of salt buckets. The random salt range on the transactions side and
# the number of replicated account copies MUST agree: with an inner join, any
# transaction whose salt has no matching account replica is silently dropped.
# Keeping a single constant prevents the two from drifting apart.
NUM_SALTS = 5

# Transactions side: tag every row with a random salt in 0 .. NUM_SALTS-1.
df_transactions2 = df_transactions.withColumn(
    "salt", expr(f"floor(rand() * {NUM_SALTS})")
)

# Accounts side: one full copy of the accounts table per salt value.
df_account2 = df_account.withColumn("salt", lit(0))

df_account2_salted = df_account2
for salt_value in range(1, NUM_SALTS):
    df_account2_salted = df_account2_salted.union(
        df_account.withColumn("salt", lit(salt_value))
    )

# Join on (account_id, salt) so rows for one account are split across up to
# NUM_SALTS distinct join keys instead of a single one.
df_skewed_join = df_transactions2.join(df_account2_salted, ["account_id", "salt"], "inner")

df_skewed_join.show()

+----------+----+--------------+-------+----------------+----------+------------+--------+-------------+
|account_id|salt|transaction_id| amount|transaction_type| timestamp|account_type| balance|customer_name|
+----------+----+--------------+-------+----------------+----------+------------+--------+-------------+
|  ACC_2878|   0|         TXN_1|4785.79|          Credit|2024-06-06|    Checking|18350.21|Customer_2878|
|  ACC_1290|   1|         TXN_2|3225.22|           Debit|2024-04-26|    Business| 4384.69|Customer_1290|
|  ACC_5816|   2|         TXN_3| 925.49|           Debit|2024-01-30|    Business|15402.89|Customer_5816|
|  ACC_8174|   3|         TXN_4|3362.12|          Credit|2024-06-12|     Savings|10835.49|Customer_8174|
|  ACC_5299|   0|         TXN_5|2087.93|           Debit|2024-05-06|    Business|31829.53|Customer_5299|
|   ACC_711|   0|         TXN_6|1984.28|          Credit|2024-10-31|     Savings|10962.96| Customer_711|
|  ACC_7214|   2|         TXN_7|2609.88|          Credi

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr, floor, rand, lit, monotonically_increasing_id

# Initialize Spark Session
spark = SparkSession.builder.appName("SkewedJoinOptimization").getOrCreate()

# Number of salt buckets. Used both for the random salt range on the
# transactions side and for the number of replicated account copies, so the
# two can never disagree (a mismatch makes the inner join drop rows).
NUM_SALTS = 5

# 1️⃣ Sample Data - Banking Transactions (Skewed Data)
data_transactions = [
    (101, "John Doe", "Credit", 500),
    (102, "Jane Smith", "Debit", 200),
    (101, "John Doe", "Credit", 1000),  # Account 101 appears multiple times (skewed)
    (103, "Alice Brown", "Credit", 700),
    (101, "John Doe", "Debit", 300),  # Skewed Key
    (102, "Jane Smith", "Credit", 600),
]

data_accounts = [
    (101, "John Doe", "Savings"),
    (102, "Jane Smith", "Checking"),
    (103, "Alice Brown", "Savings"),
]

# 2️⃣ Creating DataFrames
columns_transactions = ["account_id", "customer_name", "transaction_type", "amount"]
df_transactions = spark.createDataFrame(data_transactions, columns_transactions)

columns_accounts = ["account_id", "customer_name", "account_type"]
df_accounts = spark.createDataFrame(data_accounts, columns_accounts)

# ✅ Adding Unique Transaction IDs using `monotonically_increasing_id`
df_transactions = df_transactions.withColumn("transaction_id", monotonically_increasing_id())

# ✅ Skewed Join Optimization (Salting)
# Add Salt Key to Transactions (random integer between 0 and NUM_SALTS - 1)
df_transactions = df_transactions.withColumn("salt", expr(f"floor(rand() * {NUM_SALTS})"))

# Replicate `df_accounts` NUM_SALTS times, each copy with a different salt value.
# The chain must START from a salted copy (salt = 0): unioning the unsalted
# 3-column `df_accounts` with 4-column salted copies fails, because union
# requires both sides to have the same number of columns, and it would also
# leave no replica for transactions that drew salt 0.
df_accounts_salted = df_accounts.withColumn("salt", lit(0))
for salt_value in range(1, NUM_SALTS):
    df_accounts_salted = df_accounts_salted.union(
        df_accounts.withColumn("salt", lit(salt_value))
    )

# ✅ Perform Skewed Join with Salting
# NOTE: both inputs carry a `customer_name` column and it is not a join key,
# so the result contains two `customer_name` columns.
df_skewed_join = df_transactions.join(df_accounts_salted, ["account_id", "salt"], "inner")

# ✅ Show Results
df_skewed_join.show()
