In [None]:
# Transaction CSV partitions: /home/labuser/Documents/Level3/Day2/transactions_partitions/*transactions.csv

In [None]:
from pyspark.sql import SparkSession

In [None]:
# Obtain the Spark session (application name "CheckPartitions").
spark = (
    SparkSession.builder
    .appName("CheckPartitions")
    .getOrCreate()
)

In [None]:
# Read every CSV file under the partition directory. The first row of each
# file is treated as the header and column types are inferred from the data.
transactions_df = spark.read.csv(
    "/home/labuser/Documents/Level3/Day2/transactions_partitions/",
    header=True,
    inferSchema=True,
)

In [None]:
# Check the number of partitions
num_partitions_before = transactions_df.rdd.getNumPartitions()
print(f"Number of partitions before optimization: {num_partitions_before}")

# Get partition sizes (rows per partition). Counting inside mapPartitions
# walks each partition's iterator once, instead of first building a Python
# list of all its rows the way glom() does; only one integer per partition
# is produced and collected.
partition_sizes_before = (
    transactions_df.rdd
    .mapPartitions(lambda rows: [sum(1 for _ in rows)])
    .collect()
)
print(f"Partition sizes before optimization: {partition_sizes_before}")

In [None]:
# Redistribute the transactions into a fixed number of partitions keyed on
# the "amount" column.
optimized_df = transactions_df.repartition(
    100,       # target number of partitions
    "amount",  # partitioning column
)

In [None]:
# Partition count of the repartitioned DataFrame.
num_partitions_after = optimized_df.rdd.getNumPartitions()

# Rows per partition. The count is taken inside mapPartitions so each
# partition's iterator is consumed once, rather than being turned into a full
# Python list of rows first (which is what glom() does).
partition_sizes_after = (
    optimized_df.rdd
    .mapPartitions(lambda rows: [sum(1 for _ in rows)])
    .collect()
)

print(f"Number of partitions after optimization: {num_partitions_after}")
print("Partition sizes after optimization:", partition_sizes_after)

In [None]:
# Bare expression: the notebook renders the SparkSession object held in
# `spark` as this cell's output.
spark

In [None]:
from pyspark import StorageLevel

In [None]:
# RDD: the integers 0..99 distributed over 5 slices, then labelled "rdd".
rdd = spark.sparkContext.parallelize(range(100), 5)
rdd.setName("rdd")

In [None]:
# Mark the RDD for in-memory storage, then run count() as the action whose
# result is printed.
rdd.persist(StorageLevel.MEMORY_ONLY)
rdd_element_count = rdd.count()
print("RDD Count:", rdd_element_count)

In [None]:
# DataFrame: three (count, name) rows built from an in-memory list.
data = [
    (1, "Manoj"),
    (2, "Mannu"),
    (2, "Ayushi"),
]
df = spark.createDataFrame(data, schema=["count", "name"])

In [None]:
# Mark the DataFrame for disk-only storage, then run count() as the action
# whose result is printed.
df.persist(StorageLevel.DISK_ONLY)
df_row_count = df.count()
print("DataFrame count:", df_row_count)

In [None]:
# Register the DataFrame as a global temporary view named "df"; it is then
# queried through the global_temp database.
df.createOrReplaceGlobalTempView("df")

# Sum the "count" column per name and display the aggregated rows.
result = spark.sql(
    "SELECT name, SUM(count) AS total_count FROM global_temp.df GROUP BY name"
)
result.show()

In [None]:
# Load the blacklisted-accounts CSV; the first row is the header and column
# types are inferred from the data.
blacklisted_df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/home/labuser/Documents/Level3/Day2/blacklisted_accounts.csv")
)


In [None]:
# Inner join of transactions_df with blacklisted_df on customer_id: only
# transactions whose customer_id also appears in the blacklist are kept.
fraud_df = transactions_df.join(
    blacklisted_df,
    on="customer_id",
    how="inner",
)
fraud_df.show()

In [None]:
from pyspark.sql.functions import broadcast

# Same inner join on customer_id, this time with an explicit broadcast hint
# on blacklisted_df.
# NOTE(review): the original remark "(<100MB)" presumably refers to the size
# of the blacklist table being broadcast; confirm.
fraud_df = transactions_df.join(
    broadcast(blacklisted_df),
    on="customer_id",
    how="inner",
)
fraud_df.show()

In [None]:
from pyspark.sql import SparkSession
import time
# Request a session named "WithSerialization" with Kryo set as spark.serializer.
# NOTE(review): getOrCreate() hands back an already-running session when one
# exists, and no cell above stops the "CheckPartitions" session, so the
# serializer setting here likely does not take effect. Confirm, and stop the
# earlier session first if a Kryo-backed session is really wanted.
spark = SparkSession.builder \
    .appName("WithSerialization") \
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
    .getOrCreate()


In [None]:

# Initialize Spark session
spark = SparkSession.builder.appName("WithoutSerialization").getOrCreate()

# Create a simple banking transactions dataset
data = [(1, "Alice", 1000.0), (2, "Bob", 2000.0), (3, "Charlie", 500.0)]
df = spark.createDataFrame(data, ["customer_id", "name", "balance"])

# Measure execution time. perf_counter() is a monotonic, high-resolution
# clock meant for timing intervals; time.time() is wall-clock time and can
# jump if the system clock is adjusted.
start_time = time.perf_counter()

# Perform a simple transformation
df = df.withColumnRenamed("balance", "account_balance")
df.show()

end_time = time.perf_counter()
print("Execution Time (Without Serialization):", round(end_time - start_time, 4), "seconds")

# The session is intentionally left running here; it is stopped in the next cell.


In [None]:
# Stop the Spark session held in `spark`; the cells that follow build their
# own session with getOrCreate().
spark.stop()

In [1]:
from pyspark.sql import SparkSession
import time

# Initialize Spark session
spark = SparkSession.builder.appName("WithoutSerialization").getOrCreate()

# try/finally guarantees the session is stopped even if a step below raises.
# A session left running would simply be returned by the next getOrCreate()
# call instead of a freshly configured one.
try:
    # Create a simple banking transactions dataset
    data = [(1, "Alice", 1000.0), (2, "Bob", 2000.0), (3, "Charlie", 500.0)]
    df = spark.createDataFrame(data, ["customer_id", "name", "balance"])

    # Measure execution time with the monotonic, high-resolution interval
    # timer (time.time() is wall-clock and can jump with clock adjustments).
    start_time = time.perf_counter()

    # Perform a simple transformation
    df = df.withColumnRenamed("balance", "account_balance")
    df.show()

    end_time = time.perf_counter()
    print("Execution Time (Without Serialization):", round(end_time - start_time, 4), "seconds")
finally:
    # Stop Spark session
    spark.stop()


+-----------+-------+---------------+
|customer_id|   name|account_balance|
+-----------+-------+---------------+
|          1|  Alice|         1000.0|
|          2|    Bob|         2000.0|
|          3|Charlie|          500.0|
+-----------+-------+---------------+

Execution Time (Without Serialization): 3.2292 seconds


In [2]:
from pyspark.sql import SparkSession
import time

# Initialize Spark session with Kryo Serialization
spark = SparkSession.builder \
    .appName("WithSerialization") \
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
    .getOrCreate()

# try/finally guarantees the session is stopped even if a step below raises,
# so a later getOrCreate() does not silently reuse this Kryo-configured
# session.
try:
    # Create a simple banking transactions dataset
    data = [(1, "Alice", 1000.0), (2, "Bob", 2000.0), (3, "Charlie", 500.0)]
    df = spark.createDataFrame(data, ["customer_id", "name", "balance"])

    # Measure execution time with the monotonic, high-resolution interval
    # timer (time.time() is wall-clock and can jump with clock adjustments).
    start_time = time.perf_counter()

    # Perform the same transformation
    df = df.withColumnRenamed("balance", "account_balance")
    df.show()

    end_time = time.perf_counter()
    print("Execution Time (With Serialization):", round(end_time - start_time, 4), "seconds")
finally:
    # Stop Spark session
    spark.stop()


+-----------+-------+---------------+
|customer_id|   name|account_balance|
+-----------+-------+---------------+
|          1|  Alice|         1000.0|
|          2|    Bob|         2000.0|
|          3|Charlie|          500.0|
+-----------+-------+---------------+

Execution Time (With Serialization): 1.1459 seconds
