In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import rand
import time

In [5]:
# Criar sessão Spark
spark = SparkSession.builder \
    .appName("BigData Scalability Test") \
    .config("spark.executor.memory", "4g") \
    .config("spark.executor.cores", "4") \
    .getOrCreate()

In [6]:
# Generate a large DataFrame with 100 million records
num_rows = 100_000_000
num_partitions = 10  # Adjust this value to test scalability

print("🚀 Starting data generation...")
start_time = time.time()
df = spark.range(num_rows).withColumn("random_value", rand())  # Add a random column
df = df.repartition(num_partitions)  # Control parallelism by setting the number of partitions
df.cache()  # Cache the DataFrame in memory to optimize performance
print(f"✅ Data generated in {time.time() - start_time:.2f} seconds.")

# Perform a simple count operation
print("📊 Running count operation...")
start_time = time.time()
count = df.count()
print(f"✅ Count completed in {time.time() - start_time:.2f} seconds. Total: {count}")

# Perform an aggregation operation
print("📈 Running aggregation...")
start_time = time.time()
agg_result = df.groupBy().sum("random_value").collect()  # Summing all random values
print(f"✅ Aggregation completed in {time.time() - start_time:.2f} seconds.")

# Perform a filtering operation
print("🔍 Running filter operation...")
start_time = time.time()
filtered_df = df.filter(df.random_value > 0.5)  # Filter rows where random_value > 0.5
filtered_count = filtered_df.count()
print(f"✅ Filtering completed in {time.time() - start_time:.2f} seconds. Total filtered: {filtered_count}")

# Stop the Spark session
spark.stop()
print("🏁 Scalability test completed!")


🚀 Starting data generation...
✅ Data generated in 6.63 seconds.
📊 Running count operation...
✅ Count completed in 230.37 seconds. Total: 100000000
📈 Running aggregation...
✅ Aggregation completed in 5.98 seconds.
🔍 Running filter operation...
✅ Filtering completed in 8.55 seconds. Total filtered: 49990854
🏁 Scalability test completed!
