<a href="https://colab.research.google.com/github/mahadi24t/data-analysis-fcc/blob/main/Copy_of_Data_Partitioning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pyspark




In [2]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import random
import string

In [3]:
# Initialize Spark session
spark = SparkSession.builder \
    .appName("Big Data Partitioning") \
    .config("spark.driver.memory", "4g") \
    .getOrCreate()

In [4]:
# Function to generate random string
def random_string(length=10):
    return ''.join(random.choices(string.ascii_lowercase, k=length))

In [5]:
# Register UDF for random string generation
spark.udf.register("random_string", random_string)


In [6]:
# Generate synthetic big data (approx. 1M rows)
num_rows = 1000000
data = spark.range(num_rows) \
    .withColumn("name", F.expr("random_string(10)")) \
    .withColumn("age", F.rand() * 100) \
    .withColumn("salary", F.rand() * 100000) \
    .withColumn("department", F.element_at(
        F.array([F.lit(x) for x in ["HR", "IT", "Sales", "Marketing"]]),
        (F.floor(F.rand() * 4) + 1).cast("int")
    ))

In [None]:
# Show schema and sample data
print("Dataset Schema:")
data.printSchema()
print("\nSample Data:")
data.show(5, truncate=False)

Dataset Schema:
root
 |-- id: long (nullable = false)
 |-- name: string (nullable = true)
 |-- age: double (nullable = false)
 |-- salary: double (nullable = false)
 |-- department: string (nullable = true)


Sample Data:
+---+----------+------------------+------------------+----------+
|id |name      |age               |salary            |department|
+---+----------+------------------+------------------+----------+
|0  |oqinrszkkj|82.66041044378221 |71677.26679725945 |Marketing |
|1  |oxixzxngvu|52.62784283245882 |53947.905888449066|Sales     |
|2  |dyrtloekku|80.90121145892998 |73944.16977455956 |IT        |
|3  |bewlwtydfq|98.17214954246441 |55153.88050834428 |Marketing |
|4  |klofwpuhqz|16.931541431663145|41805.691521503475|Marketing |
+---+----------+------------------+------------------+----------+
only showing top 5 rows



In [None]:
# Configure partitioning
num_partitions = 10

In [None]:
# 1. Repartition by number of partitions
print("\n1. Repartitioning by number of partitions")
repartitioned_data = data.repartition(num_partitions)
print(f"Number of partitions: {repartitioned_data.rdd.getNumPartitions()}")


1. Repartitioning by number of partitions
Number of partitions: 10


In [None]:
# Save repartitioned data
repartitioned_data.write \
    .mode("overwrite") \
    .parquet("/content/repartitioned_data")

In [None]:
# 2. Partition by column (department)
print("\n2. Partitioning by department")
partitioned_by_dept = data.repartition(num_partitions, "department")
print(f"Number of partitions: {partitioned_by_dept.rdd.getNumPartitions()}")


2. Partitioning by department
Number of partitions: 10


In [None]:
# Save partitioned data by department
partitioned_by_dept.write \
    .mode("overwrite") \
    .partitionBy("department") \
    .parquet("/content/partitioned_by_dept")

In [None]:
# 3. Coalesce to reduce partitions
print("\n3. Coalescing to fewer partitions")
coalesced_data = data.coalesce(5)
print(f"Number of partitions after coalesce: {coalesced_data.rdd.getNumPartitions()}")


3. Coalescing to fewer partitions
Number of partitions after coalesce: 2


In [None]:
# Save coalesced data
coalesced_data.write \
    .mode("overwrite") \
    .parquet("/content/coalesced_data")


In [None]:
# Read back and verify partitioned data
print("\nReading partitioned data by department")
read_partitioned = spark.read.parquet("/content/partitioned_by_dept")
read_partitioned.groupBy("department").count().show()



Reading partitioned data by department
+----------+------+
|department| count|
+----------+------+
|        HR|250214|
|        IT|250122|
|     Sales|249982|
| Marketing|249682|
+----------+------+

