In [None]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("BotCampus PySpark Practice") \
    .master("local[*]") \
    .getOrCreate()


In [None]:
data = [
    ("Anjali", "Bangalore", 24),
    ("Ravi", "Hyderabad", 28),
    ("Kavya", "Delhi", 22),
    ("Meena", "Chennai", 25),
    ("Arjun", "Mumbai", 30)
]

columns = ["name", "city", "age"]

df = spark.createDataFrame(data, schema=columns)
df.show()
df.printSchema()


+------+---------+---+
|  name|     city|age|
+------+---------+---+
|Anjali|Bangalore| 24|
|  Ravi|Hyderabad| 28|
| Kavya|    Delhi| 22|
| Meena|  Chennai| 25|
| Arjun|   Mumbai| 30|
+------+---------+---+

root
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- age: long (nullable = true)



In [None]:
rdd = df.rdd
print(type(rdd))
print(df.collect())



<class 'pyspark.rdd.RDD'>
[Row(name='Anjali', city='Bangalore', age=24), Row(name='Ravi', city='Hyderabad', age=28), Row(name='Kavya', city='Delhi', age=22), Row(name='Meena', city='Chennai', age=25), Row(name='Arjun', city='Mumbai', age=30)]


In [None]:
mapped_rdd = df.rdd.map(lambda row: f"{row.name} lives in {row.city} and is {row.age} years old")
for line in mapped_rdd.collect():
    print(line)

Anjali lives in Bangalore and is 24 years old
Ravi lives in Hyderabad and is 28 years old
Kavya lives in Delhi and is 22 years old
Meena lives in Chennai and is 25 years old
Arjun lives in Mumbai and is 30 years old


Module 2: RDDs & Transformations

Scenario: You received app feedback from users in free-text.

feedback = spark.sparkContext.parallelize([

"Ravi from Bangalore loved the delivery",

"Meena from Hyderabad had a late order",

"Ajay from Pune liked the service",

"Anjali from Delhi faced UI issues",

"Rohit from Mumbai gave positive feedback"
])

Tasks:

Split each line into words ( flatMap ).

Remove stop words ( from , the , etc.).

Count each word frequency using reduceByKey .

Find top 3 most frequent non-stop words.

In [None]:
feedback = spark.sparkContext.parallelize([
    "Ravi from Bangalore loved the delivery",
    "Meena from Hyderabad had a late order",
    "Ajay from Pune liked the service",
    "Anjali from Delhi faced UI issues",
    "Rohit from Mumbai gave positive feedback"
])


In [None]:
# Step 1: Tokenize
words = feedback.flatMap(lambda line: line.lower().split())
words.collect()

['ravi',
 'from',
 'bangalore',
 'loved',
 'the',
 'delivery',
 'meena',
 'from',
 'hyderabad',
 'had',
 'a',
 'late',
 'order',
 'ajay',
 'from',
 'pune',
 'liked',
 'the',
 'service',
 'anjali',
 'from',
 'delhi',
 'faced',
 'ui',
 'issues',
 'rohit',
 'from',
 'mumbai',
 'gave',
 'positive',
 'feedback']

In [None]:

# Step 2: Remove stop words
stop_words = {'from', 'the', 'a', 'had', 'and', 'is', 'in', 'of', 'to'}
filtered_words = words.filter(lambda word: word not in stop_words)
filtered_words.collect()

['ravi',
 'bangalore',
 'loved',
 'delivery',
 'meena',
 'hyderabad',
 'late',
 'order',
 'ajay',
 'pune',
 'liked',
 'service',
 'anjali',
 'delhi',
 'faced',
 'ui',
 'issues',
 'rohit',
 'mumbai',
 'gave',
 'positive',
 'feedback']

In [None]:

# Step 3: Word count
word_counts = filtered_words.map(lambda word: (word, 1)).reduceByKey(lambda x, y: x + y)
word_counts.collect()

[('loved', 1),
 ('liked', 1),
 ('service', 1),
 ('anjali', 1),
 ('faced', 1),
 ('issues', 1),
 ('rohit', 1),
 ('mumbai', 1),
 ('positive', 1),
 ('feedback', 1),
 ('ravi', 1),
 ('bangalore', 1),
 ('delivery', 1),
 ('meena', 1),
 ('hyderabad', 1),
 ('late', 1),
 ('order', 1),
 ('ajay', 1),
 ('pune', 1),
 ('delhi', 1),
 ('ui', 1),
 ('gave', 1)]

In [None]:

# Step 4: Top 3 most frequent words
top_3_words = word_counts.takeOrdered(3, key=lambda x: -x[1])

# Print result
for word, count in top_3_words:
    print(f"{word}: {count}")

loved: 1
liked: 1
service: 1


Tasks:
Join both DataFrames on name .

Create a new column: attendance_rate = days_present / 25 .

Grade students using when :

A: >90, B: 80–90, C: <80.

Filter students with good grades but poor attendance (<80%).

In [None]:
# Students Data
students = [
    ("Amit", "10-A", 89),
    ("Kavya", "10-B", 92),
    ("Anjali", "10-A", 78),
    ("Rohit", "10-B", 85),
    ("Sneha", "10-C", 80)
]
columns = ["name", "section", "marks"]

# Attendance Data
attendance = [
    ("Amit", 24),
    ("Kavya", 22),
    ("Anjali", 20),
    ("Rohit", 25),
    ("Sneha", 19)
]
columns2 = ["name", "days_present"]


In [None]:
students_df = spark.createDataFrame(students, columns)
attendance_df = spark.createDataFrame(attendance, columns2)


In [None]:
joined_df = students_df.join(attendance_df, on="name")
joined_df.show()


+------+-------+-----+------------+
|  name|section|marks|days_present|
+------+-------+-----+------------+
|  Amit|   10-A|   89|          24|
|Anjali|   10-A|   78|          20|
| Kavya|   10-B|   92|          22|
| Rohit|   10-B|   85|          25|
| Sneha|   10-C|   80|          19|
+------+-------+-----+------------+



In [None]:
from pyspark.sql.functions import col

joined_df = joined_df.withColumn("attendance_rate", col("days_present") / 25)
joined_df.show()

+------+-------+-----+------------+---------------+
|  name|section|marks|days_present|attendance_rate|
+------+-------+-----+------------+---------------+
|  Amit|   10-A|   89|          24|           0.96|
|Anjali|   10-A|   78|          20|            0.8|
| Kavya|   10-B|   92|          22|           0.88|
| Rohit|   10-B|   85|          25|            1.0|
| Sneha|   10-C|   80|          19|           0.76|
+------+-------+-----+------------+---------------+



In [None]:
from pyspark.sql.functions import round, when, col

graded_df = joined_df.withColumn("grade", when(col("marks") > 90, "A")
                                 .when((col("marks") <= 90) & (col("marks") >= 80), "B")
                                 .otherwise("C"))

graded_df = graded_df.withColumn("attendance_rate", round(col("attendance_rate") * 100, 2))  # in %
graded_df.show()

+------+-------+-----+------------+---------------+-----+
|  name|section|marks|days_present|attendance_rate|grade|
+------+-------+-----+------------+---------------+-----+
|  Amit|   10-A|   89|          24|           96.0|    B|
|Anjali|   10-A|   78|          20|           80.0|    C|
| Kavya|   10-B|   92|          22|           88.0|    A|
| Rohit|   10-B|   85|          25|          100.0|    B|
| Sneha|   10-C|   80|          19|           76.0|    B|
+------+-------+-----+------------+---------------+-----+



In [None]:
filtered_df = graded_df.filter(
    (col("grade").isin("A", "B")) & (col("attendance_rate") < 80)
)
filtered_df.show()


+-----+-------+-----+------------+---------------+-----+
| name|section|marks|days_present|attendance_rate|grade|
+-----+-------+-----+------------+---------------+-----+
|Sneha|   10-C|   80|          19|           76.0|    B|
+-----+-------+-----+------------+---------------+-----+



Module 4: Ingest CSV & JSON, Save to Parquet
Tasks:
1. Ingest CSV:
emp_id,name,dept,city,salary
101,Anil,IT,Bangalore,80000
102,Kiran,HR,Mumbai,65000
103,Deepa,Finance,Chennai,72000
2. Ingest JSON:
{
"id": 201,
"name": "Nandini",
"contact": {
"email": "nandi@example.com",
"city": "Hyderabad"
},
"skills": ["Python", "Spark", "SQL"]
}
Tasks:
Read both formats into DataFrames.

Flatten nested JSON using select , col , alias , explode .
Save both as Parquet files partitioned by city.

In [None]:
csv_data = """emp_id,name,dept,city,salary
101,Anil,IT,Bangalore,80000
102,Kiran,HR,Mumbai,65000
103,Deepa,Finance,Chennai,72000
"""

with open("employees.csv", "w") as f:
    f.write(csv_data)


In [None]:
json_data = '''
{
  "id": 201,
  "name": "Nandini",
  "contact": {
    "email": "nandi@example.com",
    "city": "Hyderabad"
  },
  "skills": ["Python", "Spark", "SQL"]
}
'''

with open("employee.json", "w") as f:
    f.write(json_data)


In [None]:
emp_df = spark.read.csv("employees.csv", header=True, inferSchema=True)
emp_df.show()


+------+-----+-------+---------+------+
|emp_id| name|   dept|     city|salary|
+------+-----+-------+---------+------+
|   101| Anil|     IT|Bangalore| 80000|
|   102|Kiran|     HR|   Mumbai| 65000|
|   103|Deepa|Finance|  Chennai| 72000|
+------+-----+-------+---------+------+



In [None]:
json_df = spark.read.json("employee.json", multiLine=True)
json_df.printSchema()
json_df.show(truncate=False)


root
 |-- contact: struct (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- email: string (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- skills: array (nullable = true)
 |    |-- element: string (containsNull = true)

+------------------------------+---+-------+--------------------+
|contact                       |id |name   |skills              |
+------------------------------+---+-------+--------------------+
|{Hyderabad, nandi@example.com}|201|Nandini|[Python, Spark, SQL]|
+------------------------------+---+-------+--------------------+



In [None]:
from pyspark.sql.functions import col, explode
flat_json_df = json_df.select(
    col("id"),
    col("name"),
    col("contact.email").alias("email"),
    col("contact.city").alias("city"),
    explode(col("skills")).alias("skill")
)
flat_json_df.show()


+---+-------+-----------------+---------+------+
| id|   name|            email|     city| skill|
+---+-------+-----------------+---------+------+
|201|Nandini|nandi@example.com|Hyderabad|Python|
|201|Nandini|nandi@example.com|Hyderabad| Spark|
|201|Nandini|nandi@example.com|Hyderabad|   SQL|
+---+-------+-----------------+---------+------+



In [None]:
emp_df.write.mode("overwrite").partitionBy("city").parquet("parquet_output/employees")
emp_df.show()

+------+-----+-------+---------+------+
|emp_id| name|   dept|     city|salary|
+------+-----+-------+---------+------+
|   101| Anil|     IT|Bangalore| 80000|
|   102|Kiran|     HR|   Mumbai| 65000|
|   103|Deepa|Finance|  Chennai| 72000|
+------+-----+-------+---------+------+



In [None]:
flat_json_df.write.mode("overwrite").partitionBy("city").parquet("parquet_output/json_employees")
flat_json_df.show()

+---+-------+-----------------+---------+------+
| id|   name|            email|     city| skill|
+---+-------+-----------------+---------+------+
|201|Nandini|nandi@example.com|Hyderabad|Python|
|201|Nandini|nandi@example.com|Hyderabad| Spark|
|201|Nandini|nandi@example.com|Hyderabad|   SQL|
+---+-------+-----------------+---------+------+



Module 5: Spark SQL with Temp Views

Tasks:

Register the students DataFrame as students_view .

Write and run the following queries:

-- a) Average marks per section

-- b) Top scorer in each section

-- c) Count of students in each grade category

-- d) Students with marks above class average

-- e) Attendance-adjusted performance

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, avg, round

spark = SparkSession.builder \
    .appName("BotCampus PySpark Practice") \
    .master("local[*]") \
    .getOrCreate()

students = [
    ("Amit", "10-A", 89),
    ("Kavya", "10-B", 92),
    ("Anjali", "10-A", 78),
    ("Rohit", "10-B", 85),
    ("Sneha", "10-C", 80)
]
columns = ["name", "section", "marks"]

attendance = [
    ("Amit", 24),
    ("Kavya", 22),
    ("Anjali", 20),
    ("Rohit", 25),
    ("Sneha", 19)
]
columns2 = ["name", "days_present"]

students_df = spark.createDataFrame(students, columns)
attendance_df = spark.createDataFrame(attendance, columns2)


In [None]:
joined_df = students_df.join(attendance_df, "name")
joined_df = joined_df.withColumn("attendance_rate", round(col("days_present") / 25 * 100, 2))
graded_df = joined_df.withColumn("grade", when(col("marks") > 90, "A")
                                  .when((col("marks") <= 90) & (col("marks") >= 80), "B")
                                  .otherwise("C"))
graded_df.show()

+------+-------+-----+------------+---------------+-----+
|  name|section|marks|days_present|attendance_rate|grade|
+------+-------+-----+------------+---------------+-----+
|  Amit|   10-A|   89|          24|           96.0|    B|
|Anjali|   10-A|   78|          20|           80.0|    C|
| Kavya|   10-B|   92|          22|           88.0|    A|
| Rohit|   10-B|   85|          25|          100.0|    B|
| Sneha|   10-C|   80|          19|           76.0|    B|
+------+-------+-----+------------+---------------+-----+



In [None]:
graded_df.createOrReplaceTempView("students_view")


-- a) Average marks per section

-- b) Top scorer in each section

-- c) Count of students in each grade category

-- d) Students with marks above class average

-- e) Attendance-adjusted performance

In [None]:
spark.sql("SELECT section, AVG(marks) AS avg_marks FROM students_view GROUP BY section").show()

+-------+---------+
|section|avg_marks|
+-------+---------+
|   10-C|     80.0|
|   10-A|     83.5|
|   10-B|     88.5|
+-------+---------+



In [None]:
spark.sql("SELECT name, section, marks FROM students_view WHERE marks = (SELECT MAX(marks) FROM students_view)").show()

+-----+-------+-----+
| name|section|marks|
+-----+-------+-----+
|Kavya|   10-B|   92|
+-----+-------+-----+



In [None]:
spark.sql("SELECT grade, COUNT(*) AS count FROM students_view GROUP BY grade").show()

+-----+-----+
|grade|count|
+-----+-----+
|    B|    3|
|    C|    1|
|    A|    1|
+-----+-----+



In [None]:
spark.sql("SELECT name, section, marks FROM students_view WHERE marks > (SELECT AVG(marks) FROM students_view)").show()

+-----+-------+-----+
| name|section|marks|
+-----+-------+-----+
| Amit|   10-A|   89|
|Kavya|   10-B|   92|
|Rohit|   10-B|   85|
+-----+-------+-----+



In [None]:
spark.sql("SELECT name, section, attendance_rate FROM students_view WHERE attendance_rate > 80").show()

+-----+-------+---------------+
| name|section|attendance_rate|
+-----+-------+---------------+
| Amit|   10-A|           96.0|
|Kavya|   10-B|           88.0|
|Rohit|   10-B|          100.0|
+-----+-------+---------------+



Module 6: Partitioned Data & Incremental Loading

Step 1: Full Load

students_df.write.partitionBy("section").parquet("output/students/")

Step 2: Incremental Load

incremental = [("Tejas", "10-A", 91)]
df_inc = spark.createDataFrame(incremental, ["name", "section", "marks"])
df_inc.write.mode("append").partitionBy("section").parquet("output/students/")

Tasks:

List files in output/students/ using Python.

Read only partition 10-A and list students.

Compare before/after counts for section 10-A .

In [None]:
students = [
    ("Amit", "10-A", 89),
    ("Kavya", "10-B", 92),
    ("Anjali", "10-A", 78),
    ("Rohit", "10-B", 85),
    ("Sneha", "10-C", 80)
]
columns = ["name", "section", "marks"]

students_df = spark.createDataFrame(students, columns)

# Write partitioned by section
students_df.write.mode("overwrite").partitionBy("section").parquet("output/students/")


In [None]:
incremental = [("Tejas", "10-A", 91)]
df_inc = spark.createDataFrame(incremental, ["name", "section", "marks"])

df_inc.write.mode("append").partitionBy("section").parquet("output/students/")
df_inc.show()

+-----+-------+-----+
| name|section|marks|
+-----+-------+-----+
|Tejas|   10-A|   91|
+-----+-------+-----+



In [None]:
import os

base_path = "output/students/"
for root, dirs, files in os.walk(base_path):
    for name in files:
        print(os.path.join(root, name))


output/students/._SUCCESS.crc
output/students/_SUCCESS
output/students/section=10-A/part-00001-ba7ef944-4b5e-4f35-9881-fe71cf470024.c000.snappy.parquet
output/students/section=10-A/.part-00001-eea46714-9915-40e3-916a-162b56f20a6a.c000.snappy.parquet.crc
output/students/section=10-A/part-00000-ba7ef944-4b5e-4f35-9881-fe71cf470024.c000.snappy.parquet
output/students/section=10-A/.part-00001-ba7ef944-4b5e-4f35-9881-fe71cf470024.c000.snappy.parquet.crc
output/students/section=10-A/part-00001-eea46714-9915-40e3-916a-162b56f20a6a.c000.snappy.parquet
output/students/section=10-A/.part-00000-ba7ef944-4b5e-4f35-9881-fe71cf470024.c000.snappy.parquet.crc
output/students/section=10-B/part-00001-ba7ef944-4b5e-4f35-9881-fe71cf470024.c000.snappy.parquet
output/students/section=10-B/part-00000-ba7ef944-4b5e-4f35-9881-fe71cf470024.c000.snappy.parquet
output/students/section=10-B/.part-00001-ba7ef944-4b5e-4f35-9881-fe71cf470024.c000.snappy.parquet.crc
output/students/section=10-B/.part-00000-ba7ef944-4b

In [None]:
df_10a = spark.read.parquet("output/students/section=10-A")
df_10a.show()


+------+-----+
|  name|marks|
+------+-----+
|Anjali|   78|
| Tejas|   91|
|  Amit|   89|
+------+-----+



In [None]:
all_students_df = spark.read.parquet("output/students/")


In [None]:
count_10a = all_students_df.filter(col("section") == "10-A").count()
print(f"Total students in 10-A after incremental load: {count_10a}")


Total students in 10-A after incremental load: 3


Module 7: ETL Pipeline – End to End
Given Raw Data (CSV):
emp_id,name,dept,salary,bonus
1,Arjun,IT,75000,5000
2,Kavya,HR,62000,
3,Sneha,Finance,68000,4000
4,Ramesh,Sales,58000,
Tasks:
Load CSV with inferred schema.
Fill null bonuses with 2000 .
Create total_ctc = salary + bonus .
Filter employees with total_ctc > 65000 .
Save result in:

JSON format.
Parquet format partitioned by department.

In [None]:
csv_data = """emp_id,name,dept,salary,bonus
1,Arjun,IT,75000,5000
2,Kavya,HR,62000,
3,Sneha,Finance,68000,4000
4,Ramesh,Sales,58000,
"""

with open("employees_raw.csv", "w") as f:
    f.write(csv_data)


In [None]:
emp_df = spark.read.csv("employees_raw.csv", header=True, inferSchema=True)
emp_df.printSchema()
emp_df.show()


root
 |-- emp_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- dept: string (nullable = true)
 |-- salary: integer (nullable = true)
 |-- bonus: integer (nullable = true)

+------+------+-------+------+-----+
|emp_id|  name|   dept|salary|bonus|
+------+------+-------+------+-----+
|     1| Arjun|     IT| 75000| 5000|
|     2| Kavya|     HR| 62000| NULL|
|     3| Sneha|Finance| 68000| 4000|
|     4|Ramesh|  Sales| 58000| NULL|
+------+------+-------+------+-----+



In [None]:
emp_df_filled = emp_df.fillna({"bonus": 2000})


In [None]:
from pyspark.sql.functions import col

emp_df_ctc = emp_df_filled.withColumn("total_ctc", col("salary") + col("bonus"))
emp_df_ctc.show()


+------+------+-------+------+-----+---------+
|emp_id|  name|   dept|salary|bonus|total_ctc|
+------+------+-------+------+-----+---------+
|     1| Arjun|     IT| 75000| 5000|    80000|
|     2| Kavya|     HR| 62000| 2000|    64000|
|     3| Sneha|Finance| 68000| 4000|    72000|
|     4|Ramesh|  Sales| 58000| 2000|    60000|
+------+------+-------+------+-----+---------+



In [None]:
filtered_df = emp_df_ctc.filter(col("total_ctc") > 65000)
filtered_df.show()


+------+-----+-------+------+-----+---------+
|emp_id| name|   dept|salary|bonus|total_ctc|
+------+-----+-------+------+-----+---------+
|     1|Arjun|     IT| 75000| 5000|    80000|
|     3|Sneha|Finance| 68000| 4000|    72000|
+------+-----+-------+------+-----+---------+



In [None]:
filtered_df.write.mode("overwrite").json("output/employees_json/")
filtered_df.write.mode("overwrite").partitionBy("dept").parquet("output/employees_parquet/")
