1. PySpark Setup & Initialization

In [None]:
from pyspark.sql import SparkSession
spark = SparkSession.builder \
.appName("BotCampus Intermediate Session") \
.master("local[*]") \
.getOrCreate()

In [None]:
data = [("Ananya", "Bangalore", 24),
("Ravi", "Hyderabad", 28),
("Kavya", "Delhi", 22),
("Meena", "Chennai", 25)]
columns = ["name", "city", "age"]
df = spark.createDataFrame(data, columns)
df.show()

+------+---------+---+
|  name|     city|age|
+------+---------+---+
|Ananya|Bangalore| 24|
|  Ravi|Hyderabad| 28|
| Kavya|    Delhi| 22|
| Meena|  Chennai| 25|
+------+---------+---+



RDD and Transformations

In [None]:
feedback = spark.sparkContext.parallelize([
"Ravi from Bangalore loved the mobile app",
"Meena from Delhi reported poor response time",
"Ajay from Pune liked the delivery speed",
"Ananya from Hyderabad had an issue with UI",
"Rohit from Mumbai gave positive feedback"
])

Tasks:

Count total number of words.

Find top 3 most common words.

Remove stop words ( from , with , the , etc.).

Create a dictionary of word → count.

In [None]:
no_of_words=feedback.flatMap(lambda x: x.split(" ")).count()
print(no_of_words)

35


In [None]:
top_comm=feedback.flatMap(lambda x: x.split(" ")).map(lambda x: (x,1)).reduceByKey(lambda x,y: x+y).takeOrdered(3, key = lambda x: -x[1])
print(top_comm)

[('from', 5), ('the', 2), ('loved', 1)]


In [None]:
from nltk.corpus import stopwords
import nltk
import string

nltk.download('stopwords')

stop_words = set(stopwords.words('english'))


filtered_word_counts = feedback.flatMap(lambda line: line.translate(str.maketrans('', '', string.punctuation)).lower().split()).filter(lambda word: word not in stop_words).map(lambda word: (word, 1)).reduceByKey(lambda a, b: a + b)

print("Word counts after removing stop words:", filtered_word_counts.collect())



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Word counts after removing stop words: [('loved', 1), ('app', 1), ('poor', 1), ('response', 1), ('liked', 1), ('speed', 1), ('ananya', 1), ('issue', 1), ('rohit', 1), ('mumbai', 1), ('positive', 1), ('feedback', 1), ('ravi', 1), ('bangalore', 1), ('mobile', 1), ('meena', 1), ('delhi', 1), ('reported', 1), ('time', 1), ('ajay', 1), ('pune', 1), ('delivery', 1), ('hyderabad', 1), ('ui', 1), ('gave', 1)]


In [None]:
word_count=filtered_word_counts.collectAsMap()


In [None]:
for word, count in word_count.items():
    print(f"{word}: {count}")

loved: 1
app: 1
poor: 1
response: 1
liked: 1
speed: 1
ananya: 1
issue: 1
rohit: 1
mumbai: 1
positive: 1
feedback: 1
ravi: 1
bangalore: 1
mobile: 1
meena: 1
delhi: 1
reported: 1
time: 1
ajay: 1
pune: 1
delivery: 1
hyderabad: 1
ui: 1
gave: 1


3. DataFrames – Transformations Tasks:
Add grade column ( >=90 → A, 80-89 → B, 70-79 → C, else D).

Group by subject, find average score.

Use when and otherwise to classify subject difficulty ( Math/Science =
Difficult).

Rank students per subject using Window function.

Apply UDF to format names (e.g., make all uppercase).

In [None]:
scores = [
("Ravi", "Math", 88),
("Ananya", "Science", 92),
("Kavya", "English", 79),
("Ravi", "English", 67),
("Neha", "Math", 94),

("Meena", "Science", 85)
]
columns = ["name", "subject", "score"]
df_scores = spark.createDataFrame(scores, columns)

1)Add grade column ( >=90 → A, 80-89 → B, 70-79 → C, else D).

In [None]:
from pyspark.sql import functions as F

df_scores = df_scores.withColumn("grade",
                                 F.when(df_scores.score >= 90, "A")
                                 .when(df_scores.score >= 80, "B")
                                 .when(df_scores.score >= 70, "C")
                                 .otherwise("D"))
df_scores.show()

+------+-------+-----+-----+
|  name|subject|score|grade|
+------+-------+-----+-----+
|  Ravi|   Math|   88|    B|
|Ananya|Science|   92|    A|
| Kavya|English|   79|    C|
|  Ravi|English|   67|    D|
|  Neha|   Math|   94|    A|
| Meena|Science|   85|    B|
+------+-------+-----+-----+



 Task 2: Group by subject, find average score

In [None]:

avg_scores = df_scores.groupBy("subject").agg(F.avg("score").alias("avg_score"))
avg_scores.show()



+-------+---------+
|subject|avg_score|
+-------+---------+
|Science|     88.5|
|   Math|     91.0|
|English|     73.0|
+-------+---------+



 Task 3: Use when and otherwise to classify subject difficulty

In [None]:

df_scores = df_scores.withColumn("difficulty",
                                 F.when(df_scores.subject.isin(["Math", "Science"]), "Difficult")
                                 .otherwise("Easy"))

df_scores.show()

+------+-------+-----+-----+----------+
|  name|subject|score|grade|difficulty|
+------+-------+-----+-----+----------+
|  Ravi|   Math|   88|    B| Difficult|
|Ananya|Science|   92|    A| Difficult|
| Kavya|English|   79|    C|      Easy|
|  Ravi|English|   67|    D|      Easy|
|  Neha|   Math|   94|    A| Difficult|
| Meena|Science|   85|    B| Difficult|
+------+-------+-----+-----+----------+



 Task 4: Rank students per subject using Window function


In [None]:
from pyspark.sql.window import Window

window = Window.partitionBy("subject").orderBy(F.col("score").desc())
df_scores = df_scores.withColumn("rank", F.rank().over(window))
df_scores.show()

+------+-------+-----+-----+----------+----+
|  name|subject|score|grade|difficulty|rank|
+------+-------+-----+-----+----------+----+
| Kavya|English|   79|    C|      Easy|   1|
|  Ravi|English|   67|    D|      Easy|   2|
|  Neha|   Math|   94|    A| Difficult|   1|
|  Ravi|   Math|   88|    B| Difficult|   2|
|Ananya|Science|   92|    A| Difficult|   1|
| Meena|Science|   85|    B| Difficult|   2|
+------+-------+-----+-----+----------+----+



 Task 5: Apply UDF to format names

In [None]:

format_name = F.udf(lambda x: x.upper())
df_scores = df_scores.withColumn("formatted_name", format_name(df_scores.name))

df_scores.show()


+------+-------+-----+-----+----------+----+--------------+
|  name|subject|score|grade|difficulty|rank|formatted_name|
+------+-------+-----+-----+----------+----+--------------+
| Kavya|English|   79|    C|      Easy|   1|         KAVYA|
|  Ravi|English|   67|    D|      Easy|   2|          RAVI|
|  Neha|   Math|   94|    A| Difficult|   1|          NEHA|
|  Ravi|   Math|   88|    B| Difficult|   2|          RAVI|
|Ananya|Science|   92|    A| Difficult|   1|        ANANYA|
| Meena|Science|   85|    B| Difficult|   2|         MEENA|
+------+-------+-----+-----+----------+----+--------------+



Tasks:
Load both datasets into PySpark.

Print schema and infer nested structure.

Flatten the JSON (use explode , select , alias ).

Convert both to Parquet and write to /tmp/output

In [38]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
# Create a SparkSession
spark = SparkSession.builder.appName("Ingest CSV and JSON").getOrCreate()


In [41]:
# Load JSON file
df_json = spark.read.option("multiLine", "True").json("employee_nested.json")
df_json.printSchema()

# Check if the dataframe contains only the _corrupt_record column
if "_corrupt_record" in df_json.columns and len(df_json.columns) == 1:
    print("JSON file could not be parsed correctly. Showing corrupt records:")
    df_json.show(truncate=False)
else:
    df_json.show()

root
 |-- address: struct (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- pincode: long (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- skills: array (nullable = true)
 |    |-- element: string (containsNull = true)

+----------------+---+-----+---------------+
|         address| id| name|         skills|
+----------------+---+-----+---------------+
|{Mumbai, 400001}|101|Sneha|[Python, Spark]|
+----------------+---+-----+---------------+



In [42]:
# Flatten the JSON
df_json_flattened = df_json.select("id", "name", F.col("address.city").alias("city"), F.col("address.pincode").alias("pincode"), F.explode("skills").alias("skill"))
df_json_flattened.printSchema()
df_json_flattened.show()


root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- pincode: long (nullable = true)
 |-- skill: string (nullable = true)

+---+-----+------+-------+------+
| id| name|  city|pincode| skill|
+---+-----+------+-------+------+
|101|Sneha|Mumbai| 400001|Python|
|101|Sneha|Mumbai| 400001| Spark|
+---+-----+------+-------+------+



In [44]:
# Load CSV file
df_csv = spark.read.csv("students.csv", header=True, inferSchema=True)
df_csv.show()
df_csv.printSchema()

+---+-----+----------+---------+------+
| id| name|department|     city|salary|
+---+-----+----------+---------+------+
|  1| Amit|        IT|Bangalore| 78000|
|  2|Kavya|        HR|  Chennai| 62000|
|  3|Arjun|   Finance|Hyderabad| 55000|
+---+-----+----------+---------+------+

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- city: string (nullable = true)
 |-- salary: integer (nullable = true)



5. Spark SQL – Temp Views & Queries

Exercise 5.1 Create view from exam scores and run:

-- a) Top scorer per subject

-- b) Count of students per grade

-- c) Students with multiple subjects

-- d) Subjects with average score above 85

Exercise 5.2 Create another DataFrame attendance(name, days_present) and:

Join with scores
Calculate attendance-adjusted grade:
If days_present < 20 → downgrade grade by one level

In [45]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F


spark = SparkSession.builder.appName("Spark SQL").getOrCreate()

scores = [
    ("Ravi", "Math", 88),
    ("Ananya", "Science", 92),
    ("Kavya", "English", 79),
    ("Ravi", "English", 67),
    ("Neha", "Math", 94),
    ("Meena", "Science", 85)
]
columns = ["name", "subject", "score"]
df_scores = spark.createDataFrame(scores, columns)



In [46]:
# Create view
df_scores.createOrReplaceTempView("exam_scores")

# Exercise 5.1
# a) Top scorer per subject
spark.sql("""
    SELECT subject, name, score
    FROM (
        SELECT subject, name, score,
        ROW_NUMBER() OVER (PARTITION BY subject ORDER BY score DESC) AS rank
        FROM exam_scores
    ) AS subquery
    WHERE rank = 1
""").show()




+-------+------+-----+
|subject|  name|score|
+-------+------+-----+
|English| Kavya|   79|
|   Math|  Neha|   94|
|Science|Ananya|   92|
+-------+------+-----+



In [47]:
# b) Count of students per grade
df_scores = df_scores.withColumn("grade",
                                 F.when(df_scores.score >= 90, "A")
                                 .when(df_scores.score >= 80, "B")
                                 .when(df_scores.score >= 70, "C")
                                 .otherwise("D"))
df_scores.createOrReplaceTempView("exam_scores")
spark.sql("""
    SELECT grade, COUNT(*) AS count
    FROM exam_scores
    GROUP BY grade
""").show()


+-----+-----+
|grade|count|
+-----+-----+
|    B|    2|
|    C|    1|
|    A|    2|
|    D|    1|
+-----+-----+



In [48]:
# c) Students with multiple subjects
spark.sql("""
    SELECT name, COUNT(DISTINCT subject) AS subject_count
    FROM exam_scores
    GROUP BY name
    HAVING subject_count > 1
""").show()



+----+-------------+
|name|subject_count|
+----+-------------+
|Ravi|            2|
+----+-------------+



In [49]:
# d) Subjects with average score above 85
spark.sql("""
    SELECT subject, AVG(score) AS avg_score
    FROM exam_scores
    GROUP BY subject
    HAVING AVG(score) > 85
""").show()


+-------+---------+
|subject|avg_score|
+-------+---------+
|Science|     88.5|
|   Math|     91.0|
+-------+---------+



In [50]:
# Exercise 5.2
attendance = [
    ("Ravi", 25),
    ("Ananya", 18),
    ("Kavya", 22),
    ("Neha", 20),
    ("Meena", 15)
]
columns = ["name", "days_present"]
df_attendance = spark.createDataFrame(attendance, columns)

# Join with scores
df_joined = df_scores.join(df_attendance, "name")

# Calculate attendance-adjusted grade
df_joined = df_joined.withColumn("adjusted_grade",
                                  F.when(df_joined.days_present < 20, F.when(df_joined.grade == "A", "B")
                                         .when(df_joined.grade == "B", "C")
                                         .when(df_joined.grade == "C", "D")
                                         .otherwise("D"))
                                  .otherwise(df_joined.grade))

df_joined.show()


+------+-------+-----+-----+------------+--------------+
|  name|subject|score|grade|days_present|adjusted_grade|
+------+-------+-----+-----+------------+--------------+
|Ananya|Science|   92|    A|          18|             B|
| Kavya|English|   79|    C|          22|             C|
| Meena|Science|   85|    B|          15|             C|
|  Neha|   Math|   94|    A|          20|             A|
|  Ravi|   Math|   88|    B|          25|             B|
|  Ravi|English|   67|    D|          25|             D|
+------+-------+-----+-----+------------+--------------+



6. Partitioned Load (Full + Incremental)

Initial Load:

df_scores.write.partitionBy("subject").parquet("/tmp/scores/")

Incremental Load:

incremental = [("Meena", "Math", 93)]

df_inc = spark.createDataFrame(incremental, columns)

df_inc.write.mode("append").partitionBy("subject").parquet("/tmp/scores/")

Task:

List all folders inside /tmp/scores/

Read only Math partition and display all entries.

In [51]:
from pyspark.sql import SparkSession
import os

# Create a SparkSession
spark = SparkSession.builder.appName("Partitioned Load").getOrCreate()
scores = [
    ("Ravi", "Math", 88),
    ("Ananya", "Science", 92),
    ("Kavya", "English", 79),
    ("Ravi", "English", 67),
    ("Neha", "Math", 94),
    ("Meena", "Science", 85)
]
columns = ["name", "subject", "score"]
df_scores = spark.createDataFrame(scores, columns)
df_scores.write.partitionBy("subject").parquet("/tmp/scores/")

In [52]:
# Incremental Load
incremental = [("Meena", "Math", 93)]
df_inc = spark.createDataFrame(incremental, columns)
df_inc.write.mode("append").partitionBy("subject").parquet("/tmp/scores/")

In [53]:
# Task 1: List all folders inside /tmp/scores/
import subprocess
folders = subprocess.check_output(["ls", "/tmp/scores/"]).decode("utf-8").splitlines()
print(folders)

['subject=English', 'subject=Math', 'subject=Science', '_SUCCESS']


In [54]:
# Task 2: Read only Math partition and display all entries
df_math = spark.read.parquet("/tmp/scores/subject=Math")
df_math.show()

+-----+-----+
| name|score|
+-----+-----+
|Meena|   93|
| Neha|   94|
| Ravi|   88|
+-----+-----+



In [55]:
df_scores = spark.read.parquet("/tmp/scores/")
df_scores.filter(df_scores.subject == "Math").show()


+-----+-----+-------+
| name|score|subject|
+-----+-----+-------+
|Meena|   93|   Math|
| Neha|   94|   Math|
| Ravi|   88|   Math|
+-----+-----+-------+



7. ETL: Clean, Transform, Load
Raw CSV:

emp_id,name,dept,salary,bonus
1,Arjun,IT,78000,5000
2,Kavya,HR,62000,
3,Sneha,Finance,55000,3000

Tasks:

Load data with header.

Fill missing bonus with 2000.

Calculate total_ctc = salary + bonus .

Filter where total_ctc > 60,000.

Save final DataFrame to Parquet and JSON.


In [57]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
spark = SparkSession.builder.appName("ETL").getOrCreate()

df = spark.read.csv("emp_data.csv", header=True, inferSchema=True)

df = df.withColumn("bonus", F.coalesce(df.bonus, F.lit(2000)))


df = df.withColumn("total_ctc", df.salary + df.bonus)

df = df.filter(df.total_ctc > 60000)

df.write.parquet("/tmp/emp_data_parquet", mode="overwrite")
df.write.json("/tmp/emp_data_json", mode="overwrite")

df.show()


+------+-----+----+------+-----+---------+
|emp_id| name|dept|salary|bonus|total_ctc|
+------+-----+----+------+-----+---------+
|     1|Arjun|  IT| 78000| 5000|    83000|
|     2|Kavya|  HR| 62000| 2000|    64000|
+------+-----+----+------+-----+---------+

