<a href="https://colab.research.google.com/github/martinpius/PG_training/blob/main/spark_intro_ST7203_Big_Data_Management.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from pyspark.sql import SparkSession


In [2]:
# Step 1: Create a SparkSession
# SparkSession is the entry point to Spark functionality. It is used to initialize Spark.
spark = SparkSession.builder \
    .appName("Spark Activities Example") \
    .getOrCreate()


In [3]:
# Step 2: Create a toy dataset
# Let's create a list of tuples representing (Name, Age, Gender, Salary)
data = [
    ("Alice", 30, "Female", 70000),
    ("Bob", 35, "Male", 80000),
    ("Charlie", 25, "Male", 50000),
    ("Diana", 29, "Female", 60000),
    ("Edward", 40, "Male", 100000)
]


In [4]:

# Step 3: Create an RDD
# An RDD (Resilient Distributed Dataset) is the core abstraction of Spark for distributed data processing.
rdd = spark.sparkContext.parallelize(data)


In [5]:
# Step 4: Apply transformations and actions to the RDD
# Transformations return a new RDD, and actions compute a result.

# Example 1: Count the number of elements in the RDD
print("Number of records in the RDD:", rdd.count())


Number of records in the RDD: 5


In [6]:

# Example 2: Filter records where age > 30
age_filtered_rdd = rdd.filter(lambda x: x[1] > 30)
print("Filtered RDD where Age > 30:")
print(age_filtered_rdd.collect())  # collect() returns all elements of the RDD to the driver


Filtered RDD where Age > 30:
[('Bob', 35, 'Male', 80000), ('Edward', 40, 'Male', 100000)]


In [7]:
# Example 3: Map Transformation - Extract just the names
names_rdd = rdd.map(lambda x: x[0])
print("Names in the dataset:")
print(names_rdd.collect())


Names in the dataset:
['Alice', 'Bob', 'Charlie', 'Diana', 'Edward']


In [8]:
# Example 4: Reduce - Calculate the total salary
total_salary = rdd.map(lambda x: x[3]).reduce(lambda a, b: a + b)
print("Total salary of all employees:", total_salary)


Total salary of all employees: 360000


In [9]:
# -------------------------------------------
# Part 2: Convert the RDD to a DataFrame and use Spark SQL
# -------------------------------------------

# Step 1: Convert RDD to a DataFrame
# We need to define column names to structure the DataFrame
columns = ["Name", "Age", "Gender", "Salary"]
df = spark.createDataFrame(rdd, schema=columns)


In [10]:
# Step 2: Show the DataFrame
# Show the first 5 rows of the DataFrame
print("DataFrame:")
df.show()


DataFrame:
+-------+---+------+------+
|   Name|Age|Gender|Salary|
+-------+---+------+------+
|  Alice| 30|Female| 70000|
|    Bob| 35|  Male| 80000|
|Charlie| 25|  Male| 50000|
|  Diana| 29|Female| 60000|
| Edward| 40|  Male|100000|
+-------+---+------+------+



In [11]:

# Step 3: Create a temporary SQL view
# This allows us to run SQL queries directly on the DataFrame
df.createOrReplaceTempView("employees")


In [12]:
# Step 4: Use SQL queries to analyze the data
print("Employees with Age > 30 (using SQL):")
spark.sql("SELECT Name, Age FROM employees WHERE Age > 30").show()

print("Average Salary (using SQL):")
spark.sql("SELECT AVG(Salary) as AverageSalary FROM employees").show()

print("Count of Employees by Gender (using SQL):")
spark.sql("SELECT Gender, COUNT(*) as Count FROM employees GROUP BY Gender").show()


Employees with Age > 30 (using SQL):
+------+---+
|  Name|Age|
+------+---+
|   Bob| 35|
|Edward| 40|
+------+---+

Average Salary (using SQL):
+-------------+
|AverageSalary|
+-------------+
|      72000.0|
+-------------+

Count of Employees by Gender (using SQL):
+------+-----+
|Gender|Count|
+------+-----+
|Female|    2|
|  Male|    3|
+------+-----+



In [13]:

# -------------------------------------------
# Part 3: MapReduce Example
# -------------------------------------------

# Example: Count the number of employees by gender using map-reduce in PySpark

# Step 1: Map each record to a (key, value) pair where key = gender and value = 1
gender_pairs = rdd.map(lambda x: (x[2], 1))



In [14]:
# Step 2: Use reduceByKey to count occurrences of each gender
gender_counts = gender_pairs.reduceByKey(lambda a, b: a + b)

print("Employee count by gender (using MapReduce):")
print(gender_counts.collect())

# -------------------------------------------
# Tasks for Students:
# -------------------------------------------



Employee count by gender (using MapReduce):
[('Female', 2), ('Male', 3)]


In [15]:
# Task 1: Find the maximum salary in the dataset using RDD transformations.
max_salary = rdd.map(lambda x: x[3]).reduce(lambda a, b: a if a > b else b)
print("Maximum Salary:", max_salary)



Maximum Salary: 100000


In [16]:
# Task 2: Find all employees whose salary is above the average salary using SQL.
average_salary = spark.sql("SELECT AVG(Salary) as avg_salary FROM employees").collect()[0]["avg_salary"]
high_earners = rdd.filter(lambda x: x[3] > average_salary)
print("Employees earning above the average salary:")
print(high_earners.collect())


Employees earning above the average salary:
[('Bob', 35, 'Male', 80000), ('Edward', 40, 'Male', 100000)]


In [17]:

# Task 3: Group employees by age range (e.g., 20-30, 31-40, etc.) and count them (students can use map-reduce or SQL).

# Step 1: Map each employee to an age range
age_range_rdd = rdd.map(lambda x: (f"{(x[1] // 10) * 10}-{(x[1] // 10) * 10 + 9}", 1))

# Step 2: Reduce by age range
age_range_counts = age_range_rdd.reduceByKey(lambda a, b: a + b)
print("Employee count by age range:")
print(age_range_counts.collect())



Employee count by age range:
[('20-29', 2), ('40-49', 1), ('30-39', 2)]


In [18]:
# -------------------------------------------
# Clean up
# -------------------------------------------
# Stop the SparkSession to free resources
spark.stop()