In [0]:
from pyspark.sql.types import StructField, StructType, StringType, IntegerType
# Schema of the small employee table; every column is declared nullable.
schema = (
    StructType()
    .add('id', IntegerType(), True)
    .add('name', StringType(), True)
    .add('Department', StringType(), True)
    .add('Salary', IntegerType(), True)
)

# Five sample employees spread over three departments.
data = [
    (1, 'Mukil', 'IT', 2000),
    (2, 'Priyanga', 'IT', 3000),
    (3, 'Mani', 'HR', 1000),
    (4, 'Lekha', 'Finance', 1400),
    (5, 'Priya', 'Finance', 3000),
]

df = spark.createDataFrame(data, schema=schema)

In [0]:
#Aggregation
from pyspark.sql import functions as F

# Average salary per department.
# The alias has to be attached to the aggregate Column: DataFrame.alias()
# only names the DataFrame itself and leaves the column called "avg(Salary)".
df.groupBy('Department').agg(F.avg('Salary').alias('Avg_Salary')).show()

# Total salary per department.
df.groupBy('Department').agg(F.sum('Salary').alias('Total_Salary')).show()
'''
The number of employees
The employee with the highest salary in that department
'''
# Head count per department (counts non-null ids).
df.groupBy('Department').agg(F.count('id').alias('Emp_Count')).show()
# Highest salary value per department (the value only, not the employee row).
df.groupBy('Department').agg(F.max('Salary').alias('Max_Salary')).show()



In [0]:
#Windowing
#Find the highest Salary per department
from pyspark.sql.window import Window
from pyspark.sql.functions import col
from pyspark.sql import functions as F

# One window per department, best-paid employee first.
window_spec = (
    Window
    .partitionBy('Department')
    .orderBy(col('Salary').desc())
)

# rank() gives tied salaries the same rank, so joint top earners are all kept.
df_ranked = df.withColumn('rank', F.rank().over(window_spec))

(
    df_ranked
    .where(col('rank') == 1)
    .select('name', 'Salary', 'Department')
    .show()
)

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

# Extended employee schema (adds bonus and city); every field is nullable.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("id", IntegerType()),
        ("name", StringType()),
        ("Department", StringType()),
        ("salary", IntegerType()),
        ("bonus", DoubleType()),
        ("city", StringType()),
    )
])

# Eight sample employees: (id, name, Department, salary, bonus, city).
data = [
    (1, "Alice", "IT", 4000, 500.0, "New York"),
    (2, "Bob", "IT", 4500, 600.0, "Chicago"),
    (3, "Charlie", "HR", 3000, 400.0, "New York"),
    (4, "David", "Finance", 3500, 200.0, "Boston"),
    (5, "Eve", "Finance", 3800, 250.0, "Chicago"),
    (6, "Frank", "HR", 3200, 300.0, "Boston"),
    (7, "Grace", "IT", 5000, 700.0, "New York"),
    (8, "Helen", "Finance", 4200, 350.0, "Chicago"),
]

df = spark.createDataFrame(data, schema=schema)
df.show()


In [0]:
'''
Find the average salary and total bonus per department.
Find the top 2 highest-paid employees in each department using a window function.
Find the total salary per city and department combination.
Find employees who earn more than the average salary of their department
Add a new column showing each employee’s salary rank within their department.
Calculate total compensation (salary + bonus) and find the top 3 earners overall.
Pivot the DataFrame to show total salary for each department across cities.
Get the department that has the highest average bonus.
Find the difference between each employee’s salary and the maximum salary in their department.
Join this DataFrame with another (say, department head info) and find which head manages the highest total salary.
'''

from pyspark.sql.window import Window
from pyspark.sql.functions import col

# Task 1: average salary (rounded to 2 decimals) and total bonus per department.
df.groupBy('Department').agg(
    F.round(F.avg('salary'), 2).alias('avg_sal'),
    F.sum('bonus').alias('sum_bonus'),
).show()

# Task 2: top 2 salaries per department.
# dense_rank() keeps ties, so more than two rows per department are possible.
window_spec = Window.partitionBy('Department').orderBy(col('salary').desc())
df_ranked = df.withColumn('ranked', F.dense_rank().over(window_spec))
df_ranked.where(col('ranked') <= 2).select('name', 'Department', 'salary').show()

# Task 3: total salary for every (city, department) pair.
df.groupBy('city', 'Department').agg(
    F.sum('salary').alias('sum_city_dept')
).show()

# Task 4: employees paid above their department's average.
# An unordered window spans the whole department, so avg() is the department mean.
window_spec = Window.partitionBy('Department')
df_avg = df.withColumn('Avg_Salary', F.avg('salary').over(window_spec))
df_avg.where(col('salary') > col('Avg_Salary')).select('name', 'salary', 'Avg_Salary').show()

from pyspark.sql.window import Window
from pyspark.sql.functions import col

# Task 5: salary rank of each employee inside their department (1 = highest paid).
window_spec = Window.partitionBy('Department').orderBy(F.desc('salary'))
df.withColumn('ranked', F.dense_rank().over(window_spec)).show()

# Task 6: total compensation (salary + bonus) and the top 3 earners overall.
df.withColumn('bonus_salary', col('bonus')+col('salary')).orderBy(col('bonus_salary').desc()).show(3)

# Task 7: total salary per department, one column per city.
df.groupBy('Department').pivot('city').agg(F.sum('salary')).show()

# Task 8: department with the highest average bonus.
# The aggregate must be sorted descending before show(1); without the sort,
# show(1) prints whichever department happens to come first.
(
    df.groupBy('Department')
      .agg(F.avg('bonus').alias('avg_bonus'))
      .orderBy(F.desc('avg_bonus'))
      .show(1)
)

# Task 9: gap between each salary and the department maximum.
win_spec = Window.partitionBy('Department')
df_max = df.withColumn('max_Salary', F.max('salary').over(win_spec))
df_max.withColumn('difference', col('max_Salary')-col('salary')).show()

# Task 10: which department head manages the highest total salary.
# The department-head lookup is built here so this cell runs on a fresh
# kernel; previously df_head was only created by a cell further down, which
# raised NameError when the notebook was executed top to bottom.
df_head = spark.createDataFrame(
    [
        ("IT", "Mr. John"),
        ("HR", "Ms. Laura"),
        ("Finance", "Mr. Steve"),
    ],
    StructType([
        StructField("Department", StringType(), True),
        StructField("head_name", StringType(), True),
    ]),
)

# Attach the head to every employee, then total the salaries per head.
df_joined = df.join(df_head, on='Department', how='inner')
df_joined = df_joined.groupBy('head_name').agg(F.sum('salary').alias('tot_head_sal'))
df_joined.show()

# show() returns None, so its result is deliberately not assigned back to
# df_joined (that used to overwrite the aggregated DataFrame with None).
df_joined.orderBy(col('tot_head_sal').desc()).show(1)



In [0]:
# Lookup table mapping each department to its head; both columns are nullable strings.
head_schema = (
    StructType()
    .add("Department", StringType(), True)
    .add("head_name", StringType(), True)
)

head_data = [("IT", "Mr. John"), ("HR", "Ms. Laura"), ("Finance", "Mr. Steve")]

df_head = spark.createDataFrame(head_data, schema=head_schema)
df_head.show()


In [0]:
#For each employee, calculate cumulative salary and cumulative bonus within their department, ordered by salary descending.
from pyspark.sql.window import Window
from pyspark.sql.functions import col
from pyspark.sql import functions as F

# Ordered window per department. With an ORDER BY and no explicit frame, Spark
# aggregates from the start of the partition up to the current row (rows with
# an equal salary share the same frame), which yields a running total.
window_spec = Window.partitionBy('Department').orderBy(col('salary').desc())

# Both running totals requested by the task; the bonus total was missing before.
(
    df.withColumn('Cum_Sal', F.sum('salary').over(window_spec))
      .withColumn('Cum_Bonus', F.sum('bonus').over(window_spec))
      .show()
)

In [0]:
#Rank employees first by department, then within city, based on total compensation (salary + bonus).
# Add salary + bonus as an extra column next to all existing ones.
df_tot_comp = df.select(
    '*',
    (col('salary') + col('bonus')).alias('total_compensation'),
)

# One window per (Department, city) pair, highest compensation first.
winspec = (
    Window
    .partitionBy('Department', 'city')
    .orderBy(F.desc('total_compensation'))
)

df_tot_comp.withColumn('ranked', F.dense_rank().over(winspec)).show()

In [0]:
#For each department, compute the salary difference with the previous highest-paid employee.
# (A stray literal "7" in front of the comment above was removed.)

# Within each department, order employees from highest to lowest salary.
win_spec = Window.partitionBy('Department').orderBy(col('salary').desc())

# lag() fetches the salary of the row just above in that ordering, i.e. the
# next better-paid colleague; it is null for the top earner of a department.
# NOTE: the variable is called df_lead but the column is computed with lag().
df_lead = df.withColumn('lagSal', F.lag('salary').over(win_spec))

# Non-negative gap to the better-paid colleague; null for the department's top earner.
df_lead.withColumn('sal_diff', col('lagSal')-col('salary')).show()


In [0]:
# Two flat records; the schema is inferred by Spark from the dict keys and values.
data = [
    dict(id=1, name="Alice", city="New York"),
    dict(id=2, name="Bob", city="Los Angeles"),
]

df = spark.createDataFrame(data)
df.show()


In [0]:
# Records with a nested "address" dict (city + state); schema is inferred by Spark.
data = [
    {"id": 1, "name": "Alice", "address": {"city": "New York", "state": "NY"}},
    {"id": 2, "name": "Bob", "address": {"city": "Los Angeles", "state": "CA"}},
]

df = spark.createDataFrame(data)
df.show(truncate=False)

# Flatten the nested address into two top-level columns.
flat_columns = [
    'id',
    'name',
    col('address.city').alias('address_city'),
    col('address.state').alias('address_state'),
]
df.select(flat_columns).show()


In [0]:
from pyspark.sql.functions import explode

# Each person carries a list of phone entries ({"type", "number"}).
data = [
    {
        "id": 1,
        "name": "Alice",
        "phoneNumbers": [
            {"type": "home", "number": "555-1234"},
            {"type": "mobile", "number": "555-5678"},
        ],
    },
    {
        "id": 2,
        "name": "Bob",
        "phoneNumbers": [{"type": "home", "number": "555-8765"}],
    },
]

df = spark.createDataFrame(data)
df.show(truncate=False)

# Step 1: one output row per phone entry.
# Step 2: pull the entry's type and number out into their own columns.
df_new = (
    df.select(
        'id',
        'name',
        explode('phoneNumbers').alias('exploded_phoneNumber'),
    )
    .select(
        'id',
        'name',
        col('exploded_phoneNumber.type').alias('phonenum_type'),
        col('exploded_phoneNumber.number').alias('phonenum_number'),
    )
)
df_new.show()

# NOTE(review): this only configures a DataFrameWriter; nothing is written
# because no save()/csv(path) call follows. Add a target path to actually export.
df_new.write.format('csv')


In [0]:
"""
Write logic to classify employees:
“High” → total_comp > 100000
“Medium” → 50000 ≤ total_comp ≤ 100000
“Low” → below 50000
"""
from pyspark.sql.functions import col, when

# NOTE(review): this cell reads `salary` and `bonus` from `df`, but the cells
# directly above rebind `df` to frames without those columns - confirm that
# the employee DataFrame is the one in `df` before running.
# NOTE(review): the cut-offs used below (5000 / 4000) differ from the
# 100000 / 50000 stated in the task text above - presumably scaled down to
# the sample data; confirm which is intended.
df_tot = df.withColumn('total_comp', col('salary') + col('bonus'))

# First matching branch wins: High, then Medium (inclusive on both ends), else Low.
emp_class = (
    when(col('total_comp') > 5000, 'High')
    .when(col('total_comp').between(4000, 5000), 'Medium')
    .otherwise('Low')
)
df_tot.withColumn('emp_class', emp_class).show()

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Reuse the running Spark session, or start one if none exists.
spark = SparkSession.builder.getOrCreate()

# ----------------------
# Create DataFrame
# ----------------------
# Each order: (order_id, customer_id, JSON array of line items, order date as text).
data = [
    (1, 100, '[{"sku":"A1","qty":2,"price":10.0},{"sku":"B2","qty":1,"price":5.5}]', "2025-10-01"),
    (2, 101, '[{"sku":"A1","qty":1,"price":10.0}]', "2025-10-02"),
]

df = spark.createDataFrame(
    data,
    schema=["order_id", "customer_id", "items_json", "order_date"],
)
df.show(truncate=False)


In [0]:
# Shape of the JSON stored in items_json: an array of {sku, qty, price} objects.
item_schema = ArrayType(
    StructType()
    .add('sku', StringType())
    .add('qty', IntegerType())
    .add('price', DoubleType())
)

# Parse the JSON text, emit one row per line item, then surface the item's
# fields as top-level columns (all earlier columns are kept alongside).
df_parsed = (
    df.withColumn('items', from_json(col('items_json'), item_schema))
      .withColumn('item', explode('items'))
      .select(
          '*',
          col('item.sku').alias('sku'),
          col('item.qty').alias('quantity'),
          col('item.price').alias('price'),
      )
)

# Keep only the order keys and the flattened item fields.
df_selected = df_parsed.select(
    'order_id', 'customer_id', 'order_date', 'sku', 'quantity', 'price'
)
df_selected.printSchema()

# Line total = quantity * price.
line_total = col('quantity').cast('long') * col('price').cast('double')
df_updated = df_selected.withColumn('total', line_total)

# Sum the line totals per order. `sum` here is the Spark aggregate brought in
# by the star import, not Python's builtin.
df_grouped = df_updated.groupBy('order_id').agg(sum('total').alias('order_total'))
df_grouped.show()

In [0]:
# Status events: (id, event_time as text, status, priority, source).
# Note that several events of the same id share an identical timestamp.
data = [
    # id 1
    (1, "2025-11-01 10:00:00", "OPEN", 1, "systemA"),
    (1, "2025-11-01 11:00:00", "CLOSED", 2, "systemB"),
    (1, "2025-11-01 11:00:00", "OPEN", 1, "systemC"),
    # id 2
    (2, "2025-11-02 09:00:00", "OPEN", 1, "systemA"),
    (2, "2025-11-02 09:05:00", "OPEN", 2, "systemA"),
    (2, "2025-11-02 09:05:00", "CLOSED", 3, "systemB"),
    # id 3
    (3, "2025-11-05 12:20:00", "PENDING", 1, "systemA"),
    (3, "2025-11-05 12:20:00", "PENDING", 2, "systemB"),
    (3, "2025-11-05 12:30:00", "OPEN", 1, "systemA"),
    # id 4
    (4, "2025-11-10 14:00:00", "RESOLVED", 5, "systemA"),
]

columns = ["id", "event_time", "status", "priority", "source"]

df = spark.createDataFrame(data, schema=columns)
df.show()

In [0]:
#latest record per ID

from pyspark.sql.window import Window
from pyspark.sql.functions import col, row_number

# Newest event first; at equal timestamps the higher priority wins.
window_spec = Window.partitionBy('id').orderBy(col('event_time').desc(), col('priority').desc())

# row_number() assigns 1 to exactly one row per id. dense_rank() (used before)
# gives every row tied on (event_time, priority) the rank 1, so an id could
# come back with several "latest" records.
df_win = df.withColumn('rank', row_number().over(window_spec)).filter(col('rank') == 1)
df_win.show()

In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import lag,col, unix_timestamp

# Chronological order per id. Several events of one id share a timestamp, so
# ordering by event_time alone leaves the relative order of those rows (and
# therefore every lag() result) undefined. priority is added as a tie-breaker,
# ascending, to mirror the "latest record per ID" cell where the higher
# priority counts as the later event at equal timestamps.
win_spec = Window.partitionBy('id').orderBy(col('event_time'), col('priority'))

# Timestamp of the previous event of the same id (null for the first event).
df_updated  = df.withColumn('lag_col', lag('event_time').over(win_spec))

# Minutes elapsed since the previous event.
df_upd_timediff = df_updated.withColumn('time_diff_min', (unix_timestamp('event_time')-unix_timestamp('lag_col'))/60)

# Status of the previous event; rows without a predecessor are dropped so that
# every remaining row describes one prev_status -> curr_status step.
df_prev_status = df_upd_timediff.withColumn('prev_status', lag('status').over(win_spec))
df_prev_status = df_prev_status.filter(col('prev_status').isNotNull()).select('id', 'prev_status', col('status').alias('curr_status'), 'time_diff_min')
df_prev_status.show()

In [0]:
# Print the DataFrame currently bound to `df` using show()'s default settings.
df.show()

In [0]:
from pyspark.sql.functions import col, count, lag, when

# Pair every event with the status of the preceding event of the same id
# (win_spec is the per-id chronological window defined in an earlier cell);
# the first event of each id has no predecessor and is dropped.
df_prev_status = df.withColumn('prev_status', lag('status').over(win_spec)).filter(col('prev_status').isNotNull())

# Count the steps where the status actually differs from the previous one.
# when() without otherwise() yields null for unchanged steps and count()
# skips nulls. The earlier countDistinct('prev_status') counted distinct
# previous statuses instead, which under-counts e.g. OPEN->CLOSED->OPEN->CLOSED.
df_count = df_prev_status.groupBy('id').agg(
    count(when(col('status') != col('prev_status'), True)).alias('status_change_count')
)

# Ids with a single event have no transition rows at all; report them as 0.
df_result = df.select("id").distinct() \
    .join(df_count, "id", "left") \
    .fillna(0, subset=["status_change_count"])
df_result.show()

In [0]:
from pyspark.sql import functions as F

# Calendar date of each event, derived from the event_time text.
df_int = df.withColumn('date_col', F.to_date('event_time'))

# Highest priority seen on each date. Column objects have no max() method
# (col('priority').max() raises "TypeError: 'Column' object is not callable"),
# so the aggregate function F.max is used instead.
df_int.groupBy('date_col').agg(F.max('priority').alias('max_priority')).show()