In [0]:
from pyspark.sql import SparkSession
# Obtain the SparkSession for this notebook, registered under the
# application name "Spark DataFrames".
spark = (
    SparkSession.builder
    .appName("Spark DataFrames")
    .getOrCreate()
)

In [0]:
# Sample employee records, one tuple per employee: (Name, Department, Salary).
# Salaries rise in steps of 5000 from 20000 to 65000.
data = [
    ('Alice', 'IT', 20000),
    ('Bob', 'HR', 25000),
    ('Charlie', 'Data Engineering', 30000),
    ('David', 'Data Engineering', 35000),
    ('Eve', 'IT', 40000),
    ('Frank', 'HR', 45000),
    ('Grace', 'IT', 50000),
    ('Hannah', 'Finance', 55000),
    ('Ivan', 'Finance', 60000),
    ('Judy', 'Recruiter', 65000),
]
columns = ['Name', 'Department', 'Salary']

# Build the DataFrame from the tuples; `columns` supplies the column names.
spark_df = spark.createDataFrame(data, columns)
display(spark_df)

In [0]:
# Select: project a single column (Name) out of the employee DataFrame.
display(spark_df.select(spark_df.Name))

In [0]:
# Select: project two columns, Name and Salary, in that order.
display(spark_df.select(spark_df.Name, spark_df.Salary))

In [0]:
# Names of the employees whose Salary is greater than 40000.
# Filter first, then project: the predicate reads Salary, which is no longer
# a column of the DataFrame once only Name has been selected.
display(spark_df.where(spark_df.Salary > 40000).select('Name'))

In [0]:
# Full rows (all columns) for employees whose Salary is greater than 40000.
display(spark_df.where(spark_df['Salary'] > 40000))

In [0]:
# Rows that satisfy both conditions: Department is "Finance" AND Salary > 55000.
# Each comparison keeps its own parentheses because & binds more tightly
# than == and >.
display(
    spark_df.filter(
        (spark_df.Department == "Finance")
        & (spark_df.Salary > 55000)
    )
)


In [0]:
# Derived column: Bonus is Salary multiplied by 0.2.
# The name spark_df is rebound to the result, so later cells that use
# spark_df see the Bonus column as well.
spark_df = spark_df.withColumn('Bonus', spark_df['Salary'] * 0.2)

In [0]:
# Show the current spark_df; it includes the Bonus column once the cell that
# adds it has been run.
display(spark_df)

In [0]:
# Group by and aggregation: average Salary for each Department.
(
    spark_df
    .groupBy("Department")
    .avg("Salary")
    .show()
)

In [0]:
# Apply several aggregations per Department in one agg() call:
# average (rounded to 2 decimals), maximum, minimum and total Salary.
#
# The functions module is imported under the alias F instead of importing
# avg/max/min/sum/round by name: a by-name import would replace Python's
# built-in max, min, sum and round for every cell that runs afterwards.
from pyspark.sql import functions as F

spark_df.groupBy("Department").agg(
    F.round(F.avg("Salary"), 2).alias("Average Salary"),
    F.max("Salary").alias("Max Salary"),
    F.min("Salary").alias("Min Salary"),
    F.sum("Salary").alias("Total Salary"),
).show()

In [0]:
# join(): combine two DataFrames.

# Lookup table that maps a Department name to its Dept_Code.
dept_columns = ["Department", "Dept_Code"]
dept_data = [
    ("IT", 101),
    ("HR", 102),
    ("Finance", 103),
]
df_dept = spark.createDataFrame(dept_data, dept_columns)

# Inner join on Department: only employees whose Department also appears in
# df_dept are kept in the result.
joined_df = spark_df.join(df_dept, "Department", "inner")
display(joined_df)


In [0]:
# Transformation vs. action

# Transformation: where() returns a new DataFrame describing the rows with
# Salary greater than 50000.
filtered = spark_df.where(spark_df.Salary > 50000)

# Action: show() prints the rows of that DataFrame.
filtered.show()

### Student Practice Assignment

In [0]:
# Practice dataset, one tuple per employee: (Name, Department, Salary, Location).
# Note: this rebinds the notebook-level names `data` and `columns`.
columns = ["Name", "Department", "Salary", "Location"]
data = [
    ("Anurag", "IT", 70000, "Bangalore"),
    ("Priya", "HR", 55000, "Mumbai"),
    ("Ravi", "Finance", 65000, "Delhi"),
    ("Sneha", "IT", 60000, "Hyderabad"),
]

df_task = spark.createDataFrame(data=data, schema=columns)

In [0]:
# Practice: rows whose Salary is greater than 50000.
df_task.where(df_task.Salary > 50000).show()

In [0]:
# Practice: preview a Hike column equal to Salary multiplied by 0.1.
# The result is only shown; df_task itself is not reassigned.
df_task.withColumn('Hike', df_task['Salary'] * 0.1).show()


In [0]:
# Practice: average Salary for each Department.
(
    df_task
    .groupBy("Department")
    .avg("Salary")
    .show()
)