In [0]:
from pyspark.sql import SparkSession
# Get the active Spark session, or start a new one named "DataXBootcamp".
spark = (
    SparkSession.builder
    .appName("DataXBootcamp")
    .getOrCreate()
)

In [0]:
# Create a realistic Employee DataFrame.
# Each record is a tuple such as ("Alice", "IT", 50000) holding different
# kinds of data: name (str), department (str) and salary (int).
# The records are kept in a list rather than a set: a set has no defined
# order (rows could appear in a different order on every run) and would
# silently drop duplicate records.
# NOTE(review): "Finace" looks like a misspelling of "Finance"; it is left
# unchanged here because the value is data that other cells match against.
data = [
    ("Alice", "IT", 50000),
    ("Naina", "Data Engineer", 90000),
    ("Ria", "HR", 53000),
    ("Shyam", "Recruiter", 65000),
    ("Priya", "HR", 30000),
    ("Bob", "IT", 85000),
    ("Stuti", "Finace", 51000),
    ("Paul", "HR", 65000),
    ("Sid", "Finace", 40000),
    ("Maddy", "Recruiter", 50000),
]

# Column names, in the same order as the fields of each tuple.
column = ["Name", "Department", "Salary"]
df = spark.createDataFrame(data, column)
display(df)

In [0]:
# Select and filter on the DataFrame created above.
# Project a single column: only "Name" is kept.
display(df.select(df.Name))

In [0]:
# Project two columns; the output follows the order given here (Salary, then Name).
display(df.select(["Salary", "Name"]))

In [0]:
# Filter rows based on salary: keep employees earning more than 50000.
# The column is referenced with attribute access (df.Salary).
high_earners = df.filter(df.Salary > 50000)
display(high_earners)

In [0]:
# The same filter written with bracket access: df["Salary"] instead of df.Salary.
# where() is an alias of filter().
display(df.where(df["Salary"] > 50000))

In [0]:
# Apply multiple filters.
# Each condition is built separately and combined with & (logical AND),
# so only IT employees earning more than 50000 remain.
in_it_dept = df['Department'] == 'IT'
above_50k = df['Salary'] > 50000
display(df.filter(in_it_dept & above_50k))

In [0]:
# Create a derived column
from pyspark.sql.functions import col

# Add a 10% bonus for everyone.
# df is rebound to the new DataFrame, so every later cell that uses df
# also sees the Bonus column.
bonus = col("Salary") * 0.1
df = df.withColumn("Bonus", bonus)
display(df)

In [0]:
# Group by and aggregation:
# average salary per department, printed as a plain-text table.
avg_salary_by_dept = df.groupBy("Department").avg("Salary")
avg_salary_by_dept.show()

In [0]:
# Apply multiple aggregations per department in one pass.
# The Spark functions are reached through the `F` namespace rather than
# `from pyspark.sql.functions import avg, max, min`: that form rebinds
# Python's built-in max() and min() for every later cell in the notebook.
from pyspark.sql import functions as F

display(df.groupBy("Department").agg(
    F.avg("Salary").alias("Avg_Salary"),   # mean salary of the department
    F.max("Salary").alias("Max_Salary"),   # highest salary of the department
    F.min("Salary").alias("Min_Salary")    # lowest salary of the department
))

In [0]:
# Create a second DataFrame that maps each department to its code.
# The department names are join keys, so they must match the "Department"
# values of the employee DataFrame exactly. The employee data spells the
# finance department "Finace"; the previous key "Fiance" matched no employee
# row, so an inner join on Department dropped every finance employee.
dept_data = [("IT", 101), ("HR", 102), ("Finace", 103)]
dept_columns = ["Department", "Dept_code"]
df_dept = spark.createDataFrame(dept_data, dept_columns)

display(df_dept)

In [0]:
# Combine the two DataFrames (join operation).
# An inner join on "Department" keeps only the employees whose department
# also appears in df_dept and adds the Dept_code column to those rows;
# departments without a code (e.g. "Data Engineer", "Recruiter") are dropped.
joined_df = df.join(df_dept, "Department", "inner")
display(joined_df)

In [0]:
# Transformations vs. actions

# Transformation: only describes the filtered DataFrame; it is evaluated lazily.
filtered = df.where(df["Salary"] > 50000)

# Action: triggers the computation and prints the resulting rows.
filtered.show()

## Assignment 

In [0]:
# Assignment data set: one (name, department, salary, location) tuple per employee.
df_data = [
    ("Naina", "IT", 90000, "Noida"),
    ("John", "HR", 50000, "Gurugram"),
    ("Priya", "Finace", 60000, "Pune"),
    ("Riya", "Finace", 80000, "Pune"),
    ("Rohit", "IT", 65000, "Banglore"),
    ("Alex", "HR", 85000, "Gurugram"),
    ("Krishna", "Finace", 90000, "Jaipur"),
    ("Ram", "IT", 30000, "Banglore"),
    ("Shyam", "HR", 45000, "Noida"),
]

# Column names, in the same order as the fields of each tuple.
df_column = ["Name", "Department", "Salary", "Location"]

new_df = spark.createDataFrame(df_data, schema=df_column)
display(new_df)

In [0]:
# Filter IT employees with salary > 60k.
# Two chained filters are equivalent to one filter with both conditions ANDed.
display(
    new_df
    .filter(col("Department") == "IT")
    .filter(col("Salary") > 60000)
)

In [0]:
# Add column "Hike_Amount" as 15% of salary.
# new_df itself is left untouched; the result is bound to df_task.
hike_amount = col("Salary") * 0.15
df_task = new_df.withColumn("Hike_Amount", hike_amount)
display(df_task)  # or df_task.show()

In [0]:
# Group by department and show the average salary of each group.
# The dict form {"column": "function"} asks for the same avg(Salary) aggregate.
display(new_df.groupBy("Department").agg({"Salary": "avg"}))