<a href="https://colab.research.google.com/github/nitiksha/PySpark_code_practice/blob/main/emp_ques.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [26]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType

# Reuse the active Spark session if one exists, otherwise start a new one.
spark = SparkSession.builder.getOrCreate()

# Employee schema: every column is nullable; salary is a whole number.
schema = (
    StructType()
    .add("emp_id", IntegerType(), True)
    .add("name", StringType(), True)
    .add("department", StringType(), True)
    .add("salary", IntegerType(), True)
)

# Sample rows as (emp_id, name, department, salary).
# emp_id 101 appears twice and one of its salaries is missing, which gives the
# de-duplication and null-handling cells below something to act on.
data = [
    (101, "Aditi", "HR", None),
    (101, "Aditi", "Engineering", 72000),
    (103, "Neha", "HR", 60000),
    (104, "Arjun", "Sales", 48000),
    (105, "Priya", "Engineering", 75000),
    (106, "Karan", "Sales", 50000),
    (107, "Meera", "Engineering", 71000),
    (108, "Ishaan", "HR", 50000),
    (109, "Tanya", "Engineering", 73000),
    (110, "Varun", "Engineering", 69000),
]

# Build the employee DataFrame with the explicit schema and preview it.
df = spark.createDataFrame(data=data, schema=schema)
df.show()

+------+------+-----------+------+
|emp_id|  name| department|salary|
+------+------+-----------+------+
|   101| Aditi|         HR|  NULL|
|   101| Aditi|Engineering| 72000|
|   103|  Neha|         HR| 60000|
|   104| Arjun|      Sales| 48000|
|   105| Priya|Engineering| 75000|
|   106| Karan|      Sales| 50000|
|   107| Meera|Engineering| 71000|
|   108|Ishaan|         HR| 50000|
|   109| Tanya|Engineering| 73000|
|   110| Varun|Engineering| 69000|
+------+------+-----------+------+



In [24]:
# Keep a single row per (emp_id, name) pair.
# NOTE(review): nothing here controls which of the duplicate rows is retained;
# the recorded output kept the NULL-salary HR row for emp_id 101 -- confirm
# that is the row you want before relying on dedup_df.
dedup_keys = ["emp_id", "name"]
dedup_df = df.drop_duplicates(dedup_keys)
dedup_df.show()

+------+------+-----------+------+
|emp_id|  name| department|salary|
+------+------+-----------+------+
|   101| Aditi|         HR|  NULL|
|   103|  Neha|         HR| 60000|
|   104| Arjun|      Sales| 48000|
|   105| Priya|Engineering| 75000|
|   106| Karan|      Sales| 50000|
|   107| Meera|Engineering| 71000|
|   108|Ishaan|         HR| 50000|
|   109| Tanya|Engineering| 73000|
|   110| Varun|Engineering| 69000|
+------+------+-----------+------+



In [28]:
# Two alternative treatments of a missing salary, both derived from the
# untouched employee DataFrame `df`.

# Option 1: discard every row whose salary is NULL.
dropnull_df = df.na.drop(subset=["salary"])
dropnull_df.show()

# Option 2: keep every row and substitute 0 for a NULL salary.
filled_df = df.na.fill({"salary": 0})
filled_df.show()

+------+------+-----------+------+
|emp_id|  name| department|salary|
+------+------+-----------+------+
|   101| Aditi|Engineering| 72000|
|   103|  Neha|         HR| 60000|
|   104| Arjun|      Sales| 48000|
|   105| Priya|Engineering| 75000|
|   106| Karan|      Sales| 50000|
|   107| Meera|Engineering| 71000|
|   108|Ishaan|         HR| 50000|
|   109| Tanya|Engineering| 73000|
|   110| Varun|Engineering| 69000|
+------+------+-----------+------+

+------+------+-----------+------+
|emp_id|  name| department|salary|
+------+------+-----------+------+
|   101| Aditi|         HR|     0|
|   101| Aditi|Engineering| 72000|
|   103|  Neha|         HR| 60000|
|   104| Arjun|      Sales| 48000|
|   105| Priya|Engineering| 75000|
|   106| Karan|      Sales| 50000|
|   107| Meera|Engineering| 71000|
|   108|Ishaan|         HR| 50000|
|   109| Tanya|Engineering| 73000|
|   110| Varun|Engineering| 69000|
+------+------+-----------+------+



In [10]:
from pyspark.sql.functions import *

In [11]:
# Employees earning strictly more than 50,000.
# Rows with a NULL salary are excluded too: the comparison does not evaluate
# to true for them.
high_salary = df["salary"] > 50000
filter_df = df.where(high_salary)
filter_df.show()

+------+-----+-----------+------+
|emp_id| name| department|salary|
+------+-----+-----------+------+
|   101|Aditi|         HR| 55000|
|   102| Ravi|Engineering| 72000|
|   103| Neha|         HR| 60000|
|   105|Priya|Engineering| 75000|
|   107|Meera|Engineering| 71000|
|   109|Tanya|Engineering| 73000|
|   110|Varun|Engineering| 69000|
+------+-----+-----------+------+



In [12]:
# Per-department salary summary: average salary and row count.
#
# The result is bound to its own name instead of rebinding `df`. Overwriting
# `df` with the aggregate removed the emp_id/name/salary columns, so this cell
# could not be re-run and every earlier transform cell broke until the setup
# cell was executed again.
#
# avg("salary") skips NULL salaries while count("*") counts every row, so a
# department's count can be larger than the number of salaries averaged.
dept_summary_df = df.groupBy("department").agg(
    avg("salary").alias("avg"),
    count("*"),
)
dept_summary_df.show()

+-----------+-------+--------+
| department|    avg|count(1)|
+-----------+-------+--------+
|      Sales|49000.0|       2|
|Engineering|72000.0|       5|
|         HR|55000.0|       3|
+-----------+-------+--------+

