In [1]:
# Bring in the Spark entry point and the Row record type used by later cells.
from pyspark.sql import Row, SparkSession

# Reuse an already-running session when there is one; otherwise start a new
# session registered under the application name "example".
spark = (
    SparkSession.builder
    .appName("example")
    .getOrCreate()
)



In [2]:
# Sample employees, one Row per person with named fields.
data = [
    Row(first_name=first, last_name=last, salary=pay)
    for first, last, pay in (
        ("John", "Doe", 50000),
        ("Jane", "Smith", 60000),
        ("Alice", "Johnson", 70000),
        ("Bob", "Brown", 55000),
    )
]

# No schema is passed, so the columns come from the Row field names.
df = spark.createDataFrame(data)

# Project the three columns by name and print the rows.
df.select("first_name", "last_name", "salary").show()

+----------+---------+------+
|first_name|last_name|salary|
+----------+---------+------+
|      John|      Doe| 50000|
|      Jane|    Smith| 60000|
|     Alice|  Johnson| 70000|
|       Bob|    Brown| 55000|
+----------+---------+------+



In [3]:
# Print the rows of `df` (the employee sample frame) as a text table.
df.show()

+----------+---------+------+
|first_name|last_name|salary|
+----------+---------+------+
|      John|      Doe| 50000|
|      Jane|    Smith| 60000|
|     Alice|  Johnson| 70000|
|       Bob|    Brown| 55000|
+----------+---------+------+



In [4]:
# Keep only the two name columns and print them.
(
    df
    .select(["first_name", "last_name"])
    .show()
)

+----------+---------+
|first_name|last_name|
+----------+---------+
|      John|      Doe|
|      Jane|    Smith|
|     Alice|  Johnson|
|       Bob|    Brown|
+----------+---------+



In [5]:
from pyspark.sql import functions as f

# Per-first-name summary: number of rows, mean salary and highest salary.
result = (
    df
    .groupBy("first_name")
    .agg(
        f.count("*").alias("count"),
        f.avg("salary").alias("avg_salary"),
        f.max("salary").alias("max_salary"),
    )
)

In [6]:
# Print the per-first-name aggregates (count, avg_salary, max_salary).
result.show()

+----------+-----+----------+----------+
|first_name|count|avg_salary|max_salary|
+----------+-----+----------+----------+
|      John|    1|   50000.0|     50000|
|      Jane|    1|   60000.0|     60000|
|     Alice|    1|   70000.0|     70000|
|       Bob|    1|   55000.0|     55000|
+----------+-----+----------+----------+



In [7]:
# Import only the helper this cell needs. A wildcard import from
# pyspark.sql.functions would rebind Python built-ins such as `max`, `min`
# and `sum` to their Spark column counterparts for the rest of the notebook.
from pyspark.sql.functions import lit

# Append a constant "department" column: every row gets the literal "sales".
add_column = df.withColumn("department", lit("sales"))
add_column.show()

+----------+---------+------+----------+
|first_name|last_name|salary|department|
+----------+---------+------+----------+
|      John|      Doe| 50000|     sales|
|      Jane|    Smith| 60000|     sales|
|     Alice|  Johnson| 70000|     sales|
|       Bob|    Brown| 55000|     sales|
+----------+---------+------+----------+



In [8]:
# Denormalised employee records: each row carries its department and both the
# id and the name of the employee's manager.
data = [
    Row(
        emp_id=emp_id,
        first_name=first,
        last_name=last,
        salary=pay,
        department_name=dept,
        manager_id=mgr_id,
        manager_name=mgr_name,
    )
    for emp_id, first, last, pay, dept, mgr_id, mgr_name in (
        (1, "John", "Doe", 50000, "Sales", 10, "Alice"),
        (2, "Jane", "Smith", 60000, "Engineering", 20, "Bob"),
        (3, "Alice", "Johnson", 70000, "HR", 30, "Charlie"),
        (4, "Bob", "Brown", 55000, "Sales", 10, "Alice"),
        (5, "Eve", "White", 65000, "Engineering", 20, "Bob"),
    )
]

# Build the employee DataFrame from the Row objects above.
df_emp = spark.createDataFrame(data)

In [9]:
# Print the denormalised employee frame, including department and manager columns.
df_emp.show()

+------+----------+---------+------+---------------+----------+------------+
|emp_id|first_name|last_name|salary|department_name|manager_id|manager_name|
+------+----------+---------+------+---------------+----------+------------+
|     1|      John|      Doe| 50000|          Sales|        10|       Alice|
|     2|      Jane|    Smith| 60000|    Engineering|        20|         Bob|
|     3|     Alice|  Johnson| 70000|             HR|        30|     Charlie|
|     4|       Bob|    Brown| 55000|          Sales|        10|       Alice|
|     5|       Eve|    White| 65000|    Engineering|        20|         Bob|
+------+----------+---------+------+---------------+----------+------------+



In [10]:
# Engineering staff earning more than 50000, reduced to name and salary.
(
    df_emp
    .select("first_name", "last_name", "salary")
    .filter((df_emp.salary > 50000) & (df_emp.department_name == "Engineering"))
    .show()
)

+----------+---------+------+
|first_name|last_name|salary|
+----------+---------+------+
|      Jane|    Smith| 60000|
|       Eve|    White| 65000|
+----------+---------+------+



In [21]:
# Normalised sample data: employees carry only a manager_id, and the manager
# names live in a separate lookup table.
data_employees = [
    Row(
        emp_id=emp_id,
        first_name=first,
        last_name=last,
        salary=pay,
        department_name=dept,
        manager_id=mgr_id,
    )
    for emp_id, first, last, pay, dept, mgr_id in (
        (1, "John", "Doe", 50000, "Sales", 10),
        (2, "Jane", "Smith", 60000, "Engineering", 20),
        (3, "Alice", "Johnson", 70000, "HR", 30),
        (4, "Bob", "Brown", 55000, "Sales", 10),
        (5, "Eve", "White", 65000, "Engineering", 20),
    )
]
data_managers = [
    Row(manager_id=mgr_id, manager_name=name)
    for mgr_id, name in ((10, "Alice"), (20, "Bob"), (30, "Charlie"))
]

# Employee table: build it and print it first.
df_employees = spark.createDataFrame(data_employees)
df_employees.show()

# Manager lookup table: build it and print it second.
df_managers = spark.createDataFrame(data_managers)
df_managers.show()


+------+----------+---------+------+---------------+----------+
|emp_id|first_name|last_name|salary|department_name|manager_id|
+------+----------+---------+------+---------------+----------+
|     1|      John|      Doe| 50000|          Sales|        10|
|     2|      Jane|    Smith| 60000|    Engineering|        20|
|     3|     Alice|  Johnson| 70000|             HR|        30|
|     4|       Bob|    Brown| 55000|          Sales|        10|
|     5|       Eve|    White| 65000|    Engineering|        20|
+------+----------+---------+------+---------------+----------+

+----------+------------+
|manager_id|manager_name|
+----------+------------+
|        10|       Alice|
|        20|         Bob|
|        30|     Charlie|
+----------+------------+



In [22]:
# Inner join of employees to managers on equal manager_id values.
# Because the join condition is an explicit column equality, the result keeps
# the manager_id column from both sides.
df_joined = df_employees.join(
    df_managers,
    on=df_employees.manager_id == df_managers.manager_id,
    how="inner",
)
df_joined.show()

+------+----------+---------+------+---------------+----------+----------+------------+
|emp_id|first_name|last_name|salary|department_name|manager_id|manager_id|manager_name|
+------+----------+---------+------+---------------+----------+----------+------------+
|     1|      John|      Doe| 50000|          Sales|        10|        10|       Alice|
|     4|       Bob|    Brown| 55000|          Sales|        10|        10|       Alice|
|     2|      Jane|    Smith| 60000|    Engineering|        20|        20|         Bob|
|     5|       Eve|    White| 65000|    Engineering|        20|        20|         Bob|
|     3|     Alice|  Johnson| 70000|             HR|        30|        30|     Charlie|
+------+----------+---------+------+---------------+----------+----------+------------+



In [23]:
# Left join of employees to managers on equal manager_id values: every row of
# df_employees is kept. This rebinds df_joined, replacing any earlier value.
df_joined = df_employees.join(
    df_managers,
    on=df_employees.manager_id == df_managers.manager_id,
    how="left",
)
df_joined.show()

+------+----------+---------+------+---------------+----------+----------+------------+
|emp_id|first_name|last_name|salary|department_name|manager_id|manager_id|manager_name|
+------+----------+---------+------+---------------+----------+----------+------------+
|     1|      John|      Doe| 50000|          Sales|        10|        10|       Alice|
|     2|      Jane|    Smith| 60000|    Engineering|        20|        20|         Bob|
|     3|     Alice|  Johnson| 70000|             HR|        30|        30|     Charlie|
|     4|       Bob|    Brown| 55000|          Sales|        10|        10|       Alice|
|     5|       Eve|    White| 65000|    Engineering|        20|        20|         Bob|
+------+----------+---------+------+---------------+----------+----------+------------+



In [24]:
# Import the functions module under the alias this cell uses. Without it the
# cell raises NameError when the notebook is run top to bottom on a fresh
# kernel, because no earlier cell binds the name `F`.
from pyspark.sql import functions as F

# Count how many employees belong to each department.
department_count_df = df_employees.groupBy("department_name").agg(
    F.count("*").alias("employee_count")
)
department_count_df.show()

+---------------+--------------+
|department_name|employee_count|
+---------------+--------------+
|          Sales|             2|
|    Engineering|             2|
|             HR|             1|
+---------------+--------------+



In [14]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Employee records as plain tuples. The column names
# (emp_id, first_name, last_name, salary) are supplied separately when the
# DataFrame is created from this list.
data = [
    (1, "John", "Doe", 50000),
    (2, "Jane", "Smith", 60000),
    (3, "Alice", "Johnson", 70000),
    (4, "Bob", "Brown", 55000),
    (5, "Eve", "White", 65000)
]

In [15]:
# Build the DataFrame from the tuples in `data`, naming the columns explicitly.
# This rebinds df_employees to a four-column frame.
df_employees = spark.createDataFrame(
    data,
    schema=["emp_id", "first_name", "last_name", "salary"],
)
df_employees.show()

+------+----------+---------+------+
|emp_id|first_name|last_name|salary|
+------+----------+---------+------+
|     1|      John|      Doe| 50000|
|     2|      Jane|    Smith| 60000|
|     3|     Alice|  Johnson| 70000|
|     4|       Bob|    Brown| 55000|
|     5|       Eve|    White| 65000|
+------+----------+---------+------+



In [16]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# A single window over the entire frame (no partitioning), highest salary first.
window_spec = Window.orderBy(F.desc("salary"))

# dense_rank gives tied salaries the same rank without gaps, so rank 2 is the
# second-highest distinct salary.
df_with_rank = df_employees.withColumn("rank", F.dense_rank().over(window_spec))

second_highest = (
    df_with_rank
    .where(F.col("rank") == 2)
    .select("first_name", "last_name", "salary")
)

second_highest.show()

+----------+---------+------+
|first_name|last_name|salary|
+----------+---------+------+
|       Eve|    White| 65000|
+----------+---------+------+



In [42]:
# Print the denormalised employee frame (department and manager columns).
# `df_emp` is the frame in this notebook that has this schema; when the cells
# are run top to bottom, `df_employees` has most recently been rebuilt with
# only emp_id, first_name, last_name and salary.
df_emp.show()

+------+----------+---------+------+---------------+----------+------------+
|emp_id|first_name|last_name|salary|department_name|manager_id|manager_name|
+------+----------+---------+------+---------------+----------+------------+
|     1|      John|      Doe| 50000|          Sales|        10|       Alice|
|     2|      Jane|    Smith| 60000|    Engineering|        20|         Bob|
|     3|     Alice|  Johnson| 70000|             HR|        30|     Charlie|
|     4|       Bob|    Brown| 55000|          Sales|        10|       Alice|
|     5|       Eve|    White| 65000|    Engineering|        20|         Bob|
+------+----------+---------+------+---------------+----------+------------+



In [45]:
# Number employees within each department, lowest salary first.
window_row = Window.partitionBy("department_name").orderBy("salary")

# `df_emp` is used because it contains the partition column department_name.
# When the cells are run top to bottom, `df_employees` has most recently been
# rebuilt with only emp_id, first_name, last_name and salary, so applying this
# window to it would fail on the missing column.
# The new column is named "rank" but holds row_number() values.
df_row_num = df_emp.withColumn("rank", F.row_number().over(window_row))
result_df = df_row_num.select("first_name", "last_name", "salary", "department_name", "rank")
result_df.show()

+----------+---------+------+---------------+----+
|first_name|last_name|salary|department_name|rank|
+----------+---------+------+---------------+----+
|      Jane|    Smith| 60000|    Engineering|   1|
|       Eve|    White| 65000|    Engineering|   2|
|     Alice|  Johnson| 70000|             HR|   1|
|      John|      Doe| 50000|          Sales|   1|
|       Bob|    Brown| 55000|          Sales|   2|
+----------+---------+------+---------------+----+



In [49]:
# Add a salary_category label to every employee row.
# `df_emp` is used because it is the frame that has the department and manager
# columns; when the cells are run top to bottom, `df_employees` has most
# recently been rebuilt with only emp_id, first_name, last_name and salary.
#
# NOTE(review): the conditions are tested in order, and any salary equal to
# 70000 already satisfies salary > 10000, so the "low" branch can never be
# chosen. The labels and thresholds look like placeholders; confirm the
# intended categories before relying on this column.
df_case_when = df_emp.withColumn(
    "salary_category",
    F.when(df_emp["salary"] > 10000, "High")
     .when(df_emp["salary"] == 70000, "low")
     .otherwise("heavy"),
)
df_case_when.show()

+------+----------+---------+------+---------------+----------+------------+---------------+
|emp_id|first_name|last_name|salary|department_name|manager_id|manager_name|salary_category|
+------+----------+---------+------+---------------+----------+------------+---------------+
|     1|      John|      Doe| 50000|          Sales|        10|       Alice|           High|
|     2|      Jane|    Smith| 60000|    Engineering|        20|         Bob|           High|
|     3|     Alice|  Johnson| 70000|             HR|        30|     Charlie|           High|
|     4|       Bob|    Brown| 55000|          Sales|        10|       Alice|           High|
|     5|       Eve|    White| 65000|    Engineering|        20|         Bob|           High|
+------+----------+---------+------+---------------+----------+------------+---------------+



In [52]:
# Bare expression: the notebook displays the active SparkSession object.
spark

In [19]:
# One-row frame holding the current date rendered with the "yyyy-MM-dd" pattern.
# Note that this rebinds `df`, which earlier cells use for the employee frame.
df = (
    spark.range(1)
    .select(
        F.date_format(F.current_date(), "yyyy-MM-dd").alias("formatted_date")
    )
)
df.show()

+--------------+
|formatted_date|
+--------------+
|    2025-01-22|
+--------------+



In [3]:
from pyspark.sql.functions import col

# Rows are (employee_id, employee_name, manager_id); manager_id is None for
# the one employee that has no manager.
data = [
    (1, "Alice", 3),
    (2, "Bob", 3),
    (3, "Charlie", None),
    (4, "David", 2),
]
columns = ["employee_id", "employee_name", "manager_id"]

# This rebinds df_employees to the three-column hierarchy frame.
df_employees = spark.createDataFrame(data, schema=columns)

# Self-join: the same frame is used twice under the aliases "emp" and "mgr" so
# that each employee's manager_id can be matched against a manager's
# employee_id. The left join keeps employees that have no manager. Only the
# employee name and the matched manager name are selected.
df_with_manager = (
    df_employees.alias("emp")
    .join(
        df_employees.alias("mgr"),
        on=col("emp.manager_id") == col("mgr.employee_id"),
        how="left",
    )
    .select(
        col("emp.employee_name").alias("employee_name"),
        col("mgr.employee_name").alias("manager_name"),
    )
)

# Print the employee/manager pairs.
df_with_manager.show()

+-------------+------------+
|employee_name|manager_name|
+-------------+------------+
|        Alice|     Charlie|
|          Bob|     Charlie|
|      Charlie|        null|
|        David|         Bob|
+-------------+------------+

