In [9]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Start (or reuse) the Spark session used by every example below
spark = (
    SparkSession.builder
    .appName("withColumnExample")
    .getOrCreate()
)

# Sample rows, one tuple per person: (id, name, age)
data = [
    (1, "Alice", 25),
    (2, "Bob", 30),
    (3, "Charlie", 35),
]
columns = ["id", "name", "age"]

# Build the DataFrame; the list of names is passed as the schema
df = spark.createDataFrame(data, schema=columns)

# Show the starting DataFrame
df.show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  1|  Alice| 25|
|  2|    Bob| 30|
|  3|Charlie| 35|
+---+-------+---+



In [10]:
# Rename 'age' to 'years' using withColumn: copy the values into a new
# 'years' column, then drop the original 'age' column in the same chain
df = (
    df
    .withColumn("years", df["age"])
    .drop("age")
)

# Show the renamed column
df.show()

+---+-------+-----+
| id|   name|years|
+---+-------+-----+
|  1|  Alice|   25|
|  2|    Bob|   30|
|  3|Charlie|   35|
+---+-------+-----+



In [11]:
from pyspark.sql.functions import expr

# SQL expression evaluated per row: 12 months for every year
months_from_years = expr("years * 12")

# Append the derived value as a new 'months' column
df = df.withColumn("months", months_from_years)

# Show the DataFrame with the new column
df.show()

+---+-------+-----+------+
| id|   name|years|months|
+---+-------+-----+------+
|  1|  Alice|   25|   300|
|  2|    Bob|   30|   360|
|  3|Charlie|   35|   420|
+---+-------+-----+------+



In [12]:
from pyspark.sql.types import StringType

# Replace 'id' with a string-typed copy of itself; reusing the existing
# column name makes withColumn overwrite the column instead of adding one
df = df.withColumn("id", df["id"].astype(StringType()))

# Show the DataFrame (the rendered values look the same; only the type changed)
df.show()

+---+-------+-----+------+
| id|   name|years|months|
+---+-------+-----+------+
|  1|  Alice|   25|   300|
|  2|    Bob|   30|   360|
|  3|Charlie|   35|   420|
+---+-------+-----+------+



In [13]:
from pyspark.sql.functions import when

# Sample rows, one tuple per person: (id, name, age, salary)
data = [
    (1, "Alice", 25, 45000),
    (2, "Bob", 30, 55000),
    (3, "Charlie", 35, 60000),
]
columns = ["id", "name", "age", "salary"]

# Start over with a fresh DataFrame that includes the salary column
df = spark.createDataFrame(data, schema=columns)
df.show()

# Conditional 'tax' column: 10% of salary when salary >= 50000, otherwise 5%
df = df.withColumn(
    "tax",
    when(df["salary"] >= 50000, df["salary"] * 0.1)
    .otherwise(df["salary"] * 0.05),
)

# Show the DataFrame with the computed tax
df.show()

+---+-------+---+------+
| id|   name|age|salary|
+---+-------+---+------+
|  1|  Alice| 25| 45000|
|  2|    Bob| 30| 55000|
|  3|Charlie| 35| 60000|
+---+-------+---+------+

+---+-------+---+------+------+
| id|   name|age|salary|   tax|
+---+-------+---+------+------+
|  1|  Alice| 25| 45000|2250.0|
|  2|    Bob| 30| 55000|5500.0|
|  3|Charlie| 35| 60000|6000.0|
+---+-------+---+------+------+



In [14]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def age_group(age):
    """Map an age to a coarse age-group label.

    Args:
        age: Age in years, or None for a missing value.

    Returns:
        "Young" for ages below 30, "Middle-aged" for ages from 30 up to
        (but not including) 45, "Old" for 45 and above, and None when
        ``age`` is None.
    """
    # Spark hands SQL NULLs to Python UDFs as None; comparing None with an
    # int raises TypeError, so propagate the missing value instead.
    if age is None:
        return None
    if age < 30:
        return "Young"
    elif age < 45:
        return "Middle-aged"
    else:
        return "Old"

# Wrap the Python function as a Spark UDF whose result type is string
age_group_udf = udf(f=age_group, returnType=StringType())

In [15]:
# Apply the UDF to every 'age' value and store the label in 'age_group'
df = df.withColumn(
    "age_group",
    age_group_udf(df["age"]),
)

# Show the DataFrame with the new label column
df.show()

+---+-------+---+------+------+-----------+
| id|   name|age|salary|   tax|  age_group|
+---+-------+---+------+------+-----------+
|  1|  Alice| 25| 45000|2250.0|      Young|
|  2|    Bob| 30| 55000|5500.0|Middle-aged|
|  3|Charlie| 35| 60000|6000.0|Middle-aged|
+---+-------+---+------+------+-----------+



In [16]:
# Import Spark's column-wise round under an alias so it does not replace
# Python's builtin round() for the rest of the notebook
from pyspark.sql.functions import round as spark_round

# Add a new 'net_salary' column: 'salary' minus 'tax', rounded to 2 decimals
df = df.withColumn("net_salary", spark_round(col("salary") - col("tax"), 2))

# Display the updated DataFrame
df.show()

+---+-------+---+------+------+-----------+----------+
| id|   name|age|salary|   tax|  age_group|net_salary|
+---+-------+---+------+------+-----------+----------+
|  1|  Alice| 25| 45000|2250.0|      Young|   42750.0|
|  2|    Bob| 30| 55000|5500.0|Middle-aged|   49500.0|
|  3|Charlie| 35| 60000|6000.0|Middle-aged|   54000.0|
+---+-------+---+------+------+-----------+----------+



In [17]:
from pyspark.sql.functions import concat_ws

# Join 'name' and 'age_group' with " - " into a new 'name_age_group' column
# (concat_ws accepts column names directly)
df = df.withColumn(
    "name_age_group",
    concat_ws(" - ", "name", "age_group"),
)

# Show the final DataFrame
df.show()

+---+-------+---+------+------+-----------+----------+--------------------+
| id|   name|age|salary|   tax|  age_group|net_salary|      name_age_group|
+---+-------+---+------+------+-----------+----------+--------------------+
|  1|  Alice| 25| 45000|2250.0|      Young|   42750.0|       Alice - Young|
|  2|    Bob| 30| 55000|5500.0|Middle-aged|   49500.0|   Bob - Middle-aged|
|  3|Charlie| 35| 60000|6000.0|Middle-aged|   54000.0|Charlie - Middle-...|
+---+-------+---+------+------+-----------+----------+--------------------+

