### What is Transform?

![](Images/110/110 Transform.jpg)

### Advantages

![](Images/110/110 Advantages.jpg)

### Use Cases

![](Images/110/110 Use Cases.jpg)

### Create Sample Dataframe

In [0]:
# sample dataframe: one tuple per product -> (name, price, category)
data = [
    ('Product1', 100, 'Category1'),
    ('Product2', 200, 'Category2'),
    ('Product3', 300, 'Category1'),
]

# column names, applied positionally to the tuples above
# (the first name was previously misspelled as 'ProdcutName')
columns = ['ProductName', 'Price', 'Category']

# NOTE(review): `spark` is not defined in this file; presumably the
# notebook-provided SparkSession - confirm before running elsewhere.
df = spark.createDataFrame(data, columns)
df.display()

### Transform Function

In [0]:
# define transformation function

def transform_function(df):
    """Return `df` with a 'DiscountedPrice' column equal to 'Price' * 0.9."""
    discounted_price = df.Price * 0.9
    return df.withColumn('DiscountedPrice', discounted_price)

# apply the transform function through DataFrame.transform, which calls
# transform_function(df) and returns whatever that function returns
transformed_df = df.transform(transform_function)
transformed_df.display()

### Transform with Parameters

In [0]:
from pyspark.sql.functions import col

# sample dataframe: one tuple per product -> (name, price, category)
data = [
    ('Product1', 100, 'Category1'),
    ('Product2', 200, 'Category2'),
    ('Product3', 300, 'Category1'),
]

# column names, applied positionally to the tuples above
# (the first name was previously misspelled as 'ProdcutName')
columns = ['ProductName', 'Price', 'Category']

# NOTE(review): `spark` is not defined in this file; presumably the
# notebook-provided SparkSession - confirm before running elsewhere.
df = spark.createDataFrame(data, columns)
df.display()

# define transformation function

def transform_function(df, discounted_percentage):
    """Return `df` with a 'DiscountedPrice' column.

    `discounted_percentage` is the discount in percent (20 means 20% off),
    so the new column is 'Price' * (1 - discounted_percentage / 100).
    """
    remaining_fraction = 1 - discounted_percentage / 100
    return df.withColumn('DiscountedPrice', col('Price') * remaining_fraction)
  
# discount, in percent, handed to transform_function
percentage = 20

# three ways of passing the extra argument through DataFrame.transform
# NOTE(review): the first two rely on transform forwarding extra arguments
# to the function - confirm the runtime's PySpark version supports that.
transformed_df = df.transform(transform_function, percentage)  # positional
transformed_df2 = df.transform(transform_function, discounted_percentage=percentage)  # keyword
transformed_df3 = df.transform(lambda frame: transform_function(frame, percentage))  # closure

# show the three results in the order they were built
for _df_to_show in (transformed_df, transformed_df2, transformed_df3):
    _df_to_show.display()

### Comparison without Transform

In [0]:
from pyspark.sql.functions import col, to_date, year, month, length, abs, when, last_day, date_format, expr

# sample dataframe: (transaction id, date string, signed amount, category, description)
data = [
    ('TXN001', '2023-02-18', 250.75, 'Electronics', 'Bought a new phone'),
    ('TXN002', '2024-02-25', -50.50, 'Groceries', 'Refunded groceries'),
    ('TXN003', '2019-03-01', 125.00, 'Clothing', 'Purchased new Jacket'),
    ('TXN004', '2024-11-28', -10.00, 'Books', 'Refunded Book Purchase'),
]

# schema definition (one "name type" pair per tuple position)
schema = (
    'TransactionID String, TransactionDate String, Amount Double, '
    'Category String, Description String'
)

# create dataframe and show the raw input
df = spark.createDataFrame(data, schema)
df.display()

# apply every derivation inline as one withColumn chain; order matters because
# later steps read columns produced by earlier ones
df = (
    df
    # 1. convert to date (replaces the string column of the same name)
    .withColumn('TransactionDate', to_date(col('TransactionDate'), 'yyyy-MM-dd'))
    # 2. extract year
    .withColumn('Year', year(col('TransactionDate')))
    # 3. extract month
    .withColumn('Month', month(col('TransactionDate')))
    # 4. length of the description
    .withColumn('Description_Length', length(col('Description')))
    # 5. absolute value of amount
    .withColumn('Amount_Abs', abs(col('Amount')))
    # 6. indicate if refund (negative amount)
    .withColumn('Is_Refund', when(col('Amount') < 0, True).otherwise(False))
    # 7. last day of the month
    .withColumn('Last_Day_OfMonth', last_day(col('TransactionDate')))
    # 8. format date as year-month
    .withColumn('Formatted_Date', date_format(col('TransactionDate'), 'yyyy-MM'))
    # 9. categorize transaction size (reads Amount_Abs from step 5)
    .withColumn('Transaction_Size', when(col('Amount_Abs') > 100, 'Large').otherwise('small'))
    # 10. dynamic calculation based on category
    .withColumn(
        'Dynamic_Calculation',
        expr(
                        """
                            CASE
                                WHEN Category = 'Electronics' THEN Amount * 1.10
                                WHEN Category = 'Groceries' THEN Amount * 0.90
                                ELSE Amount
                            END
                        """
        ),
    )
)

# show the result
df.display()


### Comparison with Transform

In [0]:
from pyspark.sql.functions import col, to_date, year, month, length, abs, when, last_day, date_format, expr

# sample dataframe: (transaction id, date string, signed amount, category, description)
data = [
    ('TXN001', '2023-02-18', 250.75, 'Electronics', 'Bought a new phone'),
    ('TXN002', '2024-02-25', -50.50, 'Groceries', 'Refunded groceries'),
    ('TXN003', '2019-03-01', 125.00, 'Clothing', 'Purchased new Jacket'),
    ('TXN004', '2024-11-28', -10.00, 'Books', 'Refunded Book Purchase'),
]

# schema definition (one "name type" pair per tuple position)
schema = (
    'TransactionID String, TransactionDate String, Amount Double, '
    'Category String, Description String'
)

# create dataframe and show the raw input
df = spark.createDataFrame(data, schema)
df.display()

# transform function
def convert_to_date(df):
    """Replace the 'TransactionDate' column with its value parsed as a date (pattern yyyy-MM-dd)."""
    parsed_date = to_date(col('TransactionDate'), 'yyyy-MM-dd')
    return df.withColumn('TransactionDate', parsed_date)

def extract_year(df):
    """Add a 'Year' column holding the year of 'TransactionDate'."""
    txn_year = year(col('TransactionDate'))
    return df.withColumn('Year', txn_year)

def extract_month(df):
    """Add a 'Month' column holding the month of 'TransactionDate'."""
    txn_month = month(col('TransactionDate'))
    return df.withColumn('Month', txn_month)

def length_of_description(df):
    """Add a 'Description_Length' column holding the length of 'Description'.

    The column is named 'Description_Length' (with underscore) so this
    function yields the same schema as the inline withColumn pipeline in the
    "Comparison without Transform" cell; it was previously 'DescriptionLength'.
    """
    return df.withColumn('Description_Length', length(col('Description')))

def absolute_value_of_amount(df):
    """Add an 'Amount_Abs' column holding the absolute value of 'Amount'."""
    # `abs` here is whatever the cell's import bound it to
    # (pyspark.sql.functions.abs in this notebook), not the Python builtin.
    unsigned_amount = abs(col('Amount'))
    return df.withColumn('Amount_Abs', unsigned_amount)

def indicate_if_refund(df):
    """Add an 'Is_Refund' column: True where 'Amount' is below zero, False otherwise."""
    refund_flag = when(col('Amount') < 0, True).otherwise(False)
    return df.withColumn('Is_Refund', refund_flag)

def last_day_of_month(df):
    """Add a 'Last_Day_OfMonth' column computed with last_day() from 'TransactionDate'."""
    month_end = last_day(col('TransactionDate'))
    return df.withColumn('Last_Day_OfMonth', month_end)

def format_date(df):
    """Add a 'Formatted_Date' column: 'TransactionDate' rendered with the pattern 'yyyy-MM'."""
    year_month = date_format(col('TransactionDate'), 'yyyy-MM')
    return df.withColumn('Formatted_Date', year_month)

def category_transaction_size(df):
    """Add a 'Transaction_Size' column: 'Large' where 'Amount_Abs' > 100, else 'small'.

    Reads the 'Amount_Abs' column, so that column must already exist on `df`.
    """
    size_label = when(col('Amount_Abs') > 100, 'Large').otherwise('small')
    return df.withColumn('Transaction_Size', size_label)

def dynamic_calculation(df):
    """Add a 'Dynamic_Calculation' column derived from 'Amount' per 'Category'.

    Electronics amounts are multiplied by 1.10, Groceries amounts by 0.90,
    every other category keeps 'Amount' unchanged.
    """
    adjusted_amount = expr(
                        """
                            CASE
                                WHEN Category = 'Electronics' THEN Amount * 1.10
                                WHEN Category = 'Groceries' THEN Amount * 0.90
                                ELSE Amount
                            END
                        """
    )
    return df.withColumn('Dynamic_Calculation', adjusted_amount)

# apply transformation: one named step per derived column. Order matters:
# convert_to_date runs before the steps that read the date, and
# absolute_value_of_amount runs before category_transaction_size.
df = (
    df
    .transform(convert_to_date)
    .transform(extract_year)
    .transform(extract_month)
    .transform(length_of_description)
    .transform(absolute_value_of_amount)
    .transform(indicate_if_refund)
    .transform(last_day_of_month)
    .transform(format_date)
    .transform(category_transaction_size)
    .transform(dynamic_calculation)
)

# show the result
df.display()

### Complex Example with Parameters

In [0]:
from pyspark.sql.functions import col, trim, log, when, concat_ws, lit

# sample dataframe: (name, age)
data = [('Alice', 34), ('Bob', 28), ('Charlie', 42), ('Dave', 50), ('Eve', 22)]

df = spark.createDataFrame(data, ['name', 'age'])

# parameters
age_threshold = 40  # ages strictly below this are labelled 'Young'
log_base = 10       # base of the logarithm computed by log_transform
# misspelled legacy name, kept so any other cell that still reads it keeps working
age_thershold = age_threshold

# define transformation functions with parameters

def clean_data(df):
    """Replace the 'name' column with its trim()-med value."""
    return df.withColumn('name', trim(col('name')))


def categorize_age(df, age_threshold):
    """Add 'age_category': 'Young' where 'age' < age_threshold, otherwise 'Mature'."""
    return df.withColumn('age_category', when(col('age') < lit(age_threshold), 'Young').otherwise('Mature'))


def log_transform(df, log_base):
    """Add 'LogAge': the logarithm of 'age' in base `log_base`.

    Computed with the change-of-base identity log(age) / log(log_base).
    """
    return df.withColumn('LogAge', log(col('age')) / log(lit(log_base)))


def create_name_age(df):
    """Add 'name_age': 'name' and 'age' joined with an underscore."""
    return df.withColumn('name_age', concat_ws('_', col('name'), col('age')))


# apply transformation using transform function with parameters.
# Each step is chained on the outer pipeline; previously the closing
# parenthesis of the categorize_age lambda was misplaced, which nested the
# last two steps inside that lambda's body.
df_transformed = (
    df
    .transform(clean_data)  # apply data cleaning
    .transform(lambda df: categorize_age(df, age_threshold))  # apply conditional transformation with threshold
    .transform(lambda df: log_transform(df, log_base))  # apply log transformation with base
    .transform(create_name_age)  # apply name-age transformation
)

df_transformed.display()

### Data Cleansing Use Case

In [0]:
from pyspark.sql.functions import col, lit, when, avg, sum as _sum

# sample dataframe: (name, age, date, salary); contains None values and one
# exact duplicate row ('Alice', 34, '2023-06-01', 3000.0)
data =[
    ('Alice', 34, '2023-06-01', 3000.0),
    ('Bob', 45, '2023-06-02', None),
    ('Cathy', None, '2023-06-03', 2500.0),
    ('Alice', 34, '2023-06-01', 3000.0),
    (None, 45, None, 4000.0),
]

df = spark.createDataFrame(data, ['name', 'age', 'date', 'salary'])

# parameters consumed by the transformation functions defined below

# replacement values passed to handle_null, one per column
name_fill = 'Unknown'
age_fill = 0
date_fill = '1900-01-01'
salary_fill = 0.0
# inclusive lower bounds passed to filter_rows
age_threshold = 30
salary_threshold = 2000
# multiplier passed to add_bonus (bonus = salary * 0.10)
bonus_percentage = 0.10

# function to fill null values
def handle_null(df, name_fill, age_fill, date_fill, salary_fill):
    """Return `df.fillna(...)` with one replacement value per column.

    The mapping sent to fillna is
    {'name': name_fill, 'age': age_fill, 'date': date_fill, 'salary': salary_fill}.
    """
    replacements = dict(
        name=name_fill,
        age=age_fill,
        date=date_fill,
        salary=salary_fill,
    )
    return df.fillna(replacements)

# function to remove duplicates
def remove_duplicates(df):
    """Return the result of `df.dropDuplicates()` (called without a column subset)."""
    deduplicated = df.dropDuplicates()
    return deduplicated

# function to standardize data types
def standardize_data_types(df):
    """Cast 'age' to integer, 'date' to date and 'salary' to double, in that order."""
    target_types = {'age': 'integer', 'date': 'date', 'salary': 'double'}
    for column_name, type_name in target_types.items():
        df = df.withColumn(column_name, col(column_name).cast(type_name))
    return df

# function to filter rows based on conditions
def filter_rows(df, age_threshold, salary_threshold):
    """Keep rows where 'age' >= age_threshold and 'salary' >= salary_threshold."""
    old_enough = col('age') >= lit(age_threshold)
    paid_enough = col('salary') >= lit(salary_threshold)
    return df.filter(old_enough & paid_enough)

# function to add new calculated column 'bonus'
def add_bonus(df, bonus_percentage):
    """Add a 'bonus' column equal to 'salary' * bonus_percentage."""
    bonus_amount = col('salary') * lit(bonus_percentage)
    return df.withColumn('bonus', bonus_amount)

# function to perform group by and aggregate results
def group_and_aggregate(df):
    """Group by 'name' and return avg_age, total_salary and total_bonus per group.

    Reads the 'age', 'salary' and 'bonus' columns, so all three must exist on `df`.
    """
    per_name = df.groupBy('name')
    average_age = avg('age').alias('avg_age')
    salary_total = _sum('salary').alias('total_salary')
    bonus_total = _sum('bonus').alias('total_bonus')
    return per_name.agg(average_age, salary_total, bonus_total)

# apply transformation using transform function with parameters;
# steps that need extra arguments are wrapped in a lambda
df_transformed = (
    df
    # apply null handling
    .transform(lambda frame: handle_null(frame, name_fill, age_fill, date_fill, salary_fill))
    # apply duplicate removal
    .transform(remove_duplicates)
    # apply data type standardization
    .transform(standardize_data_types)
    # apply filtering with threshold
    .transform(lambda frame: filter_rows(frame, age_threshold, salary_threshold))
    # apply bonus calculation (adds the 'bonus' column the aggregation reads)
    .transform(lambda frame: add_bonus(frame, bonus_percentage))
    # apply group by and aggregate
    .transform(group_and_aggregate)
)

df_transformed.display()