In [0]:
data = [(1, 'Mahesh', 2000), (2, 'Maheer', 4000)]
schema = ('Id', 'Name', 'Salary')

df = spark.createDataFrame(data, schema)
df.show()

+---+------+------+
| Id|  Name|Salary|
+---+------+------+
|  1|Mahesh|  2000|
|  2|Maheer|  4000|
+---+------+------+



In [0]:
from pyspark.sql.functions import upper

df.withColumn('Name', upper(df.Name)).show()

+---+------+------+
| Id|  Name|Salary|
+---+------+------+
|  1|MAHESH|  2000|
|  2|MAHEER|  4000|
+---+------+------+



In [0]:
def convertNameToUpper(df):
    return df.withColumn('Name', upper(df.Name))

In [0]:
df.transform(convertNameToUpper).show()

+---+------+------+
| Id|  Name|Salary|
+---+------+------+
|  1|MAHESH|  2000|
|  2|MAHEER|  4000|
+---+------+------+



In [0]:
def doubleTheSalary(df):
    return df.withColumn('Salary', df.Salary * 2)

In [0]:
df.transform(doubleTheSalary).show()

+---+------+------+
| Id|  Name|Salary|
+---+------+------+
|  1|Mahesh|  4000|
|  2|Maheer|  8000|
+---+------+------+



In [0]:
df1 = df.transform(doubleTheSalary).transform(convertNameToUpper)
df1.show()


+---+------+------+
| Id|  Name|Salary|
+---+------+------+
|  1|MAHESH|  4000|
|  2|MAHEER|  8000|
+---+------+------+



In [0]:
##Pyspark.sql.transform -- Applicable only on Array type columns

data = [(1, 'Maheer', ['Azure', 'dotnet']), (2, 'Wafa', ['aws', 'java'])]
schema = ('Id', 'Name', 'Skills')

df = spark.createDataFrame(data, schema)
df.show()

+---+------+---------------+
| Id|  Name|         Skills|
+---+------+---------------+
|  1|Maheer|[Azure, dotnet]|
|  2|  Wafa|    [aws, java]|
+---+------+---------------+



In [0]:
from pyspark.sql.functions import transform, upper

df.select('Id', 'Name', transform('Skills', lambda x: upper(x)).alias('skills')).show()

+---+------+---------------+
| Id|  Name|         skills|
+---+------+---------------+
|  1|Maheer|[AZURE, DOTNET]|
|  2|  Wafa|    [AWS, JAVA]|
+---+------+---------------+



In [0]:
def convertToUpper(x):
    return upper(x)

df.select('Id', 'Name', transform('Skills', convertToUpper).alias('Skills')).show()

+---+------+---------------+
| Id|  Name|         Skills|
+---+------+---------------+
|  1|Maheer|[AZURE, DOTNET]|
|  2|  Wafa|    [AWS, JAVA]|
+---+------+---------------+



In [0]:
df.select('Id',  transform('Name', convertToUpper).alias('Name')).show()

#expected as pyspark.sql.functions transform works only on Array type columns


[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
File [0;32m<command-2930750157668696>:1[0m
[0;32m----> 1[0m [43mdf[49m[38;5;241;43m.[39;49m[43mselect[49m[43m([49m[38;5;124;43m'[39;49m[38;5;124;43mId[39;49m[38;5;124;43m'[39;49m[43m,[49m[43m  [49m[43mtransform[49m[43m([49m[38;5;124;43m'[39;49m[38;5;124;43mName[39;49m[38;5;124;43m'[39;49m[43m,[49m[43m [49m[43mconvertToUpper[49m[43m)[49m[38;5;241;43m.[39;49m[43malias[49m[43m([49m[38;5;124;43m'[39;49m[38;5;124;43mName[39;49m[38;5;124;43m'[39;49m[43m)[49m[43m)[49m[38;5;241m.[39mshow()

File [0;32m/databricks/spark/python/pyspark/instrumentation_utils.py:48[0m, in [0;36m_wrap_function.<locals>.wrapper[0;34m(*args, **kwargs)[0m
[1;32m     46[0m start [38;5;241m=[39m time[38;5;241m.[39mperf_counter()
[1;32m     47[0m [38;5;28;01mtry[39;00m:
[0;32m-