# WithColumns and withcolumnRenamed

In [0]:
# write with different modes
# show(n=28)

In [0]:
data = [
    (1, 'mahendra', 23000, 23), 
    (2, 'mahi', 34444, 45),
    (3, 'sam', 34299, 22)
]

In [0]:
df=spark.createDataFrame(data, schema=['id', 'name', 'salary', 'age'])
help(df.withColumn)

Help on method withColumn in module pyspark.sql.dataframe:

withColumn(colName: str, col: pyspark.sql.column.Column) -> 'DataFrame' method of pyspark.sql.dataframe.DataFrame instance
    Returns a new :class:`DataFrame` by adding a column or replacing the
    existing column that has the same name.
    
    The column expression must be an expression over this :class:`DataFrame`; attempting to add
    a column from some other :class:`DataFrame` will raise an error.
    
    .. versionadded:: 1.3.0
    
    .. versionchanged:: 3.4.0
        Support Spark Connect.
    
    Parameters
    ----------
    colName : str
        string, name of the new column.
    col : :class:`Column`
        a :class:`Column` expression for the new column.
    
    Returns
    -------
    :class:`DataFrame`
        DataFrame with new or replaced column.
    
    Notes
    -----
    This method introduces a projection internally. Therefore, calling it multiple
    times, for instance, via loops in order to a

In [0]:
df.show(n=10)

+---+--------+------+---+
| id|    name|salary|age|
+---+--------+------+---+
|  1|mahendra| 23000| 23|
|  2|    mahi| 34444| 45|
|  3|     sam| 34299| 22|
+---+--------+------+---+



In [0]:
from pyspark.sql.functions import *

In [0]:
import pyspark.sql.functions as F

print(dir(F))




In [0]:
df.columns

Out[12]: ['id', 'name', 'salary', 'age']

In [0]:
df.printSchema()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- age: long (nullable = true)



In [0]:
df1=df.withColumn(colName='salary', col=col('salary').cast('Integer'))
df1.show()

+---+--------+------+---+
| id|    name|salary|age|
+---+--------+------+---+
|  1|mahendra| 23000| 23|
|  2|    mahi| 34444| 45|
|  3|     sam| 34299| 22|
+---+--------+------+---+



In [0]:
df1.printSchema()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- salary: integer (nullable = true)
 |-- age: long (nullable = true)



In [0]:
df1=df1.withColumnRenamed('salary', 'salary_emp')

In [0]:
df1.show()

+---+--------+----------+---+
| id|    name|salary_emp|age|
+---+--------+----------+---+
|  1|mahendra|     23000| 23|
|  2|    mahi|     34444| 45|
|  3|     sam|     34299| 22|
+---+--------+----------+---+



In [0]:
df1=df1.withColumn('salary_emp', col('salary_emp')*1.2)

In [0]:
df1.show()

+---+--------+------------------+---+
| id|    name|        salary_emp|age|
+---+--------+------------------+---+
|  1|mahendra|           27600.0| 23|
|  2|    mahi|41332.799999999996| 45|
|  3|     sam|41158.799999999996| 22|
+---+--------+------------------+---+



In [0]:
df1.printSchema()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- salary_emp: double (nullable = true)
 |-- age: long (nullable = true)



In [0]:
from pyspark.sql.functions import lit
df1=df1.withColumn('location', lit('USA'))

In [0]:
df1.show()

+---+--------+------------------+---+--------+
| id|    name|        salary_emp|age|location|
+---+--------+------------------+---+--------+
|  1|mahendra|           27600.0| 23|     USA|
|  2|    mahi|41332.799999999996| 45|     USA|
|  3|     sam|41158.799999999996| 22|     USA|
+---+--------+------------------+---+--------+



In [0]:
from pyspark.sql.functions import when, lit

df1 = df1.withColumn(
    "location",
    when(df1["age"] > 30, lit("USA")).otherwise(lit("Unknown"))
)


In [0]:
df1.show()

+---+--------+------------------+---+--------+
| id|    name|        salary_emp|age|location|
+---+--------+------------------+---+--------+
|  1|mahendra|           27600.0| 23| Unknown|
|  2|    mahi|41332.799999999996| 45|     USA|
|  3|     sam|41158.799999999996| 22| Unknown|
+---+--------+------------------+---+--------+



In [0]:
df1.show()

+---+--------+------------------+---+--------+
| id|    name|        salary_emp|age|location|
+---+--------+------------------+---+--------+
|  1|mahendra|           27600.0| 23| Unknown|
|  2|    mahi|41332.799999999996| 45|     USA|
|  3|     sam|41158.799999999996| 22| Unknown|
+---+--------+------------------+---+--------+



In [0]:
df1=df1.withColumn('agecopied', col('age'))

In [0]:
df1.show()

+---+--------+------------------+---+--------+---------+
| id|    name|        salary_emp|age|location|agecopied|
+---+--------+------------------+---+--------+---------+
|  1|mahendra|           27600.0| 23| Unknown|       23|
|  2|    mahi|41332.799999999996| 45|     USA|       45|
|  3|     sam|41158.799999999996| 22| Unknown|       22|
+---+--------+------------------+---+--------+---------+



# Array type columns

In [0]:
data1 = [[1, 'mahendra', [3000, 3400]], [2, 'mahi', [3880, 8790]],
        [3, 'sam', [3998, 2388]]]

In [0]:
df=spark.createDataFrame(data1, schema=['id', 'name', 'salary'])

In [0]:
df.show()

+---+--------+------------+
| id|    name|      salary|
+---+--------+------------+
|  1|mahendra|[3000, 3400]|
|  2|    mahi|[3880, 8790]|
|  3|     sam|[3998, 2388]|
+---+--------+------------+



In [0]:
from pyspark.sql import functions as F

In [0]:
dir(F)

Out[38]: ['Any',
 'ArrayType',
 'Callable',
 'Column',
 'DataFrame',
 'DataType',
 'DeprecatedFuncUsageLogger',
 'Dict',
 'Iterable',
 'List',
 'Optional',
 'PandasUDFType',
 'PySparkTypeError',
 'PySparkValueError',
 'PythonEvalType',
 'SparkContext',
 'StringType',
 'StructType',
 'TYPE_CHECKING',
 'Tuple',
 'Union',
 'UserDefinedFunction',
 'ValuesView',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 '_create_column_from_literal',
 '_create_edge_udf',
 '_create_lambda',
 '_create_udf',
 '_get_jvm_function',
 '_get_lambda_parameters',
 '_invoke_binary_math_function',
 '_invoke_function',
 '_invoke_function_over_columns',
 '_invoke_function_over_seq_of_columns',
 '_invoke_higher_order_function',
 '_options_to_str',
 '_test',
 '_to_java_column',
 '_to_seq',
 '_unresolved_named_lambda_variable',
 'abs',
 'acos',
 'acosh',
 'add_months',
 'aggregate',
 'approxCountDistinct',
 'approx_count_distinct',
 'array',
 'array_ap

In [0]:
df.printSchema()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- salary: array (nullable = true)
 |    |-- element: long (containsNull = true)



In [0]:
df_exploded.show()

+---+--------+------+
| id|    name|salary|
+---+--------+------+
|  1|mahendra|  3000|
|  1|mahendra|  3400|
|  2|    mahi|  3880|
|  2|    mahi|  8790|
|  3|     sam|  3998|
|  3|     sam|  2388|
+---+--------+------+



In [0]:
df.show()

+---+--------+------------+
| id|    name|      salary|
+---+--------+------------+
|  1|mahendra|[3000, 3400]|
|  2|    mahi|[3880, 8790]|
|  3|     sam|[3998, 2388]|
+---+--------+------------+



In [0]:
df=df.withColumn('salary1', col('salary')[0])

In [0]:
df.display()

id,name,salary,salary1
1,mahendra,"List(3000, 3400)",3000
2,mahi,"List(3880, 8790)",3880
3,sam,"List(3998, 2388)",3998


In [0]:
df2=df.withColumn('salary2', col('salary')[1])

In [0]:
df2.show()

+---+--------+------------+-------+-------+
| id|    name|      salary|salary1|salary2|
+---+--------+------------+-------+-------+
|  1|mahendra|[3000, 3400]|   3000|   3400|
|  2|    mahi|[3880, 8790]|   3880|   8790|
|  3|     sam|[3998, 2388]|   3998|   2388|
+---+--------+------------+-------+-------+



In [0]:
from pyspark.sql.functions import col, array
df3=df2.withColumn('salaries', array(col('salary1'), col('salary2')))

In [0]:
df3.show()

+---+--------+------------+-------+-------+------------+
| id|    name|      salary|salary1|salary2|    salaries|
+---+--------+------------+-------+-------+------------+
|  1|mahendra|[3000, 3400]|   3000|   3400|[3000, 3400]|
|  2|    mahi|[3880, 8790]|   3880|   8790|[3880, 8790]|
|  3|     sam|[3998, 2388]|   3998|   2388|[3998, 2388]|
+---+--------+------------+-------+-------+------------+



In [0]:
df4 = df3.withColumn('salaryequalto3k', array_contains(col('salary'), 3000))

In [0]:
df4.show()

+---+--------+------------+-------+-------+------------+---------------+
| id|    name|      salary|salary1|salary2|    salaries|salaryequalto3k|
+---+--------+------------+-------+-------+------------+---------------+
|  1|mahendra|[3000, 3400]|   3000|   3400|[3000, 3400]|           true|
|  2|    mahi|[3880, 8790]|   3880|   8790|[3880, 8790]|          false|
|  3|     sam|[3998, 2388]|   3998|   2388|[3998, 2388]|          false|
+---+--------+------------+-------+-------+------------+---------------+



In [0]:
df.show()

+---+--------+------------+-------+
| id|    name|      salary|salary1|
+---+--------+------------+-------+
|  1|mahendra|[3000, 3400]|   3000|
|  2|    mahi|[3880, 8790]|   3880|
|  3|     sam|[3998, 2388]|   3998|
+---+--------+------------+-------+



In [0]:
from pyspark.sql.functions import explode, split
df_exploded = df.select("id", 'name', explode("salary").alias('salary'))

In [0]:
df_exploded.show()

+---+--------+------+
| id|    name|salary|
+---+--------+------+
|  1|mahendra|  3000|
|  1|mahendra|  3400|
|  2|    mahi|  3880|
|  2|    mahi|  8790|
|  3|     sam|  3998|
|  3|     sam|  2388|
+---+--------+------+



In [0]:
from pyspark.sql.types import *
schema = StructType([
    StructField("name", StringType(), True),
    StructField("salaries", ArrayType(IntegerType()), True)
])

In [0]:
data2 = [[1, 'mahi', 'data, dataengineer'], 
         [2, 'sam', 'IT, software']]

In [0]:
dfx = spark.createDataFrame(data2, schema=['id', 'name', 'dept_role'])

In [0]:
dfx.show()

+---+----+------------------+
| id|name|         dept_role|
+---+----+------------------+
|  1|mahi|data, dataengineer|
|  2| sam|      IT, software|
+---+----+------------------+



In [0]:
from pyspark.sql.functions import split
dfx=dfx.withColumn('dept_role_list', split(col('dept_role'), ','))

In [0]:
dfx.show(truncate=True)

+---+----+------------------+--------------------+
| id|name|         dept_role|      dept_role_list|
+---+----+------------------+--------------------+
|  1|mahi|data, dataengineer|[data,  dataengin...|
|  2| sam|      IT, software|     [IT,  software]|
+---+----+------------------+--------------------+



In [0]:
dfx.show()

+---+----+------------------+--------------------+
| id|name|         dept_role|      dept_role_list|
+---+----+------------------+--------------------+
|  1|mahi|data, dataengineer|[data,  dataengin...|
|  2| sam|      IT, software|     [IT,  software]|
+---+----+------------------+--------------------+



In [0]:
dfx.show(truncate=False)

+---+----+------------------+---------------------+
|id |name|dept_role         |dept_role_list       |
+---+----+------------------+---------------------+
|1  |mahi|data, dataengineer|[data,  dataengineer]|
|2  |sam |IT, software      |[IT,  software]      |
+---+----+------------------+---------------------+



In [0]:
dfx=dfx.withColumns({'dept': col('dept_role_list')[0],\
                    'role': col('dept_role_list')[1]})

In [0]:
dfx.show()

+---+----+------------------+--------------------+-------------+----+
| id|name|         dept_role|      dept_role_list|         role|dept|
+---+----+------------------+--------------------+-------------+----+
|  1|mahi|data, dataengineer|[data,  dataengin...| dataengineer|data|
|  2| sam|      IT, software|     [IT,  software]|     software|  IT|
+---+----+------------------+--------------------+-------------+----+



In [0]:
a =dfx.drop('dept_role')

In [0]:
a.show()

+---+----+--------------------+-------------+----+
| id|name|      dept_role_list|         role|dept|
+---+----+--------------------+-------------+----+
|  1|mahi|[data,  dataengin...| dataengineer|data|
|  2| sam|     [IT,  software]|     software|  IT|
+---+----+--------------------+-------------+----+



In [0]:
help(col)

Help on function col in module pyspark.sql.functions:

col(col: str) -> pyspark.sql.column.Column
    Returns a :class:`~pyspark.sql.Column` based on the given column name.
    
    .. versionadded:: 1.3.0
    
    .. versionchanged:: 3.4.0
        Support Spark Connect.
    
    Parameters
    ----------
    col : str
        the name for the column
    
    Returns
    -------
    :class:`~pyspark.sql.Column`
        the corresponding column instance.
    
    Examples
    --------
    >>> col('x')
    Column<'x'>
    >>> column('x')
    Column<'x'>



In [0]:
help(lit)

Help on function lit in module pyspark.sql.functions:

lit(col: Any) -> pyspark.sql.column.Column
    Creates a :class:`~pyspark.sql.Column` of literal value.
    
    .. versionadded:: 1.3.0
    
    .. versionchanged:: 3.4.0
        Support Spark Connect.
    
    Parameters
    ----------
    col : :class:`~pyspark.sql.Column` or Python primitive type.
        the value to make it as a PySpark literal. If a column is passed,
        it returns the column as is.
    
    Returns
    -------
    :class:`~pyspark.sql.Column`
        the literal instance.
    
    Examples
    --------
    >>> df = spark.range(1)
    >>> df.select(lit(5).alias('height'), df.id).show()
    +------+---+
    |height| id|
    +------+---+
    |     5|  0|
    +------+---+

