Column class:
1. It represents a single column in a DataFrame
2. It provides functions to manipulate column values and rows
3. It can be used with filter() transformations to filter DataFrame rows
4. It can be used with pyspark.sql.functions, which take Column objects and return a Column

In [3]:
# Create the SparkSession used by the cells below; getOrCreate() returns an
# already-running session if one exists instead of starting a new one.
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("learning").getOrCreate()

24/11/12 06:25:58 WARN Utils: Your hostname, padmanabhan-VirtualBox resolves to a loopback address: 127.0.1.1; using 10.0.2.15 instead (on interface enp0s3)
24/11/12 06:25:58 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/12 06:26:01 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
# Create a Column object directly: lit() wraps a Python literal in a Column.

from pyspark.sql.functions import lit
colObj = lit("India")
print(colObj)

Column<'India'>


In [6]:
# Multiple ways to access df columns

# df.column
# df["column"]
# df["`column`"]
# df.select(col("column"))

In [7]:
# Build nested sample data with the Row class: `name` is itself a Row holding
# firstName/lastName. (An explicit schema via StructType/StructField also works.)
from pyspark.sql import Row

first_person = Row(name=Row(firstName="Padmanabhan", lastName="s"), location="chennai", salary=1000)
second_person = Row(name=Row(firstName="karthik", lastName="k"), location="thanjavur", salary=5000)
data = [first_person, second_person]

df = spark.createDataFrame(data)
df.show()

                                                                                

+----------------+---------+------+
|            name| location|salary|
+----------------+---------+------+
|{Padmanabhan, s}|  chennai|  1000|
|    {karthik, k}|thanjavur|  5000|
+----------------+---------+------+



In [16]:
# Ways to select nested (struct) fields and whole sets of columns
from pyspark.sql.functions import col
df.select(df.name.firstName).show()  # attribute access; output column is labelled "name.firstName"
df.select(col("name.lastName")).show()  # dotted path through col(); output column is labelled "lastName"
df.select(df.columns).show()  # df.columns is the list of top-level column names -> all columns
df.select("*").show()  # "*" also selects every top-level column
df.select("name.*").show()  # "name.*" expands the struct into firstName and lastName columns

+--------------+
|name.firstName|
+--------------+
|   Padmanabhan|
|       karthik|
+--------------+

+--------+
|lastName|
+--------+
|       s|
|       k|
+--------+

+----------------+---------+------+
|            name| location|salary|
+----------------+---------+------+
|{Padmanabhan, s}|  chennai|  1000|
|    {karthik, k}|thanjavur|  5000|
+----------------+---------+------+

+----------------+---------+------+
|            name| location|salary|
+----------------+---------+------+
|{Padmanabhan, s}|  chennai|  1000|
|    {karthik, k}|thanjavur|  5000|
+----------------+---------+------+

+-----------+--------+
|  firstName|lastName|
+-----------+--------+
|Padmanabhan|       s|
|    karthik|       k|
+-----------+--------+



In [9]:

# Bracket access with a dotted path selects the nested field; the output column is labelled "firstName".
df.select(df["name.firstName"]).show()

+-----------+
|  firstName|
+-----------+
|Padmanabhan|
|    karthik|
+-----------+

+----------------+---------+------+------------+
|            name| location|salary|modifiedName|
+----------------+---------+------+------------+
|{Padmanabhan, s}|  chennai|  1000|        NULL|
|    {karthik, k}|thanjavur|  5000|        NULL|
+----------------+---------+------+------------+



In [10]:
# Add a derived column: join the nested first and last names with a comma.

from pyspark.sql.functions import lit,col,concat_ws
first_name = col("name.firstName")
last_name = col("name.lastName")
df.withColumn("modifiedName", concat_ws(",", first_name, last_name)).show()

                                                                                

+----------------+---------+------+-------------+
|            name| location|salary| modifiedName|
+----------------+---------+------+-------------+
|{Padmanabhan, s}|  chennai|  1000|Padmanabhan,s|
|    {karthik, k}|thanjavur|  5000|    karthik,k|
+----------------+---------+------+-------------+



In [26]:
# Arithmetic operations on DataFrame columns

# lit is needed for the constant column below; col is kept from the original cell.
from pyspark.sql.functions import col, lit

## "+" between Columns is numeric addition, not string concatenation as with
## native Python strings. Applied to these name strings it yields NULL (see the
## output); use concat or concat_ws to join strings.
df.select(df["name.firstName"]+df["name.lastName"]).show()  # output: NULL

## Question: create a new column for the expected salary with a default of 10k

df2 = df.withColumn("modified-salary",lit(10000))
df2.show()

## Now rename the column to expected-salary

df2 = df2.withColumnRenamed("modified-salary","expected-salary")
df2.show()

# Arithmetic on the salary and expected-salary columns

df2.select((df2["expected-salary"] - df2["salary"]).alias("salary-diff")).show()
# Supported arithmetic operators: +, -, *, /, %
# Supported comparison operators: <, >, ==

+---------------------------------------------------------+
|(name.firstName AS firstName + name.lastName AS lastName)|
+---------------------------------------------------------+
|                                                     NULL|
|                                                     NULL|
+---------------------------------------------------------+

+----------------+---------+------+---------------+
|            name| location|salary|modified-salary|
+----------------+---------+------+---------------+
|{Padmanabhan, s}|  chennai|  1000|          10000|
|    {karthik, k}|thanjavur|  5000|          10000|
+----------------+---------+------+---------------+

+----------------+---------+------+---------------+
|            name| location|salary|expected-salary|
+----------------+---------+------+---------------+
|{Padmanabhan, s}|  chennai|  1000|          10000|
|    {karthik, k}|thanjavur|  5000|          10000|
+----------------+---------+------+---------------+

+-----------+

### commonly used column functions


In [33]:
# alias() gives the selected column a new name; name() does the same thing,
# as the two identical outputs show.

df.select(df.salary.alias("current-salary")).show()

df.select(df.salary.name("current-salary")).show()

+--------------+
|current-salary|
+--------------+
|          1000|
|          5000|
+--------------+

+--------------+
|current-salary|
+--------------+
|          1000|
|          5000|
+--------------+



In [31]:
# asc() / desc(): sort ascending / descending by a column.
# The sort keys are taken from df2 itself, the DataFrame being sorted.
df2.sort(df2["salary"].asc()).show()
df2.sort(df2.salary.desc()).show()

                                                                                

+----------------+---------+------+---------------+
|            name| location|salary|expected-salary|
+----------------+---------+------+---------------+
|{Padmanabhan, s}|  chennai|  1000|          10000|
|    {karthik, k}|thanjavur|  5000|          10000|
+----------------+---------+------+---------------+

+----------------+---------+------+---------------+
|            name| location|salary|expected-salary|
+----------------+---------+------+---------------+
|    {karthik, k}|thanjavur|  5000|          10000|
|{Padmanabhan, s}|  chennai|  1000|          10000|
+----------------+---------+------+---------------+



In [None]:
# asc, desc: handling null values
