### Select() Function
It is a transformation function that returns a new DataFrame.

1. It returns only the selected columns
2. Can specify multiple columns
3. Can select specific columns and/or rename them

How does `withColumn()` differ?
1. Used to add or modify columns
2. Returns all columns
3. Can add only 1 column at a time

In [27]:
# Start (or reuse) the Spark session used by the rest of this notebook.

from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("learning")
    .getOrCreate()
)


In [2]:
# Sample rows, one tuple per person: (firstname, lastname, country, state)
data = [
    ("James", "Smith", "USA", "CA"),
    ("Michael", "Rose", "USA", "NY"),
    ("Robert", "Williams", "USA", "CA"),
    ("Maria", "Jones", "USA", "FL"),
]

# Column names, listed in the same order as the tuple fields above
columns = ["firstname", "lastname", "country", "state"]

# Build the DataFrame from the rows, using the list of names as the schema
df = spark.createDataFrame(data, schema=columns)

In [3]:
# Print the DataFrame as a text table; truncate=False asks show() not to shorten cell values
df.show(truncate=False)

                                                                                

+---------+--------+-------+-----+
|firstname|lastname|country|state|
+---------+--------+-------+-----+
|James    |Smith   |USA    |CA   |
|Michael  |Rose    |USA    |NY   |
|Robert   |Williams|USA    |CA   |
|Maria    |Jones   |USA    |FL   |
+---------+--------+-------+-----+



In [20]:
# Different ways to select one or more columns

from pyspark.sql.functions import col

# by column name (plain string)
df.select("firstname").show()
# by attribute access on the DataFrame (several columns at once)
df.select(df.firstname, df.lastname).show()
# by indexing the DataFrame with the column name
df.select(df["firstname"]).show()

# using the col() function
df.select(col("firstname")).show()

# using a regular expression; the pattern is wrapped in backticks.
# `^.*name$` selects every column whose name ends in "name".
# (The earlier pattern `^.*name*` applied the trailing `*` to the letter "e"
# only, so it would also have matched names such as "nam" or "fornameee".)
df.select(df.colRegex("`^.*name$`")).show()



+---------+
|firstname|
+---------+
|    James|
|  Michael|
|   Robert|
|    Maria|
+---------+

+---------+--------+
|firstname|lastname|
+---------+--------+
|    James|   Smith|
|  Michael|    Rose|
|   Robert|Williams|
|    Maria|   Jones|
+---------+--------+

+---------+
|firstname|
+---------+
|    James|
|  Michael|
|   Robert|
|    Maria|
+---------+

+---------+
|firstname|
+---------+
|    James|
|  Michael|
|   Robert|
|    Maria|
+---------+

+---------+--------+
|firstname|lastname|
+---------+--------+
|    James|   Smith|
|  Michael|    Rose|
|   Robert|Williams|
|    Maria|   Jones|
+---------+--------+



                                                                                

In [23]:
# Three equivalent ways to select every column

# 1) unpack the list of column names defined earlier
df.select(*columns).show()
# 2) use the "*" wildcard
df.select("*").show()
# 3) pass a list built from df.columns (loop variable deliberately not named
#    `col`, to avoid confusion with pyspark.sql.functions.col)
df.select([column_name for column_name in df.columns]).show()

+---------+--------+-------+-----+
|firstname|lastname|country|state|
+---------+--------+-------+-----+
|    James|   Smith|    USA|   CA|
|  Michael|    Rose|    USA|   NY|
|   Robert|Williams|    USA|   CA|
|    Maria|   Jones|    USA|   FL|
+---------+--------+-------+-----+



                                                                                

+---------+--------+-------+-----+
|firstname|lastname|country|state|
+---------+--------+-------+-----+
|    James|   Smith|    USA|   CA|
|  Michael|    Rose|    USA|   NY|
|   Robert|Williams|    USA|   CA|
|    Maria|   Jones|    USA|   FL|
+---------+--------+-------+-----+

+---------+--------+-------+-----+
|firstname|lastname|country|state|
+---------+--------+-------+-----+
|    James|   Smith|    USA|   CA|
|  Michael|    Rose|    USA|   NY|
|   Robert|Williams|    USA|   CA|
|    Maria|   Jones|    USA|   FL|
+---------+--------+-------+-----+



                                                                                

In [26]:
# Selecting columns by position: df.columns is a list of names, so it can be sliced
# Full list -> every column
df.select(df.columns).show()
# Slice from index 2 onward -> "country" and "state"
df.select(df.columns[2:]).show()

+---------+--------+-------+-----+
|firstname|lastname|country|state|
+---------+--------+-------+-----+
|    James|   Smith|    USA|   CA|
|  Michael|    Rose|    USA|   NY|
|   Robert|Williams|    USA|   CA|
|    Maria|   Jones|    USA|   FL|
+---------+--------+-------+-----+

+-------+-----+
|country|state|
+-------+-----+
|    USA|   CA|
|    USA|   NY|
|    USA|   CA|
|    USA|   FL|
+-------+-----+



In [32]:
# Build a DataFrame with a nested (struct) column, used to demonstrate
# selecting fields inside a struct.
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

# Each row: ((firstName, middleName, lastName), location, Salary)
comp_data = [
    (("padmanabhan", "", "s"), "chennai", 5000),
    (("Deva", "kumar", "k"), "trichy", 1000),
]

# "name" is a struct of three non-nullable string fields;
# "location" is a nullable string and "Salary" a non-nullable integer.
schema = StructType([
    StructField(
        "name",
        StructType([
            StructField("firstName", StringType(), nullable=False),
            StructField("middleName", StringType(), nullable=False),
            StructField("lastName", StringType(), nullable=False),
        ]),
    ),
    StructField("location", StringType(), nullable=True),
    StructField("Salary", IntegerType(), nullable=False),
])

# NOTE: this rebinds `df`, replacing the flat DataFrame used in the cells above.
df = spark.createDataFrame(data=comp_data, schema=schema)
df.show()

                                                                                

+------------------+--------+------+
|              name|location|Salary|
+------------------+--------+------+
|{padmanabhan, , s}| chennai|  5000|
|  {Deva, kumar, k}|  trichy|  1000|
+------------------+--------+------+



                                                                                

In [43]:
# Select a single field of the "name" struct using dot notation
df.select("name.firstName").show()
# Expand every field of the "name" struct into its own top-level column
df.select("name.*").show()

+-----------+
|  firstName|
+-----------+
|padmanabhan|
|       Deva|
+-----------+

+-----------+----------+--------+
|  firstName|middleName|lastName|
+-----------+----------+--------+
|padmanabhan|          |       s|
|       Deva|     kumar|       k|
+-----------+----------+--------+

