In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [2]:
# Create the notebook's Spark session, or pick up one that is already running.
spark = (
    SparkSession.builder
    .appName("sparkdev-tutorial")
    .getOrCreate()
)

23/03/19 15:31:31 WARN Utils: Your hostname, Pavans-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.29.143 instead (on interface en0)
23/03/19 15:31:31 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/03/19 15:31:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [102]:
# Schema of the employee DataFrame; every column is declared non-nullable.
schema = StructType([
    StructField("FirstName", StringType(), False),
    StructField("LastName", StringType(), False),
    StructField("Age", IntegerType(), False),
    StructField("Place", StringType(), False),
    StructField("Salary", LongType(), False),
    StructField("Department", StringType(), False),
    StructField("Technologies", ArrayType(StringType()), False),
])

# One Row per employee, with values listed in the same order as the schema fields.
rows = [
    Row("Pavan", "Mantha", 36, "Hyderabad", 273567, "SPS",
        ["java", "spring boot", "data science", "react", "node", "Terraform"]),
    Row("Arun", "Boppudi", 36, "Guntur", 303567, "Aero",
        ["java", "spring boot", "cloud", "react", "node", "druid", "kafka"]),
    Row("Ravi", "Vadlamani", 26, "Visakapatnam", 213567, "Aero",
        ["express", "data structures", "react"]),
    Row("Mahender", "M", 21, "Hyderabad", 153567, "Aero",
        ["java", "spring boot", "express", "react", "node"]),
    Row("Manoj", "Manoj", 21, "Guntur", 183567, "Aero",
        ["express", "react"]),
    Row("Manoj", "Velecheti", 21, "Visakapatnam", 223567, "Aero",
        ["java", "spring boot", "express", "react"]),
    Row("Phani", "Vadlmani", 21, "Anakapalli", 283467, "AppDev",
        ["express", "react", "Docker", "AWS"]),
    Row("Deepak", "Mantha", 33, "Chennai", 303467, "DAE",
        ["C", "C++", "Python", "Physics", "Mathematics"]),
]

# Turn the Python list into an RDD, then build the DataFrame from it using the
# explicit schema (with schema verification switched on).
parallel_rows = spark.sparkContext.parallelize(rows)
df = spark.createDataFrame(data=parallel_rows, schema=schema, verifySchema=True)

## Use of PySpark Select() function

In [4]:
# "*" selects every column; truncate=False prints each value in full.
full_projection = df.select("*")
full_projection.show(truncate=False)

                                                                                

+---------+---------+---+------------+------+----------+---------------------------------------------------------+
|FirstName|LastName |Age|Place       |Salary|Department|Technologies                                             |
+---------+---------+---+------------+------+----------+---------------------------------------------------------+
|Pavan    |Mantha   |36 |Hyderabad   |273567|SPS       |[java, spring boot, data science, react, node, Terraform]|
|Arun     |Boppudi  |36 |Guntur      |303567|Aero      |[java, spring boot, cloud, react, node, druid, kafka]    |
|Ravi     |Vadlamani|26 |Visakapatnam|213567|Aero      |[express, data structures, react]                        |
|Mahender |M        |21 |Hyderabad   |153567|Aero      |[java, spring boot, express, react, node]                |
|Manoj    |Manoj    |21 |Guntur      |183567|Aero      |[express, react]                                         |
|Manoj    |Velecheti|21 |Visakapatnam|223567|Aero      |[java, spring boot, expr

In [5]:
# Project three columns, referring to them by name.
df.select("FirstName", "Salary", "Technologies").show(truncate=False)

+---------+------+---------------------------------------------------------+
|FirstName|Salary|Technologies                                             |
+---------+------+---------------------------------------------------------+
|Pavan    |273567|[java, spring boot, data science, react, node, Terraform]|
|Arun     |303567|[java, spring boot, cloud, react, node, druid, kafka]    |
|Ravi     |213567|[express, data structures, react]                        |
|Mahender |153567|[java, spring boot, express, react, node]                |
|Manoj    |183567|[express, react]                                         |
|Manoj    |223567|[java, spring boot, express, react]                      |
|Phani    |283467|[express, react, Docker, AWS]                            |
|Deepak   |303467|[C, C++, Python, Physics, Mathematics]                   |
+---------+------+---------------------------------------------------------+



In [6]:
# Each record is ((firstname, middlename, lastname), state, gender).
# None marks a missing name part; one record uses an empty string instead.
data = [
    (("Pavan", "Kumar", "Mantha"), "TS", "M"),
    (("Divyasree", None, "Gottipati"), "TS", "F"),
    (("Ramarao", None, "Dandamudi"), "AP", "M"),
    (("Snigdha", "", "Kantamaneni"), "TS", "F"),
    (("Mahender", None, None), "TS", "M"),
    (("Ramu", None, "Nagisetty"), "AP", "M"),
    (("Dhawan", None, "Rachakonda"), "TS", "M"),
]

# "name" is a nested struct column; all fields are nullable.
schema = StructType([
    StructField(
        name='name',
        dataType=StructType([
            StructField(name='firstname', dataType=StringType(), nullable=True),
            StructField(name='middlename', dataType=StringType(), nullable=True),
            StructField(name='lastname', dataType=StringType(), nullable=True),
        ]),
    ),
    StructField(name='state', dataType=StringType(), nullable=True),
    StructField(name='gender', dataType=StringType(), nullable=True),
])

df2 = spark.createDataFrame(data, schema)

In [7]:
# Print the (nested) schema first, then the rows without truncating the struct column.
df2.printSchema()
df2.show(truncate=False)

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- state: string (nullable = true)
 |-- gender: string (nullable = true)

+----------------------------+-----+------+
|name                        |state|gender|
+----------------------------+-----+------+
|{Pavan, Kumar, Mantha}      |TS   |M     |
|{Divyasree, null, Gottipati}|TS   |F     |
|{Ramarao, null, Dandamudi}  |AP   |M     |
|{Snigdha, , Kantamaneni}    |TS   |F     |
|{Mahender, null, null}      |TS   |M     |
|{Ramu, null, Nagisetty}     |AP   |M     |
|{Dhawan, null, Rachakonda}  |TS   |M     |
+----------------------------+-----+------+



In [8]:
# Selecting the struct column by name returns the whole nested value.
df2.select("name").show(truncate=False)

+----------------------------+
|name                        |
+----------------------------+
|{Pavan, Kumar, Mantha}      |
|{Divyasree, null, Gottipati}|
|{Ramarao, null, Dandamudi}  |
|{Snigdha, , Kantamaneni}    |
|{Mahender, null, null}      |
|{Ramu, null, Nagisetty}     |
|{Dhawan, null, Rachakonda}  |
+----------------------------+



In [9]:
# Dot notation reaches into the struct and returns only the nested lastname field.
df2.select(col("name.lastname")).show(truncate=False)

+-----------+
|lastname   |
+-----------+
|Mantha     |
|Gottipati  |
|Dandamudi  |
|Kantamaneni|
|null       |
|Nagisetty  |
|Rachakonda |
+-----------+



## Use of PySpark Collect() function

In [10]:
# collect() materialises the DataFrame as a Python list of Row objects,
# which is then printed as-is.
dfCollected = df.collect()
print(dfCollected)

[Row(FirstName='Pavan', LastName='Mantha', Age=36, Place='Hyderabad', Salary=273567, Department='SPS', Technologies=['java', 'spring boot', 'data science', 'react', 'node', 'Terraform']), Row(FirstName='Arun', LastName='Boppudi', Age=36, Place='Guntur', Salary=303567, Department='Aero', Technologies=['java', 'spring boot', 'cloud', 'react', 'node', 'druid', 'kafka']), Row(FirstName='Ravi', LastName='Vadlamani', Age=26, Place='Visakapatnam', Salary=213567, Department='Aero', Technologies=['express', 'data structures', 'react']), Row(FirstName='Mahender', LastName='M', Age=21, Place='Hyderabad', Salary=153567, Department='Aero', Technologies=['java', 'spring boot', 'express', 'react', 'node']), Row(FirstName='Manoj', LastName='Manoj', Age=21, Place='Guntur', Salary=183567, Department='Aero', Technologies=['express', 'react']), Row(FirstName='Manoj', LastName='Velecheti', Age=21, Place='Visakapatnam', Salary=223567, Department='Aero', Technologies=['java', 'spring boot', 'express', 'react

In [11]:
# Fields of a collected Row can be read as attributes as well as by key.
for employee in dfCollected:
    print(employee.FirstName + " => " + employee.Place)

Pavan => Hyderabad
Arun => Guntur
Ravi => Visakapatnam
Mahender => Hyderabad
Manoj => Guntur
Manoj => Visakapatnam
Phani => Anakapalli
Deepak => Chennai


#### Note: Select() is a transformation, while Collect() is an action. Select() returns a new DataFrame containing only the selected columns, whereas Collect() returns all the rows of the DataFrame. Most of the time we should avoid using Collect().

## use of PySpark withColumn() function

In [12]:
# Add a column holding twice the salary. The source column is referenced by
# its exact name ("Salary") so the lookup does not rely on Spark's
# case-insensitive column resolution.
df = df.withColumn("modifiedSalary", col("Salary") * 2)
df.show(truncate=True)

+---------+---------+---+------------+------+----------+--------------------+--------------+
|FirstName| LastName|Age|       Place|Salary|Department|        Technologies|modifiedSalary|
+---------+---------+---+------------+------+----------+--------------------+--------------+
|    Pavan|   Mantha| 36|   Hyderabad|273567|       SPS|[java, spring boo...|        547134|
|     Arun|  Boppudi| 36|      Guntur|303567|      Aero|[java, spring boo...|        607134|
|     Ravi|Vadlamani| 26|Visakapatnam|213567|      Aero|[express, data st...|        427134|
| Mahender|        M| 21|   Hyderabad|153567|      Aero|[java, spring boo...|        307134|
|    Manoj|    Manoj| 21|      Guntur|183567|      Aero|    [express, react]|        367134|
|    Manoj|Velecheti| 21|Visakapatnam|223567|      Aero|[java, spring boo...|        447134|
|    Phani| Vadlmani| 21|  Anakapalli|283467|    AppDev|[express, react, ...|        566934|
|   Deepak|   Mantha| 33|     Chennai|303467|       DAE|[C, C++, Pytho

In [13]:
# lit() wraps a Python constant as a column, so every row gets the same "AP" value.
state_default = lit("AP")
df = df.withColumn("State", state_default)
df.show()

+---------+---------+---+------------+------+----------+--------------------+--------------+-----+
|FirstName| LastName|Age|       Place|Salary|Department|        Technologies|modifiedSalary|State|
+---------+---------+---+------------+------+----------+--------------------+--------------+-----+
|    Pavan|   Mantha| 36|   Hyderabad|273567|       SPS|[java, spring boo...|        547134|   AP|
|     Arun|  Boppudi| 36|      Guntur|303567|      Aero|[java, spring boo...|        607134|   AP|
|     Ravi|Vadlamani| 26|Visakapatnam|213567|      Aero|[express, data st...|        427134|   AP|
| Mahender|        M| 21|   Hyderabad|153567|      Aero|[java, spring boo...|        307134|   AP|
|    Manoj|    Manoj| 21|      Guntur|183567|      Aero|    [express, react]|        367134|   AP|
|    Manoj|Velecheti| 21|Visakapatnam|223567|      Aero|[java, spring boo...|        447134|   AP|
|    Phani| Vadlmani| 21|  Anakapalli|283467|    AppDev|[express, react, ...|        566934|   AP|
|   Deepak

## use of when() function in pyspark

In [14]:
# Derive the state from the city. Chained when() clauses are checked in order
# and otherwise() supplies the fallback, which avoids nesting a second when()
# inside the first one's otherwise().
state_from_place = (
    when(col("Place") == "Chennai", "TN")
    .when(col("Place") == "Hyderabad", "TS")
    .otherwise("AP")
)
df = df.withColumn("State", state_from_place)
df.show()

+---------+---------+---+------------+------+----------+--------------------+--------------+-----+
|FirstName| LastName|Age|       Place|Salary|Department|        Technologies|modifiedSalary|State|
+---------+---------+---+------------+------+----------+--------------------+--------------+-----+
|    Pavan|   Mantha| 36|   Hyderabad|273567|       SPS|[java, spring boo...|        547134|   TS|
|     Arun|  Boppudi| 36|      Guntur|303567|      Aero|[java, spring boo...|        607134|   AP|
|     Ravi|Vadlamani| 26|Visakapatnam|213567|      Aero|[express, data st...|        427134|   AP|
| Mahender|        M| 21|   Hyderabad|153567|      Aero|[java, spring boo...|        307134|   TS|
|    Manoj|    Manoj| 21|      Guntur|183567|      Aero|    [express, react]|        367134|   AP|
|    Manoj|Velecheti| 21|Visakapatnam|223567|      Aero|[java, spring boo...|        447134|   AP|
|    Phani| Vadlmani| 21|  Anakapalli|283467|    AppDev|[express, react, ...|        566934|   AP|
|   Deepak

## use of where() & filter() function in pyspark

In [15]:
# Keep only the rows whose State is "AP"; show() truncates long values by default.
df.filter(col("State") == "AP").show()

+---------+---------+---+------------+------+----------+--------------------+--------------+-----+
|FirstName| LastName|Age|       Place|Salary|Department|        Technologies|modifiedSalary|State|
+---------+---------+---+------------+------+----------+--------------------+--------------+-----+
|     Arun|  Boppudi| 36|      Guntur|303567|      Aero|[java, spring boo...|        607134|   AP|
|     Ravi|Vadlamani| 26|Visakapatnam|213567|      Aero|[express, data st...|        427134|   AP|
|    Manoj|    Manoj| 21|      Guntur|183567|      Aero|    [express, react]|        367134|   AP|
|    Manoj|Velecheti| 21|Visakapatnam|223567|      Aero|[java, spring boo...|        447134|   AP|
|    Phani| Vadlmani| 21|  Anakapalli|283467|    AppDev|[express, react, ...|        566934|   AP|
+---------+---------+---+------------+------+----------+--------------------+--------------+-----+



In [16]:
# Using equals condition
df2.filter(df2.gender == "F").show(truncate=False)

# Using not-equals condition
df2.filter(df2.gender != "F").show(truncate=False)

+----------------------------+-----+------+
|name                        |state|gender|
+----------------------------+-----+------+
|{Divyasree, null, Gottipati}|TS   |F     |
|{Snigdha, , Kantamaneni}    |TS   |F     |
+----------------------------+-----+------+

+--------------------------+-----+------+
|name                      |state|gender|
+--------------------------+-----+------+
|{Pavan, Kumar, Mantha}    |TS   |M     |
|{Ramarao, null, Dandamudi}|AP   |M     |
|{Mahender, null, null}    |TS   |M     |
|{Ramu, null, Nagisetty}   |AP   |M     |
|{Dhawan, null, Rachakonda}|TS   |M     |
+--------------------------+-----+------+



In [17]:
# Keep the rows whose Technologies array contains the value "druid".
uses_druid = array_contains(col("Technologies"), "druid")
df.filter(uses_druid).show(truncate=False)

+---------+--------+---+------+------+----------+-----------------------------------------------------+--------------+-----+
|FirstName|LastName|Age|Place |Salary|Department|Technologies                                         |modifiedSalary|State|
+---------+--------+---+------+------+----------+-----------------------------------------------------+--------------+-----+
|Arun     |Boppudi |36 |Guntur|303567|Aero      |[java, spring boot, cloud, react, node, druid, kafka]|607134        |AP   |
+---------+--------+---+------+------+----------+-----------------------------------------------------+--------------+-----+



In [18]:
# where() takes the same kind of column condition as filter().
df.where(df.Age > 30).show()

+---------+--------+---+---------+------+----------+--------------------+--------------+-----+
|FirstName|LastName|Age|    Place|Salary|Department|        Technologies|modifiedSalary|State|
+---------+--------+---+---------+------+----------+--------------------+--------------+-----+
|    Pavan|  Mantha| 36|Hyderabad|273567|       SPS|[java, spring boo...|        547134|   TS|
|     Arun| Boppudi| 36|   Guntur|303567|      Aero|[java, spring boo...|        607134|   AP|
|   Deepak|  Mantha| 33|  Chennai|303467|       DAE|[C, C++, Python, ...|        606934|   TN|
+---------+--------+---+---------+------+----------+--------------------+--------------+-----+



#### Note: where() and filter() are interchangeable; both are used to filter the rows of a dataset by a condition.

## use of orderBy() and sort() in pyspark

In [19]:
# Sort by Age first, then by Department within equal ages.
df.sort(col("Age"), col("Department")).show()

+---------+---------+---+------------+------+----------+--------------------+--------------+-----+
|FirstName| LastName|Age|       Place|Salary|Department|        Technologies|modifiedSalary|State|
+---------+---------+---+------------+------+----------+--------------------+--------------+-----+
|    Manoj|Velecheti| 21|Visakapatnam|223567|      Aero|[java, spring boo...|        447134|   AP|
| Mahender|        M| 21|   Hyderabad|153567|      Aero|[java, spring boo...|        307134|   TS|
|    Manoj|    Manoj| 21|      Guntur|183567|      Aero|    [express, react]|        367134|   AP|
|    Phani| Vadlmani| 21|  Anakapalli|283467|    AppDev|[express, react, ...|        566934|   AP|
|     Ravi|Vadlamani| 26|Visakapatnam|213567|      Aero|[express, data st...|        427134|   AP|
|   Deepak|   Mantha| 33|     Chennai|303467|       DAE|[C, C++, Python, ...|        606934|   TN|
|     Arun|  Boppudi| 36|      Guntur|303567|      Aero|[java, spring boo...|        607134|   AP|
|    Pavan

In [104]:
# Ascending sort on Salary, requested through the `ascending` keyword.
df.sort("Salary", ascending=True).show(truncate=False)

+---------+---------+---+------------+------+----------+---------------------------------------------------------+
|FirstName|LastName |Age|Place       |Salary|Department|Technologies                                             |
+---------+---------+---+------------+------+----------+---------------------------------------------------------+
|Mahender |M        |21 |Hyderabad   |153567|Aero      |[java, spring boot, express, react, node]                |
|Manoj    |Manoj    |21 |Guntur      |183567|Aero      |[express, react]                                         |
|Ravi     |Vadlamani|26 |Visakapatnam|213567|Aero      |[express, data structures, react]                        |
|Manoj    |Velecheti|21 |Visakapatnam|223567|Aero      |[java, spring boot, express, react]                      |
|Pavan    |Mantha   |36 |Hyderabad   |273567|SPS       |[java, spring boot, data science, react, node, Terraform]|
|Phani    |Vadlmani |21 |Anakapalli  |283467|AppDev    |[express, react, Docker,

In [21]:
# orderBy() accepts a plain column name as well as a Column object.
df2.orderBy("state").show(truncate=False)

+----------------------------+-----+------+
|name                        |state|gender|
+----------------------------+-----+------+
|{Ramarao, null, Dandamudi}  |AP   |M     |
|{Ramu, null, Nagisetty}     |AP   |M     |
|{Mahender, null, null}      |TS   |M     |
|{Divyasree, null, Gottipati}|TS   |F     |
|{Snigdha, , Kantamaneni}    |TS   |F     |
|{Dhawan, null, Rachakonda}  |TS   |M     |
|{Pavan, Kumar, Mantha}      |TS   |M     |
+----------------------------+-----+------+



In [22]:
# Order by state, then by the nested firstname field inside the name struct.
df2.orderBy("state", "name.firstname").show(truncate=False)

+----------------------------+-----+------+
|name                        |state|gender|
+----------------------------+-----+------+
|{Ramarao, null, Dandamudi}  |AP   |M     |
|{Ramu, null, Nagisetty}     |AP   |M     |
|{Dhawan, null, Rachakonda}  |TS   |M     |
|{Divyasree, null, Gottipati}|TS   |F     |
|{Mahender, null, null}      |TS   |M     |
|{Pavan, Kumar, Mantha}      |TS   |M     |
|{Snigdha, , Kantamaneni}    |TS   |F     |
+----------------------------+-----+------+



## use of groupBy() function in pyspark

In [23]:
# Total salary per department.
by_department = df.groupBy("Department")
by_department.sum("Salary").show()



+----------+-----------+
|Department|sum(Salary)|
+----------+-----------+
|       SPS|     273567|
|      Aero|    1077835|
|    AppDev|     283467|
|       DAE|     303467|
+----------+-----------+



                                                                                

In [24]:
# Highest salary per department.
df.groupBy("Department").max("Salary").show()

+----------+-----------+
|Department|max(Salary)|
+----------+-----------+
|       SPS|     273567|
|      Aero|     303567|
|    AppDev|     283467|
|       DAE|     303467|
+----------+-----------+



In [25]:
# Give every row the same constant bonus of 2000.
flat_bonus = lit(2000)
df = df.withColumn("Bonus", flat_bonus)
df.show()

+---------+---------+---+------------+------+----------+--------------------+--------------+-----+-----+
|FirstName| LastName|Age|       Place|Salary|Department|        Technologies|modifiedSalary|State|Bonus|
+---------+---------+---+------------+------+----------+--------------------+--------------+-----+-----+
|    Pavan|   Mantha| 36|   Hyderabad|273567|       SPS|[java, spring boo...|        547134|   TS| 2000|
|     Arun|  Boppudi| 36|      Guntur|303567|      Aero|[java, spring boo...|        607134|   AP| 2000|
|     Ravi|Vadlamani| 26|Visakapatnam|213567|      Aero|[express, data st...|        427134|   AP| 2000|
| Mahender|        M| 21|   Hyderabad|153567|      Aero|[java, spring boo...|        307134|   TS| 2000|
|    Manoj|    Manoj| 21|      Guntur|183567|      Aero|    [express, react]|        367134|   AP| 2000|
|    Manoj|Velecheti| 21|Visakapatnam|223567|      Aero|[java, spring boo...|        447134|   AP| 2000|
|    Phani| Vadlmani| 21|  Anakapalli|283467|    AppDev

In [26]:
# Group by two columns and sum two numeric columns within each group.
df.groupBy(col("Department"), col("Place")).sum("Salary", "Bonus").show()

+----------+------------+-----------+----------+
|Department|       Place|sum(Salary)|sum(Bonus)|
+----------+------------+-----------+----------+
|       SPS|   Hyderabad|     273567|      2000|
|      Aero|      Guntur|     487134|      4000|
|      Aero|Visakapatnam|     437134|      4000|
|      Aero|   Hyderabad|     153567|      2000|
|    AppDev|  Anakapalli|     283467|      2000|
|       DAE|     Chennai|     303467|      2000|
+----------+------------+-----------+----------+



## use of join() function in pyspark

#### The examples below demonstrate joins using a classic example: employee and department DataFrames.

In [34]:
# Employee records, one tuple per employee, in the order given by empColumns.
emp = [
    (1, "Pavan Mantha", -1, "2018", "10", "M", 3000),
    (2, "Ramarao Dandamudi", 1, "2010", "20", "M", 4000),
    (3, "Mahender", 1, "2010", "10", "M", 1000),
    (4, "Snigdha Kantamaneni", 3, "2005", "10", "F", 2000),
    (5, "Divyasree Gothipati", 3, "2010", "40", "F", 1500),
    (6, "Ramu Nagisetty", 1, "2010", "50", "M", 2800),
    (7, "Dhawan Rachakonda", 2, "2010", "50", "M", 3600),
]
empColumns = [
    "emp_id", "name", "manager_id", "year_joined",
    "dept_id", "gender", "salary",
]

empDF = spark.createDataFrame(emp, empColumns)
empDF.printSchema()
empDF.show(truncate=False)

# Department records as (dept_name, dept_id). Note that dept_id is an integer
# here, while the employee tuples above carry it as a string.
dept = [
    ("Finance", 10),
    ("Marketing", 20),
    ("Sales", 30),
    ("IT", 40),
]
deptColumns = ["dept_name", "dept_id"]

deptDF = spark.createDataFrame(dept, deptColumns)
deptDF.printSchema()
deptDF.show(truncate=False)

root
 |-- emp_id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- manager_id: long (nullable = true)
 |-- year_joined: string (nullable = true)
 |-- dept_id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)

+------+-------------------+----------+-----------+-------+------+------+
|emp_id|name               |manager_id|year_joined|dept_id|gender|salary|
+------+-------------------+----------+-----------+-------+------+------+
|1     |Pavan Mantha       |-1        |2018       |10     |M     |3000  |
|2     |Ramarao Dandamudi  |1         |2010       |20     |M     |4000  |
|3     |Mahender           |1         |2010       |10     |M     |1000  |
|4     |Snigdha Kantamaneni|3         |2005       |10     |F     |2000  |
|5     |Divyasree Gothipati|3         |2010       |40     |F     |1500  |
|6     |Ramu Nagisetty     |1         |2010       |50     |M     |2800  |
|7     |Dhawan Rachakonda  |2         |2010       |50     |M

In [29]:
# Inner join: only employees whose dept_id matches a department are kept.
same_department = empDF.dept_id == deptDF.dept_id
empDF.join(deptDF, on=same_department, how="inner").show()

+------+-------------------+----------+-----------+-------+------+------+---------+-------+
|emp_id|               name|manager_id|year_joined|dept_id|gender|salary|dept_name|dept_id|
+------+-------------------+----------+-----------+-------+------+------+---------+-------+
|     1|       Pavan Mantha|        -1|       2018|     10|     M|  3000|  Finance|     10|
|     3|           Mahender|         1|       2010|     10|     M|  1000|  Finance|     10|
|     4|Snigdha Kantamaneni|         3|       2005|     10|     F|  2000|  Finance|     10|
|     2|  Ramarao Dandamudi|         1|       2010|     20|     M|  4000|Marketing|     20|
|     5|Divyasree Gothipati|         3|       2010|     40|     F|  1500|       IT|     40|
+------+-------------------+----------+-----------+-------+------+------+---------+-------+



In [30]:
# Run the same join once for each of the three join-type names used in this example.
for join_type in ("outer", "full", "fullouter"):
    empDF.join(deptDF, empDF.dept_id == deptDF.dept_id, join_type).show(truncate=False)

+------+-------------------+----------+-----------+-------+------+------+---------+-------+
|emp_id|name               |manager_id|year_joined|dept_id|gender|salary|dept_name|dept_id|
+------+-------------------+----------+-----------+-------+------+------+---------+-------+
|1     |Pavan Mantha       |-1        |2018       |10     |M     |3000  |Finance  |10     |
|3     |Mahender           |1         |2010       |10     |M     |1000  |Finance  |10     |
|4     |Snigdha Kantamaneni|3         |2005       |10     |F     |2000  |Finance  |10     |
|2     |Ramarao Dandamudi  |1         |2010       |20     |M     |4000  |Marketing|20     |
|null  |null               |null      |null       |null   |null  |null  |Sales    |30     |
|5     |Divyasree Gothipati|3         |2010       |40     |F     |1500  |IT       |40     |
|6     |Ramu Nagisetty     |1         |2010       |50     |M     |2800  |null     |null   |
|7     |Dhawan Rachakonda  |2         |2010       |50     |M     |3600  |null   

## union() functions in PySpark SQL

In [36]:
# A second batch of employee records with the same column layout as before.
emp = [
    (8, "Ashwini Vangala", -1, "2018", "10", "M", 3000),
    (9, "Akhil Debral", 8, "2010", "20", "M", 4000),
    (10, "Nishant Sharma", 8, "2010", "10", "M", 1000),
    (11, "Ramai V", 10, "2005", "10", "F", 2000),
    (12, "Ravi Teja", 10, "2010", "40", "M", 1500),
    (13, "Rajib", -1, "2010", "50", "M", 2800),
    (14, "Utkarsh Upadyay", 9, "2010", "50", "M", 3600),
]
empColumns = [
    "emp_id", "name", "manager_id", "year_joined",
    "dept_id", "gender", "salary",
]

empDF2 = spark.createDataFrame(emp, empColumns)
empDF2.printSchema()
empDF2.show(truncate=False)

root
 |-- emp_id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- manager_id: long (nullable = true)
 |-- year_joined: string (nullable = true)
 |-- dept_id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)

+------+---------------+----------+-----------+-------+------+------+
|emp_id|name           |manager_id|year_joined|dept_id|gender|salary|
+------+---------------+----------+-----------+-------+------+------+
|8     |Ashwini Vangala|-1        |2018       |10     |M     |3000  |
|9     |Akhil Debral   |8         |2010       |20     |M     |4000  |
|10    |Nishant Sharma |8         |2010       |10     |M     |1000  |
|11    |Ramai V        |10        |2005       |10     |F     |2000  |
|12    |Ravi Teja      |10        |2010       |40     |M     |1500  |
|13    |Rajib          |-1        |2010       |50     |M     |2800  |
|14    |Utkarsh Upadyay|9         |2010       |50     |M     |3600  |
+------+---------------+--

In [38]:
# Stack the two employee DataFrames (both built with the same column list)
# and display the combined rows.
unionDF = empDF.union(empDF2)
unionDF.show(truncate=False)

+------+-------------------+----------+-----------+-------+------+------+
|emp_id|name               |manager_id|year_joined|dept_id|gender|salary|
+------+-------------------+----------+-----------+-------+------+------+
|1     |Pavan Mantha       |-1        |2018       |10     |M     |3000  |
|2     |Ramarao Dandamudi  |1         |2010       |20     |M     |4000  |
|3     |Mahender           |1         |2010       |10     |M     |1000  |
|4     |Snigdha Kantamaneni|3         |2005       |10     |F     |2000  |
|5     |Divyasree Gothipati|3         |2010       |40     |F     |1500  |
|6     |Ramu Nagisetty     |1         |2010       |50     |M     |2800  |
|7     |Dhawan Rachakonda  |2         |2010       |50     |M     |3600  |
|8     |Ashwini Vangala    |-1        |2018       |10     |M     |3000  |
|9     |Akhil Debral       |8         |2010       |20     |M     |4000  |
|10    |Nishant Sharma     |8         |2010       |10     |M     |1000  |
|11    |Ramai V            |10        

## usage of map() function in PySpark

In [69]:
# Referring to columns by index: upper-case the name (position 1), keep the rest.
rdd2 = unionDF.rdd.map(
    lambda rec: (rec[0], rec[1].upper(), rec[2], rec[3], rec[4], rec[5], rec[6])
)
df2 = rdd2.toDF(schema=empColumns)
df2.show()

# Referring to columns by name: double the salary, keep the rest.
rdd2 = unionDF.rdd.map(
    lambda rec: (
        rec["emp_id"], rec["name"], rec["manager_id"], rec["year_joined"],
        rec["dept_id"], rec["gender"], rec["salary"] * 2,
    )
)
df2 = rdd2.toDF(schema=empColumns)
df2.show()

+------+-------------------+----------+-----------+-------+------+------+
|emp_id|               name|manager_id|year_joined|dept_id|gender|salary|
+------+-------------------+----------+-----------+-------+------+------+
|     1|       PAVAN MANTHA|        -1|       2018|     10|     M|  3000|
|     2|  RAMARAO DANDAMUDI|         1|       2010|     20|     M|  4000|
|     3|           MAHENDER|         1|       2010|     10|     M|  1000|
|     4|SNIGDHA KANTAMANENI|         3|       2005|     10|     F|  2000|
|     5|DIVYASREE GOTHIPATI|         3|       2010|     40|     F|  1500|
|     6|     RAMU NAGISETTY|         1|       2010|     50|     M|  2800|
|     7|  DHAWAN RACHAKONDA|         2|       2010|     50|     M|  3600|
|     8|    ASHWINI VANGALA|        -1|       2018|     10|     M|  3000|
|     9|       AKHIL DEBRAL|         8|       2010|     20|     M|  4000|
|    10|     NISHANT SHARMA|         8|       2010|     10|     M|  1000|
|    11|            RAMAI V|        10

## MapType(), map_keys(), map_values(), explode() functions in pyspark

In [74]:
# Two nullable columns: a name and a map from string keys to boolean values.
schema = StructType([
    StructField(name='name', dataType=StringType(), nullable=True),
    StructField(
        name='properties',
        dataType=MapType(keyType=StringType(), valueType=BooleanType()),
        nullable=True,
    ),
])

In [75]:
# NOTE(review): a session was already created at the top of the notebook, so
# getOrCreate() presumably returns that same session here - confirm.
spark = SparkSession.builder.appName('sparkdev').getOrCreate()

# One (name, properties) tuple per person; properties maps a skill name to a flag.
dataDictionary = [
    ('Pavan Mantha', {'SparkSQL': True, 'Snowflake': False}),
    ('Arun Boppudi', {'SparkSQL': True, 'Snowflake': True}),
    ('Ravi Vadlamani', {'SparkSQL': False, 'Snowflake': False}),
    ('Ramu Nagisetty', {'SparkSQL': True, 'Snowflake': False}),
    ('Ramarao Dandamudi', {'SparkSQL': False, 'Snowflake': False}),
]

df = spark.createDataFrame(dataDictionary, schema)
df.printSchema()
df.show(truncate=False)

root
 |-- name: string (nullable = true)
 |-- properties: map (nullable = true)
 |    |-- key: string
 |    |-- value: boolean (valueContainsNull = true)

+-----------------+---------------------------------------+
|name             |properties                             |
+-----------------+---------------------------------------+
|Pavan Mantha     |{SparkSQL -> true, Snowflake -> false} |
|Arun Boppudi     |{SparkSQL -> true, Snowflake -> true}  |
|Ravi Vadlamani   |{SparkSQL -> false, Snowflake -> false}|
|Ramu Nagisetty   |{SparkSQL -> true, Snowflake -> false} |
|Ramarao Dandamudi|{SparkSQL -> false, Snowflake -> false}|
+-----------------+---------------------------------------+



In [79]:
# Flatten the "properties" map into one boolean column per key. Reading the
# map entries with getItem() keeps the work inside the DataFrame API instead
# of converting to an RDD of Python tuples and rebuilding a DataFrame from it.
df2 = df.select(
    col("name"),
    col("properties").getItem("SparkSQL").alias("SparkSQL"),
    col("properties").getItem("Snowflake").alias("SnowFlake"),
)
df2.printSchema()
df2.show(truncate=False)

root
 |-- name: string (nullable = true)
 |-- SparkSQL: boolean (nullable = true)
 |-- SnowFlake: boolean (nullable = true)

+-----------------+--------+---------+
|name             |SparkSQL|SnowFlake|
+-----------------+--------+---------+
|Pavan Mantha     |true    |false    |
|Arun Boppudi     |true    |true     |
|Ravi Vadlamani   |false   |false    |
|Ramu Nagisetty   |true    |false    |
|Ramarao Dandamudi|false   |false    |
+-----------------+--------+---------+



In [88]:
# Rows where SparkSQL is true and SnowFlake is false. The boolean columns are
# used directly as the predicate (no `== True` / `== False`), and "SnowFlake"
# is spelled exactly as the column was named when df2 was built.
df2.where(col("SparkSQL") & ~col("SnowFlake")).show(truncate=False)

+--------------+--------+---------+
|name          |SparkSQL|SnowFlake|
+--------------+--------+---------+
|Pavan Mantha  |true    |false    |
|Ramu Nagisetty|true    |false    |
+--------------+--------+---------+



In [89]:
# explode() on the map column emits one output row per map entry.
df.select("name", explode("properties")).show()

+-----------------+---------+-----+
|             name|      key|value|
+-----------------+---------+-----+
|     Pavan Mantha| SparkSQL| true|
|     Pavan Mantha|Snowflake|false|
|     Arun Boppudi| SparkSQL| true|
|     Arun Boppudi|Snowflake| true|
|   Ravi Vadlamani| SparkSQL|false|
|   Ravi Vadlamani|Snowflake|false|
|   Ramu Nagisetty| SparkSQL| true|
|   Ramu Nagisetty|Snowflake|false|
|Ramarao Dandamudi| SparkSQL|false|
|Ramarao Dandamudi|Snowflake|false|
+-----------------+---------+-----+



In [91]:
# map_keys() returns the keys of the map column as an array.
df.select("name", map_keys("properties")).show(truncate=False)

+-----------------+---------------------+
|name             |map_keys(properties) |
+-----------------+---------------------+
|Pavan Mantha     |[SparkSQL, Snowflake]|
|Arun Boppudi     |[SparkSQL, Snowflake]|
|Ravi Vadlamani   |[SparkSQL, Snowflake]|
|Ramu Nagisetty   |[SparkSQL, Snowflake]|
|Ramarao Dandamudi|[SparkSQL, Snowflake]|
+-----------------+---------------------+



In [92]:
# map_values() returns the values of the map column as an array.
df.select("name", map_values("properties")).show(truncate=False)

+-----------------+----------------------+
|name             |map_values(properties)|
+-----------------+----------------------+
|Pavan Mantha     |[true, false]         |
|Arun Boppudi     |[true, true]          |
|Ravi Vadlamani   |[false, false]        |
|Ramu Nagisetty   |[true, false]         |
|Ramarao Dandamudi|[false, false]        |
+-----------------+----------------------+



## use of date functions in pyspark

In [93]:
# Single-row DataFrame whose "timestamp" column starts out as a plain string.
df = spark.createDataFrame(
    [("1", "2023-03-19 08:57:00.000")],
    ["id", "timestamp"],
)
df.printSchema()

root
 |-- id: string (nullable = true)
 |-- timestamp: string (nullable = true)



In [95]:
# Timestamp string to TimestampType (to_timestamp parses the string column)
df.withColumn("timestamp_datetime",to_timestamp("timestamp")) \
    .show(truncate=False)

# Using cast to convert the parsed timestamp back to a string
df.withColumn('timestamp_string', \
              to_timestamp('timestamp').cast('string')) \
    .show(truncate=False)

+---+-----------------------+-------------------+
|id |timestamp              |timestamp_datetime |
+---+-----------------------+-------------------+
|1  |2023-03-19 08:57:00.000|2023-03-19 08:57:00|
+---+-----------------------+-------------------+

+---+-----------------------+-------------------+
|id |timestamp              |timestamp_string   |
+---+-----------------------+-------------------+
|1  |2023-03-19 08:57:00.000|2023-03-19 08:57:00|
+---+-----------------------+-------------------+



In [100]:
# SQL string to TimestampType
spark.sql("select to_timestamp('2023-03-19 08:57:00.000') as timestamp").show()
# SQL CAST of a timestamp string to TimestampType
spark.sql("select timestamp('2023-03-19 08:57:00.000') as timestamp").show()
# SQL custom-format string to TimestampType. The pattern uses three fraction
# digits (SSS) so that it matches the ".000" in the input literal.
spark.sql("select to_timestamp('03-19-2023 08:57:00.000','MM-dd-yyyy HH:mm:ss.SSS') as timestamp").show()

+-------------------+
|          timestamp|
+-------------------+
|2023-03-19 08:57:00|
+-------------------+

+-------------------+
|          timestamp|
+-------------------+
|2023-03-19 08:57:00|
+-------------------+

+-------------------+
|          timestamp|
+-------------------+
|2023-03-19 08:57:00|
+-------------------+

