In [1]:
import findspark 
findspark.init()

In [2]:
from pyspark.conf import SparkConf
config = SparkConf()
config.setMaster("local").setAppName("DFJoin")

from pyspark.sql import SparkSession
spark = SparkSession.builder.config(conf=config).getOrCreate()

22/06/04 01:01:24 WARN Utils: Your hostname, ubuntu-virtual-machine resolves to a loopback address: 127.0.1.1; using 192.168.174.129 instead (on interface ens33)
22/06/04 01:01:24 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/opt/spark-3.1.3-bin-hadoop2.7/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/ubuntu/.ivy2/cache
The jars for the packages stored in: /home/ubuntu/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-68e64ca5-aeee-40e5-b8ff-1326147cc002;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.1.3 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.1.3 in central
	found org.apache.kafka#kafka-clients;2.6.0 in central
	found com.github.luben#zstd-jni;1.4.8-1 in central
	found org.lz4#lz4-java;1.7.1 in central
	found org.xerial.snappy#snappy-java;1.1.8.2 in central
	found org.slf4j#slf4j-api;1.7.30 in central
	found org.spark-project.spark#unused;1.0.0 in central
	found org.apache.commons#commons-pool2;2.6.2 in central
:: resolution report :: resolve 1925ms :: artifacts dl 32ms
	:: modules in use:
	com.github.luben#zstd-jni;1.4.8-1 from central in [default]
	org.apache.commons#commons-pool2;2.6.2 from centra

In [3]:
products = [ 
          # (product_id, product_name, brand_id)  
         (1, 'iPhone', 100),
         (2, 'Galaxy', 200),
         (3, 'Redme', 300), #   no matching brand
         (4, 'Pixel', 400),
]

brands = [
    #(brand_id, brand_name)
    (100, "Apple"),
    (200, "Samsung"),
    (400, "Google"),
    (500, "Sony"), # no matching products
]
 
productDf = spark.createDataFrame(data=products, schema=["product_id", "product_name", "brand_id"])
brandDf = spark.createDataFrame(data=brands, schema=["brand_id", "brand_name"])
productDf.show()
brandDf.show()

productDf.createOrReplaceTempView("products")
brandDf.createOrReplaceTempView("brands")

                                                                                

+----------+------------+--------+
|product_id|product_name|brand_id|
+----------+------------+--------+
|         1|      iPhone|     100|
|         2|      Galaxy|     200|
|         3|       Redme|     300|
|         4|       Pixel|     400|
+----------+------------+--------+

+--------+----------+
|brand_id|brand_name|
+--------+----------+
|     100|     Apple|
|     200|   Samsung|
|     400|    Google|
|     500|      Sony|
+--------+----------+



In [4]:
# Inner Join
# productDf is left
# brandDf is right
# inner join: select/pick only matching record, discord if no matches found
# join returns a new df
df = productDf.join(brandDf, productDf.brand_id  ==  brandDf["brand_id"], "inner") 
df.printSchema()
df.show()

root
 |-- product_id: long (nullable = true)
 |-- product_name: string (nullable = true)
 |-- brand_id: long (nullable = true)
 |-- brand_id: long (nullable = true)
 |-- brand_name: string (nullable = true)



                                                                                

+----------+------------+--------+--------+----------+
|product_id|product_name|brand_id|brand_id|brand_name|
+----------+------------+--------+--------+----------+
|         1|      iPhone|     100|     100|     Apple|
|         2|      Galaxy|     200|     200|   Samsung|
|         4|       Pixel|     400|     400|    Google|
+----------+------------+--------+--------+----------+



In [7]:
spark.sql("""
SELECT products.*, brands.brand_name FROM products
INNER JOIN brands ON products.brand_id = brands.brand_id
""").show()



+----------+------------+--------+----------+
|product_id|product_name|brand_id|brand_name|
+----------+------------+--------+----------+
|         1|      iPhone|     100|     Apple|
|         2|      Galaxy|     200|   Samsung|
|         4|       Pixel|     400|    Google|
+----------+------------+--------+----------+



                                                                                

In [7]:
# how to remove specific column
df = productDf.join(brandDf, productDf.brand_id  ==  brandDf["brand_id"], "inner")\
              .drop(brandDf["brand_id"])      
df.printSchema()
df.show()

root
 |-- product_id: long (nullable = true)
 |-- product_name: string (nullable = true)
 |-- brand_id: long (nullable = true)
 |-- brand_name: string (nullable = true)





+----------+------------+--------+----------+
|product_id|product_name|brand_id|brand_name|
+----------+------------+--------+----------+
|         1|      iPhone|     100|     Apple|
|         2|      Galaxy|     200|   Samsung|
|         4|       Pixel|     400|    Google|
+----------+------------+--------+----------+



                                                                                

In [8]:
# Outer Join, Full Outer Outer, [Left outer + Right outer]
# pick all records from left dataframe, and also right dataframe
# if no matches found, it fills null data for not matched records
productDf.join(brandDf, productDf["brand_id"] ==  brandDf["brand_id"], "outer").show()



+----------+------------+--------+--------+----------+
|product_id|product_name|brand_id|brand_id|brand_name|
+----------+------------+--------+--------+----------+
|      null|        null|    null|     500|      Sony|
|         1|      iPhone|     100|     100|     Apple|
|         2|      Galaxy|     200|     200|   Samsung|
|         4|       Pixel|     400|     400|    Google|
|         3|       Redme|     300|    null|      null|
+----------+------------+--------+--------+----------+



                                                                                

In [9]:
spark.sql("""
SELECT products.*, brands.* FROM products
FULL OUTER JOIN brands ON products.brand_id = brands.brand_id
""").show()



+----------+------------+--------+--------+----------+
|product_id|product_name|brand_id|brand_id|brand_name|
+----------+------------+--------+--------+----------+
|      null|        null|    null|     500|      Sony|
|         1|      iPhone|     100|     100|     Apple|
|         2|      Galaxy|     200|     200|   Samsung|
|         4|       Pixel|     400|     400|    Google|
|         3|       Redme|     300|    null|      null|
+----------+------------+--------+--------+----------+



                                                                                

In [9]:
# Left, Left Outer join 
# picks all records from left, if no matches found, it fills null for right data
productDf.join(brandDf, productDf["brand_id"] ==  brandDf["brand_id"], "leftouter").show()



+----------+------------+--------+--------+----------+
|product_id|product_name|brand_id|brand_id|brand_name|
+----------+------------+--------+--------+----------+
|         1|      iPhone|     100|     100|     Apple|
|         2|      Galaxy|     200|     200|   Samsung|
|         4|       Pixel|     400|     400|    Google|
|         3|       Redme|     300|    null|      null|
+----------+------------+--------+--------+----------+



                                                                                

In [10]:
spark.sql("""
SELECT products.*, brands.* FROM products
LEFT OUTER JOIN brands ON products.brand_id = brands.brand_id
""").show()



+----------+------------+--------+--------+----------+
|product_id|product_name|brand_id|brand_id|brand_name|
+----------+------------+--------+--------+----------+
|         1|      iPhone|     100|     100|     Apple|
|         2|      Galaxy|     200|     200|   Samsung|
|         4|       Pixel|     400|     400|    Google|
|         3|       Redme|     300|    null|      null|
+----------+------------+--------+--------+----------+



                                                                                

In [10]:
# Right, Right outer Join
# picks all the records from right, if no matches found, fills left data with null
productDf.join(brandDf, productDf["brand_id"] ==  brandDf["brand_id"], "rightouter").show()



+----------+------------+--------+--------+----------+
|product_id|product_name|brand_id|brand_id|brand_name|
+----------+------------+--------+--------+----------+
|      null|        null|    null|     500|      Sony|
|         1|      iPhone|     100|     100|     Apple|
|         2|      Galaxy|     200|     200|   Samsung|
|         4|       Pixel|     400|     400|    Google|
+----------+------------+--------+--------+----------+



                                                                                

In [11]:
spark.sql("""
SELECT products.*, brands.* FROM products
RIGHT OUTER JOIN brands ON products.brand_id = brands.brand_id
""").show()



+----------+------------+--------+--------+----------+
|product_id|product_name|brand_id|brand_id|brand_name|
+----------+------------+--------+--------+----------+
|      null|        null|    null|     500|      Sony|
|         1|      iPhone|     100|     100|     Apple|
|         2|      Galaxy|     200|     200|   Samsung|
|         4|       Pixel|     400|     400|    Google|
+----------+------------+--------+--------+----------+



                                                                                

In [11]:
# left semi join
# join in general convention, it pull the records from both right and left, join them based on condition
# left semi join, join left and right based on condition, however it pull the records only from left side

# it is similar to innerjoin, but pick/project records only from left
# we can't see brand_id, brand_name from brands df
productDf.join(brandDf, productDf["brand_id"] ==  brandDf["brand_id"], "leftsemi").show()



+----------+------------+--------+
|product_id|product_name|brand_id|
+----------+------------+--------+
|         1|      iPhone|     100|
|         2|      Galaxy|     200|
|         4|       Pixel|     400|
+----------+------------+--------+



                                                                                

In [13]:
spark.sql("""
SELECT products.* FROM products
LEFT SEMI JOIN brands ON products.brand_id = brands.brand_id
""").show()



+----------+------------+--------+
|product_id|product_name|brand_id|
+----------+------------+--------+
|         1|      iPhone|     100|
|         2|      Galaxy|     200|
|         4|       Pixel|     400|
+----------+------------+--------+



                                                                                

In [12]:
# left anti join: exact opposite to semi join
# picks the records that doesn't have match on the right side
productDf.join(brandDf, productDf["brand_id"] ==  brandDf["brand_id"], "leftanti").show()



+----------+------------+--------+
|product_id|product_name|brand_id|
+----------+------------+--------+
|         3|       Redme|     300|
+----------+------------+--------+



                                                                                

In [15]:
spark.sql("""
SELECT products.* FROM products
LEFT ANTI   JOIN brands ON products.brand_id = brands.brand_id
""").show()

                                                                                

+----------+------------+--------+
|product_id|product_name|brand_id|
+----------+------------+--------+
|         3|       Redme|     300|
+----------+------------+--------+



In [16]:

store = [
    #(store_id, store_name)
    (1000, "Poorvika"),
    (2000, "Sangeetha"),
    (4000, "Amazon"),
    (5000, "FlipKart"), 
]
 
storeDf = spark.createDataFrame(data=store, schema=["store_id", "store_name"])
storeDf.show()
store.createOrReplaceTempView("stores")


+--------+----------+
|store_id|store_name|
+--------+----------+
|    1000|  Poorvika|
|    2000| Sangeetha|
|    4000|    Amazon|
|    5000|  FlipKart|
+--------+----------+



AttributeError: 'list' object has no attribute 'createOrReplaceTempView'

In [14]:

# cartesian , take row from left side, pair with all from right side
productDf.crossJoin(storeDf).show()

+----------+------------+--------+--------+----------+
|product_id|product_name|brand_id|store_id|store_name|
+----------+------------+--------+--------+----------+
|         1|      iPhone|     100|    1000|  Poorvika|
|         1|      iPhone|     100|    2000| Sangeetha|
|         1|      iPhone|     100|    4000|    Amazon|
|         1|      iPhone|     100|    5000|  FlipKart|
|         2|      Galaxy|     200|    1000|  Poorvika|
|         2|      Galaxy|     200|    2000| Sangeetha|
|         2|      Galaxy|     200|    4000|    Amazon|
|         2|      Galaxy|     200|    5000|  FlipKart|
|         3|       Redme|     300|    1000|  Poorvika|
|         3|       Redme|     300|    2000| Sangeetha|
|         3|       Redme|     300|    4000|    Amazon|
|         3|       Redme|     300|    5000|  FlipKart|
|         4|       Pixel|     400|    1000|  Poorvika|
|         4|       Pixel|     400|    2000| Sangeetha|
|         4|       Pixel|     400|    4000|    Amazon|
|         

In [17]:
spark.sql("""
SELECT products.*, brands.* FROM products
CROSS JOIN brands
""").show()

+----------+------------+--------+--------+----------+
|product_id|product_name|brand_id|brand_id|brand_name|
+----------+------------+--------+--------+----------+
|         1|      iPhone|     100|     100|     Apple|
|         1|      iPhone|     100|     200|   Samsung|
|         1|      iPhone|     100|     400|    Google|
|         1|      iPhone|     100|     500|      Sony|
|         2|      Galaxy|     200|     100|     Apple|
|         2|      Galaxy|     200|     200|   Samsung|
|         2|      Galaxy|     200|     400|    Google|
|         2|      Galaxy|     200|     500|      Sony|
|         3|       Redme|     300|     100|     Apple|
|         3|       Redme|     300|     200|   Samsung|
|         3|       Redme|     300|     400|    Google|
|         3|       Redme|     300|     500|      Sony|
|         4|       Pixel|     400|     100|     Apple|
|         4|       Pixel|     400|     200|   Samsung|
|         4|       Pixel|     400|     400|    Google|
|         