In [26]:
# Runs before the first `pyspark` import in the next cell, so that import can succeed.
# NOTE(review): presumably relies on findspark locating the Spark install
# (e.g. via SPARK_HOME) — confirm for the target environment.
import findspark as fs
fs.init()

In [27]:
from pyspark.sql import SparkSession

# Single shared session for the whole notebook, running against a local master.
spark = (
    SparkSession.builder
    .appName("Schemas")
    .master("local[2]")
    .getOrCreate()
)


In [28]:
# Employee rows, one tuple per employee: (eid, name, yoj, dept_id, gender, age)
# NOTE(review): an earlier version of this comment called the last field
# `salary`, but the schema below names it `age` — confirm which is intended.
from pyspark.sql.types import (
    StructType,
    StructField,
    StringType,
    IntegerType,
    DateType,
    ShortType,
    CharType,
)

# dept_id values 50 and 60 are absent from deptData (defined in the next cell),
# which is what makes the outer / anti join results below non-trivial.
empData = [
    (1, 'James', 2019, 10, 'M', 45),
    (2, 'John', 2020, 40, 'M', 78),
    (3, 'Jessica', 2021, 50, 'M', 23),
    (4, 'Javier', 2019, 60, 'M', 12),
    (5, 'June', 2023, 10, 'M', 85),
]

# Build the schema from (column name, Spark type) pairs, listed in the same
# order as the fields of each tuple in empData.
empSchema = StructType([
    StructField(col_name, col_type())
    for col_name, col_type in (
        ('eid', IntegerType),
        ('name', StringType),
        ('yoj', IntegerType),
        ('dept_id', IntegerType),
        ('gender', StringType),
        ('age', IntegerType),
    )
])

empDf = spark.createDataFrame(empData, empSchema)
empDf.printSchema()

root
 |-- eid: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- yoj: integer (nullable = true)
 |-- dept_id: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- age: integer (nullable = true)



In [29]:
# Department rows, one tuple per department: (dept_id, dept)
deptData = [
    (10, 'Sales'),
    (20, 'Marketing'),
    (40, 'IT'),
    (30, 'HR'),
]

# Build the schema from (column name, Spark type) pairs, in tuple-field order.
deptSchema = StructType([
    StructField(col_name, col_type())
    for col_name, col_type in (
        ('dept_id', IntegerType),
        ('dept', StringType),
    )
])

deptDf = spark.createDataFrame(deptData, deptSchema)
deptDf.printSchema()

root
 |-- dept_id: integer (nullable = true)
 |-- dept: string (nullable = true)



### JOIN


In [30]:
# Inner join: keeps only employees whose dept_id also exists in deptDf.
# Joining on a Column expression (rather than a column name) keeps BOTH
# dept_id columns in the result, as the output below shows.
innerJoinDF = empDf.join(
    deptDf,
    on=empDf['dept_id'] == deptDf['dept_id'],
    how='inner',
)
innerJoinDF.show()

+---+-----+----+-------+------+---+-------+-----+
|eid| name| yoj|dept_id|gender|age|dept_id| dept|
+---+-----+----+-------+------+---+-------+-----+
|  1|James|2019|     10|     M| 45|     10|Sales|
|  5| June|2023|     10|     M| 85|     10|Sales|
|  2| John|2020|     40|     M| 78|     40|   IT|
+---+-----+----+-------+------+---+-------+-----+



In [31]:
# Left join: every employee is kept; employees whose dept_id has no match in
# deptDf get null in the department-side columns.
leftJoinDF = empDf.join(
    deptDf,
    on=empDf['dept_id'] == deptDf['dept_id'],
    how='left',
)
leftJoinDF.show()

+---+-------+----+-------+------+---+-------+-----+
|eid|   name| yoj|dept_id|gender|age|dept_id| dept|
+---+-------+----+-------+------+---+-------+-----+
|  2|   John|2020|     40|     M| 78|     40|   IT|
|  1|  James|2019|     10|     M| 45|     10|Sales|
|  5|   June|2023|     10|     M| 85|     10|Sales|
|  3|Jessica|2021|     50|     M| 23|   null| null|
|  4| Javier|2019|     60|     M| 12|   null| null|
+---+-------+----+-------+------+---+-------+-----+



In [33]:
# Full outer join: rows from both sides are kept, with nulls where there is no match.
# Passing the join key as a column name behaves like the `USING` keyword in MySQL:
# when the column has the same name in both tables there is no need to write
# `t1.c1 == t2.c1`, and the result contains a single dept_id column.
fullOuterJoinDF = empDf.join(deptDf, on='dept_id', how='fullouter')
fullOuterJoinDF.show()

+-------+----+-------+----+------+----+---------+
|dept_id| eid|   name| yoj|gender| age|     dept|
+-------+----+-------+----+------+----+---------+
|     10|   1|  James|2019|     M|  45|    Sales|
|     10|   5|   June|2023|     M|  85|    Sales|
|     20|null|   null|null|  null|null|Marketing|
|     30|null|   null|null|  null|null|       HR|
|     40|   2|   John|2020|     M|  78|       IT|
|     50|   3|Jessica|2021|     M|  23|     null|
|     60|   4| Javier|2019|     M|  12|     null|
+-------+----+-------+----+------+----+---------+



In [38]:
# Left semi join: employees that have a matching dept_id in deptDf,
# returned with the employee-side columns only (no `dept` column).
leftSemiJoin = empDf.join(deptDf, on='dept_id', how='leftsemi')
leftSemiJoin.show()

+-------+---+-----+----+------+---+
|dept_id|eid| name| yoj|gender|age|
+-------+---+-----+----+------+---+
|     10|  1|James|2019|     M| 45|
|     10|  5| June|2023|     M| 85|
|     40|  2| John|2020|     M| 78|
+-------+---+-----+----+------+---+



In [43]:
# Left anti join: the complement of the semi join — employees whose dept_id
# has NO match in deptDf, again with employee-side columns only.
leftAntiJoin = empDf.join(deptDf, on='dept_id', how='left_anti')
leftAntiJoin.show()

+-------+---+-------+----+------+---+
|dept_id|eid|   name| yoj|gender|age|
+-------+---+-------+----+------+---+
|     50|  3|Jessica|2021|     M| 23|
|     60|  4| Javier|2019|     M| 12|
+-------+---+-------+----+------+---+



In [42]:
# Print the built-in documentation for DataFrame.join; its `how` parameter
# section lists the accepted join-type strings beyond the ones used above.
# NOTE(review): this import sits mid-notebook and this cell's execution count
# (42) is lower than the previous cell's (43) — consider moving the import to
# the top and re-running the notebook in order.
import pyspark
help(pyspark.sql.DataFrame.join)

Help on function join in module pyspark.sql.dataframe:

join(self, other: 'DataFrame', on: Union[str, List[str], pyspark.sql.column.Column, List[pyspark.sql.column.Column], NoneType] = None, how: Optional[str] = None) -> 'DataFrame'
    Joins with another :class:`DataFrame`, using the given join expression.
    
    .. versionadded:: 1.3.0
    
    .. versionchanged:: 3.4.0
        Supports Spark Connect.
    
    Parameters
    ----------
    other : :class:`DataFrame`
        Right side of the join
    on : str, list or :class:`Column`, optional
        a string for the join column name, a list of column names,
        a join expression (Column), or a list of Columns.
        If `on` is a string or a list of strings indicating the name of the join column(s),
        the column(s) must exist on both sides, and this performs an equi-join.
    how : str, optional
        default ``inner``. Must be one of: ``inner``, ``cross``, ``outer``,
        ``full``, ``fullouter``, ``full_outer``, 