### ![Spark Logo Tiny](https://files.training.databricks.com/images/105/logo_spark_tiny.png) Tipos de Join --> Left join

In [0]:
from pyspark.sql.functions import lit, concat, col, desc, when

# Column names for the employee DataFrame, listed in the same order as the
# fields of each tuple in `employee_data`.
employee_schema = [
    "employee_id", "first_name", "last_name", "doj",
    "dept_id", "gender", "salary", "member_id",
]

# Sample employee rows. Employee 30 has no gender (None), and employee 80
# carries dept_id "800", which makes it useful for exercising the unmatched
# side of a left join.
employee_data = [
    (10, "Raj", "Kumar", "1999", "100", "M", 2000, "100"),
    (20, "Sahul", "Rajan", "2002", "200", "F", 3000, "200"),
    (30, "Antonio", "Manish", "2010", "300", None, 9000, "300"),
    (40, "Yaja", "Singh", "2004", "100", "F", 1000, "200"),
    (50, "Nama", "Krish", "2008", "400", "M", 8000, "400"),
    (60, "Alfonso", "Kutty", "2014", "400", "M", 7000, "600"),
    (70, "Tumar", "Chand", "2004", "200", "M", 7000, "700"),
    (80, "Andres", "Soto", "2003", "800", "M", 4000, "700"),
]

# Only column names are supplied, so Spark infers each column's type from
# the Python values; printSchema() shows the inferred result.
df = spark.createDataFrame(data=employee_data, schema=employee_schema)
df.printSchema()

root
 |-- employee_id: long (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- doj: string (nullable = true)
 |-- dept_id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- member_id: string (nullable = true)



In [0]:
# Column names for the department lookup DataFrame.
dept_schema = ["dept_id", "dept_name"]

# Department lookup rows as (dept_id, dept_name) pairs; dept_id is a string.
departamento = [
    ("100", "HR"),
    ("200", "Supply"),
    ("300", "Sales"),
    ("400", "Stock"),
]

df_dept = spark.createDataFrame(data=departamento, schema=dept_schema)
df_dept.printSchema()

root
 |-- dept_id: string (nullable = true)
 |-- dept_name: string (nullable = true)



#### Ejemplo 1

In [0]:
# Left join on an explicit column expression: every employee row is kept and
# both dept_id columns (one from each side) appear in the result.
df_join = df.join(
    df_dept,
    on=df.dept_id == df_dept.dept_id,
    how="left",
)

display(df_join)

employee_id,first_name,last_name,doj,dept_id,gender,salary,member_id,dept_id.1,dept_name
10,Raj,Kumar,1999,100,M,2000,100,100.0,HR
20,Sahul,Rajan,2002,200,F,3000,200,200.0,Supply
30,Antonio,Manish,2010,300,,9000,300,300.0,Sales
40,Yaja,Singh,2004,100,F,1000,200,100.0,HR
50,Nama,Krish,2008,400,M,8000,400,400.0,Stock
60,Alfonso,Kutty,2014,400,M,7000,600,400.0,Stock
70,Tumar,Chand,2004,200,M,7000,700,200.0,Supply
80,Andres,Soto,2003,800,M,4000,700,,


#### Ejemplo 2

In [0]:
# Left join on a column *name*: the result contains a single dept_id column,
# placed first, instead of one copy per side.
df_join = df.join(
    df_dept,
    on="dept_id",
    how="left",
)

display(df_join)

dept_id,employee_id,first_name,last_name,doj,gender,salary,member_id,dept_name
100,10,Raj,Kumar,1999,M,2000,100,HR
200,20,Sahul,Rajan,2002,F,3000,200,Supply
300,30,Antonio,Manish,2010,,9000,300,Sales
100,40,Yaja,Singh,2004,F,1000,200,HR
400,50,Nama,Krish,2008,M,8000,400,Stock
400,60,Alfonso,Kutty,2014,M,7000,600,Stock
200,70,Tumar,Chand,2004,M,7000,700,Supply
800,80,Andres,Soto,2003,M,4000,700,


#### Ejemplo 3

In [0]:
# Left join that keeps every employee column plus only dept_name from the
# department side, avoiding the duplicated dept_id column.
#
# DataFrame.show() prints the table and returns None, so it must not be the
# last call in the chain that is assigned to df_join; otherwise df_join would
# be None rather than the joined DataFrame.
df_join = (
    df.join(df_dept, df.dept_id == df_dept.dept_id, 'left')
      .select(df['*'], df_dept['dept_name'])
)

df_join.show(truncate=False)

+-----------+----------+---------+----+-------+------+------+---------+---------+
|employee_id|first_name|last_name|doj |dept_id|gender|salary|member_id|dept_name|
+-----------+----------+---------+----+-------+------+------+---------+---------+
|10         |Raj       |Kumar    |1999|100    |M     |2000  |100      |HR       |
|20         |Sahul     |Rajan    |2002|200    |F     |3000  |200      |Supply   |
|30         |Antonio   |Manish   |2010|300    |null  |9000  |300      |Sales    |
|40         |Yaja      |Singh    |2004|100    |F     |1000  |200      |HR       |
|50         |Nama      |Krish    |2008|400    |M     |8000  |400      |Stock    |
|60         |Alfonso   |Kutty    |2014|400    |M     |7000  |600      |Stock    |
|70         |Tumar     |Chand    |2004|200    |M     |7000  |700      |Supply   |
|80         |Andres    |Soto     |2003|800    |M     |4000  |700      |null     |
+-----------+----------+---------+----+-------+------+------+---------+---------+



#### Ejemplo 4

In [0]:
# Same left join as the previous example, expressed with DataFrame aliases:
# 'a' is the employee side and 'b' is the department side. The join condition
# and the projection both refer to columns through those aliases.
#
# DataFrame.show() prints the table and returns None, so the DataFrame is
# assigned first and shown afterwards; df_join stays usable as a DataFrame.
df_join = (
    df.alias('a')
      .join(df_dept.alias('b'), col('a.dept_id') == col('b.dept_id'), 'left')
      .select('a.*', 'b.dept_name')
)

df_join.show(truncate=False)

+-----------+----------+---------+----+-------+------+------+---------+---------+
|employee_id|first_name|last_name|doj |dept_id|gender|salary|member_id|dept_name|
+-----------+----------+---------+----+-------+------+------+---------+---------+
|10         |Raj       |Kumar    |1999|100    |M     |2000  |100      |HR       |
|20         |Sahul     |Rajan    |2002|200    |F     |3000  |200      |Supply   |
|30         |Antonio   |Manish   |2010|300    |null  |9000  |300      |Sales    |
|40         |Yaja      |Singh    |2004|100    |F     |1000  |200      |HR       |
|50         |Nama      |Krish    |2008|400    |M     |8000  |400      |Stock    |
|60         |Alfonso   |Kutty    |2014|400    |M     |7000  |600      |Stock    |
|70         |Tumar     |Chand    |2004|200    |M     |7000  |700      |Supply   |
|80         |Andres    |Soto     |2003|800    |M     |4000  |700      |null     |
+-----------+----------+---------+----+-------+------+------+---------+---------+



#### Ejemplo 5

In [0]:
# Left join followed by a NULL filter on the right side: keeps only the
# employees whose dept_id has no matching department row.
# 'a' is the employee side and 'b' is the department side.
#
# DataFrame.show() prints the table and returns None, so the DataFrame is
# assigned first and shown afterwards; df_join stays usable as a DataFrame.
df_join = (
    df.alias('a')
      .join(df_dept.alias('b'), col('a.dept_id') == col('b.dept_id'), 'left')
      .filter('b.dept_name IS NULL')
      .select('a.*', 'b.dept_name')
)

df_join.show(truncate=False)

+-----------+----------+---------+----+-------+------+------+---------+---------+
|employee_id|first_name|last_name|doj |dept_id|gender|salary|member_id|dept_name|
+-----------+----------+---------+----+-------+------+------+---------+---------+
|80         |Andres    |Soto     |2003|800    |M     |4000  |700      |null     |
+-----------+----------+---------+----+-------+------+------+---------+---------+



#### Ejemplo 6

In [0]:
# Left join followed by a count of employees per department name. Employees
# without a matching department are grouped under a null dept_name.
#
# DataFrame.show() prints the table and returns None, so the aggregated
# DataFrame is assigned first and shown afterwards.
df_join = (
    df.join(df_dept, df.dept_id == df_dept.dept_id, 'left')
      .groupBy('dept_name')
      .count()
)

df_join.show(truncate=False)

+---------+-----+
|dept_name|count|
+---------+-----+
|HR       |2    |
|Supply   |2    |
|Sales    |1    |
|Stock    |2    |
|null     |1    |
+---------+-----+

