### ![Spark Logo Tiny](https://files.training.databricks.com/images/105/logo_spark_tiny.png) Tipos de Join --> Inner join

In [0]:
from pyspark.sql.functions import lit, concat, col, desc

# Sample employee rows; each tuple follows the column order of employee_schema.
# NOTE(review): member_id is written as strings here, while the saved cell
# outputs report it as long with different values — the outputs look stale;
# re-run the notebook to confirm.
employee_data = [
    (10, "Raj",     "Kumar",  "1999", "100", "M",  2000, "100"),
    (20, "Sahul",   "Rajan",  "2002", "200", "F",  3000, "200"),
    (30, "Antonio", "Manish", "2010", "300", None, 9000, "300"),
    (40, "Yaja",    "Singh",  "2004", "100", "F",  1000, "200"),
    (50, "Nama",    "Krish",  "2008", "400", "M",  8000, "400"),
    (60, "Alfonso", "Kutty",  "2014", "400", "M",  7000, "600"),
    (70, "Tumar",   "Chand",  "2004", "200", "M",  7000, "700"),
]
employee_schema = [
    "employee_id", "first_name", "last_name", "doj",
    "dept_id", "gender", "salary", "member_id",
]

# Only column names are supplied, so Spark infers each column type from the data.
df = spark.createDataFrame(employee_data, employee_schema)
df.printSchema()

root
 |-- employee_id: long (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- doj: string (nullable = true)
 |-- dept_id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- member_id: long (nullable = true)



In [0]:
# Department lookup rows as (dept_id, dept_name); dept_id is a string so it
# compares directly with the string dept_id values of the employee rows.
departamento = [
    ("100", "HR"),
    ("200", "Supply"),
    ("300", "Sales"),
    ("400", "Stock"),
]
dept_schema = ["dept_id", "dept_name"]

df_dept = spark.createDataFrame(departamento, dept_schema)
df_dept.printSchema()

root
 |-- dept_id: string (nullable = true)
 |-- dept_name: string (nullable = true)



#### Ejemplo 1

In [0]:
# Inner join with an explicit equality condition and an explicit join type.
# Both dept_id columns stay in the result (the saved output labels the second
# one "dept_id.1").
df_join = df.join(
    df_dept,
    on=df["dept_id"] == df_dept["dept_id"],
    how="inner",
)

display(df_join)

employee_id,first_name,last_name,doj,dept_id,gender,salary,member_id,dept_id.1,dept_name
10,Raj,Kumar,1999,100,M,2000,10,100,HR
40,Yaja,Singh,2004,100,F,1000,20,100,HR
20,Sahul,Rajan,2002,200,F,3000,50,200,Supply
70,Tumar,Chand,2004,200,M,7000,70,200,Supply
30,Antonio,Manish,2010,300,,9000,30,300,Sales
50,Nama,Krish,2008,400,M,8000,40,400,Stock
60,Alfonso,Kutty,2014,400,M,7000,60,400,Stock


#### Ejemplo 2

In [0]:
# Same condition with the join type omitted; the saved output is identical to
# the explicit "inner" example, i.e. the default join type is inner.
df_join = df.join(
    df_dept,
    on=df["dept_id"] == df_dept["dept_id"],
)

display(df_join)

employee_id,first_name,last_name,doj,dept_id,gender,salary,member_id,dept_id.1,dept_name
10,Raj,Kumar,1999,100,M,2000,10,100,HR
40,Yaja,Singh,2004,100,F,1000,20,100,HR
20,Sahul,Rajan,2002,200,F,3000,50,200,Supply
70,Tumar,Chand,2004,200,M,7000,70,200,Supply
30,Antonio,Manish,2010,300,,9000,30,300,Sales
50,Nama,Krish,2008,400,M,8000,40,400,Stock
60,Alfonso,Kutty,2014,400,M,7000,60,400,Stock


#### Ejemplo 3

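Al indicar la columna en común por su nombre (como cadena de texto) en lugar de una condición de igualdad, el join devuelve un DataFrame en el que esa columna aparece una sola vez.
Para ello, la columna debe tener el mismo nombre en ambos DataFrames.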

In [0]:
# Join by column name: the shared dept_id column appears only once in the
# result (see the saved output header).
df_join = df.join(df_dept, on="dept_id")

display(df_join)

dept_id,employee_id,first_name,last_name,doj,gender,salary,member_id,dept_name
100,10,Raj,Kumar,1999,M,2000,10,HR
100,40,Yaja,Singh,2004,F,1000,20,HR
200,20,Sahul,Rajan,2002,F,3000,50,Supply
200,70,Tumar,Chand,2004,M,7000,70,Supply
300,30,Antonio,Manish,2010,,9000,30,Sales
400,50,Nama,Krish,2008,M,8000,40,Stock
400,60,Alfonso,Kutty,2014,M,7000,60,Stock


#### Ejemplo 4

In [0]:
# Keep every employee column plus dept_name only, so dept_id is not duplicated.
# The DataFrame is assigned first and printed in a separate step: show()
# prints the rows and returns None, so chaining it into the assignment left
# df_join as None and passed None to display().
df_join = (
    df.join(df_dept, df.dept_id == df_dept.dept_id)
      .select(df['*'], df_dept['dept_name'])
)

df_join.show(truncate=False)

+-----------+----------+---------+----+-------+------+------+---------+---------+
|employee_id|first_name|last_name|doj |dept_id|gender|salary|member_id|dept_name|
+-----------+----------+---------+----+-------+------+------+---------+---------+
|10         |Raj       |Kumar    |1999|100    |M     |2000  |10       |HR       |
|40         |Yaja      |Singh    |2004|100    |F     |1000  |20       |HR       |
|20         |Sahul     |Rajan    |2002|200    |F     |3000  |50       |Supply   |
|70         |Tumar     |Chand    |2004|200    |M     |7000  |70       |Supply   |
|30         |Antonio   |Manish   |2010|300    |null  |9000  |30       |Sales    |
|50         |Nama      |Krish    |2008|400    |M     |8000  |40       |Stock    |
|60         |Alfonso   |Kutty    |2014|400    |M     |7000  |60       |Stock    |
+-----------+----------+---------+----+-------+------+------+---------+---------+



#### Ejemplo 5

In [0]:
# Alias both sides so the projection can refer to them as 'a' and 'b'.
# The DataFrame is assigned first and printed in a separate step: show()
# prints the rows and returns None, so chaining it into the assignment left
# df_join as None and passed None to display().
df_join = (
    df.alias('a')
      .join(df_dept.alias('b'), df.dept_id == df_dept.dept_id)
      .select('a.*', 'b.dept_name')
)

df_join.show(truncate=False)

+-----------+----------+---------+----+-------+------+------+---------+---------+
|employee_id|first_name|last_name|doj |dept_id|gender|salary|member_id|dept_name|
+-----------+----------+---------+----+-------+------+------+---------+---------+
|10         |Raj       |Kumar    |1999|100    |M     |2000  |10       |HR       |
|40         |Yaja      |Singh    |2004|100    |F     |1000  |20       |HR       |
|20         |Sahul     |Rajan    |2002|200    |F     |3000  |50       |Supply   |
|70         |Tumar     |Chand    |2004|200    |M     |7000  |70       |Supply   |
|30         |Antonio   |Manish   |2010|300    |null  |9000  |30       |Sales    |
|50         |Nama      |Krish    |2008|400    |M     |8000  |40       |Stock    |
|60         |Alfonso   |Kutty    |2014|400    |M     |7000  |60       |Stock    |
+-----------+----------+---------+----+-------+------+------+---------+---------+



#### Ejemplo 6

In [0]:
# Count employees per department, grouping by the column name.
# The DataFrame is assigned first and printed in a separate step: show()
# prints the rows and returns None, so chaining it into the assignment left
# df_join as None and passed None to display().
df_join = (
    df.join(df_dept, df.dept_id == df_dept.dept_id)
      .groupBy('dept_name')
      .count()
)

df_join.show(truncate=False)

+---------+-----+
|dept_name|count|
+---------+-----+
|Sales    |1    |
|Stock    |2    |
|HR       |2    |
|Supply   |2    |
+---------+-----+



In [0]:
# Count employees per department, grouping by a Column object taken from df_dept.
# The DataFrame is assigned first and printed in a separate step: show()
# prints the rows and returns None, so chaining it into the assignment left
# df_join as None and passed None to display().
df_join = (
    df.join(df_dept, df.dept_id == df_dept.dept_id)
      .groupBy(df_dept['dept_name'])
      .count()
)

df_join.show(truncate=False)

+---------+-----+
|dept_name|count|
+---------+-----+
|Sales    |1    |
|Stock    |2    |
|HR       |2    |
|Supply   |2    |
+---------+-----+



In [0]:
# Count employees per department, grouping by the alias-qualified column name.
# The DataFrame is assigned first and printed in a separate step: show()
# prints the rows and returns None, so chaining it into the assignment left
# df_join as None and passed None to display().
df_join = (
    df.alias('a')
      .join(df_dept.alias('b'), df.dept_id == df_dept.dept_id)
      .groupBy('b.dept_name')
      .count()
)

df_join.show(truncate=False)

+---------+-----+
|dept_name|count|
+---------+-----+
|Sales    |1    |
|Stock    |2    |
|HR       |2    |
|Supply   |2    |
+---------+-----+



In [0]:
# Count employees per department after joining on the shared column name.
# The DataFrame is assigned first and printed in a separate step: show()
# prints the rows and returns None, so chaining it into the assignment left
# df_join as None and passed None to display().
df_join = (
    df.join(df_dept, 'dept_id')
      .groupBy('dept_name')
      .count()
)

df_join.show(truncate=False)

+---------+-----+
|dept_name|count|
+---------+-----+
|Sales    |1    |
|Stock    |2    |
|HR       |2    |
|Supply   |2    |
+---------+-----+

