# Joins in `pyspark`

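Joins are performed with `df_left.join(df_right, on=join_condition, how=type_str)`, where `on` is the join condition (or the name of a shared column) and `type_str` is one of `'inner'`, `'left'`, `'right'`, or `'outer'`.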

In [1]:
from pyspark.sql import SparkSession
from more_pyspark import to_pandas
# Start (or reuse) the Spark session that the cells below read data with.
spark = (SparkSession.builder
         .appName('Ops')
         .getOrCreate())

# Load the department table: first row is the header, column types are inferred.
deptk = spark.read.csv(
    "./data/department.csv",
    inferSchema=True,
    header=True,
)

# Collect the rows to the driver and render them as a pandas table.
deptk.collect() >> to_pandas

Unnamed: 0,DeptID,DeptName
0,31,Sales
1,33,Engineering
2,34,Clerical
3,35,Marketing


In [2]:
# Load the employee table: first row is the header, column types are inferred.
emplk = spark.read.csv(
    "./data/employee.csv",
    inferSchema=True,
    header=True,
)

# Collect the rows to the driver and render them as a pandas table.
emplk.collect() >> to_pandas

Unnamed: 0,DeptID,LastName
0,31.0,Rafferty
1,33.0,Jones
2,33.0,Heisenberg
3,34.0,Robinson
4,34.0,Smith
5,,Williams


#### Inner join

In [3]:
# Inner join: keep only employees whose DeptID matches a department.
# Joining on the shared column *name* (on="DeptID") instead of the expression
# `emplk.DeptID == deptk.DeptID` gives a result with a single `DeptID` column;
# the expression form carries both frames' `DeptID` columns under one name.
(emplk.join(deptk, on="DeptID", how='inner')
 .collect()) >> to_pandas

Unnamed: 0,DeptID,LastName,DeptName
0,31,Rafferty,Sales
1,33,Jones,Engineering
2,33,Heisenberg,Engineering
3,34,Robinson,Clerical
4,34,Smith,Clerical


#### Left join

In [4]:
# Left join: keep every employee row; department columns are null when the
# employee's DeptID has no match in `deptk`.
# Joining on the shared column *name* (on="DeptID") instead of the expression
# `emplk.DeptID == deptk.DeptID` gives a result with a single `DeptID` column;
# the expression form carries both frames' `DeptID` columns under one name.
(emplk.join(deptk, on="DeptID", how='left')
 .collect()) >> to_pandas

Unnamed: 0,DeptID,LastName,DeptName
0,31.0,Rafferty,Sales
1,33.0,Jones,Engineering
2,33.0,Heisenberg,Engineering
3,34.0,Robinson,Clerical
4,34.0,Smith,Clerical
5,,Williams,


#### Right join

In [5]:
# Right join: keep every department row; employee columns are null when no
# employee has that DeptID.
# Joining on the shared column *name* (on="DeptID") instead of the expression
# `emplk.DeptID == deptk.DeptID` gives a result with a single `DeptID` column;
# the expression form carries both frames' `DeptID` columns under one name.
(emplk.join(deptk, on="DeptID", how='right')
 .collect()) >> to_pandas

Unnamed: 0,DeptID,LastName,DeptName
0,31,Rafferty,Sales
1,33,Heisenberg,Engineering
2,33,Jones,Engineering
3,34,Smith,Clerical
4,34,Robinson,Clerical
5,35,,Marketing


#### Outer join

In [6]:
# Outer join: keep every row from both tables, filling the other side's
# columns with nulls where there is no matching DeptID.
# Joining on the shared column *name* (on="DeptID") instead of the expression
# `emplk.DeptID == deptk.DeptID` gives a result with a single `DeptID` column;
# the expression form carries both frames' `DeptID` columns under one name.
(emplk.join(deptk, on="DeptID", how='outer')
 .collect()) >> to_pandas

Unnamed: 0,DeptID,LastName,DeptName
0,,Williams,
1,35.0,,Marketing
2,34.0,Robinson,Clerical
3,34.0,Smith,Clerical
4,31.0,Rafferty,Sales
5,33.0,Jones,Engineering
6,33.0,Heisenberg,Engineering


## <font color="red"> Exercise 2 </font>

Determine all the players that have hit at least 50 home runs in a season.  The final table should include the player's proper name, as well as the team name.

**Hint:** You will need to join the files listed below.  To get credit for this exercise, use the `pyspark` join methods presented above.

In [11]:
# CSV files used in this exercise, in the order: batting, people, teams.
files = (
    "./data/baseball/core/Batting.csv",
    "./data/baseball/core/People.csv",
    "./data/baseball/core/Teams.csv",
)

In [None]:
# Read every path in `files` (header row, inferred column types) and unpack
# the resulting DataFrames in the same order as the paths.
battingk, peoplek, teamsk = (
    spark.read.csv(csv_path, inferSchema=True, header=True)
    for csv_path in files
)

In [21]:
# Keep only batting rows with an HR value of at least 50 (the threshold is
# inclusive), then narrow to the columns used by the joins further down.
batting_reducedk = (battingk
                    .where(battingk.HR >= 50)
                    .select('playerID', 'yearID', 'HR', 'teamID'))


In [24]:
# Keep just the player key and the `nameGiven` column from the people table.
people_reducedk = peoplek.select(['playerID', 'nameGiven'])


In [26]:
# Keep the (teamID, yearID) key plus the `name` column from the teams table.
teams_reducedk = teamsk.select(['teamID', 'yearID', 'name'])


In [36]:
# Attach the player's name and the team's name to each selected batting row.
# The joins use column *names* rather than `left.col == right.col` expressions:
# the expression form keeps both sides' playerID / teamID / yearID columns in
# the result under identical names, while name-based joins keep one copy.
# Left joins keep every batting row even when People/Teams has no match.
HR_over50 = (batting_reducedk
             .join(people_reducedk, on='playerID', how='left')
             .join(teams_reducedk, on=['teamID', 'yearID'], how='left')
             # Name-based joins place the key columns first, so restore the
             # intended column order explicitly.
             .select('playerID', 'yearID', 'HR', 'teamID', 'nameGiven', 'name'))
HR_over50.collect() >> to_pandas

Unnamed: 0,playerID,yearID,HR,teamID,nameGiven,name
0,ruthba01,1920,54,NYA,George Herman,New York Yankees
1,ruthba01,1921,59,NYA,George Herman,New York Yankees
2,ruthba01,1927,60,NYA,George Herman,New York Yankees
3,ruthba01,1928,54,NYA,George Herman,New York Yankees
4,wilsoha01,1930,56,CHN,Lewis Robert,Chicago Cubs
5,foxxji01,1932,58,PHA,James Emory,Philadelphia Athletics
6,foxxji01,1938,50,BOS,James Emory,Boston Red Sox
7,greenha01,1938,58,DET,Henry Benjamin,Detroit Tigers
8,kinerra01,1947,51,PIT,Ralph McPherran,Pittsburgh Pirates
9,mizejo01,1947,51,NY1,John Robert,New York Giants


## Up Next

Stuff