In [2]:
# Column names for the employee rows below, in tuple order.
schema1 = [
    "empno", "ename", "job", "mgr",
    "hiredate", "sal", "comm", "deptno",
]

# Employee rows. None marks a missing value (no manager / no commission).
# hiredate is kept as a plain DD-MON-YYYY string.
data1 = [
    # empno, ename,   job,         mgr,  hiredate,      sal,  comm, deptno
    (7369, "SMITH",  "CLERK",     7902, "17-DEC-2005",  800, None, 20),
    (7499, "ALLEN",  "SALESMAN",  7698, "20-FEB-2006", 1600,  300, 30),
    (7521, "WARD",   "SALESMAN",  7698, "22-FEB-2006", 1250,  500, 30),
    (7566, "JONES",  "MANAGER",   7839, "02-APR-2006", 2975, None, 20),
    (7654, "MARTIN", "SALESMAN",  7698, "28-SEP-2006", 1250, 1400, 30),
    (7698, "BLAKE",  "MANAGER",   7839, "01-MAY-2006", 2850, None, 30),
    (7782, "CLARK",  "MANAGER",   7839, "09-JUN-2006", 2450, None, 10),
    (7788, "SCOTT",  "ANALYST",   7566, "09-DEC-2007", 3000, None, 20),
    (7839, "KING",   "PRESIDENT", None, "17-NOV-2006", 5000, None, 10),
    (7844, "Turner", "SALESMAN",  7698, "08-SEP-2006", 1500,    0, 30),
    (7876, "ADAMS",  "CLERK",     7788, "12-JAN-2008", 1100, None, 20),
    (7900, "JAMES",  "CLERK",     7698, "03-DEC-2006",  950, None, 30),
    (7902, "FORD",   "ANALYST",   7566, "03-DEC-2006", 3000, None, 20),
    (7934, "MILLER", "CLERK",     7782, "23-JAN-2007", 1300, None, 10),
]
        

In [3]:
# Column names for the department rows below, in tuple order.
schema2 = ["deptno", "dname", "loc"]

# Department rows: (deptno, dname, loc).
data2 = [
    (10, "ACCOUNTING", "NEW YORK"),
    (20, "RESEARCH", "DALLAS"),
    (30, "SALES", "CHICAGO"),
    (40, "OPERATIONS", "BOSTON"),
]

In [25]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
# Obtain the Spark session through the builder's getOrCreate().
spark = SparkSession.builder.getOrCreate()

# Turn the in-memory row lists into DataFrames, naming the columns from the
# matching schema lists.
emp_df = spark.createDataFrame(data1, schema1)
dept_df = spark.createDataFrame(data2, schema2)

In [7]:
# Register both DataFrames as temporary views so they can be queried by name
# through spark.sql().
for view_name, frame in (("emp", emp_df), ("dept", dept_df)):
    frame.createOrReplaceTempView(view_name)

In [9]:
# Query the 'dept' view through SQL and print every row.
all_departments = spark.sql("select * from dept")
all_departments.show()

+------+----------+--------+
|deptno|     dname|     loc|
+------+----------+--------+
|    10|ACCOUNTING|NEW YORK|
|    20|  RESEARCH|  DALLAS|
|    30|     SALES| CHICAGO|
|    40|OPERATIONS|  BOSTON|
+------+----------+--------+



In [10]:
# Collect the department number of every employee row onto the driver.
#
# NULL deptno values are dropped before collecting: this list is later fed to
# `~Column.isin(lst)`, and under SQL three-valued logic a NULL inside the IN
# list makes the negated predicate evaluate to NULL (not True) for every
# non-matching row, so the "departments without employees" filter would
# silently return nothing. With no NULLs in emp.deptno the list is unchanged.
lst = [
    row[0]
    for row in emp_df.select('deptno').where(emp_df['deptno'].isNotNull()).collect()
]

In [11]:
# Bare expression: the notebook echoes the current contents of lst as this cell's output.
lst

[20, 30, 30, 20, 30, 30, 10, 20, 10, 30, 20, 30, 20, 10]

In [13]:
# Department numbers that do not occur in lst: keep the rows whose deptno is
# NOT IN the list, then project just the deptno column.
dept_df.filter(~dept_df['deptno'].isin(lst)).select('deptno').show()

+------+
|deptno|
+------+
|    40|
+------+



In [14]:
# Same NOT IN test against lst as a named predicate, this time showing every
# department column for the rows that pass.
deptno_not_listed = ~dept_df['deptno'].isin(lst)
dept_df.select('*').filter(deptno_not_listed).show()

+------+----------+------+
|deptno|     dname|   loc|
+------+----------+------+
|    40|OPERATIONS|BOSTON|
+------+----------+------+



In [12]:
# SQL version: left outer join from dept to emp on deptno. Departments with no
# matching employee still appear, with NULL in the employee-side deptno column.
dept_emp_query = '''select d.*, e.deptno 
              from dept d left outer join emp e
              on (d.deptno = e.deptno) '''
spark.sql(dept_emp_query).show()
              

+------+----------+--------+------+
|deptno|     dname|     loc|deptno|
+------+----------+--------+------+
|    10|ACCOUNTING|NEW YORK|    10|
|    10|ACCOUNTING|NEW YORK|    10|
|    10|ACCOUNTING|NEW YORK|    10|
|    20|  RESEARCH|  DALLAS|    20|
|    20|  RESEARCH|  DALLAS|    20|
|    20|  RESEARCH|  DALLAS|    20|
|    20|  RESEARCH|  DALLAS|    20|
|    20|  RESEARCH|  DALLAS|    20|
|    30|     SALES| CHICAGO|    30|
|    30|     SALES| CHICAGO|    30|
|    30|     SALES| CHICAGO|    30|
|    30|     SALES| CHICAGO|    30|
|    30|     SALES| CHICAGO|    30|
|    30|     SALES| CHICAGO|    30|
|    40|OPERATIONS|  BOSTON|  NULL|
+------+----------+--------+------+



In [28]:
# DataFrame-API version of the left outer join: every department row is kept,
# and the employee-side deptno is NULL where no employee matched.
deptno_matches = dept_df.deptno == emp_df.deptno
df = (
    dept_df
    .join(emp_df, on=deptno_matches, how="leftouter")
    .select(dept_df.dname, dept_df.loc, emp_df.deptno)
)

In [31]:
# Print the full join result first, then only the rows whose employee-side
# deptno is NULL.
for frame in (df, df.filter("deptno is NULL")):
    frame.show()

+----------+--------+------+
|     dname|     loc|deptno|
+----------+--------+------+
|ACCOUNTING|NEW YORK|    10|
|ACCOUNTING|NEW YORK|    10|
|ACCOUNTING|NEW YORK|    10|
|  RESEARCH|  DALLAS|    20|
|  RESEARCH|  DALLAS|    20|
|  RESEARCH|  DALLAS|    20|
|  RESEARCH|  DALLAS|    20|
|  RESEARCH|  DALLAS|    20|
|     SALES| CHICAGO|    30|
|     SALES| CHICAGO|    30|
|     SALES| CHICAGO|    30|
|     SALES| CHICAGO|    30|
|     SALES| CHICAGO|    30|
|     SALES| CHICAGO|    30|
|OPERATIONS|  BOSTON|  NULL|
+----------+--------+------+

+----------+------+------+
|     dname|   loc|deptno|
+----------+------+------+
|OPERATIONS|BOSTON|  NULL|
+----------+------+------+

