# Pandas SQL : Joins & Sub-Queries

Inner Join

Outer Join: Left, Right, Full 

Cross Join 

Self Join

Correlated Sub-Query 

In [1]:
import numpy as np
import pandas as pd
import pandasql as ps

### Example : Inner, Outer, Cross Joins

Unique Keys : One-to-One

In [2]:
# Customer table: nine customers, one row per CustID (1-9).
customerid   = [1,2,3,4,5,6,7,8,9]
customername = ["Jone","David","Alice","Bobby","Lisa","Anne","Adam","Nery","Joe"]
df_customer  = pd.DataFrame({"CustID": customerid, "CustName": customername})

# Purchase table: PurchaseID is the column later joined against CustID.
# Only 1, 3 and 5 have a matching customer; 11, 19 and 27 do not.
purchaseid = [1,3,5,11,19,27]
productid  = [65,80,35,80,55,35]
df_product = pd.DataFrame({"PurchaseID": purchaseid, "ProdID": productid})

# This section demonstrates one-to-one joins, so both join keys must be unique.
assert df_customer["CustID"].is_unique
assert df_product["PurchaseID"].is_unique

 


In [3]:
# Size of the customer table as a (rows, columns) tuple.
df_customer.shape

(9, 2)

In [4]:
# Size of the purchase table as a (rows, columns) tuple.
df_product.shape

(6, 2)

In [5]:
# LEFT JOIN: every row of df_customer is kept and ProdID is attached from
# df_product where CustID equals PurchaseID. Customers without a matching
# purchase get a NULL ProdID, which shows up as NaN in the printed frame
# (and forces the ProdID column to float).
q = """
    SELECT 
    c.*, 
    p.ProdID
    FROM df_customer AS c
    LEFT JOIN
    df_product as p
    ON c.CustID = p.PurchaseID  
    """
# Run the SQL text through pandasql; the result comes back as a DataFrame.
a = ps.sqldf(q)
print(a)

   CustID CustName  ProdID
0       1     Jone    65.0
1       2    David     NaN
2       3    Alice    80.0
3       4    Bobby     NaN
4       5     Lisa    35.0
5       6     Anne     NaN
6       7     Adam     NaN
7       8     Nery     NaN
8       9      Joe     NaN


In [6]:
# LEFT JOIN followed by a filter that keeps only matched rows, i.e. the same
# result an INNER JOIN would give.
# The filter tests the right-hand join key (p.PurchaseID) rather than the
# payload column (p.ProdID): the key is non-NULL exactly when a match was
# found, whereas a matched purchase could legitimately carry a NULL ProdID
# and would otherwise be dropped by mistake.
q = """
    SELECT
    c.*,
    p.ProdID
    FROM df_customer AS c
    LEFT JOIN
    df_product AS p
    ON c.CustID = p.PurchaseID
    WHERE p.PurchaseID IS NOT NULL
    """
a = ps.sqldf(q)
print(a)

   CustID CustName  ProdID
0       1     Jone      65
1       3    Alice      80
2       5     Lisa      35


Duplicate Keys: One-to-Many

In [7]:
# NOTE: this cell rebinds df_customer and df_product; every query below runs
# against these one-to-many tables, not the one-to-one tables defined earlier.

# Customer table: five customers, one row per CustID (1-5).
customerid   = [1,2,3,4,5]
customername = ["Jone","David","Alice","Bobby","Lisa"]
df_customer  = pd.DataFrame({"CustID": customerid, "CustName": customername})

# Purchase table with repeated keys: PurchaseID 3 appears three times and 4
# twice, so one customer can match several purchases. 15 and 27 match nobody.
purchaseid = [1,3,15,4,3,3,4,27]
productid  = [65,80,35,80,55,35,40,80]
df_product = pd.DataFrame({"PurchaseID": purchaseid, "ProdID": productid})

 


In [8]:
# LEFT JOIN on a one-to-many key: a customer row is repeated once per matching
# purchase (CustID 3 appears three times, CustID 4 twice in the output), and
# customers with no purchase still appear once with a NULL/NaN ProdID.
q = """
    SELECT c.*, p.ProdID
    FROM df_customer AS c
    LEFT JOIN
    df_product as p
    ON c.CustID = p.PurchaseID  
    """
a = ps.sqldf(q)
print(a)

   CustID CustName  ProdID
0       1     Jone    65.0
1       2    David     NaN
2       3    Alice    35.0
3       3    Alice    55.0
4       3    Alice    80.0
5       4    Bobby    40.0
6       4    Bobby    80.0
7       5     Lisa     NaN


In [9]:
# INNER JOIN on the same one-to-many key: only customer/purchase pairs whose
# CustID equals PurchaseID are returned, so customers without purchases
# (CustID 2 and 5) are absent and ProdID stays an integer column.
q = """
    SELECT c.*, p.ProdID
    FROM df_customer AS c
    INNER JOIN
    df_product as p
    ON c.CustID = p.PurchaseID  
    """
a = ps.sqldf(q)
print(a)

   CustID CustName  ProdID
0       1     Jone      65
1       3    Alice      35
2       3    Alice      55
3       3    Alice      80
4       4    Bobby      40
5       4    Bobby      80


In [10]:
# Cross join: pair every customer with every purchase by merging on a
# constant helper column, giving len(df_customer) * len(df_product) rows.
#
# The helper column is added with a scalar via assign(), which broadcasts to
# every row regardless of the frame's index. Assigning a freshly built
# pd.Series instead would align on index labels and leave NaN keys (and thus
# silently missing pairs) for any frame without a default 0..n-1 index.
# assign() returns a new frame, so the source tables are not modified.
# On pandas >= 1.2, `merge(..., how="cross")` does this without a helper column.
tab1 = df_customer.assign(key=1.0)
tab2 = df_product.assign(key=1.0)
a = tab1.merge(tab2, how="inner", on="key")

In [11]:
# Release the two scratch frames used to build the cross join.
del tab1
del tab2

### Example : Anti-Join

In [12]:
"""Left-Anti Join"""
# Left anti-join: customers that have NO matching purchase. The LEFT JOIN
# keeps every customer, and the WHERE clause then keeps only the rows where
# the right-hand join key is NULL, i.e. where no purchase matched.
# p.ProdID is therefore NULL on every returned row (printed as None).
q = """
    SELECT 
    c.*, 
    p.ProdID
    FROM df_customer AS c
    LEFT JOIN
    df_product as p
    ON c.CustID = p.PurchaseID
    WHERE p.PurchaseID IS NULL
    """
a = ps.sqldf(q)
print(a)

   CustID CustName ProdID
0       2    David   None
1       5     Lisa   None


In [13]:
"""Right-Anti Join"""
# Right anti-join: purchases that have NO matching customer. It is written as
# a LEFT JOIN with the tables swapped (df_product on the left), keeping only
# rows where the customer key is NULL. PurchaseID is reported under the name
# CustID so the columns line up with the left anti-join; CustName is NULL on
# every returned row (printed as None).
q = """
    SELECT 
    p.PurchaseID as CustID, 
    c.CustName, p.ProdID
    FROM df_product as p 
    LEFT JOIN
    df_customer AS c
    ON c.CustID = p.PurchaseID
    WHERE c.CustID IS NULL
    """
a = ps.sqldf(q)
print(a)

   CustID CustName  ProdID
0      15     None      35
1      27     None      80


In [14]:
# Full anti-join: rows from either table that have no partner in the other.
# It is the left anti-join (customers without purchases) stacked on top of
# the right anti-join (purchases without customers).
#
# UNION ALL is used instead of UNION: UNION removes duplicate rows, which
# would silently collapse two unmatched purchases that share the same
# PurchaseID and ProdID. A join must not de-duplicate its input.
# The first branch lists its columns explicitly (rather than c.*) so the
# positional column alignment between the two branches is visible, and the
# final ORDER BY makes the row order deterministic.
q = """
    SELECT c.CustID, c.CustName, p.ProdID
    FROM df_customer AS c
    LEFT JOIN df_product AS p
    ON c.CustID = p.PurchaseID
    WHERE p.PurchaseID IS NULL
    UNION ALL
    SELECT p.PurchaseID AS CustID, c.CustName, p.ProdID
    FROM df_product AS p
    LEFT JOIN df_customer AS c
    ON c.CustID = p.PurchaseID
    WHERE c.CustID IS NULL
    ORDER BY CustID
    """
a = ps.sqldf(q)
print(a)

   CustID CustName  ProdID
0       2    David     NaN
1       5     Lisa     NaN
2      15     None    35.0
3      27     None    80.0


### Example : Self-Join

In [15]:
# Employee roster used by the self-join and sub-query examples.
# ManagerID points at another employee's EmpID; None marks an employee
# without a manager.
empid = list(range(1, 11))
empname = [
    "Jone", "David", "Alice", "Bobby", "Lisa",
    "Anne", "Romero", "Adam", "Nery", "Joe",
]
empsalary = [
    4765, 8100, 5635, 3315, 6000,
    7495, 6985, 3870, 5150, 6370,
]
managerid = [
    8, None, 8, 7, 8,
    10, None, 10, 10, 7,
]

# Column order is fixed by the keyword order below.
tab = pd.DataFrame(
    dict(
        EmpID=empid,
        EmpName=empname,
        ManagerID=managerid,
        EmpSalary=empsalary,
    )
)

In [16]:
# Show the whole employee table. ManagerID prints as a float (8.0) with NaN
# for the two employees whose manager was given as None.
print(tab)

   EmpID EmpName  ManagerID  EmpSalary
0      1    Jone        8.0       4765
1      2   David        NaN       8100
2      3   Alice        8.0       5635
3      4   Bobby        7.0       3315
4      5    Lisa        8.0       6000
5      6    Anne       10.0       7495
6      7  Romero        NaN       6985
7      8    Adam       10.0       3870
8      9    Nery       10.0       5150
9     10     Joe        7.0       6370


For Each Manager, Find the Average Salary of Their Direct Reports

In [17]:
# Derived table f1: one row per non-NULL ManagerID holding the average
# EmpSalary of the employees who report to that manager. Both the key and the
# average are CAST to INT (the average is truncated, e.g. 5466.67 -> 5466).
# The LEFT JOIN then attaches that average to every employee row; employees
# with no manager keep a NULL AvgSalary (NaN in the output).
# ManagerID is CAST on both sides because the DataFrame column is float
# (it contains missing values).
q = """
    SELECT tab.*, f1.AvgSalary
    FROM tab
    LEFT JOIN
    (SELECT 
    CAST(ManagerID AS INT) ManagerID, 
    CAST(avg(EmpSalary) AS INT) as AvgSalary
    FROM tab
    WHERE ManagerID IS NOT NULL
    GROUP BY ManagerID) AS f1
    ON CAST(tab.managerID AS INT) = f1.ManagerID 
    """
a = ps.sqldf(q)
print(a)

   EmpID EmpName  ManagerID  EmpSalary  AvgSalary
0      1    Jone        8.0       4765     5466.0
1      2   David        NaN       8100        NaN
2      3   Alice        8.0       5635     5466.0
3      4   Bobby        7.0       3315     4842.0
4      5    Lisa        8.0       6000     5466.0
5      6    Anne       10.0       7495     5505.0
6      7  Romero        NaN       6985        NaN
7      8    Adam       10.0       3870     5505.0
8      9    Nery       10.0       5150     5505.0
9     10     Joe        7.0       6370     4842.0


For Each Manager, Find the Direct Reports Who Earn Below Their Team's Average Salary

In [18]:
# Employees who earn less than the average salary of their manager's team.
#
# Derived table f1 holds the exact (untruncated) average per manager. The
# comparison in WHERE uses that exact value; the average is CAST to INT only
# for display. Comparing against the truncated average would miss an employee
# whose salary equals the integer part of the average (5466 vs. 5466.67) and
# could disagree with the correlated sub-query version of this question.
# The WHERE clause also discards employees without a manager, because their
# f1.AvgSalary is NULL. ORDER BY makes the row order deterministic.
q = """
    SELECT
    tab.*,
    CAST(f1.AvgSalary AS INT) AS AvgSalary
    FROM tab
    LEFT JOIN
    (SELECT
    CAST(ManagerID AS INT) AS ManagerID,
    AVG(EmpSalary) AS AvgSalary
    FROM tab
    WHERE ManagerID IS NOT NULL
    GROUP BY ManagerID) AS f1
    ON CAST(tab.ManagerID AS INT) = f1.ManagerID
    WHERE tab.EmpSalary < f1.AvgSalary
    ORDER BY tab.EmpID
    """
a = ps.sqldf(q)
print(a)

   EmpID EmpName  ManagerID  EmpSalary  AvgSalary
0      1    Jone        8.0       4765       5466
1      4   Bobby        7.0       3315       4842
2      8    Adam       10.0       3870       5505
3      9    Nery       10.0       5150       5505


For Each Employee, Find the Corresponding Manager's Name & Salary

In [19]:
# Self-join: the employee table is joined to itself, once as the employee
# side (e) and once as the manager side (m), matching e.ManagerID to m.EmpID.
# A separate sub-query listing the distinct manager IDs is not needed: the
# join condition already restricts m to rows that are somebody's manager.
# LEFT JOIN keeps employees without a manager, with NULL manager columns.
# e.ManagerID is CAST to INT because the DataFrame column is float
# (it contains missing values); ORDER BY fixes the row order.
q = """
    SELECT
    e.EmpID, e.EmpName,
    e.ManagerID, m.EmpName AS ManagerName,
    e.EmpSalary, m.EmpSalary AS ManagerSalary
    FROM tab AS e
    LEFT JOIN tab AS m
    ON CAST(e.ManagerID AS INT) = m.EmpID
    ORDER BY e.EmpID
    """
a = ps.sqldf(q)
print(a)

   EmpID EmpName  ManagerID ManagerName  EmpSalary  ManagerSalary
0      1    Jone        8.0        Adam       4765         3870.0
1      2   David        NaN        None       8100            NaN
2      3   Alice        8.0        Adam       5635         3870.0
3      4   Bobby        7.0      Romero       3315         6985.0
4      5    Lisa        8.0        Adam       6000         3870.0
5      6    Anne       10.0         Joe       7495         6370.0
6      7  Romero        NaN        None       6985            NaN
7      8    Adam       10.0         Joe       3870         6370.0
8      9    Nery       10.0         Joe       5150         6370.0
9     10     Joe        7.0      Romero       6370         6985.0


Find Employees who Earn more than their Managers

In [20]:
"""Using INNER JOIN"""
# Built inside-out:
#   f1 - the distinct, non-NULL manager IDs (CAST to INT as tmpID)
#   f2 - employee rows that are managers (tab INNER JOIN f1 on EmpID = tmpID),
#        exposed as ManagerID / ManagerSalary
#   f3 - every employee with the manager's salary attached (tab LEFT JOIN f2)
# The outer WHERE keeps employees whose salary exceeds their manager's;
# rows with a NULL ManagerSalary fail the comparison and drop out.
q = """
    SELECT * FROM 
    (SELECT 
    tab.*, 
    f2.ManagerSalary
    FROM tab 
    LEFT JOIN
    (SELECT EmpID as ManagerID, EmpSalary as ManagerSalary
    FROM tab
    INNER JOIN
    (SELECT DISTINCT CAST(ManagerID as INT) AS tmpID
    FROM tab
    WHERE ManagerID IS NOT NULL) AS f1
    ON tab.EmpID = f1.tmpID) AS f2
    ON CAST(tab.ManagerID as INT) = f2.ManagerID) AS f3
    WHERE f3.EmpSalary > f3.ManagerSalary
    """
a = ps.sqldf(q)
print(a)

   EmpID EmpName  ManagerID  EmpSalary  ManagerSalary
0      1    Jone        8.0       4765           3870
1      3   Alice        8.0       5635           3870
2      5    Lisa        8.0       6000           3870
3      6    Anne       10.0       7495           6370


In [21]:
"""Using IN Operator"""
# Same question as the INNER JOIN version, but the manager rows (f1) are
# selected with `EmpID IN (sub-query of distinct non-NULL manager IDs)`
# instead of an inner join. f2 attaches the manager's salary to every
# employee, and the outer WHERE keeps those who earn more than their manager.
q = """
    SELECT * FROM 
    (SELECT 
    tab.*, 
    f1.ManagerSalary
    FROM tab 
    LEFT JOIN
    (SELECT EmpID as ManagerID, EmpSalary as ManagerSalary FROM tab
    WHERE EmpID IN 
    (SELECT DISTINCT CAST(ManagerID as INT) FROM tab
    WHERE ManagerID IS NOT NULL)) AS f1
    ON CAST(tab.ManagerID as INT) = f1.ManagerID) AS f2
    WHERE f2.EmpSalary > f2.ManagerSalary
    """
a = ps.sqldf(q)
print(a)

   EmpID EmpName  ManagerID  EmpSalary  ManagerSalary
0      1    Jone        8.0       4765           3870
1      3   Alice        8.0       5635           3870
2      5    Lisa        8.0       6000           3870
3      6    Anne       10.0       7495           6370


In [22]:
"""Using CTE"""
# Same question again, expressed with common table expressions:
#   get_distinct_id        - distinct, non-NULL manager IDs (CAST to INT)
#   get_distinct_id_salary - EVERY employee row renamed to manager_* columns
#                            (despite the name, it is not yet restricted to
#                            managers)
#   get_manager_salary     - the rows of the previous CTE whose manager_id is
#                            in get_distinct_id, i.e. the actual managers
# The final SELECT attaches the manager's salary to each employee and keeps
# those who earn more; the WHERE comparison drops rows with a NULL
# manager_salary, so the LEFT JOIN behaves like an inner join here.
q = """
    WITH 
    get_distinct_id AS
    (SELECT 
    DISTINCT CAST(ManagerID as INT) AS distinct_id 
    FROM tab
    WHERE ManagerID IS NOT NULL), 
    get_distinct_id_salary AS
    (SELECT 
    EmpID as manager_id,
    EmpName as manager_name,
    EmpSalary as manager_salary 
    FROM tab),
    get_manager_salary AS
    (SELECT a.manager_id, a.manager_name, a.manager_salary
    FROM get_distinct_id_salary a
    INNER JOIN get_distinct_id b
    ON a.manager_id = b.distinct_id)
    SELECT 
    c.*,
    d.manager_salary
    FROM tab c
    LEFT JOIN get_manager_salary d
    ON CAST(c.ManagerID AS INT) = d.manager_id
    WHERE c.EmpSalary > d.manager_salary
    """
a = ps.sqldf(q)
print(a)

   EmpID EmpName  ManagerID  EmpSalary  manager_salary
0      1    Jone        8.0       4765            3870
1      3   Alice        8.0       5635            3870
2      5    Lisa        8.0       6000            3870
3      6    Anne       10.0       7495            6370


### Correlated Sub-Query

In [23]:
# Correlated sub-query with the '=' operator: list the employees who are
# somebody's manager.
#
# For each outer row o, the scalar sub-query returns o.EmpID if at least one
# employee reports to o (DISTINCT keeps it to a single row), and no row (NULL)
# otherwise, so `o.EmpID = (...)` is true only for managers.
# The correlation predicate repeats the CAST expression instead of referring
# to a SELECT alias: using an alias inside WHERE is a SQLite extension and is
# rejected by most other SQL engines. Every column is qualified with its table
# alias because the outer query also exposes a column named ManagerID.
q = """
    SELECT
    o.EmpID AS ManagerID,
    o.EmpName AS ManagerName,
    o.EmpSalary AS ManagerSalary
    FROM tab AS o
    WHERE o.EmpID =
    (SELECT DISTINCT CAST(i.ManagerID AS INT)
    FROM tab AS i
    WHERE CAST(i.ManagerID AS INT) = o.EmpID)
    """
a = ps.sqldf(q)
print(a)

   ManagerID ManagerName  ManagerSalary
0          7      Romero           6985
1          8        Adam           3870
2         10         Joe           6370


In [24]:
# Correlated sub-query with EXISTS: list the employees who are somebody's
# manager.
#
# EXISTS only tests whether the sub-query returns at least one row, so the
# sub-query projects a constant (SELECT 1); a DISTINCT column list would be
# computed and then ignored. The correlation predicate uses the CAST
# expression directly rather than a SELECT alias, since referring to an alias
# inside WHERE is a SQLite extension that most other SQL engines reject.
q = """
    SELECT
    o.EmpID AS ManagerID,
    o.EmpName AS ManagerName,
    o.EmpSalary AS ManagerSalary
    FROM tab AS o
    WHERE EXISTS
    (SELECT 1
    FROM tab AS i
    WHERE CAST(i.ManagerID AS INT) = o.EmpID)
    """
a = ps.sqldf(q)
print(a)

   ManagerID ManagerName  ManagerSalary
0          7      Romero           6985
1          8        Adam           3870
2         10         Joe           6370


In [25]:
# Correlated sub-query: employees who earn less than the average salary of
# the employees sharing their manager. For each outer row o, the inner SELECT
# averages EmpSalary over the rows i whose ManagerID equals o's ManagerID.
# The comparison uses that AVG value as returned (it is not CAST to INT).
# Employees without a manager never match: the inner query finds no rows, so
# the average is NULL and the comparison is not true.
q = """
    SELECT 
    o.EmpID as EmpID, 
    o.EmpName as EmpName,
    o.ManagerID as ManagerID,
    o.EmpSalary as EmpSalary 
    FROM tab o
    WHERE EmpSalary < 
    (SELECT AVG(EmpSalary) FROM tab i 
    WHERE i.ManagerID IS NOT NULL 
    AND CAST(i.ManagerID AS INT) = CAST(o.ManagerID AS INT))
    """
a = ps.sqldf(q)
print(a)

   EmpID EmpName  ManagerID  EmpSalary
0      1    Jone        8.0       4765
1      4   Bobby        7.0       3315
2      8    Adam       10.0       3870
3      9    Nery       10.0       5150
