# Pandas PandaSQL 

In [6]:
import numpy as np
import pandas as pd
import pandasql as ps

### Example 1

In [7]:
transactions = pd.DataFrame(
    {"date":["2020-01-01","2020-02-02","2020-03-03","2020-04-04","2020-05-05","2020-06-07",
             "2020-07-08",
             "2020-11-24","2020-11-28","2020-12-01","2020-12-04","2020-12-15","2020-12-28"], 
     "consumption": [35,60,30,35,55,90,
                     5,
                     50,100,100,10,50,10],
    "bag":['ch','hb','ch','hb','mk','zn',
           'ks',
           'mk','mk','mk','ks','mk','hb']}
)

In [8]:
transactions.head(3)

Unnamed: 0,date,consumption,bag
0,2020-01-01,35,ch
1,2020-02-02,60,hb
2,2020-03-03,30,ch


Holiday Sales

In [9]:
q = """ 
    WITH f1 AS (
    SELECT 
    bag,
    SUM(consumption) as  holiday_sale 
    FROM transactions
    WHERE date >= '2020-11-24' AND date <= '2020-12-15'
    GROUP BY bag)
    SELECT * FROM f1;
    """
a = ps.sqldf(q)
print(a)

  bag  holiday_sale
0  ks            10
1  mk           300


Non-holiday Sales

In [10]:
q = """
    WITH f2 AS(
    SELECT
    bag, 
    SUM(consumption) as  nonholiday_sale
    FROM transactions
    WHERE date < '2020-11-24' OR date > '2020-12-15'
    GROUP BY bag)
    SELECT * FROM f2;
    """
a = ps.sqldf(q)
print(a)

  bag  nonholiday_sale
0  ch               65
1  hb              105
2  ks                5
3  mk               55
4  zn               90


Use UNION

In [6]:
q = """ 
    WITH f1 AS(
    SELECT 
    bag,
    SUM(consumption) as  holiday_sale 
    FROM transactions
    WHERE date BETWEEN '2020-11-24' AND '2020-12-15'
    GROUP BY bag),
    
    f2 AS(
    SELECT
    bag, 
    SUM(consumption) as  nonholiday_sale
    FROM transactions
    WHERE date < '2020-11-24' OR date > '2020-12-15'
    GROUP BY bag),
    
    f3 AS(
    SELECT 
    f1.bag,
    CASE WHEN f1.holiday_sale IS NULL THEN 0
    ELSE f1.holiday_sale
    END AS holiday_sale, 
    CASE WHEN f2.nonholiday_sale IS NULL THEN 0
    ELSE f2.nonholiday_sale
    END AS nonholiday_sale
    FROM f1 
    LEFT JOIN f2
    ON f1.bag=f2.bag),
    
    f4 AS(
    SELECT 
    f2.bag,
    CASE WHEN f1.holiday_sale IS NULL THEN 0
    ELSE f1.holiday_sale
    END AS holiday_sale, 
    CASE WHEN f2.nonholiday_sale IS NULL THEN 0
    ELSE f2.nonholiday_sale
    END AS nonholiday_sale
    FROM f2 
    LEFT JOIN f1
    ON f2.bag=f1.bag)
    
    SELECT * FROM f3
    UNION
    SELECT* FROM f4;
    """
a = ps.sqldf(q)
print(a)

  bag  holiday_sale  nonholiday_sale
0  ch             0               65
1  hb             0              105
2  ks            10                5
3  mk           300               55
4  zn             0               90


Use FULL OUTER JOIN

In [7]:
q = """ 
    WITH f1 AS(
    SELECT 
    bag,
    SUM(consumption) as  holiday_sale 
    FROM transactions
    WHERE date BETWEEN '2020-11-24' AND '2020-12-15'
    GROUP BY bag),
    
    f2 AS(
    SELECT
    bag, 
    SUM(consumption) as  nonholiday_sale
    FROM transactions
    WHERE date < '2020-11-24' OR date > '2020-12-15'
    GROUP BY bag)
    
    SELECT 
    f1.*, 
    f2.*
    FROM f1
    FULL OUTER JOIN f2
    ON f1.bag=f2.bag;
    """
a = ps.sqldf(q)
print(a)

    bag  holiday_sale bag  nonholiday_sale
0    ks          10.0  ks                5
1    mk         300.0  mk               55
2  None           NaN  ch               65
3  None           NaN  hb              105
4  None           NaN  zn               90


### Example 2

In [8]:
dataArr = np.array(
    [[1,"Chais",1,1,"10 boxes x 20 bags",18],
    [2,"Chang",1,1,"24 - 12 oz bottles",19],
    [3,"Aniseed Syrup",1,2,"12 - 550 ml bottles", 10],
    [4,"Cajun Seasoning",2,2, "48 - 6 oz jars", 22],
    [5,"Gumbo Mix",2,2,"36 boxes",21.35],
    [6,"Boysenberry Spread", 3,2,"12 - 8 oz jars", 25],
    [7,"Organic Dried Pears",3,7,"12 - 1 lb pkgs.",30],
    [8,"Cranberry Sauce", 3,2,"12 - 12 oz jars",40],
    [9,"Mishi Kobe Niku", 4,6,"18 - 500 g pkgs.",97]]
)

productDetail = pd.DataFrame(
    dataArr, columns=['ProductID','ProductName','SupplierID','CategoryID','Unit','Price']
)
productDetail

Unnamed: 0,ProductID,ProductName,SupplierID,CategoryID,Unit,Price
0,1,Chais,1,1,10 boxes x 20 bags,18.0
1,2,Chang,1,1,24 - 12 oz bottles,19.0
2,3,Aniseed Syrup,1,2,12 - 550 ml bottles,10.0
3,4,Cajun Seasoning,2,2,48 - 6 oz jars,22.0
4,5,Gumbo Mix,2,2,36 boxes,21.35
5,6,Boysenberry Spread,3,2,12 - 8 oz jars,25.0
6,7,Organic Dried Pears,3,7,12 - 1 lb pkgs.,30.0
7,8,Cranberry Sauce,3,2,12 - 12 oz jars,40.0
8,9,Mishi Kobe Niku,4,6,18 - 500 g pkgs.,97.0


In [9]:
dataArr = np.array(
    [[1,10248,11,12],
     [2,10248,42,10],
     [3,10248,72,5],
     [4,10249,14,9],
     [5,10249,51,40],
     [6,10250,41,10],
     [7,10250,51,35],
     [8,10250,65,15],
     [9,10251,22,6],
     [10,10251,57,15]]
)
orderDetail = pd.DataFrame(dataArr, columns=['OrderDetailID','OrderID','ProductID','Quantity'])
orderDetail

Unnamed: 0,OrderDetailID,OrderID,ProductID,Quantity
0,1,10248,11,12
1,2,10248,42,10
2,3,10248,72,5
3,4,10249,14,9
4,5,10249,51,40
5,6,10250,41,10
6,7,10250,51,35
7,8,10250,65,15
8,9,10251,22,6
9,10,10251,57,15


Use of 'EXISTS'

In [10]:
q = """
    SELECT 
    ProductID,
    ProductName
    FROM 
    productDetail
    WHERE EXISTS
    (SELECT
    ProductID
    FROM orderDetail 
    WHERE Quantity=10);
    """
a = ps.sqldf(q)
print(a)

  ProductID          ProductName
0         1                Chais
1         2                Chang
2         3        Aniseed Syrup
3         4      Cajun Seasoning
4         5            Gumbo Mix
5         6   Boysenberry Spread
6         7  Organic Dried Pears
7         8      Cranberry Sauce
8         9      Mishi Kobe Niku


Use of 'ANY'

In [11]:
# q = """
#     SELECT ProductName
#     FROM productDetail
#     WHERE ProductID = ANY
#       (SELECT ProductID
#       FROM orderDetail
#       WHERE Quantity=10);
#     """
# a = ps.sqldf(q)
# print(a)

### Example 3

In [12]:
teachers = pd.DataFrame(
    {'id':[1,2,3,4,5],
    'name':['peter','megan','rose','linda','mary'],
    'age':[32,43,29,30,41]})

students = pd.DataFrame(
    {'id':[1,2,3,4,5],
    'name':['harry','jack','joe','dant','bruce'],
    'age':[23,42,32,23,40]})

Use of 'ANY'

In [13]:
# q = """
#     SELECT *
#     FROM teachers
#     WHERE age = ANY (
#       SELECT age
#       FROM students);
#     """
# a = ps.sqldf(q, globals())
# print(a)