# Pandas & SQL : Practice

In [1]:
import numpy as np
import pandas as pd

In [2]:
import pandasql as ps

### Example : Find the 5th Highest Salary 

In [3]:
# Employee fixture: eight people; salaries 3315 and 6000 each occur twice,
# which is what makes the "distinct salary" queries below interesting.
eid = list(range(1, 9))
name = [
    "Jone Doe",
    "David Boe",
    "Alice Jones",
    "Bobby Louis",
    "Lisa Romero",
    "Anne Marry",
    "Kisa Salero",
    "Adam Smith",
]
salary = [3315, 7790, 6000, 3315, 6000, 8100, 6985, 7450]

# Columns appear in keyword order: id, name, salary.
df = pd.DataFrame(dict(id=eid, name=name, salary=salary))
df

Unnamed: 0,id,name,salary
0,1,Jone Doe,3315
1,2,David Boe,7790
2,3,Alice Jones,6000
3,4,Bobby Louis,3315
4,5,Lisa Romero,6000
5,6,Anne Marry,8100
6,7,Kisa Salero,6985
7,8,Adam Smith,7450


In [4]:
# Step 1 of the "5th highest salary" walk-through: list each distinct salary
# once, highest first, so the target value can be read off by position.
# The table name `df` in the query refers to the DataFrame built above.
q = """
    SELECT 
    DISTINCT(salary)
    FROM df
    ORDER BY salary DESC;
    """
a = ps.sqldf(q)
print(a)

   salary
0    8100
1    7790
2    7450
3    6985
4    6000
5    3315


In [5]:
# Step 2: OFFSET 4 skips the four highest distinct salaries and LIMIT 1 keeps
# the next one, i.e. the 5th highest distinct salary.
q = """
    SELECT 
    DISTINCT(salary)
    FROM df
    ORDER BY salary DESC 
    LIMIT 1 OFFSET 4;
    """
a = ps.sqldf(q)
print(a)

   salary
0    6000


In [6]:
# Same LIMIT/OFFSET trick applied to whole rows instead of distinct salaries.
# NOTE(review): without DISTINCT this selects the 5th *row* in salary order,
# not the 5th distinct salary. It agrees with the previous cell only because
# the four highest salaries in `df` are unique. Two employees earn 6000, but
# LIMIT 1 returns just one of them, and the query does not say which.
q = """
    SELECT 
    * 
    FROM df
    ORDER BY salary DESC
    LIMIT 1 OFFSET 4;
    """
a = ps.sqldf(q)
print(a)

   id         name  salary
0   3  Alice Jones    6000


In [7]:
# Window functions over the distinct salaries.
# ORDER BY salary is ascending here, so rank 1 is the *lowest* salary.
# Because the subquery already removed duplicate salaries, there are no ties
# and rank() and dense_rank() produce the same numbers.
q = """
    SELECT *, 
    rank() OVER (ORDER BY salary) as SRank,
    dense_rank() OVER (ORDER BY salary) as DRank
    FROM (SELECT DISTINCT salary FROM df) AS f;
    """
a = ps.sqldf(q)
print(a)

   salary  SRank  DRank
0    3315      1      1
1    6000      2      2
2    6985      3      3
3    7450      4      4
4    7790      5      5
5    8100      6      6


In [8]:
# Every employee earning the 5th highest distinct salary.
# The scalar subquery finds that salary; the outer query then returns all
# rows matching it, so employees tied on the salary are all included.
q = """
    SELECT 
    * 
    FROM df 
    WHERE salary = (
    SELECT DISTINCT(salary) FROM df ORDER BY salary DESC LIMIT 1 OFFSET 4);
    """
a = ps.sqldf(q)
print(a)

   id         name  salary
0   3  Alice Jones    6000
1   5  Lisa Romero    6000


In [12]:
# Distinct salaries in descending order, for reference.
# NOTE(review): this is the same query as the first step of this example.
q = """
    SELECT 
    DISTINCT(salary)
    FROM df
    ORDER BY salary DESC;
    """
a = ps.sqldf(q)
print(a)

   salary
0    8100
1    7790
2    7450
3    6985
4    6000
5    3315


### Example : Joins and Repeated Trainings (JOIN / GROUP BY / HAVING)

In [12]:
# Users table for the join examples: same eight people and salaries as `df`.
eid = list(range(1, 9))
name = [
    "Jone Doe", "David Boe", "Alice Jones", "Bobby Louis",
    "Lisa Romero", "Anne Marry", "Kisa Salero", "Adam Smith",
]
salary = [3315, 7790, 6000, 3315, 6000, 8100, 6985, 7450]

# Pair each column label with its values; column order is id, name, salary.
user_columns = dict(zip(("id", "name", "salary"), (eid, name, salary)))
df_users = pd.DataFrame(user_columns)

In [13]:
# Training log: one row per (user, training, date) attendance record.
# x1 = record id, x2 = user id, x3 = training id, x4 = date as ISO text.
x1 = list(range(1, 16))
x2 = [1, 2, 3, 4, 2, 1, 3, 4, 1, 3, 4, 3, 1, 4, 1]
x3 = [1, 1, 2, 2, 2, 1, 2, 3, 4, 1, 2, 2, 1, 3, 2]
x4 = [
    "2015-08-02", "2015-08-03", "2015-08-02",
    "2015-08-04", "2015-08-03", "2015-08-02",
    "2015-08-04", "2015-08-03", "2015-08-03",
    "2015-08-02", "2015-08-04", "2015-08-02",
    "2015-08-02", "2015-08-03", "2015-08-03",
]

df_training = pd.DataFrame(
    dict(
        user_training_id=x1,
        user_id=x2,
        training_id=x3,
        training_date=x4,
    )
)

In [14]:
# Show the whole training log as plain text before querying it.
print(df_training)

    user_training_id  user_id  training_id training_date
0                  1        1            1    2015-08-02
1                  2        2            1    2015-08-03
2                  3        3            2    2015-08-02
3                  4        4            2    2015-08-04
4                  5        2            2    2015-08-03
5                  6        1            1    2015-08-02
6                  7        3            2    2015-08-04
7                  8        4            3    2015-08-03
8                  9        1            4    2015-08-03
9                 10        3            1    2015-08-02
10                11        4            2    2015-08-04
11                12        3            2    2015-08-02
12                13        1            1    2015-08-02
13                14        4            3    2015-08-03
14                15        1            2    2015-08-03


In [15]:
# Who attended a training on 2015-08-02: one row per attendance record, with
# the attendee's name looked up in df_users.
# - The join condition is written in ON, so the join is explicit rather than
#   a cross join filtered in WHERE.
# - The date is a single-quoted SQL string literal; double quotes denote
#   identifiers and only work as strings through a SQLite compatibility quirk.
# - training_date is qualified with its table alias.
q = """
    SELECT
    user.name, trn.user_id, trn.training_date
    FROM df_training AS trn
    JOIN df_users AS user
    ON trn.user_id = user.id
    WHERE trn.training_date = '2015-08-02'
    """
a = ps.sqldf(q)
print(a)

          name  user_id training_date
0     Jone Doe        1    2015-08-02
1  Alice Jones        3    2015-08-02
2     Jone Doe        1    2015-08-02
3  Alice Jones        3    2015-08-02
4  Alice Jones        3    2015-08-02
5     Jone Doe        1    2015-08-02


In [16]:
# Users who took the same training more than once on the same day.
# Rows are grouped per (user, training, date); HAVING keeps groups with more
# than one attendance record.
# - The join condition is written in ON instead of filtering a cross join.
# - ORDER BY has tie-breakers (name, training_id) so that rows sharing a
#   training_date come back in a deterministic order.
q = """
    SELECT user.name, trn.training_id, trn.training_date,
    COUNT(trn.training_id) AS count
    FROM df_training AS trn
    JOIN df_users AS user
    ON trn.user_id = user.id
    GROUP BY user.name, trn.user_id, trn.training_id, trn.training_date
    HAVING COUNT(trn.training_id) > 1
    ORDER BY trn.training_date DESC, user.name, trn.training_id
    """
a = ps.sqldf(q)
print(a)

          name  training_id training_date  count
0  Bobby Louis            2    2015-08-04      2
1  Bobby Louis            3    2015-08-03      2
2  Alice Jones            2    2015-08-02      2
3     Jone Doe            1    2015-08-02      3


In [17]:
# (date, user) pairs with more than one training record, using a derived
# table: the subquery counts records per pair, the outer query filters on
# the computed count.
# ORDER BY is applied to the outer query: ordering inside a derived table is
# not guaranteed to carry over to the final result.
q = """
    SELECT * FROM
    (SELECT training_date, user_id, count(*) AS count
    FROM df_training
    GROUP BY training_date, user_id) AS f1
    WHERE count > 1
    ORDER BY training_date, user_id
    """
a = ps.sqldf(q)
print(a)

  training_date  user_id  count
0    2015-08-02        1      3
1    2015-08-02        3      3
2    2015-08-03        1      2
3    2015-08-03        2      2
4    2015-08-03        4      2
5    2015-08-04        4      2


In [18]:
# Same result as the derived-table version, written with HAVING: the groups
# are filtered right after aggregation, so no wrapping subquery is needed.
q = """
    SELECT training_date, user_id, count(*) as count
    FROM df_training
    GROUP BY training_date,user_id
    HAVING count(*) > 1
    ORDER BY training_date,user_id
    """
a = ps.sqldf(q)
print(a)

  training_date  user_id  count
0    2015-08-02        1      3
1    2015-08-02        3      3
2    2015-08-03        1      2
3    2015-08-03        2      2
4    2015-08-03        4      2
5    2015-08-04        4      2


In [19]:
# Same aggregation expressed with a common table expression (WITH), this
# time listing the most recent dates first.
# ORDER BY is applied to the final SELECT: ordering inside the CTE is not
# guaranteed to carry over to the statement that reads from it.
q = """
    WITH f1 AS
    (SELECT training_date, user_id, count(*) AS count
    FROM df_training
    GROUP BY training_date, user_id)
    SELECT * FROM f1
    WHERE count > 1
    ORDER BY training_date DESC, user_id
    """
a = ps.sqldf(q)
print(a)

  training_date  user_id  count
0    2015-08-04        4      2
1    2015-08-03        1      2
2    2015-08-03        2      2
3    2015-08-03        4      2
4    2015-08-02        1      3
5    2015-08-02        3      3


#### Example : CASE Expression (Odd / Even IDs)

In [20]:
# Label every row of `df` by the parity of its id with a CASE expression,
# then keep only the odd ones. The label is computed in a subquery so that
# the outer WHERE can filter on the `indicator` alias.
# Note: the even label is spelled 'Evn' in the query; it never reaches the
# output because only 'Odd' rows are selected.
q = """
    SELECT * FROM 
    (SELECT *, 
    CASE 
    WHEN id%2=1 THEN 'Odd'
    WHEN id%2=0 THEN 'Evn'
    END 
    AS indicator
    FROM df) AS f1
    WHERE indicator='Odd'
    """
a = ps.sqldf(q)
print(a)

   id         name  salary indicator
0   1     Jone Doe    3315       Odd
1   3  Alice Jones    6000       Odd
2   5  Lisa Romero    6000       Odd
3   7  Kisa Salero    6985       Odd


#### Example : group_concat

In [21]:
# Let pandas print up to 200 characters per cell so that the long
# concatenated string produced in the next cell is not truncated.
pd.options.display.max_colwidth = 200

In [22]:
# Collapse all employee names into a single ';'-separated string.
# The separator is a single-quoted SQL string literal; a double-quoted ";"
# is an identifier in SQL and only works through a SQLite compatibility quirk.
q = """
    SELECT group_concat(name, ';') AS concatName
    FROM df
    """
a = ps.sqldf(q)
print(a)

                                                                                 concatName
0  Jone Doe;David Boe;Alice Jones;Bobby Louis;Lisa Romero;Anne Marry;Kisa Salero;Adam Smith


#### Example : ORDER BY Column Position

In [23]:
# ORDER BY 2 sorts by the second column of the SELECT list (here: name),
# in descending order. The number is a position in the SELECT list, not a
# constant value.
q = """
    SELECT id, name, salary
    FROM df
    ORDER BY 2 DESC
    """
a = ps.sqldf(q)
print(a)

   id         name  salary
0   5  Lisa Romero    6000
1   7  Kisa Salero    6985
2   1     Jone Doe    3315
3   2    David Boe    7790
4   4  Bobby Louis    3315
5   6   Anne Marry    8100
6   3  Alice Jones    6000
7   8   Adam Smith    7450


#### Example : SUM over a Constant

In [24]:
# SUM over a constant adds that constant once per row, so with the 8 rows of
# `df` the three results are 8, 16 and 24 (row count times the constant).
# Each aggregate has its own alias: the third column is sumThree, so the
# result frame has unique column names and each can be selected by name.
q = """
    SELECT
    sum(1) AS sumOne,
    sum(2) AS sumTwo,
    sum(3) AS sumThree
    FROM df
    """
a = ps.sqldf(q)
print(a)

   sumOne  sumTwo  sumTwo
0       8      16      24


#### Example : Matching a Value Across Several Columns

In [25]:
# Small colour table with a missing value in c1 (row id 2) and in c2 (row id 3).
color_columns = dict(
    id=[1, 2, 3],
    c1=["Red", None, "Yellow"],
    c2=["Yellow", "Red", None],
    c3=["Blue", "Green", "Violet"],
)
tab = pd.DataFrame(color_columns)
print(tab)

   id      c1      c2      c3
0   1     Red  Yellow    Blue
1   2    None     Red   Green
2   3  Yellow    None  Violet


In [26]:
# Ids of rows where any of the three colour columns equals 'Yellow',
# written as an explicit OR chain. A comparison against a NULL cell is not
# true, so missing values simply do not match.
# 'Yellow' is a single-quoted string literal: in double quotes it would be
# parsed as an identifier first and silently change meaning if a column of
# that name ever existed.
q = """
    SELECT id
    FROM tab
    WHERE c1 = 'Yellow' OR c2 = 'Yellow' OR c3 = 'Yellow'
    """
a = ps.sqldf(q)
print(a)

   id
0   1
1   3


In [27]:
# Same filter written compactly: the literal goes on the left of IN and the
# candidate columns on the right, i.e. "is 'Yellow' among c1, c2, c3?".
# 'Yellow' is a single-quoted string literal: in double quotes it would be
# parsed as an identifier first and silently change meaning if a column of
# that name ever existed.
q = """
    SELECT id
    FROM tab
    WHERE 'Yellow' IN (c1, c2, c3)
    """
a = ps.sqldf(q)
print(a)

   id
0   1
1   3


#### Example : TYPEOF and Conditional Labels

In [28]:
# One column mixing single letters (str) and integers, used to demonstrate
# type-dependent labelling in the next cell.
tab = pd.DataFrame(
    dict(col=["d", "x", "T", 8, "a", 9, 6, 2, "V"])
)

In [29]:
# Label each value: 'BUZZ' for text values that are one of the letters
# a, d, x, t, v (case-insensitive via LOWER), 'FIZZ' for everything else,
# including all numeric values.
# All string constants are single-quoted SQL literals; double quotes denote
# identifiers and only work as strings through a SQLite compatibility quirk.
q = """
    SELECT col,
    CASE
    WHEN TYPEOF(col) = 'text' AND LOWER(col) IN ('a','d','x','t','v') THEN 'BUZZ'
    ELSE 'FIZZ'
    END AS convertText
    FROM tab;
    """
a = ps.sqldf(q)
print(a)

  col convertText
0   d        BUZZ
1   x        BUZZ
2   T        BUZZ
3   8        FIZZ
4   a        BUZZ
5   9        FIZZ
6   6        FIZZ
7   2        FIZZ
8   V        BUZZ


### Example : Separate Sums of Positive and Negative Values

In [30]:
# Signed numeric column; the integer entries are stored as floats alongside
# the fractional ones.
tab = pd.DataFrame(
    dict(col=[2.1, -2.3, 4.5, -4, -3, 0.9, 2])
)

In [31]:
# Two single-row CTEs: `pos` holds the sum of the positive values and `neg`
# the sum of the negative values. Listing both in FROM (comma join) pairs
# the two one-row results into a single output row.
q = """
    WITH 
    pos AS (SELECT SUM(col) AS posSum FROM tab WHERE col>0),
    neg AS (SELECT SUM(col) AS negSum FROM tab WHERE col<0)
    SELECT posSum, negSum from pos, neg;
    """
a = ps.sqldf(q)
print(a)

   posSum  negSum
0     9.5    -9.3


In [32]:
# Same two sums computed in one query with conditional aggregation: each
# SUM adds `col` when its sign matches and 0 otherwise.
q = """ 
    SELECT 
    SUM(CASE 
    WHEN col>0 THEN col
    ELSE 0 
    END) AS posSum, 
    SUM(CASE 
    WHEN col<0 THEN col
    ELSE 0 
    END) AS negSum     
    FROM tab
    """
a = ps.sqldf(q)
print(a)

   posSum  negSum
0     9.5    -9.3


### Example : CAST to Integer

In [33]:
# CAST(... AS INT) drops the fractional part (0.9 -> 0, -2.3 -> -2 in the
# output below) rather than rounding.
# NOTE(review): colMul for -2.3 comes out as -229, not -230 - presumably
# because -2.3*100 is slightly above -230 in binary floating point before
# the cast truncates it. Round explicitly first if -230 is the intent.
q = """
    SELECT col, 
    CAST(col AS INT) colInt, 
    CAST(col*100 AS INT) colMul
    FROM tab
    """
a = ps.sqldf(q)
print(a)

   col  colInt  colMul
0  2.1       2     210
1 -2.3      -2    -229
2  4.5       4     450
3 -4.0      -4    -400
4 -3.0      -3    -300
5  0.9       0      90
6  2.0       2     200


### Example : Self-Join

In [34]:
# Employee/manager table for the self-join: ManagerID points at another
# row's EmpID, and is None for the one employee without a manager.
empid = list(range(1, 11))
empname = [
    "Jone", "David", "Alice", "Bobby", "Lisa",
    "Anne", "Romero", "Adam", "Nery", "Joe",
]
empsalary = [4765, 8100, 5635, 3315, 6000, 7495, 6985, 6370, 5150, 3870]
managerid = [8, 10, 8, 7, 8, 10, None, 10, 10, 7]

tab = pd.DataFrame(
    dict(
        EmpID=empid,
        EmpName=empname,
        EmpSalary=empsalary,
        ManagerID=managerid,
    )
)

In [35]:
# Self-join: for every manager, the average salary of their direct reports.
# The subquery `f` groups employees by ManagerID (rows with no manager are
# excluded) and averages EmpSalary, truncated to an integer by the CAST.
# Joining `f` back to `tab` on EmpID = manager id looks up the manager's own
# name.
# NOTE(review): the subquery aliases the key as lowercase `managerid` while
# the outer query refers to `f.ManagerID`; the output header shows the
# lowercase alias, so the lookup appears to be case-insensitive.
# NOTE(review): there is no ORDER BY, so the row order is not specified.
q = """
    SELECT f.ManagerID, EmpName, f.avgSalary
    FROM tab
    JOIN
    (SELECT CAST(ManagerID AS INT) managerid, CAST(avg(EmpSalary) AS INT) as avgSalary
    FROM tab
    WHERE ManagerID IS NOT NULL
    GROUP BY ManagerID) as f
    ON tab.EmpID = f.ManagerID
    """
a = ps.sqldf(q)
print(a)

   managerid EmpName  avgSalary
0          7  Romero       3592
1          8    Adam       5466
2         10     Joe       6778
