# Pandas SQL 

Find the N-th Largest Value in a Group

Find Row Values greater than the Average Row Value in a Group

In [1]:
import numpy as np
import pandas as pd
import pandasql as ps

### Data

In [2]:
# Toy employee table built from parallel lists: position i in every list
# describes the same employee.
empID = list(range(1, 13))
empName = [
    "nurur", "tom", "peter", "bo", "binbin", "jack",
    "sid", "don", "rahman", "kyle", "sohel", "robin",
]
# -99 appears for employees 1, 8 and 12; presumably a "no manager" sentinel — TODO confirm.
# NOTE(review): employee 10 (kyle) lists 10 as its own manager — confirm this is intended.
managerID = [-99, 1, 1, 3, 3, 8, 8, -99, 6, 10, 10, -99]
deptName = ["AA"] * 5 + ["IT"] * 4 + ["HR"] * 2 + ["BD"]
salary = [
    41000, 29000, 33000, 21000, 21000, 33000,
    29000, 24000, 26500, 19000, 13700, 39000,
]

# Column order of the frame follows the insertion order of this mapping.
df = pd.DataFrame(
    {
        "EmpID": empID,
        "EmpName": empName,
        "ManagerID": managerID,
        "EmpDept": deptName,
        "EmpSalary": salary,
    }
)

In [3]:
# Peek at the first two rows to confirm the column layout of the frame.
print(df.head(n=2))

   EmpID EmpName  ManagerID EmpDept  EmpSalary
0      1   nurur        -99      AA      41000
1      2     tom          1      AA      29000


In [4]:
# Baseline without any grouping: the two best-paid employees in the whole table.
a = ps.sqldf(
    """SELECT * FROM df ORDER BY EmpSalary DESC LIMIT 2"""
)
print(a)

   EmpID EmpName  ManagerID EmpDept  EmpSalary
0      1   nurur        -99      AA      41000
1     12   robin        -99      BD      39000


### Example : The N-th Largest Value in a Group
#### 1. Highest Salary by Dept

In [5]:
# Highest salary per department via plain aggregation: one output row per EmpDept.
a = ps.sqldf(
    """SELECT EmpDept, max(EmpSalary) FROM df GROUP BY EmpDept"""
)
print(a)

  EmpDept  max(EmpSalary)
0      AA           41000
1      BD           39000
2      HR           19000
3      IT           33000


#### 2 : Second Highest Salary by Dept: Wrong

In [6]:
# Deliberately flawed attempt, kept to illustrate the pitfall.
# The subquery returns every department's top salary as one flat list with no
# department attached, so NOT IN also discards a row whose salary merely equals
# ANOTHER department's maximum. In this data, 33000 in AA equals the IT
# maximum and is filtered out, which is why AA reports 29000 here.
# NOTE(review): EmpID and EmpName are neither grouped nor aggregated; the result
# presumably relies on SQLite taking them from the row that holds
# max(EmpSalary) — confirm before running against another backend.
q = """
    SELECT 
    EmpID, EmpName, EmpDept, 
    max(EmpSalary) FROM df 
    WHERE EmpSalary NOT IN (SELECT max(EmpSalary) FROM df GROUP BY EmpDept) 
    GROUP BY EmpDept
    """
a = ps.sqldf(q)
print(a)

   EmpID EmpName EmpDept  max(EmpSalary)
0      2     tom      AA           29000
1     11   sohel      HR           13700
2      7     sid      IT           29000


#### 2 : Second Highest Salary by Dept: Correct

In [7]:
# Same idea as a plain "exclude the maximum, then take the maximum again", but the
# exclusion compares (EmpDept, EmpSalary) pairs, so a salary is only removed
# when it is the top salary of its OWN department.
# A department with a single employee has nothing left after the filter and
# therefore produces no row.
# NOTE(review): EmpID and EmpName are neither grouped nor aggregated; the result
# presumably relies on SQLite taking them from the row that holds
# max(EmpSalary) — confirm before running against another backend.
q = """
    SELECT EmpID, EmpName, EmpDept, 
    max(EmpSalary) FROM df 
    WHERE (EmpDept,EmpSalary) NOT IN (SELECT EmpDept, max(EmpSalary) FROM df GROUP BY EmpDept) 
    GROUP BY EmpDept"""
a = ps.sqldf(q)
print(a)

   EmpID EmpName EmpDept  max(EmpSalary)
0      3   peter      AA           33000
1     11   sohel      HR           13700
2      7     sid      IT           29000


#### 3 : N-th Highest Salary by Dept: General

In [8]:
# General approach: attach a per-department rank to every row.
# dense_rank() restarts at 1 inside each EmpDept partition, orders by salary
# from highest to lowest, and gives equal salaries the same rank without
# leaving gaps in the numbering.
# No filter yet: every employee is listed together with its SalaryRank.
q = """
    SELECT * FROM 
    (SELECT *, dense_rank() OVER (PARTITION BY EmpDept ORDER BY EmpSalary DESC) as SalaryRank 
    FROM df) as f 
    """
a = ps.sqldf(q)
print(a)

    EmpID EmpName  ManagerID EmpDept  EmpSalary  SalaryRank
0       1   nurur        -99      AA      41000           1
1       3   peter          1      AA      33000           2
2       2     tom          1      AA      29000           3
3       4      bo          3      AA      21000           4
4       5  binbin          3      AA      21000           4
5      12   robin        -99      BD      39000           1
6      10    kyle         10      HR      19000           1
7      11   sohel         10      HR      13700           2
8       6    jack          8      IT      33000           1
9       7     sid          8      IT      29000           2
10      9  rahman          6      IT      26500           3
11      8     don        -99      IT      24000           4


In [9]:
# N-th highest salary per department with N = 1.
# The inner query ranks salaries inside each department (dense_rank, highest
# first); the outer query keeps only the requested rank. The rank has to be
# filtered in an outer query because it is computed by a window function.
# `=` is the standard SQL equality operator (`==` is a SQLite-only spelling),
# and the explicit ORDER BY makes the displayed row order deterministic.
q = """
    SELECT * FROM
    (SELECT *, dense_rank() OVER (PARTITION BY EmpDept ORDER BY EmpSalary DESC) as SalaryRank
    FROM df) as f
    WHERE SalaryRank = 1
    ORDER BY EmpDept, EmpID
    """
a = ps.sqldf(q)
print(a)

   EmpID EmpName  ManagerID EmpDept  EmpSalary  SalaryRank
0      1   nurur        -99      AA      41000           1
1     12   robin        -99      BD      39000           1
2     10    kyle         10      HR      19000           1
3      6    jack          8      IT      33000           1


In [10]:
# N-th highest salary per department with N = 2.
# The inner query ranks salaries inside each department (dense_rank, highest
# first); the outer query keeps only the requested rank. Departments with
# fewer than two distinct salaries contribute no row.
# `=` is the standard SQL equality operator (`==` is a SQLite-only spelling),
# and the explicit ORDER BY makes the displayed row order deterministic.
q = """
    SELECT * FROM
    (SELECT *, dense_rank() OVER (PARTITION BY EmpDept ORDER BY EmpSalary DESC) as SalaryRank
    FROM df) as f
    WHERE SalaryRank = 2
    ORDER BY EmpDept, EmpID
    """
a = ps.sqldf(q)
print(a)

   EmpID EmpName  ManagerID EmpDept  EmpSalary  SalaryRank
0      3   peter          1      AA      33000           2
1     11   sohel         10      HR      13700           2
2      7     sid          8      IT      29000           2


In [11]:
# N-th highest salary per department with N = 3.
# The inner query ranks salaries inside each department (dense_rank, highest
# first); the outer query keeps only the requested rank. Departments with
# fewer than three distinct salaries contribute no row.
# `=` is the standard SQL equality operator (`==` is a SQLite-only spelling),
# and the explicit ORDER BY makes the displayed row order deterministic.
q = """
    SELECT * FROM
    (SELECT *, dense_rank() OVER (PARTITION BY EmpDept ORDER BY EmpSalary DESC) as SalaryRank
    FROM df) as f
    WHERE SalaryRank = 3
    ORDER BY EmpDept, EmpID
    """
a = ps.sqldf(q)
print(a)

   EmpID EmpName  ManagerID EmpDept  EmpSalary  SalaryRank
0      2     tom          1      AA      29000           3
1      9  rahman          6      IT      26500           3


In [12]:
# Pure-pandas counterpart of "ORDER BY EmpSalary DESC LIMIT 5".
# This ranks the whole table; it is not computed per department.
a = df.nlargest(n=5, columns="EmpSalary")
print(a)

    EmpID EmpName  ManagerID EmpDept  EmpSalary
0       1   nurur        -99      AA      41000
11     12   robin        -99      BD      39000
2       3   peter          1      AA      33000
5       6    jack          8      IT      33000
1       2     tom          1      AA      29000


In [13]:
# Rows in positions 4 and 5 of the company-wide top five by salary.
# Positions count rows, not distinct salary values, so tied salaries occupy
# separate positions.
a = (
    df
    .nlargest(n=5, columns="EmpSalary")
    .tail(n=2)
)
print(a)

   EmpID EmpName  ManagerID EmpDept  EmpSalary
5      6    jack          8      IT      33000
1      2     tom          1      AA      29000


### Example : Salary greater than Average Salary by Dept 

In [14]:
# Attach each department's mean salary to every employee row.
# avg(...) OVER (PARTITION BY EmpDept) is a window aggregate: unlike GROUP BY it
# keeps one output row per input row, so the original columns stay available
# next to the new AvgSalary column.
q = """
    SELECT *, avg(EmpSalary) OVER (PARTITION BY EmpDept) as AvgSalary FROM df
    """
a = ps.sqldf(q)
print(a)

    EmpID EmpName  ManagerID EmpDept  EmpSalary  AvgSalary
0       1   nurur        -99      AA      41000    29000.0
1       2     tom          1      AA      29000    29000.0
2       3   peter          1      AA      33000    29000.0
3       4      bo          3      AA      21000    29000.0
4       5  binbin          3      AA      21000    29000.0
5      12   robin        -99      BD      39000    39000.0
6      10    kyle         10      HR      19000    16350.0
7      11   sohel         10      HR      13700    16350.0
8       6    jack          8      IT      33000    28125.0
9       7     sid          8      IT      29000    28125.0
10      8     don        -99      IT      24000    28125.0
11      9  rahman          6      IT      26500    28125.0


In [15]:
# Employees earning more than their own department's average.
# AvgSalary is produced by a window function in the inner query and compared
# in the outer query, where it is available as an ordinary column.
# The comparison is strict (>), so an employee earning exactly the department
# average is not returned; a one-person department therefore yields no row.
q = """
    SELECT * FROM 
    (SELECT *, avg(EmpSalary) OVER (PARTITION BY EmpDept) as AvgSalary
    FROM df) as f 
    WHERE EmpSalary > AvgSalary
    """
a = ps.sqldf(q)
print(a)

   EmpID EmpName  ManagerID EmpDept  EmpSalary  AvgSalary
0      1   nurur        -99      AA      41000    29000.0
1      3   peter          1      AA      33000    29000.0
2     10    kyle         10      HR      19000    16350.0
3      6    jack          8      IT      33000    28125.0
4      7     sid          8      IT      29000    28125.0
