# Pandas SQL : N-th Largest Without Bucket

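Case 1: There are no duplicate records

Case 2: There are duplicate records

Note: outside the Practice section, the N-th salary queries below (with N = 4) sort or rank salaries in ascending order, so "N-th" there means the N-th smallest salary. Sort with `ORDER BY ... DESC` instead to obtain the N-th largest.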

In [1]:
import numpy as np
import pandas as pd
import pandasql as ps

## Data

In [2]:
# Toy salary table used by every query in the first two sections.
# salUnique has no repeated values; the three salDup* columns repeat
# 3315 only, then 3315 and 6000, then 3315, 6000 and 8100, so ties appear
# at the low end, low + middle, and low + middle + high end respectively.
df_salary = pd.DataFrame(
    dict(
        id=list(range(1, 11)),
        name=[
            "Jone Doe", "David Boe", "Alice Jones", "Bobby Louis", "Lisa Romero",
            "Anne Marry", "Lisa Romero", "Adam Smith", "Nery Garcia", "Joe Capella",
        ],
        salUnique=[4765, 8100, 5635, 3315, 6000, 7495, 6985, 6370, 5150, 3870],
        salDupFirst=[3315, 8100, 6000, 3315, 4625, 7495, 6985, 6370, 5150, 3870],
        salDupFirstMid=[3315, 8100, 6000, 3315, 6000, 7495, 6985, 6370, 5150, 3870],
        salDupFirstMidLast=[3315, 8100, 6000, 3315, 6000, 8100, 6985, 6370, 5150, 3870],
    )
)

In [3]:
# Sanity check: show the first three rows of the salary table as plain text.
print(df_salary.head(n=3))

   id         name  salUnique  salDupFirst  salDupFirstMid  salDupFirstMidLast
0   1     Jone Doe       4765         3315            3315                3315
1   2    David Boe       8100         8100            8100                8100
2   3  Alice Jones       5635         6000            6000                6000


-------------------------------------------------------------------------

## Without Window Function

### Case 1
#### When There are no Duplicate Salaries

In [4]:
# Case 1 baseline: list id, name and the duplicate-free salUnique column
# (exposed as `salary`), sorted from the lowest to the highest salary.
q = """
    SELECT 
    id, name, salUnique as salary
    FROM df_salary
    ORDER BY salUnique
    """
# Run the SQL against the df_salary DataFrame and print the resulting table.
a = ps.sqldf(q)
print(a)

   id         name  salary
0   4  Bobby Louis    3315
1  10  Joe Capella    3870
2   1     Jone Doe    4765
3   9  Nery Garcia    5150
4   3  Alice Jones    5635
5   5  Lisa Romero    6000
6   8   Adam Smith    6370
7   7  Lisa Romero    6985
8   6   Anne Marry    7495
9   2    David Boe    8100


### Case 2
#### When There are Duplicate Salaries

Case 1: Find the four smallest distinct salaries, in ascending order

In [5]:
# The four smallest distinct values of salDupFirstMid, lowest first.
# The ORDER BY has to use the same column that DISTINCT is applied to:
# sorting a DISTINCT result by a column that is not selected (salDupFirst)
# is rejected by standard SQL and does not give a well-defined order,
# because each distinct value can come from rows with different sort keys.
q = """
    SELECT
    DISTINCT salDupFirstMid
    FROM df_salary
    ORDER BY salDupFirstMid
    LIMIT 4
    """
# Run the SQL against the df_salary DataFrame and print the resulting table.
a = ps.sqldf(q)
print(a)

   salDupFirstMid
0            3315
1            3870
2            5150
3            6000


Case 2: Find the maximum of the four smallest distinct salaries, i.e. the 4th-smallest distinct salary

In [6]:
# Wrap the "four smallest distinct salaries" subquery in an outer max():
# the largest of those four values is the 4th-smallest distinct salary.
# NOTE(review): the alias `minSal` is misleading — the column holds a max
# over the limited set, not a minimum.
# NOTE(review): if the table had fewer than four distinct salaries the
# subquery would return all of them and max() would just be the largest one.
q = """
    SELECT 
    max(salDupFirstMid) as minSal
    FROM
    (SELECT DISTINCT salDupFirstMid
    FROM df_salary
    ORDER BY salDupFirstMid 
    LIMIT 4)
    """
# Run the SQL against the df_salary DataFrame and print the single-value result.
a = ps.sqldf(q)
print(a)

   minSal
0    6000


Case 3: Return all columns for the rows whose salary equals the 4th-smallest distinct salary (the maximum of the four smallest distinct salaries)

In [7]:
# Use the "max of the four smallest distinct salaries" query as a scalar
# subquery and return every full row of df_salary whose salDupFirstMid
# equals that value, so all employees tied on that salary are listed.
q = """
    SELECT 
    * FROM df_salary 
    WHERE salDupFirstMid = 
    (SELECT max(salDupFirstMid) as minSal
    FROM
    (SELECT DISTINCT salDupFirstMid
    FROM df_salary
    ORDER BY salDupFirstMid
    LIMIT 4))
    """
# Run the SQL against the df_salary DataFrame and print the matching rows.
a = ps.sqldf(q)
print(a)

   id         name  salUnique  salDupFirst  salDupFirstMid  salDupFirstMidLast
0   3  Alice Jones       5635         6000            6000                6000
1   5  Lisa Romero       6000         4625            6000                6000


----------------------------------------------------------------------------

## With Window Function

### Case 1
#### When There are no Duplicate Salaries

In [8]:
# Case 1: rank the duplicate-free salUnique column from lowest to highest.
# rank() and dense_rank() are evaluated side by side in one SELECT, so no
# CTEs or self-join on id are needed (and the non-standard comma-join
# followed by ON is avoided). With no ties the two rankings are identical.
# The final ORDER BY makes the row order explicit instead of relying on
# whatever order the engine happens to produce; id breaks salary ties.
q = """
    SELECT
        id,
        name,
        salUnique AS salary,
        rank()       OVER (ORDER BY salUnique) AS SRank,
        dense_rank() OVER (ORDER BY salUnique) AS DRank
    FROM df_salary
    ORDER BY salary, id
    """
# Run the SQL against the df_salary DataFrame and print the ranked table.
a = ps.sqldf(q)
print(a)

   id         name  salary  SRank  DRank
0   4  Bobby Louis    3315      1      1
1  10  Joe Capella    3870      2      2
2   1     Jone Doe    4765      3      3
3   9  Nery Garcia    5150      4      4
4   3  Alice Jones    5635      5      5
5   5  Lisa Romero    6000      6      6
6   8   Adam Smith    6370      7      7
7   7  Lisa Romero    6985      8      8
8   6   Anne Marry    7495      9      9
9   2    David Boe    8100     10     10


### Case 2
#### When There are Duplicate Salaries

In [9]:
"""Case 2a"""
# Case 2a: salDupFirst has a tie on the lowest salary (3315).
# rank() leaves a gap after tied rows (1, 1, 3, ...) while dense_rank()
# does not (1, 1, 2, ...). Both are evaluated in one SELECT, so no CTEs or
# self-join on id are needed (and the non-standard comma-join followed by
# ON is avoided). The final ORDER BY makes the row order explicit; id
# breaks salary ties.
q = """
    SELECT
        id,
        name,
        salDupFirst AS salary,
        rank()       OVER (ORDER BY salDupFirst) AS SRank,
        dense_rank() OVER (ORDER BY salDupFirst) AS DRank
    FROM df_salary
    ORDER BY salary, id
    """
# Run the SQL against the df_salary DataFrame and print the ranked table.
a = ps.sqldf(q)
print(a)

   id         name  salary  SRank  DRank
0   1     Jone Doe    3315      1      1
1   4  Bobby Louis    3315      1      1
2  10  Joe Capella    3870      3      2
3   5  Lisa Romero    4625      4      3
4   9  Nery Garcia    5150      5      4
5   3  Alice Jones    6000      6      5
6   8   Adam Smith    6370      7      6
7   7  Lisa Romero    6985      8      7
8   6   Anne Marry    7495      9      8
9   2    David Boe    8100     10      9


In [10]:
"""Case 2b"""
# Case 2b: salDupFirstMid has ties on the lowest salary (3315) and on a
# middle salary (6000).
# rank() leaves a gap after each group of tied rows while dense_rank()
# keeps the ranks consecutive. Both are evaluated in one SELECT, so no CTEs
# or self-join on id are needed (and the non-standard comma-join followed
# by ON is avoided). The final ORDER BY makes the row order explicit; id
# breaks salary ties.
q = """
    SELECT
        id,
        name,
        salDupFirstMid AS salary,
        rank()       OVER (ORDER BY salDupFirstMid) AS SRank,
        dense_rank() OVER (ORDER BY salDupFirstMid) AS DRank
    FROM df_salary
    ORDER BY salary, id
    """
# Run the SQL against the df_salary DataFrame and print the ranked table.
a = ps.sqldf(q)
print(a)

   id         name  salary  SRank  DRank
0   1     Jone Doe    3315      1      1
1   4  Bobby Louis    3315      1      1
2  10  Joe Capella    3870      3      2
3   9  Nery Garcia    5150      4      3
4   3  Alice Jones    6000      5      4
5   5  Lisa Romero    6000      5      4
6   8   Adam Smith    6370      7      5
7   7  Lisa Romero    6985      8      6
8   6   Anne Marry    7495      9      7
9   2    David Boe    8100     10      8


In [11]:
"""Case 2c"""
# Case 2c: salDupFirstMidLast has ties on the lowest (3315), a middle
# (6000) and the highest (8100) salary.
# rank() leaves a gap after each group of tied rows while dense_rank()
# keeps the ranks consecutive. Both are evaluated in one SELECT, so no CTEs
# or self-join on id are needed (and the non-standard comma-join followed
# by ON is avoided). The final ORDER BY makes the row order explicit; id
# breaks salary ties.
q = """
    SELECT
        id,
        name,
        salDupFirstMidLast AS salary,
        rank()       OVER (ORDER BY salDupFirstMidLast) AS SRank,
        dense_rank() OVER (ORDER BY salDupFirstMidLast) AS DRank
    FROM df_salary
    ORDER BY salary, id
    """
# Run the SQL against the df_salary DataFrame and print the ranked table.
a = ps.sqldf(q)
print(a)

   id         name  salary  SRank  DRank
0   1     Jone Doe    3315      1      1
1   4  Bobby Louis    3315      1      1
2  10  Joe Capella    3870      3      2
3   9  Nery Garcia    5150      4      3
4   3  Alice Jones    6000      5      4
5   5  Lisa Romero    6000      5      4
6   8   Adam Smith    6370      7      5
7   7  Lisa Romero    6985      8      6
8   2    David Boe    8100      9      7
9   6   Anne Marry    8100      9      7


In [12]:
# De-duplicate salDupFirstMidLast first, then rank the distinct values.
# Because the inner query removes ties, rank() assigns consecutive ranks
# here (one per distinct salary).
q = """
    SELECT *, 
    rank() OVER (ORDER BY salary) as SRank
    FROM( SELECT DISTINCT salDupFirstMidLast as salary FROM df_salary ) AS f
    """
# Run the SQL against the df_salary DataFrame and print salary/rank pairs.
a = ps.sqldf(q)
print(a)

   salary  SRank
0    3315      1
1    3870      2
2    5150      3
3    6000      4
4    6370      5
5    6985      6
6    8100      7


In [13]:
# Rank the distinct salDupFirstMidLast values in ascending order (f1 -> f2)
# and keep only the salary whose rank is 4, i.e. the 4th-smallest distinct
# salary. The rank value 4 is hard-coded in the WHERE clause.
q = """
    SELECT salary 
    FROM (SELECT *, 
    rank() OVER (ORDER BY salary) as SRank
    FROM( SELECT DISTINCT salDupFirstMidLast as salary FROM df_salary) AS f1) AS f2
    WHERE SRank=4
    """
# Run the SQL against the df_salary DataFrame and print the single salary.
a = ps.sqldf(q)
print(a)

   salary
0    6000


In [14]:
# Use the "rank 4 among distinct salaries" query as a scalar subquery and
# return every full row of df_salary whose salDupFirstMidLast equals it,
# so all employees tied on the 4th-smallest distinct salary are listed.
q = """
    SELECT * FROM df_salary
    WHERE salDupFirstMidLast =
    (SELECT salary 
    FROM (SELECT *, 
    rank() OVER (ORDER BY salary) as SRank
    FROM( SELECT DISTINCT salDupFirstMidLast as salary FROM df_salary) AS f1) AS f2
    WHERE SRank=4)
    """
# Run the SQL against the df_salary DataFrame and print the matching rows.
a = ps.sqldf(q)
print(a)

   id         name  salUnique  salDupFirst  salDupFirstMid  salDupFirstMidLast
0   3  Alice Jones       5635         6000            6000                6000
1   5  Lisa Romero       6000         4625            6000                6000


### Practice

In [15]:
# Practice data: eight salaries spread over departments A, B and C.
# Department B contains a tie on its top salary (5000 appears twice).
data = dict(
    dept=list("AAABBBCC"),
    salary=[1500, 1700, 1000, 5000, 5000, 3100, 4000, 4500],
)
df = pd.DataFrame(data)

In [16]:
# List every (dept, salary) row of the practice table, highest salary first.
q = """
    SELECT 
    dept,  
    salary
    FROM df
    ORDER BY salary DESC
    """
# Run the SQL against the df DataFrame and print the sorted table.
a = ps.sqldf(q)
print(a)

  dept  salary
0    B    5000
1    B    5000
2    C    4500
3    C    4000
4    B    3100
5    A    1700
6    A    1500
7    A    1000


Highest salary in the table (`LIMIT 1` returns a single row even when the top salary is tied)

In [17]:
# Sort by salary descending and keep one row starting at offset 0, i.e. the
# first row of the sorted list. Only one row is returned even though the
# top salary (5000) occurs twice in the data.
q = """
    SELECT 
    dept,  
    salary
    FROM df
    ORDER BY salary DESC
    LIMIT 1 OFFSET 0
    """
# Run the SQL against the df DataFrame and print the single row.
a = ps.sqldf(q)
print(a)

  dept  salary
0    B    5000


In [18]:
# Per-department aggregates: the highest and the average salary of each dept.
q = """
    SELECT 
    dept,  
    MAX(salary) as max_salary,
    AVG(salary) as avg_salary
    FROM df
    GROUP BY dept
    """
# Run the SQL against the df DataFrame and print one row per department.
a = ps.sqldf(q)
print(a)

  dept  max_salary   avg_salary
0    A        1700  1400.000000
1    B        5000  4366.666667
2    C        4500  4250.000000


In [19]:
# Group by dept while selecting `salary` without any aggregate function.
# NOTE(review): `salary` is neither aggregated nor part of the GROUP BY.
# SQLite (presumably the backend pandasql uses here — confirm) accepts such
# a "bare" column and takes its value from an arbitrary row of each group;
# stricter SQL engines reject the query. Wrap it in an aggregate such as
# MAX(salary) when a specific value per department is wanted.
q = """
    SELECT 
    dept,  
    salary
    FROM df
    GROUP BY dept
    """
# Run the SQL against the df DataFrame and print one row per department.
a = ps.sqldf(q)
print(a)

  dept  salary
0    A    1500
1    B    5000
2    C    4000


In [20]:
# Rank salaries inside each department, highest first. dense_rank() gives
# tied salaries the same rank and continues with the next consecutive rank,
# and PARTITION BY dept restarts the numbering for every department.
q = """
    SELECT 
    dept,  
    salary,
    dense_rank() OVER (PARTITION BY dept ORDER BY salary DESC) AS salary_rank
    FROM df
    """
# Run the SQL against the df DataFrame and print the ranked rows.
a = ps.sqldf(q)
print(a)

  dept  salary  salary_rank
0    A    1700            1
1    A    1500            2
2    A    1000            3
3    B    5000            1
4    B    5000            1
5    B    3100            2
6    C    4500            1
7    C    4000            2


In [21]:
# Compute the per-department dense rank in a CTE (f1) and keep only the
# rows ranked 1, i.e. the top salary of every department. Tied top earners
# all have rank 1, so each of them is returned.
q = """
    WITH f1 AS
    (SELECT 
    dept,  
    salary,
    dense_rank() OVER (PARTITION BY dept ORDER BY salary DESC) AS salary_rank
    FROM df)
    SELECT * FROM f1
    WHERE salary_rank=1
    """
# Run the SQL against the df DataFrame and print the top rows per department.
a = ps.sqldf(q)
print(a)

  dept  salary  salary_rank
0    A    1700            1
1    B    5000            1
2    B    5000            1
3    C    4500            1
