# More SQL functionality

In [2]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
pd.options.display.max_rows = 200

I've created a new MySQL database called `tutorial`, together with a user that has full privileges on that database.

In [3]:
# Connect to the `tutorial` MySQL database through the pymysql driver.
# The password is taken from the TUTORIAL_DB_PASSWORD environment variable
# (with an interactive prompt as fallback) so that no secret is stored in the
# notebook; user, host and database name can be overridden the same way.
import getpass
import os

db_user = os.environ.get('TUTORIAL_DB_USER', 'isaac')
db_password = os.environ.get('TUTORIAL_DB_PASSWORD') or getpass.getpass('MySQL password: ')
db_host = os.environ.get('TUTORIAL_DB_HOST', 'localhost')
db_name = os.environ.get('TUTORIAL_DB_NAME', 'tutorial')

disk_engine = create_engine(
    'mysql+pymysql://{}:{}@{}/{}'.format(db_user, db_password, db_host, db_name)
)

In [4]:
disk_engine

Engine(mysql+pymysql://isaac:***@localhost/tutorial)

In [5]:
df_p = pd.read_csv('./data/tips.csv')
df_p.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [6]:
df_p.to_sql('tips', disk_engine, if_exists='replace', index=False)

In [7]:
df_sql = pd.read_sql_query('SELECT * FROM tips', disk_engine)
df_sql.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


## Let's try JOIN

In [8]:
# One grade table per subject, both keyed by the student `name`.
# Student E has no physics grade (NaN).
math = pd.DataFrame(dict(
    name=list('ABCDE'),
    math_grade=[1.0, 2.0, 3.0, 4.0, 5.0],
))
physics = pd.DataFrame(dict(
    name=list('ABCDE'),
    physics_grade=[2.0, 3.0, 4.0, 5.0, np.nan],
))

# Write both frames to MySQL, replacing any table left over from an earlier run.
math.to_sql('math', disk_engine, index=False, if_exists='replace')
physics.to_sql('physics', disk_engine, index=False, if_exists='replace')

In [9]:
# `trans`: one row per sale; `items`: the price list.
# Items D and E are priced but never appear in `trans`.
trans = pd.DataFrame(dict(
    date=list(range(1, 6)),
    trans_id=['t1', 't2', 't3', 't4', 't5'],
    item=list('ABCAA'),
    quantity=[1, 1, 2, 1, 1],
))
items = pd.DataFrame(dict(
    item=list('ABCDE'),
    price=[10, 20, 30, 40, 50],
))

# Write both frames to MySQL, replacing any table left over from an earlier run.
trans.to_sql('trans', disk_engine, index=False, if_exists='replace')
items.to_sql('items', disk_engine, index=False, if_exists='replace')

In [10]:
trans

Unnamed: 0,date,item,quantity,trans_id
0,1,A,1,t1
1,2,B,1,t2
2,3,C,2,t3
3,4,A,1,t4
4,5,A,1,t5


In [11]:
items

Unnamed: 0,item,price
0,A,10
1,B,20
2,C,30
3,D,40
4,E,50


For example, show the items that have never been sold. We need every item key (a right outer join onto `items`) and then look for the items that don't have a `trans_id`.

In [12]:
# RIGHT OUTER JOIN keeps every row of `items`, whether or not a transaction
# matches it. Items with no transaction come back with NULL in all `trans`
# columns, so they could be isolated by adding `WHERE trans.trans_id IS NULL`.
query = """
SELECT *
FROM trans
RIGHT OUTER JOIN items
ON trans.item = items.item
"""

pd.read_sql_query(query, disk_engine)

Unnamed: 0,date,item,quantity,trans_id,item.1,price
0,1.0,A,1.0,t1,A,10
1,2.0,B,1.0,t2,B,20
2,3.0,C,2.0,t3,C,30
3,4.0,A,1.0,t4,A,10
4,5.0,A,1.0,t5,A,10
5,,,,,D,40
6,,,,,E,50


## Let's try using the GoJek employee and reviews data

In [23]:
employee = pd.read_csv('data/prob_c_employee.csv')
employee

Unnamed: 0,FirstName,LastName,ID,HireDate,TerminationDate,Salary
0,Bob,Smith,1,2009-06-20,2016-01-01,10000
1,Joe,Jarrod,2,2010-02-12,,20000
2,Nancy,Soley,3,2012-03-14,,30000
3,Keith,Widjaja,4,2013-09-10,2014-01-01,20000
4,Kelly,Smalls,5,2013-09-10,,20000
5,Frank,Nguyen,6,2015-04-10,2015-05-01,60000
6,A,Smith A,7,2015-04-9,2015-0500,60001
7,B,Smith B,8,2015-04-8,2015-0501,60002


In [24]:
reviews = pd.read_csv('data/prob_c_reviews.csv')
reviews

Unnamed: 0,ID,EmpID,ReviewDate
0,10,1,2016-01-01
1,20,2,2016-04-12
2,30,10,2015-02-13
3,40,22,2010-10-12
4,50,11,2009-01-01
5,60,12,2009-03-03
6,70,13,2008-12-01
7,80,1,2003-04-12
8,90,1,2014-04-30


Put them into the database

In [25]:
employee.to_sql('employee', disk_engine, if_exists='replace', index=False)
reviews.to_sql('reviews', disk_engine, if_exists='replace', index=False)

In [56]:
# Employees whose last name starts with "Smith", sorted by last name and then
# first name.
# NOTE(review): the doubled `%%` is presumably there so that the driver's
# %-style parameter formatting leaves a single literal `%` wildcard - confirm.
query =r'''
SELECT *
FROM employee
WHERE LastName LIKE "Smith%%"
ORDER BY LastName, FirstName
'''


df_sql = pd.read_sql_query(query, disk_engine)
df_sql

Unnamed: 0,FirstName,LastName,ID,HireDate,TerminationDate,Salary
0,Bob,Smith,1,2009-06-20,2016-01-01,10000


In [57]:
query = """
SELECT *
FROM employee
"""
df_sql = pd.read_sql_query(query, disk_engine)
df_sql

Unnamed: 0,FirstName,LastName,ID,HireDate,TerminationDate,Salary
0,Bob,Smith,1,2009-06-20,2016-01-01,10000
1,Joe,Jarrod,2,2010-02-12,,20000
2,Nancy,Soley,3,2012-03-14,,30000
3,Keith,Widjaja,4,2013-09-10,2014-01-01,20000
4,Kelly,Smalls,5,2013-09-10,,20000
5,Frank,Nguyen,6,2015-04-10,2015-05-01,60000


In [33]:
query

"\nSELECT *\nFROM employee\nWHERE LastName = 'Smith%'\nORDER BY LastName, FirstName\n"

In [61]:
# LEFT OUTER JOIN keeps every employee row: an employee with several reviews
# appears once per review, and an employee without any matching
# reviews.EmpID gets NULL in the review columns.
query = """
SELECT *
FROM employee
LEFT OUTER JOIN reviews
ON employee.ID = reviews.EmpID
"""

pd.read_sql_query(query, disk_engine)

Unnamed: 0,FirstName,LastName,ID,HireDate,TerminationDate,Salary,ID.1,EmpID,ReviewDate
0,Bob,Smith,1,2009-06-20,2016-01-01,10000,10.0,1.0,2016-01-01
1,Joe,Jarrod,2,2010-02-12,,20000,20.0,2.0,2016-04-12
2,Bob,Smith,1,2009-06-20,2016-01-01,10000,80.0,1.0,2003-04-12
3,Bob,Smith,1,2009-06-20,2016-01-01,10000,90.0,1.0,2014-04-30
4,Nancy,Soley,3,2012-03-14,,30000,,,
5,Keith,Widjaja,4,2013-09-10,2014-01-01,20000,,,
6,Kelly,Smalls,5,2013-09-10,,20000,,,
7,Frank,Nguyen,6,2015-04-10,2015-05-01,60000,,,


In [16]:
# Number of days between the earliest and the latest hire date, looking only
# at employees that have no termination date.
query = """
SELECT DATEDIFF(MAX(HireDate), MIN(HireDate)) dayDifference
FROM employee
WHERE TerminationDate IS NULL
"""
df_sql = pd.read_sql_query(query, disk_engine)
df_sql

Unnamed: 0,dayDifference
0,1306


In [17]:
# All hire dates plus all non-NULL termination dates in a single column.
# UNION (as opposed to UNION ALL) removes duplicate dates; the column takes
# its name from the first SELECT.
query = """
SELECT HireDate FROM employee 
UNION 
SELECT TerminationDate FROM employee WHERE TerminationDate IS NOT NULL
"""
df_sql = pd.read_sql_query(query, disk_engine)
df_sql

Unnamed: 0,HireDate
0,2009-06-20
1,2010-02-12
2,2012-03-14
3,2013-09-10
4,2015-04-10
5,2016-01-01
6,2014-01-01
7,2015-05-01


In [140]:
# Every distinct hire or termination date, oldest first.
# The ORDER BY is placed on the outermost SELECT: ordering rows inside a
# derived table does not guarantee the order of the rows the outer query
# returns, so sorting at the top level is the only reliable way to get a
# chronological result.
query = """
SELECT activityTs
FROM
(SELECT HireDate activityTs
 FROM employee
 UNION
 SELECT TerminationDate FROM employee WHERE TerminationDate IS NOT NULL) t
ORDER BY activityTs
"""
df_sql = pd.read_sql_query(query, disk_engine)
df_sql

Unnamed: 0,activityTs
0,2009-06-20
1,2010-02-12
2,2012-03-14
3,2013-09-10
4,2014-01-01
5,2015-04-10
6,2015-05-01
7,2016-01-01


# I need an answer for question 4, so I need to experiment with some questions from Stack Overflow

First https://stackoverflow.com/questions/20661853/calculate-the-time-difference-between-rows-in-mysql

In [19]:
# Four sample readings; the timestamps are kept as plain strings.
tbl_speed = pd.DataFrame(dict(
    ID=list(range(1, 5)),
    time=[
        '2013-12-17 14:14:04',
        '2013-12-17 14:14:09',
        '2013-12-17 14:14:15',
        '2013-12-17 14:28:52',
    ],
))

In [20]:
tbl_speed.head()

Unnamed: 0,ID,time
0,1,2013-12-17 14:14:04
1,2,2013-12-17 14:14:09
2,3,2013-12-17 14:14:15
3,4,2013-12-17 14:28:52


In [21]:
tbl_speed.to_sql('tbl_speed', disk_engine, if_exists='replace', index=False)

In [22]:
query = """
SELECT *
FROM tbl_speed
"""
df_sql = pd.read_sql_query(query, disk_engine)
df_sql

Unnamed: 0,ID,time
0,1,2013-12-17 14:14:04
1,2,2013-12-17 14:14:09
2,3,2013-12-17 14:14:15
3,4,2013-12-17 14:28:52


In [115]:
# Seconds elapsed since the previous reading.
# For each row of `t`, the correlated subquery finds the latest time that is
# strictly earlier than the row's own time; TIMESTAMPDIFF then measures the
# distance between the two. The earliest row has no predecessor, so the
# subquery yields NULL and so does secdiff.
query = """
SELECT ID, time,
 TIMESTAMPDIFF(SECOND, (SELECT MAX(time) FROM tbl_speed WHERE time<t.time), t.time) secdiff
FROM tbl_speed AS t
"""
df_sql = pd.read_sql_query(query, disk_engine)
df_sql

Unnamed: 0,ID,time,secdiff
0,1,2013-12-17 14:14:04,
1,2,2013-12-17 14:14:09,5.0
2,3,2013-12-17 14:14:15,6.0
3,4,2013-12-17 14:28:52,877.0


Or, more explicitly, with every column qualified by its table name:

In [114]:
# Same query as the previous cell, with every column qualified: `tbl_speed.`
# for the table scanned inside the subquery and `t.` for the outer row it is
# compared against.
query = """
SELECT t.ID, t.time,
 TIMESTAMPDIFF(SECOND, (SELECT MAX(tbl_speed.time) FROM tbl_speed WHERE tbl_speed.time<t.time), t.time) secdiff
FROM tbl_speed AS t
"""
df_sql = pd.read_sql_query(query, disk_engine)
df_sql

Unnamed: 0,ID,time,secdiff
0,1,2013-12-17 14:14:04,
1,2,2013-12-17 14:14:09,5.0
2,3,2013-12-17 14:14:15,6.0
3,4,2013-12-17 14:28:52,877.0


Let's try to break it down. To illustrate what happens, I need to make two identical tables, t1 and t2.

In [125]:
# Two separate frames holding exactly the same rows, so that the outer table
# (t1) and the table scanned by the subquery (t2) can be told apart by name.
t1 = pd.DataFrame(dict(
    id=[1, 2, 3, 4],
    time=['2013-01-15', '2013-01-17', '2013-01-20', '2013-01-24'],
))

# Independent copy: same content, different object.
t2 = t1.copy()

In [126]:
t1.to_sql('t1', disk_engine, if_exists='replace', index=False)
t2.to_sql('t2', disk_engine, if_exists='replace', index=False)

In [127]:
# For every row of t1, look up in t2 the latest time strictly before that
# row's time and report the distance in days. The first row has no earlier
# time in t2, so its diff is NULL.
query = """
SELECT t1.id, t1.time,
 TIMESTAMPDIFF(DAY, (SELECT MAX(t2.time) FROM t2 WHERE t2.time < t1.time), t1.time) AS diff
FROM t1
"""
df_sql = pd.read_sql_query(query, disk_engine)
df_sql

Unnamed: 0,id,time,diff
0,1,2013-01-15,
1,2,2013-01-17,2.0
2,3,2013-01-20,3.0
3,4,2013-01-24,4.0


In [138]:
query = """
SELECT
 TIMESTAMPDIFF(DAY, (SELECT MAX(t2.time) FROM t2 WHERE t2.time < t1.time), t1.time) AS diff
FROM t1
"""
df_sql = pd.read_sql_query(query, disk_engine)
df_sql

Unnamed: 0,diff
0,
1,2.0
2,3.0
3,4.0


In [136]:
# Wrap the per-row day differences in a derived table (x) and take the
# largest one. MAX skips the NULL produced for the first row.
query = """
SELECT MAX(diff)
FROM
(SELECT TIMESTAMPDIFF(DAY, (SELECT MAX(t2.time) FROM t2 WHERE t2.time < t1.time), t1.time) AS diff
 FROM t1) x
"""
df_sql = pd.read_sql_query(query, disk_engine)
df_sql

Unnamed: 0,MAX(diff)
0,4


In [128]:
# The inner subquery on its own.
# Inside the full query it carries the predicate `t2.time < t1.time`, which
# refers to the current row of the outer table t1. That predicate cannot be
# kept here because t1 is not part of this statement's FROM clause, so the
# query would fail with an unknown-column error. Without it, MAX simply
# returns the latest time in t2. To mimic one outer row, compare against a
# literal instead, e.g. `WHERE t2.time < '2013-01-20'`.
query = """
SELECT MAX(t2.time)
FROM t2
"""
df_sql = pd.read_sql_query(query, disk_engine)
df_sql

Unnamed: 0,MAX(t2.time)
0,2013-01-24


For our case

In [142]:
# Longest gap, in days, between two consecutive hire/termination events.
#
# The list of event dates is needed twice: once as the outer rows (t1) and
# once as the table searched for the previous event (t2). It is written a
# single time and spliced into the query with str.format instead of being
# pasted twice. The ORDER BY that used to sit inside each copy is dropped: it
# has no influence on MAX.
activity_sql = """
SELECT HireDate activityTs
FROM employee
UNION
SELECT TerminationDate FROM employee WHERE TerminationDate IS NOT NULL
"""

query = """
SELECT MAX(activityDiff)
FROM (
    SELECT TIMESTAMPDIFF(DAY,
                         (SELECT MAX(t2.activityTs)
                          FROM ({activity}) AS t2
                          WHERE t2.activityTs < t1.activityTs),
                         t1.activityTs) activityDiff
    FROM ({activity}) AS t1
) AS t_activityDiff
""".format(activity=activity_sql)
df_sql = pd.read_sql_query(query, disk_engine)
df_sql

Unnamed: 0,MAX(activityDiff)
0,761


In [143]:
# Gap, in days, between each hire/termination event and the one before it,
# listed in chronological order of the events.
#
# The event list is written once and spliced in twice with str.format (outer
# rows t1, look-up table t2). The rows are sorted by an ORDER BY on the
# outermost SELECT, because an ORDER BY inside a derived table does not
# guarantee the order of the final result. The first event has no
# predecessor, so its gap is NULL.
activity_sql = """
SELECT HireDate activityTs
FROM employee
UNION
SELECT TerminationDate FROM employee WHERE TerminationDate IS NOT NULL
"""

query = """
SELECT TIMESTAMPDIFF(DAY,
                     (SELECT MAX(t2.activityTs)
                      FROM ({activity}) AS t2
                      WHERE t2.activityTs < t1.activityTs),
                     t1.activityTs) activityDiff
FROM ({activity}) AS t1
ORDER BY t1.activityTs
""".format(activity=activity_sql)
df_sql = pd.read_sql_query(query, disk_engine)
df_sql

Unnamed: 0,activityDiff
0,
1,237.0
2,761.0
3,545.0
4,113.0
5,464.0
6,21.0
7,245.0


In [157]:
# Alternative without a correlated subquery: pair every event date (x) with
# all strictly later dates (y). Per x, MIN(y.date) is then the next event and
# DATEDIFF the gap in days; ORDER BY days DESC LIMIT 1 keeps the widest gap.
# NOTE(review): y also contains CURDATE(), so the gap after the last event is
# measured up to the day the cell is run and the result can change over time.
query = """
 SELECT x.date, MIN(y.date) y_date,DATEDIFF(MIN(y.date),x.date) days
FROM
(
SELECT HireDate date FROM employee
UNION 
SELECT TerminationDate FROM employee
) x
JOIN
(
SELECT HireDate date FROM employee
UNION 
SELECT TerminationDate FROM employee
UNION
SELECT CURDATE())
y
ON y.date > x.date
GROUP BY x.date
ORDER BY days DESC LIMIT 1;
"""
df_sql = pd.read_sql_query(query, disk_engine)
df_sql

Unnamed: 0,date,y_date,days
0,2010-02-12,2012-03-14,761


# Again using the same tables, write a query that returns each employee and for each row/employee include the greatest number of employees that worked for the company at any time during their tenure and the first date that maximum was reached. Extra points for not using cursors.

In [144]:
query = """
SELECT *
FROM employee
"""
df_sql = pd.read_sql_query(query, disk_engine)
df_sql

Unnamed: 0,FirstName,LastName,ID,HireDate,TerminationDate,Salary
0,Bob,Smith,1,2009-06-20,2016-01-01,10000
1,Joe,Jarrod,2,2010-02-12,,20000
2,Nancy,Soley,3,2012-03-14,,30000
3,Keith,Widjaja,4,2013-09-10,2014-01-01,20000
4,Kelly,Smalls,5,2013-09-10,,20000
5,Frank,Nguyen,6,2015-04-10,2015-05-01,60000


In [206]:
# Head-count at every event date.
# The derived table `t` lists each distinct hire or termination date (ts).
# An employee row is joined to a date when the employee was hired on or
# before it and is either still employed (TerminationDate IS NULL) or was
# terminated strictly after it, so COUNT(1) per ts is the number of people
# employed as of that date. The result is kept in `employee_count`.
query = """
 SELECT t.ts     AS as_of
      , COUNT(1) AS emp_count
   FROM employee e
   JOIN ( SELECT t.TerminationDate AS ts
            FROM employee t
           WHERE t.TerminationDate IS NOT NULL
           GROUP BY t.TerminationDate
           UNION
          SELECT h.HireDate AS ts
            FROM employee h
           WHERE h.HireDate IS NOT NULL
           GROUP BY h.HireDate
        ) t
     ON ( t.ts >= e.HireDate )
    AND ( t.ts <  e.TerminationDate OR e.TerminationDate IS NULL)
  GROUP BY t.ts
"""
df_sql = pd.read_sql_query(query, disk_engine)
employee_count = df_sql
df_sql

Unnamed: 0,as_of,emp_count
0,2009-06-20,1
1,2010-02-12,2
2,2012-03-14,3
3,2013-09-10,5
4,2014-01-01,4
5,2015-04-10,5
6,2015-05-01,4
7,2016-01-01,3


In [207]:
employee_count.to_sql('employee_count', disk_engine, if_exists='replace', index=False)

In [208]:
query = """
SELECT *
FROM employee_count
"""
df_sql = pd.read_sql_query(query, disk_engine)
df_sql

Unnamed: 0,as_of,emp_count
0,2009-06-20,1
1,2010-02-12,2
2,2012-03-14,3
3,2013-09-10,5
4,2014-01-01,4
5,2015-04-10,5
6,2015-05-01,4
7,2016-01-01,3


Try to Join

In [220]:
# INNER JOIN written without an ON clause: every employee row is paired with
# every employee_count row (in the output the same employee repeats once per
# as_of date). The pairs are filtered down to the tenure in the next step.
query = """
SELECT *
FROM employee INNER JOIN employee_count
ORDER BY ID
"""
df_sql = pd.read_sql_query(query, disk_engine)
df_sql

Unnamed: 0,FirstName,LastName,ID,HireDate,TerminationDate,Salary,as_of,emp_count
0,Bob,Smith,1,2009-06-20,2016-01-01,10000,2009-06-20,1
1,Bob,Smith,1,2009-06-20,2016-01-01,10000,2010-02-12,2
2,Bob,Smith,1,2009-06-20,2016-01-01,10000,2012-03-14,3
3,Bob,Smith,1,2009-06-20,2016-01-01,10000,2013-09-10,5
4,Bob,Smith,1,2009-06-20,2016-01-01,10000,2014-01-01,4
5,Bob,Smith,1,2009-06-20,2016-01-01,10000,2015-04-10,5
6,Bob,Smith,1,2009-06-20,2016-01-01,10000,2015-05-01,4
7,Bob,Smith,1,2009-06-20,2016-01-01,10000,2016-01-01,3
8,Joe,Jarrod,2,2010-02-12,,20000,2009-06-20,1
9,Joe,Jarrod,2,2010-02-12,,20000,2010-02-12,2


Filter the table above so that it only displays the rows that meet this condition: `HireDate <= as_of <= TerminationDate` (a NULL `TerminationDate` means there is no upper bound).

In [255]:
# Keep only the employee/date pairs whose as_of date falls inside the
# employee's tenure: on or after HireDate and on or before TerminationDate,
# with no upper bound when TerminationDate is NULL.
query = """
SELECT *
FROM
(
 SELECT *
 FROM employee INNER JOIN employee_count
 ORDER BY ID
) x
WHERE as_of >= HireDate AND (as_of <= TerminationDate OR TerminationDate IS NULL)
ORDER BY ID
"""
df_sql = pd.read_sql_query(query, disk_engine)
df_sql

Unnamed: 0,FirstName,LastName,ID,HireDate,TerminationDate,Salary,as_of,emp_count
0,Bob,Smith,1,2009-06-20,2016-01-01,10000,2009-06-20,1
1,Bob,Smith,1,2009-06-20,2016-01-01,10000,2010-02-12,2
2,Bob,Smith,1,2009-06-20,2016-01-01,10000,2012-03-14,3
3,Bob,Smith,1,2009-06-20,2016-01-01,10000,2013-09-10,5
4,Bob,Smith,1,2009-06-20,2016-01-01,10000,2014-01-01,4
5,Bob,Smith,1,2009-06-20,2016-01-01,10000,2015-04-10,5
6,Bob,Smith,1,2009-06-20,2016-01-01,10000,2015-05-01,4
7,Bob,Smith,1,2009-06-20,2016-01-01,10000,2016-01-01,3
8,Joe,Jarrod,2,2010-02-12,,20000,2010-02-12,2
9,Joe,Jarrod,2,2010-02-12,,20000,2012-03-14,3


In [271]:
# Per employee ID, the largest head-count among the event dates that fall
# inside that employee's tenure. Only the ID and the maximum are returned,
# not the date on which the maximum was reached.
query = """
SELECT ID, MAX(emp_count)
FROM
(
 SELECT *
 FROM employee INNER JOIN employee_count
 ORDER BY ID
) x
WHERE as_of >= HireDate AND (as_of <= TerminationDate OR TerminationDate IS NULL)
GROUP BY ID
"""
df_sql = pd.read_sql_query(query, disk_engine)
df_sql

Unnamed: 0,ID,MAX(emp_count)
0,1,5
1,2,5
2,3,5
3,4,5
4,5,5
5,6,5


But I need the whole row of that GROUP BY:

https://stackoverflow.com/questions/7745609/sql-select-only-rows-with-max-value-on-a-column

Let's just save the table again so that I can follow the instructions

In [272]:
# Same tenure-filtered employee/date pairs as before, this time kept in
# `join_employee_count` so the frame can be written to the database.
query = """
SELECT *
FROM
(
 SELECT *
 FROM employee INNER JOIN employee_count
 ORDER BY ID
) x
WHERE as_of >= HireDate AND (as_of <= TerminationDate OR TerminationDate IS NULL)
ORDER BY ID
"""
df_sql = pd.read_sql_query(query, disk_engine)
join_employee_count = df_sql

In [273]:
join_employee_count.to_sql('join_employee_count', disk_engine, if_exists='replace', index=False)

In [275]:
query = """
SELECT *
FROM join_employee_count
"""
df_sql = pd.read_sql_query(query, disk_engine)
df_sql

Unnamed: 0,FirstName,LastName,ID,HireDate,TerminationDate,Salary,as_of,emp_count
0,Bob,Smith,1,2009-06-20,2016-01-01,10000,2009-06-20,1
1,Bob,Smith,1,2009-06-20,2016-01-01,10000,2010-02-12,2
2,Bob,Smith,1,2009-06-20,2016-01-01,10000,2012-03-14,3
3,Bob,Smith,1,2009-06-20,2016-01-01,10000,2013-09-10,5
4,Bob,Smith,1,2009-06-20,2016-01-01,10000,2014-01-01,4
5,Bob,Smith,1,2009-06-20,2016-01-01,10000,2015-04-10,5
6,Bob,Smith,1,2009-06-20,2016-01-01,10000,2015-05-01,4
7,Bob,Smith,1,2009-06-20,2016-01-01,10000,2016-01-01,3
8,Joe,Jarrod,2,2010-02-12,,20000,2010-02-12,2
9,Joe,Jarrod,2,2010-02-12,,20000,2012-03-14,3


In [278]:
query = """
SELECT ID, MAX(emp_count)
FROM join_employee_count
GROUP BY ID
"""
df_sql = pd.read_sql_query(query, disk_engine)
df_sql

Unnamed: 0,ID,MAX(emp_count)
0,1,5
1,2,5
2,3,5
3,4,5
4,5,5
5,6,5


Joining with a simple group-identifier, max-value-in-group sub-query

First find the group identifier and the max value in each group. Then join your table to that sub-query with equality on both columns.

In [285]:
# Group-wise maximum, approach 1: compute MAX(emp_count) per ID in a
# sub-query (b) and join the table back to it on both ID and emp_count.
# Every row that reaches its group's maximum is kept, so an employee shows up
# once per date on which the maximum was reached. SELECT * also returns b's
# two columns, hence the duplicated ID/emp_count columns in the output.
query = """
SELECT *
FROM join_employee_count a
INNER JOIN (
    SELECT ID, MAX(emp_count) emp_count
    FROM join_employee_count
    GROUP BY ID
)b
ON a.ID = b.ID AND a.emp_count = b.emp_count
"""
df_sql = pd.read_sql_query(query, disk_engine)
df_sql

Unnamed: 0,FirstName,LastName,ID,HireDate,TerminationDate,Salary,as_of,emp_count,ID.1,emp_count.1
0,Bob,Smith,1,2009-06-20,2016-01-01,10000,2013-09-10,5,1,5
1,Bob,Smith,1,2009-06-20,2016-01-01,10000,2015-04-10,5,1,5
2,Joe,Jarrod,2,2010-02-12,,20000,2013-09-10,5,2,5
3,Joe,Jarrod,2,2010-02-12,,20000,2015-04-10,5,2,5
4,Nancy,Soley,3,2012-03-14,,30000,2013-09-10,5,3,5
5,Nancy,Soley,3,2012-03-14,,30000,2015-04-10,5,3,5
6,Keith,Widjaja,4,2013-09-10,2014-01-01,20000,2013-09-10,5,4,5
7,Kelly,Smalls,5,2013-09-10,,20000,2013-09-10,5,5,5
8,Kelly,Smalls,5,2013-09-10,,20000,2015-04-10,5,5,5
9,Frank,Nguyen,6,2015-04-10,2015-05-01,60000,2015-04-10,5,6,5


The second solution:
Left Joining with self, tweaking join conditions and filters

In this approach, you left join the table with itself. Equality, of course, goes in the group-identifier. Then, 2 smart moves:
- The second join condition is having left side value less than right value
- When you do step 1, the row(s) that actually have the max will have NULL in the right side (it's LEFT JOIN, remember?). Then, we filter the joined result, showing only the rows where the right side is NULL.

In [339]:
# Group-wise maximum, approach 2: left-join the table to itself on the same
# id and a strictly larger emp_count. A row that already holds its group's
# maximum finds no such partner, so its b side is all NULL; the WHERE clause
# keeps exactly those rows. Ties are all kept.
query = """
SELECT a.*
FROM join_employee_count a
LEFT OUTER JOIN join_employee_count b
ON a.id = b.id AND a.emp_count < b.emp_count
WHERE b.id IS NULL
"""
df_sql = pd.read_sql_query(query, disk_engine)
df_sql

Unnamed: 0,FirstName,LastName,ID,HireDate,TerminationDate,Salary,as_of,emp_count
0,Bob,Smith,1,2009-06-20,2016-01-01,10000,2013-09-10,5
1,Bob,Smith,1,2009-06-20,2016-01-01,10000,2015-04-10,5
2,Joe,Jarrod,2,2010-02-12,,20000,2013-09-10,5
3,Joe,Jarrod,2,2010-02-12,,20000,2015-04-10,5
4,Nancy,Soley,3,2012-03-14,,30000,2013-09-10,5
5,Nancy,Soley,3,2012-03-14,,30000,2015-04-10,5
6,Keith,Widjaja,4,2013-09-10,2014-01-01,20000,2013-09-10,5
7,Kelly,Smalls,5,2013-09-10,,20000,2013-09-10,5
8,Kelly,Smalls,5,2013-09-10,,20000,2015-04-10,5
9,Frank,Nguyen,6,2015-04-10,2015-05-01,60000,2015-04-10,5


In [333]:
# First date on which each employee's maximum head-count was reached.
# Derived table `a` holds the rows at the per-employee maximum (self left
# join, keep rows without a larger partner). Derived table `b` repeats that
# selection and reduces it to MIN(as_of) per ID. Joining a to b on ID and
# as_of leaves one row per employee: the earliest date at the maximum.
query = """
SELECT a.*
FROM

(
 SELECT a.*
 FROM join_employee_count a
 LEFT OUTER JOIN join_employee_count b
 ON a.id = b.id AND a.emp_count < b.emp_count
 WHERE b.id IS NULL
 ) a
 
INNER JOIN (
    SELECT ID, MIN(as_of) as_of
    FROM (
             SELECT a.*
             FROM join_employee_count a
             LEFT OUTER JOIN join_employee_count b
             ON a.id = b.id AND a.emp_count < b.emp_count
             WHERE b.id IS NULL
         ) x
    GROUP BY ID
 ) b
 ON a.ID = b.ID AND a.as_of = b.as_of
"""
df_sql = pd.read_sql_query(query, disk_engine)
df_sql

Unnamed: 0,FirstName,LastName,ID,HireDate,TerminationDate,Salary,as_of,emp_count
0,Bob,Smith,1,2009-06-20,2016-01-01,10000,2013-09-10,5
1,Joe,Jarrod,2,2010-02-12,,20000,2013-09-10,5
2,Nancy,Soley,3,2012-03-14,,30000,2013-09-10,5
3,Keith,Widjaja,4,2013-09-10,2014-01-01,20000,2013-09-10,5
4,Kelly,Smalls,5,2013-09-10,,20000,2013-09-10,5
5,Frank,Nguyen,6,2015-04-10,2015-05-01,60000,2015-04-10,5


In [362]:
# Same result using the self-left-join trick twice.
# Both derived tables select the rows at the per-employee maximum head-count.
# The outer left join then looks, for each such row, for another one of the
# same employee with an earlier as_of; rows without an earlier partner
# (b.id IS NULL) are the first dates on which the maximum was reached.
query = """
SELECT a.*
FROM (
      SELECT a.*
      FROM join_employee_count a
      LEFT OUTER JOIN join_employee_count b
      ON a.id = b.id AND a.emp_count < b.emp_count
      WHERE b.id IS NULL) a
LEFT OUTER JOIN (
      SELECT a.*
      FROM join_employee_count a
      LEFT OUTER JOIN join_employee_count b
      ON a.id = b.id AND a.emp_count < b.emp_count
      WHERE b.id IS NULL) b
ON a.id = b.id AND a.as_of > b.as_of
WHERE b.id IS NULL
"""
df_sql = pd.read_sql_query(query, disk_engine)
df_sql

Unnamed: 0,FirstName,LastName,ID,HireDate,TerminationDate,Salary,as_of,emp_count
0,Bob,Smith,1,2009-06-20,2016-01-01,10000,2013-09-10,5
1,Joe,Jarrod,2,2010-02-12,,20000,2013-09-10,5
2,Nancy,Soley,3,2012-03-14,,30000,2013-09-10,5
3,Keith,Widjaja,4,2013-09-10,2014-01-01,20000,2013-09-10,5
4,Kelly,Smalls,5,2013-09-10,,20000,2013-09-10,5
5,Frank,Nguyen,6,2015-04-10,2015-05-01,60000,2015-04-10,5


# SUMS UP

First, we need a table that contains the number of employees at every event time. We have already worked with event times (we called them activity times). We end up with the following:

https://stackoverflow.com/questions/25873630/advanced-mysql-query-select-and-compare-time-in-one-field

In [380]:
# Head-count per event date: join each employee to every hire/termination
# date (ts) that lies on or after the hire date and strictly before the
# termination date (or with no termination date at all), then count the
# joined employees per ts.
query = """
 SELECT t.ts as_of, COUNT(1) emp_count
   FROM employee e
   JOIN ( SELECT t.TerminationDate ts
          FROM employee t
          WHERE t.TerminationDate IS NOT NULL
          GROUP BY t.TerminationDate
          UNION
          SELECT h.HireDate ts
          FROM employee h
          WHERE h.HireDate IS NOT NULL
          GROUP BY h.HireDate
        ) t
   ON ( t.ts >= e.HireDate ) AND ( t.ts <  e.TerminationDate OR e.TerminationDate IS NULL)
   GROUP BY t.ts
"""
df_sql = pd.read_sql_query(query, disk_engine)
df_sql

Unnamed: 0,as_of,emp_count
0,2009-06-20,1
1,2010-02-12,2
2,2012-03-14,3
3,2013-09-10,5
4,2014-01-01,4
5,2015-04-10,5
6,2015-05-01,4
7,2016-01-01,3


But how does it work?

This is how we get the event (activity) times:

In [366]:
query = """
SELECT t.TerminationDate ts
FROM employee t
WHERE t.TerminationDate IS NOT NULL
GROUP BY t.TerminationDate
UNION
SELECT h.HireDate ts
FROM employee h
WHERE h.HireDate IS NOT NULL
GROUP BY h.HireDate
"""
df_sql = pd.read_sql_query(query, disk_engine)
df_sql

Unnamed: 0,ts
0,2014-01-01
1,2015-05-01
2,2016-01-01
3,2009-06-20
4,2010-02-12
5,2012-03-14
6,2013-09-10
7,2015-04-10


In [367]:
query = """
SELECT TerminationDate ts
FROM employee
WHERE TerminationDate IS NOT NULL
UNION
SELECT HireDate ts
FROM employee
WHERE HireDate IS NOT NULL

"""
df_sql = pd.read_sql_query(query, disk_engine)
df_sql

Unnamed: 0,ts
0,2016-01-01
1,2014-01-01
2,2015-05-01
3,2009-06-20
4,2010-02-12
5,2012-03-14
6,2013-09-10
7,2015-04-10


We join the employee table with the activity timestamps (`ts`) on these conditions:
- activity ts >= hire date, and
- activity ts < termination date, OR the termination date is NULL



A __join__ matches rows of two tables on some condition and merges every matching pair into one output row. There are several kinds of join; a bare `JOIN` means `INNER JOIN`. Every pair of rows for which the condition is true appears in the result.

In [383]:
# The join before aggregation: one output row per (employee, event date)
# pair for which the employee was employed on that date, i.e. hired on or
# before ts and terminated strictly after ts or not terminated at all.
query = """
SELECT *
FROM employee e
JOIN (
    SELECT TerminationDate ts
    FROM employee
    WHERE TerminationDate IS NOT NULL
    UNION
    SELECT HireDate ts
    FROM employee
    WHERE HireDate IS NOT NULL
) t
ON ( t.ts >= e.HireDate ) AND ( t.ts <  e.TerminationDate OR e.TerminationDate IS NULL)
"""
df_sql = pd.read_sql_query(query, disk_engine)
df_sql

Unnamed: 0,FirstName,LastName,ID,HireDate,TerminationDate,Salary,ts
0,Joe,Jarrod,2,2010-02-12,,20000,2016-01-01
1,Nancy,Soley,3,2012-03-14,,30000,2016-01-01
2,Kelly,Smalls,5,2013-09-10,,20000,2016-01-01
3,Bob,Smith,1,2009-06-20,2016-01-01,10000,2014-01-01
4,Joe,Jarrod,2,2010-02-12,,20000,2014-01-01
5,Nancy,Soley,3,2012-03-14,,30000,2014-01-01
6,Kelly,Smalls,5,2013-09-10,,20000,2014-01-01
7,Bob,Smith,1,2009-06-20,2016-01-01,10000,2015-05-01
8,Joe,Jarrod,2,2010-02-12,,20000,2015-05-01
9,Nancy,Soley,3,2012-03-14,,30000,2015-05-01


In [82]:
# Same join, now grouped by event date: counting the joined employee rows per
# ts gives the head-count as of that date.
query = """
SELECT t.ts as_of, COUNT(1) emp_count
FROM employee e
JOIN (
    SELECT TerminationDate ts
    FROM employee
    WHERE TerminationDate IS NOT NULL
    UNION
    SELECT HireDate ts
    FROM employee
    WHERE HireDate IS NOT NULL
) t
ON ( t.ts >= e.HireDate ) AND ( t.ts <  e.TerminationDate OR e.TerminationDate IS NULL)
GROUP BY t.ts
"""
df_sql = pd.read_sql_query(query, disk_engine)
df_sql

Unnamed: 0,as_of,emp_count
0,2009-06-20,1
1,2010-02-12,2
2,2012-03-14,3
3,2013-09-10,5
4,2014-01-01,4
5,2015-04-10,5
6,2015-05-01,4
7,2016-01-01,3


All in one

In [110]:
# Tenure-filtered employee/date pairs without the intermediate table: the
# head-count query is inlined as the derived table `employee_count`, paired
# with every employee (join without ON), and the outer WHERE keeps the dates
# that fall inside each employee's tenure.
query = """
SELECT *
FROM
(
 SELECT *
 FROM employee INNER JOIN (SELECT t.ts as_of, COUNT(1) emp_count
                           FROM employee e
                           JOIN (SELECT t.TerminationDate AS ts
                                 FROM employee t
                                 WHERE t.TerminationDate IS NOT NULL
                                 GROUP BY t.TerminationDate
                                 UNION
                                 SELECT h.HireDate AS ts
                                 FROM employee h
                                 WHERE h.HireDate IS NOT NULL
                                 GROUP BY h.HireDate
                                )t
                           ON ( t.ts >= e.HireDate )
                           AND ( t.ts <  e.TerminationDate OR e.TerminationDate IS NULL)
                           GROUP BY t.ts)employee_count
                           ORDER BY ID
) x
WHERE as_of >= HireDate AND (as_of <= TerminationDate OR TerminationDate IS NULL)
ORDER BY ID
"""
df_sql = pd.read_sql_query(query, disk_engine)
df_sql

Unnamed: 0,FirstName,LastName,ID,HireDate,TerminationDate,Salary,as_of,emp_count
0,Bob,Smith,1,2009-06-20,2016-01-01,10000,2009-06-20,1
1,Bob,Smith,1,2009-06-20,2016-01-01,10000,2010-02-12,2
2,Bob,Smith,1,2009-06-20,2016-01-01,10000,2012-03-14,3
3,Bob,Smith,1,2009-06-20,2016-01-01,10000,2013-09-10,5
4,Bob,Smith,1,2009-06-20,2016-01-01,10000,2014-01-01,4
5,Bob,Smith,1,2009-06-20,2016-01-01,10000,2015-04-10,5
6,Bob,Smith,1,2009-06-20,2016-01-01,10000,2015-05-01,4
7,Bob,Smith,1,2009-06-20,2016-01-01,10000,2016-01-01,3
8,Joe,Jarrod,2,2010-02-12,,20000,2010-02-12,2
9,Joe,Jarrod,2,2010-02-12,,20000,2012-03-14,3


In [112]:
# All in one: for every employee, the greatest head-count during their tenure
# and the first date on which it was reached, computed from the `employee`
# table alone (no dependency on the saved `employee_count` or
# `join_employee_count` tables).
#
# The statement is assembled from four layers with str.format so that each
# sub-query is written once instead of being pasted several times.

# 1. Head-count at every hire/termination date. An employee counts from the
#    hire date up to, but not including, the termination date. UNION already
#    removes duplicate dates.
headcount_sql = """
SELECT t.ts AS as_of, COUNT(1) AS emp_count
FROM employee e
JOIN (SELECT TerminationDate AS ts
      FROM employee
      WHERE TerminationDate IS NOT NULL
      UNION
      SELECT HireDate AS ts
      FROM employee
      WHERE HireDate IS NOT NULL
     ) t
ON t.ts >= e.HireDate
AND (t.ts < e.TerminationDate OR e.TerminationDate IS NULL)
GROUP BY t.ts
"""

# 2. Each employee paired with the event dates inside their tenure
#    (HireDate <= as_of <= TerminationDate, open-ended when not terminated).
tenure_sql = """
SELECT e.*, c.as_of, c.emp_count
FROM employee e
JOIN ({headcount}) c
ON c.as_of >= e.HireDate
AND (c.as_of <= e.TerminationDate OR e.TerminationDate IS NULL)
""".format(headcount=headcount_sql)

# 3. Rows at the per-employee maximum head-count: a row is kept when no other
#    row of the same employee has a strictly larger emp_count.
peak_sql = """
SELECT a.*
FROM ({tenure}) a
LEFT OUTER JOIN ({tenure}) b
ON a.ID = b.ID AND a.emp_count < b.emp_count
WHERE b.ID IS NULL
""".format(tenure=tenure_sql)

# 4. Of those, the earliest date per employee: a row is kept when no other
#    peak row of the same employee has an earlier as_of. The final ORDER BY
#    makes the row order deterministic.
query = """
SELECT a.*
FROM ({peak}) a
LEFT OUTER JOIN ({peak}) b
ON a.ID = b.ID AND a.as_of > b.as_of
WHERE b.ID IS NULL
ORDER BY a.ID
""".format(peak=peak_sql)
df_sql = pd.read_sql_query(query, disk_engine)
df_sql

Unnamed: 0,FirstName,LastName,ID,HireDate,TerminationDate,Salary,as_of,emp_count
0,Bob,Smith,1,2009-06-20,2016-01-01,10000,2013-09-10,5
1,Joe,Jarrod,2,2010-02-12,,20000,2013-09-10,5
2,Nancy,Soley,3,2012-03-14,,30000,2013-09-10,5
3,Keith,Widjaja,4,2013-09-10,2014-01-01,20000,2013-09-10,5
4,Kelly,Smalls,5,2013-09-10,,20000,2013-09-10,5
5,Frank,Nguyen,6,2015-04-10,2015-05-01,60000,2015-04-10,5
