# Pandas PandaSQL

In [1]:
import numpy as np
import pandas as pd
import datetime as dt

In [2]:
import pandasql as ps

In [3]:
# Display configuration: cap the rendered width of each column at 50 characters.
# (The number of rendered rows can be tuned the same way via "display.max_rows".)
pd.options.display.max_colwidth = 50

### Example 1

In [4]:
from pandasql import sqldf, load_meat, load_births


def pysqldf(q):
    """Run the SQL query ``q`` through pandasql against this notebook's globals.

    Parameters
    ----------
    q : str
        SQL text. Table names in the query (e.g. ``meat``, ``birth``) refer to
        DataFrames bound to names in the notebook's global namespace.

    Returns
    -------
    The value returned by ``sqldf(q, globals())``.
    """
    return sqldf(q, globals())

In [5]:
# Load the two sample datasets that pandasql provides loaders for.
meat, birth = load_meat(), load_births()

In [6]:
# Fetch the first ten rows of the meat table, then display the top three.
meat_sample = pysqldf("SELECT * FROM meat LIMIT 10;")
meat_sample.head(3)

Unnamed: 0,date,beef,veal,pork,lamb_and_mutton,broilers,other_chicken,turkey
0,1944-01-01 00:00:00.000000,751.0,85.0,1280.0,89.0,,,
1,1944-02-01 00:00:00.000000,713.0,77.0,1169.0,72.0,,,
2,1944-03-01 00:00:00.000000,741.0,90.0,1128.0,75.0,,,


In [7]:
# Fetch the first ten rows of the birth table, then display the top three.
birth_sample = pysqldf("SELECT * FROM birth LIMIT 10;")
birth_sample.head(3)

Unnamed: 0,date,births
0,1975-01-01 00:00:00.000000,265775
1,1975-02-01 00:00:00.000000,241045
2,1975-03-01 00:00:00.000000,268849


In [8]:
# Inner-join the meat and birth tables on their shared date column,
# keeping only the date, beef and births columns.
q = """
    SELECT
    m.date, m.beef, b.births
    FROM
    meat m
    INNER JOIN
    birth b
    ON m.date = b.date;
    """
beef_births = pysqldf(q)
beef_births.head(3)

Unnamed: 0,date,beef,births
0,1975-01-01 00:00:00.000000,2106.0,265775
1,1975-02-01 00:00:00.000000,1845.0,241045
2,1975-03-01 00:00:00.000000,1891.0,268849


### Example 2

In [9]:
# Number of evenly spaced dates to generate per year, plus the first and last
# day of each candidate year.
dataPoints = [20,29,49,76,106,125]
date1 = ['2014-01-01','2015-01-01','2016-01-01','2017-01-01','2018-01-01','2019-01-01','2020-01-01']
date2 = ['2014-12-31','2015-12-31','2016-12-31','2017-12-31','2018-12-31','2019-12-31','2020-12-31']

# zip() stops at the shortest list, so the number of years processed follows
# dataPoints instead of a hard-coded range(6); the trailing 2020 entries are
# ignored because no count is given for them.
theDates = [
    stamp
    for first_day, last_day, n_points in zip(date1, date2, dataPoints)
    for stamp in pd.date_range(start=first_day, end=last_day, periods=n_points)
]

strDates = [stamp.strftime('%Y-%m-%d') for stamp in theDates]
# One consecutive id per generated date, starting at 100. Deriving the upper
# bound from len(strDates) keeps both columns the same length if dataPoints changes.
hostid = list(range(100, 100 + len(strDates)))

In [10]:
# Assemble the hosts table: one row per host id with its host_since date string.
df = pd.DataFrame(data={"host_since": strDates, "host_id": hostid})
df.head()

Unnamed: 0,host_since,host_id
0,2014-01-01,100
1,2014-01-20,101
2,2014-02-08,102
3,2014-02-27,103
4,2014-03-18,104


Create year column

In [11]:
# Add an integer `year` column taken from the first four characters of host_since.
# SUBSTR is used rather than SUBSTRING: SQLite only accepts the SUBSTRING alias
# from version 3.34 onwards, while SUBSTR works on every version.
q = """
    SELECT
    *,
    CAST(SUBSTR(host_since,1,4) AS INT) AS year
    FROM df
    WHERE host_id IS NOT NULL;
    """
d = ps.sqldf(q, globals())
d.head(3)

Unnamed: 0,host_since,host_id,year
0,2014-01-01,100,2014
1,2014-01-20,101,2014
2,2014-02-08,102,2014


Count of hosts by year

In [12]:
# Count hosts per year.
# - SUBSTR instead of SUBSTRING (the latter needs SQLite >= 3.34).
# - The ORDER BY sits on the outer query: ordering applied inside a subquery
#   is not guaranteed to be preserved by the enclosing SELECT.
q = """
    SELECT
    year,
    host_id_count
    FROM
    (SELECT
    CAST(SUBSTR(host_since,1,4) AS INT) AS year,
    COUNT(host_id) AS host_id_count
    FROM df
    WHERE host_id IS NOT NULL
    GROUP BY SUBSTR(host_since,1,4)) AS f1
    ORDER BY year;
    """
d = ps.sqldf(q, globals())
d

Unnamed: 0,year,host_id_count
0,2014,20
1,2015,29
2,2016,49
3,2017,76
4,2018,106
5,2019,125


Use of lag/lead to get previous year or next year data

In [13]:
# Pair each year's host count with the previous year's count using LAG.
# - The aggregated subquery selects only the grouped year and the count; the
#   former `SELECT *` also pulled in non-aggregated columns (host_since, host_id)
#   whose per-group values are arbitrary and were never used.
# - LAG(host_count,1,0) falls back to 0 for the first year, which has no predecessor.
# - The final ORDER BY is on the outer query so the displayed order is explicit.
q = """
    SELECT
    year,
    host_count AS current_year_count,
    LAG(host_count,1,0) OVER (ORDER BY year) AS prev_year_count
    FROM
    (SELECT
    CAST(SUBSTR(host_since,1,4) AS INT) AS year,
    COUNT(host_id) AS host_count
    FROM df
    WHERE host_id IS NOT NULL
    GROUP BY SUBSTR(host_since,1,4)) AS f1
    ORDER BY year;
    """
d = ps.sqldf(q, globals())
d

Unnamed: 0,year,current_year_count,prev_year_count
0,2014,20,0
1,2015,29,20
2,2016,49,29
3,2017,76,49
4,2018,106,76
5,2019,125,106


Use 'Sub-Queries'

In [14]:
# Year-over-year growth rate (in percent) built from nested sub-queries.
# - NULLIF(prev_year_count,0) turns the first year's zero divisor into NULL, so
#   the missing growth rate is explicit instead of relying on SQLite's
#   division-by-zero behaviour.
# - The innermost query selects only the grouped year and the count (no `SELECT *`
#   with arbitrary non-aggregated columns) and uses SUBSTR for compatibility
#   with SQLite versions older than 3.34.
# - The final ORDER BY is on the outermost query.
q = """
    SELECT
    *,
    ROUND(100*(CAST(current_year_count AS REAL)-prev_year_count)/NULLIF(prev_year_count,0),2) AS growth_rate
    FROM (
    SELECT
    year,
    host_count AS current_year_count,
    LAG(host_count,1,0) OVER (ORDER BY year) AS prev_year_count
    FROM
    (SELECT
    CAST(SUBSTR(host_since,1,4) AS INT) AS year,
    COUNT(host_id) AS host_count
    FROM df
    WHERE host_id IS NOT NULL
    GROUP BY SUBSTR(host_since,1,4)) AS f1 ) AS f2
    ORDER BY year;
    """
d = ps.sqldf(q, globals())
d

Unnamed: 0,year,current_year_count,prev_year_count,growth_rate
0,2014,20,0,
1,2015,29,20,45.0
2,2016,49,29,68.97
3,2017,76,49,55.1
4,2018,106,76,39.47
5,2019,125,106,17.92


Use 'With' Clause

In [15]:
# Same growth-rate computation expressed as a chain of CTEs (WITH clause):
#   f1 - host count per year
#   f2 - current and previous year counts as floats
#   f3 - percentage change between the two
# - f1 selects only the grouped year and the count (no `SELECT *` with
#   arbitrary non-aggregated columns) and uses SUBSTR for older SQLite versions.
# - NULLIF(...,0) makes the first year's undefined growth an explicit NULL.
# - Ordering is applied once, on the final SELECT.
q = """
    WITH f1 AS
    (SELECT
    CAST(SUBSTR(host_since,1,4) AS INT) AS year,
    COUNT(host_id) AS host_count
    FROM df
    WHERE host_id IS NOT NULL
    GROUP BY SUBSTR(host_since,1,4)),
    f2 AS
    (SELECT
    year,
    CAST(host_count AS FLOAT) AS current_year_count,
    CAST(LAG(host_count,1,0) OVER (ORDER BY year) AS FLOAT) AS last_year_count
    FROM f1),
    f3 AS
    (SELECT
    f2.*,
    ROUND(100*(f2.current_year_count-f2.last_year_count)/NULLIF(f2.last_year_count,0), 2) AS frac_increase
    FROM f2)
    SELECT * FROM f3
    ORDER BY year;
    """
d = ps.sqldf(q, globals())
d.head(3)

Unnamed: 0,year,current_year_count,last_year_count,frac_increase
0,2014,20.0,0.0,
1,2015,29.0,20.0,45.0
2,2016,49.0,29.0,68.97


Practice

In [16]:
# Practice: yearly host counts, the previous year's count, and the percentage
# change between them, rounded to one decimal.
# - NULLIF(...,0) makes the first year's undefined change an explicit NULL
#   instead of relying on SQLite's division-by-zero behaviour.
# - Ordering is applied on the final SELECT rather than inside the f1 CTE,
#   where it is not guaranteed to carry through.
myquery = """
WITH f1 AS
(SELECT
CAST(SUBSTR(host_since,1,4) AS INT) AS year,
CAST(COUNT(host_id) AS FLOAT) AS host_id_count
FROM df
WHERE host_id IS NOT NULL
GROUP BY SUBSTR(host_since,1,4)),
f2 AS (
SELECT
*,
CAST(LAG(host_id_count,1,0) OVER (ORDER BY year) AS FLOAT) AS host_id_lagged
FROM f1)
SELECT
*,
ROUND(100*(f2.host_id_count - f2.host_id_lagged)/NULLIF(f2.host_id_lagged,0), 1) AS fraction
FROM f2
ORDER BY year
"""

ps.sqldf(myquery, globals())

Unnamed: 0,year,host_id_count,host_id_lagged,fraction
0,2014,20.0,0.0,
1,2015,29.0,20.0,45.0
2,2016,49.0,29.0,69.0
3,2017,76.0,49.0,55.1
4,2018,106.0,76.0,39.5
5,2019,125.0,106.0,17.9
