Import MyDataBase class.

In [1]:
from src.database import MyDataBase
db_name = "parch_and_posey"

Load database.

In [2]:
database = MyDataBase(db_name)
database.print_tables_names()

ISOLATION_LEVEL_AUTOCOMMIT: 0
('web_events',)
('sales_reps',)
('region',)
('orders',)
('accounts',)


* See each account who has a sales rep and each sales rep that has an account
(all of the columns in these returned rows will be full)

* but also each account that does not have a sales rep and each sales rep that does not have an account
(some of the columns in these returned rows will be empty)

In [3]:
q = """
SELECT accounts.id,
	   accounts.sales_rep_id AS a_id,
       sales_reps.id AS s_id
FROM accounts
FULL OUTER JOIN sales_reps ON accounts.sales_rep_id = sales_reps.id;
"""

table = database.to_data_frame(q)
table

Unnamed: 0,id,a_id,s_id
0,1691,321500,321500
1,1661,321500,321500
2,1631,321500,321500
3,1421,321500,321500
4,1211,321500,321500
...,...,...,...
346,3991,321990,321990
347,3891,321990,321990
348,3791,321990,321990
349,3691,321990,321990


### Finding Matched and Unmatched Rows with FULL OUTER JOIN

In [4]:
q = """
SELECT *
  FROM accounts
 FULL JOIN sales_reps ON accounts.sales_rep_id = sales_reps.id
 WHERE accounts.sales_rep_id IS NULL OR sales_reps.id IS NULL
"""

table = database.to_data_frame(q)
table

Unnamed: 0,id,name,website,lat,long,primary_poc,sales_rep_id,id.1,name.1,region_id


### Inequality JOINs

In [5]:
q = """
SELECT orders.id order_id,
       orders.occurred_at AS order_date,
       events.*
    FROM orders
    LEFT JOIN web_events events
    ON events.account_id = orders.account_id
    AND events.occurred_at < orders.occurred_at
    WHERE DATE_TRUNC('month', orders.occurred_at) =
        (SELECT DATE_TRUNC('month', MIN(orders.occurred_at)) FROM orders)
    ORDER BY orders.account_id, orders.occurred_at;
"""

table = database.to_data_frame(q)
table

Unnamed: 0,order_id,order_date,id,account_id,occurred_at,channel
0,147,2013-12-17 23:02:57,153,1181,2013-12-17 22:46:54,direct
1,147,2013-12-17 23:02:57,4563,1181,2013-12-17 04:45:29,adwords
2,243,2013-12-11 20:36:06,4677,1251,2013-12-11 02:43:32,organic
3,243,2013-12-11 20:36:06,250,1251,2013-12-11 20:17:01,direct
4,4457,2013-12-11 20:42:09,250,1251,2013-12-11 20:17:01,direct
...,...,...,...,...,...,...
161,4150,2013-12-10 16:25:51,8871,4341,2013-12-10 09:26:27,organic
162,4150,2013-12-10 16:25:51,4235,4341,2013-12-10 16:01:36,direct
163,4226,2013-12-09 22:19:26,4312,4451,2013-12-09 21:59:11,direct
164,4226,2013-12-09 22:19:26,8966,4451,2013-12-09 03:23:36,banner


Write a query that left joins the `accounts` table and the `sales_reps` tables on each sale rep's ID number
and joins it using the `<` comparison operator on `accounts.primary_poc` and `sales_reps.name`, like so:

In [6]:
q = """
SELECT accounts.name as account_name,
       accounts.primary_poc as poc_name,
       sales_reps.name as sales_rep_name
  FROM accounts
  LEFT JOIN sales_reps
    ON accounts.sales_rep_id = sales_reps.id
   AND accounts.primary_poc < sales_reps.name
"""

table = database.to_data_frame(q)
table

Unnamed: 0,account_name,poc_name,sales_rep_name
0,Johnson Controls,Cammy Sosnowski,Samuel Racine
1,Ingram Micro,Chanelle Keach,Samuel Racine
2,Freddie Mac,Elayne Grunewald,Samuel Racine
3,Express Scripts Holding,Jewell Likes,Samuel Racine
4,Delta Air Lines,Enola Thoms,Eugena Esser
...,...,...,...
346,PBF Energy,Lavonda Hoyle,
347,Nordstrom,Yan Crater,
348,DISH Network,Leana Hawker,
349,NGL Energy Partners,Staci Alegria,


### Self JOINs

In [7]:
q = """
SELECT o1.id AS o1_id,
       o1.account_id AS o1_account_id,
       o1.occurred_at AS o1_occurred_at,
       o2.id AS o2_id,
       o2.account_id AS o2_account_id,
       o2.occurred_at AS o2_occurred_at
  FROM orders o1
 LEFT JOIN orders o2
   ON o1.account_id = o2.account_id
  AND o2.occurred_at > o1.occurred_at
  AND o2.occurred_at <= o1.occurred_at + INTERVAL '28 days'
ORDER BY o1.account_id, o1.occurred_at
"""

table = database.to_data_frame(q)
table

Unnamed: 0,o1_id,o1_account_id,o1_occurred_at,o2_id,o2_account_id,o2_occurred_at
0,1,1001,2015-10-06 17:31:14,,,NaT
1,4307,1001,2015-11-05 03:25:21,2.0,1001.0,2015-11-05 03:34:33
2,2,1001,2015-11-05 03:34:33,,,NaT
3,4308,1001,2015-12-04 04:01:09,3.0,1001.0,2015-12-04 04:21:55
4,3,1001,2015-12-04 04:21:55,,,NaT
...,...,...,...,...,...,...
6907,4304,4501,2016-10-24 08:50:37,,,NaT
6908,6911,4501,2016-11-22 06:52:22,4305.0,4501.0,2016-11-22 06:57:04
6909,4305,4501,2016-11-22 06:57:04,,,NaT
6910,6912,4501,2016-12-21 13:30:42,4306.0,4501.0,2016-12-21 13:43:26


* change the interval to 1 day to find those web events that occurred after, but not more than 1 day after,
another web event

* add a column for the channel variable in both instances of the table in your query

In [8]:
q = """
SELECT we1.id AS we_id,
       we1.account_id AS we1_account_id,
       we1.occurred_at AS we1_occurred_at,
       we1.channel AS we1_channel,
       we2.id AS we2_id,
       we2.account_id AS we2_account_id,
       we2.occurred_at AS we2_occurred_at,
       we2.channel AS we2_channel
  FROM web_events we1
 LEFT JOIN web_events we2
   ON we1.account_id = we2.account_id
  AND we1.occurred_at > we2.occurred_at
  AND we1.occurred_at <= we2.occurred_at + INTERVAL '1 day'
ORDER BY we1.account_id, we2.occurred_at
"""

table = database.to_data_frame(q)
table


Unnamed: 0,we_id,we1_account_id,we1_occurred_at,we1_channel,we2_id,we2_account_id,we2_occurred_at,we2_channel
0,1,1001,2015-10-06 17:13:58,direct,4394.0,1001.0,2015-10-06 04:22:11,facebook
1,4396,1001,2015-10-22 14:04:20,adwords,4395.0,1001.0,2015-10-22 05:02:47,organic
2,4397,1001,2015-11-05 17:18:54,direct,2.0,1001.0,2015-11-05 03:08:26,direct
3,4,1001,2016-01-02 00:55:03,direct,4399.0,1001.0,2016-01-01 15:45:54,adwords
4,8,1001,2016-05-01 15:26:44,direct,4406.0,1001.0,2016-05-01 14:26:40,direct
...,...,...,...,...,...,...,...,...
9143,4393,4501,2016-12-21 13:14:30,direct,,,NaT,
9144,9073,4501,2016-05-30 00:46:53,organic,,,NaT,
9145,4392,4501,2016-11-22 06:50:40,direct,,,NaT,
9146,4390,4501,2016-09-25 01:29:32,direct,,,NaT,


### Appending Data via ***UNION***

* ***UNION*** removes duplicate rows.
* ***UNION ALL*** does not remove duplicate rows.

* Both tables must have the same number of columns.
* Those columns must have the same data types in the same order as the first table.

A common misconception is that column names have to be the same.
Column names, in fact, ***don't*** need to be the same to append two tables
but you will find that they typically are.


Write a query that uses ***UNION ALL*** on two instances (and selecting all columns) of the `accounts` table.

In [9]:
q = """
SELECT *
FROM accounts
WHERE name = 'Walmart'

UNION ALL

SELECT *
FROM accounts
WHERE name = 'Disney'
"""

table = database.to_data_frame(q)
table


Unnamed: 0,id,name,website,lat,long,primary_poc,sales_rep_id
0,1001,Walmart,www.walmart.com,40.238496,-75.103297,Tamara Tuma,321500
1,1521,Disney,www.disney.com,41.8788,-74.811026,Timika Mistretta,321600


Or alternatively:

In [10]:
q = """
SELECT *
FROM accounts
WHERE name = 'Walmart' OR name = 'Disney'
"""

table = database.to_data_frame(q)
table

Unnamed: 0,id,name,website,lat,long,primary_poc,sales_rep_id
0,1001,Walmart,www.walmart.com,40.238496,-75.103297,Tamara Tuma,321500
1,1521,Disney,www.disney.com,41.8788,-74.811026,Timika Mistretta,321600


Perform the union in your first query (under the Appending Data via UNION header)
in a common table expression and name it `double_accounts`.
Then do a ***COUNT*** the number of times a name appears in the `double_accounts` table.
If you do this correctly, your query results should have a count of 2 for each name.

In [11]:
q = """
WITH double_accounts AS (
    SELECT *
      FROM accounts

    UNION ALL

    SELECT *
      FROM accounts
)

SELECT name,
       COUNT(*) AS name_count
 FROM double_accounts
GROUP BY 1
ORDER BY 2 DESC
"""

table = database.to_data_frame(q)
table

Unnamed: 0,name,name_count
0,Boeing,2
1,Western Digital,2
2,Sysco,2
3,Southern,2
4,Altria Group,2
...,...,...
346,AIG,2
347,Norfolk Southern,2
348,KKR,2
349,SpartanNash,2


### Performance Tuning

One way to make a query run faster is to reduce the number of calculations that need to be performed.
Some of the high-level things that will affect the number of calculations a given query will make include:

* Table size
* Joins
* Aggregations

Query runtime is also dependent on some things that you can’t really control related to the database itself:

* Other users running queries concurrently on the database
* Database software and optimization (e.g., Postgres is optimized differently than Redshift)

In [12]:
q = """
EXPLAIN
SELECT *
    FROM web_events
    WHERE occurred_at >= '2015-01-01'
    AND occurred_at < '2016-02-01'
"""

table = database.to_data_frame(q)
table

Unnamed: 0,QUERY PLAN
0,Seq Scan on web_events (cost=0.00..195.09 row...
1,Filter: ((occurred_at >= '2015-01-01 00:00:0...


In [13]:
q = """
SELECT DATE_TRUNC('day', o.occurred_at) AS day,
       COUNT(DISTINCT a.sales_rep_id) AS active_sales_reps,
       COUNT(DISTINCT o.id) AS orders,
       COUNT(DISTINCT we.id) AS web_visits
    FROM accounts a
    JOIN orders o
    ON o.account_id = a.id
    JOIN web_events we
    ON DATE_TRUNC('day', we.occurred_at) = DATE_TRUNC('day', o.occurred_at)
GROUP BY 1
ORDER BY 1 DESC
"""

table = database.to_data_frame(q)
table

Unnamed: 0,day,active_sales_reps,orders,web_visits
0,2017-01-01,13,24,31
1,2016-12-31,15,26,27
2,2016-12-30,7,11,18
3,2016-12-29,7,11,19
4,2016-12-28,12,22,31
...,...,...,...,...
1054,2013-12-09,2,3,5
1055,2013-12-08,5,8,10
1056,2013-12-06,4,7,9
1057,2013-12-05,1,2,2


In [14]:
q = """
SELECT COALESCE(orders.date, web_events.date) AS date,
       orders.active_sales_reps,
       orders.orders,
       web_events.web_visits
    FROM (
        SELECT DATE_TRUNC('day', o.occurred_at) AS date,
               COUNT(a.sales_rep_id) AS active_sales_reps,
               COUNT(o.id) AS orders
            FROM accounts a
            JOIN orders o
            ON o.account_id = a.id
        GROUP BY 1
        ) orders

FULL JOIN

(
SELECT DATE_TRUNC('day', we.occurred_at) AS date,
    COUNT(we.id) AS web_visits
    FROM web_events we
    GROUP BY 1
) web_events

    ON web_events.date = orders.date
    ORDER BY 1 DESC
"""

table = database.to_data_frame(q)
table

Unnamed: 0,date,active_sales_reps,orders,web_visits
0,2017-01-02,1.0,1.0,
1,2017-01-01,24.0,24.0,31.0
2,2016-12-31,26.0,26.0,27.0
3,2016-12-30,11.0,11.0,18.0
4,2016-12-29,11.0,11.0,19.0
...,...,...,...,...
1115,2013-12-09,3.0,3.0,5.0
1116,2013-12-08,8.0,8.0,10.0
1117,2013-12-06,7.0,7.0,9.0
1118,2013-12-05,2.0,2.0,2.0


In [15]:
database.close()