Import MyDataBase class.

In [1]:
from src.database import MyDataBase
# Database name that the next cell passes to MyDataBase.
db_name = "parch_and_posey"

Load database.

In [2]:
# Build the MyDataBase wrapper for `db_name`; every later cell runs its query
# through this `database` object. Then print the table names it reports.
database = MyDataBase(db_name)
database.print_tables_names()

ISOLATION_LEVEL_AUTOCOMMIT: 0
('web_events',)
('sales_reps',)
('region',)
('orders',)
('accounts',)


Print first rows of tables.

In [3]:
# Show a two-row sample of each table so the column layout is visible.
for table_name in ('web_events', 'sales_reps', 'region', 'orders', 'accounts'):
    query = "SELECT * FROM {} LIMIT 2;".format(table_name)
    table = database.to_data_frame(query)
    # A single print call emits the same text as three separate ones:
    # the table name, the sampled rows, then two blank lines.
    print(table_name, table, '\n', sep='\n')

web_events
   id  account_id         occurred_at channel
0   1        1001 2015-10-06 17:13:58  direct
1   2        1001 2015-11-05 03:08:26  direct


sales_reps
       id           name  region_id
0  321500  Samuel Racine          1
1  321510   Eugena Esser          1


region
   id       name
0   1  Northeast
1   2    Midwest


orders
   id  account_id         occurred_at  standard_qty  gloss_qty  poster_qty  \
0   1        1001 2015-10-06 17:31:14           123         22          24   
1   2        1001 2015-11-05 03:34:33           190         41          57   

   total  standard_amt_usd  gloss_amt_usd  poster_amt_usd  total_amt_usd  
0    169            613.77         164.78          194.88         973.43  
1    288            948.10         307.09          462.84        1718.03  


accounts
     id         name             website        lat       long   primary_poc  \
0  1001      Walmart     www.walmart.com  40.238496 -75.103297   Tamara Tuma   
1  1011  Exxon Mobil  www.exxo

1. Provide the name of the sales_rep in each region with the largest amount of total_amt_usd sales.

First, I wanted to find the total_amt_usd totals associated with each sales rep,
and I also wanted the region in which they were located. The query below provided this information.

In [4]:
# Summed total_amt_usd per sales rep together with the rep's region, reached
# through sales_reps -> accounts -> orders plus region. Rows are sorted by the
# summed amount, largest first, and only the top five are kept.
q1 = """
SELECT s.name rep_name, r.name region_name, SUM(o.total_amt_usd) total_amt
FROM sales_reps s
JOIN accounts a
ON a.sales_rep_id = s.id
JOIN orders o
ON o.account_id = a.id
JOIN region r
ON r.id = s.region_id
GROUP BY 1,2
ORDER BY 3 DESC
LIMIT 5;
"""

table = database.to_data_frame(q1)
table

Unnamed: 0,rep_name,region_name,total_amt
0,Earlie Schleusner,Southeast,1098137.72
1,Tia Amato,Northeast,1010690.6
2,Vernita Plump,Southeast,934212.93
3,Georgianna Chisholm,West,886244.12
4,Arica Stoltzfus,West,810353.34


Next, I pulled the max for each region, and then we can use this to pull those rows in our final result.

In [5]:
# Inner query (t1): summed total_amt_usd per (rep name, region name).
# Outer query: the largest of those per-rep sums within each region.
# The rep name is not carried through here; only region and maximum are returned.
q1 = """
SELECT region_name, MAX(total_amt) total_amt
     FROM(SELECT s.name rep_name, r.name region_name, SUM(o.total_amt_usd) total_amt
             FROM sales_reps s
             JOIN accounts a
             ON a.sales_rep_id = s.id
             JOIN orders o
             ON o.account_id = a.id
             JOIN region r
             ON r.id = s.region_id
             GROUP BY 1, 2) t1
     GROUP BY 1;
"""

table = database.to_data_frame(q1)
table

Unnamed: 0,region_name,total_amt
0,Midwest,675637.19
1,Southeast,1098137.72
2,Northeast,1010690.6
3,West,886244.12


Essentially, this is a JOIN of these two tables, where the region and amount match.

In [6]:
# For every region, return the sales rep(s) whose summed total_amt_usd equals
# that region's maximum.
#   t1: summed total_amt_usd per (rep, region).
#   t2: the largest t1 sum per region.
#   t3: the per-rep sums again, joined back to t2 on region AND amount so only
#       the top rep(s) of each region survive.
# The sort is applied on the outermost SELECT: an ORDER BY inside the t3
# derived table does not guarantee the order of the joined result.
q1 = """
SELECT t3.rep_name, t3.region_name, t3.total_amt
FROM(SELECT region_name, MAX(total_amt) total_amt
     FROM(SELECT s.name rep_name, r.name region_name, SUM(o.total_amt_usd) total_amt
             FROM sales_reps s
             JOIN accounts a
             ON a.sales_rep_id = s.id
             JOIN orders o
             ON o.account_id = a.id
             JOIN region r
             ON r.id = s.region_id
             GROUP BY 1, 2) t1
     GROUP BY 1) t2
JOIN (SELECT s.name rep_name, r.name region_name, SUM(o.total_amt_usd) total_amt
     FROM sales_reps s
     JOIN accounts a
     ON a.sales_rep_id = s.id
     JOIN orders o
     ON o.account_id = a.id
     JOIN region r
     ON r.id = s.region_id
     GROUP BY 1,2) t3
ON t3.region_name = t2.region_name AND t3.total_amt = t2.total_amt
ORDER BY t3.total_amt DESC;
"""

table = database.to_data_frame(q1)
table

Unnamed: 0,rep_name,region_name,total_amt
0,Earlie Schleusner,Southeast,1098137.72
1,Tia Amato,Northeast,1010690.6
2,Georgianna Chisholm,West,886244.12
3,Charles Bidwell,Midwest,675637.19


2. For the region with the largest (sum) of sales total_amt_usd, how many total (count) orders were placed?

The first query I wrote was to pull the total_amt_usd for each region.

In [7]:
# Summed total_amt_usd per region name, reached through
# sales_reps -> accounts -> orders plus region.
q2 = """
SELECT r.name region_name, SUM(o.total_amt_usd) total_amt
FROM sales_reps s
JOIN accounts a
ON a.sales_rep_id = s.id
JOIN orders o
ON o.account_id = a.id
JOIN region r
ON r.id = s.region_id
GROUP BY r.name;
"""

table = database.to_data_frame(q2)
table

Unnamed: 0,region_name,total_amt
0,Midwest,3013486.51
1,Southeast,6458497.0
2,Northeast,7744405.36
3,West,5925122.96


Then we just want the region with the max amount from this table.
There are two ways I considered getting this amount.
One was to pull the max using a subquery.
Another way is to order descending and just pull the top value.

In [8]:
# Wrap the per-region totals in a subquery (sub) and take the single largest
# total. Only the amount is returned, not the region it belongs to.
q2 = """
SELECT MAX(total_amt)
FROM (SELECT r.name region_name, SUM(o.total_amt_usd) total_amt
             FROM sales_reps s
             JOIN accounts a
             ON a.sales_rep_id = s.id
             JOIN orders o
             ON o.account_id = a.id
             JOIN region r
             ON r.id = s.region_id
             GROUP BY r.name) sub;
"""

table = database.to_data_frame(q2)
table

Unnamed: 0,max
0,7744405.36


Finally, we want to pull the total orders for the region with this amount:

In [9]:
# Number of orders placed in the region whose summed total_amt_usd is the
# largest of all regions.
#   HAVING subquery: per-region totals (sub) reduced to their maximum.
#   Outer query: per-region order count, kept only where the region's own
#   total equals that maximum.
# COUNT(*) counts every order row; COUNT(o.total) would silently skip any
# order whose `total` is NULL.
q2 = """
SELECT r.name, COUNT(*) total_orders
FROM sales_reps s
JOIN accounts a
ON a.sales_rep_id = s.id
JOIN orders o
ON o.account_id = a.id
JOIN region r
ON r.id = s.region_id
GROUP BY r.name
HAVING SUM(o.total_amt_usd) = (
      SELECT MAX(total_amt)
      FROM (SELECT r.name region_name, SUM(o.total_amt_usd) total_amt
              FROM sales_reps s
              JOIN accounts a
              ON a.sales_rep_id = s.id
              JOIN orders o
              ON o.account_id = a.id
              JOIN region r
              ON r.id = s.region_id
              GROUP BY r.name) sub);
"""

table = database.to_data_frame(q2)
table

Unnamed: 0,name,total_orders
0,Northeast,2357


3. How many accounts had more total purchases than the account that bought the most standard_qty paper throughout its lifetime as a customer?

First, we want to find the account that had the most standard_qty paper.
The query here pulls that account, as well as the total amount:

In [10]:
# Per account name: summed standard_qty and summed total. Sorting by the
# standard_qty sum (column 2) descending and keeping one row yields the
# account that bought the most standard paper, along with its overall total.
# NOTE(review): rows are grouped by a.name only; if two accounts could share a
# name they would be merged here - confirm names are unique.
q3 = """
SELECT a.name account_name, SUM(o.standard_qty) total_std, SUM(o.total) total
FROM accounts a
JOIN orders o
ON o.account_id = a.id
GROUP BY 1
ORDER BY 2 DESC
LIMIT 1;
"""

table = database.to_data_frame(q3)
table

Unnamed: 0,account_name,total_std,total
0,Core-Mark Holding,41617,44750


Now, I want to use this to pull all the accounts with more total sales:

In [11]:
# Account names whose summed `total` is greater than the summed `total` of the
# single account returned by the inner query (the account with the largest
# summed standard_qty).
q3 = """
SELECT a.name
FROM orders o
JOIN accounts a
ON a.id = o.account_id
GROUP BY 1
HAVING SUM(o.total) > (SELECT total
                   FROM (SELECT a.name act_name, SUM(o.standard_qty) tot_std, SUM(o.total) total
                         FROM accounts a
                         JOIN orders o
                         ON o.account_id = a.id
                         GROUP BY 1
                         ORDER BY 2 DESC
                         LIMIT 1) sub);
"""

table = database.to_data_frame(q3)
table

Unnamed: 0,name
0,Mosaic
1,EOG Resources
2,IBM


This is now a list of all the accounts with more total purchases (a larger summed total quantity) than that account.
We can get the count with just another simple subquery.

In [12]:
# Count the rows of the previous result: the list of qualifying account names
# becomes the subquery counter_tab and the outer query returns its row count.
q3 = """
SELECT COUNT(*)
FROM (SELECT a.name
       FROM orders o
       JOIN accounts a
       ON a.id = o.account_id
       GROUP BY 1
       HAVING SUM(o.total) > (SELECT total
                   FROM (SELECT a.name act_name, SUM(o.standard_qty) tot_std, SUM(o.total) total
                         FROM accounts a
                         JOIN orders o
                         ON o.account_id = a.id
                         GROUP BY 1
                         ORDER BY 2 DESC
                         LIMIT 1) inner_tab)
             ) counter_tab;
"""

table = database.to_data_frame(q3)
table

Unnamed: 0,count
0,3


4. For the customer that spent the most (in total over their lifetime as a customer) total_amt_usd,
how many web_events did they have for each channel?

Here, we first want to pull the customer with the most spent in lifetime value.

In [13]:
# Summed total_amt_usd per account (grouped by id and name); sorted by that sum
# descending and limited to one row, i.e. the highest-spending account.
q4 = """
SELECT a.id, a.name, SUM(o.total_amt_usd) tot_spent
FROM orders o
JOIN accounts a
ON a.id = o.account_id
GROUP BY a.id, a.name
ORDER BY 3 DESC
LIMIT 1;
"""

table = database.to_data_frame(q4)
table

Unnamed: 0,id,name,tot_spent
0,4211,EOG Resources,382873.3


Now, we want to look at the number of events on each channel this company had,
which we can match with just the id.

In [14]:
# web_events per channel for the highest-spending account.
# The account filter is part of the JOIN ... ON condition: a.id must match the
# event's account_id and also equal the id returned by the inner query
# (the account with the largest summed total_amt_usd).
# Results are sorted by the event count, largest first.
q4 = """
SELECT a.name, w.channel, COUNT(*)
FROM accounts a
JOIN web_events w
ON a.id = w.account_id AND a.id =  (SELECT id
                     FROM (SELECT a.id, a.name, SUM(o.total_amt_usd) tot_spent
                           FROM orders o
                           JOIN accounts a
                           ON a.id = o.account_id
                           GROUP BY a.id, a.name
                           ORDER BY 3 DESC
                           LIMIT 1) inner_table)
GROUP BY 1, 2
ORDER BY 3 DESC;
"""

table = database.to_data_frame(q4)
table

Unnamed: 0,name,channel,count
0,EOG Resources,direct,44
1,EOG Resources,organic,13
2,EOG Resources,adwords,12
3,EOG Resources,facebook,11
4,EOG Resources,twitter,5
5,EOG Resources,banner,4


I added an ORDER BY for no real reason, and the account name to assure I was only pulling from one account.

5. What is the lifetime average amount spent in terms of total_amt_usd for the top 10 total spending accounts?

First, we just want to find the top 10 accounts in terms of highest total_amt_usd.

In [15]:
# Summed total_amt_usd per account (id, name), sorted by the sum descending;
# only the ten highest-spending accounts are kept.
q5 = """
SELECT a.id, a.name, SUM(o.total_amt_usd) tot_spent
FROM orders o
JOIN accounts a
ON a.id = o.account_id
GROUP BY a.id, a.name
ORDER BY 3 DESC
LIMIT 10;
"""

table = database.to_data_frame(q5)
table

Unnamed: 0,id,name,tot_spent
0,4211,EOG Resources,382873.3
1,4151,Mosaic,345618.59
2,1301,IBM,326819.48
3,1871,General Dynamics,300694.79
4,4111,Republic Services,293861.14
5,3411,Leucadia National,291047.25
6,2181,Arrow Electronics,281018.36
7,1561,Sysco,278575.64
8,2591,Supervalu,275288.3
9,1401,Archer Daniels Midland,272672.84


Now, we just want the average of these 10 amounts.

In [16]:
# Average of tot_spent over the ten-row subquery (temp), i.e. the mean lifetime
# spend of the ten highest-spending accounts.
q5 = """
SELECT AVG(tot_spent)
FROM (SELECT a.id, a.name, SUM(o.total_amt_usd) tot_spent
      FROM orders o
      JOIN accounts a
      ON a.id = o.account_id
      GROUP BY a.id, a.name
      ORDER BY 3 DESC
       LIMIT 10) temp;
"""

table = database.to_data_frame(q5)
table

Unnamed: 0,avg
0,304846.969


6. What is the lifetime average amount spent in terms of total_amt_usd, including only the companies that spent more per order, on average, than the average of all orders?

First, we want to pull the average total_amt_usd across all orders:

In [17]:
# Average total_amt_usd over every row of orders (one value, aliased avg_all).
q6 = """
SELECT AVG(o.total_amt_usd) avg_all
FROM orders o
"""

table = database.to_data_frame(q6)
table

Unnamed: 0,avg_all
0,3348.019651


Then, we want to only pull the accounts with more than this average amount.

In [18]:
# Per account_id: average total_amt_usd of its orders, keeping only accounts
# whose average is above the average over all orders (the HAVING subquery).
q6 = """
SELECT o.account_id, AVG(o.total_amt_usd)
FROM orders o
GROUP BY 1
HAVING AVG(o.total_amt_usd) > (SELECT AVG(o.total_amt_usd) avg_all
                               FROM orders o);
"""

table = database.to_data_frame(q6)
table

Unnamed: 0,account_id,avg
0,2651,5106.079375
1,2941,5389.885000
2,1501,3993.271429
3,1351,4440.722000
4,1721,4230.887500
...,...,...
164,3811,7082.654444
165,2911,4863.653333
166,2921,3732.950625
167,2591,4048.357353


Finally, we just want the average of these values.

In [19]:
# Average of the per-account averages from the previous step. Each qualifying
# account contributes one value (avg_amt), regardless of how many orders it has.
q6 = """
SELECT AVG(avg_amt)
FROM (SELECT o.account_id, AVG(o.total_amt_usd) avg_amt
    FROM orders o
    GROUP BY 1
    HAVING AVG(o.total_amt_usd) > (SELECT AVG(o.total_amt_usd) avg_all
                                   FROM orders o)) temp_table;
"""

table = database.to_data_frame(q6)
table

Unnamed: 0,avg
0,4721.139744


7. What is the average number of events per day for each channel?

In [20]:
# Inner query (sub): number of web_events per (day, channel), with occurred_at
# truncated to the day.
# Outer query: average of those daily counts per channel, largest first.
# Only (day, channel) pairs that have at least one event produce a row in sub,
# so days on which a channel had no events do not enter its average.
q = """
SELECT channel, AVG(events) AS average_events
FROM (SELECT DATE_TRUNC('day',occurred_at) AS day,
             channel, COUNT(*) as events
      FROM web_events
      GROUP BY 1,2) sub
GROUP BY channel
ORDER BY 2 DESC;
"""

table = database.to_data_frame(q)
table

Unnamed: 0,channel,average_events
0,direct,4.896488
1,organic,1.66725
2,facebook,1.598347
3,adwords,1.570191
4,twitter,1.316667
5,banner,1.289973


Let's try this again using a WITH statement.
Notice, you can pull the inner query:

In [21]:
# The inner query from the previous cell on its own: one row per
# (day, channel) with the number of web_events on that day for that channel.
q = """
SELECT DATE_TRUNC('day',occurred_at) AS day,
       channel, COUNT(*) as events
FROM web_events
GROUP BY 1,2
"""

table = database.to_data_frame(q)
table

Unnamed: 0,day,channel,events
0,2015-11-04,banner,1
1,2013-12-11,organic,1
2,2016-01-06,direct,8
3,2016-05-07,facebook,5
4,2016-10-28,direct,16
...,...,...,...
3559,2016-03-24,twitter,1
3560,2015-05-27,direct,2
3561,2015-10-25,facebook,2
3562,2014-01-12,direct,3


This is the part we put in the WITH statement.
Notice, we are aliasing the table as events below:

Now, we can use this newly created events table as if it is any other table in our database:

In [22]:
# Same computation as the subquery version, with the daily counts moved into a
# WITH clause. The CTE is named `events` and its count column is also named
# `events`: FROM events refers to the CTE, AVG(events) to the column.
q = """
WITH events AS (
          SELECT DATE_TRUNC('day',occurred_at) AS day,
                        channel, COUNT(*) as events
          FROM web_events
          GROUP BY 1,2)

SELECT channel, AVG(events) AS average_events
FROM events
GROUP BY channel
ORDER BY 2 DESC;
"""

table = database.to_data_frame(q)
table

Unnamed: 0,channel,average_events
0,direct,4.896488
1,organic,1.66725
2,facebook,1.598347
3,adwords,1.570191
4,twitter,1.316667
5,banner,1.289973


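For the above example, we don't need any more than the one additional table,
but imagine we needed to create a second table to pull from.
We can create an additional table by separating the definitions with a comma, e.g. WITH table1 AS (...), table2 AS (...) SELECT ..., as several of the restructured solutions do: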

Below, you will see each of the previous solutions restructured using the WITH clause.
This is often an easier way to read a query.

1. Provide the name of the sales_rep in each region with the largest amount of total_amt_usd sales.

In [23]:
# WITH version of question 1: the top sales rep(s) per region by summed
# total_amt_usd.
#   t1: summed total_amt_usd per (rep, region).
#   t2: the largest t1 sum per region.
#   Final SELECT: t1 rows whose region and amount match a t2 row.
# The sort is applied on the final SELECT: an ORDER BY inside the t1 CTE does
# not guarantee the order of the rows produced by the join.
q = """
WITH t1 AS (
  SELECT s.name rep_name, r.name region_name, SUM(o.total_amt_usd) total_amt
   FROM sales_reps s
   JOIN accounts a
   ON a.sales_rep_id = s.id
   JOIN orders o
   ON o.account_id = a.id
   JOIN region r
   ON r.id = s.region_id
   GROUP BY 1,2),
t2 AS (
   SELECT region_name, MAX(total_amt) total_amt
   FROM t1
   GROUP BY 1)
SELECT t1.rep_name, t1.region_name, t1.total_amt
FROM t1
JOIN t2
ON t1.region_name = t2.region_name AND t1.total_amt = t2.total_amt
ORDER BY t1.total_amt DESC;
"""

table = database.to_data_frame(q)
table

Unnamed: 0,rep_name,region_name,total_amt
0,Earlie Schleusner,Southeast,1098137.72
1,Tia Amato,Northeast,1010690.6
2,Georgianna Chisholm,West,886244.12
3,Charles Bidwell,Midwest,675637.19


2. For the region with the largest sales total_amt_usd, how many total orders were placed?

In [24]:
# WITH version of question 2: number of orders in the region with the largest
# summed total_amt_usd.
#   t1: summed total_amt_usd per region.
#   t2: the single largest t1 total, exposed as max_total_amt.
#   Final SELECT: per-region order count, kept only for the region whose own
#   total equals max_total_amt.
# COUNT(*) counts every order row; COUNT(o.total) would silently skip any
# order whose `total` is NULL. The t2 column is named and selected explicitly
# instead of relying on SELECT * returning exactly one column.
q = """
WITH t1 AS (
   SELECT r.name region_name, SUM(o.total_amt_usd) total_amt
   FROM sales_reps s
   JOIN accounts a
   ON a.sales_rep_id = s.id
   JOIN orders o
   ON o.account_id = a.id
   JOIN region r
   ON r.id = s.region_id
   GROUP BY r.name),
t2 AS (
   SELECT MAX(total_amt) max_total_amt
   FROM t1)
SELECT r.name, COUNT(*) total_orders
FROM sales_reps s
JOIN accounts a
ON a.sales_rep_id = s.id
JOIN orders o
ON o.account_id = a.id
JOIN region r
ON r.id = s.region_id
GROUP BY r.name
HAVING SUM(o.total_amt_usd) = (SELECT max_total_amt FROM t2);
"""

table = database.to_data_frame(q)
table

Unnamed: 0,name,total_orders
0,Northeast,2357


3. For the account that purchased the most (in total over their lifetime as a customer) standard_qty paper,
how many accounts still had more in total purchases?

In [25]:
# WITH version of question 3.
#   t1: the one account (by name) with the largest summed standard_qty, plus
#       its summed total.
#   t2: account names whose summed total is greater than t1's total.
#   Final SELECT: how many such accounts there are.
q = """
WITH t1 AS (
  SELECT a.name account_name, SUM(o.standard_qty) total_std, SUM(o.total) total
  FROM accounts a
  JOIN orders o
  ON o.account_id = a.id
  GROUP BY 1
  ORDER BY 2 DESC
  LIMIT 1),
t2 AS (
  SELECT a.name
  FROM orders o
  JOIN accounts a
  ON a.id = o.account_id
  GROUP BY 1
  HAVING SUM(o.total) > (SELECT total FROM t1))
SELECT COUNT(*)
FROM t2;
"""

table = database.to_data_frame(q)
table

Unnamed: 0,count
0,3


4. For the customer that spent the most (in total over their lifetime as a customer) total_amt_usd,
how many web_events did they have for each channel?

In [26]:
# WITH version of question 4.
#   t1: the single account with the largest summed total_amt_usd.
#   Final SELECT: web_events per channel for that account; the account filter
#   (a.id = t1's id) is part of the JOIN ... ON condition. Sorted by count,
#   largest first.
q = """
WITH t1 AS (
   SELECT a.id, a.name, SUM(o.total_amt_usd) tot_spent
   FROM orders o
   JOIN accounts a
   ON a.id = o.account_id
   GROUP BY a.id, a.name
   ORDER BY 3 DESC
   LIMIT 1)
SELECT a.name, w.channel, COUNT(*)
FROM accounts a
JOIN web_events w
ON a.id = w.account_id AND a.id =  (SELECT id FROM t1)
GROUP BY 1, 2
ORDER BY 3 DESC;
"""

table = database.to_data_frame(q)
table

Unnamed: 0,name,channel,count
0,EOG Resources,direct,44
1,EOG Resources,organic,13
2,EOG Resources,adwords,12
3,EOG Resources,facebook,11
4,EOG Resources,twitter,5
5,EOG Resources,banner,4


5. What is the lifetime average amount spent in terms of total_amt_usd for the top 10 total spending accounts?

In [27]:
# WITH version of question 5.
#   t1: the ten accounts with the largest summed total_amt_usd.
#   Final SELECT: the average of those ten sums.
q = """
WITH t1 AS (
   SELECT a.id, a.name, SUM(o.total_amt_usd) tot_spent
   FROM orders o
   JOIN accounts a
   ON a.id = o.account_id
   GROUP BY a.id, a.name
   ORDER BY 3 DESC
   LIMIT 10)
SELECT AVG(tot_spent)
FROM t1;
"""

table = database.to_data_frame(q)
table

Unnamed: 0,avg
0,304846.969


6. What is the lifetime average amount spent in terms of total_amt_usd,
including only the companies that spent more per order, on average, than the average of all orders.

In [28]:
# WITH version of question 6.
#   t1: average total_amt_usd over every row of orders (avg_all).
#   t2: per-account average total_amt_usd, kept only where it exceeds avg_all.
#   Final SELECT: the average of those per-account averages.
# t1 reads orders alone, exactly like the subquery version of this question:
# no column of accounts is used, and an inner join to it would drop any order
# without a matching account and so could shift the overall average.
q = """
WITH t1 AS (
   SELECT AVG(o.total_amt_usd) avg_all
   FROM orders o),
t2 AS (
   SELECT o.account_id, AVG(o.total_amt_usd) avg_amt
   FROM orders o
   GROUP BY 1
   HAVING AVG(o.total_amt_usd) > (SELECT avg_all FROM t1))
SELECT AVG(avg_amt)
FROM t2;
"""

table = database.to_data_frame(q)
table

Unnamed: 0,avg
0,4721.139744
