In [1]:
import itertools as it
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

# plotting options
%matplotlib inline
np.set_printoptions(linewidth=250)
plt.rc('font'  , size=18)
plt.rc('figure', figsize=(10, 8))
plt.rc('axes'  , labelsize=22)
plt.rc('legend', fontsize=16)

np.set_printoptions(precision=3)
plt.rc('figure', figsize=(10, 8))

In [2]:
# Work from the drills directory of the checkout that the DST environment
# variable points at; `pwd` is reused later to build the data file paths.
dst_root = os.getenv('DST')
if dst_root is None:
    # Without this check the path would be formatted as 'None/examples/drills'
    # and chdir would fail with a misleading "no such directory" error.
    raise RuntimeError('The DST environment variable is not set')
os.chdir(os.path.join(dst_root, 'examples', 'drills'))
pwd = os.getcwd()
print(pwd)

/home/cloudera/Development/dst/examples/drills


In [3]:
# Location of the Spark installation; None when SPARK_HOME is not set.
# (os is already imported in the first cell of the notebook.)
spark_home = os.environ.get('SPARK_HOME')
spark_home

'/usr/lib/spark'

In [4]:
# Spark entry points for the notebook: one SparkContext configured below and a
# HiveContext on top of it, which every SQL query in the notebook goes through.
from pyspark import SparkContext, SparkConf, SQLContext, HiveContext

myConf = (
    SparkConf()
    .setAppName('TestApp')
    .set('spark.executor.memory', '2G')
    # NOTE(review): presumably disabled so that saves may target an existing
    # output directory — confirm against the Spark configuration docs.
    .set('spark.hadoop.validateOutputSpecs', 'false')
)

sc      = SparkContext(conf=myConf)
sql_ctx = HiveContext(sc)

## Load Tables

In [20]:
# Read customers.txt from the working directory through the spark-csv data
# source (first line is the header, column types are inferred) and register it
# as the `customers` temp table for the SQL cells below.
# NOTE(review): sql_ctx.load is the older loading API; newer Spark releases
# use sql_ctx.read.format(...).load(...) — confirm before upgrading Spark.
customers_df = sql_ctx.load(
    source      = 'com.databricks.spark.csv',
    header      = 'true',
    inferSchema = 'true',
    path        = 'file:%s/customers.txt' % pwd
)
customers_df.registerTempTable('customers')

# Preview: take the five rows inside Spark so that only those rows are
# converted to pandas, instead of converting the whole table first.
customers_df.limit(5).toPandas()

Unnamed: 0,customerid,firstname,lastname,city,state
0,10101,John,Gray,Lynden,Washington
1,10298,Leroy,Brown,Pinetop,Arizona
2,10299,Elroy,Keller,Snoqualmie,Washington
3,10315,Lisa,Jones,Oshkosh,Wisconsin
4,10325,Ginger,Schultz,Pocatello,Idaho


In [21]:
# Check that the `customers` temp table can be queried through SQL.
# The query's own `limit 3` caps the result, so show(5) prints three rows.
sql_ctx.sql('select * from customers limit 3').show(5)

customerid firstname lastname city       state     
10101      John      Gray     Lynden     Washington
10298      Leroy     Brown    Pinetop    Arizona   
10299      Elroy     Keller   Snoqualmie Washington


In [22]:
# Read items_ordered.txt from the working directory through the spark-csv data
# source (first line is the header, column types are inferred) and register it
# as the `items` temp table; the exercises call this table "items_ordered".
# NOTE(review): sql_ctx.load is the older loading API; newer Spark releases
# use sql_ctx.read.format(...).load(...) — confirm before upgrading Spark.
items_df = sql_ctx.load(
    source      = 'com.databricks.spark.csv',
    header      = 'true',
    inferSchema = 'true',
    path        = 'file:%s/items_ordered.txt' % pwd
)
items_df.registerTempTable('items')

# Preview: take the five rows inside Spark so that only those rows are
# converted to pandas, instead of converting the whole table first.
items_df.limit(5).toPandas()

Unnamed: 0,customerid,order_date,item,quantity,price
0,10330,30-Jun-1999,Pogo stick,1,28
1,10101,30-Jun-1999,Raft,1,58
2,10298,01-Jul-1999,Skateboard,1,33
3,10101,01-Jul-1999,Life Vest,4,125
4,10299,06-Jul-1999,Parachute,1,1250


In [23]:
# Check that the `items` temp table can be queried through SQL.
# The query's own `limit 3` caps the result, so show(5) prints three rows.
sql_ctx.sql('select * from items limit 3').show(5)

customerid order_date  item       quantity price
10330      30-Jun-1999 Pogo stick 1        28.0 
10101      30-Jun-1999 Raft       1        58.0 
10298      01-Jul-1999 Skateboard 1        33.0 


In [24]:
def run_sql(sql, limit=None):
    """Run a SQL query through ``sql_ctx`` and return the result as pandas.

    Parameters
    ----------
    sql : str
        Query text, executed against the tables registered on ``sql_ctx``.
    limit : int, optional
        When given, at most this many rows are kept (the cap is applied on the
        Spark side, before the conversion to pandas). The default ``None``
        returns every row, which is the original behaviour.

    Returns
    -------
    The object produced by ``toPandas()`` on the query result.
    """
    result = sql_ctx.sql(sql)
    if limit is not None:
        result = result.limit(limit)
    return result.toPandas()

In [36]:
# Try the helper on the whole `items` table (the query has no limit).
run_sql('select * from items')

Unnamed: 0,customerid,order_date,item,quantity,price
0,10330,30-Jun-1999,Pogo stick,1,28.0
1,10101,30-Jun-1999,Raft,1,58.0
2,10298,01-Jul-1999,Skateboard,1,33.0
3,10101,01-Jul-1999,Life Vest,4,125.0
4,10299,06-Jul-1999,Parachute,1,1250.0
5,10339,27-Jul-1999,Umbrella,1,4.5
6,10449,13-Aug-1999,Unicycle,1,180.79
7,10439,14-Aug-1999,Ski Poles,2,25.5
8,10101,18-Aug-1999,Rain Coat,1,18.3
9,10449,01-Sep-1999,Snow Shoes,1,45.0


## select statements

1. From the items_ordered table, select a list of all items purchased for customerid 10449. Display the customerid, item, and price for this customer.
1. Select all columns from the items_ordered table for whoever purchased a Tent.
1. Select the customerid, order_date, and item values from the items_ordered table for any items in the item column that start with the letter "S".
1. Select the distinct items in the items_ordered table. In other words, display a listing of each of the unique items from the items_ordered table.
1. Make up your own select statements and submit them.

In [33]:
# question 1: customerid, item and price of every order placed by customer 10449
# (`items` is the temp table registered from items_ordered.txt)
run_sql("""\
select
    customerid
  , item
  , price
from items
where customerid=10449
""")

Unnamed: 0,customerid,item,price
0,10449,Unicycle,180.79
1,10449,Snow Shoes,45.0
2,10449,Bicycle,380.5
3,10449,Canoe,280.0
4,10449,Flashlight,4.5
5,10449,Canoe paddle,40.0


In [34]:
# question 2: all columns of the orders whose item is exactly 'Tent'

run_sql("""\
select *
from items
where item='Tent'
""")

Unnamed: 0,customerid,order_date,item,quantity,price
0,10439,18-Sep-1999,Tent,1,88.0
1,10438,18-Jan-2000,Tent,1,79.99


In [38]:
# question 3: customerid, order_date and item of the orders whose item name
# starts with "S" (the pattern 'S%' matches on the leading letter)

run_sql("""\
select 
    customerid
  , order_date
  , item
from items
where item like 'S%'
""")

Unnamed: 0,customerid,order_date,item
0,10298,01-Jul-1999,Skateboard
1,10439,14-Aug-1999,Ski Poles
2,10449,01-Sep-1999,Snow Shoes
3,10410,28-Oct-1999,Sleeping Bag
4,10101,08-Mar-2000,Sleeping Bag
5,10330,19-Apr-2000,Shovel


In [45]:
# question 5 (own query): the 'S%' orders again, joined to `customers` so the
# result shows the buyer's first and last name instead of the customerid.
# NOTE(review): this cell used to be labelled "question 4", but question 4 asks
# for the distinct items and none of the visible cells in this section runs
# that query.

run_sql("""\
select 
    c.firstname
  , c.lastname
  , i.order_date
  , i.item
from items i
  inner join customers c on c.customerid=i.customerid
where i.item like 'S%'
""")

Unnamed: 0,firstname,lastname,order_date,item
0,Conrad,Giles,14-Aug-1999,Ski Poles
1,Isabela,Moore,01-Sep-1999,Snow Shoes
2,Leroy,Brown,01-Jul-1999,Skateboard
3,John,Gray,08-Mar-2000,Sleeping Bag
4,Shawn,Dalton,19-Apr-2000,Shovel
5,Mary Ann,Howell,28-Oct-1999,Sleeping Bag


## Aggregate Statements

* MIN	returns the smallest value in a given column
* MAX	returns the largest value in a given column
* SUM	returns the sum of the numeric values in a given column
* AVG	returns the average value of a given column
* COUNT	returns the total number of values in a given column
* COUNT(*)	returns the number of rows in a table

### questions:

1. Select the maximum price of any item ordered in the items_ordered table. Hint: Select the maximum price only.
1. Select the average price of all of the items ordered that were purchased in the month of Dec.
1. What is the total number of rows in the items_ordered table?
1. For all of the tents that were ordered in the items_ordered table, what is the price of the lowest-priced tent? Hint: Your query should return the price only.

In [49]:
# question 1: the single highest price in the `items` table

run_sql("""\
select max(price) as max_price
from items
""")

Unnamed: 0,max_price
0,1250


In [50]:
# question 2: average price of the orders placed in December.
# order_date values look like '15-Dec-1999', so the month is matched as a
# substring of the date text.

run_sql("""\
select avg(price) as avg_price
from items
where order_date like '%Dec%'
""")

Unnamed: 0,avg_price
0,174.3125


In [51]:
# question 3: number of rows in the `items` table

run_sql("""\
select count(*) as row_count
from items
""")

Unnamed: 0,row_count
0,32


In [52]:
# question 4: lowest price among the orders whose item is 'Tent'

run_sql("""\
select min(price) as cheapest_tent
from items
where item = 'Tent'
""")

Unnamed: 0,cheapest_tent
0,79.99


## Group By clause

1. How many people are in each unique state in the customers table? Select the state and display the number of people in each. Hint: count is used to count rows in a column, sum works on numeric data only.
1. From the items_ordered table, select the item, maximum price, and minimum price for each specific item in the table. Hint: The items will need to be broken up into separate groups.
1. How many orders did each customer make? Use the items_ordered table. Select the customerid, number of orders they made, and the sum of their orders. Click the Group By answers link below if you have any problems.

In [60]:
# question 1: number of customers in each state, largest count first.
# GROUP BY already produces exactly one row per state, so the select list does
# not need DISTINCT.

run_sql("""\
select
    state
  , count(1) as state_count
from customers
group by state
order by state_count desc
""")

Unnamed: 0,state,state_count
0,Arizona,6
1,Oregon,2
2,Washington,2
3,Colorado,2
4,South Carolina,1
5,Idaho,1
6,North Carolina,1
7,Wisconsin,1
8,Hawaii,1


In [56]:
# question 2: highest and lowest price recorded for each item

run_sql("""\
select
    item
  , max(price) as max_price
  , min(price) as min_price
from items
group by item
""")

Unnamed: 0,item,max_price,min_price
0,Tent,88.0,79.99
1,Parachute,1250.0,1250.0
2,Lantern,29.0,16.0
3,Pogo stick,28.0,28.0
4,Umbrella,6.75,4.5
5,Shovel,16.75,16.75
6,Pillow,8.5,8.5
7,Life Vest,125.0,125.0
8,Canoe,280.0,280.0
9,Inflatable Mattress,38.0,38.0


In [58]:
# question 3: per customer, the number of order rows and the sum of their prices

run_sql("""\
select
    customerid
  , count(1) as num_orders
  , sum(price) as total
from items
group by customerid
""")

Unnamed: 0,customerid,num_orders,total
0,10438,3,95.24
1,10439,2,113.5
2,10449,6,930.79
3,10298,5,118.88
4,10299,2,1288.0
5,10101,6,320.75
6,10315,1,8.0
7,10330,3,72.75
8,10339,1,4.5
9,10410,2,281.72


## Having Clause

1. How many people are in each unique state in the customers table that have more than one person in the state? Select the state and display the number of how many people are in each if it's greater than 1.
1. From the items_ordered table, select the item, maximum price, and minimum price for each specific item in the table. Only display the results if the maximum price for one of the items is greater than 190.00.
1. How many orders did each customer make? Use the items_ordered table. Select the customerid, number of orders they made, and the sum of their orders if they purchased more than 1 item.

In [61]:
# question 1: states with more than one customer, largest count first.
# GROUP BY already produces exactly one row per state, so the select list does
# not need DISTINCT; HAVING then filters the grouped rows on their count.

run_sql("""\
select
    state
  , count(1) as state_count
from customers
group by state
having state_count>1
order by state_count desc
""")

Unnamed: 0,state,state_count
0,Arizona,6
1,Oregon,2
2,Washington,2
3,Colorado,2


In [62]:
# question 2: per-item highest and lowest price, keeping only the items whose
# highest price is above 190.0 (HAVING filters on the max_price alias)

run_sql("""\
select
    item
  , max(price) as max_price
  , min(price) as min_price
from items
group by item
having max_price > 190.0
""")

Unnamed: 0,item,max_price,min_price
0,Parachute,1250.0,1250.0
1,Canoe,280.0,280.0
2,Bicycle,380.5,380.5
3,Unicycle,192.5,180.79


In [63]:
# question 3: per-customer order count and price total, keeping only the
# customers with more than one order row

run_sql("""\
select
    customerid
  , count(1) as num_orders
  , sum(price) as total
from items
group by customerid
having num_orders > 1
""")

Unnamed: 0,customerid,num_orders,total
0,10438,3,95.24
1,10439,2,113.5
2,10449,6,930.79
3,10298,5,118.88
4,10299,2,1288.0
5,10101,6,320.75
6,10330,3,72.75
7,10410,2,281.72


## Order by

1. Select the lastname, firstname, and city for all customers in the customers table. Display the results in Ascending Order based on the lastname.
1. Same thing as exercise #1, but display the results in Descending order.
1. Select the item and price for all of the items in the items_ordered table whose price is greater than 10.00. Display the results in Ascending order based on the price.

In [64]:
# question 1: name and city of every customer, sorted by lastname ascending

run_sql("""\
select
    firstname
  , lastname
  , city
from customers
order by lastname asc
""")

Unnamed: 0,firstname,lastname,city
0,Leroy,Brown,Pinetop
1,Elroy,Cleaver,Globe
2,Shawn,Dalton,Cannon Beach
3,Donald,Davids,Gila Bend
4,Conrad,Giles,Telluride
5,Sarah,Graham,Greensboro
6,John,Gray,Lynden
7,Michael,Howell,Tillamook
8,Mary Ann,Howell,Charleston
9,Lisa,Jones,Oshkosh


In [65]:
# question 2: same columns as question 1, sorted by lastname descending

run_sql("""\
select
    firstname
  , lastname
  , city
from customers
order by lastname desc
""")

Unnamed: 0,firstname,lastname,city
0,Kevin,Smith,Durango
1,Ginger,Schultz,Pocatello
2,Anthony,Sanchez,Winslow
3,Linda,Sakahara,Nogales
4,Isabela,Moore,Yuma
5,Kelly,Mendoza,Kailua
6,Elroy,Keller,Snoqualmie
7,Lisa,Jones,Oshkosh
8,Michael,Howell,Tillamook
9,Mary Ann,Howell,Charleston


In [66]:
# question 3: item and price of the orders priced above 10.0, cheapest first

run_sql("""\
select
    item
  , price
from items
where price > 10.0
order by price asc
""")

Unnamed: 0,item,price
0,Ear Muffs,12.5
1,Hoola Hoop,14.75
2,Lantern,16.0
3,Shovel,16.75
4,Rain Coat,18.3
5,Helmet,22.0
6,Pocket Knife,22.38
7,Ski Poles,25.5
8,Pogo stick,28.0
9,Flashlight,28.0


## Combining Conditions & Boolean Operators

1. Select the customerid, order_date, and item from the items_ordered table for all items except 'Snow Shoes' and 'Ear Muffs'. Display a row only if its item is neither of these two.
1. Select the item and price of all items that start with the letters 'S', 'P', or 'F'.

In [75]:
# question 1: every order whose item is neither 'Ear Muffs' nor 'Snow Shoes',
# sorted by item name

run_sql("""\
select
    customerid
  , order_date
  , item
from items
where (item != 'Ear Muffs') and (item != 'Snow Shoes')
order by item
""")

Unnamed: 0,customerid,order_date,item
0,10449,15-Dec-1999,Bicycle
1,10449,22-Dec-1999,Canoe
2,10449,19-Mar-2000,Canoe paddle
3,10315,2-Feb-2000,Compass
4,10330,01-Jan-2000,Flashlight
5,10449,29-Feb-2000,Flashlight
6,10298,01-Dec-1999,Helmet
7,10101,30-Dec-1999,Hoola Hoop
8,10299,18-Jan-2000,Inflatable Mattress
9,10298,19-Sep-1999,Lantern


In [77]:
# question 2: item and price of the orders whose item name starts with
# 'S', 'P' or 'F', most expensive first

run_sql("""\
select
    item
  , price
from items
where  
     (item like 'S%')
  or (item like 'P%')
  or (item like 'F%')
order by price desc
""")

Unnamed: 0,item,price
0,Parachute,1250.0
1,Sleeping Bag,89.22
2,Sleeping Bag,88.7
3,Snow Shoes,45.0
4,Skateboard,33.0
5,Pogo stick,28.0
6,Flashlight,28.0
7,Ski Poles,25.5
8,Pocket Knife,22.38
9,Shovel,16.75


## IN & BETWEEN

1. Select the date, item, and price from the items_ordered table for all of the rows that have a price value ranging from 10.00 to 80.00.
1. Select the firstname, city, and state from the customers table for all of the rows where the state value is either: Arizona, Washington, Oklahoma, Colorado, or Hawaii.

In [78]:
# Warm-up: the same filter as question 1 of the previous section (everything
# except 'Ear Muffs' and 'Snow Shoes'), written with NOT IN instead of two
# != comparisons.
run_sql("""\
select
    customerid
  , order_date
  , item
from items
where item not in ('Ear Muffs', 'Snow Shoes')
order by item
""")

Unnamed: 0,customerid,order_date,item
0,10449,15-Dec-1999,Bicycle
1,10449,22-Dec-1999,Canoe
2,10449,19-Mar-2000,Canoe paddle
3,10315,2-Feb-2000,Compass
4,10330,01-Jan-2000,Flashlight
5,10449,29-Feb-2000,Flashlight
6,10298,01-Dec-1999,Helmet
7,10101,30-Dec-1999,Hoola Hoop
8,10299,18-Jan-2000,Inflatable Mattress
9,10298,19-Sep-1999,Lantern


In [80]:
# question 1: order_date, item and price of the orders priced in the
# 10.0 to 80.0 range
run_sql("""\
select
    order_date
  , item
  , price
from items
where price between 10.0 and 80.0
""")

Unnamed: 0,order_date,item,price
0,30-Jun-1999,Pogo stick,28.0
1,30-Jun-1999,Raft,58.0
2,01-Jul-1999,Skateboard,33.0
3,14-Aug-1999,Ski Poles,25.5
4,18-Aug-1999,Rain Coat,18.3
5,01-Sep-1999,Snow Shoes,45.0
6,19-Sep-1999,Lantern,29.0
7,01-Dec-1999,Helmet,22.0
8,30-Dec-1999,Hoola Hoop,14.75
9,01-Jan-2000,Flashlight,28.0


In [81]:
# question 2: customers whose state is one of the five listed values.
# lastname is selected in addition to the firstname, city and state that the
# question asks for.
run_sql("""\
select
    firstname
  , lastname
  , city
  , state
from customers
where state in ('Arizona', 'Washington', 'Oklahoma', 'Colorado', 'Hawaii')
""")

Unnamed: 0,firstname,lastname,city,state
0,John,Gray,Lynden,Washington
1,Leroy,Brown,Pinetop,Arizona
2,Elroy,Keller,Snoqualmie,Washington
3,Kelly,Mendoza,Kailua,Hawaii
4,Anthony,Sanchez,Winslow,Arizona
5,Elroy,Cleaver,Globe,Arizona
6,Donald,Davids,Gila Bend,Arizona
7,Linda,Sakahara,Nogales,Arizona
8,Kevin,Smith,Durango,Colorado
9,Conrad,Giles,Telluride,Colorado


## Mathematical Functions

Standard ANSI SQL-92 supports the first four of the following basic arithmetic operators; modulo (%) is not part of the standard, but most databases support it:

### arithmetic

* $+$ addition
* $-$ subtraction
* $*$ multiplication
* $/$ division
* % modulo

### functions

* ABS(x)	returns the absolute value of x
* SIGN(x)	returns the sign of input x as -1, 0, or 1 (negative, zero, or positive respectively)
* MOD(x,y)	modulo - returns the integer remainder of x divided by y (same as x%y)
* FLOOR(x)	returns the largest integer value that is less than or equal to x
* CEILING(x) or CEIL(x)	returns the smallest integer value that is greater than or equal to x
* POWER(x,y)	returns the value of x raised to the power of y
* ROUND(x)	returns the value of x rounded to the nearest whole integer
* ROUND(x,d)	returns the value of x rounded to the number of decimal places specified by the value d
* SQRT(x)	returns the square-root value of x

### questions
1. Select the item and per unit price for each item in the items_ordered table. Hint: Divide the price by the quantity.

In [88]:
# question 1 (the only question in this section): per-unit price of each item.
# Rows are grouped by item, so this divides the summed price by the summed
# quantity over all orders of that item, rather than computing price/quantity
# separately for every order row.
run_sql("""\
select
    item
  , sum(price)/sum(quantity) as price_per_unit
from items
group by item
""")

Unnamed: 0,item,price_per_unit
0,Tent,83.995
1,Parachute,1250.0
2,Lantern,15.0
3,Pogo stick,28.0
4,Umbrella,5.625
5,Shovel,16.75
6,Pillow,8.5
7,Life Vest,31.25
8,Canoe,280.0
9,Inflatable Mattress,38.0


## Joins

1. Write a query using a join to determine which items were ordered by each of the customers in the customers table. Select the customerid, firstname, lastname, order_date, item, and price for everything each customer purchased in the items_ordered table.
1. Repeat exercise #1, however display the results sorted by state in descending order.

In [92]:
# question 1: every order together with the customer who placed it.
# The join condition is given in ON, the same way the earlier customers/items
# join in this notebook is written, instead of in a WHERE that follows a bare
# INNER JOIN.
run_sql("""\
select
    c.customerid
  , c.firstname
  , c.lastname
  , i.order_date
  , i.item
  , i.price
from customers c
  inner join items i on c.customerid=i.customerid
""")

Unnamed: 0,customerid,firstname,lastname,order_date,item,price
0,10438,Kevin,Smith,01-Nov-1999,Umbrella,6.75
1,10438,Kevin,Smith,02-Nov-1999,Pillow,8.5
2,10438,Kevin,Smith,18-Jan-2000,Tent,79.99
3,10439,Conrad,Giles,14-Aug-1999,Ski Poles,25.5
4,10439,Conrad,Giles,18-Sep-1999,Tent,88.0
5,10449,Isabela,Moore,13-Aug-1999,Unicycle,180.79
6,10449,Isabela,Moore,01-Sep-1999,Snow Shoes,45.0
7,10449,Isabela,Moore,15-Dec-1999,Bicycle,380.5
8,10449,Isabela,Moore,22-Dec-1999,Canoe,280.0
9,10449,Isabela,Moore,29-Feb-2000,Flashlight,4.5


In [93]:
# question 2: the same customer/order join, sorted by the customer's state in
# descending order (state is used for sorting only and is not selected).
# The join condition is given in ON, the same way the earlier customers/items
# join in this notebook is written, instead of in a WHERE that follows a bare
# INNER JOIN.
run_sql("""\
select
    c.customerid
  , c.firstname
  , c.lastname
  , i.order_date
  , i.item
  , i.price
from customers c
  inner join items i on c.customerid=i.customerid
order by c.state desc
""")

Unnamed: 0,customerid,firstname,lastname,order_date,item,price
0,10315,Lisa,Jones,2-Feb-2000,Compass,8.0
1,10299,Elroy,Keller,06-Jul-1999,Parachute,1250.0
2,10299,Elroy,Keller,18-Jan-2000,Inflatable Mattress,38.0
3,10101,John,Gray,30-Jun-1999,Raft,58.0
4,10101,John,Gray,01-Jul-1999,Life Vest,125.0
5,10101,John,Gray,18-Aug-1999,Rain Coat,18.3
6,10101,John,Gray,30-Dec-1999,Hoola Hoop,14.75
7,10101,John,Gray,02-Jan-2000,Lantern,16.0
8,10101,John,Gray,08-Mar-2000,Sleeping Bag,88.7
9,10410,Mary Ann,Howell,28-Oct-1999,Sleeping Bag,89.22
