# Data Warehousing

In [1]:
import math
import os

import numpy as np
import pandas as pd

import psycopg2


In [2]:
#
# function to run a select query and return rows in a pandas dataframe
# pandas puts all numeric values from postgres to float
# if it will fit in an integer, change it to integer
#

def my_select_query_pandas(query, rollback_before_flag, rollback_after_flag):
    """Run a select query and return the rows in a pandas dataframe.

    The query is executed on the module-level ``connection``.

    Parameters
    ----------
    query : str
        The SQL select statement to run.
    rollback_before_flag : bool
        If true, ``connection.rollback()`` is called before the query runs.
    rollback_after_flag : bool
        If true, ``connection.rollback()`` is called after the query runs.

    Returns
    -------
    pandas.DataFrame
        The query result. Every ``float64`` column whose non-null values are
        all whole numbers that fit in a 64-bit integer is converted to the
        nullable ``Int64`` dtype (nulls become ``<NA>``); a column with no
        non-null values is converted as well. All other columns are returned
        unchanged.
    """

    if rollback_before_flag:
        connection.rollback()

    df = pd.read_sql_query(query, connection)

    if rollback_after_flag:
        connection.rollback()

    # fix the float columns that really should be integers

    # magnitudes at or above 2**63 (including +/- infinity) cannot be stored
    # in an Int64 column, so such columns are left as float64
    int64_limit = 2.0 ** 63

    for column in df:

        if df[column].dtype != "float64":
            continue

        # nulls do not take part in the decision; they map to <NA> in Int64
        values = df[column].dropna().to_numpy()

        fits_in_int64 = (np.abs(values) < int64_limit).all()
        is_whole = (values == np.floor(values)).all()

        if fits_in_int64 and is_whole:
            df[column] = df[column].astype("Int64")

    return df
    

In [3]:
# Open the connection used by every cell below.
# Each setting can be overridden through the standard libpq environment
# variable of the same meaning, so credentials do not have to be edited into
# the notebook; the fallback values are the original defaults used here.
connection = psycopg2.connect(
    user = os.environ.get("PGUSER", "postgres"),
    password = os.environ.get("PGPASSWORD", "ucb"),
    host = os.environ.get("PGHOST", "postgres"),
    port = os.environ.get("PGPORT", "5432"),
    database = os.environ.get("PGDATABASE", "postgres")
)

In [4]:
# Cursor shared by the cells below that run drop / create / copy / insert statements.
cursor = connection.cursor()

# Lab: Querying Dimensional Model - Single Star Schema

![Star Schema](star_schema.JPG)

##  Drop, create, and load the tables for the line item star schema

In [5]:
# Roll back whatever transaction is currently open on the connection before starting.
connection.rollback()

# Drop the fact table first, then the five dimension tables.
# "if exists" lets this cell run whether or not the tables are present.
query = """

drop table if exists line_item_facts;

drop table if exists receipt_dimension;

drop table if exists date_dimension;

drop table if exists customer_dimension;

drop table if exists store_dimension;

drop table if exists product_dimension;



"""

cursor.execute(query)

# Make the drops permanent.
connection.commit()


In [6]:
# Roll back whatever transaction is currently open on the connection before starting.
connection.rollback()

# Create the five dimension tables, then the fact table.
# line_item_facts is created last because its foreign keys reference each
# dimension table's primary key; its own primary key is the combination of
# all five dimension keys.
query = """

create table receipt_dimension (
  receipt_key numeric(12),
  receipt varchar(32),
  primary key (receipt_key)
);

create table date_dimension (
  date_key numeric(12),
  date_value date,
  dow numeric(1),
  dow_string varchar(9),
  month numeric(2),
  month_string varchar(9),
  primary key (date_key)
);

create table customer_dimension (
  customer_key numeric(12),
  customer_id numeric(6),
  first_name varchar(32),
  last_name varchar(32),
  street varchar(32),
  city varchar(32),
  state varchar(2),
  zip varchar(5),
  distance numeric(3),
  primary key (customer_key)
);


create table store_dimension (
  store_key numeric(12),
  store_id numeric(6),
  street varchar(32),
  city varchar(32),
  state varchar(2),
  zip varchar(5),
  latitude numeric(7,4),
  longitude numeric(7,4),
  primary key (store_key)
);


create table product_dimension (
  product_key numeric(12),
  product_id numeric(3),
  product_name varchar(32),
  primary key (product_key)  
);


create table line_item_facts (
  receipt_key numeric(12),
  date_key numeric(12),
  customer_key numeric(12),
  store_key numeric(12),
  product_key numeric(12),
  quantity numeric(3),
  price numeric(5,2),
  line_item_sub_total numeric(6),
  line_item_tax numeric(6),
  line_item_total numeric(6),
  primary key (receipt_key, date_key, customer_key, store_key, product_key),
  foreign key (receipt_key) references receipt_dimension (receipt_key),
  foreign key (date_key) references date_dimension (date_key),
  foreign key (customer_key) references customer_dimension (customer_key),
  foreign key (store_key) references store_dimension (store_key),
  foreign key (product_key) references product_dimension (product_key)
);


"""

cursor.execute(query)

# Make the new tables permanent.
connection.commit()

In [7]:
# Roll back whatever transaction is currently open on the connection before starting.
connection.rollback()
    
# Load each table from its csv file (comma delimited, header row, empty
# string read as NULL).
# The dimension tables are loaded before line_item_facts, whose foreign keys
# reference them.
# NOTE(review): with "copy ... from '<file>'" the path is presumably resolved
# by the postgres server rather than by this notebook's kernel - confirm the
# files are visible to the server at these paths.
query = """
    

copy receipt_dimension
from '/user/labs/week_14/receipt_dimension.csv' delimiter ',' NULL '' csv header;

copy date_dimension
from '/user/labs/week_14/date_dimension.csv' delimiter ',' NULL '' csv header;

copy customer_dimension
from '/user/labs/week_14/customer_dimension.csv' delimiter ',' NULL '' csv header;

copy store_dimension
from '/user/labs/week_14/store_dimension.csv' delimiter ',' NULL '' csv header;

copy product_dimension
from '/user/labs/week_14/product_dimension.csv' delimiter ',' NULL '' csv header;

copy line_item_facts
from  '/user/labs/week_14/line_item_facts.csv' delimiter ',' NULL '' csv header;

"""

cursor.execute(query)
    
# Make the loaded rows permanent.
connection.commit()


##  Queries to a star schema always use the same pattern: join the dimensions to the fact table;  use an inner join because "holes" and nulls are not allowed in star schemas

In [8]:
# Ask the helper to roll back both before and after running the select.
rollback_before_flag = True
rollback_after_flag = True

# The standard star schema pattern: join every dimension to the fact table on
# its key and list each line item together with its dimension attributes,
# ordered by receipt.
# The store address columns are aliased (store_street, store_city, ...) so
# they do not share names with the customer address columns.
query = """

select l.quantity,
       l.price,
       l.line_item_sub_total,
       l.line_item_tax,
       l.line_item_total,
       r.receipt_key,
       r.receipt,
       d.date_key,
       d.dow,
       d.dow_string,
       d.month,
       d.month_string,
       c.customer_key,
       c.customer_id,
       c.last_name,
       c.first_name,
       c.street,
       c.city,
       c.state,
       c.zip,
       c.distance,
       s.store_key,
       s.store_id,
       s.street as store_street,
       s.city as store_city,
       s.state as store_state,
       s.zip as store_zip,
       s.latitude,
       s.longitude,
       p.product_key,
       p.product_id,
       p.product_name
from line_item_facts as l
     join receipt_dimension as r
         on l.receipt_key = r.receipt_key
     join date_dimension as d
         on l.date_key = d.date_key
     join customer_dimension as c
         on l.customer_key = c.customer_key
     join store_dimension as s
         on l.store_key = s.store_key
     join product_dimension as p
         on l.product_key = p.product_key
order by r.receipt

"""

# The returned dataframe is the cell's displayed output.
my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,quantity,price,line_item_sub_total,line_item_tax,line_item_total,receipt_key,receipt,date_key,dow,dow_string,...,store_id,store_street,store_city,store_state,store_zip,latitude,longitude,product_key,product_id,product_name
0,1,12,12,0,12,601941,001-000128112,375044,4,Thursday,...,1,3000 Telegraph Ave,Berkeley,CA,94705,37.8555,-122.2604,142,1,Pistachio Salmon
1,1,12,12,0,12,601941,001-000128112,375044,4,Thursday,...,1,3000 Telegraph Ave,Berkeley,CA,94705,37.8555,-122.2604,149,8,Brocolli Stir Fry
2,1,12,12,0,12,618078,001-000144249,375060,6,Saturday,...,1,3000 Telegraph Ave,Berkeley,CA,94705,37.8555,-122.2604,142,1,Pistachio Salmon
3,1,12,12,0,12,618078,001-000144249,375060,6,Saturday,...,1,3000 Telegraph Ave,Berkeley,CA,94705,37.8555,-122.2604,143,2,Teriyaki Chicken
4,2,12,24,0,24,618078,001-000144249,375060,6,Saturday,...,1,3000 Telegraph Ave,Berkeley,CA,94705,37.8555,-122.2604,145,4,Eggplant Lasagna
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
171,1,12,12,0,12,621370,005-000147541,375163,4,Thursday,...,5,1202 Broadway,Nashville,TN,37203,36.1568,-86.7881,145,4,Eggplant Lasagna
172,1,12,12,0,12,621370,005-000147541,375163,4,Thursday,...,5,1202 Broadway,Nashville,TN,37203,36.1568,-86.7881,148,7,Tilapia Piccata
173,1,12,12,0,12,631748,005-000157919,375181,1,Monday,...,5,1202 Broadway,Nashville,TN,37203,36.1568,-86.7881,142,1,Pistachio Salmon
174,2,12,24,0,24,631748,005-000157919,375181,1,Monday,...,5,1202 Broadway,Nashville,TN,37203,36.1568,-86.7881,145,4,Eggplant Lasagna


##  You try it - using the star schema, for each receipt, find the subtotal, tax, and total amounts

In [13]:
# Ask the helper to roll back both before and after running the select.
rollback_before_flag = True
rollback_after_flag = True

# Sum the line item sub total, tax, and total for each receipt.
# Only the receipt dimension is joined, because the receipt number is the
# only dimension attribute this result uses.
query = """

select r.receipt,
       sum(l.line_item_sub_total) as sub_total,
       sum(l.line_item_tax) as tax,
       sum(l.line_item_total) as total
from line_item_facts as l
     join receipt_dimension as r
         on l.receipt_key = r.receipt_key
group by r.receipt
order by r.receipt

"""

# The returned dataframe is the cell's displayed output.
my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,receipt,sub_total,tax,total
0,001-000128112,24,0,24
1,001-000144249,84,0,84
2,001-000163141,96,0,96
3,001-000169216,144,0,144
4,001-000179181,48,0,48
5,001-000181897,48,0,48
6,001-000248269,60,0,60
7,001-000250031,24,0,24
8,001-000255285,36,0,36
9,001-000263524,48,0,48


# Lab: Querying Dimensional Model - Drilling Across Multiple Star Schemas

![Drilling Across](drilling_across.JPG)

##  Drop, create, and populate the tables that hold the results of star schema queries on the orders star schema and the fulfillment star schema

In [14]:
# Roll back whatever transaction is currently open on the connection before starting.
connection.rollback()

# "if exists" lets this cell run whether or not the tables are present.
query = """

drop table if exists orders;

drop table if exists fulfillment;


"""

cursor.execute(query)

# Make the drops permanent.
connection.commit()


In [15]:
# Roll back whatever transaction is currently open on the connection before starting.
connection.rollback()

# Two stand-alone tables with no primary or foreign keys declared.
# A fulfillment row has one order_id column plus three additional_order_id
# columns, so one fulfillment can list up to four orders.
query = """

create table orders (
  order_id numeric(12),
  order_date date,
  sub_total numeric(5),
  tax numeric(5),
  total numeric(5)
);

create table fulfillment (
  fulfillment_id numeric(12),
  fulfillment_date date,
  order_id numeric(12),
  additional_order_id_1 numeric(12),
  additional_order_id_2 numeric(12),
  additional_order_id_3 numeric(12)
);

"""

cursor.execute(query)

# Make the new tables permanent.
connection.commit()

In [16]:
# Roll back whatever transaction is currently open on the connection before starting.
connection.rollback()

# Sample data: six orders and six fulfillments.
# Orders 1 and 4 each appear in the order_id column of two fulfillments.
# Orders 3 and 6 appear only in additional_order_id_1 (fulfillments 13 and 16).
# 0 appears to mark an unused additional_order_id slot - no order has id 0.
query = """

insert into orders values(1, '2020-11-01', 36, 0, 36);
insert into orders values(2, '2020-11-02', 48, 0, 48);
insert into orders values(3, '2020-11-03', 24, 0, 24);
insert into orders values(4, '2020-11-04', 12, 0, 12);
insert into orders values(5, '2020-11-05', 36, 0, 36);
insert into orders values(6, '2020-11-06', 48, 0, 48);

insert into fulfillment values(11, '2020-11-05', 1, 0, 0, 0);
insert into fulfillment values(12, '2020-11-06', 1, 0, 0, 0);
insert into fulfillment values(13, '2020-11-07', 2, 3, 0, 0);
insert into fulfillment values(14, '2020-11-08', 4, 0, 0, 0);
insert into fulfillment values(15, '2020-11-09', 4, 0, 0, 0);
insert into fulfillment values(16, '2020-11-10', 5, 6, 0, 0);

"""

cursor.execute(query)

# Make the inserted rows permanent.
connection.commit()

##  The star schema orders query results

In [17]:
# Ask the helper to roll back both before and after running the select.
rollback_before_flag = True
rollback_after_flag = True

# Show every row and column of the orders table.
query = """

select *
from orders

"""

# The returned dataframe is the cell's displayed output.
my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,order_id,order_date,sub_total,tax,total
0,1,2020-11-01,36,0,36
1,2,2020-11-02,48,0,48
2,3,2020-11-03,24,0,24
3,4,2020-11-04,12,0,12
4,5,2020-11-05,36,0,36
5,6,2020-11-06,48,0,48


##  The star schema fulfillment query results

In [18]:
# Ask the helper to roll back both before and after running the select.
rollback_before_flag = True
rollback_after_flag = True

# Show every row and column of the fulfillment table.
query = """

select *
from fulfillment

"""

# The returned dataframe is the cell's displayed output.
my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,fulfillment_id,fulfillment_date,order_id,additional_order_id_1,additional_order_id_2,additional_order_id_3
0,11,2020-11-05,1,0,0,0
1,12,2020-11-06,1,0,0,0
2,13,2020-11-07,2,3,0,0
3,14,2020-11-08,4,0,0,0
4,15,2020-11-09,4,0,0,0
5,16,2020-11-10,5,6,0,0


##  Since orders and fulfillment are both star schemas, there is no defined relationship between them;  Essentially we need to do a "dangerous join" between the two fact tables, which can result in the "extra rows" problem and the "missing rows" problem

##  Let's do a dangerous join between orders and fulfillment and look at order 1;  order 1 was fulfilled (delivered) partially on two different days;  this creates "extra rows" when we join them

In [19]:
# Ask the helper to roll back both before and after running the select.
rollback_before_flag = True
rollback_after_flag = True

# Join orders to fulfillment on the fulfillment's order_id column only and
# keep order 1. Each matching fulfillment produces its own result row, and
# every such row repeats the order's sub_total, tax, and total.
query = """

select o.order_id,
       o.order_date,
       o.sub_total,
       o.tax,
       o.total,
       f.fulfillment_id,
       f.fulfillment_date
from orders as o
     join fulfillment as f
         on o.order_id = f.order_id
where o.order_id = 1
order by 1

"""

# The returned dataframe is the cell's displayed output.
my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,order_id,order_date,sub_total,tax,total,fulfillment_id,fulfillment_date
0,1,2020-11-01,36,0,36,11,2020-11-05
1,1,2020-11-01,36,0,36,12,2020-11-06


##  If we are not careful, we can double (or multi) count the extra rows, as in an aggregation; the aggregation below has double the sub total, tax, and total

In [21]:
# Ask the helper to roll back both before and after running the select.
rollback_before_flag = True
rollback_after_flag = True

# Same join as before for order 1, but aggregated: the sums run over one row
# per matching fulfillment, so the order's amounts are added once for each
# fulfillment instead of once for the order.
query = """

select o.order_id,
       o.order_date,
       sum(o.sub_total) as sub_total,
       sum(o.tax) as tax,
       sum(o.total) as total
from orders as o
     join fulfillment as f
         on o.order_id = f.order_id
where o.order_id = 1
group by o.order_id, o.order_date
order by 1

"""

# The returned dataframe is the cell's displayed output.
my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,order_id,order_date,sub_total,tax,total
0,1,2020-11-01,72,0,72


##  Fulfillment 13 delivered two orders to the customer on the same day, order 2 and order 3;  star schemas do not allow the fact table to be a parent of the dimension, so we have to denormalize the orders in the fulfillment record;  in our example, order 3 does not have a primary fulfillment record, which leads to the missing row problem

In [22]:
# Ask the helper to roll back both before and after running the select.
rollback_before_flag = True
rollback_after_flag = True

# Join orders to fulfillment for order 3. The join condition compares only
# fulfillment.order_id; the additional_order_id columns are not part of it,
# so an order listed only in one of those columns finds no match and the
# inner join returns no row for it.
query = """

select o.order_id,
       o.order_date,
       o.sub_total,
       o.tax,
       o.total,
       f.fulfillment_id,
       f.fulfillment_date
from orders as o
     join fulfillment as f
         on o.order_id = f.order_id
where o.order_id = 3
order by 1

"""

# The returned dataframe is the cell's displayed output.
my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,order_id,order_date,sub_total,tax,total,fulfillment_id,fulfillment_date


##  "Drilling Across" is the solution;  drilling across means that we execute an independent query for each star schema, then write procedural code in a language such as Python to combine them with logic to handle the missing rows and extra rows problems; we cannot write SQL to handle a join on star schemas, it will never work!

##  You try it - find another extra rows problem and another missing rows problem when joining the orders star schema and the fulfillment star schema

In [23]:
# Ask the helper to roll back both before and after running the select.
rollback_before_flag = True
rollback_after_flag = True

# Another extra rows example: aggregate the orders-to-fulfillment join for
# order 4. The sums run over one row per matching fulfillment, so the order's
# amounts are added once for each fulfillment that has order_id = 4.
query = """

select o.order_id,
       o.order_date,
       sum(o.sub_total) as sub_total,
       sum(o.tax) as tax,
       sum(o.total) as total
from orders as o
     join fulfillment as f
         on o.order_id = f.order_id
where o.order_id = 4
group by o.order_id, o.order_date
order by 1

"""

# The returned dataframe is the cell's displayed output.
my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,order_id,order_date,sub_total,tax,total
0,4,2020-11-04,24,0,24


In [24]:
# Ask the helper to roll back both before and after running the select.
rollback_before_flag = True
rollback_after_flag = True

# Another missing rows example: join orders to fulfillment for order 6. The
# join condition compares only fulfillment.order_id; the additional_order_id
# columns are not part of it, so an order listed only in one of those columns
# finds no match and the inner join returns no row for it.
query = """

select o.order_id,
       o.order_date,
       o.sub_total,
       o.tax,
       o.total,
       f.fulfillment_id,
       f.fulfillment_date
from orders as o
     join fulfillment as f
         on o.order_id = f.order_id
where o.order_id = 6
order by 1

"""

# The returned dataframe is the cell's displayed output.
my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,order_id,order_date,sub_total,tax,total,fulfillment_id,fulfillment_date
