# Data Warehousing Solutions

In [1]:
import math
import numpy as np
import pandas as pd

import psycopg2


In [2]:
#
# function to run a select query and return rows in a pandas dataframe
# pandas returns all numeric values from postgres as floats
# if every value in a float column is a whole number, change the column to integer
#

def my_select_query_pandas(query, rollback_before_flag, rollback_after_flag):
    """Run a select query and return the rows in a pandas dataframe.

    The query runs on the module-level ``connection``.

    Args:
        query: SQL text passed unchanged to ``pd.read_sql_query``.
        rollback_before_flag: when true, call ``connection.rollback()``
            before running the query.
        rollback_after_flag: when true, call ``connection.rollback()``
            after the query, whether it succeeded or raised.

    Returns:
        The result rows as a dataframe.  Every ``float64`` column whose
        non-null values are all whole numbers that fit in a 64-bit integer
        is converted to the nullable ``Int64`` dtype; other columns are
        returned as pandas produced them.
    """

    if rollback_before_flag:
        connection.rollback()

    try:
        df = pd.read_sql_query(query, connection)
    finally:
        # roll back even when the query raises, so a failed query does not
        # leave an open transaction behind on the shared connection
        if rollback_after_flag:
            connection.rollback()

    # fix the float columns that really should be integers

    for column in df:

        if df[column].dtype == "float64":

            # nulls (NaN) are ignored when deciding; they become <NA> in Int64
            values = df[column].dropna()

            is_whole = values == np.floor(values)

            # also rejects +/-inf and values too large for int64, which
            # cannot be converted and would make astype('Int64') fail
            fits_int64 = (values >= -2.0 ** 63) & (values < 2.0 ** 63)

            if (is_whole & fits_int64).all():
                df[column] = df[column].astype('Int64')

    return df
    

In [3]:
# open the postgres connection that my_select_query_pandas reads as a global;
# arguments grouped as server location first, then credentials
connection = psycopg2.connect(
    host="postgres",
    port="5432",
    database="postgres",
    user="postgres",
    password="ucb",
)

In [4]:
# cursor on the shared connection; the query cells visible in this section
# go through my_select_query_pandas and do not use it directly
cursor = connection.cursor()

##  You try it - using the star schema, for each receipt, find the subtotal, tax, and total amounts

In [5]:
# both flags are passed to my_select_query_pandas, which rolls the connection
# back before and after running the query
rollback_before_flag = True
rollback_after_flag = True

# per receipt: sum the line item sub total, tax, and total from
# line_item_facts, joined to receipt_dimension on receipt_key
query = """

select r.receipt,
       sum(l.line_item_sub_total) as sub_total,
       sum(l.line_item_tax) as tax,
       sum(l.line_item_total) as total
from line_item_facts as l
     join receipt_dimension as r
         on l.receipt_key = r.receipt_key
group by r.receipt
order by r.receipt

"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,receipt,sub_total,tax,total
0,001-000128112,24,0,24
1,001-000144249,84,0,84
2,001-000163141,96,0,96
3,001-000169216,144,0,144
4,001-000179181,48,0,48
5,001-000181897,48,0,48
6,001-000248269,60,0,60
7,001-000250031,24,0,24
8,001-000255285,36,0,36
9,001-000263524,48,0,48


##  You try it - find another extra rows problem and another missing rows problem when joining the orders star schema and the fulfillment star schema

##  order 4 has the extra rows problem, as it was fulfilled in fulfillments 14 and 15

In [None]:
# both flags are passed to my_select_query_pandas, which rolls the connection
# back before and after running the query
rollback_before_flag = True
rollback_after_flag = True

# order 4 joined to fulfillment on order_id: the inner join returns one row
# per matching fulfillment row, each repeating the order's amounts
query = """

select o.order_id,
       o.order_date,
       o.sub_total,
       o.tax,
       o.total,
       f.fulfillment_id,
       f.fulfillment_date
from orders as o
     join fulfillment as f
         on o.order_id = f.order_id
where o.order_id = 4
order by 1

"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

##  order 4 will be double counted

In [6]:
# both flags are passed to my_select_query_pandas, which rolls the connection
# back before and after running the query
rollback_before_flag = True
rollback_after_flag = True

# same join as the previous cell, but summing the order amounts: the sums run
# over the joined rows, so the order's amounts are added once per matching
# fulfillment row
query = """

select o.order_id,
       o.order_date,
       sum(o.sub_total) as sub_total,
       sum(o.tax) as tax,
       sum(o.total) as total
from orders as o
     join fulfillment as f
         on o.order_id = f.order_id
where o.order_id = 4
group by o.order_id, o.order_date
order by 1

"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,order_id,order_date,sub_total,tax,total
0,4,2020-11-04,24,0,24


##  fulfillment 15 fulfilled orders 5 and 6, with 5 as primary;  if we query for the fulfillment for order 6, it will be missing

In [7]:
# both flags are passed to my_select_query_pandas, which rolls the connection
# back before and after running the query
rollback_before_flag = True
rollback_after_flag = True

# order 6 joined to fulfillment on order_id: an inner join, so the order only
# appears if some fulfillment row carries order_id = 6
query = """

select o.order_id,
       o.order_date,
       o.sub_total,
       o.tax,
       o.total,
       f.fulfillment_id,
       f.fulfillment_date
from orders as o
     join fulfillment as f
         on o.order_id = f.order_id
where o.order_id = 6
order by 1

"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,order_id,order_date,sub_total,tax,total,fulfillment_id,fulfillment_date
