# Data Wrangling 2.4

In [1]:
import math
import numpy as np
import pandas as pd

import psycopg2

import json

import csv

from datetime import datetime as dt

from IPython.display import display, HTML


In [2]:
# Open the shared PostgreSQL connection used by every query in this notebook.
# NOTE(review): the credentials are hard-coded for the course environment;
# do not reuse this pattern with real secrets.
connection = psycopg2.connect(
    user="postgres",
    password="ucb",
    host="postgres",
    port="5432",
    # "dbname" is the documented parameter; "database" is a deprecated alias.
    dbname="postgres",
)

In [3]:
# Cursor on the shared connection; my_select_query_pandas does not use it,
# it hands the connection itself to pandas.
cursor = connection.cursor()

In [4]:
#
# function to run a select query and return rows in a pandas dataframe
# pandas returns numeric values from postgres as floats
# if a float column holds only whole numbers, change it to an integer column
#

def my_select_query_pandas(query, rollback_before_flag, rollback_after_flag):
    """Run a select query and return the rows in a pandas dataframe.

    Args:
        query: SQL text handed to ``pd.read_sql_query``.
        rollback_before_flag: when true, roll back the module-level
            ``connection`` before running the query.
        rollback_after_flag: when true, roll back the module-level
            ``connection`` after the query, whether it succeeded or raised.

    Returns:
        The query result as a DataFrame. Each float64 column whose non-NaN
        values are all finite whole numbers inside the int64 range is
        converted to the nullable ``Int64`` dtype; other columns are left
        as they were read.
    """
    if rollback_before_flag:
        connection.rollback()

    try:
        df = pd.read_sql_query(query, connection)
    finally:
        # Honour the flag on failure too, so a query that raises does not
        # skip the requested rollback.
        if rollback_after_flag:
            connection.rollback()

    # fix the float columns that really should be integers
    for column in df:
        if df[column].dtype != "float64":
            continue

        values = df[column].to_numpy()
        present = values[~np.isnan(values)]

        # Infinity has no integer form and values beyond int64 cannot be
        # stored in Int64, so such columns stay float64.
        integral = (
            np.isfinite(present).all()
            and (present == np.floor(present)).all()
            and (present >= -(2.0 ** 63)).all()
            and (present < 2.0 ** 63).all()
        )

        if integral:
            df[column] = df[column].astype("Int64")

    return df
    

# Lab: Data Cleansing - Consistency, Contradictions

## Find sales where the total amount in the sales record does not match the sum of the line items

In [5]:
# Sales whose total_amount differs from 12 times the summed line-item
# quantity for the same store_id and sale_id.
rollback_before_flag = rollback_after_flag = True

query = """

select *
from stage_3_sales as sa
where total_amount::numeric <> (select sum(quantity::numeric) * 12 
                                from stage_3_line_items as l 
                                where sa.store_id = l.store_id and sa.sale_id = l.sale_id)


"""

my_select_query_pandas(
    query,
    rollback_before_flag=rollback_before_flag,
    rollback_after_flag=rollback_after_flag,
)

Unnamed: 0,stage_id,store_id,sale_id,customer_id,sale_date,total_amount
0,7,1,681140,4260,2021-08-22,60
1,11,1,696395,1991,2021-09-07,48
2,14,2,548317,11779,2021-05-03,12
3,28,3,597247,15799,2021-07-22,424


## You try it - join stage_3_sales to stage_3_line_items to show the details for the contradictions on total_amount

In [6]:
# Show every row of stage_3_line_items.
rollback_before_flag = rollback_after_flag = True

query = """

select *
from stage_3_line_items
"""

my_select_query_pandas(
    query,
    rollback_before_flag=rollback_before_flag,
    rollback_after_flag=rollback_after_flag,
)

Unnamed: 0,stage_id,store_id,sale_id,line_item_id,product_id,quantity
0,1,1,560983,1,1,1
1,2,1,560983,2,8,1
2,3,1,577120,1,1,1
3,4,1,577120,2,2,1
4,5,1,577120,3,4,2
...,...,...,...,...,...,...
171,172,5,580412,4,4,1
172,173,5,580412,5,7,1
173,174,5,590790,1,1,1
174,175,5,590790,2,4,2


In [10]:
# Join each sale to its line items and keep the sales whose total_amount
# differs from 12 times the summed line-item quantity.
# NOTE: the subquery reuses the alias "l"; inside the subquery it refers to
# the subquery's own stage_3_line_items, not the joined one.
rollback_before_flag = rollback_after_flag = True

query = """

select sa.stage_id,
       sa.store_id,
       sa.sale_id,
       sa.customer_id,
       sa.sale_date,
       sa.total_amount,
       l.line_item_id,
       l.product_id,
       l.quantity
from stage_3_sales as sa
    join stage_3_line_items as l
        on sa.store_id = l.store_id and sa.sale_id = l.sale_id
where total_amount::numeric <> (select sum(quantity::numeric) * 12 
                                from stage_3_line_items as l 
                                where sa.store_id = l.store_id and sa.sale_id = l.sale_id)
order by sa.store_id, sa.sale_id, sa.customer_id

"""

my_select_query_pandas(
    query,
    rollback_before_flag=rollback_before_flag,
    rollback_after_flag=rollback_after_flag,
)

Unnamed: 0,stage_id,store_id,sale_id,customer_id,sale_date,total_amount,line_item_id,product_id,quantity
0,7,1,681140,4260,2021-08-22,60,3,8,1
1,7,1,681140,4260,2021-08-22,60,2,5,1
2,7,1,681140,4260,2021-08-22,60,1,1,10
3,11,1,696395,1991,2021-09-07,48,2,6,1
4,11,1,696395,1991,2021-09-07,48,1,4,2
5,14,2,548317,11779,2021-05-03,12,1,6,23
6,28,3,597247,15799,2021-07-22,424,5,8,1
7,28,3,597247,15799,2021-07-22,424,4,7,1
8,28,3,597247,15799,2021-07-22,424,3,6,2
9,28,3,597247,15799,2021-07-22,424,2,4,2


# Lab: Data Cleansing - Completeness

## Find incomplete sales without line items

In [11]:
# Sales that have no line item with the same store_id and sale_id.
# NOT EXISTS is used instead of NOT IN: with NOT IN, a NULL key in
# stage_3_line_items can make the comparison evaluate to NULL and silently
# drop sales that should be reported.
rollback_before_flag = True
rollback_after_flag = True

query = """

select *
from stage_3_sales as sa
where not exists (select 1
                  from stage_3_line_items as l
                  where l.store_id = sa.store_id and l.sale_id = sa.sale_id)
order by stage_id, store_id, sale_id

"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,stage_id,store_id,sale_id,customer_id,sale_date,total_amount
0,56,55,590790,27997,2021-09-14,48


## You try it - find line items without a sale

In [12]:
# Line items that have no sale with the same store_id and sale_id.
# NOT EXISTS is used instead of NOT IN: with NOT IN, a NULL key in
# stage_3_sales can make the comparison evaluate to NULL and silently drop
# line items that should be reported.
rollback_before_flag = True
rollback_after_flag = True

query = """

select *
from stage_3_line_items as l
where not exists (select 1
                  from stage_3_sales as sa
                  where sa.store_id = l.store_id and sa.sale_id = l.sale_id)
order by stage_id, store_id, sale_id

"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,stage_id,store_id,sale_id,line_item_id,product_id,quantity
0,35,1,696394,3,8,1
1,174,5,590790,1,1,1
2,175,5,590790,2,4,2
3,176,5,590790,3,7,1


# Lab: Data Cleansing - Uniformity

## Distance is recorded in miles or kilometers in the stage_3_customers table; zip code 37208 has one record in miles and one record in kilometers, each rounded to the nearest mile or kilometer

In [13]:
# Show zip and distance for the stage_3_customers rows with zip '37208'.
rollback_before_flag = rollback_after_flag = True

query = """

select zip, distance
from stage_3_customers
where zip = '37208'

"""

my_select_query_pandas(
    query,
    rollback_before_flag=rollback_before_flag,
    rollback_after_flag=rollback_after_flag,
)

Unnamed: 0,zip,distance
0,37208,3
1,37208,2


## You try it - check for uniformity in capitalization in last names in stage_3_customers

In [16]:
# Customers whose last_name is equal to its upper-cased form.
rollback_before_flag = rollback_after_flag = True

query = """

select *
from stage_3_customers
where last_name = upper(last_name)

"""

my_select_query_pandas(
    query,
    rollback_before_flag=rollback_before_flag,
    rollback_after_flag=rollback_after_flag,
)

Unnamed: 0,stage_id,customer_id,first_name,last_name,street,city,state,zip,closest_store_id,distance
0,36,23347,Roice,FIELDERS,8 Hermina Parkway,Key Bisscain,FL,33149,4,4
1,40,25339,Lucylia,LOGG,62940 Towne Terrace,Miami,FL,33166,4,9
