# Data Wrangling 2.3

In [1]:
import math
import numpy as np
import pandas as pd

import psycopg2

import json

import csv

from datetime import datetime as dt

from IPython.display import display, HTML


In [2]:
# Open the single PostgreSQL connection that the rest of the notebook shares.
# NOTE(review): the credentials are hard-coded; presumably these are the
# defaults of the lab's database container - confirm before reusing elsewhere.
connection = psycopg2.connect(
    user="postgres",
    password="ucb",
    host="postgres",
    port="5432",
    database="postgres",
)

In [3]:
# Cursor on the shared connection; later cells use it to execute raw SQL directly.
cursor = connection.cursor()

In [4]:
#
# function to run a select query and return rows in a pandas dataframe
# pandas puts all numeric values from postgres to float
# if it will fit in an integer, change it to integer
#

def my_select_query_pandas(query, rollback_before_flag, rollback_after_flag):
    """Run a select query and return its rows in a pandas DataFrame.

    The query runs on the module-level ``connection``.  Any ``float64``
    column whose non-null values are all whole numbers that fit in a 64-bit
    integer is converted to the nullable ``Int64`` dtype, so missing values
    survive the conversion as ``<NA>``.  Columns holding a fractional,
    infinite or out-of-range value are left as ``float64``.

    Args:
        query: SQL text handed to ``pd.read_sql_query``.
        rollback_before_flag: when true, roll the connection back before
            running the query.
        rollback_after_flag: when true, roll the connection back after the
            query has run, including when the query raised.

    Returns:
        The query result as a DataFrame.

    Raises:
        Whatever ``pd.read_sql_query`` raises for a failing query; the
        exception is propagated unchanged.
    """

    if rollback_before_flag:
        connection.rollback()

    try:
        df = pd.read_sql_query(query, connection)
    finally:
        # Also roll back on failure so the flag is honoured when the query
        # raises, not only when it succeeds.
        if rollback_after_flag:
            connection.rollback()

    # Magnitudes below 2**63 fit in int64; infinities fail this test too.
    int64_limit = 2.0 ** 63

    # Fix the float columns that really should be integers.
    for column in df:
        if df[column].dtype != "float64":
            continue

        values = df[column].to_numpy()
        present = values[~np.isnan(values)]  # NaN marks a missing value

        fits_int64 = np.all(np.abs(present) < int64_limit)
        if fits_int64 and np.all(present == np.floor(present)):
            df[column] = df[column].astype("Int64")

    return df
    

# Lab: Data Cleansing - Data That Does Not Match Validation Rules

## The invalid sales date of 2021-17-14 will generate an error when we try to convert it from varchar to date

In [5]:
# Start from a clean transaction on the shared connection.
connection.rollback()

# Cast sale_date to date for every row of stage_3_sales; a value that is not
# a valid date makes the cast fail (see the recorded error below this cell).
query = """

select sale_date::date
from stage_3_sales


"""
# Executed directly on the cursor without fetching: the purpose of this cell
# is to surface the conversion error, not to display rows.
cursor.execute(query)

            
    

DatetimeFieldOverflow: date/time field value out of range: "2021-17-14"
HINT:  Perhaps you need a different "datestyle" setting.


## You try it - see if the product_id in line_items has valid numeric data

In [6]:
# Start from a clean transaction on the shared connection.
connection.rollback()

# Cast product_id to numeric for every row of stage_3_line_items; a
# non-numeric value makes the cast fail (see the recorded error below).
query = """

select product_id::numeric
from stage_3_line_items


"""
# Executed directly on the cursor without fetching: the purpose of this cell
# is to surface the conversion error, not to display rows.
cursor.execute(query)

            

InvalidTextRepresentation: invalid input syntax for type numeric: "A"


# Lab: Data Cleansing - Data That Does Not Match Lookup Tables

## Find product_ids that are not in the products table (the non-numeric 'A' we just found has to be filtered out first)

In [7]:
# Roll back both before and after the query so transaction state from another
# cell cannot interfere with this one, and vice versa.
rollback_before_flag = True
rollback_after_flag = True

# The CTE drops the non-numeric product_id 'A' so the ::numeric casts can run;
# the outer query keeps line items whose product_id has no match in products.
# NOTE(review): NOT IN returns no rows at all if the subquery yields a NULL -
# confirm products.product_id is never NULL.
query  = """

with a as (select * from stage_3_line_items where product_id <> 'A')

select product_id::numeric
from a 
where product_id::numeric not in (select product_id from products)


"""

# Last expression of the cell: the returned DataFrame is the cell's output.
my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,product_id
0,11


## You try it - find customer_id's in the stage_3_sales table that are not in the stage_3_customers table

In [10]:
# Roll back both before and after the query so transaction state from another
# cell cannot interfere with this one, and vice versa.
rollback_before_flag = True
rollback_after_flag = True

# Sales rows whose customer_id does not appear in stage_3_customers.
# NOTE(review): NOT IN returns no rows at all if the subquery yields a NULL -
# confirm stage_3_customers.customer_id is never NULL.
query  = """

select *
from stage_3_sales
where customer_id not in (select customer_id from stage_3_customers)


"""

# Last expression of the cell: the returned DataFrame is the cell's output.
my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,stage_id,store_id,sale_id,customer_id,sale_date,total_amount
0,41,4,610104,99999,2021-08-26,60


# Lab: Data Cleansing - Data That Violates Referential Integrity

## Find line items without a sales record

In [11]:
# Roll back both before and after the query so transaction state from another
# cell cannot interfere with this one, and vice versa.
rollback_before_flag = True
rollback_after_flag = True

# Line items whose (store_id, sale_id) pair has no matching row in
# stage_3_sales; the pair is compared as a whole, not column by column.
query  = """

select *
from stage_3_line_items
where (store_id, sale_id) not in (select store_id, sale_id from stage_3_sales)


"""

# Last expression of the cell: the returned DataFrame is the cell's output.
my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,stage_id,store_id,sale_id,line_item_id,product_id,quantity
0,35,1,696394,3,8,1
1,174,5,590790,1,1,1
2,175,5,590790,2,4,2
3,176,5,590790,3,7,1


## You try it - find store_id's in the stage_3_sales table that are not in the stores table

In [15]:
# Roll back both before and after the query so transaction state from another
# cell cannot interfere with this one, and vice versa.
rollback_before_flag = True
rollback_after_flag = True

# Sales rows whose store_id does not appear in stores.  The staged store_id is
# cast to numeric before the comparison.
# NOTE(review): presumably the staged column is text while stores.store_id is
# numeric - confirm against the table definitions.
query  = """

select *
from stage_3_sales
where store_id::numeric not in (select store_id from stores)


"""

# Last expression of the cell: the returned DataFrame is the cell's output.
my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,stage_id,store_id,sale_id,customer_id,sale_date,total_amount
0,56,55,590790,27997,2021-09-14,48
