# Data Wrangling 2.2

In [1]:
import math
import numpy as np
import pandas as pd

import psycopg2

import json

import csv

from datetime import datetime as dt

from IPython.display import display, HTML


from jellyfish import soundex, levenshtein_distance

from fuzzywuzzy import fuzz

from fuzzywuzzy import process as fuzz_process


In [2]:
connection = psycopg2.connect(
    user = "postgres",
    password = "ucb",
    host = "postgres",
    port = "5432",
    database = "postgres"
)

In [3]:
cursor = connection.cursor()

In [4]:
#
# function to run a select query and return rows in a pandas dataframe
# pandas puts all numeric values from postgres to float
# if it will fit in an integer, change it to integer
#

def my_select_query_pandas(query, rollback_before_flag, rollback_after_flag):
    "function to run a select query and return rows in a pandas dataframe"
    
    if rollback_before_flag:
        connection.rollback()
    
    df = pd.read_sql_query(query, connection)
    
    if rollback_after_flag:
        connection.rollback()
    
    # fix the float columns that really should be integers
    
    for column in df:
    
        if df[column].dtype == "float64":

            fraction_flag = False

            for value in df[column].values:
                
                if not np.isnan(value):
                    if value - math.floor(value) != 0:
                        fraction_flag = True

            if not fraction_flag:
                df[column] = df[column].astype('Int64')
    
    return(df)
    

# Lab: Data Cleansing - Fuzzy Logic, Soundex, Levenshtein Distances

## stage_3 tables will hold dirty data to allow us to see how to detect and clean it

In [5]:
connection.rollback()

query = """

drop table if exists stage_3_customers;
drop table if exists stage_3_sales;
drop table if exists stage_3_line_items;



"""

cursor.execute(query)

connection.commit()



In [6]:
#
# create staging tables with all varchar(100)
#

connection.rollback()

query = """


create table stage_3_customers (
  stage_id serial,
  customer_id varchar(100),
  first_name varchar(100),
  last_name varchar(100),
  street varchar(100),
  city varchar(100),
  state varchar(100),
  zip varchar(100),
  closest_store_id varchar(100),
  distance varchar(100)
);

create table stage_3_sales (
  stage_id serial,
  store_id varchar(100),
  sale_id varchar(100),
  customer_id varchar(100),
  sale_date varchar(100),
  total_amount varchar(100)
);

create table stage_3_line_items (
  stage_id serial,
  store_id varchar(100),
  sale_id varchar(100),
  line_item_id varchar(100),
  product_id varchar(100),
  quantity varchar(100)
);

"""

cursor.execute(query)

connection.commit()



In [7]:
connection.rollback()

query = """

copy stage_3_customers (customer_id, first_name, last_name, street, city, state, zip, closest_store_id, distance)
from '/user/labs/week_07/dirty_data/dirty_customers.csv' delimiter ',' NULL '' csv header;

copy stage_3_sales (store_id, sale_id, customer_id, sale_date, total_amount)
from '/user/labs/week_07/dirty_data/dirty_sales.csv' delimiter ',' NULL '' csv header;

copy stage_3_line_items (store_id, sale_id, line_item_id, product_id, quantity)
from '/user/labs/week_07/dirty_data/dirty_line_items.csv' delimiter ',' NULL '' csv header;


"""

cursor.execute(query)

connection.commit()

In [8]:
rollback_before_flag = True
rollback_after_flag = True

query = """

select * 
from stage_3_customers
order by stage_id;

"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,stage_id,customer_id,first_name,last_name,street,city,state,zip,closest_store_id,distance
0,1,563,Roze,Slimings,38 Iowa Street,Berkeley,CA,94704,1,1
1,2,1597,Nory,Macauley,654 Sommers Plaza,Oakland,CA,94612,1,3
2,3,1958,Theresina,Penswick,5975 Twin Pines Hill,Berkely,CA,94707,1,3
3,4,1991,Kavon,Wickett,472 Arizona Court,Berkeley,CA,94707,1,3
4,5,3491,Siouxie,M'Quharge,747 Westridge Center,Alemeda,CA,94501,1,6
5,6,4159,Cheril,Broe,7 Ruskin Alley,El Sobrante,CA,94803,1,7
6,7,4198,Andreana,Drew,11039 Cordelia Alley,El Sobrante,CA,94803,1,7
7,8,4260,Dom,Risbrough,3 Northland Crossing,Ritchmond,CA,94805,1,7
8,9,5394,Kathirina,Bavester,522 Cordelia Lane,Aan Francisco,CA,94102,1,10
9,10,6782,Lynd,Iuorio,4 Thackeray Road,,CA,94596,1,12


In [9]:
rollback_before_flag = True
rollback_after_flag = True

query = """

select * 
from stage_3_sales
order by stage_id;

"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,stage_id,store_id,sale_id,customer_id,sale_date,total_amount
0,1,1,560983,3491,2021-04-30,24
1,2,1,577120,1597,2021-05-16,84
2,3,1,596012,4159,2021-06-04,96
3,4,1,602087,4198,2021-06-09,144
4,5,1,612052,5394,2021-06-18,48
5,6,1,614768,1958,2021-06-20,48
6,7,1,681140,4260,2021-08-22,60
7,8,1,682902,6782,2021-08-23,24
8,9,1,682902,6782,2021-08-23,24
9,10,1,688156,563,2021-08-29,36


In [10]:
rollback_before_flag = True
rollback_after_flag = True

query = """

select * 
from stage_3_line_items
order by stage_id;

"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,stage_id,store_id,sale_id,line_item_id,product_id,quantity
0,1,1,560983,1,1,1
1,2,1,560983,2,8,1
2,3,1,577120,1,1,1
3,4,1,577120,2,2,1
4,5,1,577120,3,4,2
...,...,...,...,...,...,...
171,172,5,580412,4,4,1
172,173,5,580412,5,7,1
173,174,5,590790,1,1,1
174,175,5,590790,2,4,2


## Soundex - starts with a letter with the phoenetic sound, followed by two digits for the phoentic sounds of the remaining consonants

In [11]:
soundex("Berkeley")

'B624'

In [12]:
soundex("Berkely")

'B624'

In [13]:
soundex("Berklie")

'B624'

In [14]:
soundex("Barkly")

'B624'

In [15]:
soundex("Verkeley")

'V624'

In [16]:
soundex("there")

'T600'

In [17]:
soundex("their")

'T600'

In [18]:
soundex("Phoenix")

'P520'

In [19]:
soundex("fenix")

'F520'

## Postgres also has a soundex function

In [20]:
rollback_before_flag = True
rollback_after_flag = True

query = """

select first_name,
       soundex(first_name) as soundex_first_name,
       last_name,
       soundex(last_name) as soundex_last_name,
       city,
       soundex(city) as soundex_city
from stage_3_customers
order by stage_id;

"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,first_name,soundex_first_name,last_name,soundex_last_name,city,soundex_city
0,Roze,R200,Slimings,S455,Berkeley,B624
1,Nory,N600,Macauley,M240,Oakland,O245
2,Theresina,T625,Penswick,P522,Berkely,B624
3,Kavon,K150,Wickett,W230,Berkeley,B624
4,Siouxie,S200,M'Quharge,M262,Alemeda,A453
5,Cheril,C640,Broe,B600,El Sobrante,E421
6,Andreana,A536,Drew,D600,El Sobrante,E421
7,Dom,D500,Risbrough,R216,Ritchmond,R325
8,Kathirina,K365,Bavester,B123,Aan Francisco,A516
9,Lynd,L530,Iuorio,I600,,


## Levenshtein Distance - distance is the number of character insertions, character deletions, and character changes to make the strings match

In [21]:
levenshtein_distance("Berkeley", "Berkely")

1

In [22]:
levenshtein_distance("Berkeley", "Berklie")

3

In [23]:
levenshtein_distance("Berkeley", "Verkeley")

1

In [24]:
levenshtein_distance("apples", "oranges")

5

## Using Levenshtein Distances to measure string differences in string kernels in machine learning 

In [25]:
dna_strand_1 = "CCT CTT TGC ACT CGG ATC GTA CGC TAT TCT ATG ATT ACA CGG TTG CGA TCC ATA"

dna_strand_2 = "TCC CTT GGG GAA TAT ACA CGC TGG CTT ACT CGA ATT TGA CTC GTA CTC GCC ATC"

levenshtein_distance(dna_strand_1, dna_strand_2)

32

## Postgres also has a Levenshtein Distance function

In [26]:
rollback_before_flag = True
rollback_after_flag = True

query = """

select first_name,
       last_name,
       levenshtein(first_name, last_name)
from stage_3_customers
where first_name is not null and last_name is not null
order by stage_id;

"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,first_name,last_name,levenshtein
0,Roze,Slimings,8
1,Nory,Macauley,7
2,Theresina,Penswick,7
3,Kavon,Wickett,7
4,Siouxie,M'Quharge,7
5,Cheril,Broe,5
6,Andreana,Drew,6
7,Dom,Risbrough,8
8,Kathirina,Bavester,8
9,Lynd,Iuorio,6


## Fuzzy Logic - 100 is perfect match

In [27]:
fuzz.ratio("Berkeley", "Berkeley")

100

In [28]:
fuzz.ratio("Berkeley", "Berkely")

93

In [29]:
fuzz.ratio("Berkeley", "Verkeley")

88

In [30]:
fuzz.ratio("Go Bears", "Go Bears!!!")

84

In [31]:
fuzz.partial_ratio("Go Bears", "Go Bears!!!")

100

In [32]:
fuzz.ratio("Oski the Bear is our mascot", "Our mascot is the Bear Oski")

52

In [33]:
fuzz.token_sort_ratio("Oski the Bear is our mascot", "Our mascot is the Bear Oski")

100

In [34]:
fuzz.token_sort_ratio("Oski the Bear", "Our mascot is the Bear Oski")

65

In [35]:
fuzz.token_sort_ratio("Go Bears!!!", "Our mascot is the Bear Oski")

40

In [36]:
choices = ["Berkeley", "San Francisco", "San Jose", "Portland", "Seattle", "Los Angeles"]

In [37]:
fuzz_process.extract("san fran", choices, limit=2)

[('San Francisco', 90), ('San Jose', 52)]

In [38]:
fuzz_process.extract("frisco", choices, limit=2)

[('San Francisco', 63), ('San Jose', 29)]

In [39]:
fuzz_process.extract("Dallas", choices, limit=2)

[('Los Angeles', 35), ('San Francisco', 32)]

In [40]:
fuzz_process.extractOne("san fran", choices)

('San Francisco', 90)

In [41]:
rollback_before_flag = True
rollback_after_flag = True

query = """

select cu.stage_id,
       cu.city as stage_city,
       z.city as zip_codes_city
from stage_3_customers as cu
     join zip_codes as z
         on cu.zip = z.zip
where cu.city <> z.city
order by customer_id
;


"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,stage_id,stage_city,zip_codes_city
0,19,Kircland,Kirkland
1,26,Dalas,Dallas
2,3,Berkely,Berkeley
3,31,Meskite,Mesquite
4,36,Key Bisscain,Key Biscayne
5,45,Nashvil,Nashville
6,53,Madesun,Madison
7,5,Alemeda,Alameda
8,8,Ritchmond,Richmond
9,9,Aan Francisco,San Francisco


## Find the misspelled cities and explore soundex, Levenshtein distances, and fuzzy logic 

In [42]:

connection.rollback()

query = """

select cu.stage_id,
       cu.city as stage_city,
       z.city as zip_codes_city
from stage_3_customers as cu
     join zip_codes as z
         on cu.zip = z.zip
where cu.city <> z.city
order by stage_id
;

"""
    
cursor.execute(query)

connection.rollback()
    
rows = cursor.fetchall()
    
for row in rows:
        print("---------------------------------------------------------")
        print("Wrong:", row[1], "soundex", soundex(row[1]))
        print("Right:", row[2], "soundex", soundex(row[2]))
        print("Levenshtein Distance:", levenshtein_distance(row[1], row[2]))
        print("Fuzzy: ratio:", fuzz.ratio(row[1], row[2]))
        print("Fuzzy: partial ratio:", fuzz.partial_ratio(row[1], row[2]))
        print("Fuzzy: token sort ratio:", fuzz.partial_ratio(row[1], row[2]))
        

---------------------------------------------------------
Wrong: Berkely soundex B624
Right: Berkeley soundex B624
Levenshtein Distance: 1
Fuzzy: ratio: 93
Fuzzy: partial ratio: 86
Fuzzy: token sort ratio: 86
---------------------------------------------------------
Wrong: Alemeda soundex A453
Right: Alameda soundex A453
Levenshtein Distance: 1
Fuzzy: ratio: 86
Fuzzy: partial ratio: 86
Fuzzy: token sort ratio: 86
---------------------------------------------------------
Wrong: Ritchmond soundex R325
Right: Richmond soundex R255
Levenshtein Distance: 1
Fuzzy: ratio: 94
Fuzzy: partial ratio: 88
Fuzzy: token sort ratio: 88
---------------------------------------------------------
Wrong: Aan Francisco soundex A516
Right: San Francisco soundex S516
Levenshtein Distance: 1
Fuzzy: ratio: 92
Fuzzy: partial ratio: 92
Fuzzy: token sort ratio: 92
---------------------------------------------------------
Wrong: Seatle soundex S340
Right: Seattle soundex S340
Levenshtein Distance: 1
Fuzzy: ratio: 9

In [43]:

connection.rollback()

query = """

select distinct city
from cities
order by 1
;

"""
    
cursor.execute(query)

connection.rollback()
    
rows = cursor.fetchall()
    
city_list = []
    
for row in rows:
        city_list.append(row[0])
        
print(city_list[:100])

['Aaronsburg', 'Abbeville', 'Abbot', 'Abbotsford', 'Abbott', 'Abbottstown', 'Abbyville', 'Abell', 'Abercrombie', 'Aberdeen', 'Aberdeen Proving Ground', 'Abernathy', 'Abie', 'Abilene', 'Abingdon', 'Abington', 'Abiquiu', 'Abita Springs', 'Abrams', 'Absaraka', 'Absarokee', 'Absecon', 'Acampo', 'Accident', 'Accokeek', 'Accomac', 'Accord', 'Accoville', 'Ace', 'Achille', 'Ackerly', 'Ackerman', 'Ackley', 'Ackworth', 'Acme', 'Acosta', 'Acra', 'Acton', 'Acushnet', 'Acworth', 'Ada', 'Adah', 'Adair', 'Adairsville', 'Adairville', 'Adak', 'Adamant', 'Adams', 'Adamsburg', 'Adams Center', 'Adams Run', 'Adamstown', 'Adamsville', 'Addieville', 'Addington', 'Addis', 'Addison', 'Addy', 'Addyston', 'Adel', 'Adelanto', 'Adell', 'Adelphi', 'Adena', 'Adger', 'Adin', 'Adirondack', 'Adjuntas', 'Adkins', 'Admire', 'Adolphus', 'Adona', 'Adrian', 'Advance', 'Afton', 'Agar', 'Agate', 'Agawam', 'Agency', 'Agenda', 'Ages Brookside', 'Agness', 'Agoura Hills', 'Agra', 'Aguada', 'Aguadilla', 'Agua Dulce', 'Aguanga', 'A

In [44]:
# Find the best match with bad cities
connection.rollback()

query = """

select cu.stage_id,
       cu.city as stage_city,
       z.city as zip_codes_city
from stage_3_customers as cu
     join zip_codes as z
         on cu.zip = z.zip
where cu.city <> z.city
order by customer_id
;

"""
    
cursor.execute(query)

connection.rollback()
    
rows = cursor.fetchall()
    
for row in rows:
        print("---------------------------------------------------------")
        print("Wrong:", row[1], "soundex", soundex(row[1]))
        print("Right:", row[2], "soundex", soundex(row[2]))
        print("Levenshtein Distance:", levenshtein_distance(row[1], row[2]))
        print("Fuzzy top 5 choices:", fuzz_process.extract(row[1], city_list, limit=5))
        

---------------------------------------------------------
Wrong: Kircland soundex K624
Right: Kirkland soundex K624
Levenshtein Distance: 1
Fuzzy top 5 choices: [('Kirkland', 88), ('Kirtland', 88), ('Ireland', 80), ('Lando', 80), ('Kirtland Afb', 79)]
---------------------------------------------------------
Wrong: Dalas soundex D420
Right: Dallas soundex D420
Levenshtein Distance: 1
Fuzzy top 5 choices: [('Dallas', 91), ('Atlas', 80), ('Calais', 73), ('Dacula', 73), ('Idalia', 73)]
---------------------------------------------------------
Wrong: Berkely soundex B624
Right: Berkeley soundex B624
Levenshtein Distance: 1
Fuzzy top 5 choices: [('Berkeley', 93), ('Berkey', 92), ('Ely', 90), ('Berkley', 86), ('Berkeley Heights', 77)]
---------------------------------------------------------
Wrong: Meskite soundex M230
Right: Mesquite soundex M230
Levenshtein Distance: 2
Fuzzy top 5 choices: [('Kite', 90), ('Mesquite', 80), ('Erskine', 71), ('Ames', 68), ('Esko', 68)]
-----------------------

## You try it - Repeat for bad customer first names

In [45]:
rollback_before_flag = True
rollback_after_flag = True

query = """

select cu1.stage_id,
       cu1.first_name as stage_first_name,
       cu2.first_name as customer_first_name
from stage_3_customers as cu1
    join customers as cu2
        on cu1.customer_id::numeric = cu2.customer_id
where cu1.first_name <> cu2.first_name
order by stage_id

"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,stage_id,stage_first_name,customer_first_name
0,1,Roze,Rose
1,2,Nory,Norry
2,4,Kavon,Kevon
3,6,Cheril,Cheryl
4,9,Kathirina,Katharina
5,10,Lynd,Lyndsay
6,14,Marcus,Markos
7,15,Deby,Debby
8,22,Thed,Ted
9,23,Zachariah,Zackariah


In [46]:
connection.rollback()

query = """

select cu1.stage_id,
       cu1.first_name as stage_first_name,
       cu2.first_name as customer_first_name
from stage_3_customers as cu1
    join customers as cu2
        on cu1.customer_id::numeric = cu2.customer_id
where cu1.first_name <> cu2.first_name
order by stage_id

"""
    
cursor.execute(query)

connection.rollback()
    
rows = cursor.fetchall()
    
for row in rows:
        print("---------------------------------------------------------")
        print("Wrong:", row[1], "soundex", soundex(row[1]))
        print("Right:", row[2], "soundex", soundex(row[2]))
        print("Levenshtein Distance:", levenshtein_distance(row[1], row[2]))
        print("Fuzzy: ratio:", fuzz.ratio(row[1], row[2]))
        print("Fuzzy: partial ratio:", fuzz.partial_ratio(row[1], row[2]))
        print("Fuzzy: token sort ratio:", fuzz.partial_ratio(row[1], row[2]))
        

---------------------------------------------------------
Wrong: Roze soundex R200
Right: Rose soundex R200
Levenshtein Distance: 1
Fuzzy: ratio: 75
Fuzzy: partial ratio: 75
Fuzzy: token sort ratio: 75
---------------------------------------------------------
Wrong: Nory soundex N600
Right: Norry soundex N600
Levenshtein Distance: 1
Fuzzy: ratio: 89
Fuzzy: partial ratio: 75
Fuzzy: token sort ratio: 75
---------------------------------------------------------
Wrong: Kavon soundex K150
Right: Kevon soundex K150
Levenshtein Distance: 1
Fuzzy: ratio: 80
Fuzzy: partial ratio: 80
Fuzzy: token sort ratio: 80
---------------------------------------------------------
Wrong: Cheril soundex C640
Right: Cheryl soundex C640
Levenshtein Distance: 1
Fuzzy: ratio: 83
Fuzzy: partial ratio: 83
Fuzzy: token sort ratio: 83
---------------------------------------------------------
Wrong: Kathirina soundex K365
Right: Katharina soundex K365
Levenshtein Distance: 1
Fuzzy: ratio: 89
Fuzzy: partial ratio: 89
F

In [47]:
connection.rollback()

query = """

select distinct first_name
from customers
order by 1
;

"""
    
cursor.execute(query)

connection.rollback()
    
rows = cursor.fetchall()
    
first_name_list = []
    
for row in rows:
        first_name_list.append(row[0])
        
print(city_list[:100])

['Aaronsburg', 'Abbeville', 'Abbot', 'Abbotsford', 'Abbott', 'Abbottstown', 'Abbyville', 'Abell', 'Abercrombie', 'Aberdeen', 'Aberdeen Proving Ground', 'Abernathy', 'Abie', 'Abilene', 'Abingdon', 'Abington', 'Abiquiu', 'Abita Springs', 'Abrams', 'Absaraka', 'Absarokee', 'Absecon', 'Acampo', 'Accident', 'Accokeek', 'Accomac', 'Accord', 'Accoville', 'Ace', 'Achille', 'Ackerly', 'Ackerman', 'Ackley', 'Ackworth', 'Acme', 'Acosta', 'Acra', 'Acton', 'Acushnet', 'Acworth', 'Ada', 'Adah', 'Adair', 'Adairsville', 'Adairville', 'Adak', 'Adamant', 'Adams', 'Adamsburg', 'Adams Center', 'Adams Run', 'Adamstown', 'Adamsville', 'Addieville', 'Addington', 'Addis', 'Addison', 'Addy', 'Addyston', 'Adel', 'Adelanto', 'Adell', 'Adelphi', 'Adena', 'Adger', 'Adin', 'Adirondack', 'Adjuntas', 'Adkins', 'Admire', 'Adolphus', 'Adona', 'Adrian', 'Advance', 'Afton', 'Agar', 'Agate', 'Agawam', 'Agency', 'Agenda', 'Ages Brookside', 'Agness', 'Agoura Hills', 'Agra', 'Aguada', 'Aguadilla', 'Agua Dulce', 'Aguanga', 'A

In [53]:
connection.rollback()

query = """

select cu1.stage_id,
       cu1.first_name as stage_first_name,
       cu2.first_name as customer_first_name
from stage_3_customers as cu1
    join customers as cu2
        on cu1.customer_id::numeric = cu2.customer_id
where cu1.first_name <> cu2.first_name
order by stage_id

"""
    
cursor.execute(query)

connection.rollback()
    
rows = cursor.fetchall()
    
for row in rows:
        print("---------------------------------------------------------")
        print("Wrong:", row[1], "soundex", soundex(row[1]))
        print("Right:", row[2], "soundex", soundex(row[2]))
        print("Levenshtein Distance:", levenshtein_distance(row[1], row[2]))
        print("Fuzzy top 5 choices:", fuzz_process.extract(row[1], first_name_list, limit=5))

---------------------------------------------------------
Wrong: Roze soundex R200
Right: Rose soundex R200
Levenshtein Distance: 1
Fuzzy top 5 choices: [('Roze', 100), ('Rozele', 90), ('Rozella', 90), ('Rozelle', 90), ('Roz', 86)]
---------------------------------------------------------
Wrong: Nory soundex N600
Right: Norry soundex N600
Levenshtein Distance: 1
Fuzzy top 5 choices: [('Norby', 89), ('Normy', 89), ('Norry', 89), ('Connor', 77), ('Eleanor', 77)]
---------------------------------------------------------
Wrong: Kavon soundex K150
Right: Kevon soundex K150
Levenshtein Distance: 1
Fuzzy top 5 choices: [('Von', 90), ('Davon', 80), ('Karon', 80), ('Kevon', 80), ('Kalvin', 73)]
---------------------------------------------------------
Wrong: Cheril soundex C640
Right: Cheryl soundex C640
Levenshtein Distance: 1
Fuzzy top 5 choices: [('Cheri', 91), ('Che', 90), ('Cherilynn', 90), ('Cherilyn', 86), ('Cherie', 83)]
---------------------------------------------------------
Wrong: K

# Lab: Data Cleansing - Dedup (Removing Duplicates)

## Find duplicates in stage_3_customers

In [54]:
rollback_before_flag = True
rollback_after_flag = True

query = """

select cu.customer_id,
       cu.first_name,
       cu.last_name,
       cu.street,
       cu.city,
       cu.state,
       cu.zip,
       cu.closest_store_id,
       cu.distance,
       count(*) number_of_duplicates
from stage_3_customers as cu
group by cu.customer_id, cu.first_name, cu.last_name, cu.street, 
         cu.city, cu.state, cu.zip, cu.closest_store_id, cu.distance
having count(*) > 1
order by customer_id
;


"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,customer_id,first_name,last_name,street,city,state,zip,closest_store_id,distance,number_of_duplicates
0,19226,Christin,Penny,3452 Muir Hill,Dallas,TX,75238,3,8,2
1,23319,Brnaba,Challicombe,22008 Jay Circle,Miami,FL,33142,4,4,2
2,27380,Juliette,Simoncello,485 Hazelcrest Alley,Fort Lauderdale,FL,33312,4,22,3


In [49]:
rollback_before_flag = True
rollback_after_flag = True

query = """

with a as (

    select cu.customer_id,
           cu.first_name,
           cu.last_name,
           cu.street,
           cu.city,
           cu.state,
           cu.zip,
           cu.closest_store_id,
           cu.distance
    from stage_3_customers as cu
    group by cu.customer_id, cu.first_name, cu.last_name, cu.street, 
             cu.city, cu.state, cu.zip, cu.closest_store_id, cu.distance
    having count(*) > 1

    )

select *
from stage_3_customers
where customer_id in (select customer_id from a)
order by stage_id
;


"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,stage_id,customer_id,first_name,last_name,street,city,state,zip,closest_store_id,distance
0,28,19226,Christin,Penny,3452 Muir Hill,Dallas,TX,75238,3,8
1,29,19226,Christin,Penny,3452 Muir Hill,Dallas,TX,75238,3,8
2,34,23319,Brnaba,Challicombe,22008 Jay Circle,Miami,FL,33142,4,4
3,35,23319,Brnaba,Challicombe,22008 Jay Circle,Miami,FL,33142,4,4
4,42,27380,Juliette,Simoncello,485 Hazelcrest Alley,Fort Lauderdale,FL,33312,4,22
5,43,27380,Juliette,Simoncello,485 Hazelcrest Alley,Fort Lauderdale,FL,33312,4,22
6,44,27380,Juliette,Simoncello,485 Hazelcrest Alley,Fort Lauderdale,FL,33312,4,22


## You try it - Find the duplicate sales 

In [60]:
rollback_before_flag = True
rollback_after_flag = True

query = """

with a as (

    select sa.store_id,
           sa.sale_id,
           sa.customer_id,
           sa.sale_date,
           sa.total_amount    
    from stage_3_sales as sa
    group by sa.store_id, sa.sale_id, sa.customer_id, sa.sale_date, sa.total_amount
    having count(*) > 1

    )

select *
from stage_3_sales
where (store_id, sale_id) in (select store_id, sale_id from a)
order by stage_id
;


"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,stage_id,store_id,sale_id,customer_id,sale_date,total_amount
0,8,1,682902,6782,2021-08-23,24
1,9,1,682902,6782,2021-08-23,24
2,18,2,591161,9189,2021-06-20,84
3,19,2,591161,9189,2021-06-20,84
4,29,3,610298,16932,2021-08-06,24
5,30,3,610298,16932,2021-08-06,24
6,42,4,616401,26882,2021-09-02,144
7,43,4,616401,26882,2021-09-02,144
8,51,5,569424,30184,2021-08-09,36
9,53,5,569424,30184,2021-08-09,36


# Lab: Data Cleansing - Missing Values

## Find missing cities in stage_3_customers

In [50]:
rollback_before_flag = True
rollback_after_flag = True

query = """

select *
from stage_3_customers
where city is null
order by customer_id
;


"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,stage_id,customer_id,first_name,last_name,street,city,state,zip,closest_store_id,distance
0,16,11291,Dasi,Radden,18 Hoepker Court,,WA,98136,2,6
1,25,16932,Betta,Swatland,7060 Merchant Pass,,TX,75210,3,4
2,10,6782,Lynd,Iuorio,4 Thackeray Road,,CA,94596,1,12


## You try it - Find the missing customer first names  

In [61]:
rollback_before_flag = True
rollback_after_flag = True

query = """

select *
from stage_3_customers
where first_name is null
order by customer_id
;


"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,stage_id,customer_id,first_name,last_name,street,city,state,zip,closest_store_id,distance
0,21,15481,,Ansill,57 Fordem Circle,Dallas,TX,75201,3,1
1,24,16476,,Lamyman,3078 Emmet Junction,Dallas,TX,75206,3,3
2,32,22673,,Delepine,117 Hauk Trail,Miami,FL,33137,4,3
3,51,29285,,McCreadie,30 Gulseth Terrace,Nashville,TN,37215,5,6


# Lab: Data Cleansing - Outliers

## Find outliers on total_amount in state_3_sales

In [51]:
rollback_before_flag = True
rollback_after_flag = True

query = """

select *
from stage_3_sales
where total_amount::numeric > 100
order by store_id, sale_id
;


"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,stage_id,store_id,sale_id,customer_id,sale_date,total_amount
0,4,1,602087,4198,2021-06-09,144
1,13,2,541954,11291,2021-04-27,132
2,20,2,636597,12493,2021-08-08,132
3,25,3,564698,15481,2021-06-11,108
4,28,3,597247,15799,2021-07-22,424
5,32,3,619752,19226,2021-08-16,144
6,35,4,544850,25267,2021-05-31,108
7,36,4,548504,25339,2021-06-05,120
8,42,4,616401,26882,2021-09-02,144
9,43,4,616401,26882,2021-09-02,144


In [62]:
rollback_before_flag = True
rollback_after_flag = True

query = """

select *
from stage_3_sales
where total_amount::numeric > 200
order by store_id, sale_id
;


"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,stage_id,store_id,sale_id,customer_id,sale_date,total_amount
0,28,3,597247,15799,2021-07-22,424


## You try it - Find the outliers for line item quantity

In [64]:
rollback_before_flag = True
rollback_after_flag = True

query = """

select *
from stage_3_line_items
where quantity::numeric > 5
order by stage_id, store_id
;


"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,stage_id,store_id,sale_id,line_item_id,product_id,quantity
0,26,1,681140,1,1,10
1,44,2,548317,1,6,23
