# Data Wrangling 2.2 Solutions

In [1]:
import math
import numpy as np
import pandas as pd

import psycopg2

import json

import csv

from datetime import datetime as dt

from IPython.display import display, HTML


from jellyfish import soundex, levenshtein_distance

from fuzzywuzzy import fuzz

from fuzzywuzzy import process as fuzz_process


In [2]:
connection = psycopg2.connect(
    user = "postgres",
    password = "ucb",
    host = "postgres",
    port = "5432",
    database = "postgres"
)

In [3]:
cursor = connection.cursor()

In [4]:
#
# function to run a select query and return rows in a pandas dataframe
# pandas puts all numeric values from postgres to float
# if it will fit in an integer, change it to integer
#

def my_select_query_pandas(query, rollback_before_flag, rollback_after_flag):
    "function to run a select query and return rows in a pandas dataframe"
    
    if rollback_before_flag:
        connection.rollback()
    
    df = pd.read_sql_query(query, connection)
    
    if rollback_after_flag:
        connection.rollback()
    
    # fix the float columns that really should be integers
    
    for column in df:
    
        if df[column].dtype == "float64":

            fraction_flag = False

            for value in df[column].values:
                
                if not np.isnan(value):
                    if value - math.floor(value) != 0:
                        fraction_flag = True

            if not fraction_flag:
                df[column] = df[column].astype('Int64')
    
    return(df)
    

## You try it - Repeat for bad customer first names;

In [5]:
rollback_before_flag = True
rollback_after_flag = True

query = """

select cu1.stage_id,
       cu1.first_name as stage_first_name,
       cu2.first_name as customer_first_name
from stage_3_customers as cu1
     join customers as cu2
         on cu1.customer_id::numeric = cu2.customer_id
where cu1.first_name <> cu2.first_name
order by cu1.stage_id
;


"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,stage_id,stage_first_name,customer_first_name
0,1,Roze,Rose
1,2,Nory,Norry
2,4,Kavon,Kevon
3,6,Cheril,Cheryl
4,9,Kathirina,Katharina
5,10,Lynd,Lyndsay
6,14,Marcus,Markos
7,15,Deby,Debby
8,22,Thed,Ted
9,23,Zachariah,Zackariah


In [6]:

connection.rollback()

query = """

select cu1.stage_id,
       cu1.first_name as stage_first_name,
       cu2.first_name as customer_first_name
from stage_3_customers as cu1
     join customers as cu2
         on cu1.customer_id::numeric = cu2.customer_id
where cu1.first_name <> cu2.first_name
order by cu1.stage_id
;

"""
    
cursor.execute(query)

connection.rollback()
    
rows = cursor.fetchall()
    
for row in rows:
        print("---------------------------------------------------------")
        print("Wrong:", row[1], "soundex", soundex(row[1]))
        print("Right:", row[2], "soundex", soundex(row[2]))
        print("Levenshtein Distance:", levenshtein_distance(row[1], row[2]))
        print("Fuzzy: ratio:", fuzz.ratio(row[1], row[2]))
        print("Fuzzy: partial ratio:", fuzz.partial_ratio(row[1], row[2]))
        print("Fuzzy: token sort ratio:", fuzz.partial_ratio(row[1], row[2]))
        

---------------------------------------------------------
Wrong: Roze soundex R200
Right: Rose soundex R200
Levenshtein Distance: 1
Fuzzy: ratio: 75
Fuzzy: partial ratio: 75
Fuzzy: token sort ratio: 75
---------------------------------------------------------
Wrong: Nory soundex N600
Right: Norry soundex N600
Levenshtein Distance: 1
Fuzzy: ratio: 89
Fuzzy: partial ratio: 75
Fuzzy: token sort ratio: 75
---------------------------------------------------------
Wrong: Kavon soundex K150
Right: Kevon soundex K150
Levenshtein Distance: 1
Fuzzy: ratio: 80
Fuzzy: partial ratio: 80
Fuzzy: token sort ratio: 80
---------------------------------------------------------
Wrong: Cheril soundex C640
Right: Cheryl soundex C640
Levenshtein Distance: 1
Fuzzy: ratio: 83
Fuzzy: partial ratio: 83
Fuzzy: token sort ratio: 83
---------------------------------------------------------
Wrong: Kathirina soundex K365
Right: Katharina soundex K365
Levenshtein Distance: 1
Fuzzy: ratio: 89
Fuzzy: partial ratio: 89
F

In [7]:

connection.rollback()

query = """

select distinct first_name
from customers
order by 1
;

"""
    
cursor.execute(query)

connection.rollback()
    
rows = cursor.fetchall()
    
first_name_list = []
    
for row in rows:
        first_name_list.append(row[0])
        
print(first_name_list[:100])

['Aaren', 'Aarika', 'Aaron', 'Ab', 'Abagael', 'Abagail', 'Abba', 'Abbe', 'Abbey', 'Abbi', 'Abbie', 'Abbot', 'Abby', 'Abbye', 'Abdel', 'Abe', 'Abel', 'Abelard', 'Abeu', 'Abey', 'Abie', 'Abigael', 'Abigail', 'Abigale', 'Abner', 'Abra', 'Abraham', 'Abrahan', 'Abram', 'Abramo', 'Abran', 'Ad', 'Ada', 'Adah', 'Adair', 'Adaline', 'Adam', 'Adamo', 'Adams', 'Adan', 'Adara', 'Adda', 'Addi', 'Addia', 'Addie', 'Addison', 'Addy', 'Ade', 'Adel', 'Adela', 'Adelaida', 'Adelaide', 'Adelbert', 'Adele', 'Adelheid', 'Adelice', 'Adelina', 'Adeline', 'Adella', 'Adelle', 'Adena', 'Adey', 'Adham', 'Adi', 'Adiana', 'Adina', 'Adlai', 'Adler', 'Ado', 'Adolf', 'Adolph', 'Adolphe', 'Adolpho', 'Adolphus', 'Adora', 'Adore', 'Adoree', 'Adorne', 'Adrea', 'Adria', 'Adriaens', 'Adrian', 'Adriana', 'Adriane', 'Adrianna', 'Adrianne', 'Adriano', 'Adrien', 'Adriena', 'Adrienne', 'Aeriel', 'Aeriela', 'Aeriell', 'Afton', 'Ag', 'Agace', 'Agata', 'Agatha', 'Agathe', 'Aggi']


In [8]:

connection.rollback()

query = """

select cu1.stage_id,
       cu1.first_name as stage_first_name,
       cu2.first_name as customer_first_name
from stage_3_customers as cu1
     join customers as cu2
         on cu1.customer_id::numeric = cu2.customer_id
where cu1.first_name <> cu2.first_name
order by cu1.stage_id
;


"""
    
cursor.execute(query)

connection.rollback()
    
rows = cursor.fetchall()
    
for row in rows:
        print("---------------------------------------------------------")
        print("Wrong:", row[1], "soundex", soundex(row[1]))
        print("Right:", row[2], "soundex", soundex(row[2]))
        print("Levenshtein Distance:", levenshtein_distance(row[1], row[2]))
        print("Fuzzy top 5 choices:", fuzz_process.extract(row[1], first_name_list, limit=5))
        

---------------------------------------------------------
Wrong: Roze soundex R200
Right: Rose soundex R200
Levenshtein Distance: 1
Fuzzy top 5 choices: [('Roze', 100), ('Rozele', 90), ('Rozella', 90), ('Rozelle', 90), ('Roz', 86)]
---------------------------------------------------------
Wrong: Nory soundex N600
Right: Norry soundex N600
Levenshtein Distance: 1
Fuzzy top 5 choices: [('Norby', 89), ('Normy', 89), ('Norry', 89), ('Connor', 77), ('Eleanor', 77)]
---------------------------------------------------------
Wrong: Kavon soundex K150
Right: Kevon soundex K150
Levenshtein Distance: 1
Fuzzy top 5 choices: [('Von', 90), ('Davon', 80), ('Karon', 80), ('Kevon', 80), ('Kalvin', 73)]
---------------------------------------------------------
Wrong: Cheril soundex C640
Right: Cheryl soundex C640
Levenshtein Distance: 1
Fuzzy top 5 choices: [('Cheri', 91), ('Che', 90), ('Cherilynn', 90), ('Cherilyn', 86), ('Cherie', 83)]
---------------------------------------------------------
Wrong: K

## You try it - Find the duplicate sales 

In [9]:
rollback_before_flag = True
rollback_after_flag = True

query = """

select sa.store_id,
       sa.sale_id,
       sa.customer_id,
       sa.sale_date,
       sa.total_amount,
       count(*) number_of_duplicates
from stage_3_sales as sa
group by sa.store_id, sa.sale_id, sa.customer_id, sa.sale_date, sa.total_amount
having count(*) > 1
order by store_id, sale_id
;


"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,store_id,sale_id,customer_id,sale_date,total_amount,number_of_duplicates
0,1,682902,6782,2021-08-23,24,2
1,2,591161,9189,2021-06-20,84,2
2,3,610298,16932,2021-08-06,24,2
3,4,616401,26882,2021-09-02,144,2
4,5,569424,30184,2021-08-09,36,3


In [10]:
rollback_before_flag = True
rollback_after_flag = True

query = """

with a as (

        select sa.store_id,
               sa.sale_id,
               sa.customer_id,
               sa.sale_date,
               sa.total_amount
        from stage_3_sales as sa
        group by sa.store_id, sa.sale_id, sa.customer_id, sa.sale_date, sa.total_amount
        having count(*) > 1

    )

select *
from stage_3_sales
where (store_id, sale_id) in (select store_id, sale_id from a)
order by stage_id
;


"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,stage_id,store_id,sale_id,customer_id,sale_date,total_amount
0,8,1,682902,6782,2021-08-23,24
1,9,1,682902,6782,2021-08-23,24
2,18,2,591161,9189,2021-06-20,84
3,19,2,591161,9189,2021-06-20,84
4,29,3,610298,16932,2021-08-06,24
5,30,3,610298,16932,2021-08-06,24
6,42,4,616401,26882,2021-09-02,144
7,43,4,616401,26882,2021-09-02,144
8,51,5,569424,30184,2021-08-09,36
9,53,5,569424,30184,2021-08-09,36


## You try it - Find the missing customer first names  

In [11]:
rollback_before_flag = True
rollback_after_flag = True

query = """

select *
from stage_3_customers
where first_name is null
order by customer_id
;


"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,stage_id,customer_id,first_name,last_name,street,city,state,zip,closest_store_id,distance
0,21,15481,,Ansill,57 Fordem Circle,Dallas,TX,75201,3,1
1,24,16476,,Lamyman,3078 Emmet Junction,Dallas,TX,75206,3,3
2,32,22673,,Delepine,117 Hauk Trail,Miami,FL,33137,4,3
3,51,29285,,McCreadie,30 Gulseth Terrace,Nashville,TN,37215,5,6


## You try it - Find the outliers for line item quantity

In [12]:
rollback_before_flag = True
rollback_after_flag = True

query = """

select *
from stage_3_line_items
where quantity::numeric >= 10
order by store_id, sale_id, line_item_id
;


"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,stage_id,store_id,sale_id,line_item_id,product_id,quantity
0,26,1,681140,1,1,10
1,44,2,548317,1,6,23
