# Data Wrangling 2.1 Solutions

In [1]:
import math
import numpy as np
import pandas as pd

import psycopg2

import json

import csv

from datetime import datetime as dt


from IPython.display import display, HTML



In [2]:
# Open the single database connection shared by every cell below.
# NOTE(review): the credentials are hard-coded here - presumably a local lab
# database; confirm before reusing this pattern anywhere else.
connection = psycopg2.connect(
    user = "postgres",
    password = "ucb",
    host = "postgres",
    port = "5432",
    database = "postgres"
)

In [3]:
# Cursor on the shared connection; the cells below run their SQL through cursor.execute
cursor = connection.cursor()

In [4]:
#
# function to run a select query and return rows in a pandas dataframe
# pandas puts all numeric values from postgres to float
# if it will fit in an integer, change it to integer
#

def my_select_query_pandas(query, rollback_before_flag, rollback_after_flag):
    """function to run a select query and return rows in a pandas dataframe

    query: the SQL text handed to pd.read_sql_query
    rollback_before_flag: when true, roll back the shared connection before the query
    rollback_after_flag: when true, roll back the shared connection after the query

    Returns the dataframe.  Every float64 column whose non-null values are all
    finite whole numbers is converted to the nullable 'Int64' dtype; nulls in
    such a column stay null.
    """

    if rollback_before_flag:
        connection.rollback()

    df = pd.read_sql_query(query, connection)

    if rollback_after_flag:
        connection.rollback()

    # fix the float columns that really should be integers

    for column in df:

        if df[column].dtype != "float64":
            continue

        # nulls do not decide whether the column is integral, so ignore them
        values = df[column].dropna()

        # infinities have no integer form, so such a column must stay float;
        # the whole-number test is one vectorized comparison per column
        if np.isfinite(values).all() and (values == np.floor(values)).all():
            df[column] = df[column].astype('Int64')

    return df
    

## You try it - extract our temp database tables into the customer nested json file, temp_customers_nested.json, which we looked at last week; some hints:

* customer:
 * create a derived column first_last_name 
 * create a derived column last_first_name
 * join the temp_customers table to the zip_codes table to pull the population, area, and density
* sale:
 * receipt_number is a derived column made up of store_id and sale_id
 * since we don't have sales tax on food, sub_total is the total_amount and tax is 0
* line items:
 * 12 is the price since all meals cost $12 tax included
 * 12 * quantity is the line_total

In [5]:
def _fetch_json_rows(query, params=None):
    "run a 'select row_to_json(...)' query and return the json object from every row"

    cursor.execute(query, params)

    rows = cursor.fetchall()

    # the extract only reads, so roll back once the rows are in hand
    connection.rollback()

    return [row[0] for row in rows]


def my_extract_customers_nested_json(file_name):
    """extract nested json with customers at the top level to the file

    file_name: path of the json file to write; also recorded in the file header

    The file holds a header (creator, timestamp, file_name, version, legal) and
    a "customers" list.  Each customer carries a "sales" list, and each sale
    carries its "store" object and a "line_items" list.  The store_id, sale_id
    and customer_id keys are removed from each sale because the receipt number,
    the nested store and the enclosing customer already carry them.  A sale
    whose store_id has no row in temp_stores gets a null "store".

    Returns None.
    """

    connection.rollback()

    file_json = {"creator": "Acme Gourmet Meals",
                 # year-month-day, the same order the sale dates use
                 "timestamp": dt.now().strftime("%Y-%m-%d %H:%M:%S"),
                 "file_name": file_name,
                 "version": "12.4.7",
                 "legal": "Unauthorized use, duplication, or possession, blah, blah",
                 "customers": []
                }

    customer_query = """

    select row_to_json(a)
    from (select cu.customer_id,
                 cu.first_name,
                 cu.last_name,
                 (cu.first_name || ' ' || cu.last_name) as first_last_name,
                 (cu.last_name || ', ' || cu.first_name) as last_first_name,
                 cu.street,
                 cu.city,
                 cu.state,
                 cu.zip,
                 z.population,
                 z.area,
                 z.density,
                 cu.closest_store_id,
                 cu.distance
          from temp_customers as cu
               join zip_codes as z
                   on z.zip = cu.zip
          order by cu.customer_id
          ) as a

    """

    sale_query = """

    select row_to_json(a)
    from (select sa.store_id,
                 sa.sale_id,
                 sa.customer_id,
                 (lpad(sa.store_id::text, 3, '0') || '-' || lpad(sa.sale_id::text, 9, '0')) as receipt_number,
                 sa.sale_date,
                 sa.total_amount as sub_total,
                 0 as tax,
                 sa.total_amount
          from temp_sales as sa
          where sa.customer_id = %s
          order by store_id, sale_id
          ) as a

    """

    store_query = """

    select row_to_json(a)
    from (select *
          from temp_stores
          where store_id = %s
          ) as a

    """

    line_item_query = """

    select row_to_json(a)
    from (select l.product_id,
                 p.description,
                 12 as price,
                 l.quantity,
                 12 * l.quantity as line_total
          from temp_line_items as l
               join products as p
                   on l.product_id = p.product_id
          where store_id = %s and sale_id = %s 
          order by store_id, sale_id, line_item_id
          ) as a

    """

    # many sales share a store, so each store is queried once and then reused
    store_json_by_id = {}

    for customer_json in _fetch_json_rows(customer_query):

        sale_list_json = []

        for sale_json in _fetch_json_rows(sale_query, (customer_json['customer_id'],)):

            # the ids are needed for the lookups below but are not kept in the sale
            store_id = sale_json.pop('store_id')
            sale_id = sale_json.pop('sale_id')
            del sale_json['customer_id']

            if store_id not in store_json_by_id:
                store_rows = _fetch_json_rows(store_query, (store_id,))
                store_json_by_id[store_id] = store_rows[0] if store_rows else None

            sale_json['store'] = store_json_by_id[store_id]

            sale_json['line_items'] = _fetch_json_rows(line_item_query, (store_id, sale_id))

            sale_list_json.append(sale_json)

        customer_json['sales'] = sale_list_json

        file_json['customers'].append(customer_json)

    # the with block closes the file even if json.dump raises
    with open(file_name, "w") as f:
        json.dump(file_json, f, indent=2)

    return

In [6]:
# run the extract, writing the nested customer json to temp_customers_nested_2.json
my_extract_customers_nested_json("temp_customers_nested_2.json")

In [7]:
def my_recursive_print_json(j, level = -1):
    """given a json object print it

    j: the parsed json value - a dict, a list, or a scalar
    level: nesting depth of the caller; leave at the default for the top-level call

    Prints one line per dict key, per list position (as "[i]") and per scalar
    (as "value: ..."), indented four spaces per nesting level, with the
    children of a key or position one level deeper.  Dict keys are converted
    with str() so a non-string key prints instead of raising.  Returns None.
    """

    level += 1

    indent = "    " * level

    if isinstance(j, dict):
        for key, value in j.items():
            print(indent + str(key))
            my_recursive_print_json(value, level)

    elif isinstance(j, list):
        for index, item in enumerate(j):
            print(indent + "[" + str(index) + "]")
            my_recursive_print_json(item, level)

    else:
        print(indent + "value:", str(j))
                  


In [8]:
def my_read_nested_json(file_name):
    """given a file of json, read it and print its nested structure

    file_name: path of the json file to read

    The file is parsed inside a with block so it is closed as soon as the json
    is loaded; the parsed object is then printed by my_recursive_print_json.
    Returns None.
    """

    with open(file_name, "r") as f:
        j = json.load(f)

    my_recursive_print_json(j)

In [9]:
# read back the file written by the extract and print its structure
my_read_nested_json("temp_customers_nested_2.json")

creator
    value: Acme Gourmet Meals
timestamp
    value: 2025-12-02 23:09:02
file_name
    value: temp_customers_nested_2.json
version
    value: 12.4.7
legal
    value: Unauthorized use, duplication, or possession, blah, blah
customers
    [0]
        customer_id
            value: 563
        first_name
            value: Rose
        last_name
            value: Slimings
        first_last_name
            value: Rose Slimings
        last_first_name
            value: Slimings, Rose
        street
            value: 38 Iowa Street
        city
            value: Berkeley
        state
            value: CA
        zip
            value: 94704
        population
            value: 29190
        area
            value: 1.2177
        density
            value: 23972.16
        closest_store_id
            value: 1
        distance
            value: 1
        sales
            [0]
                receipt_number
                    value: 001-000255285
                sale_date
    

            value: 12076 Lotheville Circle
        city
            value: Port Orchard
        state
            value: WA
        zip
            value: 98366
        population
            value: 33618
        area
            value: 22.732
        density
            value: 1478.88
        closest_store_id
            value: 2
        distance
            value: 13
        sales
            [0]
                receipt_number
                    value: 002-000119996
                sale_date
                    value: 2020-05-08
                sub_total
                    value: 36
                tax
                    value: 0
                total_amount
                    value: 36
                store
                    store_id
                        value: 2
                    street
                        value: 1001 Broadway
                    city
                        value: Seattle
                    state
                        value: WA
                  

                        value: 5
                    street
                        value: 1202 Broadway
                    city
                        value: Nashville
                    state
                        value: TN
                    zip
                        value: 37203
                    latitude
                        value: 36.1568
                    longitude
                        value: -86.7881
                line_items
                    [0]
                        product_id
                            value: 4
                        description
                            value: Eggplant Lasagna
                        price
                            value: 12
                        quantity
                            value: 1
                        line_total
                            value: 12
                    [1]
                        product_id
                            value: 6
                        description
                

## You try it - load the stage_1_sales table from the csv file clean_sales.csv;  load the stage_1_line_items table from the csv file clean_line_items.csv; verify each with a query

In [10]:
# start from a clean transaction state before loading
connection.rollback()

# two copy statements in one string: each loads one clean csv file into the
# listed columns of its stage 1 table
query = """

copy stage_1_sales (store_id, sale_id, customer_id, sale_date, total_amount)
from '/user/labs/week_07/clean_data/clean_sales.csv' delimiter ',' NULL '' csv header;

copy stage_1_line_items (store_id, sale_id, line_item_id, product_id, quantity)
from '/user/labs/week_07/clean_data/clean_line_items.csv' delimiter ',' NULL '' csv header;


"""

cursor.execute(query)

# commit so the loaded rows persist; the query helper rolls back around its selects
connection.commit()

In [11]:
# have the helper roll back the connection both before and after the select
rollback_before_flag = True
rollback_after_flag = True

# verify the load: every stage_1_sales row in stage_id order
query = """

select * 
from stage_1_sales
order by stage_id;

"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,stage_id,store_id,sale_id,customer_id,sale_date,total_amount
0,1,1,128112,3491,2020-04-30,24
1,2,1,144249,1597,2020-05-16,84
2,3,1,163141,4159,2020-06-04,96
3,4,1,169216,4198,2020-06-09,144
4,5,1,179181,5394,2020-06-18,48
...,...,...,...,...,...,...
145,146,5,126722,28750,2020-07-26,36
146,147,5,136553,30184,2020-08-09,36
147,148,5,136735,27728,2020-08-10,96
148,149,5,147541,27654,2020-08-27,96


In [12]:
# have the helper roll back the connection both before and after the select
rollback_before_flag = True
rollback_after_flag = True

# verify the load: every stage_1_line_items row in stage_id order
query = """

select * 
from stage_1_line_items
order by stage_id;

"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,stage_id,store_id,sale_id,line_item_id,product_id,quantity
0,1,1,128112,1,1,1
1,2,1,128112,2,8,1
2,3,1,144249,1,1,1
3,4,1,144249,2,2,1
4,5,1,144249,3,4,2
...,...,...,...,...,...,...
347,348,5,147541,4,4,1
348,349,5,147541,5,7,1
349,350,5,157919,1,1,1
350,351,5,157919,2,4,2


## You try it - using the function my_explore_staging_table(), defined in the next cell, explore the tables stage_1_sales and stage_1_line_items

In [13]:
def my_explore_staging_table(table_name):
    "profile each column of the given table, apart from stage_id"

    rule = "---------------------------------------------------"

    print("\n" + rule)
    print("Exploring Columns for Table:", table_name)
    print(rule + "\n")

    connection.rollback()

    # a select that can match no rows is enough to populate cursor.description
    cursor.execute("".join(["select * from ", table_name, " where 0 = 1;"]))

    connection.rollback()

    # stage_id is left out of the profile
    names_to_profile = [d[0] for d in cursor.description if d[0] != "stage_id"]

    for name in names_to_profile:

        print(rule)
        print("Column:", name)
        print(rule)

        # length range, row count and distinct-value count for the column
        summary_query = "".join([
            "select min(length(", name, ")) as min_length, ",
            " max(length(", name, ")) as max_length, ",
            " count(*) as total_rows, ",
            " count(distinct ", name, ") as total_distinct_values",
            " from ", table_name, ";",
        ])

        summary_df = my_select_query_pandas(summary_query, True, True)

        display(HTML(summary_df.to_html()))

        # the ten most frequent values of the column
        top_values_query = "".join([
            "select ", name, ", count(*) from ", table_name,
            " group by ", name, " order by 2 desc limit 10;",
        ])

        top_values_df = my_select_query_pandas(top_values_query, True, True)

        display(HTML(top_values_df.to_html()))
    

In [14]:
# profile every column of stage_1_sales except stage_id
my_explore_staging_table("stage_1_sales")


---------------------------------------------------
Exploring Columns for Table: stage_1_sales
---------------------------------------------------

---------------------------------------------------
Column: store_id
---------------------------------------------------


Unnamed: 0,min_length,max_length,total_rows,total_distinct_values
0,1,1,150,5


Unnamed: 0,store_id,count
0,4,30
1,5,30
2,3,30
3,2,30
4,1,30


---------------------------------------------------
Column: sale_id
---------------------------------------------------


Unnamed: 0,min_length,max_length,total_rows,total_distinct_values
0,5,6,150,50


Unnamed: 0,sale_id,count
0,126722,3
1,107715,3
2,88989,3
3,143587,3
4,177427,3
5,144249,3
6,115912,3
7,209184,3
8,147541,3
9,144107,3


---------------------------------------------------
Column: customer_id
---------------------------------------------------


Unnamed: 0,min_length,max_length,total_rows,total_distinct_values
0,3,5,150,50


Unnamed: 0,customer_id,count
0,4260,3
1,25339,3
2,27654,3
3,3491,3
4,15481,3
5,29783,3
6,27380,3
7,27728,3
8,30184,3
9,11779,3


---------------------------------------------------
Column: sale_date
---------------------------------------------------


Unnamed: 0,min_length,max_length,total_rows,total_distinct_values
0,10,10,150,44


Unnamed: 0,sale_date,count
0,2020-08-23,9
1,2020-06-20,9
2,2020-06-18,6
3,2020-04-30,6
4,2020-09-16,3
5,2020-07-14,3
6,2020-08-29,3
7,2020-04-23,3
8,2020-04-27,3
9,2020-07-22,3


---------------------------------------------------
Column: total_amount
---------------------------------------------------


Unnamed: 0,min_length,max_length,total_rows,total_distinct_values
0,2,3,150,11


Unnamed: 0,total_amount,count
0,48,30
1,36,24
2,24,24
3,96,18
4,84,9
5,144,9
6,60,9
7,108,9
8,132,6
9,12,6


In [15]:
# profile every column of stage_1_line_items except stage_id
my_explore_staging_table("stage_1_line_items")


---------------------------------------------------
Exploring Columns for Table: stage_1_line_items
---------------------------------------------------

---------------------------------------------------
Column: store_id
---------------------------------------------------


Unnamed: 0,min_length,max_length,total_rows,total_distinct_values
0,1,1,352,5


Unnamed: 0,store_id,count
0,4,88
1,1,70
2,2,68
3,5,64
4,3,62


---------------------------------------------------
Column: sale_id
---------------------------------------------------


Unnamed: 0,min_length,max_length,total_rows,total_distinct_values
0,5,6,352,50


Unnamed: 0,sale_id,count
0,183530,14
1,203726,12
2,169216,12
3,144107,12
4,136735,10
5,147541,10
6,109305,10
7,144249,10
8,109083,10
9,111979,10


---------------------------------------------------
Column: line_item_id
---------------------------------------------------


Unnamed: 0,min_length,max_length,total_rows,total_distinct_values
0,1,1,352,7


Unnamed: 0,line_item_id,count
0,1,100
1,2,94
2,3,70
3,4,46
4,5,32
5,6,8
6,7,2


---------------------------------------------------
Column: product_id
---------------------------------------------------


Unnamed: 0,min_length,max_length,total_rows,total_distinct_values
0,1,1,352,8


Unnamed: 0,product_id,count
0,1,66
1,6,62
2,4,60
3,2,44
4,8,38
5,7,34
6,3,32
7,5,16


---------------------------------------------------
Column: quantity
---------------------------------------------------


Unnamed: 0,min_length,max_length,total_rows,total_distinct_values
0,1,1,352,4


Unnamed: 0,quantity,count
0,1,216
1,2,92
2,3,34
3,4,10


## You try it - copy data from stage_1_sales to stage_2_sales and from stage_1_line_items to stage_2_line_items; query to verify

In [16]:
#
# copy the stage 1 rows into the stage 2 tables,
# casting each column to numeric (or date for sale_date); stage_id is copied as is
#

# start from a clean transaction state before inserting
connection.rollback()

query = """

insert into stage_2_sales
(stage_id, store_id, sale_id, customer_id, sale_date, total_amount)
select stage_id,
       store_id::numeric,
       sale_id::numeric,
       customer_id::numeric,
       sale_date::date,
       total_amount::numeric
from stage_1_sales
order by stage_id;

insert into stage_2_line_items
(stage_id, store_id, sale_id, line_item_id, product_id, quantity)
select stage_id,
       store_id::numeric,
       sale_id::numeric,
       line_item_id::numeric,
       product_id::numeric,
       quantity::numeric
from stage_1_line_items
order by stage_id;


"""

cursor.execute(query)

# commit so the copied rows persist; the query helper rolls back around its selects
connection.commit()



In [17]:
# have the helper roll back the connection both before and after the select
rollback_before_flag = True
rollback_after_flag = True

# verify the copy: every stage_2_sales row in stage_id order
query = """

select * 
from stage_2_sales
order by stage_id;

"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,stage_id,store_id,sale_id,customer_id,sale_date,total_amount
0,1,1,128112,3491,2020-04-30,24
1,2,1,144249,1597,2020-05-16,84
2,3,1,163141,4159,2020-06-04,96
3,4,1,169216,4198,2020-06-09,144
4,5,1,179181,5394,2020-06-18,48
...,...,...,...,...,...,...
145,146,5,126722,28750,2020-07-26,36
146,147,5,136553,30184,2020-08-09,36
147,148,5,136735,27728,2020-08-10,96
148,149,5,147541,27654,2020-08-27,96


In [18]:
# have the helper roll back the connection both before and after the select
rollback_before_flag = True
rollback_after_flag = True

# verify the copy: every stage_2_line_items row in stage_id order
query = """

select * 
from stage_2_line_items
order by stage_id;

"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,stage_id,store_id,sale_id,line_item_id,product_id,quantity
0,1,1,128112,1,1,1
1,2,1,128112,2,8,1
2,3,1,144249,1,1,1
3,4,1,144249,2,2,1
4,5,1,144249,3,4,2
...,...,...,...,...,...,...
171,172,5,147541,4,4,1
172,173,5,147541,5,7,1
173,174,5,157919,1,1,1
174,175,5,157919,2,4,2
