# Data Wrangling 2.1

In [1]:
import math
import numpy as np
import pandas as pd

import psycopg2

import json

import csv

from datetime import datetime as dt

from IPython.display import display, HTML


In [2]:
# open the postgres connection used by the cells and functions below
# NOTE(review): the user name and password are hard-coded here; presumably
# acceptable only for the lab container - confirm before reusing elsewhere
connection = psycopg2.connect(
    user = "postgres",
    password = "ucb",
    host = "postgres",
    port = "5432",
    database = "postgres"
)

In [3]:
# single cursor on the connection above; the cursor.execute() calls below all use it
cursor = connection.cursor()

In [4]:
#
# function to run a select query and return rows in a pandas dataframe
# pandas puts all numeric values from postgres to float
# if every value in a float column is a whole number, change it to integer
#

def my_select_query_pandas(query, rollback_before_flag, rollback_after_flag):
    """Run a select query and return the rows in a pandas dataframe.

    Uses the module-level ``connection``.

    Args:
        query: SQL text passed to ``pd.read_sql_query``.
        rollback_before_flag: when truthy, roll the connection back before
            running the query.
        rollback_after_flag: when truthy, roll the connection back after
            running the query.

    Returns:
        The query result as a DataFrame. Every float64 column whose non-NaN
        values are all finite whole numbers is converted to the nullable
        ``Int64`` dtype; all other columns are returned unchanged.
    """

    if rollback_before_flag:
        connection.rollback()

    df = pd.read_sql_query(query, connection)

    if rollback_after_flag:
        connection.rollback()

    # fix the float columns that really should be integers

    for column in df:

        if df[column].dtype != "float64":
            continue

        values = df[column].to_numpy()
        present = values[~np.isnan(values)]

        # infinities cannot be stored as integers (and math.floor() raises on
        # them), so a column containing one is left as float
        if np.isfinite(present).all() and (present == np.floor(present)).all():
            df[column] = df[column].astype('Int64')

    return df
    

# Lab: Data Extraction

## So far, we have done several data extractions:

* extracting database tables to csv format
* extracting database tables to flat json format
* extracting flat json files to csv format
* extracting database tables to nested json format
* extracting nested json files to csv format
* extracting database tables to Excel workbooks with several work sheets
* extracting Excel workbooks with several work sheets to csv format

## Last week, we extracted our temp database tables into the stores nested json file; we will now extract our temp database tables starting with sales at the top and perform some more complicated transformations as part of the extract

In [5]:
def my_extract_sales_nested_json(file_name):
    """Extract nested json with sales at the top level to the file.

    Reads every row of temp_sales and, for each sale, nests the matching
    temp_stores row, the temp_customers row joined to zip_codes, and the
    temp_line_items rows joined to products.  The result is written to
    file_name as indented json under a small header (creator, timestamp,
    file_name, version, legal).  Uses the module-level connection and cursor.

    Args:
        file_name: path of the json file to write; also stored in the header.

    Raises:
        LookupError: when the store or customer lookup for a sale returns
            no row.
    """

    def run_json_query(query, parameters=None):
        "run a row_to_json query and return the json value of every row"

        cursor.execute(query, parameters)
        rows = cursor.fetchall()

        # the extract only reads, so release the transaction right away
        connection.rollback()

        return [row[0] for row in rows]

    sales_query = """

    select row_to_json(a)
    from (select store_id,
                 sale_id,
                 customer_id,
                 (lpad(store_id::text, 3, '0') || '-' || lpad(sale_id::text, 9, '0')) as receipt_number,
                 sale_date,
                 total_amount as sub_total,
                 0 as tax,
                 total_amount
          from temp_sales
          order by store_id, sale_id
          ) as a

    """

    store_query = """

        select row_to_json(a)
        from (select *
              from temp_stores
              where store_id = %s
              ) as a

    """

    customer_query = """

        select row_to_json(a)
        from (select customer_id,
                     cu.first_name,
                     cu.last_name,
                     (cu.first_name || ' ' || cu.last_name) as first_last_name,
                     (cu.last_name || ', ' || cu.first_name) as last_first_name,
                     cu.street,
                     cu.city,
                     cu.state,
                     cu.zip,
                     z.population,
                     z.area,
                     z.density,
                     cu.closest_store_id,
                     cu.distance
              from temp_customers as cu
                   join zip_codes as z
                       on z.zip = cu.zip
              where customer_id = %s
              ) as a

    """

    line_items_query = """

        select row_to_json(a)
        from (select l.product_id,
                     p.description,
                     12 as price,
                     l.quantity,
                     12 * l.quantity as line_total
              from temp_line_items as l
                   join products as p
                       on l.product_id = p.product_id
              where store_id = %s and sale_id = %s
              order by store_id, sale_id, line_item_id
              ) as a

    """

    connection.rollback()

    file_json = {"creator": "Acme Gourmet Meals",
                 # year-month-day; the earlier "%Y-%d-%m" wrote day before month
                 "timestamp": dt.now().strftime("%Y-%m-%d %H:%M:%S"),
                 "file_name": file_name,
                 "version": "12.4.7",
                 "legal": "Unauthorized use, duplication, or possession, blah, blah",
                 "sales": []
                 }

    for sale_json in run_json_query(sales_query):

        # the ids drive the lookups below but are not repeated at the sale
        # level; they still appear inside the nested store and customer
        store_id = sale_json.pop('store_id')
        sale_id = sale_json.pop('sale_id')
        customer_id = sale_json.pop('customer_id')

        store_rows = run_json_query(store_query, (store_id,))

        if not store_rows:
            raise LookupError(f"no temp_stores row for store_id {store_id}")

        sale_json['store'] = store_rows[0]

        customer_rows = run_json_query(customer_query, (customer_id,))

        if not customer_rows:
            # the inner join also drops customers whose zip is not in zip_codes
            raise LookupError(
                f"no temp_customers / zip_codes row for customer_id {customer_id}")

        sale_json['customer'] = customer_rows[0]

        sale_json['line_items'] = run_json_query(line_items_query, (store_id, sale_id))

        file_json['sales'].append(sale_json)

    # the with block closes the file even when json.dump raises
    with open(file_name, "w", encoding="utf-8") as f:
        json.dump(file_json, f, indent=2)

In [6]:
# write the sales-at-the-top nested json extract to temp_sales_nested.json
my_extract_sales_nested_json("temp_sales_nested.json")

## Use recursion to print nested json to show the structure

In [7]:
def my_recursive_print_json(j, level = -1):
    """Print the structure of a parsed json value, one item per line.

    A dict prints each key and then recurses into its value; a list prints
    each index as "[i]" and then recurses into the element; anything else
    prints as "value: <str(j)>".  Each nesting level is indented by four
    more spaces.

    Args:
        j: the parsed json value (dict, list, or scalar).  Subclasses of
            dict and list are walked like dict and list.
        level: nesting depth of the caller; the default of -1 makes the
            top-level value print at depth 0.
    """

    level += 1

    indent = "    " * level

    # isinstance (not an exact type match) so that dict / list subclasses
    # such as OrderedDict are walked instead of printed as a single value
    if isinstance(j, dict):
        for key, value in j.items():
            print(indent + key)
            my_recursive_print_json(value, level)

    elif isinstance(j, list):
        for index, item in enumerate(j):
            print(indent + "[" + str(index) + "]")
            my_recursive_print_json(item, level)

    else:
        print(indent + "value:", str(j))
                  


In [8]:
def my_read_nested_json(file_name):
    """Read a json file and print its nested structure.

    Args:
        file_name: path of the json file to load.

    The parsed content is printed with my_recursive_print_json(); nothing
    is returned.
    """

    # the with block closes the file; the earlier bare "f.close" (no
    # parentheses) never actually called close
    with open(file_name, "r") as f:
        j = json.load(f)

    my_recursive_print_json(j)

In [9]:
# print the structure of the sales extract written above
my_read_nested_json("temp_sales_nested.json")

creator
    value: Acme Gourmet Meals
timestamp
    value: 2025-13-02 00:12:52
file_name
    value: temp_sales_nested.json
version
    value: 12.4.7
legal
    value: Unauthorized use, duplication, or possession, blah, blah
sales
    [0]
        receipt_number
            value: 001-000128112
        sale_date
            value: 2020-04-30
        sub_total
            value: 24
        tax
            value: 0
        total_amount
            value: 24
        store
            store_id
                value: 1
            street
                value: 3000 Telegraph Ave
            city
                value: Berkeley
            state
                value: CA
            zip
                value: 94705
            latitude
                value: 37.8555
            longitude
                value: -122.2604
        customer
            customer_id
                value: 3491
            first_name
                value: Siouxie
            last_name
                value: M'Quharge

                value: 75201
            latitude
                value: 32.7958
            longitude
                value: -96.8015
        customer
            customer_id
                value: 15799
            first_name
                value: Zackariah
            last_name
                value: McCreath
            first_last_name
                value: Zackariah McCreath
            last_first_name
                value: McCreath, Zackariah
            street
                value: 64828 Farwell Terrace
            city
                value: Dallas
            state
                value: TX
            zip
                value: 75219
            population
                value: 23419
            area
                value: 2.32
            density
                value: 10094.48
            closest_store_id
                value: 3
            distance
                value: 1
        line_items
            [0]
                product_id
                    value: 2
    

                    value: 12
                quantity
                    value: 1
                line_total
                    value: 12
            [4]
                product_id
                    value: 7
                description
                    value: Tilapia Piccata
                price
                    value: 12
                quantity
                    value: 1
                line_total
                    value: 12
    [48]
        receipt_number
            value: 005-000147541
        sale_date
            value: 2020-08-27
        sub_total
            value: 96
        tax
            value: 0
        total_amount
            value: 96
        store
            store_id
                value: 5
            street
                value: 1202 Broadway
            city
                value: Nashville
            state
                value: TN
            zip
                value: 37203
            latitude
                value: 36.1568
            longi

## You try it - extract our temp database tables into the customer nested json file, temp_customers_nested.json, which we looked at last week; some hints:

* customer:
 * create a derived column first_last_name 
 * create a derived column last_first_name
 * join the temp_customers table to the zip_codes table to pull the population, area, and density
* sale:
 * receipt_number is a derived column made up of store_id and sale_id
 * since we don't have sales tax on food, sub_total is the total_amount and tax is 0
* line items:
 * 12 is the price since all meals cost $12 tax included
 * 12 * quantity is the line_total

In [10]:
def my_extract_customers_nested_json(file_name):
    """Extract nested json with customers at the top level to the file.

    Reads every temp_customers row that joins to zip_codes and, for each
    customer, nests that customer's temp_sales rows; each sale in turn nests
    its temp_stores row and its temp_line_items rows joined to products.
    The result is written to file_name as indented json under a small header
    (creator, timestamp, file_name, version, legal).  Uses the module-level
    connection and cursor.

    Args:
        file_name: path of the json file to write; also stored in the header.

    Raises:
        LookupError: when the store lookup for a sale returns no row.
    """

    def run_json_query(query, parameters=None):
        "run a row_to_json query and return the json value of every row"

        cursor.execute(query, parameters)
        rows = cursor.fetchall()

        # the extract only reads, so release the transaction right away
        connection.rollback()

        return [row[0] for row in rows]

    customers_query = """

    select row_to_json(a)
    from (select cu.customer_id,
                 cu.first_name,
                 cu.last_name,
                 (cu.first_name || ' ' || cu.last_name) as first_last_name,
                 (cu.last_name || ', ' || cu.first_name) as last_first_name,
                 cu.street,
                 cu.city,
                 cu.state,
                 cu.zip,
                 z.population,
                 z.area,
                 z.density,
                 cu.closest_store_id,
                 cu.distance
          from temp_customers as cu
               join zip_codes as z
                   on cu.zip = z.zip
          order by cu.customer_id
          ) as a

    """

    sales_query = """

        select row_to_json(a)
        from (select sa.store_id,
                     sa.sale_id,
                     sa.customer_id,
                     (lpad(sa.store_id::text, 3, '0') || '-' || lpad(sa.sale_id::text, 9, '0')) as receipt_number,
                     sa.sale_date,
                     sa.total_amount as sub_total,
                     0 as tax,
                     sa.total_amount
              from temp_sales as sa
              where sa.customer_id = %s
              order by store_id, sale_id
              ) as a

    """

    store_query = """

        select row_to_json(a)
        from (select *
              from temp_stores
              where store_id = %s
              ) as a

    """

    line_items_query = """

        select row_to_json(a)
        from (select l.product_id,
                     p.description,
                     12 as price,
                     l.quantity,
                     12 * l.quantity as line_total
              from temp_line_items as l
                   join products as p
                       on l.product_id = p.product_id
              where store_id = %s and sale_id = %s
              order by store_id, sale_id, line_item_id
              ) as a

    """

    connection.rollback()

    file_json = {"creator": "Acme Gourmet Meals",
                 # year-month-day; the earlier "%Y-%d-%m" wrote day before month
                 "timestamp": dt.now().strftime("%Y-%m-%d %H:%M:%S"),
                 "file_name": file_name,
                 "version": "12.4.7",
                 "legal": "Unauthorized use, duplication, or possession, blah, blah",
                 "customers": []
                 }

    for customer_json in run_json_query(customers_query):

        customer_id = customer_json['customer_id']

        sale_list_json = []

        for sale_json in run_json_query(sales_query, (customer_id,)):

            # the ids drive the lookups below but are not repeated at the
            # sale level; customer_id is already on the enclosing customer
            store_id = sale_json.pop('store_id')
            sale_id = sale_json.pop('sale_id')
            del sale_json['customer_id']

            store_rows = run_json_query(store_query, (store_id,))

            if not store_rows:
                raise LookupError(f"no temp_stores row for store_id {store_id}")

            sale_json['store'] = store_rows[0]

            sale_json['line_items'] = run_json_query(line_items_query, (store_id, sale_id))

            sale_list_json.append(sale_json)

        customer_json['sales'] = sale_list_json

        file_json['customers'].append(customer_json)

    # the with block closes the file even when json.dump raises
    with open(file_name, "w", encoding="utf-8") as f:
        json.dump(file_json, f, indent=2)

In [11]:
# write the customers-at-the-top nested json extract to temp_customers_nested.json
my_extract_customers_nested_json("temp_customers_nested.json")

In [12]:
# print the structure of the customers extract written above
my_read_nested_json("temp_customers_nested.json")

creator
    value: Acme Gourmet Meals
timestamp
    value: 2025-13-02 00:12:52
file_name
    value: temp_customers_nested.json
version
    value: 12.4.7
legal
    value: Unauthorized use, duplication, or possession, blah, blah
customers
    [0]
        customer_id
            value: 563
        first_name
            value: Rose
        last_name
            value: Slimings
        first_last_name
            value: Rose Slimings
        last_first_name
            value: Slimings, Rose
        street
            value: 38 Iowa Street
        city
            value: Berkeley
        state
            value: CA
        zip
            value: 94704
        population
            value: 29190
        area
            value: 1.2177
        density
            value: 23972.16
        closest_store_id
            value: 1
        distance
            value: 1
        sales
            [0]
                receipt_number
                    value: 001-000255285
                sale_date
      

            value: 2132.6
        closest_store_id
            value: 1
        distance
            value: 7
        sales
            [0]
                receipt_number
                    value: 001-000169216
                sale_date
                    value: 2020-06-09
                sub_total
                    value: 144
                tax
                    value: 0
                total_amount
                    value: 144
                store
                    store_id
                        value: 1
                    street
                        value: 3000 Telegraph Ave
                    city
                        value: Berkeley
                    state
                        value: CA
                    zip
                        value: 94705
                    latitude
                        value: 37.8555
                    longitude
                        value: -122.2604
                line_items
                    [0]
                     

            value: Sheffield Dunkerton
        last_first_name
            value: Dunkerton, Sheffield
        street
            value: 11 Quincy Parkway
        city
            value: Dallas
        state
            value: TX
        zip
            value: 75225
        population
            value: 21296
        area
            value: 4.8118
        density
            value: 4425.77
        closest_store_id
            value: 3
        distance
            value: 5
        sales
            [0]
                receipt_number
                    value: 003-000154102
                sale_date
                    value: 2020-07-10
                sub_total
                    value: 24
                tax
                    value: 0
                total_amount
                    value: 24
                store
                    store_id
                        value: 3
                    street
                        value: 2510 McKinney Ave
                    city
        

                    street
                        value: 299 SE 3rd Ave
                    city
                        value: Miami
                    state
                        value: FL
                    zip
                        value: 33131
                    latitude
                        value: 25.772
                    longitude
                        value: -80.1891
                line_items
                    [0]
                        product_id
                            value: 1
                        description
                            value: Pistachio Salmon
                        price
                            value: 12
                        quantity
                            value: 1
                        line_total
                            value: 12
                    [1]
                        product_id
                            value: 2
                        description
                            value: Teriyaki Chicken
 

        area
            value: 9.0824
        density
            value: 1626.0
        closest_store_id
            value: 5
        distance
            value: 4
        sales
            [0]
                receipt_number
                    value: 005-000109305
                sale_date
                    value: 2020-06-27
                sub_total
                    value: 120
                tax
                    value: 0
                total_amount
                    value: 120
                store
                    store_id
                        value: 5
                    street
                        value: 1202 Broadway
                    city
                        value: Nashville
                    state
                        value: TN
                    zip
                        value: 37203
                    latitude
                        value: 36.1568
                    longitude
                        value: -86.7881
                line_i

# Lab: Creating Staging Tables, Loading Raw Data into Staging Tables

In [13]:
# remove any existing stage_1 tables; "if exists" lets this cell run whether
# or not the tables are already there
connection.rollback()

query = """

drop table if exists stage_1_customers;
drop table if exists stage_1_sales;
drop table if exists stage_1_line_items;



"""

cursor.execute(query)

# commit so the drops take effect
connection.commit()



## Using varchar(100) for all columns so data out of format will load so we can clean it

In [14]:
#
# create staging tables with all varchar(100)
#
# every data column is varchar(100); stage_id is a serial column that the
# copy cells below leave out of their column lists
#

connection.rollback()

query = """


create table stage_1_customers (
  stage_id serial,
  customer_id varchar(100),
  first_name varchar(100),
  last_name varchar(100),
  street varchar(100),
  city varchar(100),
  state varchar(100),
  zip varchar(100),
  closest_store_id varchar(100),
  distance varchar(100)
);

create table stage_1_sales (
  stage_id serial,
  store_id varchar(100),
  sale_id varchar(100),
  customer_id varchar(100),
  sale_date varchar(100),
  total_amount varchar(100)
);

create table stage_1_line_items (
  stage_id serial,
  store_id varchar(100),
  sale_id varchar(100),
  line_item_id varchar(100),
  product_id varchar(100),
  quantity varchar(100)
);

"""

cursor.execute(query)

connection.commit()



In [15]:
# load clean_customers.csv into stage_1_customers; stage_id is not in the
# column list, so it is filled by the serial default
# NOTE(review): "copy ... from '<path>'" reads the file on the database
# server, so the path must exist there - confirm for this environment
connection.rollback()

query = """

copy stage_1_customers (customer_id, first_name, last_name, street, city, state, zip, closest_store_id, distance)
from '/user/labs/week_07/clean_data/clean_customers.csv' delimiter ',' NULL '' csv header;

"""

cursor.execute(query)

connection.commit()

In [16]:
# verify the load: show stage_1_customers in stage_id order
# both flags are True, so my_select_query_pandas rolls back before and after the select
rollback_before_flag = True
rollback_after_flag = True

query = """

select * 
from stage_1_customers
order by stage_id;

"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,stage_id,customer_id,first_name,last_name,street,city,state,zip,closest_store_id,distance
0,1,563,Rose,Slimings,38 Iowa Street,Berkeley,CA,94704,1,1
1,2,1597,Norry,Macauley,654 Sommers Plaza,Oakland,CA,94612,1,3
2,3,1958,Theresina,Penswick,5975 Twin Pines Hill,Berkeley,CA,94707,1,3
3,4,1991,Kevon,Wickett,472 Arizona Court,Berkeley,CA,94707,1,3
4,5,3491,Siouxie,M'Quharge,747 Westridge Center,Alameda,CA,94501,1,6
5,6,4159,Cheryl,Broe,7 Ruskin Alley,El Sobrante,CA,94803,1,7
6,7,4198,Andreana,Drew,11039 Cordelia Alley,El Sobrante,CA,94803,1,7
7,8,4260,Dom,Risbrough,3 Northland Crossing,Richmond,CA,94805,1,7
8,9,5394,Katharina,Bavester,522 Cordelia Lane,San Francisco,CA,94102,1,10
9,10,6782,Lyndsay,Iuorio,4 Thackeray Road,Walnut Creek,CA,94596,1,12


## You try it - load the stage_1_sales table from the csv file clean_sales.csv;  load the stage_1_line_items table from the csv file clean_line_items.csv; verify each with a query

In [17]:
# load clean_sales.csv and clean_line_items.csv into their stage_1 tables;
# both copy statements are sent in one execute() and committed together
connection.rollback()

query = """

copy stage_1_sales (store_id,sale_id,customer_id,sale_date,total_amount)
from '/user/labs/week_07/clean_data/clean_sales.csv' delimiter ',' NULL '' csv header;

copy stage_1_line_items (store_id,sale_id,line_item_id,product_id,quantity)
from '/user/labs/week_07/clean_data/clean_line_items.csv' delimiter ',' NULL '' csv header;

"""

cursor.execute(query)

connection.commit()

In [18]:
# verify the load: show stage_1_sales in stage_id order
rollback_before_flag = True
rollback_after_flag = True

query = """

select * 
from stage_1_sales
order by stage_id;

"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,stage_id,store_id,sale_id,customer_id,sale_date,total_amount
0,1,1,128112,3491,2020-04-30,24
1,2,1,144249,1597,2020-05-16,84
2,3,1,163141,4159,2020-06-04,96
3,4,1,169216,4198,2020-06-09,144
4,5,1,179181,5394,2020-06-18,48
5,6,1,181897,1958,2020-06-20,48
6,7,1,248269,4260,2020-08-22,60
7,8,1,250031,6782,2020-08-23,24
8,9,1,255285,563,2020-08-29,36
9,10,1,263524,1991,2020-09-07,48


In [19]:
# verify the load: show stage_1_line_items in stage_id order
rollback_before_flag = True
rollback_after_flag = True

query = """

select * 
from stage_1_line_items
order by stage_id;

"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,stage_id,store_id,sale_id,line_item_id,product_id,quantity
0,1,1,128112,1,1,1
1,2,1,128112,2,8,1
2,3,1,144249,1,1,1
3,4,1,144249,2,2,1
4,5,1,144249,3,4,2
...,...,...,...,...,...,...
171,172,5,147541,4,4,1
172,173,5,147541,5,7,1
173,174,5,157919,1,1,1
174,175,5,157919,2,4,2


# Lab: Raw Data Exploration Using Staging Tables

## Given a table, query the column names; for each column, find the max length, min length, max value, min value, how distinct the column is, etc.

In [20]:
def my_explore_staging_table(table_name):
    """Profile every column of a staging table except stage_id.

    For each column two result sets are displayed as HTML tables: the min and
    max value length together with the total and distinct row counts, and the
    ten most frequent values with their counts.  Uses the module-level
    connection and cursor, and runs the selects through
    my_select_query_pandas() with both rollback flags set.

    Args:
        table_name: name of the table to explore; it is placed directly into
            the SQL text, so it must be a trusted identifier.
    """

    rule = "---------------------------------------------------"

    print("\n" + rule)
    print("Exploring Columns for Table:", table_name)
    print(rule + "\n")

    connection.rollback()

    # a select that can match no rows still fills in cursor.description,
    # which is all that is needed to list the columns
    cursor.execute("select * from " + table_name + " where 0 = 1;")

    connection.rollback()

    data_columns = [d[0] for d in cursor.description if d[0] != "stage_id"]

    for column_name in data_columns:

        print(rule)
        print("Column:", column_name)
        print(rule)

        length_query = (
            f"select min(length({column_name})) as min_length, "
            f" max(length({column_name})) as max_length, "
            " count(*) as total_rows, "
            f" count(distinct {column_name}) as total_distinct_values"
            f" from {table_name};"
        )

        display(HTML(my_select_query_pandas(length_query, True, True).to_html()))

        top_values_query = (
            f"select {column_name}, count(*) from {table_name}"
            f" group by {column_name} order by 2 desc limit 10;"
        )

        display(HTML(my_select_query_pandas(top_values_query, True, True).to_html()))
    

In [21]:
# profile each column of stage_1_customers (stage_id is skipped by the function)
my_explore_staging_table("stage_1_customers")


---------------------------------------------------
Exploring Columns for Table: stage_1_customers
---------------------------------------------------

---------------------------------------------------
Column: customer_id
---------------------------------------------------


Unnamed: 0,min_length,max_length,total_rows,total_distinct_values
0,3,5,50,50


Unnamed: 0,customer_id,count
0,4198,1
1,19494,1
2,29922,1
3,25267,1
4,22673,1
5,9528,1
6,6782,1
7,1597,1
8,11291,1
9,16932,1


---------------------------------------------------
Column: first_name
---------------------------------------------------


Unnamed: 0,min_length,max_length,total_rows,total_distinct_values
0,3,9,50,50


Unnamed: 0,first_name,count
0,Frederica,1
1,Markos,1
2,Lucilia,1
3,Juliette,1
4,Norry,1
5,Garwin,1
6,Beth,1
7,Brianne,1
8,Billy,1
9,Sim,1


---------------------------------------------------
Column: last_name
---------------------------------------------------


Unnamed: 0,min_length,max_length,total_rows,total_distinct_values
0,4,11,50,50


Unnamed: 0,last_name,count
0,Cabrer,1
1,Campany,1
2,Simoncello,1
3,Ansill,1
4,Lewry,1
5,Bridel,1
6,Dorie,1
7,Lamyman,1
8,Broe,1
9,Morrall,1


---------------------------------------------------
Column: street
---------------------------------------------------


Unnamed: 0,min_length,max_length,total_rows,total_distinct_values
0,14,24,50,50


Unnamed: 0,street,count
0,7 Ruskin Alley,1
1,91111 West Road,1
2,11 Quincy Parkway,1
3,987 Kim Terrace,1
4,5975 Twin Pines Hill,1
5,3078 Emmet Junction,1
6,593 American Ash Hill,1
7,172 Bunker Hill Drive,1
8,94261 Sunnyside Junction,1
9,6858 Burning Wood Avenue,1


---------------------------------------------------
Column: city
---------------------------------------------------


Unnamed: 0,min_length,max_length,total_rows,total_distinct_values
0,5,15,50,20


Unnamed: 0,city,count
0,Dallas,9
1,Seattle,7
2,Nashville,7
3,Miami,7
4,Berkeley,3
5,Madison,2
6,El Sobrante,2
7,Fort Lauderdale,1
8,Mesquite,1
9,Alameda,1


---------------------------------------------------
Column: state
---------------------------------------------------


Unnamed: 0,min_length,max_length,total_rows,total_distinct_values
0,2,2,50,5


Unnamed: 0,state,count
0,CA,10
1,TX,10
2,FL,10
3,TN,10
4,WA,10


---------------------------------------------------
Column: zip
---------------------------------------------------


Unnamed: 0,min_length,max_length,total_rows,total_distinct_values
0,5,5,50,41


Unnamed: 0,zip,count
0,33166,2
1,37208,2
2,75219,2
3,98112,2
4,37212,2
5,75210,2
6,94707,2
7,94803,2
8,37115,2
9,75206,1


---------------------------------------------------
Column: closest_store_id
---------------------------------------------------


Unnamed: 0,min_length,max_length,total_rows,total_distinct_values
0,1,1,50,5


Unnamed: 0,closest_store_id,count
0,5,10
1,2,10
2,1,10
3,4,10
4,3,10


---------------------------------------------------
Column: distance
---------------------------------------------------


Unnamed: 0,min_length,max_length,total_rows,total_distinct_values
0,1,2,50,15


Unnamed: 0,distance,count
0,2,7
1,4,6
2,3,6
3,6,5
4,8,4
5,7,4
6,1,4
7,9,4
8,5,3
9,10,2


## In the coming labs for this week, we will do further exploration to clean the data as we go

## You try it - using the above function my_explore_staging_table(), explore the tables stage_1_sales and stage_1_line_items

In [22]:
# profile each column of stage_1_sales (stage_id is skipped by the function)
my_explore_staging_table("stage_1_sales")


---------------------------------------------------
Exploring Columns for Table: stage_1_sales
---------------------------------------------------

---------------------------------------------------
Column: store_id
---------------------------------------------------


Unnamed: 0,min_length,max_length,total_rows,total_distinct_values
0,1,1,50,5


Unnamed: 0,store_id,count
0,2,10
1,4,10
2,5,10
3,1,10
4,3,10


---------------------------------------------------
Column: sale_id
---------------------------------------------------


Unnamed: 0,min_length,max_length,total_rows,total_distinct_values
0,5,6,50,50


Unnamed: 0,sale_id,count
0,126722,1
1,107715,1
2,88989,1
3,143587,1
4,177427,1
5,144249,1
6,115912,1
7,209184,1
8,147541,1
9,144107,1


---------------------------------------------------
Column: customer_id
---------------------------------------------------


Unnamed: 0,min_length,max_length,total_rows,total_distinct_values
0,3,5,50,50


Unnamed: 0,customer_id,count
0,4260,1
1,25339,1
2,27654,1
3,3491,1
4,15481,1
5,29783,1
6,27380,1
7,27728,1
8,30184,1
9,11779,1


---------------------------------------------------
Column: sale_date
---------------------------------------------------


Unnamed: 0,min_length,max_length,total_rows,total_distinct_values
0,10,10,50,44


Unnamed: 0,sale_date,count
0,2020-08-23,3
1,2020-06-20,3
2,2020-06-18,2
3,2020-04-30,2
4,2020-09-16,1
5,2020-07-14,1
6,2020-08-29,1
7,2020-04-23,1
8,2020-04-27,1
9,2020-07-22,1


---------------------------------------------------
Column: total_amount
---------------------------------------------------


Unnamed: 0,min_length,max_length,total_rows,total_distinct_values
0,2,3,50,11


Unnamed: 0,total_amount,count
0,48,10
1,36,8
2,24,8
3,96,6
4,84,3
5,144,3
6,60,3
7,108,3
8,132,2
9,120,2


In [23]:
# profile each column of stage_1_line_items (stage_id is skipped by the function)
my_explore_staging_table("stage_1_line_items")


---------------------------------------------------
Exploring Columns for Table: stage_1_line_items
---------------------------------------------------

---------------------------------------------------
Column: store_id
---------------------------------------------------


Unnamed: 0,min_length,max_length,total_rows,total_distinct_values
0,1,1,176,5


Unnamed: 0,store_id,count
0,4,44
1,1,35
2,2,34
3,5,32
4,3,31


---------------------------------------------------
Column: sale_id
---------------------------------------------------


Unnamed: 0,min_length,max_length,total_rows,total_distinct_values
0,5,6,176,50


Unnamed: 0,sale_id,count
0,183530,7
1,169216,6
2,144107,6
3,203726,6
4,192337,5
5,144249,5
6,115633,5
7,111979,5
8,147541,5
9,109305,5


---------------------------------------------------
Column: line_item_id
---------------------------------------------------


Unnamed: 0,min_length,max_length,total_rows,total_distinct_values
0,1,1,176,7


Unnamed: 0,line_item_id,count
0,1,50
1,2,47
2,3,35
3,4,23
4,5,16
5,6,4
6,7,1


---------------------------------------------------
Column: product_id
---------------------------------------------------


Unnamed: 0,min_length,max_length,total_rows,total_distinct_values
0,1,1,176,8


Unnamed: 0,product_id,count
0,1,33
1,6,31
2,4,30
3,2,22
4,8,19
5,7,17
6,3,16
7,5,8


---------------------------------------------------
Column: quantity
---------------------------------------------------


Unnamed: 0,min_length,max_length,total_rows,total_distinct_values
0,1,1,176,4


Unnamed: 0,quantity,count
0,1,108
1,2,46
2,3,17
3,4,5


# Lab: Transforming Data by Parsing, Joining, Augmenting, Consolidating, and Filtering

## In our extractions so far, we have seen examples of Parsing, Joining, Augmenting, Consolidating, and Filtering:

## Parsing

* creating a receipt_number from store_id and sale_id
* extracting the date to a dow number
* extracting the date to a day of week string
* extracting the date to a month string
* creating first and last name in the same column, also with last name first

## Joining

* joining the stores table to the sales table
* joining the sales table to the customers table
* joining the sales table to the line items table
* joining the line items table to the products table

## Augmenting (joining to a table in a secondary dataset)

* joining the zip in customer table to the secondary dataset table zip_codes to pull the population, area, and density

## Consolidating (Aggregation)

* number of rows in a table
* number of distinct rows in a table
* max value of a column
* min value of a column
* average value of a column

## Filtering

* where clauses (pre-aggregation)
* having clauses (post-aggregation)


## Once data is clean, we can create another staging table with actual data types and copy the data into it

In [24]:
# remove any existing stage_2 tables; "if exists" lets this cell run whether
# or not the tables are already there
connection.rollback()

query = """

drop table if exists stage_2_customers;
drop table if exists stage_2_sales;
drop table if exists stage_2_line_items;

"""

cursor.execute(query)

connection.commit()



In [25]:
# create the stage_2 tables with real data types (numeric, date, and
# narrower varchar columns) in place of the all-varchar(100) stage_1 layout
# NOTE(review): the varchar(32) / numeric widths were presumably chosen from
# the max lengths found during exploration - confirm before loading other data
connection.rollback()

query = """


create table stage_2_customers (
  stage_id serial,
  customer_id numeric(6),
  first_name varchar(32),
  last_name varchar(32),
  street varchar(32),
  city varchar(32),
  state varchar(2),
  zip varchar(5),
  closest_store_id numeric(6),
  distance numeric(3)
);

create table stage_2_sales (
  stage_id serial,
  store_id numeric(6),
  sale_id numeric(8),
  customer_id numeric(6),
  sale_date date,
  total_amount numeric(5)
);

create table stage_2_line_items (
  stage_id serial,
  store_id numeric(6),
  sale_id numeric(8),
  line_item_id numeric(3),
  product_id numeric(3),
  quantity numeric(3)
);

"""

cursor.execute(query)

connection.commit()



In [26]:
# copy stage_1_customers into stage_2_customers, casting the numeric columns
# from varchar; stage_id is copied across so both stages share the same ids
# NOTE(review): because stage_id is supplied explicitly, the serial default
# of stage_2_customers is not used by this insert - confirm that later
# inserts do not rely on it
connection.rollback()

query = """

insert into stage_2_customers
(stage_id, customer_id, first_name, last_name, street, city, state, zip, closest_store_id, distance)
select stage_id,
       customer_id::numeric,
       first_name,
       last_name, 
       street,
       city,
       state,
       zip,
       closest_store_id::numeric,
       distance::numeric
from stage_1_customers
order by stage_id;

"""

cursor.execute(query)

connection.commit()



In [27]:
# Verify the copy: read back all rows of stage_2_customers, ordered by
# stage_id, as a pandas dataframe.
# With both flags True the helper rolls the connection back before and
# after running the query.
rollback_before_flag = True
rollback_after_flag = True

query = """

select * 
from stage_2_customers
order by stage_id;

"""

# Left as the last expression of the cell so the notebook shows the result.
my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,stage_id,customer_id,first_name,last_name,street,city,state,zip,closest_store_id,distance
0,1,563,Rose,Slimings,38 Iowa Street,Berkeley,CA,94704,1,1
1,2,1597,Norry,Macauley,654 Sommers Plaza,Oakland,CA,94612,1,3
2,3,1958,Theresina,Penswick,5975 Twin Pines Hill,Berkeley,CA,94707,1,3
3,4,1991,Kevon,Wickett,472 Arizona Court,Berkeley,CA,94707,1,3
4,5,3491,Siouxie,M'Quharge,747 Westridge Center,Alameda,CA,94501,1,6
5,6,4159,Cheryl,Broe,7 Ruskin Alley,El Sobrante,CA,94803,1,7
6,7,4198,Andreana,Drew,11039 Cordelia Alley,El Sobrante,CA,94803,1,7
7,8,4260,Dom,Risbrough,3 Northland Crossing,Richmond,CA,94805,1,7
8,9,5394,Katharina,Bavester,522 Cordelia Lane,San Francisco,CA,94102,1,10
9,10,6782,Lyndsay,Iuorio,4 Thackeray Road,Walnut Creek,CA,94596,1,12


## You try it - copy data from stage_1_sales to stage_2_sales and from stage_1_line_items to stage_2_line_items; query to verify

In [28]:
# Copy every row of stage_1_sales into stage_2_sales.
# Discard any transaction still open on the shared connection first.
connection.rollback()

# The id columns and total_amount are cast to numeric and sale_date is cast
# to date.
# stage_id is listed explicitly, so each row keeps its stage 1 stage_id
# instead of receiving a new value from the serial default.
# NOTE(review): running this cell again without first dropping and
# re-creating the table would insert the same rows a second time.
query = """

insert into stage_2_sales
(stage_id, store_id, sale_id, customer_id, sale_date, total_amount)
select stage_id,
       store_id::numeric, 
       sale_id::numeric, 
       customer_id::numeric, 
       sale_date::date, 
       total_amount::numeric
from stage_1_sales
order by stage_id;

"""

cursor.execute(query)

# Make the copied rows permanent.
connection.commit()


In [29]:
# Verify the copy: read back all rows of stage_2_sales, ordered by
# stage_id, as a pandas dataframe.
# With both flags True the helper rolls the connection back before and
# after running the query.
rollback_before_flag = True
rollback_after_flag = True

query = """

select * 
from stage_2_sales
order by stage_id;

"""

# Left as the last expression of the cell so the notebook shows the result.
my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,stage_id,store_id,sale_id,customer_id,sale_date,total_amount
0,1,1,128112,3491,2020-04-30,24
1,2,1,144249,1597,2020-05-16,84
2,3,1,163141,4159,2020-06-04,96
3,4,1,169216,4198,2020-06-09,144
4,5,1,179181,5394,2020-06-18,48
5,6,1,181897,1958,2020-06-20,48
6,7,1,248269,4260,2020-08-22,60
7,8,1,250031,6782,2020-08-23,24
8,9,1,255285,563,2020-08-29,36
9,10,1,263524,1991,2020-09-07,48


In [30]:
# Copy every row of stage_1_line_items into stage_2_line_items.
# Discard any transaction still open on the shared connection first.
connection.rollback()

# Every column except stage_id is cast to numeric.
# stage_id is listed explicitly, so each row keeps its stage 1 stage_id
# instead of receiving a new value from the serial default.
# NOTE(review): running this cell again without first dropping and
# re-creating the table would insert the same rows a second time.
query = """

insert into stage_2_line_items
(stage_id, store_id, sale_id, line_item_id, product_id, quantity)
select stage_id, 
       store_id::numeric, 
       sale_id::numeric, 
       line_item_id::numeric, 
       product_id::numeric, 
       quantity::numeric
from stage_1_line_items
order by stage_id;

"""

cursor.execute(query)

# Make the copied rows permanent.
connection.commit()


In [31]:
# Verify the copy: read back all rows of stage_2_line_items, ordered by
# stage_id, as a pandas dataframe.
# With both flags True the helper rolls the connection back before and
# after running the query.
rollback_before_flag = True
rollback_after_flag = True

query = """

select * 
from stage_2_line_items
order by stage_id;

"""

# Left as the last expression of the cell so the notebook shows the result.
my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,stage_id,store_id,sale_id,line_item_id,product_id,quantity
0,1,1,128112,1,1,1
1,2,1,128112,2,8,1
2,3,1,144249,1,1,1
3,4,1,144249,2,2,1
4,5,1,144249,3,4,2
...,...,...,...,...,...,...
171,172,5,147541,4,4,1
172,173,5,147541,5,7,1
173,174,5,157919,1,1,1
174,175,5,157919,2,4,2
