# Data Wrangling 1.2 Solutions

In [1]:
import csv

import math
import numpy as np
import pandas as pd

import psycopg2

In [2]:
#
# function to run a select query and return rows in a pandas dataframe
# pandas puts all numeric values from postgres to float
# if it will fit in an integer, change it to integer
#

def my_select_query_pandas(query, rollback_before_flag, rollback_after_flag):
    """Run a select query and return the rows in a pandas dataframe.

    The query and the optional rollbacks run on the module-level ``connection``.

    Parameters
    ----------
    query : str
        SQL text handed to ``pd.read_sql_query``.
    rollback_before_flag : bool
        When true, ``connection.rollback()`` is called before the query runs.
    rollback_after_flag : bool
        When true, ``connection.rollback()`` is called after the query runs.

    Returns
    -------
    pandas.DataFrame
        The query result. Every float64 column whose non-missing values are
        all finite whole numbers is converted to the nullable ``Int64`` dtype;
        all other columns are returned as read.
    """

    if rollback_before_flag:
        connection.rollback()

    df = pd.read_sql_query(query, connection)

    if rollback_after_flag:
        connection.rollback()

    # fix the float columns that really should be integers

    for column in df:

        if df[column].dtype == "float64":

            # missing values are ignored when deciding; Int64 can represent them
            present = df[column].dropna().to_numpy()

            # infinities are tested first: they are not integers, and a column
            # holding one must stay float instead of raising during the check
            whole_numbers = (
                np.isfinite(present).all()
                and (present == np.floor(present)).all()
            )

            if whole_numbers:
                df[column] = df[column].astype('Int64')

    return df
    

In [3]:
# open the postgres connection used by every later cell in this notebook
# NOTE(review): the user name and password are hard-coded here - consider reading
# them from environment variables if this notebook is shared outside the lab setup
connection = psycopg2.connect(
    user = "postgres",
    password = "ucb",
    host = "postgres",
    port = "5432",
    database = "postgres"
)

In [4]:
# cursor used by later cells to execute the copy commands
cursor = connection.cursor()

## You try it - read and print out some lines from the following csv files: temp_line_items.csv, temp_customers.csv, temp_products.csv, temp_holidays.csv

In [5]:
def my_read_csv_file(file_name, limit):
    """Read the csv file and print only the first limit rows.

    Parameters
    ----------
    file_name : str
        Path of the csv file to read.
    limit : int
        Maximum number of rows to print; the header line counts as a row.

    Returns
    -------
    None
        Each printed row is a list of strings, followed by a summary line
        giving how many lines were printed and how many the file holds.
    """

    # the with-block closes the file even if reading fails part way through;
    # newline="" lets the csv module handle line endings inside quoted fields
    with open(file_name, "r", newline="") as csv_file:

        csv_data = csv.reader(csv_file)

        i = 0

        # keep iterating past the limit so the total line count can be reported
        for row in csv_data:
            i += 1
            if i <= limit:
                print(row)

    print("\nPrinted ", min(limit, i), "lines of ", i, "total lines.")

In [6]:
# print the first 10 lines (header included) of temp_line_items.csv
my_read_csv_file("temp_line_items.csv", limit=10)

['store_id', 'sale_id', 'line_item_id', 'product_id', 'quantity']
['1', '128112', '1', '1', '1']
['1', '128112', '2', '8', '1']
['1', '144249', '1', '1', '1']
['1', '144249', '2', '2', '1']
['1', '144249', '3', '4', '2']
['1', '144249', '4', '6', '1']
['1', '144249', '5', '8', '2']
['1', '163141', '1', '1', '3']
['1', '163141', '2', '3', '1']

Printed  10 lines of  177 total lines.


In [7]:
# print the first 10 lines (header included) of temp_customers.csv
my_read_csv_file("temp_customers.csv", limit=10)

['customer_id', 'first_name', 'last_name', 'street', 'city', 'state', 'zip', 'closest_store_id', 'distance']
['563', 'Rose', 'Slimings', '38 Iowa Street', 'Berkeley', 'CA', '94704', '1', '1']
['1597', 'Norry', 'Macauley', '654 Sommers Plaza', 'Oakland', 'CA', '94612', '1', '3']
['1958', 'Theresina', 'Penswick', '5975 Twin Pines Hill', 'Berkeley', 'CA', '94707', '1', '3']
['1991', 'Kevon', 'Wickett', '472 Arizona Court', 'Berkeley', 'CA', '94707', '1', '3']
['3491', 'Siouxie', "M'Quharge", '747 Westridge Center', 'Alameda', 'CA', '94501', '1', '6']
['4159', 'Cheryl', 'Broe', '7 Ruskin Alley', 'El Sobrante', 'CA', '94803', '1', '7']
['4198', 'Andreana', 'Drew', '11039 Cordelia Alley', 'El Sobrante', 'CA', '94803', '1', '7']
['4260', 'Dom', 'Risbrough', '3 Northland Crossing', 'Richmond', 'CA', '94805', '1', '7']
['5394', 'Katharina', 'Bavester', '522 Cordelia Lane', 'San Francisco', 'CA', '94102', '1', '10']

Printed  10 lines of  51 total lines.


In [8]:
# print up to 10 lines (header included) of temp_products.csv
my_read_csv_file("temp_products.csv", limit=10)

['product_id', 'description']
['1', 'Pistachio Salmon']
['2', 'Teriyaki Chicken']
['3', 'Spinach Orzo']
['4', 'Eggplant Lasagna']
['5', 'Chicken Salad']
['6', 'Curry Chicken']
['7', 'Tilapia Piccata']
['8', 'Brocolli Stir Fry']

Printed  9 lines of  9 total lines.


In [9]:
# the limit of 100 is larger than the file, so every line of temp_holidays.csv is printed
my_read_csv_file("temp_holidays.csv", limit=100)

['holiday_date', 'description', 'closed_flag']
['2020-01-01', "New Year's Day", 'f']
['2020-01-20', 'MLK Day', 'f']
['2020-02-17', "President's Day", 'f']
['2020-04-12', 'Easter', 'f']
['2020-05-10', "Mother's Day", 'f']
['2020-05-25', 'Memorial Day', 'f']
['2020-06-21', "Father's Day", 'f']
['2020-07-04', 'Independence Day', 'f']
['2020-09-07', 'Labor Day', 'f']
['2020-11-11', 'Veterans Days', 'f']
['2020-11-26', 'Thanksgiving', 't']
['2020-12-25', 'Christmas', 't']

Printed  13 lines of  13 total lines.


## You try it - 
* load the file temp_holidays.csv into the table temp_holidays 
* temp_products.csv into table temp_products
* temp_line_items.csv into table temp_line_items
* verify the loads with a query

In [10]:
#
# load the csv files into the database tables in foreign key order
#
# all three copy commands are sent in one execute call, followed by one commit;
# the rollback first clears any transaction left open on the connection
#
# NOTE(review): the recorded output of this cell is a UniqueViolation on
# temp_holidays_pkey, so the table apparently already held rows when it ran -
# presumably the tables must be emptied before this cell can be re-run; confirm
#

connection.rollback()

query = """

copy temp_holidays
from '/user/labs/week_06/temp_holidays.csv' delimiter ',' NULL '' csv header;

copy temp_products
from '/user/labs/week_06/temp_products.csv' delimiter ',' NULL '' csv header;

copy temp_line_items
from '/user/labs/week_06/temp_line_items.csv' delimiter ',' NULL '' csv header;


"""

cursor.execute(query)

connection.commit()

UniqueViolation: duplicate key value violates unique constraint "temp_holidays_pkey"
DETAIL:  Key (holiday_date)=(2020-01-01) already exists.
CONTEXT:  COPY temp_holidays, line 2


In [11]:
# verify the load: select every row of temp_holidays
# both flags are True, so the connection is rolled back before and after the query
rollback_before_flag = True
rollback_after_flag = True

query = """

select * 
from temp_holidays;

"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,holiday_date,description,closed_flag
0,2020-01-01,New Year's Day,False
1,2020-01-20,MLK Day,False
2,2020-02-17,President's Day,False
3,2020-04-12,Easter,False
4,2020-05-10,Mother's Day,False
5,2020-05-25,Memorial Day,False
6,2020-06-21,Father's Day,False
7,2020-07-04,Independence Day,False
8,2020-09-07,Labor Day,False
9,2020-11-11,Veterans Days,False


In [12]:
# verify the load: select every row of temp_products
# both flags are True, so the connection is rolled back before and after the query
rollback_before_flag = True
rollback_after_flag = True

query = """

select * 
from temp_products;

"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,product_id,description
0,1,Pistachio Salmon
1,2,Teriyaki Chicken
2,3,Spinach Orzo
3,4,Eggplant Lasagna
4,5,Chicken Salad
5,6,Curry Chicken
6,7,Tilapia Piccata
7,8,Brocolli Stir Fry


In [13]:
# verify the load: select every row of temp_line_items
# both flags are True, so the connection is rolled back before and after the query
rollback_before_flag = True
rollback_after_flag = True

query = """

select * 
from temp_line_items;

"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,store_id,sale_id,line_item_id,product_id,quantity
0,1,128112,1,1,1
1,1,128112,2,8,1
2,1,144249,1,1,1
3,1,144249,2,2,1
4,1,144249,3,4,2
...,...,...,...,...,...
171,5,147541,4,4,1
172,5,147541,5,7,1
173,5,157919,1,1,1
174,5,157919,2,4,2


## You try it - 
* extract the table temp_sales to temp_sales_2.csv
* table temp_line_items to temp_line_items_2.csv
* table temp_customers to temp_customers_2.csv 
* table temp_products to temp_products_2.csv
* table temp_holidays to temp_holidays_2.csv
* verify by reading the csv files

In [14]:
# extract each of the five tables to its own *_2.csv file
# every copy wraps a select with an order by, which fixes the row order written to the file
# all five copy commands are sent in one execute call, followed by one commit
connection.rollback()
    
query = """
    
copy (select * 
      from temp_sales 
      order by store_id, sale_id)
to '/user/labs/week_06/temp_sales_2.csv' delimiter ',' NULL '' csv header;

copy (select * 
      from temp_line_items 
      order by store_id, sale_id, line_item_id)
to '/user/labs/week_06/temp_line_items_2.csv' delimiter ',' NULL '' csv header;

copy (select * 
      from temp_customers
      order by customer_id)
to '/user/labs/week_06/temp_customers_2.csv' delimiter ',' NULL '' csv header;

copy (select * 
      from temp_products
      order by product_id)
to '/user/labs/week_06/temp_products_2.csv' delimiter ',' NULL '' csv header;

copy (select * 
      from temp_holidays
      order by holiday_date)
to '/user/labs/week_06/temp_holidays_2.csv' delimiter ',' NULL '' csv header;


"""

cursor.execute(query)
    
connection.commit()


In [15]:
# verify the extract: print the first 10 lines (header included) of temp_sales_2.csv
my_read_csv_file("temp_sales_2.csv", limit=10)

['store_id', 'sale_id', 'customer_id', 'sale_date', 'total_amount']
['1', '128112', '3491', '2020-04-30', '24']
['1', '144249', '1597', '2020-05-16', '84']
['1', '163141', '4159', '2020-06-04', '96']
['1', '169216', '4198', '2020-06-09', '144']
['1', '179181', '5394', '2020-06-18', '48']
['1', '181897', '1958', '2020-06-20', '48']
['1', '248269', '4260', '2020-08-22', '60']
['1', '250031', '6782', '2020-08-23', '24']
['1', '255285', '563', '2020-08-29', '36']

Printed  10 lines of  51 total lines.


In [16]:
# verify the extract: print the first 10 lines (header included) of temp_line_items_2.csv
my_read_csv_file("temp_line_items_2.csv", limit=10)

['store_id', 'sale_id', 'line_item_id', 'product_id', 'quantity']
['1', '128112', '1', '1', '1']
['1', '128112', '2', '8', '1']
['1', '144249', '1', '1', '1']
['1', '144249', '2', '2', '1']
['1', '144249', '3', '4', '2']
['1', '144249', '4', '6', '1']
['1', '144249', '5', '8', '2']
['1', '163141', '1', '1', '3']
['1', '163141', '2', '3', '1']

Printed  10 lines of  177 total lines.


In [17]:
# verify the extract: print the first 10 lines (header included) of temp_customers_2.csv
my_read_csv_file("temp_customers_2.csv", limit=10)

['customer_id', 'first_name', 'last_name', 'street', 'city', 'state', 'zip', 'closest_store_id', 'distance']
['563', 'Rose', 'Slimings', '38 Iowa Street', 'Berkeley', 'CA', '94704', '1', '1']
['1597', 'Norry', 'Macauley', '654 Sommers Plaza', 'Oakland', 'CA', '94612', '1', '3']
['1958', 'Theresina', 'Penswick', '5975 Twin Pines Hill', 'Berkeley', 'CA', '94707', '1', '3']
['1991', 'Kevon', 'Wickett', '472 Arizona Court', 'Berkeley', 'CA', '94707', '1', '3']
['3491', 'Siouxie', "M'Quharge", '747 Westridge Center', 'Alameda', 'CA', '94501', '1', '6']
['4159', 'Cheryl', 'Broe', '7 Ruskin Alley', 'El Sobrante', 'CA', '94803', '1', '7']
['4198', 'Andreana', 'Drew', '11039 Cordelia Alley', 'El Sobrante', 'CA', '94803', '1', '7']
['4260', 'Dom', 'Risbrough', '3 Northland Crossing', 'Richmond', 'CA', '94805', '1', '7']
['5394', 'Katharina', 'Bavester', '522 Cordelia Lane', 'San Francisco', 'CA', '94102', '1', '10']

Printed  10 lines of  51 total lines.


In [18]:
# verify the extract: print up to 10 lines (header included) of temp_products_2.csv
my_read_csv_file("temp_products_2.csv", limit=10)

['product_id', 'description']
['1', 'Pistachio Salmon']
['2', 'Teriyaki Chicken']
['3', 'Spinach Orzo']
['4', 'Eggplant Lasagna']
['5', 'Chicken Salad']
['6', 'Curry Chicken']
['7', 'Tilapia Piccata']
['8', 'Brocolli Stir Fry']

Printed  9 lines of  9 total lines.


In [19]:
# verify the extract: print the first 10 lines (header included) of temp_holidays_2.csv
my_read_csv_file("temp_holidays_2.csv", limit=10)

['holiday_date', 'description', 'closed_flag']
['2020-01-01', "New Year's Day", 'f']
['2020-01-20', 'MLK Day', 'f']
['2020-02-17', "President's Day", 'f']
['2020-04-12', 'Easter', 'f']
['2020-05-10', "Mother's Day", 'f']
['2020-05-25', 'Memorial Day', 'f']
['2020-06-21', "Father's Day", 'f']
['2020-07-04', 'Independence Day', 'f']
['2020-09-07', 'Labor Day', 'f']

Printed  10 lines of  13 total lines.
