# Data Wrangling 1.2

In [1]:
import csv
import math
import os

import numpy as np
import pandas as pd
import psycopg2


In [2]:
#
# function to run a select query and return rows in a pandas dataframe
# pandas puts all numeric values from postgres to float
# if it will fit in an integer, change it to integer
#

def my_select_query_pandas(query, rollback_before_flag, rollback_after_flag):
    """Run a select query and return the rows in a pandas dataframe.

    Uses the module-level ``connection`` object.

    Parameters
    ----------
    query : str
        SQL text handed to ``pd.read_sql_query``.
    rollback_before_flag : bool
        When true, ``connection.rollback()`` is called before the query runs.
    rollback_after_flag : bool
        When true, ``connection.rollback()`` is called after the query runs.

    Returns
    -------
    pandas.DataFrame
        The query result.  Every ``float64`` column whose non-missing values
        are all finite whole numbers inside the int64 range is converted to
        the nullable ``Int64`` dtype; all other columns are left as returned.
    """

    if rollback_before_flag:
        connection.rollback()

    df = pd.read_sql_query(query, connection)

    if rollback_after_flag:
        connection.rollback()

    # fix the float columns that really should be integers

    # int64 holds values in [-2**63, 2**63); anything outside cannot be cast
    int64_lower = -(2.0 ** 63)
    int64_upper = 2.0 ** 63

    for column in df:

        if df[column].dtype != "float64":
            continue

        # missing values are ignored for the check; Int64 stores them as <NA>
        present = df[column].dropna().to_numpy()

        # infinities are rejected first: they have no integer representation
        # (and math.floor(inf) would raise OverflowError)
        fits_in_integer = (
            np.isfinite(present).all()
            and (present == np.floor(present)).all()
            and ((present >= int64_lower) & (present < int64_upper)).all()
        )

        if fits_in_integer:
            df[column] = df[column].astype('Int64')

    return df
    

In [3]:
# Connection settings are read from the standard libpq environment variables
# (PGUSER, PGPASSWORD, PGHOST, PGPORT, PGDATABASE) when they are set, so real
# credentials never have to be written into the notebook.  The fallbacks are
# the lab defaults, so the cell behaves as before when nothing is set.
connection = psycopg2.connect(
    user=os.environ.get("PGUSER", "postgres"),
    password=os.environ.get("PGPASSWORD", "ucb"),
    host=os.environ.get("PGHOST", "postgres"),
    port=os.environ.get("PGPORT", "5432"),
    # dbname is the documented keyword; "database" is a deprecated alias
    dbname=os.environ.get("PGDATABASE", "postgres"),
)

In [4]:
# one cursor on the shared connection; the cells below that run
# cursor.execute(...) for drop / create / copy statements all reuse it
cursor = connection.cursor()

# Lab: Reading CSV Files

In [5]:
def my_read_csv_file(file_name, limit):
    """Read the csv file and print only the first limit rows.

    Every row of the file is read (the header row counts as a row) so that
    the total can be reported; only the first ``limit`` rows are printed,
    each as a list of strings.  A summary line with the number of rows
    printed and the total number of rows is printed at the end.

    Parameters
    ----------
    file_name : str
        Path of the csv file to read.
    limit : int
        Maximum number of rows to print.

    Returns
    -------
    None
    """

    row_count = 0

    # newline="" is what the csv module expects: it leaves line endings inside
    # quoted fields untouched.  The with-block closes the file even when
    # reading fails part way through.
    with open(file_name, "r", newline="") as csv_file:
        for row_count, row in enumerate(csv.reader(csv_file), start=1):
            if row_count <= limit:
                print(row)

    print("\nPrinted ", min(limit, row_count), "lines of ", row_count, "total lines.")

In [6]:
# preview temp_stores.csv: at most the first 10 rows, header row included
my_read_csv_file("temp_stores.csv", limit=10)

['store_id', 'street', 'city', 'state', 'zip', 'latitude', 'longitude']
['1', '3000 Telegraph Ave', 'Berkeley', 'CA', '94705', '37.8555', '-122.2604']
['2', '1001 Broadway', 'Seattle', 'WA', '98122', '47.6114', '-122.3214']
['3', '2510 McKinney Ave', 'Dallas', 'TX', '75201', '32.7958', '-96.8015']
['4', '299 SE 3rd Ave', 'Miami', 'FL', '33131', '25.7720', '-80.1891']
['5', '1202 Broadway', 'Nashville', 'TN', '37203', '36.1568', '-86.7881']

Printed  6 lines of  6 total lines.


In [7]:
# preview temp_sales.csv: at most the first 10 rows, header row included
my_read_csv_file("temp_sales.csv", limit=10)

['store_id', 'sale_id', 'customer_id', 'sale_date', 'total_amount']
['1', '128112', '3491', '2020-04-30', '24']
['1', '144249', '1597', '2020-05-16', '84']
['1', '163141', '4159', '2020-06-04', '96']
['1', '169216', '4198', '2020-06-09', '144']
['1', '179181', '5394', '2020-06-18', '48']
['1', '181897', '1958', '2020-06-20', '48']
['1', '248269', '4260', '2020-08-22', '60']
['1', '250031', '6782', '2020-08-23', '24']
['1', '255285', '563', '2020-08-29', '36']

Printed  10 lines of  51 total lines.


In [8]:
# a limit of 100 prints the whole file as long as it has at most 100 lines
my_read_csv_file("temp_random_sales.csv", limit=100)

['store_id', 'sale_id']
['1', '128112']
['1', '144249']
['1', '163141']
['1', '169216']
['1', '179181']
['1', '181897']
['1', '248269']
['1', '250031']
['1', '255285']
['1', '263524']
['2', '105004']
['2', '109083']
['2', '115446']
['2', '115912']
['2', '119996']
['2', '144107']
['2', '158290']
['2', '203726']
['2', '217889']
['2', '218323']
['3', '99402']
['3', '131547']
['3', '131827']
['3', '134868']
['3', '154102']
['3', '164376']
['3', '177427']
['3', '183488']
['3', '186881']
['3', '209184']
['4', '89887']
['4', '111979']
['4', '115633']
['4', '137286']
['4', '138546']
['4', '143587']
['4', '177033']
['4', '177233']
['4', '183530']
['4', '192337']
['5', '88989']
['5', '104373']
['5', '105691']
['5', '107715']
['5', '109305']
['5', '126722']
['5', '136553']
['5', '136735']
['5', '147541']
['5', '157919']

Printed  51 lines of  51 total lines.


## You try it - read and print out some lines from the following csv files: temp_line_items.csv, temp_customers.csv, temp_products.csv, temp_holidays.csv

In [9]:
# preview temp_line_items.csv: at most the first 10 rows, header row included
my_read_csv_file("temp_line_items.csv", limit=10)

['store_id', 'sale_id', 'line_item_id', 'product_id', 'quantity']
['1', '128112', '1', '1', '1']
['1', '128112', '2', '8', '1']
['1', '144249', '1', '1', '1']
['1', '144249', '2', '2', '1']
['1', '144249', '3', '4', '2']
['1', '144249', '4', '6', '1']
['1', '144249', '5', '8', '2']
['1', '163141', '1', '1', '3']
['1', '163141', '2', '3', '1']

Printed  10 lines of  177 total lines.


In [10]:
# preview temp_customers.csv: at most the first 10 rows, header row included
my_read_csv_file("temp_customers.csv", limit=10)

['customer_id', 'first_name', 'last_name', 'street', 'city', 'state', 'zip', 'closest_store_id', 'distance']
['563', 'Rose', 'Slimings', '38 Iowa Street', 'Berkeley', 'CA', '94704', '1', '1']
['1597', 'Norry', 'Macauley', '654 Sommers Plaza', 'Oakland', 'CA', '94612', '1', '3']
['1958', 'Theresina', 'Penswick', '5975 Twin Pines Hill', 'Berkeley', 'CA', '94707', '1', '3']
['1991', 'Kevon', 'Wickett', '472 Arizona Court', 'Berkeley', 'CA', '94707', '1', '3']
['3491', 'Siouxie', "M'Quharge", '747 Westridge Center', 'Alameda', 'CA', '94501', '1', '6']
['4159', 'Cheryl', 'Broe', '7 Ruskin Alley', 'El Sobrante', 'CA', '94803', '1', '7']
['4198', 'Andreana', 'Drew', '11039 Cordelia Alley', 'El Sobrante', 'CA', '94803', '1', '7']
['4260', 'Dom', 'Risbrough', '3 Northland Crossing', 'Richmond', 'CA', '94805', '1', '7']
['5394', 'Katharina', 'Bavester', '522 Cordelia Lane', 'San Francisco', 'CA', '94102', '1', '10']

Printed  10 lines of  51 total lines.


In [11]:
# preview temp_products.csv: at most the first 10 rows, header row included
my_read_csv_file("temp_products.csv", limit=10)

['product_id', 'description']
['1', 'Pistachio Salmon']
['2', 'Teriyaki Chicken']
['3', 'Spinach Orzo']
['4', 'Eggplant Lasagna']
['5', 'Chicken Salad']
['6', 'Curry Chicken']
['7', 'Tilapia Piccata']
['8', 'Brocolli Stir Fry']

Printed  9 lines of  9 total lines.


In [12]:
# preview temp_holidays.csv: at most the first 10 rows, header row included
my_read_csv_file("temp_holidays.csv", limit=10)

['holiday_date', 'description', 'closed_flag']
['2020-01-01', "New Year's Day", 'f']
['2020-01-20', 'MLK Day', 'f']
['2020-02-17', "President's Day", 'f']
['2020-04-12', 'Easter', 'f']
['2020-05-10', "Mother's Day", 'f']
['2020-05-25', 'Memorial Day', 'f']
['2020-06-21', "Father's Day", 'f']
['2020-07-04', 'Independence Day', 'f']
['2020-09-07', 'Labor Day', 'f']

Printed  10 lines of  13 total lines.


# Lab: Loading CSV Data into Database Tables

In [13]:
#
# drop all the temp tables in the foreign key order:
# a table that references another one is dropped before the table it
# references (temp_line_items -> temp_products, temp_sales -> temp_customers,
# temp_customers -> temp_stores)
#
# "if exists" makes the cell safe to run when some or all tables are missing
#

# discard any open (possibly failed) transaction before starting
connection.rollback()

query = """

drop table if exists temp_line_items;
drop table if exists temp_sales;
drop table if exists temp_products;
drop table if exists temp_customers;
drop table if exists temp_stores;
drop table if exists temp_holidays;
drop table if exists temp_random_sales;


"""

# all the drop statements are sent in one execute call
cursor.execute(query)

# make the drops permanent
connection.commit()



In [14]:
#
# create all the temp tables in the foreign key order:
# a referenced table is created before the table that references it
# (temp_stores before temp_customers, temp_customers before temp_sales,
# temp_products before temp_line_items)
#
# the statements are plain "create table" with no "if not exists", so this
# cell fails if the tables already exist - run the drop cell first
#
# NOTE(review): temp_sales.store_id and temp_line_items (store_id, sale_id)
# have no foreign key declared, and temp_random_sales has no primary key -
# confirm this is intentional
#

# discard any open (possibly failed) transaction before starting
connection.rollback()

query = """

create table temp_holidays (
  holiday_date date,
  description varchar(32),
  closed_flag boolean,
  primary key (holiday_date)
);

create table temp_products (
  product_id numeric(3),
  description varchar(32),
  primary key (product_id)
);

create table temp_stores (
  store_id numeric(6),
  street varchar(32),
  city varchar(32),
  state varchar(2),
  zip varchar(5),
  latitude numeric(7,4),
  longitude numeric(7,4),
  primary key (store_id)
);

create table temp_customers (
  customer_id numeric(6),
  first_name varchar(32),
  last_name varchar(32),
  street varchar(32),
  city varchar(32),
  state varchar(2),
  zip varchar(5),
  closest_store_id numeric(6),
  distance numeric(3),
  primary key (customer_id),
  foreign key (closest_store_id) references temp_stores(store_id)
);

create table temp_sales (
  store_id numeric(6),
  sale_id numeric(8),
  customer_id numeric(6),
  sale_date date,
  total_amount numeric(5),
  primary key (store_id, sale_id),
  foreign key (customer_id) references temp_customers (customer_id)
);

create table temp_line_items (
  store_id numeric(6),
  sale_id numeric(8),
  line_item_id numeric(3),
  product_id numeric(3),
  quantity numeric(3),
  primary key (store_id, sale_id, line_item_id),
  foreign key (product_id) references temp_products (product_id)
);

create table temp_random_sales (
  store_id numeric(6),
  sale_id numeric(8)
)

"""

# all the create statements are sent in one execute call
cursor.execute(query)

# make the new tables permanent
connection.commit()

In [15]:
#
# load the csv files into the database tables in foreign key order:
# temp_stores before temp_customers, temp_customers before temp_sales,
# because each one is referenced by the next
#
# "copy ... from '<path>'" names a file path that is read by the postgres
# server, so /user/labs/week_06 has to be visible to the database server
# "csv header" skips the first line of each file (the column names)
# "NULL ''" loads an unquoted empty field as NULL
#

# discard any open (possibly failed) transaction before starting
connection.rollback()

query = """

copy temp_stores
from '/user/labs/week_06/temp_stores.csv' delimiter ',' NULL '' csv header;

copy temp_customers
from '/user/labs/week_06/temp_customers.csv' delimiter ',' NULL '' csv header;

copy temp_sales
from '/user/labs/week_06/temp_sales.csv' delimiter ',' NULL '' csv header;

copy temp_random_sales
from '/user/labs/week_06/temp_random_sales.csv' delimiter ',' NULL '' csv header;


"""

# all four loads run in one transaction
cursor.execute(query)

# make the loaded rows permanent
connection.commit()

In [16]:
# ask the helper to call connection.rollback() both before and after the query
rollback_before_flag = True
rollback_after_flag = True

# verify the temp_stores load; there is no "order by", so the row order is
# whatever postgres returns
query = """

select * 
from temp_stores;

"""

# last expression in the cell, so the returned dataframe is displayed
my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,store_id,street,city,state,zip,latitude,longitude
0,1,3000 Telegraph Ave,Berkeley,CA,94705,37.8555,-122.2604
1,2,1001 Broadway,Seattle,WA,98122,47.6114,-122.3214
2,3,2510 McKinney Ave,Dallas,TX,75201,32.7958,-96.8015
3,4,299 SE 3rd Ave,Miami,FL,33131,25.772,-80.1891
4,5,1202 Broadway,Nashville,TN,37203,36.1568,-86.7881


In [17]:
# ask the helper to call connection.rollback() both before and after the query
rollback_before_flag = True
rollback_after_flag = True

# verify the temp_customers load; there is no "order by", so the row order is
# whatever postgres returns
query = """

select * 
from temp_customers;

"""

# last expression in the cell, so the returned dataframe is displayed
my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,customer_id,first_name,last_name,street,city,state,zip,closest_store_id,distance
0,563,Rose,Slimings,38 Iowa Street,Berkeley,CA,94704,1,1
1,1597,Norry,Macauley,654 Sommers Plaza,Oakland,CA,94612,1,3
2,1958,Theresina,Penswick,5975 Twin Pines Hill,Berkeley,CA,94707,1,3
3,1991,Kevon,Wickett,472 Arizona Court,Berkeley,CA,94707,1,3
4,3491,Siouxie,M'Quharge,747 Westridge Center,Alameda,CA,94501,1,6
5,4159,Cheryl,Broe,7 Ruskin Alley,El Sobrante,CA,94803,1,7
6,4198,Andreana,Drew,11039 Cordelia Alley,El Sobrante,CA,94803,1,7
7,4260,Dom,Risbrough,3 Northland Crossing,Richmond,CA,94805,1,7
8,5394,Katharina,Bavester,522 Cordelia Lane,San Francisco,CA,94102,1,10
9,6782,Lyndsay,Iuorio,4 Thackeray Road,Walnut Creek,CA,94596,1,12


In [18]:
# ask the helper to call connection.rollback() both before and after the query
rollback_before_flag = True
rollback_after_flag = True

# verify the temp_sales load; there is no "order by", so the row order is
# whatever postgres returns
query = """

select * 
from temp_sales;

"""

# last expression in the cell, so the returned dataframe is displayed
my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,store_id,sale_id,customer_id,sale_date,total_amount
0,1,128112,3491,2020-04-30,24
1,1,144249,1597,2020-05-16,84
2,1,163141,4159,2020-06-04,96
3,1,169216,4198,2020-06-09,144
4,1,179181,5394,2020-06-18,48
5,1,181897,1958,2020-06-20,48
6,1,248269,4260,2020-08-22,60
7,1,250031,6782,2020-08-23,24
8,1,255285,563,2020-08-29,36
9,1,263524,1991,2020-09-07,48


In [19]:
# ask the helper to call connection.rollback() both before and after the query
rollback_before_flag = True
rollback_after_flag = True

# verify the temp_random_sales load; there is no "order by", so the row order
# is whatever postgres returns
query = """

select * 
from temp_random_sales;

"""

# last expression in the cell, so the returned dataframe is displayed
my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,store_id,sale_id
0,1,128112
1,1,144249
2,1,163141
3,1,169216
4,1,179181
5,1,181897
6,1,248269
7,1,250031
8,1,255285
9,1,263524


## You try it - 
* load the file temp_holidays.csv into the table temp_holidays 
* temp_products.csv into table temp_products
* temp_line_items.csv into table temp_line_items 
* verify the loads with a query

In [20]:
# load the three remaining csv files
# temp_products is loaded before temp_line_items because temp_line_items has
# a foreign key to temp_products; temp_holidays references nothing
# the copy options are the same as in the earlier load cell: the path is read
# by the postgres server, "csv header" skips the first line, and "NULL ''"
# loads an unquoted empty field as NULL

# discard any open (possibly failed) transaction before starting
connection.rollback()

query = """

copy temp_holidays
from '/user/labs/week_06/temp_holidays.csv' delimiter ',' NULL '' csv header;

copy temp_products
from '/user/labs/week_06/temp_products.csv' delimiter ',' NULL '' csv header;

copy temp_line_items
from '/user/labs/week_06/temp_line_items.csv' delimiter ',' NULL '' csv header;

"""

# all three loads run in one transaction
cursor.execute(query)

# make the loaded rows permanent
connection.commit()

In [21]:
# ask the helper to call connection.rollback() both before and after the query
rollback_before_flag = True
rollback_after_flag = True

# verify the temp_holidays load; there is no "order by", so the row order is
# whatever postgres returns
query = """

select * 
from temp_holidays;

"""

# last expression in the cell, so the returned dataframe is displayed
my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,holiday_date,description,closed_flag
0,2020-01-01,New Year's Day,False
1,2020-01-20,MLK Day,False
2,2020-02-17,President's Day,False
3,2020-04-12,Easter,False
4,2020-05-10,Mother's Day,False
5,2020-05-25,Memorial Day,False
6,2020-06-21,Father's Day,False
7,2020-07-04,Independence Day,False
8,2020-09-07,Labor Day,False
9,2020-11-11,Veterans Days,False


In [22]:
# ask the helper to call connection.rollback() both before and after the query
rollback_before_flag = True
rollback_after_flag = True

# verify the temp_products load; there is no "order by", so the row order is
# whatever postgres returns
query = """

select * 
from temp_products;

"""

# last expression in the cell, so the returned dataframe is displayed
my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,product_id,description
0,1,Pistachio Salmon
1,2,Teriyaki Chicken
2,3,Spinach Orzo
3,4,Eggplant Lasagna
4,5,Chicken Salad
5,6,Curry Chicken
6,7,Tilapia Piccata
7,8,Brocolli Stir Fry


In [23]:
# ask the helper to call connection.rollback() both before and after the query
rollback_before_flag = True
rollback_after_flag = True

# verify the temp_line_items load; there is no "order by", so the row order is
# whatever postgres returns
query = """

select * 
from temp_line_items;

"""

# last expression in the cell, so the returned dataframe is displayed
my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,store_id,sale_id,line_item_id,product_id,quantity
0,1,128112,1,1,1
1,1,128112,2,8,1
2,1,144249,1,1,1
3,1,144249,2,2,1
4,1,144249,3,4,2
...,...,...,...,...,...
171,5,147541,4,4,1
172,5,147541,5,7,1
173,5,157919,1,1,1
174,5,157919,2,4,2


# Lab: Extracting CSV Files

In [24]:
# extract two tables to csv files
# "copy (select ...) to '<path>'" writes the query result to a file path on
# the postgres server side; "csv header" writes the column names as the first
# line and "NULL ''" writes NULL as an empty field
# each select is ordered by the table's key columns so the file has a stable
# row order

# discard any open (possibly failed) transaction before starting
connection.rollback()
    
query = """
    
copy (select * 
      from temp_random_sales
      order by store_id, sale_id)
to '/user/labs/week_06/temp_random_sales_2.csv' delimiter ',' NULL '' csv header;

copy (select * 
      from temp_stores 
      order by store_id)
to '/user/labs/week_06/temp_stores_2.csv' delimiter ',' NULL '' csv header;


"""

# both extracts are sent in one execute call
cursor.execute(query)
    
# end the transaction opened by the execute call
connection.commit()


In [25]:
# verify the extract: at most the first 10 rows of temp_random_sales_2.csv
my_read_csv_file("temp_random_sales_2.csv", limit=10)

['store_id', 'sale_id']
['1', '128112']
['1', '144249']
['1', '163141']
['1', '169216']
['1', '179181']
['1', '181897']
['1', '248269']
['1', '250031']
['1', '255285']

Printed  10 lines of  51 total lines.


In [26]:
# verify the extract: at most the first 10 rows of temp_stores_2.csv
my_read_csv_file("temp_stores_2.csv", limit=10)

['store_id', 'street', 'city', 'state', 'zip', 'latitude', 'longitude']
['1', '3000 Telegraph Ave', 'Berkeley', 'CA', '94705', '37.8555', '-122.2604']
['2', '1001 Broadway', 'Seattle', 'WA', '98122', '47.6114', '-122.3214']
['3', '2510 McKinney Ave', 'Dallas', 'TX', '75201', '32.7958', '-96.8015']
['4', '299 SE 3rd Ave', 'Miami', 'FL', '33131', '25.7720', '-80.1891']
['5', '1202 Broadway', 'Nashville', 'TN', '37203', '36.1568', '-86.7881']

Printed  6 lines of  6 total lines.


## You try it - 
* extract the table temp_sales to temp_sales_2.csv
* table temp_line_items to temp_line_items_2.csv
* table temp_customers to temp_customers_2.csv 
* table temp_products to temp_products_2.csv
* table temp_holidays to temp_holidays_2.csv
* verify by reading the csv files

In [27]:
# extract the five remaining tables to csv files
# "copy (select ...) to '<path>'" writes the query result to a file path on
# the postgres server side; "csv header" writes the column names as the first
# line and "NULL ''" writes NULL as an empty field
# every select is ordered by the table's full primary key so the row order in
# each file is deterministic

# discard any open (possibly failed) transaction before starting
connection.rollback()

query = """
    
copy (select * 
      from temp_sales
      order by store_id, sale_id)
to '/user/labs/week_06/temp_sales_2.csv' delimiter ',' NULL '' csv header;

copy (select * 
      from temp_line_items 
      order by store_id, sale_id, line_item_id)
to '/user/labs/week_06/temp_line_items_2.csv' delimiter ',' NULL '' csv header;

copy (select * 
      from temp_customers 
      order by customer_id)
to '/user/labs/week_06/temp_customers_2.csv' delimiter ',' NULL '' csv header;

copy (select * 
      from temp_products 
      order by product_id)
to '/user/labs/week_06/temp_products_2.csv' delimiter ',' NULL '' csv header;

copy (select * 
      from temp_holidays 
      order by holiday_date)
to '/user/labs/week_06/temp_holidays_2.csv' delimiter ',' NULL '' csv header;
"""

# all five extracts are sent in one execute call
cursor.execute(query)

# end the transaction opened by the execute call
connection.commit()

In [28]:
# verify the extract: at most the first 10 rows of temp_sales_2.csv
my_read_csv_file("temp_sales_2.csv", limit=10)

['store_id', 'sale_id', 'customer_id', 'sale_date', 'total_amount']
['1', '128112', '3491', '2020-04-30', '24']
['1', '144249', '1597', '2020-05-16', '84']
['1', '163141', '4159', '2020-06-04', '96']
['1', '169216', '4198', '2020-06-09', '144']
['1', '179181', '5394', '2020-06-18', '48']
['1', '181897', '1958', '2020-06-20', '48']
['1', '248269', '4260', '2020-08-22', '60']
['1', '250031', '6782', '2020-08-23', '24']
['1', '255285', '563', '2020-08-29', '36']

Printed  10 lines of  51 total lines.


In [29]:
# verify the extract: at most the first 10 rows of temp_line_items_2.csv
my_read_csv_file("temp_line_items_2.csv", limit=10)

['store_id', 'sale_id', 'line_item_id', 'product_id', 'quantity']
['1', '128112', '1', '1', '1']
['1', '128112', '2', '8', '1']
['1', '144249', '1', '1', '1']
['1', '144249', '2', '2', '1']
['1', '144249', '3', '4', '2']
['1', '144249', '4', '6', '1']
['1', '144249', '5', '8', '2']
['1', '163141', '1', '1', '3']
['1', '163141', '2', '3', '1']

Printed  10 lines of  177 total lines.


In [30]:
# verify the extract: at most the first 10 rows of temp_customers_2.csv
my_read_csv_file("temp_customers_2.csv", limit=10)

['customer_id', 'first_name', 'last_name', 'street', 'city', 'state', 'zip', 'closest_store_id', 'distance']
['563', 'Rose', 'Slimings', '38 Iowa Street', 'Berkeley', 'CA', '94704', '1', '1']
['1597', 'Norry', 'Macauley', '654 Sommers Plaza', 'Oakland', 'CA', '94612', '1', '3']
['1958', 'Theresina', 'Penswick', '5975 Twin Pines Hill', 'Berkeley', 'CA', '94707', '1', '3']
['1991', 'Kevon', 'Wickett', '472 Arizona Court', 'Berkeley', 'CA', '94707', '1', '3']
['3491', 'Siouxie', "M'Quharge", '747 Westridge Center', 'Alameda', 'CA', '94501', '1', '6']
['4159', 'Cheryl', 'Broe', '7 Ruskin Alley', 'El Sobrante', 'CA', '94803', '1', '7']
['4198', 'Andreana', 'Drew', '11039 Cordelia Alley', 'El Sobrante', 'CA', '94803', '1', '7']
['4260', 'Dom', 'Risbrough', '3 Northland Crossing', 'Richmond', 'CA', '94805', '1', '7']
['5394', 'Katharina', 'Bavester', '522 Cordelia Lane', 'San Francisco', 'CA', '94102', '1', '10']

Printed  10 lines of  51 total lines.


In [31]:
# verify the extract: at most the first 10 rows of temp_products_2.csv
my_read_csv_file("temp_products_2.csv", limit=10)

['product_id', 'description']
['1', 'Pistachio Salmon']
['2', 'Teriyaki Chicken']
['3', 'Spinach Orzo']
['4', 'Eggplant Lasagna']
['5', 'Chicken Salad']
['6', 'Curry Chicken']
['7', 'Tilapia Piccata']
['8', 'Brocolli Stir Fry']

Printed  9 lines of  9 total lines.


In [32]:
# verify the extract: at most the first 10 rows of temp_holidays_2.csv
my_read_csv_file("temp_holidays_2.csv", limit=10)

['holiday_date', 'description', 'closed_flag']
['2020-01-01', "New Year's Day", 'f']
['2020-01-20', 'MLK Day', 'f']
['2020-02-17', "President's Day", 'f']
['2020-04-12', 'Easter', 'f']
['2020-05-10', "Mother's Day", 'f']
['2020-05-25', 'Memorial Day', 'f']
['2020-06-21', "Father's Day", 'f']
['2020-07-04', 'Independence Day', 'f']
['2020-09-07', 'Labor Day', 'f']

Printed  10 lines of  13 total lines.
