# Setup

In [1]:
from piper.defaults import *
from piper.verbs import *
from piper.odbc import connections, connect
from piper.sql import insert, create_table
from psycopg2 import Error
import math

piper v0.1.2: Friday, 09 April 2021 18:49:45


# Sample data - 1 million rows

In [2]:
# Location of the sample sales extract, relative to the current working directory.
sales_csv_path = 'inputs/1000000 Sales Records.csv'
df = pd.read_csv(sales_csv_path)

FileNotFoundError: [Errno 2] No such file or directory: 'inputs/1000000 Sales Records.csv'

## Clean data

Trim leading and trailing spaces from the column names, replace embedded
spaces with underscores, and convert the names to lowercase.<br>
Why? To make it easier to work with columns using pandas dot notation<br> 
(e.g. instead of __df['Order Date']__ one can use __df.order_date__)

In [3]:
# Normalise the column labels; pipe() passes the frame as the first argument to clean_names.
df = df.pipe(clean_names)

In [4]:
# Both date columns are stored as month/day/year text in the CSV;
# convert them to datetime values using one shared format string.
date_format = '%m/%d/%Y'
for date_column in ('order_date', 'ship_date'):
    # Bracket access is used here because the column name is held in a variable.
    df[date_column] = pd.to_datetime(df[date_column], format=date_format)

In [5]:
# Preview a single row of the frame to check the cleaned column names and parsed dates.
head(df, 1)

1000000 rows, 14 columns


Unnamed: 0,region,country,item_type,sales_channel,order_priority,order_date,order_id,ship_date,units_sold,unit_price,unit_cost,total_revenue,total_cost,total_profit
0,Sub-Saharan Africa,South Africa,Fruits,Offline,M,2012-07-27,443368995,2012-07-28,1593,9.33,6.92,14862.69,11023.56,3839.13


# Postgres DB example

## Create table in target database

The `create_table()` function generates the SQL statement required to build
a table; see the example output below.

In [6]:
# Generate the CREATE TABLE statement for the frame and show it as plain text.
example_ddl = create_table(df, tablename='example_table')
print(example_ddl)

create table example_table (
region text, 
country text, 
item_type text, 
sales_channel text, 
order_priority text, 
order_date date, 
order_id bigint, 
ship_date date, 
units_sold bigint, 
unit_price decimal, 
unit_cost decimal, 
total_revenue decimal, 
total_cost decimal, 
total_profit decimal
) 


## Connect and insert data

Optionally, split the data into multiple chunks and insert them one chunk at a time.

In [7]:
# Number of roughly equal pieces the frame is split into before inserting.
n_chunks = 10

# Split the row *positions* rather than the DataFrame itself: calling
# np.array_split directly on a DataFrame relies on DataFrame.swapaxes, which
# recent pandas versions deprecate. Splitting an integer range produces the
# same chunk sizes (the first len(df) % n_chunks chunks get one extra row),
# and iloc then selects the matching rows for each chunk.
row_positions = np.arange(len(df))
list_dataframes = [df.iloc[chunk_rows]
                   for chunk_rows in np.array_split(row_positions, n_chunks)]

In [8]:
table_name = 'testtable'

# Build the setup SQL up front: drop any previous copy of the table, then
# recreate it from the frame. Both statements are sent in a single execute().
setup_statements = [f'''drop table if exists {table_name}''',
                    create_table(df, table_name)]
sql = '; '.join(setup_statements)

with connect(connection='Connection1') as con:

    # Bind the name before the try block so the finally clause can tell
    # whether a cursor was ever opened (con.cursor() itself may raise).
    cursor = None

    try:
        cursor = con.cursor()
        cursor.execute(sql)
        logger.debug(sql)

        # Insert the data one chunk at a time, logging progress per chunk.
        for dx in list_dataframes:

            sql = insert(dx, table_name, info=False)
            logger.debug(sql)

            cursor.execute(sql)
            logger.info(f'{dx.shape[0]} rows inserted into {table_name} table.')

    except (Exception, Error) as error:
        # The error text is formatted into the message itself. Passing it as
        # an extra positional argument without a matching % placeholder makes
        # a standard logging.Logger fail while formatting the record
        # (assuming `logger` is a stdlib logger -- confirm in piper.defaults).
        # The failure happens while executing SQL, not while connecting, so
        # the message says so and is reported at error level.
        logger.error(f'Error while loading data into {table_name}: {error}')
    finally:
        # Only the cursor is released here; `con` is reused by the
        # "Check data" cell, so the log message must not claim the
        # connection was closed.
        if cursor is not None:
            cursor.close()
            logger.info('Cursor closed')

Connection: localhost, user:mike


100000 rows inserted into testtable table.


100000 rows inserted into testtable table.


100000 rows inserted into testtable table.


100000 rows inserted into testtable table.


100000 rows inserted into testtable table.


100000 rows inserted into testtable table.


100000 rows inserted into testtable table.


100000 rows inserted into testtable table.


100000 rows inserted into testtable table.


100000 rows inserted into testtable table.


Connection closed


## Check data

In [9]:
# Read the freshly loaded table back through the same connection and
# preview the first two rows to confirm the insert worked.
sql = f'select * from {table_name}'
df = read_sql(
    sql=sql,
    con=con,
    sql_info=False,
    info=False,
)
head(df, 2)

1000000 rows, 14 columns


Unnamed: 0,region,country,item_type,sales_channel,order_priority,order_date,order_id,ship_date,units_sold,unit_price,unit_cost,total_revenue,total_cost,total_profit
0,Sub-Saharan Africa,South Africa,Fruits,Offline,M,2012-07-27,443368995,2012-07-28,1593,9.33,6.92,14862.69,11023.56,3839.13
1,Middle East and North Africa,Morocco,Clothes,Online,M,2013-09-14,667593514,2013-10-19,4611,109.28,35.84,503890.08,165258.24,338631.84
