In [1]:
using IterTools
using DataFrames
using LibPQ
using BenchmarkTools
using CSV
using Tables

## Download Test Data

In [5]:
download("https://nyc-tlc.s3.amazonaws.com/trip+data/green_tripdata_2019-12.csv", 
    "test_data.csv")

"test_data.csv"

## Create Connection

In [2]:
# Connection URI for the test database. Credentials should not live in source control:
# set the LIBPQ_CON_STR environment variable to override. The literal below is kept
# only as a backward-compatible default for the local tutorial database.
con_str = get(
    ENV,
    "LIBPQ_CON_STR",
    "postgres://postgres:python_tutorial_5432@192.168.1.24:15432/postgres",
)

"postgres://postgres:python_tutorial_5432@192.168.1.24:15432/postgres"

In [3]:
con = LibPQ.Connection(con_str)

PostgreSQL connection (CONNECTION_OK) with parameters:
  user = postgres
  password = ********************
  dbname = postgres
  host = 192.168.1.24
  port = 15432
  client_encoding = UTF8
  options = -c DateStyle=ISO,YMD -c IntervalStyle=iso_8601 -c TimeZone=UTC
  application_name = LibPQ.jl
  sslmode = prefer
  sslcompression = 0
  gssencmode = disable
  target_session_attrs = any

## Create Test Table

In [4]:
# `if exists` keeps this cell from failing on a fresh database where the table
# has not been created yet.
execute(con, "drop table if exists test_df")

PostgreSQL result

In [5]:
sql = """
create table test_df (
    trip_reason varchar,
    lpep_pickup_datetime timestamp, 
    passenger_count int, 
    trip_distance numeric
);"""
execute(con, sql)

PostgreSQL result

## Create Test Data

In [6]:
df = CSV.File("test_data.csv") |> DataFrame;

In [7]:
df_sample = df[:, [:lpep_pickup_datetime, :passenger_count, :trip_distance]];
#dropmissing!(df_sample);

In [8]:
df_sample[!, :trip_reason] .= "for, more; \$, and fun";

In [9]:
names(df_sample)

4-element Array{Symbol,1}:
 :lpep_pickup_datetime
 :passenger_count     
 :trip_distance       
 :trip_reason         

## Definition of Upload Function

In [10]:
# Render a single value as one field of a `COPY ... (FORMAT CSV)` input line.
# Non-string values are passed through unchanged; the caller prints them via `join`.
_prepare_field(x::Any) = x
# An unquoted empty field is read as NULL by PostgreSQL in CSV format.
_prepare_field(x::Missing) = ""
# Strings are always quoted, so embedded commas and newlines need no escaping.
# In CSV format the default escape character is the quote character itself, so an
# embedded `"` is doubled; a backslash has no special meaning and must not be added.
_prepare_field(x::AbstractString) = string('"', replace(x, "\"" => "\"\""), '"')

_prepare_field (generic function with 3 methods)

In [11]:
"""
    load_by_copy!(table, con::LibPQ.Connection, tablename::AbstractString)

Upload `table` into the database table `tablename` using the PostgreSQL
`COPY ... FROM STDIN (FORMAT CSV)` method, which is usually much faster than SQL
inserts, especially for large amounts of data.

# Arguments
- `table`: any Tables.jl compatible data structure. It is accessed only through
  `Tables.columns`, `Tables.columnnames`, `Tables.rows` and `Tables.getcolumn`.
- `con`: the `LibPQ.Connection` the `COPY` statement is executed on.
- `tablename`: name of the target table. It is interpolated verbatim into the
  `COPY` statement, as are the column names, so both must come from a trusted source.

All columns given in `table` must have corresponding fields in the target DB table;
the order of the columns does not matter because the column list is sent explicitly.

Columns in the target DB table which are not provided by the input `table` are filled
with `null` (provided they are nullable).

Each value is rendered with `_prepare_field` before being sent.

# Returns
The result of `execute` for the `COPY` statement.
"""
function load_by_copy!(table, con::LibPQ.Connection, tablename::AbstractString)
    columns = Tables.columns(table)
    colnames = Tables.columnnames(columns)
    column_list = join(string.(colnames), ",")
    # Lazily produce one CSV line per row. Values are fetched by column name so the
    # field order always matches `column_list`, whatever the row type is.
    row_strings = (
        join((_prepare_field(Tables.getcolumn(row, nm)) for nm in colnames), ",") * "\n"
        for row in Tables.rows(columns)
    )
    copyin = LibPQ.CopyIn(
        "COPY $tablename ($column_list) FROM STDIN (FORMAT CSV);", row_strings
    )
    return execute(con, copyin)
end

load_by_copy!

## Test

In [12]:
execute(con, "delete from test_df;")

PostgreSQL result

In [13]:
@time load_by_copy!(df_sample, con, "test_df")

  6.944527 seconds (25.98 M allocations: 1.008 GiB, 3.09% gc time)


PostgreSQL result

In [14]:
@time load_by_copy!(df_sample[!, reverse(names(df_sample))], con, "test_df")

  6.356660 seconds (25.54 M allocations: 1023.760 MiB, 2.76% gc time)


PostgreSQL result

In [15]:
execute(con, "select count(*) from test_df") |> DataFrame

Unnamed: 0_level_0,count
Unnamed: 0_level_1,Int64⍰
1,901254


## Comparison to SQL Insert

In [16]:
# Time a row-by-row parameterized INSERT inside a single transaction.
@time begin
    execute(con, "BEGIN;")
    try
        LibPQ.load!(df_sample, con,
            """INSERT INTO test_df (lpep_pickup_datetime, passenger_count, 
            trip_distance, trip_reason) VALUES (\$1, \$2, \$3, \$4);""")
        execute(con, "COMMIT;")
    catch
        # Do not leave the shared connection stuck inside an open transaction.
        execute(con, "ROLLBACK;")
        rethrow()
    end
end

 97.618149 seconds (38.33 M allocations: 2.109 GiB, 0.86% gc time)


PostgreSQL result

In [17]:
execute(con, "select count(*) from test_df") |> DataFrame

Unnamed: 0_level_0,count
Unnamed: 0_level_1,Int64⍰
1,1351881


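In this test, `load_by_copy!` was more than 10 times faster than row-by-row SQL `INSERT`s via `LibPQ.load!` for inserting data into PostgreSQL (about 7 s versus about 98 s for roughly 450,000 rows).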

## Close Connection

In [18]:
close(con)