lpsmith · joeyadams · Mar 8, 2012 · Mar 8, 2012 · Mar 8, 2012 · Mar 8, 2012
diff --git a/Database/PostgreSQL/LibPQ.hsc b/Database/PostgreSQL/LibPQ.hsc
@@ -143,6 +143,16 @@ module Database.PostgreSQL.LibPQ
     , escapeByteaConn
     , unescapeBytea
 
+    -- * Using COPY FROM
+    -- $copyfrom
+    , CopyResult(..)
+    , putCopyData
+    , putCopyEnd
+
+    -- ** Formatting datums
+    , formatCopyRow
+    , putCopyRow
+
     -- * Asynchronous Command Processing
     -- $asynccommand
     , sendQuery
@@ -203,6 +213,9 @@ import Prelude hiding ( print )
 import Foreign
 import Foreign.C.Types
 import Foreign.C.String
+#if __GLASGOW_HASKELL__ >= 702
+import qualified Foreign.ForeignPtr.Unsafe as Unsafe
+#endif
 import qualified Foreign.Concurrent as FC
 import System.Posix.Types ( Fd(..) )
 import Data.List ( foldl' )
@@ -290,7 +303,11 @@ newNullConnection = Conn `fmap` newForeignPtr_ nullPtr
 
 -- | Test if a connection is the Null Connection.
 isNullConnection :: Connection -> Bool
+#if __GLASGOW_HASKELL__ >= 702
+isNullConnection (Conn x) = Unsafe.unsafeForeignPtrToPtr x == nullPtr
+#else
 isNullConnection (Conn x) = unsafeForeignPtrToPtr x == nullPtr
+#endif
 {-# INLINE isNullConnection #-}
 
 -- | If 'connectStart' succeeds, the next stage is to poll libpq so
@@ -1499,6 +1516,161 @@ unescapeBytea bs =
                     return $ Just $ B.fromForeignPtr tofp 0 $ fromIntegral l
 
 
+-- $copyfrom
+--
+-- This provides support for PostgreSQL's @COPY FROM@ facility.  When inserting
+-- rows in bulk, @COPY FROM@ is faster than individual @INSERT@ statements for
+-- each row.
+--
+-- For more information, see:
+--
+--  * <http://www.postgresql.org/docs/current/static/sql-copy.html>
+--
+--  * <http://www.postgresql.org/docs/current/static/libpq-copy.html>
+--
+-- The following example illustrates the procedure for using @COPY FROM@ with
+-- libpq:
+--
+-- >-- Put the connection in the COPY_IN state
+-- >-- by executing a COPY ... FROM query.
+-- >Just result <- exec conn "COPY foo (a, b) FROM stdin"
+-- >CopyIn <- resultStatus result
+-- >
+-- >-- Send a couple lines of COPY data
+-- >CopyOk <- putCopyRow conn [Just ("1", Text), Just ("one", Text)]
+-- >CopyOk <- putCopyRow conn [Just ("2", Text), Nothing]
+-- >
+-- >-- Send end-of-data indication
+-- >CopyOk <- putCopyEnd conn Nothing
+-- >
+-- >-- Get the final result status of the copy command
+-- >Just result <- getResult conn
+-- >CommandOk <- resultStatus result
+
+data CopyResult = CopyOk            -- ^ The data was sent.
+                | CopyError         -- ^ An error occurred (use 'errorMessage'
+                                    --   to retrieve details).
+                | CopyWouldBlock    -- ^ The data was not sent because the
+                                    --   attempt would block (this case is only
+                                    --   possible if the connection is in
+                                    --   nonblocking mode)  Wait for
+                                    --   write-ready (e.g. by using
+                                    --   'Control.Concurrent.threadWaitWrite'
+                                    --   on the 'socket') and try again.
+
+
+toCopyResult :: CInt -> CopyResult
+toCopyResult n | n < 0     = CopyError
+               | n == 0    = CopyWouldBlock
+               | otherwise = CopyOk
+
+
+-- | Send raw @COPY@ data to the server during the @COPY_IN@ state.
+putCopyData :: Connection -> B.ByteString -> IO CopyResult
+putCopyData conn bs =
+    B.unsafeUseAsCStringLen bs $ putCopyCString conn
+
+
+putCopyCString :: Connection -> CStringLen -> IO CopyResult
+putCopyCString conn (str, len) =
+    fmap toCopyResult $
+        withConn conn $ \ptr -> c_PQputCopyData ptr str (fromIntegral len)
+
+
+-- | Send end-of-data indication to the server during the @COPY_IN@ state.
+--
+--  * @putCopyEnd conn Nothing@ ends the @COPY_IN@ operation successfully.
+--
+--  * @putCopyEnd conn (Just errormsg)@ forces the @COPY@ to fail, with
+--    @errormsg@ used as the error message.
+--
+-- After 'putCopyEnd' returns 'CopyOk', call 'getResult' to obtain the final
+-- result status of the @COPY@ command.  Then return to normal operation.
+putCopyEnd :: Connection -> Maybe B.ByteString -> IO CopyResult
+putCopyEnd conn Nothing =
+    fmap toCopyResult $
+        withConn conn $ \ptr -> c_PQputCopyEnd ptr nullPtr
+putCopyEnd conn (Just errormsg) =
+    fmap toCopyResult $
+        B.useAsCString errormsg $ \errormsg_cstr ->
+            withConn conn $ \ptr -> c_PQputCopyEnd ptr errormsg_cstr
+
+
+-- | A combination of 'putCopyData' and 'formatCopyRow'.  This should be
+-- slightly more efficient than:
+--
+-- >putCopyData conn $ formatCopyRow params
+--
+-- as it does not allocate an intermediate 'B.ByteString'.
+putCopyRow :: Connection -> [Maybe (B.ByteString, Format)] -> IO CopyResult
+putCopyRow conn params = withFormatCopyRow params $ putCopyCString conn
+
+
+-- | Escape a row of data for use with a COPY FROM statement.
+-- Include a trailing newline at the end.
+--
+-- This assumes text format (rather than BINARY or CSV) with the default
+-- delimiter (tab) and default null string (\\N).  A suitable query looks like:
+--
+-- >COPY tablename (id, col1, col2) FROM stdin;
+formatCopyRow :: [Maybe (B.ByteString, Format)] -> IO B.ByteString
+formatCopyRow params = withFormatCopyRow params B.packCStringLen
+
+
+withFormatCopyRow :: [Maybe (B.ByteString, Format)]
+                  -> (CStringLen -> IO a)
+                  -> IO a
+withFormatCopyRow params inner =
+    let bufsize =
+            if null params
+                then 1
+                else sum $ map paramSize params
+     in allocaBytes bufsize $ \buf -> do
+            end <- emitParams buf params
+            let len = end `minusPtr` buf
+            if len <= bufsize
+                then inner (castPtr buf, len)
+                else error $ "formatCopyRow: Buffer overrun (buffer is "
+                          ++ show bufsize ++ " bytes, but "
+                          ++ show len ++ " bytes were written into it)"
+
+
+-- | Compute the maximum number of bytes the escaped datum may take up,
+-- including the trailing tab or newline character.
+paramSize :: Maybe (B.ByteString, Format) -> Int
+paramSize Nothing            = 3 -- Length of "\\N\t"
+paramSize (Just (s, Text))   = B.length s * 2 + 1
+paramSize (Just (s, Binary)) = B.length s * 5 + 1
+
+
+emitParam :: Ptr CUChar -> Maybe (B.ByteString, Format) -> IO (Ptr CUChar)
+emitParam out Nothing = do
+    pokeElemOff out 0 92    -- '\\'
+    pokeElemOff out 1 78    -- 'N'
+    return (out `plusPtr` 2)
+emitParam out (Just (s, Text)) =
+    B.unsafeUseAsCStringLen s $ \(ptr, len) ->
+        c_escape_copy_text (castPtr ptr) (fromIntegral len) out
+emitParam out (Just (s, Binary)) =
+    B.unsafeUseAsCStringLen s $ \(ptr, len) ->
+        c_escape_copy_bytea (castPtr ptr) (fromIntegral len) out
+
+
+emitParams :: Ptr CUChar -> [Maybe (B.ByteString, Format)] -> IO (Ptr CUChar)
+emitParams out [] = do
+    poke out 10 -- newline
+    return (out `plusPtr` 1)
+emitParams out (x:xs) = do
+    out' <- emitParam out x
+    if null xs
+        then do
+            poke out' 10 -- newline
+            return (out' `plusPtr` 1)
+        else do
+            poke out' 9 -- tab
+            emitParams (out' `plusPtr` 1) xs
+
+
 -- $asynccommand
 -- The 'exec' function is adequate for submitting commands in normal,
 -- synchronous applications. It has a couple of deficiencies, however,
@@ -2287,6 +2459,12 @@ type PGVerbosity = CInt
 foreign import ccall unsafe "libpq-fe.h PQsetErrorVerbosity"
     c_PQsetErrorVerbosity :: Ptr PGconn -> PGVerbosity -> IO PGVerbosity
 
+foreign import ccall        "libpq-fe.h PQputCopyData"
+    c_PQputCopyData :: Ptr PGconn -> Ptr CChar -> CInt -> IO CInt
+
+foreign import ccall        "libpq-fe.h PQputCopyEnd"
+    c_PQputCopyEnd :: Ptr PGconn -> CString -> IO CInt
+
 foreign import ccall        "libpq-fe.h PQsendQuery"
     c_PQsendQuery :: Ptr PGconn -> CString ->IO CInt
 
@@ -2496,3 +2674,18 @@ foreign import ccall        "libpq-fs.h lo_close"
 
 foreign import ccall        "libpq-fs.h lo_unlink"
     c_lo_unlink :: Ptr PGconn -> Oid -> IO CInt
+
+------------------------------------------------------------------------
+-- cbits imports
+
+foreign import ccall unsafe
+    c_escape_copy_text :: Ptr CUChar        -- ^ const unsigned char *in
+                       -> CInt              -- ^ int in_size
+                       -> Ptr CUChar        -- ^ unsigned char *out
+                       -> IO (Ptr CUChar)   -- ^ Returns pointer to end of written data
+
+foreign import ccall unsafe
+    c_escape_copy_bytea :: Ptr CUChar       -- ^ const unsigned char *in
+                        -> CInt             -- ^ int in_size
+                        -> Ptr CUChar       -- ^ unsigned char *out
+                        -> IO (Ptr CUChar)  -- ^ Returns pointer to end of written data
diff --git a/cbits/escape-copy.c b/cbits/escape-copy.c
@@ -0,0 +1,94 @@
+/*
+ * Escape a datum for COPY FROM.  The buffer pointed to by @out should be
+ * at least 2*in_size bytes long.
+ *
+ * Return a pointer to the end of the bytes emitted.
+ */
+unsigned char *c_escape_copy_text(const unsigned char *in, int in_size, unsigned char *out)
+{
+    while (in_size-- > 0) {
+        unsigned char c = *in++;
+
+        switch (c) {
+            case '\t':
+                *out++ = '\\';
+                *out++ = 't';
+                break;
+            case '\n':
+                *out++ = '\\';
+                *out++ = 'n';
+                break;
+            case '\r':
+                *out++ = '\\';
+                *out++ = 'r';
+                break;
+            case '\\':
+                *out++ = '\\';
+                *out++ = '\\';
+                break;
+
+            default:
+                *out++ = c;
+        }
+    }
+
+    return out;
+}
+
+/*
+ * Like c_escape_copy_text, but escape the datum so it will be suitable for
+ * PostgreSQL's BYTEA input function.  Note that this does not use the hex
+ * format introduced by PostgreSQL 9.0, as it is readable only by
+ * PostgreSQL 9.0 and up.
+ *
+ * This performs two escape operations:
+ *
+ *  * Convert raw binary data to the format accepted by PostgreSQL's BYTEA
+ *    input function.
+ *
+ *  * Escape the result for use in COPY FROM data.
+ *
+ * The buffer pointed to by @out should be at least 5*in_size bytes long.
+ */
+unsigned char *c_escape_copy_bytea(const unsigned char *in, int in_size, unsigned char *out)
+{
+    while (in_size-- > 0) {
+        unsigned char c = *in++;
+
+        if (c == '\\') {
+            /* Escape backslash twice, once for BYTEA, and again for COPY FROM. */
+            *out++ = '\\';
+            *out++ = '\\';
+            *out++ = '\\';
+            *out++ = '\\';
+        } else if (c >= 32 && c <= 126) {
+            /*
+             * Printable characters (except backslash) are subject to neither
+             * BYTEA escaping nor COPY FROM escaping.
+             */
+            *out++ = c;
+        } else {
+            /*
+             * Escape using octal format.  This consists of two backslashes
+             * (single backslash, escaped for COPY FROM) followed by three
+             * digits [0-7].
+             *
+             * We can't use letter escapes \t, \n, \r because:
+             *
+             *  * The BYTEA input function doesn't understand letter escapes.
+             *
+             *  * We could use only one backslash so BYTEA sees the literal
+             *    octet values of 9, 10, and 13.  However, we're escaping other
+             *    non-printable characters for BYTEA; why give 9, 10, and 13
+             *    special treatment?
+             */
+            *out++ = '\\';
+            *out++ = '\\';
+            *out++ = '0' + ((c >> 6) & 0x7);
+            *out++ = '0' + ((c >> 3) & 0x7);
+            *out++ = '0' + (c & 0x7);
+        }
+    }
+
+    return out;
+}
diff --git a/postgresql-libpq.cabal b/postgresql-libpq.cabal
@@ -18,11 +18,20 @@ Copyright:           (c) 2010 Grant Monroe
                      (c) 2011 Leon P Smith
 Category:            Database
 Build-type:          Custom
--- Extra-source-files:
 Cabal-version:       >=1.8
+
+Extra-source-files:
+    testing/copy-from.hs
+    testing/copy-from-example.hs
+    testing/run-copy-from
+    testing/run-copy-from.expected
+
 Library
   Exposed-modules:     Database.PostgreSQL.LibPQ
 
+  C-Sources:
+    cbits/escape-copy.c
+
   Build-depends:       base >= 4 && < 5
                      , bytestring
 

diff --git a/testing/copy-from-example.hs b/testing/copy-from-example.hs
@@ -0,0 +1,39 @@
+{-# LANGUAGE OverloadedStrings #-}
+{-# OPTIONS_GHC -fno-warn-name-shadowing #-}
+import Control.Monad (forM_)
+import Database.PostgreSQL.LibPQ
+
+main :: IO ()
+main = do
+    conn <- connectdb ""
+
+    -- Create a temporary table for testing
+    Just result <- exec conn "CREATE TEMPORARY TABLE foo (a INT, b TEXT)"
+    CommandOk <- resultStatus result
+
+    -- Put the connection in the COPY_IN state
+    -- by executing a COPY ... FROM query.
+    Just result <- exec conn "COPY foo (a, b) FROM stdin"
+    CopyIn <- resultStatus result
+
+    -- Send a couple lines of COPY data
+    CopyOk <- putCopyRow conn [Just ("1", Text), Just ("one", Text)]
+    CopyOk <- putCopyRow conn [Just ("2", Text), Nothing]
+
+    -- Send end-of-data indication
+    CopyOk <- putCopyEnd conn Nothing
+
+    -- Get the final result status of the copy command.
+    Just result <- getResult conn
+    CommandOk <- resultStatus result
+
+    -- Retrieve the rows and print them
+    Just result <- exec conn "SELECT * FROM foo"
+    TuplesOk <- resultStatus result
+    n <- ntuples result
+    forM_ [0..n-1] $ \i -> do
+        c0 <- getvalue result i 0
+        c1 <- getvalue result i 1
+        putStrLn $ show c0 ++ "\t" ++ show c1
+
+    finish conn