From 219527da46b1e0ecafb98ae8e02c80fcf429ecad Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 19 Nov 2025 05:54:58 +0000 Subject: [PATCH 1/6] Add Haskell dataframe benchmark entry Implements a new benchmark entry for Haskell using the Cassava CSV library for dataframe operations. Features: - Groupby benchmark: 5 questions implemented (sum, mean aggregations) - Join benchmark: 1 question implemented (inner join on int) - Uses HashMap-based grouping for efficient aggregations - Includes setup, version, and upgrade scripts - Registered in _control/solutions.csv The implementation demonstrates Haskell's capabilities for dataframe operations with strict evaluation and proper benchmarking using NFData. Files added: - haskell/groupby-haskell.hs: Groupby benchmark implementation - haskell/join-haskell.hs: Join benchmark implementation - haskell/setup-haskell.sh: Setup script for Stack and dependencies - haskell/ver-haskell.sh: Version detection script - haskell/upg-haskell.sh: Upgrade script - haskell/haskell-benchmark.cabal: Cabal project definition - haskell/VERSION: Version file - haskell/README.md: Documentation --- _control/solutions.csv | 2 + haskell/README.md | 46 +++++ haskell/VERSION | 1 + haskell/groupby-haskell.hs | 315 ++++++++++++++++++++++++++++++++ haskell/haskell-benchmark.cabal | 36 ++++ haskell/join-haskell.hs | 242 ++++++++++++++++++++++++ haskell/setup-haskell.sh | 72 ++++++++ haskell/upg-haskell.sh | 13 ++ haskell/ver-haskell.sh | 28 +++ 9 files changed, 755 insertions(+) create mode 100644 haskell/README.md create mode 100644 haskell/VERSION create mode 100644 haskell/groupby-haskell.hs create mode 100644 haskell/haskell-benchmark.cabal create mode 100644 haskell/join-haskell.hs create mode 100755 haskell/setup-haskell.sh create mode 100755 haskell/upg-haskell.sh create mode 100755 haskell/ver-haskell.sh diff --git a/_control/solutions.csv b/_control/solutions.csv index 2888dd13..3c12ae32 100644 --- a/_control/solutions.csv +++ 
b/_control/solutions.csv @@ -33,3 +33,5 @@ datafusion,groupby datafusion,join chdb,groupby chdb,join +haskell,groupby +haskell,join diff --git a/haskell/README.md b/haskell/README.md new file mode 100644 index 00000000..dd78a204 --- /dev/null +++ b/haskell/README.md @@ -0,0 +1,46 @@ +# Haskell Dataframe Benchmark + +This benchmark entry uses Haskell with the Cassava CSV library to implement dataframe operations. + +## Implementation Details + +- **Language**: Haskell (GHC) +- **CSV Library**: Cassava +- **Build Tool**: Stack + +## Implemented Benchmarks + +### Groupby (`groupby-haskell.hs`) +Implements 5 out of 10 groupby questions: +1. sum v1 by id1 +2. sum v1 by id1:id2 +3. sum v1 mean v3 by id3 +4. mean v1:v3 by id4 +5. sum v1:v3 by id6 + +Note: Questions 6-10 would require additional statistical functions (median, standard deviation, regression, top-n selection). + +### Join (`join-haskell.hs`) +Implements 1 out of 5 join questions: +1. small inner on int + +Note: Additional join questions would require implementation of outer joins and multi-key joins. + +## Setup + +Run the setup script to install dependencies: +```bash +./haskell/setup-haskell.sh +``` + +## Notes + +This is a basic implementation demonstrating Haskell's capabilities for dataframe operations. 
The implementation uses: +- Strict evaluation with NFData for proper benchmarking +- HashMap-based grouping for efficient aggregations +- Standard Haskell list operations for data processing + +For production use, consider using more specialized libraries like: +- Frames (type-safe dataframes) +- HaskellDB (database integration) +- Haskelltable (in-memory data tables) diff --git a/haskell/VERSION b/haskell/VERSION new file mode 100644 index 00000000..6e8bf73a --- /dev/null +++ b/haskell/VERSION @@ -0,0 +1 @@ +0.1.0 diff --git a/haskell/groupby-haskell.hs b/haskell/groupby-haskell.hs new file mode 100644 index 00000000..76cdb7d2 --- /dev/null +++ b/haskell/groupby-haskell.hs @@ -0,0 +1,315 @@ +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE DeriveGeneric #-} + +import qualified Data.ByteString.Lazy as BL +import qualified Data.Csv as Csv +import qualified Data.Vector as V +import qualified Data.HashMap.Strict as HM +import Data.Hashable (Hashable) +import GHC.Generics (Generic) +import System.Environment (getEnv, lookupEnv) +import System.IO (hFlush, stdout, hPutStrLn, stderr) +import Data.Time.Clock.POSIX (getPOSIXTime) +import Text.Printf (printf) +import Control.Exception (evaluate) +import System.Process (readProcess) +import System.Directory (doesFileExist) +import Data.List (intercalate, foldl') +import qualified Data.Text as T +import qualified Data.Text.Encoding as TE +import Control.DeepSeq (NFData, force) + +-- Data row type +data Row = Row + { id1 :: !T.Text + , id2 :: !T.Text + , id3 :: !T.Text + , id4 :: !Int + , id5 :: !Int + , id6 :: !Int + , v1 :: !Double + , v2 :: !Double + , v3 :: !Double + } deriving (Show, Generic) + +instance NFData Row + +instance Csv.FromNamedRecord Row where + parseNamedRecord r = Row + <$> r Csv..: "id1" + <$> r Csv..: "id2" + <$> r Csv..: "id3" + <$> r Csv..: "id4" + <$> r Csv..: "id5" + <$> r Csv..: "id6" + <$> r Csv..: "v1" + <$> r Csv..: "v2" + <$> r Csv..: "v3" + +-- Helper functions for logging +writeLog :: String -> 
String -> Int -> String -> Int -> Int -> String -> String -> String -> String -> Int -> Double -> Double -> String -> String -> Double -> String -> String -> IO () +writeLog task dataName inRows question outRows outCols solution version git fun run timeSec memGb cache chk chkTimeSec onDisk machineType = do + batch <- lookupEnv "BATCH" >>= return . maybe "" id + timestamp <- getPOSIXTime + csvFile <- lookupEnv "CSV_TIME_FILE" >>= return . maybe "time.csv" id + nodename <- fmap init (readProcess "hostname" [] "") + + let comment = "" + let timeSecRound = roundTo 3 timeSec + let chkTimeSecRound = roundTo 3 chkTimeSec + let memGbRound = roundTo 3 memGb + + let logRow = intercalate "," [ + nodename, + batch, + show timestamp, + task, + dataName, + show inRows, + question, + show outRows, + show outCols, + solution, + version, + git, + fun, + show run, + show timeSecRound, + show memGbRound, + cache, + chk, + show chkTimeSecRound, + comment, + onDisk, + machineType + ] + + fileExists <- doesFileExist csvFile + if fileExists + then appendFile csvFile (logRow ++ "\n") + else do + let header = "nodename,batch,timestamp,task,data,in_rows,question,out_rows,out_cols,solution,version,git,fun,run,time_sec,mem_gb,cache,chk,chk_time_sec,comment,on_disk,machine_type\n" + writeFile csvFile (header ++ logRow ++ "\n") + +roundTo :: Int -> Double -> Double +roundTo n x = (fromInteger $ round $ x * (10^n)) / (10.0^^n) + +makeChk :: [Double] -> String +makeChk values = intercalate ";" (map formatVal values) + where + formatVal x = map (\c -> if c == ',' then '_' else c) (show $ roundTo 3 x) + +getMemoryUsage :: IO Double +getMemoryUsage = do + -- Get RSS memory in GB using ps command + pid <- fmap init (readProcess "bash" ["-c", "echo $$"] "") + mem <- fmap (filter (/= ' ') . 
init) (readProcess "ps" ["-o", "rss=", "-p", pid] "") + let rssKb = if null mem then 0 else read mem :: Double + return (rssKb / (1024 * 1024)) -- Convert KB to GB + +-- Timing helper +timeIt :: NFData a => IO a -> IO (a, Double) +timeIt action = do + start <- getPOSIXTime + result <- action + _ <- evaluate (force result) + end <- getPOSIXTime + return (result, realToFrac (end - start)) + +-- Group by helper +groupByKey :: (Eq k, Hashable k) => (a -> k) -> [a] -> HM.HashMap k [a] +groupByKey f = foldl' (\acc x -> HM.insertWith (++) (f x) [x] acc) HM.empty + +-- Mean helper +mean :: [Double] -> Double +mean xs = sum xs / fromIntegral (length xs) + +-- Median helper +median :: [Double] -> Double +median [] = 0 +median xs = let sorted = V.fromList $ foldl' (\acc x -> x:acc) [] xs + len = V.length sorted + in if len `mod` 2 == 0 + then (sorted V.! (len `div` 2 - 1) + sorted V.! (len `div` 2)) / 2 + else sorted V.! (len `div` 2) + +-- Standard deviation helper +stdDev :: [Double] -> Double +stdDev xs = let m = mean xs + variance = mean [(x - m) ^ 2 | x <- xs] + in sqrt variance + +main :: IO () +main = do + putStrLn "# groupby-haskell.hs" + hFlush stdout + + let ver = "0.1.0" + let git = "cassava-csv" + let task = "groupby" + let solution = "haskell" + let fun = "groupBy" + let cache = "TRUE" + let onDisk = "FALSE" + + dataName <- getEnv "SRC_DATANAME" + machineType <- getEnv "MACHINE_TYPE" + let srcFile = "data/" ++ dataName ++ ".csv" + + putStrLn $ "loading dataset " ++ dataName + hFlush stdout + + -- Check if data has NAs (simplified check) + let parts = T.splitOn "_" (T.pack dataName) + let naFlag = if length parts > 3 then read (T.unpack $ parts !! 
3) :: Int else 0 + + if naFlag > 0 + then do + hPutStrLn stderr "skip due to na_flag>0" + return () + else do + -- Load CSV data + csvData <- BL.readFile srcFile + case Csv.decodeByName csvData of + Left err -> do + putStrLn $ "Error parsing CSV: " ++ err + return () + Right (_, rows) -> do + let x = V.toList rows :: [Row] + let inRows = length x + putStrLn $ show inRows + hFlush stdout + + putStrLn "grouping..." + hFlush stdout + + -- Question 1: sum v1 by id1 + let question1 = "sum v1 by id1" + (ans1, t1_1) <- timeIt $ do + let grouped = groupByKey id1 x + let result = [(k, sum [v1 r | r <- rows]) | (k, rows) <- HM.toList grouped] + return result + m1_1 <- getMemoryUsage + (chk1, chkt1_1) <- timeIt $ evaluate $ sum [snd r | r <- ans1] + writeLog task dataName inRows question1 (length ans1) 2 solution ver git fun 1 t1_1 m1_1 cache (makeChk [chk1]) chkt1_1 onDisk machineType + + -- Run 2 + (ans1_2, t1_2) <- timeIt $ do + let grouped = groupByKey id1 x + let result = [(k, sum [v1 r | r <- rows]) | (k, rows) <- HM.toList grouped] + return result + m1_2 <- getMemoryUsage + (chk1_2, chkt1_2) <- timeIt $ evaluate $ sum [snd r | r <- ans1_2] + writeLog task dataName inRows question1 (length ans1_2) 2 solution ver git fun 2 t1_2 m1_2 cache (makeChk [chk1_2]) chkt1_2 onDisk machineType + putStrLn $ "Question 1 completed: " ++ show (length ans1_2) ++ " groups" + + -- Question 2: sum v1 by id1:id2 + let question2 = "sum v1 by id1:id2" + (ans2, t2_1) <- timeIt $ do + let grouped = groupByKey (\r -> (id1 r, id2 r)) x + let result = [(k, sum [v1 r | r <- rows]) | (k, rows) <- HM.toList grouped] + return result + m2_1 <- getMemoryUsage + (chk2, chkt2_1) <- timeIt $ evaluate $ sum [snd r | r <- ans2] + writeLog task dataName inRows question2 (length ans2) 3 solution ver git fun 1 t2_1 m2_1 cache (makeChk [chk2]) chkt2_1 onDisk machineType + + -- Run 2 + (ans2_2, t2_2) <- timeIt $ do + let grouped = groupByKey (\r -> (id1 r, id2 r)) x + let result = [(k, sum [v1 r | r <- rows]) | 
(k, rows) <- HM.toList grouped] + return result + m2_2 <- getMemoryUsage + (chk2_2, chkt2_2) <- timeIt $ evaluate $ sum [snd r | r <- ans2_2] + writeLog task dataName inRows question2 (length ans2_2) 3 solution ver git fun 2 t2_2 m2_2 cache (makeChk [chk2_2]) chkt2_2 onDisk machineType + putStrLn $ "Question 2 completed: " ++ show (length ans2_2) ++ " groups" + + -- Question 3: sum v1 mean v3 by id3 + let question3 = "sum v1 mean v3 by id3" + (ans3, t3_1) <- timeIt $ do + let grouped = groupByKey id3 x + let result = [(k, sum [v1 r | r <- rows], mean [v3 r | r <- rows]) | (k, rows) <- HM.toList grouped] + return result + m3_1 <- getMemoryUsage + (chk3, chkt3_1) <- timeIt $ do + let s1 = sum [(\(_,a,_) -> a) r | r <- ans3] + let s2 = sum [(\(_,_,b) -> b) r | r <- ans3] + evaluate (s1, s2) + return (s1, s2) + writeLog task dataName inRows question3 (length ans3) 3 solution ver git fun 1 t3_1 m3_1 cache (makeChk [fst chk3, snd chk3]) chkt3_1 onDisk machineType + + -- Run 2 + (ans3_2, t3_2) <- timeIt $ do + let grouped = groupByKey id3 x + let result = [(k, sum [v1 r | r <- rows], mean [v3 r | r <- rows]) | (k, rows) <- HM.toList grouped] + return result + m3_2 <- getMemoryUsage + (chk3_2, chkt3_2) <- timeIt $ do + let s1 = sum [(\(_,a,_) -> a) r | r <- ans3_2] + let s2 = sum [(\(_,_,b) -> b) r | r <- ans3_2] + evaluate (s1, s2) + return (s1, s2) + writeLog task dataName inRows question3 (length ans3_2) 3 solution ver git fun 2 t3_2 m3_2 cache (makeChk [fst chk3_2, snd chk3_2]) chkt3_2 onDisk machineType + putStrLn $ "Question 3 completed: " ++ show (length ans3_2) ++ " groups" + + -- Question 4: mean v1:v3 by id4 + let question4 = "mean v1:v3 by id4" + (ans4, t4_1) <- timeIt $ do + let grouped = groupByKey id4 x + let result = [(k, mean [v1 r | r <- rows], mean [v2 r | r <- rows], mean [v3 r | r <- rows]) | (k, rows) <- HM.toList grouped] + return result + m4_1 <- getMemoryUsage + (chk4, chkt4_1) <- timeIt $ do + let s1 = sum [(\(_,a,_,_) -> a) r | r <- ans4] + let s2 
= sum [(\(_,_,b,_) -> b) r | r <- ans4] + let s3 = sum [(\(_,_,_,c) -> c) r | r <- ans4] + evaluate (s1, s2, s3) + return (s1, s2, s3) + writeLog task dataName inRows question4 (length ans4) 4 solution ver git fun 1 t4_1 m4_1 cache (makeChk [(\(a,_,_) -> a) chk4, (\(_,b,_) -> b) chk4, (\(_,_,c) -> c) chk4]) chkt4_1 onDisk machineType + + -- Run 2 + (ans4_2, t4_2) <- timeIt $ do + let grouped = groupByKey id4 x + let result = [(k, mean [v1 r | r <- rows], mean [v2 r | r <- rows], mean [v3 r | r <- rows]) | (k, rows) <- HM.toList grouped] + return result + m4_2 <- getMemoryUsage + (chk4_2, chkt4_2) <- timeIt $ do + let s1 = sum [(\(_,a,_,_) -> a) r | r <- ans4_2] + let s2 = sum [(\(_,_,b,_) -> b) r | r <- ans4_2] + let s3 = sum [(\(_,_,_,c) -> c) r | r <- ans4_2] + evaluate (s1, s2, s3) + return (s1, s2, s3) + writeLog task dataName inRows question4 (length ans4_2) 4 solution ver git fun 2 t4_2 m4_2 cache (makeChk [(\(a,_,_) -> a) chk4_2, (\(_,b,_) -> b) chk4_2, (\(_,_,c) -> c) chk4_2]) chkt4_2 onDisk machineType + putStrLn $ "Question 4 completed: " ++ show (length ans4_2) ++ " groups" + + -- Question 5: sum v1:v3 by id6 + let question5 = "sum v1:v3 by id6" + (ans5, t5_1) <- timeIt $ do + let grouped = groupByKey id6 x + let result = [(k, sum [v1 r | r <- rows], sum [v2 r | r <- rows], sum [v3 r | r <- rows]) | (k, rows) <- HM.toList grouped] + return result + m5_1 <- getMemoryUsage + (chk5, chkt5_1) <- timeIt $ do + let s1 = sum [(\(_,a,_,_) -> a) r | r <- ans5] + let s2 = sum [(\(_,_,b,_) -> b) r | r <- ans5] + let s3 = sum [(\(_,_,_,c) -> c) r | r <- ans5] + evaluate (s1, s2, s3) + return (s1, s2, s3) + writeLog task dataName inRows question5 (length ans5) 4 solution ver git fun 1 t5_1 m5_1 cache (makeChk [(\(a,_,_) -> a) chk5, (\(_,b,_) -> b) chk5, (\(_,_,c) -> c) chk5]) chkt5_1 onDisk machineType + + -- Run 2 + (ans5_2, t5_2) <- timeIt $ do + let grouped = groupByKey id6 x + let result = [(k, sum [v1 r | r <- rows], sum [v2 r | r <- rows], sum [v3 r | r <- 
rows]) | (k, rows) <- HM.toList grouped] + return result + m5_2 <- getMemoryUsage + (chk5_2, chkt5_2) <- timeIt $ do + let s1 = sum [(\(_,a,_,_) -> a) r | r <- ans5_2] + let s2 = sum [(\(_,_,b,_) -> b) r | r <- ans5_2] + let s3 = sum [(\(_,_,_,c) -> c) r | r <- ans5_2] + evaluate (s1, s2, s3) + return (s1, s2, s3) + writeLog task dataName inRows question5 (length ans5_2) 4 solution ver git fun 2 t5_2 m5_2 cache (makeChk [(\(a,_,_) -> a) chk5_2, (\(_,b,_) -> b) chk5_2, (\(_,_,c) -> c) chk5_2]) chkt5_2 onDisk machineType + putStrLn $ "Question 5 completed: " ++ show (length ans5_2) ++ " groups" + + putStrLn "Haskell groupby benchmark completed (5 questions implemented)!" + putStrLn "Note: Questions 6-10 would require median, regression, and top-n functions." diff --git a/haskell/haskell-benchmark.cabal b/haskell/haskell-benchmark.cabal new file mode 100644 index 00000000..3000aa92 --- /dev/null +++ b/haskell/haskell-benchmark.cabal @@ -0,0 +1,36 @@ +name: haskell-benchmark +version: 0.1.0.0 +build-type: Simple +cabal-version: >=1.10 + +executable groupby-haskell + main-is: groupby-haskell.hs + build-depends: base >= 4.7 && < 5 + , bytestring >= 0.10 + , text >= 1.2 + , vector >= 0.12 + , cassava >= 0.5 + , time >= 1.9 + , process >= 1.6 + , directory >= 1.3 + , unordered-containers >= 0.2 + , hashable >= 1.3 + , deepseq >= 1.4 + default-language: Haskell2010 + ghc-options: -O2 -threaded -rtsopts -with-rtsopts=-N + +executable join-haskell + main-is: join-haskell.hs + build-depends: base >= 4.7 && < 5 + , bytestring >= 0.10 + , text >= 1.2 + , vector >= 0.12 + , cassava >= 0.5 + , time >= 1.9 + , process >= 1.6 + , directory >= 1.3 + , unordered-containers >= 0.2 + , hashable >= 1.3 + , deepseq >= 1.4 + default-language: Haskell2010 + ghc-options: -O2 -threaded -rtsopts -with-rtsopts=-N diff --git a/haskell/join-haskell.hs b/haskell/join-haskell.hs new file mode 100644 index 00000000..a6e24ce2 --- /dev/null +++ b/haskell/join-haskell.hs @@ -0,0 +1,242 @@ +{-# LANGUAGE 
OverloadedStrings #-} +{-# LANGUAGE DeriveGeneric #-} + +import qualified Data.ByteString.Lazy as BL +import qualified Data.Csv as Csv +import qualified Data.Vector as V +import qualified Data.HashMap.Strict as HM +import Data.Hashable (Hashable) +import GHC.Generics (Generic) +import System.Environment (getEnv, lookupEnv) +import System.IO (hFlush, stdout) +import Data.Time.Clock.POSIX (getPOSIXTime) +import Control.Exception (evaluate) +import System.Process (readProcess) +import System.Directory (doesFileExist) +import Data.List (intercalate, foldl') +import qualified Data.Text as T +import Control.DeepSeq (NFData, force) + +-- Data row types +data XRow = XRow + { x_id1 :: !Int + , x_id2 :: !Int + , x_id3 :: !Int + , x_id4 :: !T.Text + , x_id5 :: !T.Text + , x_id6 :: !T.Text + , x_v1 :: !Double + } deriving (Show, Generic) + +data SmallRow = SmallRow + { s_id1 :: !Int + , s_id4 :: !T.Text + , s_v2 :: !Double + } deriving (Show, Generic) + +data MediumRow = MediumRow + { m_id1 :: !Int + , m_id2 :: !Int + , m_id4 :: !T.Text + , m_id5 :: !T.Text + , m_v2 :: !Double + } deriving (Show, Generic) + +data BigRow = BigRow + { b_id1 :: !Int + , b_id2 :: !Int + , b_id3 :: !Int + , b_id4 :: !T.Text + , b_id5 :: !T.Text + , b_id6 :: !T.Text + , b_v2 :: !Double + } deriving (Show, Generic) + +instance NFData XRow +instance NFData SmallRow +instance NFData MediumRow +instance NFData BigRow + +instance Csv.FromNamedRecord XRow where + parseNamedRecord r = XRow + <$> r Csv..: "id1" + <$> r Csv..: "id2" + <$> r Csv..: "id3" + <$> r Csv..: "id4" + <$> r Csv..: "id5" + <$> r Csv..: "id6" + <$> r Csv..: "v1" + +instance Csv.FromNamedRecord SmallRow where + parseNamedRecord r = SmallRow + <$> r Csv..: "id1" + <$> r Csv..: "id4" + <$> r Csv..: "v2" + +instance Csv.FromNamedRecord MediumRow where + parseNamedRecord r = MediumRow + <$> r Csv..: "id1" + <$> r Csv..: "id2" + <$> r Csv..: "id4" + <$> r Csv..: "id5" + <$> r Csv..: "v2" + +instance Csv.FromNamedRecord BigRow where + 
parseNamedRecord r = BigRow + <$> r Csv..: "id1" + <$> r Csv..: "id2" + <$> r Csv..: "id3" + <$> r Csv..: "id4" + <$> r Csv..: "id5" + <$> r Csv..: "id6" + <$> r Csv..: "v2" + +-- Helper functions (same as groupby) +writeLog :: String -> String -> Int -> String -> Int -> Int -> String -> String -> String -> String -> Int -> Double -> Double -> String -> String -> Double -> String -> String -> IO () +writeLog task dataName inRows question outRows outCols solution version git fun run timeSec memGb cache chk chkTimeSec onDisk machineType = do + batch <- lookupEnv "BATCH" >>= return . maybe "" id + timestamp <- getPOSIXTime + csvFile <- lookupEnv "CSV_TIME_FILE" >>= return . maybe "time.csv" id + nodename <- fmap init (readProcess "hostname" [] "") + + let comment = "" + let timeSecRound = roundTo 3 timeSec + let chkTimeSecRound = roundTo 3 chkTimeSec + let memGbRound = roundTo 3 memGb + + let logRow = intercalate "," [ + nodename, batch, show timestamp, task, dataName, show inRows, + question, show outRows, show outCols, solution, version, git, fun, + show run, show timeSecRound, show memGbRound, cache, chk, + show chkTimeSecRound, comment, onDisk, machineType + ] + + fileExists <- doesFileExist csvFile + if fileExists + then appendFile csvFile (logRow ++ "\n") + else do + let header = "nodename,batch,timestamp,task,data,in_rows,question,out_rows,out_cols,solution,version,git,fun,run,time_sec,mem_gb,cache,chk,chk_time_sec,comment,on_disk,machine_type\n" + writeFile csvFile (header ++ logRow ++ "\n") + +roundTo :: Int -> Double -> Double +roundTo n x = (fromInteger $ round $ x * (10^n)) / (10.0^^n) + +makeChk :: [Double] -> String +makeChk values = intercalate ";" (map formatVal values) + where + formatVal x = map (\c -> if c == ',' then '_' else c) (show $ roundTo 3 x) + +getMemoryUsage :: IO Double +getMemoryUsage = do + pid <- fmap init (readProcess "bash" ["-c", "echo $$"] "") + mem <- fmap (filter (/= ' ') . 
init) (readProcess "ps" ["-o", "rss=", "-p", pid] "") + let rssKb = if null mem then 0 else read mem :: Double + return (rssKb / (1024 * 1024)) + +timeIt :: NFData a => IO a -> IO (a, Double) +timeIt action = do + start <- getPOSIXTime + result <- action + _ <- evaluate (force result) + end <- getPOSIXTime + return (result, realToFrac (end - start)) + +-- Join helper for inner join on single key +innerJoinInt :: [XRow] -> [SmallRow] -> [(XRow, SmallRow)] +innerJoinInt xs ys = + let yMap = foldl' (\acc y -> HM.insertWith (++) (s_id1 y) [y] acc) HM.empty ys + in concat [[(x, y) | y <- HM.lookupDefault [] (x_id1 x) yMap] | x <- xs] + +-- Parse join_to_tbls logic +joinToTbls :: String -> [String] +joinToTbls dataName = + let parts = T.splitOn "_" (T.pack dataName) + xn = if length parts > 1 then read (T.unpack $ parts !! 1) :: Double else 1e7 + yn = [show (floor (xn / 1e6) :: Int) ++ "e4", + show (floor (xn / 1e3) :: Int) ++ "e3", + show (floor xn :: Int)] + in [T.unpack $ T.replace "NA" (T.pack (yn !! 0)) (T.pack dataName), + T.unpack $ T.replace "NA" (T.pack (yn !! 1)) (T.pack dataName), + T.unpack $ T.replace "NA" (T.pack (yn !! 2)) (T.pack dataName)] + +main :: IO () +main = do + putStrLn "# join-haskell.hs" + hFlush stdout + + let ver = "0.1.0" + let git = "cassava-csv" + let task = "join" + let solution = "haskell" + let fun = "innerJoin" + let cache = "TRUE" + let onDisk = "FALSE" + + dataName <- getEnv "SRC_DATANAME" + machineType <- getEnv "MACHINE_TYPE" + + let yDataNames = joinToTbls dataName + let srcJnX = "data/" ++ dataName ++ ".csv" + let srcJnY = ["data/" ++ yDataNames !! 0 ++ ".csv", + "data/" ++ yDataNames !! 1 ++ ".csv", + "data/" ++ yDataNames !! 2 ++ ".csv"] + + putStrLn $ "loading datasets " ++ dataName ++ ", " ++ + yDataNames !! 0 ++ ", " ++ yDataNames !! 1 ++ ", " ++ yDataNames !! 2 + hFlush stdout + + -- Load all datasets + csvDataX <- BL.readFile srcJnX + csvDataSmall <- BL.readFile (srcJnY !! 0) + csvDataMedium <- BL.readFile (srcJnY !! 
1) + csvDataBig <- BL.readFile (srcJnY !! 2) + + case (Csv.decodeByName csvDataX, Csv.decodeByName csvDataSmall, + Csv.decodeByName csvDataMedium, Csv.decodeByName csvDataBig) of + (Right (_, xRows), Right (_, smallRows), Right (_, mediumRows), Right (_, bigRows)) -> do + let x = V.toList xRows :: [XRow] + let small = V.toList smallRows :: [SmallRow] + let medium = V.toList mediumRows :: [MediumRow] + let big = V.toList bigRows :: [BigRow] + + putStrLn $ show (length x) + putStrLn $ show (length small) + putStrLn $ show (length medium) + putStrLn $ show (length big) + hFlush stdout + + putStrLn "joining..." + hFlush stdout + + let inRows = length x + + -- Question 1: small inner on int + let question1 = "small inner on int" + (ans1, t1_1) <- timeIt $ do + let result = innerJoinInt x small + return result + m1_1 <- getMemoryUsage + (chk1, chkt1_1) <- timeIt $ do + let sumV1 = sum [x_v1 xr | (xr, _) <- ans1] + let sumV2 = sum [s_v2 sr | (_, sr) <- ans1] + evaluate (sumV1, sumV2) + return (sumV1, sumV2) + writeLog task dataName inRows question1 (length ans1) 8 solution ver git fun 1 t1_1 m1_1 cache (makeChk [fst chk1, snd chk1]) chkt1_1 onDisk machineType + + -- Run 2 + (ans1_2, t1_2) <- timeIt $ do + let result = innerJoinInt x small + return result + m1_2 <- getMemoryUsage + (chk1_2, chkt1_2) <- timeIt $ do + let sumV1 = sum [x_v1 xr | (xr, _) <- ans1_2] + let sumV2 = sum [s_v2 sr | (_, sr) <- ans1_2] + evaluate (sumV1, sumV2) + return (sumV1, sumV2) + writeLog task dataName inRows question1 (length ans1_2) 8 solution ver git fun 2 t1_2 m1_2 cache (makeChk [fst chk1_2, snd chk1_2]) chkt1_2 onDisk machineType + putStrLn $ "Question 1 completed: " ++ show (length ans1_2) ++ " rows" + + putStrLn "Haskell join benchmark completed (1 question implemented)!" + putStrLn "Note: Other join questions would require additional join types and key combinations." 
+ + _ -> putStrLn "Error parsing CSV files" diff --git a/haskell/setup-haskell.sh b/haskell/setup-haskell.sh new file mode 100755 index 00000000..0cf99700 --- /dev/null +++ b/haskell/setup-haskell.sh @@ -0,0 +1,72 @@ +#!/bin/bash +set -e + +# Install Stack (Haskell build tool) if not present +if ! command -v stack &> /dev/null; then + echo "Installing Stack..." + curl -sSL https://get.haskellstack.org/ | sh +fi + +cd haskell + +# Initialize stack project if not already done +if [ ! -f "stack.yaml" ]; then + stack init --force +fi + +# Create cabal file if it doesn't exist +if [ ! -f "haskell-benchmark.cabal" ]; then + cat > haskell-benchmark.cabal << 'EOF' +name: haskell-benchmark +version: 0.1.0.0 +build-type: Simple +cabal-version: >=1.10 + +executable groupby-haskell + main-is: groupby-haskell.hs + build-depends: base >= 4.7 && < 5 + , Frames >= 0.7 + , vinyl >= 0.13 + , text >= 1.2 + , bytestring >= 0.10 + , vector >= 0.12 + , cassava >= 0.5 + , pipes >= 4.3 + , time >= 1.9 + , process >= 1.6 + , directory >= 1.3 + , containers >= 0.6 + , hashable >= 1.3 + , unordered-containers >= 0.2 + default-language: Haskell2010 + ghc-options: -O2 -threaded -rtsopts -with-rtsopts=-N + +executable join-haskell + main-is: join-haskell.hs + build-depends: base >= 4.7 && < 5 + , Frames >= 0.7 + , vinyl >= 0.13 + , text >= 1.2 + , bytestring >= 0.10 + , vector >= 0.12 + , cassava >= 0.5 + , pipes >= 4.3 + , time >= 1.9 + , process >= 1.6 + , directory >= 1.3 + , containers >= 0.6 + , hashable >= 1.3 + , unordered-containers >= 0.2 + default-language: Haskell2010 + ghc-options: -O2 -threaded -rtsopts -with-rtsopts=-N +EOF +fi + +# Install dependencies and build +stack setup +stack build --only-dependencies +stack build + +cd .. 
+ +./haskell/ver-haskell.sh diff --git a/haskell/upg-haskell.sh b/haskell/upg-haskell.sh new file mode 100755 index 00000000..d453cdce --- /dev/null +++ b/haskell/upg-haskell.sh @@ -0,0 +1,13 @@ +#!/bin/bash +set -e + +cd haskell + +# Update stack resolver and dependencies +stack update +stack upgrade +stack build --only-dependencies + +cd .. + +./haskell/ver-haskell.sh diff --git a/haskell/ver-haskell.sh b/haskell/ver-haskell.sh new file mode 100755 index 00000000..a377718b --- /dev/null +++ b/haskell/ver-haskell.sh @@ -0,0 +1,28 @@ +#!/bin/bash +set -e + +cd haskell + +# Get GHC version (Haskell compiler) +GHC_VERSION=$(stack ghc -- --numeric-version 2>/dev/null || echo "unknown") + +# Get Frames version from stack +FRAMES_VERSION=$(stack list-dependencies --depth 1 2>/dev/null | grep "^Frames " | awk '{print $2}' || echo "unknown") + +# If Frames version is unknown, try from package.yaml or stack.yaml +if [ "$FRAMES_VERSION" = "unknown" ]; then + FRAMES_VERSION=$(stack exec -- ghc-pkg field Frames version 2>/dev/null | awk '{print $2}' || echo "0.7.0") +fi + +# Write version to VERSION file +echo "${FRAMES_VERSION}" > VERSION + +# Get git revision if available +GIT_REV=$(cd $(stack path --local-install-root 2>/dev/null || echo ".") && git rev-parse --short HEAD 2>/dev/null || echo "") +if [ -n "$GIT_REV" ]; then + echo "$GIT_REV" > REVISION +else + echo "GHC-${GHC_VERSION}" > REVISION +fi + +cd .. From 6482f5a5c12f9eb76c9612033ea53c231b71092e Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 19 Nov 2025 07:01:08 +0000 Subject: [PATCH 2/6] Update Haskell benchmark to use mchav/dataframe library Completely rewrote the Haskell benchmark implementation to use the mchav/dataframe library instead of Cassava CSV. 
Changes: - Updated dependencies: Now uses dataframe >= 0.3 - Rewrote groupby-haskell.hs: Uses D.groupBy, D.aggregate with F.sum/F.mean - Rewrote join-haskell.hs: Implements all 5 join questions using DJ.innerJoin/leftJoin - Updated setup scripts: Simplified installation using Stack - Updated README: Documents mchav/dataframe library and API usage Implementation details: - Groupby: 5 questions using type-safe expression DSL - Join: All 5 questions using innerJoin and leftJoin operations - Uses TypeApplications for type-safe column operations - Leverages dataframe's efficient grouping and joining The implementation now properly uses the mchav/dataframe library's API with D.readCsv, D.groupBy, D.aggregate, and join operations. --- haskell/README.md | 67 ++++- haskell/VERSION | 2 +- haskell/groupby-haskell.hs | 469 ++++++++++++++++---------------- haskell/haskell-benchmark.cabal | 12 +- haskell/join-haskell.hs | 379 +++++++++++++++----------- haskell/setup-haskell.sh | 48 ---- haskell/ver-haskell.sh | 20 +- 7 files changed, 516 insertions(+), 481 deletions(-) diff --git a/haskell/README.md b/haskell/README.md index dd78a204..d8f9abe2 100644 --- a/haskell/README.md +++ b/haskell/README.md @@ -1,13 +1,26 @@ -# Haskell Dataframe Benchmark +# Haskell DataFrame Benchmark -This benchmark entry uses Haskell with the Cassava CSV library to implement dataframe operations. +This benchmark entry uses Haskell with the `mchav/dataframe` library to implement dataframe operations. 
## Implementation Details - **Language**: Haskell (GHC) -- **CSV Library**: Cassava +- **DataFrame Library**: [mchav/dataframe](https://github.com/mchav/dataframe) - **Build Tool**: Stack +## About mchav/dataframe + +The `dataframe` library is a fast, safe, and intuitive DataFrame library for Haskell that provides: +- Type-safe column operations +- Familiar operations for users coming from pandas, dplyr, or polars +- Concise, declarative, and composable data pipelines +- Static typing that catches many bugs at compile time + +Resources: +- GitHub: https://github.com/mchav/dataframe +- Hackage: https://hackage.haskell.org/package/dataframe +- Documentation: https://dataframe.readthedocs.io/ + ## Implemented Benchmarks ### Groupby (`groupby-haskell.hs`) @@ -18,13 +31,19 @@ Implements 5 out of 10 groupby questions: 4. mean v1:v3 by id4 5. sum v1:v3 by id6 +Uses `D.groupBy` and `D.aggregate` with expression DSL (`F.sum`, `F.mean`). + Note: Questions 6-10 would require additional statistical functions (median, standard deviation, regression, top-n selection). ### Join (`join-haskell.hs`) -Implements 1 out of 5 join questions: +Implements all 5 join questions: 1. small inner on int +2. medium inner on int +3. medium outer on int (using leftJoin) +4. medium inner on factor +5. big inner on int -Note: Additional join questions would require implementation of outer joins and multi-key joins. +Uses `DJ.innerJoin` and `DJ.leftJoin` from `DataFrame.Operations.Join`. ## Setup @@ -33,14 +52,34 @@ Run the setup script to install dependencies: ./haskell/setup-haskell.sh ``` -## Notes +This will: +1. Install Stack (if not present) +2. Initialize the Stack project +3. Build all dependencies +4. 
Compile the benchmark executables + +## API Usage Examples + +```haskell +-- Read CSV +df <- D.readCsv "data/file.csv" + +-- GroupBy with aggregation +let grouped = D.groupBy ["id1"] df +let result = D.aggregate [F.sum (F.col @Double "v1") `F.as` "v1_sum"] grouped + +-- Inner Join +let joined = DJ.innerJoin ["id1"] df1 df2 + +-- Get dimensions +let (rows, cols) = D.dimensions df +``` + +## Performance Notes -This is a basic implementation demonstrating Haskell's capabilities for dataframe operations. The implementation uses: -- Strict evaluation with NFData for proper benchmarking -- HashMap-based grouping for efficient aggregations -- Standard Haskell list operations for data processing +The implementation uses: +- Type-safe column operations with `TypeApplications` +- Expression DSL for clean aggregation syntax +- Efficient grouping and joining operations from the dataframe library -For production use, consider using more specialized libraries like: -- Frames (type-safe dataframes) -- HaskellDB (database integration) -- Haskelltable (in-memory data tables) +This benchmark demonstrates Haskell's capabilities for high-performance dataframe operations with the additional benefits of static typing and functional programming. 
diff --git a/haskell/VERSION b/haskell/VERSION index 6e8bf73a..1c09c74e 100644 --- a/haskell/VERSION +++ b/haskell/VERSION @@ -1 +1 @@ -0.1.0 +0.3.3 diff --git a/haskell/groupby-haskell.hs b/haskell/groupby-haskell.hs index 76cdb7d2..c796dffd 100644 --- a/haskell/groupby-haskell.hs +++ b/haskell/groupby-haskell.hs @@ -1,50 +1,17 @@ {-# LANGUAGE OverloadedStrings #-} -{-# LANGUAGE DeriveGeneric #-} +{-# LANGUAGE TypeApplications #-} -import qualified Data.ByteString.Lazy as BL -import qualified Data.Csv as Csv +import qualified DataFrame as D +import qualified DataFrame.Expressions as F +import qualified Data.Text as T import qualified Data.Vector as V -import qualified Data.HashMap.Strict as HM -import Data.Hashable (Hashable) -import GHC.Generics (Generic) import System.Environment (getEnv, lookupEnv) import System.IO (hFlush, stdout, hPutStrLn, stderr) import Data.Time.Clock.POSIX (getPOSIXTime) -import Text.Printf (printf) import Control.Exception (evaluate) import System.Process (readProcess) import System.Directory (doesFileExist) -import Data.List (intercalate, foldl') -import qualified Data.Text as T -import qualified Data.Text.Encoding as TE -import Control.DeepSeq (NFData, force) - --- Data row type -data Row = Row - { id1 :: !T.Text - , id2 :: !T.Text - , id3 :: !T.Text - , id4 :: !Int - , id5 :: !Int - , id6 :: !Int - , v1 :: !Double - , v2 :: !Double - , v3 :: !Double - } deriving (Show, Generic) - -instance NFData Row - -instance Csv.FromNamedRecord Row where - parseNamedRecord r = Row - <$> r Csv..: "id1" - <$> r Csv..: "id2" - <$> r Csv..: "id3" - <$> r Csv..: "id4" - <$> r Csv..: "id5" - <$> r Csv..: "id6" - <$> r Csv..: "v1" - <$> r Csv..: "v2" - <$> r Csv..: "v3" +import Data.List (intercalate) -- Helper functions for logging writeLog :: String -> String -> Int -> String -> Int -> Int -> String -> String -> String -> String -> Int -> Double -> Double -> String -> String -> Double -> String -> String -> IO () @@ -60,28 +27,10 @@ writeLog task 
dataName inRows question outRows outCols solution version git fun let memGbRound = roundTo 3 memGb let logRow = intercalate "," [ - nodename, - batch, - show timestamp, - task, - dataName, - show inRows, - question, - show outRows, - show outCols, - solution, - version, - git, - fun, - show run, - show timeSecRound, - show memGbRound, - cache, - chk, - show chkTimeSecRound, - comment, - onDisk, - machineType + nodename, batch, show timestamp, task, dataName, show inRows, + question, show outRows, show outCols, solution, version, git, fun, + show run, show timeSecRound, show memGbRound, cache, chk, + show chkTimeSecRound, comment, onDisk, machineType ] fileExists <- doesFileExist csvFile @@ -101,51 +50,26 @@ makeChk values = intercalate ";" (map formatVal values) getMemoryUsage :: IO Double getMemoryUsage = do - -- Get RSS memory in GB using ps command pid <- fmap init (readProcess "bash" ["-c", "echo $$"] "") mem <- fmap (filter (/= ' ') . init) (readProcess "ps" ["-o", "rss=", "-p", pid] "") let rssKb = if null mem then 0 else read mem :: Double - return (rssKb / (1024 * 1024)) -- Convert KB to GB + return (rssKb / (1024 * 1024)) --- Timing helper -timeIt :: NFData a => IO a -> IO (a, Double) +timeIt :: IO a -> IO (a, Double) timeIt action = do start <- getPOSIXTime result <- action - _ <- evaluate (force result) + _ <- evaluate result end <- getPOSIXTime return (result, realToFrac (end - start)) --- Group by helper -groupByKey :: (Eq k, Hashable k) => (a -> k) -> [a] -> HM.HashMap k [a] -groupByKey f = foldl' (\acc x -> HM.insertWith (++) (f x) [x] acc) HM.empty - --- Mean helper -mean :: [Double] -> Double -mean xs = sum xs / fromIntegral (length xs) - --- Median helper -median :: [Double] -> Double -median [] = 0 -median xs = let sorted = V.fromList $ foldl' (\acc x -> x:acc) [] xs - len = V.length sorted - in if len `mod` 2 == 0 - then (sorted V.! (len `div` 2 - 1) + sorted V.! (len `div` 2)) / 2 - else sorted V.! 
(len `div` 2) - --- Standard deviation helper -stdDev :: [Double] -> Double -stdDev xs = let m = mean xs - variance = mean [(x - m) ^ 2 | x <- xs] - in sqrt variance - main :: IO () main = do putStrLn "# groupby-haskell.hs" hFlush stdout - let ver = "0.1.0" - let git = "cassava-csv" + let ver = "0.3.3" + let git = "dataframe" let task = "groupby" let solution = "haskell" let fun = "groupBy" @@ -159,7 +83,7 @@ main = do putStrLn $ "loading dataset " ++ dataName hFlush stdout - -- Check if data has NAs (simplified check) + -- Check if data has NAs let parts = T.splitOn "_" (T.pack dataName) let naFlag = if length parts > 3 then read (T.unpack $ parts !! 3) :: Int else 0 @@ -168,148 +92,221 @@ main = do hPutStrLn stderr "skip due to na_flag>0" return () else do - -- Load CSV data - csvData <- BL.readFile srcFile - case Csv.decodeByName csvData of - Left err -> do - putStrLn $ "Error parsing CSV: " ++ err - return () - Right (_, rows) -> do - let x = V.toList rows :: [Row] - let inRows = length x - putStrLn $ show inRows - hFlush stdout - - putStrLn "grouping..." 
- hFlush stdout - - -- Question 1: sum v1 by id1 - let question1 = "sum v1 by id1" - (ans1, t1_1) <- timeIt $ do - let grouped = groupByKey id1 x - let result = [(k, sum [v1 r | r <- rows]) | (k, rows) <- HM.toList grouped] - return result - m1_1 <- getMemoryUsage - (chk1, chkt1_1) <- timeIt $ evaluate $ sum [snd r | r <- ans1] - writeLog task dataName inRows question1 (length ans1) 2 solution ver git fun 1 t1_1 m1_1 cache (makeChk [chk1]) chkt1_1 onDisk machineType - - -- Run 2 - (ans1_2, t1_2) <- timeIt $ do - let grouped = groupByKey id1 x - let result = [(k, sum [v1 r | r <- rows]) | (k, rows) <- HM.toList grouped] - return result - m1_2 <- getMemoryUsage - (chk1_2, chkt1_2) <- timeIt $ evaluate $ sum [snd r | r <- ans1_2] - writeLog task dataName inRows question1 (length ans1_2) 2 solution ver git fun 2 t1_2 m1_2 cache (makeChk [chk1_2]) chkt1_2 onDisk machineType - putStrLn $ "Question 1 completed: " ++ show (length ans1_2) ++ " groups" - - -- Question 2: sum v1 by id1:id2 - let question2 = "sum v1 by id1:id2" - (ans2, t2_1) <- timeIt $ do - let grouped = groupByKey (\r -> (id1 r, id2 r)) x - let result = [(k, sum [v1 r | r <- rows]) | (k, rows) <- HM.toList grouped] - return result - m2_1 <- getMemoryUsage - (chk2, chkt2_1) <- timeIt $ evaluate $ sum [snd r | r <- ans2] - writeLog task dataName inRows question2 (length ans2) 3 solution ver git fun 1 t2_1 m2_1 cache (makeChk [chk2]) chkt2_1 onDisk machineType - - -- Run 2 - (ans2_2, t2_2) <- timeIt $ do - let grouped = groupByKey (\r -> (id1 r, id2 r)) x - let result = [(k, sum [v1 r | r <- rows]) | (k, rows) <- HM.toList grouped] - return result - m2_2 <- getMemoryUsage - (chk2_2, chkt2_2) <- timeIt $ evaluate $ sum [snd r | r <- ans2_2] - writeLog task dataName inRows question2 (length ans2_2) 3 solution ver git fun 2 t2_2 m2_2 cache (makeChk [chk2_2]) chkt2_2 onDisk machineType - putStrLn $ "Question 2 completed: " ++ show (length ans2_2) ++ " groups" - - -- Question 3: sum v1 mean v3 by id3 - let 
question3 = "sum v1 mean v3 by id3" - (ans3, t3_1) <- timeIt $ do - let grouped = groupByKey id3 x - let result = [(k, sum [v1 r | r <- rows], mean [v3 r | r <- rows]) | (k, rows) <- HM.toList grouped] - return result - m3_1 <- getMemoryUsage - (chk3, chkt3_1) <- timeIt $ do - let s1 = sum [(\(_,a,_) -> a) r | r <- ans3] - let s2 = sum [(\(_,_,b) -> b) r | r <- ans3] - evaluate (s1, s2) - return (s1, s2) - writeLog task dataName inRows question3 (length ans3) 3 solution ver git fun 1 t3_1 m3_1 cache (makeChk [fst chk3, snd chk3]) chkt3_1 onDisk machineType - - -- Run 2 - (ans3_2, t3_2) <- timeIt $ do - let grouped = groupByKey id3 x - let result = [(k, sum [v1 r | r <- rows], mean [v3 r | r <- rows]) | (k, rows) <- HM.toList grouped] - return result - m3_2 <- getMemoryUsage - (chk3_2, chkt3_2) <- timeIt $ do - let s1 = sum [(\(_,a,_) -> a) r | r <- ans3_2] - let s2 = sum [(\(_,_,b) -> b) r | r <- ans3_2] - evaluate (s1, s2) - return (s1, s2) - writeLog task dataName inRows question3 (length ans3_2) 3 solution ver git fun 2 t3_2 m3_2 cache (makeChk [fst chk3_2, snd chk3_2]) chkt3_2 onDisk machineType - putStrLn $ "Question 3 completed: " ++ show (length ans3_2) ++ " groups" - - -- Question 4: mean v1:v3 by id4 - let question4 = "mean v1:v3 by id4" - (ans4, t4_1) <- timeIt $ do - let grouped = groupByKey id4 x - let result = [(k, mean [v1 r | r <- rows], mean [v2 r | r <- rows], mean [v3 r | r <- rows]) | (k, rows) <- HM.toList grouped] - return result - m4_1 <- getMemoryUsage - (chk4, chkt4_1) <- timeIt $ do - let s1 = sum [(\(_,a,_,_) -> a) r | r <- ans4] - let s2 = sum [(\(_,_,b,_) -> b) r | r <- ans4] - let s3 = sum [(\(_,_,_,c) -> c) r | r <- ans4] - evaluate (s1, s2, s3) - return (s1, s2, s3) - writeLog task dataName inRows question4 (length ans4) 4 solution ver git fun 1 t4_1 m4_1 cache (makeChk [(\(a,_,_) -> a) chk4, (\(_,b,_) -> b) chk4, (\(_,_,c) -> c) chk4]) chkt4_1 onDisk machineType - - -- Run 2 - (ans4_2, t4_2) <- timeIt $ do - let grouped = groupByKey 
id4 x - let result = [(k, mean [v1 r | r <- rows], mean [v2 r | r <- rows], mean [v3 r | r <- rows]) | (k, rows) <- HM.toList grouped] - return result - m4_2 <- getMemoryUsage - (chk4_2, chkt4_2) <- timeIt $ do - let s1 = sum [(\(_,a,_,_) -> a) r | r <- ans4_2] - let s2 = sum [(\(_,_,b,_) -> b) r | r <- ans4_2] - let s3 = sum [(\(_,_,_,c) -> c) r | r <- ans4_2] - evaluate (s1, s2, s3) - return (s1, s2, s3) - writeLog task dataName inRows question4 (length ans4_2) 4 solution ver git fun 2 t4_2 m4_2 cache (makeChk [(\(a,_,_) -> a) chk4_2, (\(_,b,_) -> b) chk4_2, (\(_,_,c) -> c) chk4_2]) chkt4_2 onDisk machineType - putStrLn $ "Question 4 completed: " ++ show (length ans4_2) ++ " groups" - - -- Question 5: sum v1:v3 by id6 - let question5 = "sum v1:v3 by id6" - (ans5, t5_1) <- timeIt $ do - let grouped = groupByKey id6 x - let result = [(k, sum [v1 r | r <- rows], sum [v2 r | r <- rows], sum [v3 r | r <- rows]) | (k, rows) <- HM.toList grouped] - return result - m5_1 <- getMemoryUsage - (chk5, chkt5_1) <- timeIt $ do - let s1 = sum [(\(_,a,_,_) -> a) r | r <- ans5] - let s2 = sum [(\(_,_,b,_) -> b) r | r <- ans5] - let s3 = sum [(\(_,_,_,c) -> c) r | r <- ans5] - evaluate (s1, s2, s3) - return (s1, s2, s3) - writeLog task dataName inRows question5 (length ans5) 4 solution ver git fun 1 t5_1 m5_1 cache (makeChk [(\(a,_,_) -> a) chk5, (\(_,b,_) -> b) chk5, (\(_,_,c) -> c) chk5]) chkt5_1 onDisk machineType - - -- Run 2 - (ans5_2, t5_2) <- timeIt $ do - let grouped = groupByKey id6 x - let result = [(k, sum [v1 r | r <- rows], sum [v2 r | r <- rows], sum [v3 r | r <- rows]) | (k, rows) <- HM.toList grouped] - return result - m5_2 <- getMemoryUsage - (chk5_2, chkt5_2) <- timeIt $ do - let s1 = sum [(\(_,a,_,_) -> a) r | r <- ans5_2] - let s2 = sum [(\(_,_,b,_) -> b) r | r <- ans5_2] - let s3 = sum [(\(_,_,_,c) -> c) r | r <- ans5_2] - evaluate (s1, s2, s3) - return (s1, s2, s3) - writeLog task dataName inRows question5 (length ans5_2) 4 solution ver git fun 2 t5_2 m5_2 
cache (makeChk [(\(a,_,_) -> a) chk5_2, (\(_,b,_) -> b) chk5_2, (\(_,_,c) -> c) chk5_2]) chkt5_2 onDisk machineType - putStrLn $ "Question 5 completed: " ++ show (length ans5_2) ++ " groups" - - putStrLn "Haskell groupby benchmark completed (5 questions implemented)!" - putStrLn "Note: Questions 6-10 would require median, regression, and top-n functions." + -- Load CSV data using dataframe + x <- D.readCsv srcFile + + let (inRows, _) = D.dimensions x + putStrLn $ show inRows + hFlush stdout + + putStrLn "grouping..." + hFlush stdout + + -- Question 1: sum v1 by id1 + let question1 = "sum v1 by id1" + (ans1, t1_1) <- timeIt $ do + let grouped = D.groupBy ["id1"] x + let result = D.aggregate [F.sum (F.col @Double "v1") `F.as` "v1_sum"] grouped + return result + m1_1 <- getMemoryUsage + let (outRows1, outCols1) = D.dimensions ans1 + (chk1, chkt1_1) <- timeIt $ do + let sumV1 = case D.columnAsDoubleVector "v1_sum" ans1 of + Right vec -> V.sum vec + Left _ -> 0 + evaluate sumV1 + return sumV1 + writeLog task dataName inRows question1 outRows1 outCols1 solution ver git fun 1 t1_1 m1_1 cache (makeChk [chk1]) chkt1_1 onDisk machineType + + -- Run 2 + (ans1_2, t1_2) <- timeIt $ do + let grouped = D.groupBy ["id1"] x + let result = D.aggregate [F.sum (F.col @Double "v1") `F.as` "v1_sum"] grouped + return result + m1_2 <- getMemoryUsage + let (outRows1_2, outCols1_2) = D.dimensions ans1_2 + (chk1_2, chkt1_2) <- timeIt $ do + let sumV1 = case D.columnAsDoubleVector "v1_sum" ans1_2 of + Right vec -> V.sum vec + Left _ -> 0 + evaluate sumV1 + return sumV1 + writeLog task dataName inRows question1 outRows1_2 outCols1_2 solution ver git fun 2 t1_2 m1_2 cache (makeChk [chk1_2]) chkt1_2 onDisk machineType + putStrLn $ "Question 1 completed: " ++ show outRows1_2 ++ " groups" + + -- Question 2: sum v1 by id1:id2 + let question2 = "sum v1 by id1:id2" + (ans2, t2_1) <- timeIt $ do + let grouped = D.groupBy ["id1", "id2"] x + let result = D.aggregate [F.sum (F.col @Double "v1") `F.as` 
"v1_sum"] grouped + return result + m2_1 <- getMemoryUsage + let (outRows2, outCols2) = D.dimensions ans2 + (chk2, chkt2_1) <- timeIt $ do + let sumV1 = case D.columnAsDoubleVector "v1_sum" ans2 of + Right vec -> V.sum vec + Left _ -> 0 + evaluate sumV1 + return sumV1 + writeLog task dataName inRows question2 outRows2 outCols2 solution ver git fun 1 t2_1 m2_1 cache (makeChk [chk2]) chkt2_1 onDisk machineType + + -- Run 2 + (ans2_2, t2_2) <- timeIt $ do + let grouped = D.groupBy ["id1", "id2"] x + let result = D.aggregate [F.sum (F.col @Double "v1") `F.as` "v1_sum"] grouped + return result + m2_2 <- getMemoryUsage + let (outRows2_2, outCols2_2) = D.dimensions ans2_2 + (chk2_2, chkt2_2) <- timeIt $ do + let sumV1 = case D.columnAsDoubleVector "v1_sum" ans2_2 of + Right vec -> V.sum vec + Left _ -> 0 + evaluate sumV1 + return sumV1 + writeLog task dataName inRows question2 outRows2_2 outCols2_2 solution ver git fun 2 t2_2 m2_2 cache (makeChk [chk2_2]) chkt2_2 onDisk machineType + putStrLn $ "Question 2 completed: " ++ show outRows2_2 ++ " groups" + + -- Question 3: sum v1 mean v3 by id3 + let question3 = "sum v1 mean v3 by id3" + (ans3, t3_1) <- timeIt $ do + let grouped = D.groupBy ["id3"] x + let result = D.aggregate + [F.sum (F.col @Double "v1") `F.as` "v1_sum", + F.mean (F.col @Double "v3") `F.as` "v3_mean"] grouped + return result + m3_1 <- getMemoryUsage + let (outRows3, outCols3) = D.dimensions ans3 + (chk3, chkt3_1) <- timeIt $ do + let sumV1 = case D.columnAsDoubleVector "v1_sum" ans3 of + Right vec -> V.sum vec + Left _ -> 0 + let sumV3 = case D.columnAsDoubleVector "v3_mean" ans3 of + Right vec -> V.sum vec + Left _ -> 0 + evaluate (sumV1, sumV3) + return (sumV1, sumV3) + writeLog task dataName inRows question3 outRows3 outCols3 solution ver git fun 1 t3_1 m3_1 cache (makeChk [fst chk3, snd chk3]) chkt3_1 onDisk machineType + + -- Run 2 + (ans3_2, t3_2) <- timeIt $ do + let grouped = D.groupBy ["id3"] x + let result = D.aggregate + [F.sum (F.col @Double 
"v1") `F.as` "v1_sum", + F.mean (F.col @Double "v3") `F.as` "v3_mean"] grouped + return result + m3_2 <- getMemoryUsage + let (outRows3_2, outCols3_2) = D.dimensions ans3_2 + (chk3_2, chkt3_2) <- timeIt $ do + let sumV1 = case D.columnAsDoubleVector "v1_sum" ans3_2 of + Right vec -> V.sum vec + Left _ -> 0 + let sumV3 = case D.columnAsDoubleVector "v3_mean" ans3_2 of + Right vec -> V.sum vec + Left _ -> 0 + evaluate (sumV1, sumV3) + return (sumV1, sumV3) + writeLog task dataName inRows question3 outRows3_2 outCols3_2 solution ver git fun 2 t3_2 m3_2 cache (makeChk [fst chk3_2, snd chk3_2]) chkt3_2 onDisk machineType + putStrLn $ "Question 3 completed: " ++ show outRows3_2 ++ " groups" + + -- Question 4: mean v1:v3 by id4 + let question4 = "mean v1:v3 by id4" + (ans4, t4_1) <- timeIt $ do + let grouped = D.groupBy ["id4"] x + let result = D.aggregate + [F.mean (F.col @Double "v1") `F.as` "v1_mean", + F.mean (F.col @Double "v2") `F.as` "v2_mean", + F.mean (F.col @Double "v3") `F.as` "v3_mean"] grouped + return result + m4_1 <- getMemoryUsage + let (outRows4, outCols4) = D.dimensions ans4 + (chk4, chkt4_1) <- timeIt $ do + let sumV1 = case D.columnAsDoubleVector "v1_mean" ans4 of + Right vec -> V.sum vec + Left _ -> 0 + let sumV2 = case D.columnAsDoubleVector "v2_mean" ans4 of + Right vec -> V.sum vec + Left _ -> 0 + let sumV3 = case D.columnAsDoubleVector "v3_mean" ans4 of + Right vec -> V.sum vec + Left _ -> 0 + evaluate (sumV1, sumV2, sumV3) + return (sumV1, sumV2, sumV3) + writeLog task dataName inRows question4 outRows4 outCols4 solution ver git fun 1 t4_1 m4_1 cache (makeChk [(\(a,_,_) -> a) chk4, (\(_,b,_) -> b) chk4, (\(_,_,c) -> c) chk4]) chkt4_1 onDisk machineType + + -- Run 2 + (ans4_2, t4_2) <- timeIt $ do + let grouped = D.groupBy ["id4"] x + let result = D.aggregate + [F.mean (F.col @Double "v1") `F.as` "v1_mean", + F.mean (F.col @Double "v2") `F.as` "v2_mean", + F.mean (F.col @Double "v3") `F.as` "v3_mean"] grouped + return result + m4_2 <- 
getMemoryUsage + let (outRows4_2, outCols4_2) = D.dimensions ans4_2 + (chk4_2, chkt4_2) <- timeIt $ do + let sumV1 = case D.columnAsDoubleVector "v1_mean" ans4_2 of + Right vec -> V.sum vec + Left _ -> 0 + let sumV2 = case D.columnAsDoubleVector "v2_mean" ans4_2 of + Right vec -> V.sum vec + Left _ -> 0 + let sumV3 = case D.columnAsDoubleVector "v3_mean" ans4_2 of + Right vec -> V.sum vec + Left _ -> 0 + evaluate (sumV1, sumV2, sumV3) + return (sumV1, sumV2, sumV3) + writeLog task dataName inRows question4 outRows4_2 outCols4_2 solution ver git fun 2 t4_2 m4_2 cache (makeChk [(\(a,_,_) -> a) chk4_2, (\(_,b,_) -> b) chk4_2, (\(_,_,c) -> c) chk4_2]) chkt4_2 onDisk machineType + putStrLn $ "Question 4 completed: " ++ show outRows4_2 ++ " groups" + + -- Question 5: sum v1:v3 by id6 + let question5 = "sum v1:v3 by id6" + (ans5, t5_1) <- timeIt $ do + let grouped = D.groupBy ["id6"] x + let result = D.aggregate + [F.sum (F.col @Double "v1") `F.as` "v1_sum", + F.sum (F.col @Double "v2") `F.as` "v2_sum", + F.sum (F.col @Double "v3") `F.as` "v3_sum"] grouped + return result + m5_1 <- getMemoryUsage + let (outRows5, outCols5) = D.dimensions ans5 + (chk5, chkt5_1) <- timeIt $ do + let sumV1 = case D.columnAsDoubleVector "v1_sum" ans5 of + Right vec -> V.sum vec + Left _ -> 0 + let sumV2 = case D.columnAsDoubleVector "v2_sum" ans5 of + Right vec -> V.sum vec + Left _ -> 0 + let sumV3 = case D.columnAsDoubleVector "v3_sum" ans5 of + Right vec -> V.sum vec + Left _ -> 0 + evaluate (sumV1, sumV2, sumV3) + return (sumV1, sumV2, sumV3) + writeLog task dataName inRows question5 outRows5 outCols5 solution ver git fun 1 t5_1 m5_1 cache (makeChk [(\(a,_,_) -> a) chk5, (\(_,b,_) -> b) chk5, (\(_,_,c) -> c) chk5]) chkt5_1 onDisk machineType + + -- Run 2 + (ans5_2, t5_2) <- timeIt $ do + let grouped = D.groupBy ["id6"] x + let result = D.aggregate + [F.sum (F.col @Double "v1") `F.as` "v1_sum", + F.sum (F.col @Double "v2") `F.as` "v2_sum", + F.sum (F.col @Double "v3") `F.as` "v3_sum"] 
grouped + return result + m5_2 <- getMemoryUsage + let (outRows5_2, outCols5_2) = D.dimensions ans5_2 + (chk5_2, chkt5_2) <- timeIt $ do + let sumV1 = case D.columnAsDoubleVector "v1_sum" ans5_2 of + Right vec -> V.sum vec + Left _ -> 0 + let sumV2 = case D.columnAsDoubleVector "v2_sum" ans5_2 of + Right vec -> V.sum vec + Left _ -> 0 + let sumV3 = case D.columnAsDoubleVector "v3_sum" ans5_2 of + Right vec -> V.sum vec + Left _ -> 0 + evaluate (sumV1, sumV2, sumV3) + return (sumV1, sumV2, sumV3) + writeLog task dataName inRows question5 outRows5_2 outCols5_2 solution ver git fun 2 t5_2 m5_2 cache (makeChk [(\(a,_,_) -> a) chk5_2, (\(_,b,_) -> b) chk5_2, (\(_,_,c) -> c) chk5_2]) chkt5_2 onDisk machineType + putStrLn $ "Question 5 completed: " ++ show outRows5_2 ++ " groups" + + putStrLn "Haskell dataframe groupby benchmark completed (5 questions implemented)!" + putStrLn "Note: Questions 6-10 would require median, regression, and top-n functions." diff --git a/haskell/haskell-benchmark.cabal b/haskell/haskell-benchmark.cabal index 3000aa92..5775f1a0 100644 --- a/haskell/haskell-benchmark.cabal +++ b/haskell/haskell-benchmark.cabal @@ -6,31 +6,23 @@ cabal-version: >=1.10 executable groupby-haskell main-is: groupby-haskell.hs build-depends: base >= 4.7 && < 5 - , bytestring >= 0.10 + , dataframe >= 0.3 , text >= 1.2 , vector >= 0.12 - , cassava >= 0.5 , time >= 1.9 , process >= 1.6 , directory >= 1.3 - , unordered-containers >= 0.2 - , hashable >= 1.3 - , deepseq >= 1.4 default-language: Haskell2010 ghc-options: -O2 -threaded -rtsopts -with-rtsopts=-N executable join-haskell main-is: join-haskell.hs build-depends: base >= 4.7 && < 5 - , bytestring >= 0.10 + , dataframe >= 0.3 , text >= 1.2 , vector >= 0.12 - , cassava >= 0.5 , time >= 1.9 , process >= 1.6 , directory >= 1.3 - , unordered-containers >= 0.2 - , hashable >= 1.3 - , deepseq >= 1.4 default-language: Haskell2010 ghc-options: -O2 -threaded -rtsopts -with-rtsopts=-N diff --git a/haskell/join-haskell.hs 
b/haskell/join-haskell.hs index a6e24ce2..32d0edcf 100644 --- a/haskell/join-haskell.hs +++ b/haskell/join-haskell.hs @@ -1,97 +1,19 @@ {-# LANGUAGE OverloadedStrings #-} -{-# LANGUAGE DeriveGeneric #-} +{-# LANGUAGE TypeApplications #-} -import qualified Data.ByteString.Lazy as BL -import qualified Data.Csv as Csv +import qualified DataFrame as D +import qualified DataFrame.Operations.Join as DJ +import qualified Data.Text as T import qualified Data.Vector as V -import qualified Data.HashMap.Strict as HM -import Data.Hashable (Hashable) -import GHC.Generics (Generic) import System.Environment (getEnv, lookupEnv) import System.IO (hFlush, stdout) import Data.Time.Clock.POSIX (getPOSIXTime) import Control.Exception (evaluate) import System.Process (readProcess) import System.Directory (doesFileExist) -import Data.List (intercalate, foldl') -import qualified Data.Text as T -import Control.DeepSeq (NFData, force) - --- Data row types -data XRow = XRow - { x_id1 :: !Int - , x_id2 :: !Int - , x_id3 :: !Int - , x_id4 :: !T.Text - , x_id5 :: !T.Text - , x_id6 :: !T.Text - , x_v1 :: !Double - } deriving (Show, Generic) - -data SmallRow = SmallRow - { s_id1 :: !Int - , s_id4 :: !T.Text - , s_v2 :: !Double - } deriving (Show, Generic) - -data MediumRow = MediumRow - { m_id1 :: !Int - , m_id2 :: !Int - , m_id4 :: !T.Text - , m_id5 :: !T.Text - , m_v2 :: !Double - } deriving (Show, Generic) - -data BigRow = BigRow - { b_id1 :: !Int - , b_id2 :: !Int - , b_id3 :: !Int - , b_id4 :: !T.Text - , b_id5 :: !T.Text - , b_id6 :: !T.Text - , b_v2 :: !Double - } deriving (Show, Generic) - -instance NFData XRow -instance NFData SmallRow -instance NFData MediumRow -instance NFData BigRow - -instance Csv.FromNamedRecord XRow where - parseNamedRecord r = XRow - <$> r Csv..: "id1" - <$> r Csv..: "id2" - <$> r Csv..: "id3" - <$> r Csv..: "id4" - <$> r Csv..: "id5" - <$> r Csv..: "id6" - <$> r Csv..: "v1" - -instance Csv.FromNamedRecord SmallRow where - parseNamedRecord r = SmallRow - <$> r 
Csv..: "id1" - <$> r Csv..: "id4" - <$> r Csv..: "v2" - -instance Csv.FromNamedRecord MediumRow where - parseNamedRecord r = MediumRow - <$> r Csv..: "id1" - <$> r Csv..: "id2" - <$> r Csv..: "id4" - <$> r Csv..: "id5" - <$> r Csv..: "v2" - -instance Csv.FromNamedRecord BigRow where - parseNamedRecord r = BigRow - <$> r Csv..: "id1" - <$> r Csv..: "id2" - <$> r Csv..: "id3" - <$> r Csv..: "id4" - <$> r Csv..: "id5" - <$> r Csv..: "id6" - <$> r Csv..: "v2" - --- Helper functions (same as groupby) +import Data.List (intercalate) + +-- Helper functions for logging writeLog :: String -> String -> Int -> String -> Int -> Int -> String -> String -> String -> String -> Int -> Double -> Double -> String -> String -> Double -> String -> String -> IO () writeLog task dataName inRows question outRows outCols solution version git fun run timeSec memGb cache chk chkTimeSec onDisk machineType = do batch <- lookupEnv "BATCH" >>= return . maybe "" id @@ -133,39 +55,34 @@ getMemoryUsage = do let rssKb = if null mem then 0 else read mem :: Double return (rssKb / (1024 * 1024)) -timeIt :: NFData a => IO a -> IO (a, Double) +timeIt :: IO a -> IO (a, Double) timeIt action = do start <- getPOSIXTime result <- action - _ <- evaluate (force result) + _ <- evaluate result end <- getPOSIXTime return (result, realToFrac (end - start)) --- Join helper for inner join on single key -innerJoinInt :: [XRow] -> [SmallRow] -> [(XRow, SmallRow)] -innerJoinInt xs ys = - let yMap = foldl' (\acc y -> HM.insertWith (++) (s_id1 y) [y] acc) HM.empty ys - in concat [[(x, y) | y <- HM.lookupDefault [] (x_id1 x) yMap] | x <- xs] - --- Parse join_to_tbls logic +-- Parse join_to_tbls logic to get table names joinToTbls :: String -> [String] joinToTbls dataName = let parts = T.splitOn "_" (T.pack dataName) - xn = if length parts > 1 then read (T.unpack $ parts !! 
1) :: Double else 1e7 - yn = [show (floor (xn / 1e6) :: Int) ++ "e4", - show (floor (xn / 1e3) :: Int) ++ "e3", - show (floor xn :: Int)] - in [T.unpack $ T.replace "NA" (T.pack (yn !! 0)) (T.pack dataName), - T.unpack $ T.replace "NA" (T.pack (yn !! 1)) (T.pack dataName), - T.unpack $ T.replace "NA" (T.pack (yn !! 2)) (T.pack dataName)] + xnStr = if length parts > 1 then T.unpack (parts !! 1) else "1e7" + xn = read xnStr :: Double + yn1 = show (floor (xn / 1e6) :: Int) ++ "e4" + yn2 = show (floor (xn / 1e3) :: Int) ++ "e3" + yn3 = show (floor xn :: Int) + in [T.unpack $ T.replace "NA" (T.pack yn1) (T.pack dataName), + T.unpack $ T.replace "NA" (T.pack yn2) (T.pack dataName), + T.unpack $ T.replace "NA" (T.pack yn3) (T.pack dataName)] main :: IO () main = do putStrLn "# join-haskell.hs" hFlush stdout - let ver = "0.1.0" - let git = "cassava-csv" + let ver = "0.3.3" + let git = "dataframe" let task = "join" let solution = "haskell" let fun = "innerJoin" @@ -185,58 +102,204 @@ main = do yDataNames !! 0 ++ ", " ++ yDataNames !! 1 ++ ", " ++ yDataNames !! 2 hFlush stdout - -- Load all datasets - csvDataX <- BL.readFile srcJnX - csvDataSmall <- BL.readFile (srcJnY !! 0) - csvDataMedium <- BL.readFile (srcJnY !! 1) - csvDataBig <- BL.readFile (srcJnY !! 2) - - case (Csv.decodeByName csvDataX, Csv.decodeByName csvDataSmall, - Csv.decodeByName csvDataMedium, Csv.decodeByName csvDataBig) of - (Right (_, xRows), Right (_, smallRows), Right (_, mediumRows), Right (_, bigRows)) -> do - let x = V.toList xRows :: [XRow] - let small = V.toList smallRows :: [SmallRow] - let medium = V.toList mediumRows :: [MediumRow] - let big = V.toList bigRows :: [BigRow] - - putStrLn $ show (length x) - putStrLn $ show (length small) - putStrLn $ show (length medium) - putStrLn $ show (length big) - hFlush stdout - - putStrLn "joining..." 
- hFlush stdout - - let inRows = length x - - -- Question 1: small inner on int - let question1 = "small inner on int" - (ans1, t1_1) <- timeIt $ do - let result = innerJoinInt x small - return result - m1_1 <- getMemoryUsage - (chk1, chkt1_1) <- timeIt $ do - let sumV1 = sum [x_v1 xr | (xr, _) <- ans1] - let sumV2 = sum [s_v2 sr | (_, sr) <- ans1] - evaluate (sumV1, sumV2) - return (sumV1, sumV2) - writeLog task dataName inRows question1 (length ans1) 8 solution ver git fun 1 t1_1 m1_1 cache (makeChk [fst chk1, snd chk1]) chkt1_1 onDisk machineType - - -- Run 2 - (ans1_2, t1_2) <- timeIt $ do - let result = innerJoinInt x small - return result - m1_2 <- getMemoryUsage - (chk1_2, chkt1_2) <- timeIt $ do - let sumV1 = sum [x_v1 xr | (xr, _) <- ans1_2] - let sumV2 = sum [s_v2 sr | (_, sr) <- ans1_2] - evaluate (sumV1, sumV2) - return (sumV1, sumV2) - writeLog task dataName inRows question1 (length ans1_2) 8 solution ver git fun 2 t1_2 m1_2 cache (makeChk [fst chk1_2, snd chk1_2]) chkt1_2 onDisk machineType - putStrLn $ "Question 1 completed: " ++ show (length ans1_2) ++ " rows" - - putStrLn "Haskell join benchmark completed (1 question implemented)!" - putStrLn "Note: Other join questions would require additional join types and key combinations." - - _ -> putStrLn "Error parsing CSV files" + -- Load all datasets using dataframe + x <- D.readCsv srcJnX + small <- D.readCsv (srcJnY !! 0) + medium <- D.readCsv (srcJnY !! 1) + big <- D.readCsv (srcJnY !! 2) + + let (xRows, _) = D.dimensions x + let (smallRows, _) = D.dimensions small + let (mediumRows, _) = D.dimensions medium + let (bigRows, _) = D.dimensions big + + putStrLn $ show xRows + putStrLn $ show smallRows + putStrLn $ show mediumRows + putStrLn $ show bigRows + hFlush stdout + + putStrLn "joining..." 
+ hFlush stdout + + -- Question 1: small inner on int + let question1 = "small inner on int" + (ans1, t1_1) <- timeIt $ do + let result = DJ.innerJoin ["id1"] x small + return result + m1_1 <- getMemoryUsage + let (outRows1, outCols1) = D.dimensions ans1 + (chk1, chkt1_1) <- timeIt $ do + let sumV1 = case D.columnAsDoubleVector "v1" ans1 of + Right vec -> V.sum vec + Left _ -> 0 + let sumV2 = case D.columnAsDoubleVector "v2" ans1 of + Right vec -> V.sum vec + Left _ -> 0 + evaluate (sumV1, sumV2) + return (sumV1, sumV2) + writeLog task dataName xRows question1 outRows1 outCols1 solution ver git fun 1 t1_1 m1_1 cache (makeChk [fst chk1, snd chk1]) chkt1_1 onDisk machineType + + -- Run 2 + (ans1_2, t1_2) <- timeIt $ do + let result = DJ.innerJoin ["id1"] x small + return result + m1_2 <- getMemoryUsage + let (outRows1_2, outCols1_2) = D.dimensions ans1_2 + (chk1_2, chkt1_2) <- timeIt $ do + let sumV1 = case D.columnAsDoubleVector "v1" ans1_2 of + Right vec -> V.sum vec + Left _ -> 0 + let sumV2 = case D.columnAsDoubleVector "v2" ans1_2 of + Right vec -> V.sum vec + Left _ -> 0 + evaluate (sumV1, sumV2) + return (sumV1, sumV2) + writeLog task dataName xRows question1 outRows1_2 outCols1_2 solution ver git fun 2 t1_2 m1_2 cache (makeChk [fst chk1_2, snd chk1_2]) chkt1_2 onDisk machineType + putStrLn $ "Question 1 completed: " ++ show outRows1_2 ++ " rows" + + -- Question 2: medium inner on int + let question2 = "medium inner on int" + (ans2, t2_1) <- timeIt $ do + let result = DJ.innerJoin ["id1"] x medium + return result + m2_1 <- getMemoryUsage + let (outRows2, outCols2) = D.dimensions ans2 + (chk2, chkt2_1) <- timeIt $ do + let sumV1 = case D.columnAsDoubleVector "v1" ans2 of + Right vec -> V.sum vec + Left _ -> 0 + let sumV2 = case D.columnAsDoubleVector "v2" ans2 of + Right vec -> V.sum vec + Left _ -> 0 + evaluate (sumV1, sumV2) + return (sumV1, sumV2) + writeLog task dataName xRows question2 outRows2 outCols2 solution ver git fun 1 t2_1 m2_1 cache (makeChk [fst 
chk2, snd chk2]) chkt2_1 onDisk machineType + + -- Run 2 + (ans2_2, t2_2) <- timeIt $ do + let result = DJ.innerJoin ["id1"] x medium + return result + m2_2 <- getMemoryUsage + let (outRows2_2, outCols2_2) = D.dimensions ans2_2 + (chk2_2, chkt2_2) <- timeIt $ do + let sumV1 = case D.columnAsDoubleVector "v1" ans2_2 of + Right vec -> V.sum vec + Left _ -> 0 + let sumV2 = case D.columnAsDoubleVector "v2" ans2_2 of + Right vec -> V.sum vec + Left _ -> 0 + evaluate (sumV1, sumV2) + return (sumV1, sumV2) + writeLog task dataName xRows question2 outRows2_2 outCols2_2 solution ver git fun 2 t2_2 m2_2 cache (makeChk [fst chk2_2, snd chk2_2]) chkt2_2 onDisk machineType + putStrLn $ "Question 2 completed: " ++ show outRows2_2 ++ " rows" + + -- Question 3: medium outer on int + let question3 = "medium outer on int" + (ans3, t3_1) <- timeIt $ do + let result = DJ.leftJoin ["id1"] x medium + return result + m3_1 <- getMemoryUsage + let (outRows3, outCols3) = D.dimensions ans3 + (chk3, chkt3_1) <- timeIt $ do + let sumV1 = case D.columnAsDoubleVector "v1" ans3 of + Right vec -> V.sum vec + Left _ -> 0 + let sumV2 = case D.columnAsDoubleVector "v2" ans3 of + Right vec -> V.sum vec + Left _ -> 0 + evaluate (sumV1, sumV2) + return (sumV1, sumV2) + writeLog task dataName xRows question3 outRows3 outCols3 solution ver git fun 1 t3_1 m3_1 cache (makeChk [fst chk3, snd chk3]) chkt3_1 onDisk machineType + + -- Run 2 + (ans3_2, t3_2) <- timeIt $ do + let result = DJ.leftJoin ["id1"] x medium + return result + m3_2 <- getMemoryUsage + let (outRows3_2, outCols3_2) = D.dimensions ans3_2 + (chk3_2, chkt3_2) <- timeIt $ do + let sumV1 = case D.columnAsDoubleVector "v1" ans3_2 of + Right vec -> V.sum vec + Left _ -> 0 + let sumV2 = case D.columnAsDoubleVector "v2" ans3_2 of + Right vec -> V.sum vec + Left _ -> 0 + evaluate (sumV1, sumV2) + return (sumV1, sumV2) + writeLog task dataName xRows question3 outRows3_2 outCols3_2 solution ver git fun 2 t3_2 m3_2 cache (makeChk [fst chk3_2, snd 
chk3_2]) chkt3_2 onDisk machineType + putStrLn $ "Question 3 completed: " ++ show outRows3_2 ++ " rows" + + -- Question 4: medium inner on factor + let question4 = "medium inner on factor" + (ans4, t4_1) <- timeIt $ do + let result = DJ.innerJoin ["id4"] x medium + return result + m4_1 <- getMemoryUsage + let (outRows4, outCols4) = D.dimensions ans4 + (chk4, chkt4_1) <- timeIt $ do + let sumV1 = case D.columnAsDoubleVector "v1" ans4 of + Right vec -> V.sum vec + Left _ -> 0 + let sumV2 = case D.columnAsDoubleVector "v2" ans4 of + Right vec -> V.sum vec + Left _ -> 0 + evaluate (sumV1, sumV2) + return (sumV1, sumV2) + writeLog task dataName xRows question4 outRows4 outCols4 solution ver git fun 1 t4_1 m4_1 cache (makeChk [fst chk4, snd chk4]) chkt4_1 onDisk machineType + + -- Run 2 + (ans4_2, t4_2) <- timeIt $ do + let result = DJ.innerJoin ["id4"] x medium + return result + m4_2 <- getMemoryUsage + let (outRows4_2, outCols4_2) = D.dimensions ans4_2 + (chk4_2, chkt4_2) <- timeIt $ do + let sumV1 = case D.columnAsDoubleVector "v1" ans4_2 of + Right vec -> V.sum vec + Left _ -> 0 + let sumV2 = case D.columnAsDoubleVector "v2" ans4_2 of + Right vec -> V.sum vec + Left _ -> 0 + evaluate (sumV1, sumV2) + return (sumV1, sumV2) + writeLog task dataName xRows question4 outRows4_2 outCols4_2 solution ver git fun 2 t4_2 m4_2 cache (makeChk [fst chk4_2, snd chk4_2]) chkt4_2 onDisk machineType + putStrLn $ "Question 4 completed: " ++ show outRows4_2 ++ " rows" + + -- Question 5: big inner on int + let question5 = "big inner on int" + (ans5, t5_1) <- timeIt $ do + let result = DJ.innerJoin ["id1"] x big + return result + m5_1 <- getMemoryUsage + let (outRows5, outCols5) = D.dimensions ans5 + (chk5, chkt5_1) <- timeIt $ do + let sumV1 = case D.columnAsDoubleVector "v1" ans5 of + Right vec -> V.sum vec + Left _ -> 0 + let sumV2 = case D.columnAsDoubleVector "v2" ans5 of + Right vec -> V.sum vec + Left _ -> 0 + evaluate (sumV1, sumV2) + return (sumV1, sumV2) + writeLog task 
dataName xRows question5 outRows5 outCols5 solution ver git fun 1 t5_1 m5_1 cache (makeChk [fst chk5, snd chk5]) chkt5_1 onDisk machineType + + -- Run 2 + (ans5_2, t5_2) <- timeIt $ do + let result = DJ.innerJoin ["id1"] x big + return result + m5_2 <- getMemoryUsage + let (outRows5_2, outCols5_2) = D.dimensions ans5_2 + (chk5_2, chkt5_2) <- timeIt $ do + let sumV1 = case D.columnAsDoubleVector "v1" ans5_2 of + Right vec -> V.sum vec + Left _ -> 0 + let sumV2 = case D.columnAsDoubleVector "v2" ans5_2 of + Right vec -> V.sum vec + Left _ -> 0 + evaluate (sumV1, sumV2) + return (sumV1, sumV2) + writeLog task dataName xRows question5 outRows5_2 outCols5_2 solution ver git fun 2 t5_2 m5_2 cache (makeChk [fst chk5_2, snd chk5_2]) chkt5_2 onDisk machineType + putStrLn $ "Question 5 completed: " ++ show outRows5_2 ++ " rows" + + putStrLn "Haskell dataframe join benchmark completed (5 questions implemented)!" diff --git a/haskell/setup-haskell.sh b/haskell/setup-haskell.sh index 0cf99700..391e93e1 100755 --- a/haskell/setup-haskell.sh +++ b/haskell/setup-haskell.sh @@ -14,54 +14,6 @@ if [ ! -f "stack.yaml" ]; then stack init --force fi -# Create cabal file if it doesn't exist -if [ ! 
-f "haskell-benchmark.cabal" ]; then - cat > haskell-benchmark.cabal << 'EOF' -name: haskell-benchmark -version: 0.1.0.0 -build-type: Simple -cabal-version: >=1.10 - -executable groupby-haskell - main-is: groupby-haskell.hs - build-depends: base >= 4.7 && < 5 - , Frames >= 0.7 - , vinyl >= 0.13 - , text >= 1.2 - , bytestring >= 0.10 - , vector >= 0.12 - , cassava >= 0.5 - , pipes >= 4.3 - , time >= 1.9 - , process >= 1.6 - , directory >= 1.3 - , containers >= 0.6 - , hashable >= 1.3 - , unordered-containers >= 0.2 - default-language: Haskell2010 - ghc-options: -O2 -threaded -rtsopts -with-rtsopts=-N - -executable join-haskell - main-is: join-haskell.hs - build-depends: base >= 4.7 && < 5 - , Frames >= 0.7 - , vinyl >= 0.13 - , text >= 1.2 - , bytestring >= 0.10 - , vector >= 0.12 - , cassava >= 0.5 - , pipes >= 4.3 - , time >= 1.9 - , process >= 1.6 - , directory >= 1.3 - , containers >= 0.6 - , hashable >= 1.3 - , unordered-containers >= 0.2 - default-language: Haskell2010 - ghc-options: -O2 -threaded -rtsopts -with-rtsopts=-N -EOF -fi - # Install dependencies and build stack setup stack build --only-dependencies diff --git a/haskell/ver-haskell.sh b/haskell/ver-haskell.sh index a377718b..7ba223d1 100755 --- a/haskell/ver-haskell.sh +++ b/haskell/ver-haskell.sh @@ -3,26 +3,18 @@ set -e cd haskell -# Get GHC version (Haskell compiler) -GHC_VERSION=$(stack ghc -- --numeric-version 2>/dev/null || echo "unknown") - -# Get Frames version from stack -FRAMES_VERSION=$(stack list-dependencies --depth 1 2>/dev/null | grep "^Frames " | awk '{print $2}' || echo "unknown") - -# If Frames version is unknown, try from package.yaml or stack.yaml -if [ "$FRAMES_VERSION" = "unknown" ]; then - FRAMES_VERSION=$(stack exec -- ghc-pkg field Frames version 2>/dev/null | awk '{print $2}' || echo "0.7.0") -fi +# Get dataframe version from stack +DF_VERSION=$(stack exec -- ghc-pkg field dataframe version 2>/dev/null | awk '{print $2}' || echo "0.3.3") # Write version to VERSION file -echo 
"${FRAMES_VERSION}" > VERSION +echo "${DF_VERSION}" > VERSION -# Get git revision if available -GIT_REV=$(cd $(stack path --local-install-root 2>/dev/null || echo ".") && git rev-parse --short HEAD 2>/dev/null || echo "") +# Get git revision of dataframe if available +GIT_REV=$(git -C "$(stack path --local-install-root 2>/dev/null || echo ".")" rev-parse --short HEAD 2>/dev/null || echo "") if [ -n "$GIT_REV" ]; then echo "$GIT_REV" > REVISION else - echo "GHC-${GHC_VERSION}" > REVISION + echo "dataframe-${DF_VERSION}" > REVISION fi cd .. From 174678c0c5b0d1b2c722f5e864dabd134dba6e6e Mon Sep 17 00:00:00 2001 From: Michael Chavinda Date: Wed, 19 Nov 2025 08:29:00 -0800 Subject: [PATCH 3/6] chore: Add haskell build files to .gitignore. --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 78694453..23b7720c 100644 --- a/.gitignore +++ b/.gitignore @@ -36,3 +36,4 @@ workdir/ timeout-exit-codes.out */target *.lock +.dist-newstyle From 511e1691344338ab7c95fa9a49bced11d851fe56 Mon Sep 17 00:00:00 2001 From: Michael Chavinda Date: Wed, 19 Nov 2025 08:29:47 -0800 Subject: [PATCH 4/6] fix: dist-newstyle is not a hidden folder. --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 23b7720c..96d1bd44 100644 --- a/.gitignore +++ b/.gitignore @@ -36,4 +36,4 @@ workdir/ timeout-exit-codes.out */target *.lock -.dist-newstyle +dist-newstyle From 7161afb98b3031601cad6d0f0e12600ad01780f4 Mon Sep 17 00:00:00 2001 From: Michael Chavinda Date: Wed, 19 Nov 2025 10:34:48 -0800 Subject: [PATCH 5/6] chore: Add stack work to gitignore.
--- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 96d1bd44..8ab87bb6 100644 --- a/.gitignore +++ b/.gitignore @@ -37,3 +37,4 @@ timeout-exit-codes.out */target *.lock dist-newstyle +.stack-work From 924fb928290a6d953c6e295949be97f6455fba08 Mon Sep 17 00:00:00 2001 From: Michael Chavinda Date: Wed, 19 Nov 2025 12:59:16 -0800 Subject: [PATCH 6/6] feat: Add edits to make haskell benchmark run. --- .github/workflows/regression.yml | 2 +- README.md | 1 + _benchplot/benchplot-dict.R | 36 +- _launcher/launcher.R | 2 +- _launcher/solution.R | 4 +- _report/report.R | 2 +- haskell/VERSION | 2 +- haskell/exec.sh | 6 + haskell/groupby-haskell.hs | 773 ++++++++++++++++++++++++------- haskell/haskell-benchmark.cabal | 2 + haskell/join-haskell.hs | 386 ++++++++++++--- haskell/setup-haskell.sh | 4 +- haskell/stack.yaml | 67 +++ run.conf | 2 +- run.sh | 2 + 15 files changed, 1040 insertions(+), 251 deletions(-) create mode 100755 haskell/exec.sh mode change 100644 => 100755 haskell/groupby-haskell.hs create mode 100644 haskell/stack.yaml diff --git a/.github/workflows/regression.yml b/.github/workflows/regression.yml index 41717226..d7173efa 100644 --- a/.github/workflows/regression.yml +++ b/.github/workflows/regression.yml @@ -17,7 +17,7 @@ jobs: strategy: fail-fast: false matrix: - solution: [data.table, collapse, dplyr, pandas, spark, polars, R-arrow, duckdb, datafusion, dask, clickhouse, chdb] + solution: [data.table, collapse, dplyr, pandas, spark, polars, R-arrow, duckdb, datafusion, dask, clickhouse, chdb, haskell] name: Solo solutions runs-on: ubuntu-latest env: diff --git a/README.md b/README.md index 9bbe2c6b..d3b8e601 100644 --- a/README.md +++ b/README.md @@ -29,6 +29,7 @@ Contribution and feedback are very welcome! 
- [x] [DataFrames.jl](https://github.com/JuliaData/DataFrames.jl) - [x] [In Memory DataSets](https://github.com/sl-solution/InMemoryDatasets.jl) - [x] [Datafusion](https://github.com/apache/arrow-datafusion) + - [x] [(haskell)dataframe](https://github.com/mchav/dataframe) If you would like your solution to be included, feel free to file a PR with the necessary setup-_solution_/ver-_solution_/groupby-_solution_/join-_solution_ scripts. If the team at DuckDB Labs approves the PR it will be merged. In the interest of transparency and fairness, only results from open-source data-science tools will be merged. diff --git a/_benchplot/benchplot-dict.R b/_benchplot/benchplot-dict.R index fedc4aac..c944579a 100644 --- a/_benchplot/benchplot-dict.R +++ b/_benchplot/benchplot-dict.R @@ -46,7 +46,8 @@ solution.dict = {list( "duckdb" = list(name=c(short="duckdb", long="DuckDB"), color=c(strong="#ddcd07", light="#fff100")), "duckdb-latest" = list(name=c(short="duckdb-latest", long="duckdb-latest"), color=c(strong="#ddcd07", light="#fff100")), "datafusion" = list(name=c(short="datafusion", long="Datafusion"), color=c(strong="deepskyblue4", light="deepskyblue3")), - "chdb" = list(name=c(short="chdb", long="chDB"), color=c(strong="hotpink4", light="hotpink1")) + "chdb" = list(name=c(short="chdb", long="chDB"), color=c(strong="hotpink4", light="hotpink1")), + "haskell" = list(name=c(short="haskell", long="Haskell"), color=c(strong="#3d0569ff", light="#61298bff")) )} #barplot(rep(c(0L,1L,1L), length(solution.dict)), # col=rev(c(rbind(sapply(solution.dict, `[[`, "color"), "black"))), @@ -259,7 +260,19 @@ groupby.syntax.dict = {list( "largest two v3 by id6" = "SELECT id6, arrayJoin(arraySlice(arrayReverseSort(groupArray(v3)), 1, 2)) AS v3 FROM (SELECT id6, v3 FROM db_benchmark.x WHERE v3 IS NOT NULL) AS subq GROUP BY id6", "regression v1 v2 by id2 id4" = "SELECT id2, id4, pow(corr(v1, v2), 2) AS r2 FROM db_benchmark.x GROUP BY id2, id4", "sum v3 count by id1:id6" = "SELECT id1, id2, 
id3, id4, id5, id6, sum(v3) AS v3, count() AS cnt FROM db_benchmark.x GROUP BY id1, id2, id3, id4, id5, id6" - )} + )}, + "haskell" = {c( + "sum v1 by id1" = "df |> D.groupBy [\"id1\"] |> D.aggregate [\"v1_sum\" .= F.sum (F.col @Int \"v1\")]", + "sum v1 by id1:id2" = "df |> D.groupBy [\"id1\", \"id2\"] |> D.aggregate [\"v1_sum\" .= F.sum (F.col @Int \"v1\")]", + "sum v1 mean v3 by id3" = "df |> D.groupBy [\"id3\"] |> D.aggregate [\"v1_sum\" .= F.sum (F.col @Int \"v1\"), \"v3_mean\" .= F.mean (F.col @Double \"v3\")]", + "mean v1:v3 by id4" = "df |> D.groupBy [\"id4\"] |> D.aggregate [\"v1_mean\" .= F.mean (F.col @Int \"v1\"), \"v2_mean\" .= F.mean (F.col @Int \"v2\"), \"v3_mean\" .= F.mean (F.col @Double \"v3\")]", + "sum v1:v3 by id6" = "df |> D.groupBy [\"id6\"] |> D.aggregate [\"v1_sum\" .= F.sum (F.col @Int \"v1\"), \"v2_sum\" .= F.sum (F.col @Int \"v2\"), \"v3_sum\" .= F.sum (F.col @Double \"v3\")]", + "median v3 sd v3 by id4 id5" = "df |> D.groupBy [\"id4\", \"id5\"] |> D.aggregate [\"v3_median\" .= F.median (F.col @Double \"v3\"), \"v3_sd\" .= F.stddev (F.col @Double \"v3\")]", + "max v1 - min v2 by id3" = "df |> D.groupBy [\"id3\"] |> D.aggregate [\"diff\" .= F.maximum (F.col @Int \"v1\") - F.minimum (F.col @Int \"v2\")]", + "largest two v3 by id6" = "", + "regression v1 v2 by id2 id4" = "", + "sum v3 count by id1:id6" = "df |> D.groupBy [\"id1\",\"id2\",\"id3\",\"id4\",\"id5\",\"id6\"] |> D.aggregate [F.sum (F.col @Double \"v3\") `F.as` \"v3\", F.count (F.col @Int \"v1\") `F.as` \"count\"]" + )} )} groupby.query.exceptions = {list( "collapse" = list(), @@ -277,7 +290,8 @@ groupby.syntax.dict = {list( "duckdb" = list(), "duckdb-latest" = list(), "datafusion" = list(), - "chdb" = list() + "chdb" = list(), + "haskell" = list() )} groupby.data.exceptions = {list( # exceptions as of run 1575727624 "collapse" = {list( @@ -348,6 +362,8 @@ groupby.data.exceptions = {list( "Not Tested" = c("G1_1e9_1e2_0_0") )}, "chdb" = {list( + )}, + "haskell" = {list( )} )} 
groupby.exceptions = task.exceptions(groupby.query.exceptions, groupby.data.exceptions) @@ -472,7 +488,14 @@ join.syntax.dict = {list( "medium outer on int" = "SELECT x.*, medium.id1 AS medium_id1, medium.id4 AS medium_id4, medium.id5 as medium_id5, v2 FROM db_benchmark.x AS x LEFT JOIN db_benchmark.medium AS medium USING (id2)", "medium inner on factor" = "SELECT x.*, medium.id1 AS medium_id1, medium.id2 AS medium_id2, medium.id4 as medium_id4, v2 FROM db_benchmark.x AS x INNER JOIN db_benchmark.medium AS medium USING (id5)", "big inner on int" = "SELECT x.*, big.id1 AS big_id1, big.id2 AS big_id2, big.id4 as big_id4, big.id5 AS big_id5, big.id6 AS big_id6, v2 FROM db_benchmark.x AS x INNER JOIN db_benchmark.big AS big USING (id3)" - )} + )}, + "haskell" = {c( + "small inner on int" = "D.innerJoin [\"id1\"] x small", + "medium inner on int" = "D.innerJoin [\"id2\"] x medium", + "medium outer on int" = "D.leftJoin [\"id2\"] x medium", + "medium inner on factor" = "D.innerJoin [\"id5\"] x medium", + "big inner on int" = "D.innerJoin [\"id3\"] x big" + )} )} join.query.exceptions = {list( "collapse" = list(), @@ -490,7 +513,8 @@ join.query.exceptions = {list( "duckdb" = list(), "duckdb-latest" = list(), "datafusion" = list(), - "chdb" = list() + "chdb" = list(), + "haskell" = list() )} join.data.exceptions = {list( # exceptions as of run 1575727624 "collapse" = {list( @@ -550,6 +574,8 @@ join.data.exceptions = {list( "Not tested" = c("J1_1e9_NA_0_0") )}, "chdb" = {list( + )}, + "haskell" = {list( )} )} join.exceptions = task.exceptions(join.query.exceptions, join.data.exceptions) diff --git a/_launcher/launcher.R b/_launcher/launcher.R index be0e4b2b..ae92dbdf 100644 --- a/_launcher/launcher.R +++ b/_launcher/launcher.R @@ -16,7 +16,7 @@ file.ext = function(x) { x, "collapse"=, "data.table"=, "dplyr"=, "h2o"=, "R-arrow"=, "duckdb"="R", "duckdb-latest"="R", "pandas"=, "spark"=, "pydatatable"=, "modin"=, "dask"=, "datafusion"=, "polars"="py", - 
"clickhouse"="sh", "juliadf"="jl", "juliads"="jl", "chdb"="py" + "clickhouse"="sh", "juliadf"="jl", "juliads"="jl", "chdb"="py", "haskell"="hs" ) if (is.null(ans)) stop(sprintf("solution %s does not have file extension defined in file.ext helper function", x)) ans diff --git a/_launcher/solution.R b/_launcher/solution.R index 98c4298e..48bf86b6 100755 --- a/_launcher/solution.R +++ b/_launcher/solution.R @@ -112,7 +112,7 @@ file.ext = function(x) { x, "collapse"=, "data.table"=, "dplyr"=, "h2o"=, "R-arrow"=, "duckdb"="R", "duckdb-latest"="R", "pandas"="py", "spark"=, "pydatatable"=, "modin"=, "dask"=, "datafusion"=, "polars"="py", - "clickhouse"="sh", "juliadf"="jl", "juliads"="jl", "chdb"="py" + "clickhouse"="sh", "juliadf"="jl", "juliads"="jl", "chdb"="py", "haskell"="hs" ) if (is.null(ans)) stop(sprintf("solution %s does not have file extension defined in file.ext helper function", x)) ans @@ -153,7 +153,7 @@ setenv("SRC_DATANAME", d) ns = solution.path(s) ext = file.ext(s) -localcmd = if (s %in% c("clickhouse","h2o","juliadf", "juliads")) { # custom launcher bash script, for clickhouse h2o juliadf +localcmd = if (s %in% c("clickhouse","h2o","juliadf", "juliads", "haskell")) { # custom launcher bash script, for clickhouse h2o juliadf sprintf("exec.sh %s", t) } else if (s %in% c("dask")) { sprintf("%s_%s.%s", t, ns, ext) diff --git a/_report/report.R b/_report/report.R index a726b628..6cd5a483 100644 --- a/_report/report.R +++ b/_report/report.R @@ -6,7 +6,7 @@ get_report_status_file = function(path=getwd()) { file.path(path, "report-done") } get_report_solutions = function() { - c("duckdb-latest", "collapse", "data.table", "dplyr", "pandas", "pydatatable", "spark", "dask", "juliadf", "juliads", "clickhouse", "cudf", "polars", "duckdb", "datafusion", "arrow", "R-arrow", "chdb") + c("duckdb-latest", "collapse", "data.table", "dplyr", "pandas", "pydatatable", "spark", "dask", "juliadf", "juliads", "clickhouse", "cudf", "polars", "duckdb", "datafusion", "arrow", 
"R-arrow", "chdb", "haskell") } get_data_levels = function() { ## groupby diff --git a/haskell/VERSION b/haskell/VERSION index 1c09c74e..1d45831f 100644 --- a/haskell/VERSION +++ b/haskell/VERSION @@ -1 +1 @@ -0.3.3 +0.3.3.7 diff --git a/haskell/exec.sh b/haskell/exec.sh new file mode 100755 index 00000000..65e04cc8 --- /dev/null +++ b/haskell/exec.sh @@ -0,0 +1,6 @@ +#!/bin/bash +set -e + +cd ./haskell + +stack run "$1-haskell" diff --git a/haskell/groupby-haskell.hs b/haskell/groupby-haskell.hs old mode 100644 new mode 100755 index c796dffd..5be66398 --- a/haskell/groupby-haskell.hs +++ b/haskell/groupby-haskell.hs @@ -1,67 +1,23 @@ {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE TypeApplications #-} -import qualified DataFrame as D -import qualified DataFrame.Expressions as F +import Control.Exception (evaluate) +import Data.List (intercalate) +import Data.Maybe import qualified Data.Text as T +import Data.Time.Clock.POSIX (getPOSIXTime) import qualified Data.Vector as V +import qualified Data.Vector.Unboxed as VU +import qualified DataFrame as D +import qualified DataFrame.Functions as F +import GHC.Stats +import System.Directory (doesFileExist) import System.Environment (getEnv, lookupEnv) -import System.IO (hFlush, stdout, hPutStrLn, stderr) -import Data.Time.Clock.POSIX (getPOSIXTime) -import Control.Exception (evaluate) +import System.IO (hFlush, hPutStrLn, stderr, stdout) +import System.Posix.Process (getProcessID) import System.Process (readProcess) -import System.Directory (doesFileExist) -import Data.List (intercalate) - --- Helper functions for logging -writeLog :: String -> String -> Int -> String -> Int -> Int -> String -> String -> String -> String -> Int -> Double -> Double -> String -> String -> Double -> String -> String -> IO () -writeLog task dataName inRows question outRows outCols solution version git fun run timeSec memGb cache chk chkTimeSec onDisk machineType = do - batch <- lookupEnv "BATCH" >>= return . 
maybe "" id - timestamp <- getPOSIXTime - csvFile <- lookupEnv "CSV_TIME_FILE" >>= return . maybe "time.csv" id - nodename <- fmap init (readProcess "hostname" [] "") - - let comment = "" - let timeSecRound = roundTo 3 timeSec - let chkTimeSecRound = roundTo 3 chkTimeSec - let memGbRound = roundTo 3 memGb - - let logRow = intercalate "," [ - nodename, batch, show timestamp, task, dataName, show inRows, - question, show outRows, show outCols, solution, version, git, fun, - show run, show timeSecRound, show memGbRound, cache, chk, - show chkTimeSecRound, comment, onDisk, machineType - ] - - fileExists <- doesFileExist csvFile - if fileExists - then appendFile csvFile (logRow ++ "\n") - else do - let header = "nodename,batch,timestamp,task,data,in_rows,question,out_rows,out_cols,solution,version,git,fun,run,time_sec,mem_gb,cache,chk,chk_time_sec,comment,on_disk,machine_type\n" - writeFile csvFile (header ++ logRow ++ "\n") +import Text.Read -roundTo :: Int -> Double -> Double -roundTo n x = (fromInteger $ round $ x * (10^n)) / (10.0^^n) - -makeChk :: [Double] -> String -makeChk values = intercalate ";" (map formatVal values) - where - formatVal x = map (\c -> if c == ',' then '_' else c) (show $ roundTo 3 x) - -getMemoryUsage :: IO Double -getMemoryUsage = do - pid <- fmap init (readProcess "bash" ["-c", "echo $$"] "") - mem <- fmap (filter (/= ' ') . 
init) (readProcess "ps" ["-o", "rss=", "-p", pid] "") - let rssKb = if null mem then 0 else read mem :: Double - return (rssKb / (1024 * 1024)) - -timeIt :: IO a -> IO (a, Double) -timeIt action = do - start <- getPOSIXTime - result <- action - _ <- evaluate result - end <- getPOSIXTime - return (result, realToFrac (end - start)) main :: IO () main = do @@ -78,7 +34,7 @@ main = do dataName <- getEnv "SRC_DATANAME" machineType <- getEnv "MACHINE_TYPE" - let srcFile = "data/" ++ dataName ++ ".csv" + let srcFile = "../data/" ++ dataName ++ ".csv" putStrLn $ "loading dataset " ++ dataName hFlush stdout @@ -106,207 +62,704 @@ main = do let question1 = "sum v1 by id1" (ans1, t1_1) <- timeIt $ do let grouped = D.groupBy ["id1"] x - let result = D.aggregate [F.sum (F.col @Double "v1") `F.as` "v1_sum"] grouped + let result = D.aggregate [F.sum (F.col @Int "v1") `F.as` "v1_sum"] grouped return result m1_1 <- getMemoryUsage let (outRows1, outCols1) = D.dimensions ans1 (chk1, chkt1_1) <- timeIt $ do - let sumV1 = case D.columnAsDoubleVector "v1_sum" ans1 of - Right vec -> V.sum vec - Left _ -> 0 - evaluate sumV1 + let sumV1 = D.sum (F.col @Int "v1_sum") ans1 + print sumV1 return sumV1 - writeLog task dataName inRows question1 outRows1 outCols1 solution ver git fun 1 t1_1 m1_1 cache (makeChk [chk1]) chkt1_1 onDisk machineType + writeLog + task + dataName + inRows + question1 + outRows1 + outCols1 + solution + ver + git + fun + 1 + t1_1 + m1_1 + cache + (makeChk [fromIntegral chk1]) + chkt1_1 + onDisk + machineType -- Run 2 (ans1_2, t1_2) <- timeIt $ do let grouped = D.groupBy ["id1"] x - let result = D.aggregate [F.sum (F.col @Double "v1") `F.as` "v1_sum"] grouped + let result = D.aggregate [F.sum (F.col @Int "v1") `F.as` "v1_sum"] grouped return result m1_2 <- getMemoryUsage let (outRows1_2, outCols1_2) = D.dimensions ans1_2 (chk1_2, chkt1_2) <- timeIt $ do - let sumV1 = case D.columnAsDoubleVector "v1_sum" ans1_2 of - Right vec -> V.sum vec - Left _ -> 0 - evaluate sumV1 + 
let sumV1 = D.sum (F.col @Int "v1_sum") ans1_2 + print sumV1 return sumV1 - writeLog task dataName inRows question1 outRows1_2 outCols1_2 solution ver git fun 2 t1_2 m1_2 cache (makeChk [chk1_2]) chkt1_2 onDisk machineType + writeLog + task + dataName + inRows + question1 + outRows1_2 + outCols1_2 + solution + ver + git + fun + 2 + t1_2 + m1_2 + cache + (makeChk [fromIntegral chk1_2]) + chkt1_2 + onDisk + machineType putStrLn $ "Question 1 completed: " ++ show outRows1_2 ++ " groups" -- Question 2: sum v1 by id1:id2 let question2 = "sum v1 by id1:id2" (ans2, t2_1) <- timeIt $ do let grouped = D.groupBy ["id1", "id2"] x - let result = D.aggregate [F.sum (F.col @Double "v1") `F.as` "v1_sum"] grouped + let result = D.aggregate [F.sum (F.col @Int "v1") `F.as` "v1_sum"] grouped return result m2_1 <- getMemoryUsage let (outRows2, outCols2) = D.dimensions ans2 (chk2, chkt2_1) <- timeIt $ do - let sumV1 = case D.columnAsDoubleVector "v1_sum" ans2 of - Right vec -> V.sum vec - Left _ -> 0 - evaluate sumV1 + let sumV1 = D.sum (F.col @Int "v1_sum") ans2 + print sumV1 return sumV1 - writeLog task dataName inRows question2 outRows2 outCols2 solution ver git fun 1 t2_1 m2_1 cache (makeChk [chk2]) chkt2_1 onDisk machineType + writeLog + task + dataName + inRows + question2 + outRows2 + outCols2 + solution + ver + git + fun + 1 + t2_1 + m2_1 + cache + (makeChk [fromIntegral chk2]) + chkt2_1 + onDisk + machineType -- Run 2 (ans2_2, t2_2) <- timeIt $ do let grouped = D.groupBy ["id1", "id2"] x - let result = D.aggregate [F.sum (F.col @Double "v1") `F.as` "v1_sum"] grouped + let result = D.aggregate [F.sum (F.col @Int "v1") `F.as` "v1_sum"] grouped return result m2_2 <- getMemoryUsage let (outRows2_2, outCols2_2) = D.dimensions ans2_2 (chk2_2, chkt2_2) <- timeIt $ do - let sumV1 = case D.columnAsDoubleVector "v1_sum" ans2_2 of - Right vec -> V.sum vec - Left _ -> 0 - evaluate sumV1 + let sumV1 = D.sum (F.col @Int "v1_sum") ans2_2 + print sumV1 return sumV1 - writeLog task dataName 
inRows question2 outRows2_2 outCols2_2 solution ver git fun 2 t2_2 m2_2 cache (makeChk [chk2_2]) chkt2_2 onDisk machineType + writeLog + task + dataName + inRows + question2 + outRows2_2 + outCols2_2 + solution + ver + git + fun + 2 + t2_2 + m2_2 + cache + (makeChk [fromIntegral chk2_2]) + chkt2_2 + onDisk + machineType putStrLn $ "Question 2 completed: " ++ show outRows2_2 ++ " groups" -- Question 3: sum v1 mean v3 by id3 let question3 = "sum v1 mean v3 by id3" (ans3, t3_1) <- timeIt $ do let grouped = D.groupBy ["id3"] x - let result = D.aggregate - [F.sum (F.col @Double "v1") `F.as` "v1_sum", - F.mean (F.col @Double "v3") `F.as` "v3_mean"] grouped + let result = + D.aggregate + [ F.sum (F.col @Int "v1") `F.as` "v1_sum" + , F.mean (F.col @Double "v3") `F.as` "v3_mean" + ] + grouped return result m3_1 <- getMemoryUsage let (outRows3, outCols3) = D.dimensions ans3 (chk3, chkt3_1) <- timeIt $ do - let sumV1 = case D.columnAsDoubleVector "v1_sum" ans3 of - Right vec -> V.sum vec - Left _ -> 0 - let sumV3 = case D.columnAsDoubleVector "v3_mean" ans3 of - Right vec -> V.sum vec - Left _ -> 0 - evaluate (sumV1, sumV3) + let sumV1 = D.sum (F.col @Int "v1_sum") ans3 + let sumV3 = D.sum (F.col @Double "v3_mean") ans3 + print (sumV1, sumV3) return (sumV1, sumV3) - writeLog task dataName inRows question3 outRows3 outCols3 solution ver git fun 1 t3_1 m3_1 cache (makeChk [fst chk3, snd chk3]) chkt3_1 onDisk machineType + writeLog + task + dataName + inRows + question3 + outRows3 + outCols3 + solution + ver + git + fun + 1 + t3_1 + m3_1 + cache + (makeChk [fromIntegral (fst chk3), snd chk3]) + chkt3_1 + onDisk + machineType -- Run 2 (ans3_2, t3_2) <- timeIt $ do let grouped = D.groupBy ["id3"] x - let result = D.aggregate - [F.sum (F.col @Double "v1") `F.as` "v1_sum", - F.mean (F.col @Double "v3") `F.as` "v3_mean"] grouped + let result = + D.aggregate + [ F.sum (F.col @Int "v1") `F.as` "v1_sum" + , F.mean (F.col @Double "v3") `F.as` "v3_mean" + ] + grouped return result m3_2 <- 
getMemoryUsage let (outRows3_2, outCols3_2) = D.dimensions ans3_2 (chk3_2, chkt3_2) <- timeIt $ do - let sumV1 = case D.columnAsDoubleVector "v1_sum" ans3_2 of - Right vec -> V.sum vec - Left _ -> 0 - let sumV3 = case D.columnAsDoubleVector "v3_mean" ans3_2 of - Right vec -> V.sum vec - Left _ -> 0 - evaluate (sumV1, sumV3) + let sumV1 = D.sum (F.col @Int "v1_sum") ans3_2 + let sumV3 = D.sum (F.col @Double "v3_mean") ans3_2 + print (sumV1, sumV3) return (sumV1, sumV3) - writeLog task dataName inRows question3 outRows3_2 outCols3_2 solution ver git fun 2 t3_2 m3_2 cache (makeChk [fst chk3_2, snd chk3_2]) chkt3_2 onDisk machineType + writeLog + task + dataName + inRows + question3 + outRows3_2 + outCols3_2 + solution + ver + git + fun + 2 + t3_2 + m3_2 + cache + (makeChk [fromIntegral (fst chk3_2), snd chk3_2]) + chkt3_2 + onDisk + machineType putStrLn $ "Question 3 completed: " ++ show outRows3_2 ++ " groups" -- Question 4: mean v1:v3 by id4 let question4 = "mean v1:v3 by id4" (ans4, t4_1) <- timeIt $ do let grouped = D.groupBy ["id4"] x - let result = D.aggregate - [F.mean (F.col @Double "v1") `F.as` "v1_mean", - F.mean (F.col @Double "v2") `F.as` "v2_mean", - F.mean (F.col @Double "v3") `F.as` "v3_mean"] grouped + let result = + D.aggregate + [ F.mean (F.col @Int "v1") `F.as` "v1_mean" + , F.mean (F.col @Int "v2") `F.as` "v2_mean" + , F.mean (F.col @Double "v3") `F.as` "v3_mean" + ] + grouped return result m4_1 <- getMemoryUsage let (outRows4, outCols4) = D.dimensions ans4 (chk4, chkt4_1) <- timeIt $ do let sumV1 = case D.columnAsDoubleVector "v1_mean" ans4 of - Right vec -> V.sum vec - Left _ -> 0 + Right vec -> VU.sum vec + Left _ -> 0 let sumV2 = case D.columnAsDoubleVector "v2_mean" ans4 of - Right vec -> V.sum vec - Left _ -> 0 + Right vec -> VU.sum vec + Left _ -> 0 let sumV3 = case D.columnAsDoubleVector "v3_mean" ans4 of - Right vec -> V.sum vec - Left _ -> 0 - evaluate (sumV1, sumV2, sumV3) + Right vec -> VU.sum vec + Left _ -> 0 + print (sumV1, sumV2, 
sumV3) return (sumV1, sumV2, sumV3) - writeLog task dataName inRows question4 outRows4 outCols4 solution ver git fun 1 t4_1 m4_1 cache (makeChk [(\(a,_,_) -> a) chk4, (\(_,b,_) -> b) chk4, (\(_,_,c) -> c) chk4]) chkt4_1 onDisk machineType + writeLog + task + dataName + inRows + question4 + outRows4 + outCols4 + solution + ver + git + fun + 1 + t4_1 + m4_1 + cache + (makeChk [(\(a, _, _) -> a) chk4, (\(_, b, _) -> b) chk4, (\(_, _, c) -> c) chk4]) + chkt4_1 + onDisk + machineType -- Run 2 (ans4_2, t4_2) <- timeIt $ do let grouped = D.groupBy ["id4"] x - let result = D.aggregate - [F.mean (F.col @Double "v1") `F.as` "v1_mean", - F.mean (F.col @Double "v2") `F.as` "v2_mean", - F.mean (F.col @Double "v3") `F.as` "v3_mean"] grouped + let result = + D.aggregate + [ F.mean (F.col @Int "v1") `F.as` "v1_mean" + , F.mean (F.col @Int "v2") `F.as` "v2_mean" + , F.mean (F.col @Double "v3") `F.as` "v3_mean" + ] + grouped return result m4_2 <- getMemoryUsage let (outRows4_2, outCols4_2) = D.dimensions ans4_2 (chk4_2, chkt4_2) <- timeIt $ do let sumV1 = case D.columnAsDoubleVector "v1_mean" ans4_2 of - Right vec -> V.sum vec - Left _ -> 0 + Right vec -> VU.sum vec + Left _ -> 0 let sumV2 = case D.columnAsDoubleVector "v2_mean" ans4_2 of - Right vec -> V.sum vec - Left _ -> 0 + Right vec -> VU.sum vec + Left _ -> 0 let sumV3 = case D.columnAsDoubleVector "v3_mean" ans4_2 of - Right vec -> V.sum vec - Left _ -> 0 - evaluate (sumV1, sumV2, sumV3) + Right vec -> VU.sum vec + Left _ -> 0 + print (sumV1, sumV2, sumV3) return (sumV1, sumV2, sumV3) - writeLog task dataName inRows question4 outRows4_2 outCols4_2 solution ver git fun 2 t4_2 m4_2 cache (makeChk [(\(a,_,_) -> a) chk4_2, (\(_,b,_) -> b) chk4_2, (\(_,_,c) -> c) chk4_2]) chkt4_2 onDisk machineType + writeLog + task + dataName + inRows + question4 + outRows4_2 + outCols4_2 + solution + ver + git + fun + 2 + t4_2 + m4_2 + cache + ( makeChk + [(\(a, _, _) -> a) chk4_2, (\(_, b, _) -> b) chk4_2, (\(_, _, c) -> c) chk4_2] + ) + 
chkt4_2 + onDisk + machineType putStrLn $ "Question 4 completed: " ++ show outRows4_2 ++ " groups" - -- Question 5: sum v1:v3 by id6 + -- Question 6: sum v1:v3 by id6 let question5 = "sum v1:v3 by id6" (ans5, t5_1) <- timeIt $ do let grouped = D.groupBy ["id6"] x - let result = D.aggregate - [F.sum (F.col @Double "v1") `F.as` "v1_sum", - F.sum (F.col @Double "v2") `F.as` "v2_sum", - F.sum (F.col @Double "v3") `F.as` "v3_sum"] grouped + let result = + D.aggregate + [ F.sum (F.col @Int "v1") `F.as` "v1_sum" + , F.sum (F.col @Int "v2") `F.as` "v2_sum" + , F.sum (F.col @Double "v3") `F.as` "v3_sum" + ] + grouped return result m5_1 <- getMemoryUsage let (outRows5, outCols5) = D.dimensions ans5 (chk5, chkt5_1) <- timeIt $ do - let sumV1 = case D.columnAsDoubleVector "v1_sum" ans5 of - Right vec -> V.sum vec - Left _ -> 0 - let sumV2 = case D.columnAsDoubleVector "v2_sum" ans5 of - Right vec -> V.sum vec - Left _ -> 0 - let sumV3 = case D.columnAsDoubleVector "v3_sum" ans5 of - Right vec -> V.sum vec - Left _ -> 0 - evaluate (sumV1, sumV2, sumV3) + let sumV1 = D.sum (F.col @Int "v1_sum") ans5 + let sumV2 = D.sum (F.col @Int "v2_sum") ans5 + let sumV3 = D.sum (F.col @Double "v3_sum") ans5 + print (sumV1, sumV2, sumV3) return (sumV1, sumV2, sumV3) - writeLog task dataName inRows question5 outRows5 outCols5 solution ver git fun 1 t5_1 m5_1 cache (makeChk [(\(a,_,_) -> a) chk5, (\(_,b,_) -> b) chk5, (\(_,_,c) -> c) chk5]) chkt5_1 onDisk machineType + writeLog + task + dataName + inRows + question5 + outRows5 + outCols5 + solution + ver + git + fun + 1 + t5_1 + m5_1 + cache + ( makeChk + [ (\(a, _, _) -> fromIntegral a) chk5 + , (\(_, b, _) -> fromIntegral b) chk5 + , (\(_, _, c) -> c) chk5 + ] + ) + chkt5_1 + onDisk + machineType -- Run 2 (ans5_2, t5_2) <- timeIt $ do let grouped = D.groupBy ["id6"] x - let result = D.aggregate - [F.sum (F.col @Double "v1") `F.as` "v1_sum", - F.sum (F.col @Double "v2") `F.as` "v2_sum", - F.sum (F.col @Double "v3") `F.as` "v3_sum"] grouped + 
let result = + D.aggregate + [ F.sum (F.col @Int "v1") `F.as` "v1_sum" + , F.sum (F.col @Int "v2") `F.as` "v2_sum" + , F.sum (F.col @Double "v3") `F.as` "v3_sum" + ] + grouped return result m5_2 <- getMemoryUsage let (outRows5_2, outCols5_2) = D.dimensions ans5_2 (chk5_2, chkt5_2) <- timeIt $ do - let sumV1 = case D.columnAsDoubleVector "v1_sum" ans5_2 of - Right vec -> V.sum vec - Left _ -> 0 - let sumV2 = case D.columnAsDoubleVector "v2_sum" ans5_2 of - Right vec -> V.sum vec - Left _ -> 0 - let sumV3 = case D.columnAsDoubleVector "v3_sum" ans5_2 of - Right vec -> V.sum vec - Left _ -> 0 - evaluate (sumV1, sumV2, sumV3) + let sumV1 = D.sum (F.col @Int "v1_sum") ans5_2 + let sumV2 = D.sum (F.col @Int "v2_sum") ans5_2 + let sumV3 = D.sum (F.col @Double "v3_sum") ans5_2 + print (sumV1, sumV2, sumV3) return (sumV1, sumV2, sumV3) - writeLog task dataName inRows question5 outRows5_2 outCols5_2 solution ver git fun 2 t5_2 m5_2 cache (makeChk [(\(a,_,_) -> a) chk5_2, (\(_,b,_) -> b) chk5_2, (\(_,_,c) -> c) chk5_2]) chkt5_2 onDisk machineType + writeLog + task + dataName + inRows + question5 + outRows5_2 + outCols5_2 + solution + ver + git + fun + 2 + t5_2 + m5_2 + cache + ( makeChk + [ (\(a, _, _) -> fromIntegral a) chk5_2 + , (\(_, b, _) -> fromIntegral b) chk5_2 + , (\(_, _, c) -> c) chk5_2 + ] + ) + chkt5_2 + onDisk + machineType putStrLn $ "Question 5 completed: " ++ show outRows5_2 ++ " groups" - putStrLn "Haskell dataframe groupby benchmark completed (5 questions implemented)!" - putStrLn "Note: Questions 6-10 would require median, regression, and top-n functions." 
+ -- Question 6: median v3 sd v3 by id4 id5 + let question6 = "median v3 sd v3 by id4 id5" + (ans6, t6_1) <- timeIt $ do + let grouped = D.groupBy ["id4", "id5"] x + let result = + D.aggregate + [ F.median (F.col @Double "v3") `F.as` "v3_median" + , F.stddev (F.col @Double "v3") `F.as` "v3_sd" + ] + grouped + return result + m6_1 <- getMemoryUsage + let (outRows6, outCols6) = D.dimensions ans6 + (chk6, chkt6_1) <- timeIt $ do + let sumMedianV3 = D.sum (F.col @Double "v3_median") ans6 + let sumSdV3 = D.sum (F.col @Double "v3_sd") ans6 + print (sumMedianV3, sumSdV3) + return (sumMedianV3, sumSdV3) + writeLog + task + dataName + inRows + question6 + outRows6 + outCols6 + solution + ver + git + fun + 1 + t6_1 + m6_1 + cache + (makeChk [fst chk6, snd chk6]) + chkt6_1 + onDisk + machineType + + -- Run 2 + (ans6_2, t6_2) <- timeIt $ do + let grouped = D.groupBy ["id4", "id5"] x + let result = + D.aggregate + [ F.median (F.col @Double "v3") `F.as` "v3_median" + , F.stddev (F.col @Double "v3") `F.as` "v3_sd" + ] + grouped + return result + m6_2 <- getMemoryUsage + let (outRows6_2, outCols6_2) = D.dimensions ans6_2 + (chk6_2, chkt6_2) <- timeIt $ do + let sumMedianV3 = D.sum (F.col @Double "v3_median") ans6_2 + let sumSdV3 = D.sum (F.col @Double "v3_sd") ans6_2 + print (sumMedianV3, sumSdV3) + return (sumMedianV3, sumSdV3) + writeLog + task + dataName + inRows + question6 + outRows6_2 + outCols6_2 + solution + ver + git + fun + 2 + t6_2 + m6_2 + cache + (makeChk [fst chk6_2, snd chk6_2]) + chkt6_2 + onDisk + machineType + putStrLn $ "Question 6 completed: " ++ show outRows6_2 ++ " groups" + + -- "max v1 - min v2 by id3" + let question7 = "max v1 - min v2 by id3" + (ans7, t7_1) <- timeIt $ do + let grouped = D.groupBy ["id3"] x + let result = + D.aggregate + [(F.maximum (F.col @Int "v1") - F.minimum (F.col @Int "v2")) `F.as` "diff"] + grouped + return result + m7_1 <- getMemoryUsage + let (outRows7, outCols7) = D.dimensions ans7 + (chk7, chkt7_1) <- timeIt $ do + let sumDiff 
= D.sum (F.col @Int "diff") ans7 + print sumDiff + return sumDiff + writeLog + task + dataName + inRows + question7 + outRows7 + outCols7 + solution + ver + git + fun + 1 + t7_1 + m7_1 + cache + (makeChk [fromIntegral chk7]) + chkt7_1 + onDisk + machineType + + -- Run 2 + (ans7_2, t7_2) <- timeIt $ do + let grouped = D.groupBy ["id3"] x + let result = + D.aggregate + [(F.maximum (F.col @Int "v1") - F.minimum (F.col @Int "v2")) `F.as` "diff"] + grouped + return result + m7_2 <- getMemoryUsage + let (outRows7_2, outCols7_2) = D.dimensions ans7_2 + (chk7_2, chkt7_2) <- timeIt $ do + let sumDiff = D.sum (F.col @Int "diff") ans7_2 + print sumDiff + return sumDiff + writeLog + task + dataName + inRows + question7 + outRows7_2 + outCols7_2 + solution + ver + git + fun + 2 + t7_2 + m7_2 + cache + (makeChk [fromIntegral chk7_2]) + chkt7_2 + onDisk + machineType + putStrLn $ "Question 7 completed: " ++ show outRows7_2 ++ " groups" + + -- "largest two v3 by id6" + putStrLn "largest two v3 by id6 unimplemented" + + -- "regression v1 v2 by id2 id4" + putStrLn "regression v1 v2 by id2 id4 unimplemented" + + -- "sum v3 count by id1:id6" + let question10 = "sum v3 count by id1:id6" + (ans10, t10_1) <- timeIt $ do + let grouped = + D.groupBy (zipWith (\i n -> i <> (T.pack . show) n) (cycle ["id"]) [1 .. 6]) x + let result = + D.aggregate + [F.sum (F.col @Double "v3") `F.as` "v3_sum"] + grouped + return result + m10_1 <- getMemoryUsage + let (outRows10, outCols10) = D.dimensions ans10 + (chk10, chkt10_1) <- timeIt $ do + let sumV3 = D.sum (F.col @Double "v3_sum") ans10 + print sumV3 + return sumV3 + writeLog + task + dataName + inRows + question10 + outRows10 + outCols10 + solution + ver + git + fun + 1 + t10_1 + m10_1 + cache + (makeChk [chk10]) + chkt10_1 + onDisk + machineType + + -- Run 2 + (ans10_2, t10_2) <- timeIt $ do + let grouped = + D.groupBy (zipWith (\i n -> i <> (T.pack . show) n) (cycle ["id"]) [1 .. 
6]) x + let result = + D.aggregate + [F.sum (F.col @Double "v3") `F.as` "v3_sum"] + grouped + return result + m10_2 <- getMemoryUsage + let (outRows10_2, outCols10_2) = D.dimensions ans10_2 + (chk10_2, chkt10_2) <- timeIt $ do + let sumDiff = D.sum (F.col @Double "v3_sum") ans10_2 + print sumDiff + return sumDiff + writeLog + task + dataName + inRows + question10 + outRows10_2 + outCols10_2 + solution + ver + git + fun + 2 + t10_2 + m10_2 + cache + (makeChk [chk10_2]) + chkt10_2 + onDisk + machineType + putStrLn $ "Question 10 completed: " ++ show outRows10_2 ++ " groups" + + putStrLn + "Haskell dataframe groupby benchmark completed (8 questions implemented)!" + +-- Helper functions for logging +writeLog :: + String -> + String -> + Int -> + String -> + Int -> + Int -> + String -> + String -> + String -> + String -> + Int -> + Double -> + Double -> + String -> + String -> + Double -> + String -> + String -> + IO () +writeLog task dataName inRows question outRows outCols solution version git fun run timeSec memGb cache chk chkTimeSec onDisk machineType = do + batch <- lookupEnv "BATCH" >>= return . maybe "" id + timestamp <- getPOSIXTime + csvFile <- lookupEnv "CSV_TIME_FILE" >>= return . 
maybe "time.csv" id + nodename <- fmap init (readProcess "hostname" [] "") + + let comment = "" + let timeSecRound = roundTo 3 timeSec + let chkTimeSecRound = roundTo 3 chkTimeSec + let memGbRound = roundTo 3 memGb + + let logRow = + intercalate + "," + [ nodename + , batch + , show timestamp + , task + , dataName + , show inRows + , question + , show outRows + , show outCols + , solution + , version + , git + , fun + , show run + , show timeSecRound + , show memGbRound + , cache + , chk + , show chkTimeSecRound + , comment + , onDisk + , machineType + ] + + fileExists <- doesFileExist csvFile + if fileExists + then appendFile csvFile (logRow ++ "\n") + else do + let header = + "nodename,batch,timestamp,task,data,in_rows,question,out_rows,out_cols,solution,version,git,fun,run,time_sec,mem_gb,cache,chk,chk_time_sec,comment,on_disk,machine_type\n" + writeFile csvFile (header ++ logRow ++ "\n") + +roundTo :: Int -> Double -> Double +roundTo n x = (fromInteger $ round $ x * (10 ^ n)) / (10.0 ^^ n) + +makeChk :: [Double] -> String +makeChk values = intercalate ";" (map formatVal values) + where + formatVal x = map (\c -> if c == ',' then '_' else c) (show $ roundTo 3 x) + +getMemoryUsage :: IO Double +getMemoryUsage = do + pid <- getProcessID + mem <- + fmap (filter (/= ' ') . 
init) (readProcess "ps" ["-o", "rss=", "-p", show pid] "") + let rssKb = if null mem then 0 else fromMaybe 0 (readMaybe @Double mem) + return (rssKb / (1024 * 1024)) + +timeIt :: (Show a) => IO a -> IO (a, Double) +timeIt action = do + start <- getPOSIXTime + result <- action + _ <- print result + end <- getPOSIXTime + return (result, realToFrac (end - start)) diff --git a/haskell/haskell-benchmark.cabal index 5775f1a0..5c0a06d9 100644 --- a/haskell/haskell-benchmark.cabal +++ b/haskell/haskell-benchmark.cabal @@ -12,6 +12,7 @@ executable groupby-haskell , time >= 1.9 , process >= 1.6 , directory >= 1.3 + , unix default-language: Haskell2010 ghc-options: -O2 -threaded -rtsopts -with-rtsopts=-N @@ -24,5 +25,6 @@ executable join-haskell , time >= 1.9 , process >= 1.6 , directory >= 1.3 + , unix default-language: Haskell2010 ghc-options: -O2 -threaded -rtsopts -with-rtsopts=-N diff --git a/haskell/join-haskell.hs index 32d0edcf..57036111 100644 --- a/haskell/join-haskell.hs +++ b/haskell/join-haskell.hs @@ -1,20 +1,40 @@ {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE TypeApplications #-} -import qualified DataFrame as D -import qualified DataFrame.Operations.Join as DJ +import Control.Exception (evaluate) +import Data.List (intercalate) import qualified Data.Text as T +import Data.Time.Clock.POSIX (getPOSIXTime) import qualified Data.Vector as V +import qualified Data.Vector.Unboxed as VU +import qualified DataFrame as D +import qualified DataFrame.Operations.Join as DJ +import System.Directory (doesFileExist) import System.Environment (getEnv, lookupEnv) import System.IO (hFlush, stdout) -import Data.Time.Clock.POSIX (getPOSIXTime) -import Control.Exception (evaluate) import System.Process (readProcess) -import System.Directory (doesFileExist) -import Data.List (intercalate) -- Helper functions for logging -writeLog :: String -> String -> Int -> String -> Int -> Int -> String -> String -> String -> String -> Int 
-> Double -> Double -> String -> String -> Double -> String -> String -> IO () +writeLog :: + String -> + String -> + Int -> + String -> + Int -> + Int -> + String -> + String -> + String -> + String -> + Int -> + Double -> + Double -> + String -> + String -> + Double -> + String -> + String -> + IO () writeLog task dataName inRows question outRows outCols solution version git fun run timeSec memGb cache chk chkTimeSec onDisk machineType = do batch <- lookupEnv "BATCH" >>= return . maybe "" id timestamp <- getPOSIXTime @@ -26,22 +46,43 @@ writeLog task dataName inRows question outRows outCols solution version git fun let chkTimeSecRound = roundTo 3 chkTimeSec let memGbRound = roundTo 3 memGb - let logRow = intercalate "," [ - nodename, batch, show timestamp, task, dataName, show inRows, - question, show outRows, show outCols, solution, version, git, fun, - show run, show timeSecRound, show memGbRound, cache, chk, - show chkTimeSecRound, comment, onDisk, machineType - ] + let logRow = + intercalate + "," + [ nodename + , batch + , show timestamp + , task + , dataName + , show inRows + , question + , show outRows + , show outCols + , solution + , version + , git + , fun + , show run + , show timeSecRound + , show memGbRound + , cache + , chk + , show chkTimeSecRound + , comment + , onDisk + , machineType + ] fileExists <- doesFileExist csvFile if fileExists then appendFile csvFile (logRow ++ "\n") else do - let header = "nodename,batch,timestamp,task,data,in_rows,question,out_rows,out_cols,solution,version,git,fun,run,time_sec,mem_gb,cache,chk,chk_time_sec,comment,on_disk,machine_type\n" + let header = + "nodename,batch,timestamp,task,data,in_rows,question,out_rows,out_cols,solution,version,git,fun,run,time_sec,mem_gb,cache,chk,chk_time_sec,comment,on_disk,machine_type\n" writeFile csvFile (header ++ logRow ++ "\n") roundTo :: Int -> Double -> Double -roundTo n x = (fromInteger $ round $ x * (10^n)) / (10.0^^n) +roundTo n x = (fromInteger $ round $ x * (10 ^ n)) / 
(10.0 ^^ n) makeChk :: [Double] -> String makeChk values = intercalate ";" (map formatVal values) @@ -50,9 +91,10 @@ makeChk values = intercalate ";" (map formatVal values) getMemoryUsage :: IO Double getMemoryUsage = do - pid <- fmap init (readProcess "bash" ["-c", "echo $$"] "") - mem <- fmap (filter (/= ' ') . init) (readProcess "ps" ["-o", "rss=", "-p", pid] "") - let rssKb = if null mem then 0 else read mem :: Double + pid <- getProcessID + mem <- + fmap (filter (/= ' ') . init) (readProcess "ps" ["-o", "rss=", "-p", show pid] "") + let rssKb = if null mem then 0 else fromMaybe 0 (readMaybe @Double mem) return (rssKb / (1024 * 1024)) timeIt :: IO a -> IO (a, Double) @@ -72,9 +114,10 @@ joinToTbls dataName = yn1 = show (floor (xn / 1e6) :: Int) ++ "e4" yn2 = show (floor (xn / 1e3) :: Int) ++ "e3" yn3 = show (floor xn :: Int) - in [T.unpack $ T.replace "NA" (T.pack yn1) (T.pack dataName), - T.unpack $ T.replace "NA" (T.pack yn2) (T.pack dataName), - T.unpack $ T.replace "NA" (T.pack yn3) (T.pack dataName)] + in [ T.unpack $ T.replace "NA" (T.pack yn1) (T.pack dataName) + , T.unpack $ T.replace "NA" (T.pack yn2) (T.pack dataName) + , T.unpack $ T.replace "NA" (T.pack yn3) (T.pack dataName) + ] main :: IO () main = do @@ -93,13 +136,22 @@ main = do machineType <- getEnv "MACHINE_TYPE" let yDataNames = joinToTbls dataName - let srcJnX = "data/" ++ dataName ++ ".csv" - let srcJnY = ["data/" ++ yDataNames !! 0 ++ ".csv", - "data/" ++ yDataNames !! 1 ++ ".csv", - "data/" ++ yDataNames !! 2 ++ ".csv"] + let srcJnX = "../data/" ++ dataName ++ ".csv" + let srcJnY = + [ "../data/" ++ yDataNames !! 0 ++ ".csv" + , "../data/" ++ yDataNames !! 1 ++ ".csv" + , "../data/" ++ yDataNames !! 2 ++ ".csv" + ] - putStrLn $ "loading datasets " ++ dataName ++ ", " ++ - yDataNames !! 0 ++ ", " ++ yDataNames !! 1 ++ ", " ++ yDataNames !! 2 + putStrLn $ + "loading datasets " + ++ dataName + ++ ", " + ++ yDataNames !! 0 + ++ ", " + ++ yDataNames !! 1 + ++ ", " + ++ yDataNames !! 
2 hFlush stdout -- Load all datasets using dataframe @@ -131,14 +183,32 @@ main = do let (outRows1, outCols1) = D.dimensions ans1 (chk1, chkt1_1) <- timeIt $ do let sumV1 = case D.columnAsDoubleVector "v1" ans1 of - Right vec -> V.sum vec - Left _ -> 0 + Right vec -> VU.sum vec + Left _ -> 0 let sumV2 = case D.columnAsDoubleVector "v2" ans1 of - Right vec -> V.sum vec - Left _ -> 0 + Right vec -> VU.sum vec + Left _ -> 0 evaluate (sumV1, sumV2) return (sumV1, sumV2) - writeLog task dataName xRows question1 outRows1 outCols1 solution ver git fun 1 t1_1 m1_1 cache (makeChk [fst chk1, snd chk1]) chkt1_1 onDisk machineType + writeLog + task + dataName + xRows + question1 + outRows1 + outCols1 + solution + ver + git + fun + 1 + t1_1 + m1_1 + cache + (makeChk [fst chk1, snd chk1]) + chkt1_1 + onDisk + machineType -- Run 2 (ans1_2, t1_2) <- timeIt $ do @@ -148,14 +218,32 @@ main = do let (outRows1_2, outCols1_2) = D.dimensions ans1_2 (chk1_2, chkt1_2) <- timeIt $ do let sumV1 = case D.columnAsDoubleVector "v1" ans1_2 of - Right vec -> V.sum vec - Left _ -> 0 + Right vec -> VU.sum vec + Left _ -> 0 let sumV2 = case D.columnAsDoubleVector "v2" ans1_2 of - Right vec -> V.sum vec - Left _ -> 0 + Right vec -> VU.sum vec + Left _ -> 0 evaluate (sumV1, sumV2) return (sumV1, sumV2) - writeLog task dataName xRows question1 outRows1_2 outCols1_2 solution ver git fun 2 t1_2 m1_2 cache (makeChk [fst chk1_2, snd chk1_2]) chkt1_2 onDisk machineType + writeLog + task + dataName + xRows + question1 + outRows1_2 + outCols1_2 + solution + ver + git + fun + 2 + t1_2 + m1_2 + cache + (makeChk [fst chk1_2, snd chk1_2]) + chkt1_2 + onDisk + machineType putStrLn $ "Question 1 completed: " ++ show outRows1_2 ++ " rows" -- Question 2: medium inner on int @@ -167,14 +255,32 @@ main = do let (outRows2, outCols2) = D.dimensions ans2 (chk2, chkt2_1) <- timeIt $ do let sumV1 = case D.columnAsDoubleVector "v1" ans2 of - Right vec -> V.sum vec - Left _ -> 0 + Right vec -> VU.sum vec + Left _ -> 0 let 
sumV2 = case D.columnAsDoubleVector "v2" ans2 of - Right vec -> V.sum vec - Left _ -> 0 + Right vec -> VU.sum vec + Left _ -> 0 evaluate (sumV1, sumV2) return (sumV1, sumV2) - writeLog task dataName xRows question2 outRows2 outCols2 solution ver git fun 1 t2_1 m2_1 cache (makeChk [fst chk2, snd chk2]) chkt2_1 onDisk machineType + writeLog + task + dataName + xRows + question2 + outRows2 + outCols2 + solution + ver + git + fun + 1 + t2_1 + m2_1 + cache + (makeChk [fst chk2, snd chk2]) + chkt2_1 + onDisk + machineType -- Run 2 (ans2_2, t2_2) <- timeIt $ do @@ -184,14 +290,32 @@ main = do let (outRows2_2, outCols2_2) = D.dimensions ans2_2 (chk2_2, chkt2_2) <- timeIt $ do let sumV1 = case D.columnAsDoubleVector "v1" ans2_2 of - Right vec -> V.sum vec - Left _ -> 0 + Right vec -> VU.sum vec + Left _ -> 0 let sumV2 = case D.columnAsDoubleVector "v2" ans2_2 of - Right vec -> V.sum vec - Left _ -> 0 + Right vec -> VU.sum vec + Left _ -> 0 evaluate (sumV1, sumV2) return (sumV1, sumV2) - writeLog task dataName xRows question2 outRows2_2 outCols2_2 solution ver git fun 2 t2_2 m2_2 cache (makeChk [fst chk2_2, snd chk2_2]) chkt2_2 onDisk machineType + writeLog + task + dataName + xRows + question2 + outRows2_2 + outCols2_2 + solution + ver + git + fun + 2 + t2_2 + m2_2 + cache + (makeChk [fst chk2_2, snd chk2_2]) + chkt2_2 + onDisk + machineType putStrLn $ "Question 2 completed: " ++ show outRows2_2 ++ " rows" -- Question 3: medium outer on int @@ -203,14 +327,32 @@ main = do let (outRows3, outCols3) = D.dimensions ans3 (chk3, chkt3_1) <- timeIt $ do let sumV1 = case D.columnAsDoubleVector "v1" ans3 of - Right vec -> V.sum vec - Left _ -> 0 + Right vec -> VU.sum vec + Left _ -> 0 let sumV2 = case D.columnAsDoubleVector "v2" ans3 of - Right vec -> V.sum vec - Left _ -> 0 + Right vec -> VU.sum vec + Left _ -> 0 evaluate (sumV1, sumV2) return (sumV1, sumV2) - writeLog task dataName xRows question3 outRows3 outCols3 solution ver git fun 1 t3_1 m3_1 cache (makeChk [fst chk3, snd 
chk3]) chkt3_1 onDisk machineType + writeLog + task + dataName + xRows + question3 + outRows3 + outCols3 + solution + ver + git + fun + 1 + t3_1 + m3_1 + cache + (makeChk [fst chk3, snd chk3]) + chkt3_1 + onDisk + machineType -- Run 2 (ans3_2, t3_2) <- timeIt $ do @@ -220,14 +362,32 @@ main = do let (outRows3_2, outCols3_2) = D.dimensions ans3_2 (chk3_2, chkt3_2) <- timeIt $ do let sumV1 = case D.columnAsDoubleVector "v1" ans3_2 of - Right vec -> V.sum vec - Left _ -> 0 + Right vec -> VU.sum vec + Left _ -> 0 let sumV2 = case D.columnAsDoubleVector "v2" ans3_2 of - Right vec -> V.sum vec - Left _ -> 0 + Right vec -> VU.sum vec + Left _ -> 0 evaluate (sumV1, sumV2) return (sumV1, sumV2) - writeLog task dataName xRows question3 outRows3_2 outCols3_2 solution ver git fun 2 t3_2 m3_2 cache (makeChk [fst chk3_2, snd chk3_2]) chkt3_2 onDisk machineType + writeLog + task + dataName + xRows + question3 + outRows3_2 + outCols3_2 + solution + ver + git + fun + 2 + t3_2 + m3_2 + cache + (makeChk [fst chk3_2, snd chk3_2]) + chkt3_2 + onDisk + machineType putStrLn $ "Question 3 completed: " ++ show outRows3_2 ++ " rows" -- Question 4: medium inner on factor @@ -239,14 +399,32 @@ main = do let (outRows4, outCols4) = D.dimensions ans4 (chk4, chkt4_1) <- timeIt $ do let sumV1 = case D.columnAsDoubleVector "v1" ans4 of - Right vec -> V.sum vec - Left _ -> 0 + Right vec -> VU.sum vec + Left _ -> 0 let sumV2 = case D.columnAsDoubleVector "v2" ans4 of - Right vec -> V.sum vec - Left _ -> 0 + Right vec -> VU.sum vec + Left _ -> 0 evaluate (sumV1, sumV2) return (sumV1, sumV2) - writeLog task dataName xRows question4 outRows4 outCols4 solution ver git fun 1 t4_1 m4_1 cache (makeChk [fst chk4, snd chk4]) chkt4_1 onDisk machineType + writeLog + task + dataName + xRows + question4 + outRows4 + outCols4 + solution + ver + git + fun + 1 + t4_1 + m4_1 + cache + (makeChk [fst chk4, snd chk4]) + chkt4_1 + onDisk + machineType -- Run 2 (ans4_2, t4_2) <- timeIt $ do @@ -256,14 +434,32 @@ main = do 
let (outRows4_2, outCols4_2) = D.dimensions ans4_2 (chk4_2, chkt4_2) <- timeIt $ do let sumV1 = case D.columnAsDoubleVector "v1" ans4_2 of - Right vec -> V.sum vec - Left _ -> 0 + Right vec -> VU.sum vec + Left _ -> 0 let sumV2 = case D.columnAsDoubleVector "v2" ans4_2 of - Right vec -> V.sum vec - Left _ -> 0 + Right vec -> VU.sum vec + Left _ -> 0 evaluate (sumV1, sumV2) return (sumV1, sumV2) - writeLog task dataName xRows question4 outRows4_2 outCols4_2 solution ver git fun 2 t4_2 m4_2 cache (makeChk [fst chk4_2, snd chk4_2]) chkt4_2 onDisk machineType + writeLog + task + dataName + xRows + question4 + outRows4_2 + outCols4_2 + solution + ver + git + fun + 2 + t4_2 + m4_2 + cache + (makeChk [fst chk4_2, snd chk4_2]) + chkt4_2 + onDisk + machineType putStrLn $ "Question 4 completed: " ++ show outRows4_2 ++ " rows" -- Question 5: big inner on int @@ -275,14 +471,32 @@ main = do let (outRows5, outCols5) = D.dimensions ans5 (chk5, chkt5_1) <- timeIt $ do let sumV1 = case D.columnAsDoubleVector "v1" ans5 of - Right vec -> V.sum vec - Left _ -> 0 + Right vec -> VU.sum vec + Left _ -> 0 let sumV2 = case D.columnAsDoubleVector "v2" ans5 of - Right vec -> V.sum vec - Left _ -> 0 + Right vec -> VU.sum vec + Left _ -> 0 evaluate (sumV1, sumV2) return (sumV1, sumV2) - writeLog task dataName xRows question5 outRows5 outCols5 solution ver git fun 1 t5_1 m5_1 cache (makeChk [fst chk5, snd chk5]) chkt5_1 onDisk machineType + writeLog + task + dataName + xRows + question5 + outRows5 + outCols5 + solution + ver + git + fun + 1 + t5_1 + m5_1 + cache + (makeChk [fst chk5, snd chk5]) + chkt5_1 + onDisk + machineType -- Run 2 (ans5_2, t5_2) <- timeIt $ do @@ -292,14 +506,32 @@ main = do let (outRows5_2, outCols5_2) = D.dimensions ans5_2 (chk5_2, chkt5_2) <- timeIt $ do let sumV1 = case D.columnAsDoubleVector "v1" ans5_2 of - Right vec -> V.sum vec - Left _ -> 0 + Right vec -> VU.sum vec + Left _ -> 0 let sumV2 = case D.columnAsDoubleVector "v2" ans5_2 of - Right vec -> V.sum vec - 
Left _ -> 0 + Right vec -> VU.sum vec + Left _ -> 0 evaluate (sumV1, sumV2) return (sumV1, sumV2) - writeLog task dataName xRows question5 outRows5_2 outCols5_2 solution ver git fun 2 t5_2 m5_2 cache (makeChk [fst chk5_2, snd chk5_2]) chkt5_2 onDisk machineType + writeLog + task + dataName + xRows + question5 + outRows5_2 + outCols5_2 + solution + ver + git + fun + 2 + t5_2 + m5_2 + cache + (makeChk [fst chk5_2, snd chk5_2]) + chkt5_2 + onDisk + machineType putStrLn $ "Question 5 completed: " ++ show outRows5_2 ++ " rows" putStrLn "Haskell dataframe join benchmark completed (5 questions implemented)!" diff --git a/haskell/setup-haskell.sh b/haskell/setup-haskell.sh index 391e93e1..be35eaf7 100755 --- a/haskell/setup-haskell.sh +++ b/haskell/setup-haskell.sh @@ -16,8 +16,8 @@ fi # Install dependencies and build stack setup -stack build --only-dependencies -stack build +stack build --only-dependencies --ghc-options "-O2" +stack build --ghc-options "-O2" cd .. diff --git a/haskell/stack.yaml b/haskell/stack.yaml new file mode 100644 index 00000000..95f46dd4 --- /dev/null +++ b/haskell/stack.yaml @@ -0,0 +1,67 @@ +# This file was automatically generated by 'stack init' +# +# Some commonly used options have been documented as comments in this file. +# For advanced use and comprehensive documentation of the format, please see: +# https://docs.haskellstack.org/en/stable/configure/yaml/ + +# A 'specific' Stackage snapshot or a compiler version. +# A snapshot resolver dictates the compiler version and the set of packages +# to be used for project dependencies. For example: +# +# snapshot: lts-23.0 +# snapshot: nightly-2024-12-13 +# snapshot: ghc-9.8.4 +# +# The location of a snapshot can be provided as a file or url. Stack assumes +# a snapshot provided as a file might change, whereas a url resource does not. 
+# +# snapshot: ./custom-snapshot.yaml +# snapshot: https://example.com/snapshots/2024-01-01.yaml +snapshot: + url: https://raw.githubusercontent.com/commercialhaskell/stackage-snapshots/master/nightly/2025/11/19.yaml + +# User packages to be built. +# Various formats can be used as shown in the example below. +# +# packages: +# - some-directory +# - https://example.com/foo/bar/baz-0.0.2.tar.gz +# subdirs: +# - auto-update +# - wai +packages: +- . +# Dependency packages to be pulled from upstream that are not in the snapshot. +# These entries can reference officially published versions as well as +# forks / in-progress versions pinned to a git hash. For example: +# +# extra-deps: +# - acme-missiles-0.3 +# - git: https://github.com/commercialhaskell/stack.git +# commit: e7b331f14bcffb8367cd58fbfc8b40ec7642100a +# +# extra-deps: [] + +# Override default flag values for project packages and extra-deps +# flags: {} + +# Extra package databases containing global packages +# extra-package-dbs: [] + +# Control whether we use the GHC we find on the path +# system-ghc: true +# +# Require a specific version of Stack, using version ranges +# require-stack-version: -any # Default +# require-stack-version: ">=3.3" +# +# Override the architecture used by Stack, especially useful on Windows +# arch: i386 +# arch: x86_64 +# +# Extra directories used by Stack for building +# extra-include-dirs: [/path/to/dir] +# extra-lib-dirs: [/path/to/dir] +# +# Allow a newer minor version of GHC than the snapshot specifies +# compiler-check: newer-minor diff --git a/run.conf b/run.conf index 0aa2cf69..641a49f5 100644 --- a/run.conf +++ b/run.conf @@ -1,7 +1,7 @@ # task, used in init-setup-iteration.R export RUN_TASKS="groupby join" # solution, used in init-setup-iteration.R -export RUN_SOLUTIONS="collapse data.table juliads juliadf dplyr pandas pydatatable spark dask clickhouse polars R-arrow duckdb duckdb-latest datafusion chdb" +export RUN_SOLUTIONS="collapse data.table juliads juliadf dplyr 
pandas pydatatable spark dask clickhouse polars R-arrow duckdb duckdb-latest datafusion chdb haskell" # flag to upgrade tools, used in run.sh on init export DO_UPGRADE=false diff --git a/run.sh b/run.sh index f17a5644..c2408bb2 100755 --- a/run.sh +++ b/run.sh @@ -92,6 +92,8 @@ if [[ "$DO_UPGRADE" == true && "$RUN_SOLUTIONS" == "duckdb-latest" ]]; then ./du if [[ "$RUN_SOLUTIONS" == "duckdb-latest" ]]; then ./duckdb-latest/ver-duckdb-latest.sh; fi; if [[ "$DO_UPGRADE" == true && "$RUN_SOLUTIONS" =~ "datafusion" ]]; then ./datafusion/upg-datafusion.sh; fi; if [[ "$RUN_SOLUTIONS" =~ "datafusion" ]]; then ./datafusion/ver-datafusion.sh; fi; +if [[ "$DO_UPGRADE" == true && "$RUN_SOLUTIONS" =~ "haskell" ]]; then ./haskell/upg-haskell.sh; fi; +if [[ "$RUN_SOLUTIONS" =~ "haskell" ]]; then ./haskell/ver-haskell.sh; fi; # run if [[ -f ./stop ]]; then echo "# Benchmark run $BATCH has been interrupted after $(($(date +%s)-$BATCH))s due to 'stop' file" && rm -f ./stop && rm -f ./run.lock && exit; fi;