Merged
2 changes: 1 addition & 1 deletion .github/workflows/regression.yml
@@ -17,7 +17,7 @@ jobs:
strategy:
fail-fast: false
matrix:
solution: [data.table, collapse, dplyr, pandas, spark, polars, R-arrow, duckdb, datafusion, dask, clickhouse, chdb]
solution: [data.table, collapse, dplyr, pandas, spark, polars, R-arrow, duckdb, datafusion, dask, clickhouse, chdb, haskell]
name: Solo solutions
runs-on: ubuntu-latest
env:
2 changes: 2 additions & 0 deletions .gitignore
@@ -36,3 +36,5 @@ workdir/
timeout-exit-codes.out
*/target
*.lock
dist-newstyle
.stack-work
1 change: 1 addition & 0 deletions README.md
@@ -29,6 +29,7 @@ Contribution and feedback are very welcome!
- [x] [DataFrames.jl](https://github.com/JuliaData/DataFrames.jl)
- [x] [In Memory DataSets](https://github.com/sl-solution/InMemoryDatasets.jl)
- [x] [Datafusion](https://github.com/apache/arrow-datafusion)
- [x] [(haskell)dataframe](https://github.com/mchav/dataframe)

If you would like your solution to be included, feel free to file a PR with the necessary setup-_solution_/ver-_solution_/groupby-_solution_/join-_solution_ scripts. If the team at DuckDB Labs approves the PR it will be merged. In the interest of transparency and fairness, only results from open-source data-science tools will be merged.

36 changes: 31 additions & 5 deletions _benchplot/benchplot-dict.R
@@ -46,7 +46,8 @@ solution.dict = {list(
"duckdb" = list(name=c(short="duckdb", long="DuckDB"), color=c(strong="#ddcd07", light="#fff100")),
"duckdb-latest" = list(name=c(short="duckdb-latest", long="duckdb-latest"), color=c(strong="#ddcd07", light="#fff100")),
"datafusion" = list(name=c(short="datafusion", long="Datafusion"), color=c(strong="deepskyblue4", light="deepskyblue3")),
"chdb" = list(name=c(short="chdb", long="chDB"), color=c(strong="hotpink4", light="hotpink1"))
"chdb" = list(name=c(short="chdb", long="chDB"), color=c(strong="hotpink4", light="hotpink1")),
"haskell" = list(name=c(short="haskell", long="Haskell"), color=c(strong="#3d0569ff", light="#61298bff")),
)}
#barplot(rep(c(0L,1L,1L), length(solution.dict)),
# col=rev(c(rbind(sapply(solution.dict, `[[`, "color"), "black"))),
@@ -259,7 +260,19 @@ groupby.syntax.dict = {list(
"largest two v3 by id6" = "SELECT id6, arrayJoin(arraySlice(arrayReverseSort(groupArray(v3)), 1, 2)) AS v3 FROM (SELECT id6, v3 FROM db_benchmark.x WHERE v3 IS NOT NULL) AS subq GROUP BY id6",
"regression v1 v2 by id2 id4" = "SELECT id2, id4, pow(corr(v1, v2), 2) AS r2 FROM db_benchmark.x GROUP BY id2, id4",
"sum v3 count by id1:id6" = "SELECT id1, id2, id3, id4, id5, id6, sum(v3) AS v3, count() AS cnt FROM db_benchmark.x GROUP BY id1, id2, id3, id4, id5, id6"
)}
)},
"haskell" = {c(
"sum v1 by id1" = "df |> D.groupBy [\"id1\"] |> D.aggregate [\"v1_sum\" .= F.sum (F.col @Int \"v1\")]",
"sum v1 by id1:id2" = "df |> D.groupBy [\"id1\", \"id2\"] |> D.aggregate [\"v1_sum\" .= F.sum (F.col @Int \"v1\")]",
"sum v1 mean v3 by id3" = "df |> D.groupBy [\"id3\"] |> D.aggregate [\"v1_sum\" .= F.sum (F.col @Int \"v1\"), \"v3_mean\" .= F.mean (F.col @Double \"v3\")]",
"mean v1:v3 by id4" = "df |> D.groupBy [\"id4\"] |> D.aggregate [\"v1_mean\" .= F.mean (F.col @Int \"v1\"), \"v2_mean\" .= F.mean (F.col @Int \"v2\"), \"v3_mean\" .= F.mean (F.col @Double \"v3\")]",
"sum v1:v3 by id6" = "df |> D.groupBy [\"id6\"] |> D.aggregate [\"v1_sum\" .= F.sum (F.col @Int \"v1\"), \"v2_sum\" .= F.sum (F.col @Int \"v2\"), \"v3_sum\" .= F.sum (F.col @Double \"v3\")]",
"median v3 sd v3 by id4 id5" = "df |> D.groupBy [\"id4\", \"id5\"] |> D.aggregate [\"v3_median\" .= F.median (F.col @Double \"v3\"), \"v3_sd\" .= F.stddev (F.col @Double \"v3\")]",
"max v1 - min v2 by id3" = "df |> D.groupBy [\"id3\"] |> D.aggregate [\"diff\" .= F.maximum (F.col @Int \"v1\") - F.minimum (F.col @Int \"v2\")]",
"largest two v3 by id6" = "",
"regression v1 v2 by id2 id4" = "",
"sum v3 count by id1:id6" = "df |> D.groupBy [\"id1\", \"id2\", \"id3\", \"id4\", \"id5\", \"id6\"] |> D.aggregate [\"v3_sum\" .= F.sum (F.col @Double \"v3\"), \"cnt\" .= F.count (F.col @Int \"v1\")]"
)}
)}
groupby.query.exceptions = {list(
"collapse" = list(),
@@ -277,7 +290,8 @@ groupby.syntax.dict = {list(
"duckdb" = list(),
"duckdb-latest" = list(),
"datafusion" = list(),
"chdb" = list()
"chdb" = list(),
"haskell" = list()
)}
groupby.data.exceptions = {list( # exceptions as of run 1575727624
"collapse" = {list(
@@ -348,6 +362,8 @@ groupby.data.exceptions = {list(
"Not Tested" = c("G1_1e9_1e2_0_0")
)},
"chdb" = {list(
)},
"haskell" = {list(
)}
)}
groupby.exceptions = task.exceptions(groupby.query.exceptions, groupby.data.exceptions)
@@ -472,7 +488,14 @@ join.syntax.dict = {list(
"medium outer on int" = "SELECT x.*, medium.id1 AS medium_id1, medium.id4 AS medium_id4, medium.id5 as medium_id5, v2 FROM db_benchmark.x AS x LEFT JOIN db_benchmark.medium AS medium USING (id2)",
"medium inner on factor" = "SELECT x.*, medium.id1 AS medium_id1, medium.id2 AS medium_id2, medium.id4 as medium_id4, v2 FROM db_benchmark.x AS x INNER JOIN db_benchmark.medium AS medium USING (id5)",
"big inner on int" = "SELECT x.*, big.id1 AS big_id1, big.id2 AS big_id2, big.id4 as big_id4, big.id5 AS big_id5, big.id6 AS big_id6, v2 FROM db_benchmark.x AS x INNER JOIN db_benchmark.big AS big USING (id3)"
)}
)},
"haskell" = {c(
"small inner on int" = "D.innerJoin [\"id1\"] x small",
"medium inner on int" = "D.innerJoin [\"id2\"] x medium",
"medium outer on int" = "D.leftJoin [\"id2\"] x medium",
"medium inner on factor" = "D.innerJoin [\"id5\"] x medium",
"big inner on int" = "D.innerJoin [\"id3\"] x big"
)}
)}
join.query.exceptions = {list(
"collapse" = list(),
@@ -490,7 +513,8 @@ join.query.exceptions = {list(
"duckdb" = list(),
"duckdb-latest" = list(),
"datafusion" = list(),
"chdb" = list()
"chdb" = list(),
"haskell" = list()
)}
join.data.exceptions = {list( # exceptions as of run 1575727624
"collapse" = {list(
@@ -550,6 +574,8 @@ join.data.exceptions = {list(
"Not tested" = c("J1_1e9_NA_0_0")
)},
"chdb" = {list(
)},
"haskell" = {list(
)}
)}
join.exceptions = task.exceptions(join.query.exceptions, join.data.exceptions)
2 changes: 2 additions & 0 deletions _control/solutions.csv
@@ -33,3 +33,5 @@ datafusion,groupby
datafusion,join
chdb,groupby
chdb,join
haskell,groupby
haskell,join
2 changes: 1 addition & 1 deletion _launcher/launcher.R
@@ -16,7 +16,7 @@ file.ext = function(x) {
x,
"collapse"=, "data.table"=, "dplyr"=, "h2o"=, "R-arrow"=, "duckdb"="R", "duckdb-latest"="R",
"pandas"=, "spark"=, "pydatatable"=, "modin"=, "dask"=, "datafusion"=, "polars"="py",
"clickhouse"="sh", "juliadf"="jl", "juliads"="jl", "chdb"="py"
"clickhouse"="sh", "juliadf"="jl", "juliads"="jl", "chdb"="py", "haskell"="hs",
)
if (is.null(ans)) stop(sprintf("solution %s does not have file extension defined in file.ext helper function", x))
ans
4 changes: 2 additions & 2 deletions _launcher/solution.R
@@ -112,7 +112,7 @@ file.ext = function(x) {
x,
"collapse"=, "data.table"=, "dplyr"=, "h2o"=, "R-arrow"=, "duckdb"="R", "duckdb-latest"="R",
"pandas"="py", "spark"=, "pydatatable"=, "modin"=, "dask"=, "datafusion"=, "polars"="py",
"clickhouse"="sh", "juliadf"="jl", "juliads"="jl", "chdb"="py"
"clickhouse"="sh", "juliadf"="jl", "juliads"="jl", "chdb"="py", "haskell"="hs",
)
if (is.null(ans)) stop(sprintf("solution %s does not have file extension defined in file.ext helper function", x))
ans
@@ -153,7 +153,7 @@ setenv("SRC_DATANAME", d)

ns = solution.path(s)
ext = file.ext(s)
localcmd = if (s %in% c("clickhouse","h2o","juliadf", "juliads")) { # custom launcher bash script, for clickhouse h2o juliadf
localcmd = if (s %in% c("clickhouse","h2o","juliadf", "juliads", "haskell")) { # custom launcher bash script, for clickhouse h2o juliadf
sprintf("exec.sh %s", t)
} else if (s %in% c("dask")) {
sprintf("%s_%s.%s", t, ns, ext)
2 changes: 1 addition & 1 deletion _report/report.R
@@ -6,7 +6,7 @@ get_report_status_file = function(path=getwd()) {
file.path(path, "report-done")
}
get_report_solutions = function() {
c("duckdb-latest", "collapse", "data.table", "dplyr", "pandas", "pydatatable", "spark", "dask", "juliadf", "juliads", "clickhouse", "cudf", "polars", "duckdb", "datafusion", "arrow", "R-arrow", "chdb")
c("duckdb-latest", "collapse", "data.table", "dplyr", "pandas", "pydatatable", "spark", "dask", "juliadf", "juliads", "clickhouse", "cudf", "polars", "duckdb", "datafusion", "arrow", "R-arrow", "chdb", "haskell")
}
get_data_levels = function() {
## groupby
85 changes: 85 additions & 0 deletions haskell/README.md
@@ -0,0 +1,85 @@
# Haskell DataFrame Benchmark

This benchmark entry uses Haskell with the `mchav/dataframe` library to implement dataframe operations.

## Implementation Details

- **Language**: Haskell (GHC)
- **DataFrame Library**: [mchav/dataframe](https://github.com/mchav/dataframe)
- **Build Tool**: Stack

## About mchav/dataframe

The `dataframe` library is a fast, safe, and intuitive DataFrame library for Haskell that provides:
- Type-safe column operations
- Familiar operations for users coming from pandas, dplyr, or polars
- Concise, declarative, and composable data pipelines
- Static typing that catches many bugs at compile time

Resources:
- GitHub: https://github.com/mchav/dataframe
- Hackage: https://hackage.haskell.org/package/dataframe
- Documentation: https://dataframe.readthedocs.io/

## Implemented Benchmarks

### Groupby (`groupby-haskell.hs`)
Implements 5 out of 10 groupby questions:
1. sum v1 by id1
2. sum v1 by id1:id2
3. sum v1 mean v3 by id3
4. mean v1:v3 by id4
5. sum v1:v3 by id6

Uses `D.groupBy` and `D.aggregate` with the library's expression DSL (`F.sum`, `F.mean`).

Note: Questions 6-10 would require additional statistical functions (median, standard deviation, regression, top-n selection).
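The semantics of the `D.groupBy`/`D.aggregate` pipeline can be illustrated without the dataframe library itself. The sketch below reproduces q1 ("sum v1 by id1") with `Data.Map` from the `containers` package bundled with GHC; the `rows` data is made up for illustration.

```haskell
import qualified Data.Map.Strict as M

-- Toy (id1, v1) pairs; in the benchmark these come from the CSV.
rows :: [(String, Int)]
rows = [("id001", 1), ("id002", 5), ("id001", 3), ("id002", 2)]

-- Mirrors: df |> D.groupBy ["id1"] |> D.aggregate [...]
-- fromListWith (+) folds together all v1 values sharing the same id1 key.
sumV1ById1 :: [(String, Int)] -> M.Map String Int
sumV1ById1 = M.fromListWith (+)

main :: IO ()
main = print (M.toList (sumV1ById1 rows))
```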

### Join (`join-haskell.hs`)
Implements all 5 join questions:
1. small inner on int
2. medium inner on int
3. medium outer on int (using leftJoin)
4. medium inner on factor
5. big inner on int

Uses `DJ.innerJoin` and `DJ.leftJoin` from `DataFrame.Operations.Join`.
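The single-key inner-join semantics behind these queries can likewise be sketched with plain `containers`; the toy tables below are invented for illustration, and the real library additionally handles multi-column keys and full row payloads.

```haskell
import qualified Data.Map.Strict as M
import Data.Maybe (mapMaybe)

-- Toy tables: the Int key stands in for the join column (e.g. id1),
-- the String for the remaining columns of each row.
x, small :: [(Int, String)]
x     = [(1, "x-a"), (2, "x-b"), (3, "x-c")]
small = [(1, "s-a"), (3, "s-c")]

-- Inner join on one key: index the right table, keep only left rows
-- whose key is present, and pair the payloads.
innerJoin :: [(Int, String)] -> [(Int, String)] -> [(Int, (String, String))]
innerJoin left right =
  let idx = M.fromList right
  in mapMaybe (\(k, v) -> fmap (\w -> (k, (v, w))) (M.lookup k idx)) left

main :: IO ()
main = print (innerJoin x small)
```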

## Setup

Run the setup script to install dependencies:
```bash
./haskell/setup-haskell.sh
```

This will:
1. Install Stack (if not present)
2. Initialize the Stack project
3. Build all dependencies
4. Compile the benchmark executables

## API Usage Examples

```haskell
-- Read CSV
df <- D.readCsv "data/file.csv"

-- GroupBy with aggregation
let grouped = D.groupBy ["id1"] df
let result = D.aggregate [F.sum (F.col @Double "v1") `F.as` "v1_sum"] grouped

-- Inner Join
let joined = DJ.innerJoin ["id1"] df1 df2

-- Get dimensions
let (rows, cols) = D.dimensions df
```

## Performance Notes

The implementation uses:
- Type-safe column operations with `TypeApplications`
- Expression DSL for clean aggregation syntax
- Efficient grouping and joining operations from the dataframe library

This benchmark demonstrates Haskell's capabilities for high-performance dataframe operations with the additional benefits of static typing and functional programming.
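As a side note, the `@Int` and `@Double` annotations used throughout (e.g. `F.col @Int "v1"`) come from GHC's `TypeApplications` extension. A minimal standalone illustration, independent of the dataframe library:

```haskell
{-# LANGUAGE TypeApplications #-}

-- read is polymorphic in its result type; @Int selects the instance
-- at the call site, just as F.col @Int fixes a column's element type.
parsed :: Int
parsed = read @Int "42"

main :: IO ()
main = print parsed
```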
1 change: 1 addition & 0 deletions haskell/VERSION
@@ -0,0 +1 @@
0.3.3.7
6 changes: 6 additions & 0 deletions haskell/exec.sh
@@ -0,0 +1,6 @@
#!/bin/bash
set -e

cd ./haskell

stack run "$1-haskell"