fileframe is a lightweight, immutable DataFrame library for Go. It provides intuitive data manipulation without the complexity of pandas-like APIs.
- Immutable operations - All methods return new DataFrames, never modify the original
- Multiple file formats - CSV, TSV, LTSV, Parquet, XLSX
- Compression support - gzip, bzip2, xz, zstd
go get github.com/nao1215/fileframe
// From file (auto-detects format and compression)
df, err := fileframe.NewDataFrameFromPath("data.csv")
df, err := fileframe.NewDataFrameFromPath("data.csv.gz") // compressed
df, err := fileframe.NewDataFrameFromPath("data.parquet") // Parquet
// From slice of maps
df := fileframe.NewDataFrameFromRecords([]map[string]any{
{"name": "Alice", "age": 30, "city": "Tokyo"},
{"name": "Bob", "age": 25, "city": "Osaka"},
})
// From io.Reader
df, err := fileframe.NewDataFrame(reader, fileframe.CSV)
// All operations return NEW DataFrames (immutable)
result := df.
Filter(func(row map[string]any) bool {
age, _ := row["age"].(int64)
return age >= 20
}).
Mutate("adult", func(row map[string]any) any {
return true
}).
Select("name", "age", "adult")
grouped, err := df.GroupBy("city")
if err != nil {
log.Fatal(err)
}
totals, err := grouped.Sum("sales") // Returns (*DataFrame, error)
averages, err := grouped.Mean("price") // Returns (*DataFrame, error)
counts := grouped.Count() // Returns *DataFrame
err := df.ToCSV("output.csv")
err := df.ToTSV("output.tsv")
records := df.ToRecords() // []map[string]any
Every operation returns a new DataFrame. The original is never modified.
original := fileframe.NewDataFrameFromRecords(data)
filtered := original.Filter(fn) // original is unchanged
mutated := filtered.Mutate(...) // filtered is unchanged
Chain multiple operations for clean, readable code:
result := df.
FillNA(0).
Filter(filterFn).
Mutate("new_col", mutateFn).
Select("col1", "col2", "new_col").
Head(100)
Operations that can fail return (*DataFrame, error):
// These return errors
grouped, err := df.GroupBy("category")
sorted, err := df.Sort("price", fileframe.Descending)
joined, err := df.Join(other, opt)
// These never fail (return *DataFrame directly)
filtered := df.Filter(fn)
selected := df.Select("col1", "col2")
head := df.Head(10)
// Filter rows
adults := df.Filter(func(row map[string]any) bool {
age, ok := row["age"].(int64)
return ok && age >= 18
})
// Select columns
subset := df.Select("name", "email", "phone")
// Drop columns
cleaned := df.Drop("internal_id", "debug_flag")
// Add new column
withTotal := df.Mutate("total", func(row map[string]any) any {
qty, _ := row["quantity"].(int64)
price, _ := row["price"].(float64)
return float64(qty) * price
})
// Rename columns
renamed, err := df.Rename("old_name", "new_name")
renamed, err := df.RenameColumns(map[string]string{
"col1": "column_one",
"col2": "column_two",
})
// Sort by single column
sorted, err := df.Sort("price", fileframe.Descending)
// Sort by multiple columns
sorted, err := df.SortBy(
fileframe.SortOption{Column: "category", Order: fileframe.Ascending},
fileframe.SortOption{Column: "price", Order: fileframe.Descending},
)
// Remove duplicates
unique := df.Distinct()
unique := df.DistinctBy("email") // by specific column
first10 := df.Head(10)
last5 := df.Tail(5)
limited := df.Limit(100) // alias for Head
// Remove rows with nil
cleaned := df.DropNA()
cleaned := df.DropNASubset("required_field")
// Fill nil values
filled := df.FillNA(0)
filled := df.FillNAByColumn(map[string]any{
"name": "Unknown",
"age": 0,
"active": false,
})
users := fileframe.NewDataFrameFromRecords([]map[string]any{
{"id": 1, "name": "Alice"},
{"id": 2, "name": "Bob"},
})
orders := fileframe.NewDataFrameFromRecords([]map[string]any{
{"user_id": 1, "product": "Laptop"},
{"user_id": 1, "product": "Mouse"},
})
// Join types: InnerJoin, LeftJoin, RightJoin, OuterJoin
result, err := users.Join(orders, fileframe.JoinOption{
On: []string{"id", "user_id"}, // left column, right column
How: fileframe.LeftJoin,
})
// Same schema
combined, err := df1.Concat(df2, df3)
// Different schemas (union of columns, nil for missing)
combined, err := fileframe.ConcatAll(df1, df2, df3)
grouped, err := df.GroupBy("category")
if err != nil {
log.Fatal(err)
}
// Built-in aggregations
counts := grouped.Count() // *DataFrame
sums, err := grouped.Sum("amount") // (*DataFrame, error)
means, err := grouped.Mean("price") // (*DataFrame, error)
mins, err := grouped.Min("value") // (*DataFrame, error)
maxs, err := grouped.Max("value") // (*DataFrame, error)
// Custom aggregation
median, err := grouped.Agg("value", func(values []any) any {
// Your aggregation logic here
return computeMedian(values)
})
// Global aggregation (no grouping)
globalGrouped, _ := df.GroupBy()
totalSum, _ := globalGrouped.Sum("amount")
package main
import (
"fmt"
"log"
"github.com/nao1215/fileframe"
)
func main() {
// Load sales data
df, err := fileframe.NewDataFrameFromPath("sales.csv")
if err != nil {
log.Fatal(err)
}
// Process data
result := df.
FillNAByColumn(map[string]any{"salesperson": "Unknown"}).
Filter(func(row map[string]any) bool {
amount, _ := row["amount"].(int64)
return amount > 0
}).
Mutate("revenue", func(row map[string]any) any {
qty, _ := row["quantity"].(int64)
price, _ := row["price"].(int64)
return qty * price
})
// Aggregate by region
grouped, err := result.GroupBy("region")
if err != nil {
log.Fatal(err)
}
byRegion, err := grouped.Sum("revenue")
if err != nil {
log.Fatal(err)
}
// Sort and get top 3
sorted, err := byRegion.Sort("sum_revenue", fileframe.Descending)
if err != nil {
log.Fatal(err)
}
top3 := sorted.Head(3)
// Output results
for _, row := range top3.ToRecords() {
fmt.Printf("%s: %v\n", row["region"], row["sum_revenue"])
}
// Export
if err := top3.ToCSV("top_regions.csv"); err != nil {
log.Fatal(err)
}
}
| Format | Read | Write | Compression |
|---|---|---|---|
| CSV | Yes | Yes | gzip, bzip2, xz, zstd |
| TSV | Yes | Yes | gzip, bzip2, xz, zstd |
| LTSV | Yes | - | gzip, bzip2, xz, zstd |
| Parquet | Yes | - | gzip, bzip2, xz, zstd |
| XLSX | Yes | - | gzip, bzip2, xz, zstd |
Benchmarks on AMD RYZEN AI MAX+ 395:
| Operation | 100 rows | 1,000 rows | 10,000 rows |
|---|---|---|---|
| CSV Parse | 140 µs | 1.4 ms | 5.1 ms |
| Filter | 27 µs | 304 µs | 2.9 ms |
| Select | 13 µs | 110 µs | 1.5 ms |
| Mutate | 37 µs | 332 µs | 4.5 ms |
| GroupBy + Sum | 7 µs | 57 µs | 635 µs |
Memory usage (10,000 rows):
- CSV Parse: 8.4 MB
- Filter: 5.2 MB
- GroupBy + Sum: 408 KB
Use fileframe when:
- Working with small to medium datasets (< 100,000 rows)
- Need simple, readable data transformations
- Want immutable operations for predictable code
- Working with multiple file formats
Consider alternatives when:
- Processing very large files (use filesql for streaming)
- Need complex SQL-like queries (use filesql)
- Require lazy evaluation
- nao1215/filesql - SQL driver for CSV, TSV, LTSV, Parquet, XLSX with streaming support
- nao1215/fileprep - Struct-tag preprocessing and validation
Contributions are welcome! Please see the Contributing Guide for details.
If you find this project useful:
- Give it a star on GitHub
- Become a sponsor
MIT License - see LICENSE for details.
