Skip to content

nao1215/fileframe

Go Reference Go Report Card MultiPlatformUnitTest Coverage

fileframe

logo

fileframe is a lightweight, immutable DataFrame library for Go. It provides intuitive data manipulation without the complexity of pandas-like APIs.

Features

  • Immutable operations - All methods return new DataFrames and never modify the original
  • Multiple file formats - CSV, TSV, LTSV, Parquet, XLSX
  • Compression support - gzip, bzip2, xz, zstd

Installation

go get github.com/nao1215/fileframe

Quick Start

1. Load Data

// From file (auto-detects format and compression)
df, err := fileframe.NewDataFrameFromPath("data.csv")
df, err := fileframe.NewDataFrameFromPath("data.csv.gz")  // compressed
df, err := fileframe.NewDataFrameFromPath("data.parquet") // Parquet

// From slice of maps
df := fileframe.NewDataFrameFromRecords([]map[string]any{
    {"name": "Alice", "age": 30, "city": "Tokyo"},
    {"name": "Bob", "age": 25, "city": "Osaka"},
})

// From io.Reader
df, err := fileframe.NewDataFrame(reader, fileframe.CSV)

2. Transform Data

// All operations return NEW DataFrames (immutable)
result := df.
    Filter(func(row map[string]any) bool {
        age, _ := row["age"].(int64)
        return age >= 20
    }).
    Mutate("adult", func(row map[string]any) any {
        return true
    }).
    Select("name", "age", "adult")

3. Aggregate Data

grouped, err := df.GroupBy("city")
if err != nil {
    log.Fatal(err)
}

totals, err := grouped.Sum("sales")    // Returns (*DataFrame, error)
averages, err := grouped.Mean("price") // Returns (*DataFrame, error)
counts := grouped.Count()              // Returns *DataFrame

4. Export Data

err := df.ToCSV("output.csv")
err := df.ToTSV("output.tsv")
records := df.ToRecords() // []map[string]any

Core Concepts

Immutability

Every operation returns a new DataFrame. The original is never modified.

original := fileframe.NewDataFrameFromRecords(data)
filtered := original.Filter(fn)  // original is unchanged
mutated := filtered.Mutate(...)  // filtered is unchanged

Method Chaining

Chain multiple operations for clean, readable code:

result := df.
    FillNA(0).
    Filter(filterFn).
    Mutate("new_col", mutateFn).
    Select("col1", "col2", "new_col").
    Head(100)

Error Handling

Operations that can fail return an error as their last return value, usually alongside the result (for example, (*DataFrame, error)):

// These return errors
grouped, err := df.GroupBy("category")
sorted, err := df.Sort("price", fileframe.Descending)
joined, err := df.Join(other, opt)

// These never fail (return *DataFrame directly)
filtered := df.Filter(fn)
selected := df.Select("col1", "col2")
head := df.Head(10)

Common Operations

Filtering & Selection

// Filter rows
adults := df.Filter(func(row map[string]any) bool {
    age, ok := row["age"].(int64)
    return ok && age >= 18
})

// Select columns
subset := df.Select("name", "email", "phone")

// Drop columns
cleaned := df.Drop("internal_id", "debug_flag")

Adding & Modifying Columns

// Add new column
withTotal := df.Mutate("total", func(row map[string]any) any {
    qty, _ := row["quantity"].(int64)
    price, _ := row["price"].(float64)
    return float64(qty) * price
})

// Rename columns
renamed, err := df.Rename("old_name", "new_name")
renamed, err := df.RenameColumns(map[string]string{
    "col1": "column_one",
    "col2": "column_two",
})

Sorting & Deduplication

// Sort by single column
sorted, err := df.Sort("price", fileframe.Descending)

// Sort by multiple columns
sorted, err := df.SortBy(
    fileframe.SortOption{Column: "category", Order: fileframe.Ascending},
    fileframe.SortOption{Column: "price", Order: fileframe.Descending},
)

// Remove duplicates
unique := df.Distinct()
unique := df.DistinctBy("email") // by specific column

Row Selection

first10 := df.Head(10)
last5 := df.Tail(5)
limited := df.Limit(100) // alias for Head

Missing Values

// Remove rows with nil
cleaned := df.DropNA()
cleaned := df.DropNASubset("required_field")

// Fill nil values
filled := df.FillNA(0)
filled := df.FillNAByColumn(map[string]any{
    "name":   "Unknown",
    "age":    0,
    "active": false,
})

Joining DataFrames

users := fileframe.NewDataFrameFromRecords([]map[string]any{
    {"id": 1, "name": "Alice"},
    {"id": 2, "name": "Bob"},
})

orders := fileframe.NewDataFrameFromRecords([]map[string]any{
    {"user_id": 1, "product": "Laptop"},
    {"user_id": 1, "product": "Mouse"},
})

// Join types: InnerJoin, LeftJoin, RightJoin, OuterJoin
result, err := users.Join(orders, fileframe.JoinOption{
    On:  []string{"id", "user_id"}, // left column, right column
    How: fileframe.LeftJoin,
})

Concatenating DataFrames

// Same schema
combined, err := df1.Concat(df2, df3)

// Different schemas (union of columns, nil for missing)
combined, err := fileframe.ConcatAll(df1, df2, df3)

GroupBy & Aggregation

grouped, err := df.GroupBy("category")
if err != nil {
    log.Fatal(err)
}

// Built-in aggregations
counts := grouped.Count()              // *DataFrame
sums, err := grouped.Sum("amount")     // (*DataFrame, error)
means, err := grouped.Mean("price")    // (*DataFrame, error)
mins, err := grouped.Min("value")      // (*DataFrame, error)
maxs, err := grouped.Max("value")      // (*DataFrame, error)

// Custom aggregation
median, err := grouped.Agg("value", func(values []any) any {
    // Your aggregation logic here
    return computeMedian(values)
})

// Global aggregation (no grouping)
globalGrouped, _ := df.GroupBy()
totalSum, _ := globalGrouped.Sum("amount")

Complete Example

package main

import (
    "fmt"
    "log"

    "github.com/nao1215/fileframe"
)

func main() {
    // Load sales data
    df, err := fileframe.NewDataFrameFromPath("sales.csv")
    if err != nil {
        log.Fatal(err)
    }

    // Process data
    result := df.
        FillNAByColumn(map[string]any{"salesperson": "Unknown"}).
        Filter(func(row map[string]any) bool {
            amount, _ := row["amount"].(int64)
            return amount > 0
        }).
        Mutate("revenue", func(row map[string]any) any {
            qty, _ := row["quantity"].(int64)
            price, _ := row["price"].(int64)
            return qty * price
        })

    // Aggregate by region
    grouped, err := result.GroupBy("region")
    if err != nil {
        log.Fatal(err)
    }

    byRegion, err := grouped.Sum("revenue")
    if err != nil {
        log.Fatal(err)
    }

    // Sort and get top 3
    sorted, err := byRegion.Sort("sum_revenue", fileframe.Descending)
    if err != nil {
        log.Fatal(err)
    }
    top3 := sorted.Head(3)

    // Output results
    for _, row := range top3.ToRecords() {
        fmt.Printf("%s: %v\n", row["region"], row["sum_revenue"])
    }

    // Export
    if err := top3.ToCSV("top_regions.csv"); err != nil {
        log.Fatal(err)
    }
}

Supported File Formats

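| Format  | Read | Write | Compression           |
|---------|------|-------|-----------------------|
| CSV     | Yes  | Yes   | gzip, bzip2, xz, zstd |
| TSV     | Yes  | Yes   | gzip, bzip2, xz, zstd |
| LTSV    | Yes  | -     | gzip, bzip2, xz, zstd |
| Parquet | Yes  | -     | gzip, bzip2, xz, zstd |
| XLSX    | Yes  | -     | gzip, bzip2, xz, zstd |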

Performance

Benchmarks on AMD RYZEN AI MAX+ 395:

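| Operation     | 100 rows | 1,000 rows | 10,000 rows |
|---------------|----------|------------|-------------|
| CSV Parse     | 140 µs   | 1.4 ms     | 5.1 ms      |
| Filter        | 27 µs    | 304 µs     | 2.9 ms      |
| Select        | 13 µs    | 110 µs     | 1.5 ms      |
| Mutate        | 37 µs    | 332 µs     | 4.5 ms      |
| GroupBy + Sum | 7 µs     | 57 µs      | 635 µs      |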

Memory usage (10,000 rows):

  • CSV Parse: 8.4 MB
  • Filter: 5.2 MB
  • GroupBy + Sum: 408 KB

When to Use fileframe

Use fileframe when:

  • You are working with small to medium datasets (< 100,000 rows)
  • You need simple, readable data transformations
  • You want immutable operations for predictable code
  • You are working with multiple file formats

Consider alternatives when:

  • You are processing very large files (use filesql for streaming)
  • You need complex SQL-like queries (use filesql)
  • You require lazy evaluation

Related Projects

Contributing

Contributions are welcome! Please see the Contributing Guide for details.

Support

If you find this project useful:

License

MIT License - see LICENSE for details.

About

DataFrame API for CSV/TSV/LTSV, Parquet, Excel.

Topics

Resources

License

Code of conduct

Contributing

Security policy

Stars

Watchers

Forks

Sponsor this project

 

Packages

No packages published