# `go-huggingface` Demo

This demo shows how to download files and create tokenizers from HuggingFace models.

## Imports and `go work` setup

In [2]:
%version
!*rm -f go.work && go work init
!*go work use . "${HOME}/Projects/gomlx" "${HOME}/Projects/go-huggingface" "${HOME}/Projects/gemma" "${HOME}/Projects/onnx-gomlx"
%goworkfix

**GoNB** version [v0.10.6](https://github.com/janpfeifer/gonb/releases/tag/v0.10.6) / Commit: [0e5f587a077810d058202b76a127651a02bd4382](https://github.com/janpfeifer/gonb/tree/0e5f587a077810d058202b76a127651a02bd4382)


	- Added replace rule for module "github.com/gomlx/gomlx" to local directory "/home/janpf/Projects/gomlx".
	- Added replace rule for module "github.com/gomlx/onnx-gomlx" to local directory "/home/janpf/Projects/onnx-gomlx".
	- Added replace rule for module "github.com/gomlx/gemma" to local directory "/home/janpf/Projects/gemma".
	- Added replace rule for module "github.com/gomlx/go-huggingface" to local directory "/home/janpf/Projects/go-huggingface".


In [3]:
import (
    "github.com/janpfeifer/must"
    "github.com/gomlx/go-huggingface/hub"
    "github.com/gomlx/go-huggingface/tokenizers"
)

## Download `tokenizer_config.json` and enumerate `tokenizer_class` for several models

In [6]:
// hfAuthToken is the HuggingFace authentication token, taken from the HF_TOKEN
// environment variable. Some files may require it for downloading.
var hfAuthToken = os.Getenv("HF_TOKEN")

// hfModelIDs holds the HuggingFace model ids used for testing.
var hfModelIDs = []string{
    "google/gemma-2-2b-it",
    "sentence-transformers/all-MiniLM-L6-v2",
    "protectai/deberta-v3-base-zeroshot-v1-onnx",
    "KnightsAnalytics/distilbert-base-uncased-finetuned-sst-2-english",
    "KnightsAnalytics/distilbert-NER",
    "KnightsAnalytics/all-MiniLM-L6-v2",
    "SamLowe/roberta-base-go_emotions-onnx",
}

In [4]:
%%
// For every model id, print a header line followed by the names yielded by the
// repository's file-name iterator. Any iteration error aborts the cell.
for _, id := range hfModelIDs {
    fmt.Printf("\n%s:\n", id)
    modelRepo := hub.New(id).WithAuth(hfAuthToken)
    for name, iterErr := range modelRepo.IterFileNames() {
        if iterErr != nil {
            panic(iterErr)
        }
        fmt.Printf("\t%s\n", name)
    }
}


google/gemma-2-2b-it:
	.gitattributes
	README.md
	config.json
	generation_config.json
	model-00001-of-00002.safetensors
	model-00002-of-00002.safetensors
	model.safetensors.index.json
	special_tokens_map.json
	tokenizer.json
	tokenizer.model
	tokenizer_config.json

sentence-transformers/all-MiniLM-L6-v2:
	.gitattributes
	1_Pooling/config.json
	README.md
	config.json
	config_sentence_transformers.json
	data_config.json
	model.safetensors
	modules.json
	onnx/model.onnx
	onnx/model_O1.onnx
	onnx/model_O2.onnx
	onnx/model_O3.onnx
	onnx/model_O4.onnx
	onnx/model_qint8_arm64.onnx
	onnx/model_qint8_avx512.onnx
	onnx/model_qint8_avx512_vnni.onnx
	onnx/model_quint8_avx2.onnx
	openvino/openvino_model.bin
	openvino/openvino_model.xml
	openvino/openvino_model_qint8_quantized.bin
	openvino/openvino_model_qint8_quantized.xml
	pytorch_model.bin
	rust_model.ot
	sentence_bert_config.json
	special_tokens_map.json
	tf_model.h5
	tokenizer.json
	tokenizer_config.json
	train_script.py
	vocab.txt

protectai

In [5]:
%%
// For every model id, print the TokenizerClass field of the configuration
// returned by tokenizers.GetConfig for that model's repository.
for idx := range hfModelIDs {
    id := hfModelIDs[idx]
    fmt.Printf("\n%s:\n", id)
    cfg := must.M1(tokenizers.GetConfig(hub.New(id).WithAuth(hfAuthToken)))
    fmt.Printf("\ttokenizer_class=%s\n", cfg.TokenizerClass)
}


google/gemma-2-2b-it:
	tokenizer_class=GemmaTokenizer

sentence-transformers/all-MiniLM-L6-v2:
	tokenizer_class=BertTokenizer

protectai/deberta-v3-base-zeroshot-v1-onnx:
	tokenizer_class=DebertaV2Tokenizer

KnightsAnalytics/distilbert-base-uncased-finetuned-sst-2-english:
	tokenizer_class=DistilBertTokenizer

KnightsAnalytics/distilbert-NER:
	tokenizer_class=DistilBertTokenizer

KnightsAnalytics/all-MiniLM-L6-v2:
	tokenizer_class=BertTokenizer

SamLowe/roberta-base-go_emotions-onnx:
	tokenizer_class=RobertaTokenizer


## Create a Tokenizer

In [6]:
// sentence is the sample text encoded by the tokenizer in this cell.
var sentence = "The book is on the table."

%%
// Create the tokenizer from the "google/gemma-2-2b-it" repository, encode the
// sample sentence, then print both the sentence and the encoded result.
gemmaRepo := hub.New("google/gemma-2-2b-it").WithAuth(hfAuthToken)
tok := must.M1(tokenizers.New(gemmaRepo))
ids := tok.Encode(sentence)
fmt.Printf("Sentence:\t%s\n", sentence)
fmt.Printf("Tokens:  \t%v\n", ids)


Sentence:	The book is on the table.
Tokens:  	[651 2870 603 611 573 3037 235265]


## Convert ONNX model

In [21]:
import (
    "github.com/gomlx/onnx-gomlx/onnx"
    "github.com/gomlx/gomlx/graph"
    "github.com/gomlx/gomlx/ml/context"
    "github.com/gomlx/gomlx/backends"
    _ "github.com/gomlx/gomlx/backends/xla"
)

%%
// Download the ONNX file from the repository and read it.
repo := hub.New("sentence-transformers/all-MiniLM-L6-v2").WithAuth(hfAuthToken)
modelPath, err := repo.DownloadFile("onnx/model.onnx")
if err != nil {
    panic(err)
}
model, err := onnx.ReadFile(modelPath)
if err != nil {
    panic(err)
}

// Copy the ONNX variables into a new GoMLX context (which stores variables).
ctx := context.New()
if err := model.VariablesToContext(ctx); err != nil {
    panic(err)
}

// Example sentences, printed below next to the embeddings.
sentences := []string{
    "This is an example sentence",
    "Each sentence is converted",
}

// Hard-coded model inputs, one row per sentence.
inputIDs := [][]int64{
    {101, 2023, 2003, 2019, 2742, 6251, 102},
    {101, 2169, 6251, 2003, 4991, 102, 0},
}
tokenTypeIDs := [][]int64{
    {0, 0, 0, 0, 0, 0, 0},
    {0, 0, 0, 0, 0, 0, 0},
}
attentionMask := [][]int64{
    {1, 1, 1, 1, 1, 1, 1},
    {1, 1, 1, 1, 1, 1, 0},
}

// buildGraph binds the positional inputs to the model's named inputs and
// returns the first output of the model. The positions match the order in
// which the values are passed to ExecOnce below.
buildGraph := func(ctx *context.Context, inputs []*graph.Node) *graph.Node {
    g := inputs[0].Graph()
    namedInputs := map[string]*graph.Node{
        "input_ids":      inputs[0],
        "attention_mask": inputs[1],
        "token_type_ids": inputs[2],
    }
    return model.CallGraph(ctx, g, namedInputs)[0]
}
embeddings := context.ExecOnce(backends.New(), ctx, buildGraph, inputIDs, attentionMask, tokenTypeIDs)

fmt.Printf("Sentences: \t%q\n", sentences)
fmt.Printf("Embeddings:\t%s\n", embeddings)


Sentences: 	["This is an example sentence" "Each sentence is converted"]
Embeddings:	[2][7][384]float32{
 {{0.0366, -0.0162, 0.1682, ..., 0.0554, -0.1644, -0.2967},
  {0.7239, 0.6399, 0.1888, ..., 0.5946, 0.6206, 0.4897},
  {0.0064, 0.0203, 0.0448, ..., 0.3464, 1.3170, -0.1670},
  ...,
  {0.1479, -0.0643, 0.1457, ..., 0.8837, -0.3316, 0.2975},
  {0.5212, 0.6563, 0.5607, ..., -0.0399, 0.0412, -1.4036},
  {1.0824, 0.7140, 0.3986, ..., -0.2301, 0.3243, -1.0313}},
 {{0.2802, 0.1165, -0.0418, ..., 0.2711, -0.1685, -0.2961},
  {0.8729, 0.4545, -0.1091, ..., 0.1365, 0.4580, -0.2042},
  {0.4752, 0.5731, 0.6304, ..., 0.6526, 0.5612, -1.3268},
  ...,
  {0.6113, 0.7920, -0.4685, ..., 0.0854, 1.0592, -0.2983},
  {0.4115, 1.0946, 0.2385, ..., 0.8984, 0.3684, -0.7333},
  {0.1374, 0.5555, 0.2678, ..., 0.5426, 0.4665, -0.5284}}}


## Download Dataset Files

We are going to use the [HuggingFaceFW/fineweb](https://huggingface.co/datasets/HuggingFaceFW/fineweb) dataset as an example: we download one of its sample files (~2.5 GB of data) and parse the `.parquet` file.

### Structure of file
First we define the structure of each entry, with the tags for the Parquet parser:

In [7]:
// FineWebID is the HuggingFace repository id of the FineWeb dataset.
var FineWebID = "HuggingFaceFW/fineweb"

// FineWebSampleFile is the name of the sample Parquet file downloaded from the repository.
var FineWebSampleFile = "sample/10BT/000_00000.parquet"

// FineWebEntry is one entry (row) of a FineWeb Parquet file.
//
// The fields in the Parquet file were inspected with the tool in
// github.com/xitongsys/parquet-go/tool/parquet-tools.
//
// Each struct tag starts with the name of the Parquet column the field maps to.
// The parquet annotations (including the `snappy` option) are described in:
// https://pkg.go.dev/github.com/parquet-go/parquet-go#SchemaOf
type FineWebEntry struct {
    // Text maps to the "text" column.
    Text string `parquet:"text,snappy"`
    // ID maps to the "id" column.
    ID string `parquet:"id,snappy"`
    // Dump maps to the "dump" column.
    Dump string `parquet:"dump,snappy"`
    // URL maps to the "url" column.
    URL string `parquet:"url,snappy"`
    // Score maps to the "language_score" column.
    Score float64 `parquet:"language_score"`
}

// TrimString returns s trimmed to at most maxLength runes.
//
// If s already has maxLength runes or fewer it is returned unchanged. Otherwise
// the first maxLength-1 runes are kept and "…" is appended, so the result has
// exactly maxLength runes. A maxLength of zero or less yields the empty string
// (no room even for the ellipsis).
func TrimString(s string, maxLength int) string {
    // Guard: without it, runes[:maxLength-1] below would slice with a negative
    // bound and panic.
    if maxLength <= 0 {
        return ""
    }
    if utf8.RuneCountInString(s) <= maxLength {
        return s
    }
    runes := []rune(s)
    return string(runes[:maxLength-1]) + "…"
}

### Read the Parquet

Using the library [github.com/parquet-go/parquet-go](https://github.com/parquet-go/parquet-go).

In [14]:
import (
    parquet "github.com/parquet-go/parquet-go"
)

%%
// Download repo file.
repo := hub.New(FineWebID).WithType(hub.RepoTypeDataset).WithAuth(hfAuthToken)
localSampleFile := must.M1(repo.DownloadFile(FineWebSampleFile))

// Parquet reading using parquet-go: it's somewhat cumbersome (to open the file it needs its size!?), but it works.
schema := parquet.SchemaOf(&FineWebEntry{})
fSize := must.M1(os.Stat(localSampleFile)).Size()
fReader := must.M1(os.Open(localSampleFile))
// Release the file handle when the cell finishes. Deferred calls run in reverse
// order, so the parquet reader (deferred below) is closed before the file.
defer fReader.Close()
fParquet := must.M1(parquet.OpenFile(fReader, fSize))
reader := parquet.NewGenericReader[FineWebEntry](fParquet, schema)
defer reader.Close()

// Print the first rows (at most 10). Read returns how many entries of rows it
// filled, so only rows[:n] is printed: anything past n is still a zero value.
// NOTE(review): must.M1 panics on any error returned by Read; if fewer than 10
// rows are available the reader presumably returns io.EOF together with n — confirm.
rows := make([]FineWebEntry, 10)
n := must.M1(reader.Read(rows))
fmt.Printf("%d rows read\n", n)
for ii, row := range rows[:n] {
    fmt.Printf("Row %0d:\tScore=%.3f Text=[%q], URL=[%s]\n", ii, row.Score, TrimString(row.Text, 50), TrimString(row.URL, 40))
}


10 rows read
Row 0:	Score=0.823 Text=["|Viewing Single Post From: Spoilers for the Week …"], URL=[http://daytimeroyaltyonline.com/single/…]
Row 1:	Score=0.974 Text=["*sigh* Fundamentalist community, let me pass on s…"], URL=[http://endogenousretrovirus.blogspot.co…]
Row 2:	Score=0.873 Text=["A novel two-step immunotherapy approach has shown…"], URL=[http://news.cancerconnect.com/]
Row 3:	Score=0.932 Text=["Free the Cans! Working Together to Reduce Waste\nI…"], URL=[http://sharingsolution.com/2009/05/23/f…]
Row 4:	Score=0.955 Text=["ORLANDO, Fla. — While the Rapid Recall Exchange, …"], URL=[http://supermarketnews.com/food-safety/…]
Row 5:	Score=0.954 Text=["September 28, 2010\n2010 Season - Bowman pulls dow…"], URL=[http://www.augustana.edu/x22236.xml]
Row 6:	Score=0.967 Text=["Kraft Foods has taken the Cadbury chocolate brand…"], URL=[http://www.fdin.org.uk/2012/01/kraft-la…]
Row 7:	Score=0.874 Text=["You must be a registered member to view this page…"], URL=[http://www.golivewire.com/