Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: [whisper] Partial support for verbose_json format in transcribe endpoint #721

Merged
merged 1 commit into from
Jul 4, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 1 addition & 1 deletion api/openai.go
Expand Up @@ -737,7 +737,7 @@ func transcriptEndpoint(cm *ConfigMerger, o *Option) func(c *fiber.Ctx) error {

log.Debug().Msgf("Trascribed: %+v", tr)
// TODO: handle different outputs here
return c.Status(http.StatusOK).JSON(fiber.Map{"text": tr})
return c.Status(http.StatusOK).JSON(tr)
}
}

Expand Down
45 changes: 34 additions & 11 deletions pkg/whisper/whisper.go
Expand Up @@ -5,11 +5,25 @@ import (
"os"
"os/exec"
"path/filepath"
"time"

"github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
wav "github.com/go-audio/wav"
)

// Segment describes one timed piece of a transcription: its number, the span
// of audio it covers, the recognised text and the token ids behind that text.
//
// NOTE(review): Start and End are time.Duration values; with the standard
// encoding/json encoder they are written as integer nanoseconds. Confirm that
// this is the unit API clients expect.
type Segment struct {
	// Id is the segment number.
	Id int `json:"id"`
	// Start is the offset at which the segment begins.
	Start time.Duration `json:"start"`
	// End is the offset at which the segment ends.
	End time.Duration `json:"end"`
	// Text is the text recognised for this segment.
	Text string `json:"text"`
	// Tokens holds the ids of the tokens that make up the segment.
	Tokens []int `json:"tokens"`
}

// Result is the value returned by Transcript: every segment that was
// produced, plus the text of all segments joined together.
type Result struct {
	// Segments lists the individual segments in the order they were produced.
	Segments []Segment `json:"segments"`
	// Text is the concatenation of the text of every segment.
	Text string `json:"text"`
}

func sh(c string) (string, error) {
cmd := exec.Command("/bin/sh", "-c", c)
cmd.Env = os.Environ()
Expand All @@ -28,40 +42,41 @@ func audioToWav(src, dst string) error {
return nil
}

func Transcript(model whisper.Model, audiopath, language string, threads uint) (string, error) {
func Transcript(model whisper.Model, audiopath, language string, threads uint) (Result, error) {
res := Result{}

dir, err := os.MkdirTemp("", "whisper")
if err != nil {
return "", err
return res, err
}
defer os.RemoveAll(dir)

convertedPath := filepath.Join(dir, "converted.wav")

if err := audioToWav(audiopath, convertedPath); err != nil {
return "", err
return res, err
}

// Open samples
fh, err := os.Open(convertedPath)
if err != nil {
return "", err
return res, err
}
defer fh.Close()

// Read samples
d := wav.NewDecoder(fh)
buf, err := d.FullPCMBuffer()
if err != nil {
return "", err
return res, err
}

data := buf.AsFloat32Buffer().Data

// Process samples
context, err := model.NewContext()
if err != nil {
return "", err
return res, err

}

Expand All @@ -74,17 +89,25 @@ func Transcript(model whisper.Model, audiopath, language string, threads uint) (
}

if err := context.Process(data, nil, nil); err != nil {
return "", err
return res, err
}

text := ""
for {
segment, err := context.NextSegment()
s, err := context.NextSegment()
if err != nil {
break
}
text += segment.Text

var tokens []int
for _, t := range(s.Tokens) {
tokens = append(tokens, t.Id)
}

segment := Segment{Id: s.Num, Text: s.Text, Start:s.Start, End: s.End, Tokens: tokens}
res.Segments = append(res.Segments, segment)

res.Text += s.Text
}

return text, nil
return res, nil
}