Skip to content

Commit

Permalink
feat: [whisper] Partial support for verbose_json format in transcribe…
Browse files Browse the repository at this point in the history
… endpoint (#721)
  • Loading branch information
ldotlopez committed Jul 4, 2023
1 parent f3063f9 commit a6839fd
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 12 deletions.
2 changes: 1 addition & 1 deletion api/openai.go
Original file line number Diff line number Diff line change
Expand Up @@ -737,7 +737,7 @@ func transcriptEndpoint(cm *ConfigMerger, o *Option) func(c *fiber.Ctx) error {

log.Debug().Msgf("Trascribed: %+v", tr)
// TODO: handle different outputs here
return c.Status(http.StatusOK).JSON(fiber.Map{"text": tr})
return c.Status(http.StatusOK).JSON(tr)
}
}

Expand Down
45 changes: 34 additions & 11 deletions pkg/whisper/whisper.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,25 @@ import (
"os"
"os/exec"
"path/filepath"
"time"

"github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
wav "github.com/go-audio/wav"
)

// Segment is a single transcribed segment of the audio, in the shape that is
// serialized to JSON for the caller.
//
// NOTE(review): Start and End are time.Duration values, which encoding/json
// writes as integer nanosecond counts. Confirm that this is the unit API
// clients expect for "start"/"end" (OpenAI-style verbose_json presumably uses
// seconds — verify).
type Segment struct {
// Id is the segment number; Transcript fills it from the whisper segment's Num.
Id int `json:"id"`
// Start is the segment start offset, copied from the whisper segment's Start.
Start time.Duration `json:"start"`
// End is the segment end offset, copied from the whisper segment's End.
End time.Duration `json:"end"`
// Text is the transcribed text of this segment.
Text string `json:"text"`
// Tokens holds the Id of every token in the whisper segment, in order.
// It stays nil when the segment has no tokens.
Tokens []int `json:"tokens"`
}

// Result is the complete output of a transcription: the individual segments
// plus the full text.
type Result struct {
// Segments lists the segments in the order Transcript read them from the
// whisper context. It stays nil when no segment was produced.
Segments []Segment `json:"segments"`
// Text is the concatenation of every segment's Text, with no separator added.
Text string `json:"text"`
}

func sh(c string) (string, error) {
cmd := exec.Command("/bin/sh", "-c", c)
cmd.Env = os.Environ()
Expand All @@ -28,40 +42,41 @@ func audioToWav(src, dst string) error {
return nil
}

func Transcript(model whisper.Model, audiopath, language string, threads uint) (string, error) {
func Transcript(model whisper.Model, audiopath, language string, threads uint) (Result, error) {
res := Result{}

dir, err := os.MkdirTemp("", "whisper")
if err != nil {
return "", err
return res, err
}
defer os.RemoveAll(dir)

convertedPath := filepath.Join(dir, "converted.wav")

if err := audioToWav(audiopath, convertedPath); err != nil {
return "", err
return res, err
}

// Open samples
fh, err := os.Open(convertedPath)
if err != nil {
return "", err
return res, err
}
defer fh.Close()

// Read samples
d := wav.NewDecoder(fh)
buf, err := d.FullPCMBuffer()
if err != nil {
return "", err
return res, err
}

data := buf.AsFloat32Buffer().Data

// Process samples
context, err := model.NewContext()
if err != nil {
return "", err
return res, err

}

Expand All @@ -74,17 +89,25 @@ func Transcript(model whisper.Model, audiopath, language string, threads uint) (
}

if err := context.Process(data, nil, nil); err != nil {
return "", err
return res, err
}

text := ""
for {
segment, err := context.NextSegment()
s, err := context.NextSegment()
if err != nil {
break
}
text += segment.Text

var tokens []int
for _, t := range(s.Tokens) {
tokens = append(tokens, t.Id)
}

segment := Segment{Id: s.Num, Text: s.Text, Start:s.Start, End: s.End, Tokens: tokens}
res.Segments = append(res.Segments, segment)

res.Text += s.Text
}

return text, nil
return res, nil
}

0 comments on commit a6839fd

Please sign in to comment.