Skip to content

Commit

Permalink
Audio: add support for timestamp_granularities (#374)
Browse files Browse the repository at this point in the history
  • Loading branch information
gehrisandro committed Apr 15, 2024
1 parent a4877bd commit 3d0ba7e
Show file tree
Hide file tree
Showing 8 changed files with 133 additions and 5 deletions.
7 changes: 7 additions & 0 deletions README.md
Expand Up @@ -391,6 +391,7 @@ $response = $client->audio()->transcribe([
'model' => 'whisper-1',
'file' => fopen('audio.mp3', 'r'),
'response_format' => 'verbose_json',
'timestamp_granularities' => ['segment', 'word']
]);

$response->task; // 'transcribe'
Expand All @@ -412,6 +413,12 @@ foreach ($response->segments as $segment) {
$segment->transient; // false
}

foreach ($response->words as $word) {
$word->word; // 'Hello'
$word->start; // 0.31
$word->end; // 0.92
}

$response->toArray(); // ['task' => 'transcribe', ...]
```

Expand Down
2 changes: 1 addition & 1 deletion src/Resources/Audio.php
Expand Up @@ -56,7 +56,7 @@ public function transcribe(array $parameters): TranscriptionResponse
{
$payload = Payload::upload('audio/transcriptions', $parameters);

/** @var Response<array{task: ?string, language: ?string, duration: ?float, segments: array<int, array{id: int, seek: int, start: float, end: float, text: string, tokens: array<int, int>, temperature: float, avg_logprob: float, compression_ratio: float, no_speech_prob: float, transient?: bool}>, text: string}> $response */
/** @var Response<array{task: ?string, language: ?string, duration: ?float, segments: array<int, array{id: int, seek: int, start: float, end: float, text: string, tokens: array<int, int>, temperature: float, avg_logprob: float, compression_ratio: float, no_speech_prob: float, transient?: bool}>, words: array<int, array{word: string, start: float, end: float}>, text: string}> $response */
$response = $this->transporter->requestObject($payload);

return TranscriptionResponse::from($response->data(), $response->meta());
Expand Down
17 changes: 14 additions & 3 deletions src/Responses/Audio/TranscriptionResponse.php
Expand Up @@ -12,12 +12,12 @@
use OpenAI\Testing\Responses\Concerns\Fakeable;

/**
* @implements ResponseContract<array{task: ?string, language: ?string, duration: ?float, segments: array<int, array{id: int, seek: int, start: float, end: float, text: string, tokens: array<int, int>, temperature: float, avg_logprob: float, compression_ratio: float, no_speech_prob: float, transient?: bool}>, text: string}>
* @implements ResponseContract<array{task: ?string, language: ?string, duration: ?float, segments: array<int, array{id: int, seek: int, start: float, end: float, text: string, tokens: array<int, int>, temperature: float, avg_logprob: float, compression_ratio: float, no_speech_prob: float, transient?: bool}>, words: array<int, array{word: string, start: float, end: float}>, text: string}>
*/
final class TranscriptionResponse implements ResponseContract, ResponseHasMetaInformationContract
{
/**
* @use ArrayAccessible<array{task: ?string, language: ?string, duration: ?float, segments: array<int, array{id: int, seek: int, start: float, end: float, text: string, tokens: array<int, int>, temperature: float, avg_logprob: float, compression_ratio: float, no_speech_prob: float, transient?: bool}>, text: string}>
* @use ArrayAccessible<array{task: ?string, language: ?string, duration: ?float, segments: array<int, array{id: int, seek: int, start: float, end: float, text: string, tokens: array<int, int>, temperature: float, avg_logprob: float, compression_ratio: float, no_speech_prob: float, transient?: bool}>, words: array<int, array{word: string, start: float, end: float}>, text: string}>
*/
use ArrayAccessible;

Expand All @@ -26,12 +26,14 @@ final class TranscriptionResponse implements ResponseContract, ResponseHasMetaIn

/**
* @param array<int, TranscriptionResponseSegment> $segments
* @param array<int, TranscriptionResponseWord> $words
*/
private function __construct(
public readonly ?string $task,
public readonly ?string $language,
public readonly ?float $duration,
public readonly array $segments,
public readonly array $words,
public readonly string $text,
private readonly MetaInformation $meta,
) {
Expand All @@ -40,7 +42,7 @@ private function __construct(
/**
* Acts as static factory, and returns a new Response instance.
*
* @param array{task: ?string, language: ?string, duration: ?float, segments: array<int, array{id: int, seek: int, start: float, end: float, text: string, tokens: array<int, int>, temperature: float, avg_logprob: float, compression_ratio: float, no_speech_prob: float, transient?: bool}>, text: string}|string $attributes
* @param array{task: ?string, language: ?string, duration: ?float, segments: array<int, array{id: int, seek: int, start: float, end: float, text: string, tokens: array<int, int>, temperature: float, avg_logprob: float, compression_ratio: float, no_speech_prob: float, transient?: bool}>, words: array<int, array{word: string, start: float, end: float}>, text: string}|string $attributes
*/
public static function from(array|string $attributes, MetaInformation $meta): self
{
Expand All @@ -52,11 +54,16 @@ public static function from(array|string $attributes, MetaInformation $meta): se
$result
), $attributes['segments']) : [];

$words = isset($attributes['words']) ? array_map(fn (array $result): TranscriptionResponseWord => TranscriptionResponseWord::from(
$result
), $attributes['words']) : [];

return new self(
$attributes['task'] ?? null,
$attributes['language'] ?? null,
$attributes['duration'] ?? null,
$segments,
$words,
$attributes['text'],
$meta,
);
Expand All @@ -75,6 +82,10 @@ public function toArray(): array
static fn (TranscriptionResponseSegment $result): array => $result->toArray(),
$this->segments,
),
'words' => array_map(
static fn (TranscriptionResponseWord $result): array => $result->toArray(),
$this->words,
),
'text' => $this->text,
];
}
Expand Down
52 changes: 52 additions & 0 deletions src/Responses/Audio/TranscriptionResponseWord.php
@@ -0,0 +1,52 @@
<?php

declare(strict_types=1);

namespace OpenAI\Responses\Audio;

use OpenAI\Contracts\ResponseContract;
use OpenAI\Responses\Concerns\ArrayAccessible;

/**
 * One word of a transcription, together with its start and end timestamps.
 *
 * @implements ResponseContract<array{word: string, start: float, end: float}>
 */
final class TranscriptionResponseWord implements ResponseContract
{
    /**
     * @use ArrayAccessible<array{word: string, start: float, end: float}>
     */
    use ArrayAccessible;

    /**
     * Private on purpose: instances are created through {@see self::from()}.
     */
    private function __construct(
        public readonly string $word,
        public readonly float $start,
        public readonly float $end,
    ) {}

    /**
     * Named constructor that builds a word from its attribute array.
     *
     * @param  array{word: string, start: float, end: float}  $attributes
     */
    public static function from(array $attributes): self
    {
        // Named arguments keep the key-to-property mapping explicit.
        return new self(
            word: $attributes['word'],
            start: $attributes['start'],
            end: $attributes['end'],
        );
    }

    /**
     * Returns the word in the same array shape that {@see self::from()} accepts.
     *
     * {@inheritDoc}
     */
    public function toArray(): array
    {
        return [
            'word' => $this->word,
            'start' => $this->start,
            'end' => $this->end,
        ];
    }
}
10 changes: 9 additions & 1 deletion src/ValueObjects/Transporter/Payload.php
Expand Up @@ -164,14 +164,22 @@ public function toRequest(BaseUri $baseUri, Headers $headers, QueryParams $query
if ($this->contentType === ContentType::MULTIPART) {
$streamBuilder = new MultipartStreamBuilder($psr17Factory);

/** @var array<string, StreamInterface|string|int|float|bool> $parameters */
/** @var array<string, StreamInterface|string|int|float|bool|array<int, string>> $parameters */
$parameters = $this->parameters;

foreach ($parameters as $key => $value) {
if (is_int($value) || is_float($value) || is_bool($value)) {
$value = (string) $value;
}

if (is_array($value)) {
foreach ($value as $nestedValue) {
$streamBuilder->addResource($key.'[]', $nestedValue);
}

continue;
}

$streamBuilder->addResource($key, $value);
}

Expand Down
22 changes: 22 additions & 0 deletions tests/Fixtures/Audio.php
Expand Up @@ -33,6 +33,28 @@ function audioTranscriptionVerboseJson(): array
'transient' => false,
],
],
'words' => [
[
'word' => 'Hello',
'start' => 0.31999999284744,
'end' => 0.9200000166893,
],
[
'word' => 'how',
'start' => 1.0,
'end' => 1.5599999427795,
],
[
'word' => 'are',
'start' => 1.5599999427795,
'end' => 1.8799999952316,
],
[
'word' => 'you',
'start' => 1.8799999952316,
'end' => 2.1600000858307,
],
],
'text' => 'Hello, how are you?',
];
}
Expand Down
8 changes: 8 additions & 0 deletions tests/Responses/Audio/TranscriptionResponse.php
Expand Up @@ -2,6 +2,7 @@

use OpenAI\Responses\Audio\TranscriptionResponse;
use OpenAI\Responses\Audio\TranscriptionResponseSegment;
use OpenAI\Responses\Audio\TranscriptionResponseWord;
use OpenAI\Responses\Meta\MetaInformation;

test('from json', function () {
Expand All @@ -13,6 +14,7 @@
->language->toBeNull()
->duration->toBeNull()
->segments->toBeEmpty()
->words->toBeEmpty()
->text->toBe('Hello, how are you?')
->meta()->toBeInstanceOf(MetaInformation::class);
});
Expand All @@ -28,6 +30,9 @@
->segments->toBeArray()
->segments->toHaveCount(1)
->segments->each->toBeInstanceOf(TranscriptionResponseSegment::class)
->words->toBeArray()
->words->toHaveCount(4)
->words->each->toBeInstanceOf(TranscriptionResponseWord::class)
->text->toBe('Hello, how are you?')
->meta()->toBeInstanceOf(MetaInformation::class);
});
Expand All @@ -41,6 +46,7 @@
->language->toBeNull()
->duration->toBeNull()
->segments->toBeEmpty()
->words->toBeEmpty()
->text->toBe('Hello, how are you?')
->meta()->toBeInstanceOf(MetaInformation::class);
});
Expand All @@ -54,6 +60,7 @@
->language->toBeNull()
->duration->toBeNull()
->segments->toBeEmpty()
->words->toBeEmpty()
->text->toBe(<<<'SRT'
1
00:00:00,000 --> 00:00:04,000
Expand All @@ -73,6 +80,7 @@
->language->toBeNull()
->duration->toBeNull()
->segments->toBeEmpty()
->words->toBeEmpty()
->text->toBe(<<<'VTT'
WEBVTT

Expand Down
20 changes: 20 additions & 0 deletions tests/Responses/Audio/TranscriptionResponseWord.php
@@ -0,0 +1,20 @@
<?php

use OpenAI\Responses\Audio\TranscriptionResponseWord;

test('from', function () {
    // First entry of the "words" list in the verbose-JSON transcription fixture.
    $attributes = audioTranscriptionVerboseJson()['words'][0];

    $word = TranscriptionResponseWord::from($attributes);

    // Every public property must mirror the fixture value exactly.
    expect($word)
        ->toBeInstanceOf(TranscriptionResponseWord::class)
        ->word->toBe('Hello')
        ->start->toBe(0.31999999284744)
        ->end->toBe(0.9200000166893);
});

test('to array', function () {
    // First entry of the "words" list in the verbose-JSON transcription fixture.
    $attributes = audioTranscriptionVerboseJson()['words'][0];

    $serialized = TranscriptionResponseWord::from($attributes)->toArray();

    // Round trip: from() followed by toArray() must reproduce the input array.
    expect($serialized)->toBe($attributes);
});

0 comments on commit 3d0ba7e

Please sign in to comment.