Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Audio: add support for timestamp_granularities #374

Merged
merged 2 commits into from
Apr 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
7 changes: 7 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -391,6 +391,7 @@ $response = $client->audio()->transcribe([
'model' => 'whisper-1',
'file' => fopen('audio.mp3', 'r'),
'response_format' => 'verbose_json',
'timestamp_granularities' => ['segment', 'word']
]);

$response->task; // 'transcribe'
Expand All @@ -412,6 +413,12 @@ foreach ($response->segments as $segment) {
$segment->transient; // false
}

foreach ($response->words as $word) {
$word->word; // 'Hello'
$word->start; // 0.31
$word->end; // 0.92
}

$response->toArray(); // ['task' => 'transcribe', ...]
```

Expand Down
2 changes: 1 addition & 1 deletion src/Resources/Audio.php
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ public function transcribe(array $parameters): TranscriptionResponse
{
$payload = Payload::upload('audio/transcriptions', $parameters);

/** @var Response<array{task: ?string, language: ?string, duration: ?float, segments: array<int, array{id: int, seek: int, start: float, end: float, text: string, tokens: array<int, int>, temperature: float, avg_logprob: float, compression_ratio: float, no_speech_prob: float, transient?: bool}>, text: string}> $response */
/** @var Response<array{task: ?string, language: ?string, duration: ?float, segments: array<int, array{id: int, seek: int, start: float, end: float, text: string, tokens: array<int, int>, temperature: float, avg_logprob: float, compression_ratio: float, no_speech_prob: float, transient?: bool}>, words: array<int, array{word: string, start: float, end: float}>, text: string}> $response */
$response = $this->transporter->requestObject($payload);

return TranscriptionResponse::from($response->data(), $response->meta());
Expand Down
17 changes: 14 additions & 3 deletions src/Responses/Audio/TranscriptionResponse.php
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,12 @@
use OpenAI\Testing\Responses\Concerns\Fakeable;

/**
* @implements ResponseContract<array{task: ?string, language: ?string, duration: ?float, segments: array<int, array{id: int, seek: int, start: float, end: float, text: string, tokens: array<int, int>, temperature: float, avg_logprob: float, compression_ratio: float, no_speech_prob: float, transient?: bool}>, text: string}>
* @implements ResponseContract<array{task: ?string, language: ?string, duration: ?float, segments: array<int, array{id: int, seek: int, start: float, end: float, text: string, tokens: array<int, int>, temperature: float, avg_logprob: float, compression_ratio: float, no_speech_prob: float, transient?: bool}>, words: array<int, array{word: string, start: float, end: float}>, text: string}>
*/
final class TranscriptionResponse implements ResponseContract, ResponseHasMetaInformationContract
{
/**
* @use ArrayAccessible<array{task: ?string, language: ?string, duration: ?float, segments: array<int, array{id: int, seek: int, start: float, end: float, text: string, tokens: array<int, int>, temperature: float, avg_logprob: float, compression_ratio: float, no_speech_prob: float, transient?: bool}>, text: string}>
* @use ArrayAccessible<array{task: ?string, language: ?string, duration: ?float, segments: array<int, array{id: int, seek: int, start: float, end: float, text: string, tokens: array<int, int>, temperature: float, avg_logprob: float, compression_ratio: float, no_speech_prob: float, transient?: bool}>, words: array<int, array{word: string, start: float, end: float}>, text: string}>
*/
use ArrayAccessible;

Expand All @@ -26,12 +26,14 @@ final class TranscriptionResponse implements ResponseContract, ResponseHasMetaIn

/**
* @param array<int, TranscriptionResponseSegment> $segments
* @param array<int, TranscriptionResponseWord> $words
*/
private function __construct(
public readonly ?string $task,
public readonly ?string $language,
public readonly ?float $duration,
public readonly array $segments,
public readonly array $words,
public readonly string $text,
private readonly MetaInformation $meta,
) {
Expand All @@ -40,7 +42,7 @@ private function __construct(
/**
* Acts as static factory, and returns a new Response instance.
*
* @param array{task: ?string, language: ?string, duration: ?float, segments: array<int, array{id: int, seek: int, start: float, end: float, text: string, tokens: array<int, int>, temperature: float, avg_logprob: float, compression_ratio: float, no_speech_prob: float, transient?: bool}>, text: string}|string $attributes
* @param array{task: ?string, language: ?string, duration: ?float, segments: array<int, array{id: int, seek: int, start: float, end: float, text: string, tokens: array<int, int>, temperature: float, avg_logprob: float, compression_ratio: float, no_speech_prob: float, transient?: bool}>, words: array<int, array{word: string, start: float, end: float}>, text: string}|string $attributes
*/
public static function from(array|string $attributes, MetaInformation $meta): self
{
Expand All @@ -52,11 +54,16 @@ public static function from(array|string $attributes, MetaInformation $meta): se
$result
), $attributes['segments']) : [];

$words = isset($attributes['words']) ? array_map(fn (array $result): TranscriptionResponseWord => TranscriptionResponseWord::from(
$result
), $attributes['words']) : [];

return new self(
$attributes['task'] ?? null,
$attributes['language'] ?? null,
$attributes['duration'] ?? null,
$segments,
$words,
$attributes['text'],
$meta,
);
Expand All @@ -75,6 +82,10 @@ public function toArray(): array
static fn (TranscriptionResponseSegment $result): array => $result->toArray(),
$this->segments,
),
'words' => array_map(
static fn (TranscriptionResponseWord $result): array => $result->toArray(),
$this->words,
),
'text' => $this->text,
];
}
Expand Down
52 changes: 52 additions & 0 deletions src/Responses/Audio/TranscriptionResponseWord.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
<?php

declare(strict_types=1);

namespace OpenAI\Responses\Audio;

use OpenAI\Contracts\ResponseContract;
use OpenAI\Responses\Concerns\ArrayAccessible;

/**
 * Immutable value object holding one entry of a transcription's `words` list:
 * the word text together with its `start` and `end` values.
 *
 * @implements ResponseContract<array{word: string, start: float, end: float}>
 */
final class TranscriptionResponseWord implements ResponseContract
{
    /**
     * @use ArrayAccessible<array{word: string, start: float, end: float}>
     */
    use ArrayAccessible;

    /**
     * Private on purpose: instances are created through {@see self::from()}.
     */
    private function __construct(
        public readonly string $word,
        public readonly float $start,
        public readonly float $end,
    ) {
    }

    /**
     * Named constructor that builds an instance from its array representation.
     *
     * @param  array{word: string, start: float, end: float}  $attributes
     */
    public static function from(array $attributes): self
    {
        // Pull the three expected keys out of the payload in constructor order.
        ['word' => $text, 'start' => $startsAt, 'end' => $endsAt] = $attributes;

        return new self(
            word: $text,
            start: $startsAt,
            end: $endsAt,
        );
    }

    /**
     * {@inheritDoc}
     */
    public function toArray(): array
    {
        $serialized = [];

        // Key order mirrors the shape accepted by from().
        $serialized['word'] = $this->word;
        $serialized['start'] = $this->start;
        $serialized['end'] = $this->end;

        return $serialized;
    }
}
10 changes: 9 additions & 1 deletion src/ValueObjects/Transporter/Payload.php
Original file line number Diff line number Diff line change
Expand Up @@ -164,14 +164,22 @@ public function toRequest(BaseUri $baseUri, Headers $headers, QueryParams $query
if ($this->contentType === ContentType::MULTIPART) {
$streamBuilder = new MultipartStreamBuilder($psr17Factory);

/** @var array<string, StreamInterface|string|int|float|bool> $parameters */
/** @var array<string, StreamInterface|string|int|float|bool|array<int, string>> $parameters */
$parameters = $this->parameters;

foreach ($parameters as $key => $value) {
if (is_int($value) || is_float($value) || is_bool($value)) {
$value = (string) $value;
}

if (is_array($value)) {
foreach ($value as $nestedValue) {
$streamBuilder->addResource($key.'[]', $nestedValue);
}

continue;
}

$streamBuilder->addResource($key, $value);
}

Expand Down
22 changes: 22 additions & 0 deletions tests/Fixtures/Audio.php
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,28 @@ function audioTranscriptionVerboseJson(): array
'transient' => false,
],
],
'words' => [
[
'word' => 'Hello',
'start' => 0.31999999284744,
'end' => 0.9200000166893,
],
[
'word' => 'how',
'start' => 1.0,
'end' => 1.5599999427795,
],
[
'word' => 'are',
'start' => 1.5599999427795,
'end' => 1.8799999952316,
],
[
'word' => 'you',
'start' => 1.8799999952316,
'end' => 2.1600000858307,
],
],
'text' => 'Hello, how are you?',
];
}
Expand Down
8 changes: 8 additions & 0 deletions tests/Responses/Audio/TranscriptionResponse.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

use OpenAI\Responses\Audio\TranscriptionResponse;
use OpenAI\Responses\Audio\TranscriptionResponseSegment;
use OpenAI\Responses\Audio\TranscriptionResponseWord;
use OpenAI\Responses\Meta\MetaInformation;

test('from json', function () {
Expand All @@ -13,6 +14,7 @@
->language->toBeNull()
->duration->toBeNull()
->segments->toBeEmpty()
->words->toBeEmpty()
->text->toBe('Hello, how are you?')
->meta()->toBeInstanceOf(MetaInformation::class);
});
Expand All @@ -28,6 +30,9 @@
->segments->toBeArray()
->segments->toHaveCount(1)
->segments->each->toBeInstanceOf(TranscriptionResponseSegment::class)
->words->toBeArray()
->words->toHaveCount(4)
->words->each->toBeInstanceOf(TranscriptionResponseWord::class)
->text->toBe('Hello, how are you?')
->meta()->toBeInstanceOf(MetaInformation::class);
});
Expand All @@ -41,6 +46,7 @@
->language->toBeNull()
->duration->toBeNull()
->segments->toBeEmpty()
->words->toBeEmpty()
->text->toBe('Hello, how are you?')
->meta()->toBeInstanceOf(MetaInformation::class);
});
Expand All @@ -54,6 +60,7 @@
->language->toBeNull()
->duration->toBeNull()
->segments->toBeEmpty()
->words->toBeEmpty()
->text->toBe(<<<'SRT'
1
00:00:00,000 --> 00:00:04,000
Expand All @@ -73,6 +80,7 @@
->language->toBeNull()
->duration->toBeNull()
->segments->toBeEmpty()
->words->toBeEmpty()
->text->toBe(<<<'VTT'
WEBVTT

Expand Down
20 changes: 20 additions & 0 deletions tests/Responses/Audio/TranscriptionResponseWord.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
<?php

use OpenAI\Responses\Audio\TranscriptionResponseWord;

// Verifies that from() hydrates every public property from the first fixture word.
test('from', function () {
    $attributes = audioTranscriptionVerboseJson()['words'][0];

    $word = TranscriptionResponseWord::from($attributes);

    expect($word)
        ->toBeInstanceOf(TranscriptionResponseWord::class)
        ->word->toBe('Hello')
        ->start->toBe(0.31999999284744)
        ->end->toBe(0.9200000166893);
});

// Verifies that toArray() round-trips the exact array the instance was built from.
test('to array', function () {
    $attributes = audioTranscriptionVerboseJson()['words'][0];

    $serialized = TranscriptionResponseWord::from($attributes)->toArray();

    expect($serialized)->toBe($attributes);
});