Merged
2 changes: 1 addition & 1 deletion .release-please-manifest.json
@@ -1,3 +1,3 @@
{
".": "4.88.0"
".": "4.89.0"
}
4 changes: 2 additions & 2 deletions .stats.yml
@@ -1,2 +1,2 @@
configured_endpoints: 81
openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-b26121d5df6eb5d3032a45a267473798b15fcfec76dd44a3256cf1238be05fa4.yml
configured_endpoints: 82
openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-c22f59c66aec7914b6ee653d3098d1c1c8c16c180d2a158e819c8ddbf476f74b.yml
14 changes: 14 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,19 @@
# Changelog

## 4.89.0 (2025-03-20)

Full Changelog: [v4.88.0...v4.89.0](https://github.com/openai/openai-node/compare/v4.88.0...v4.89.0)

### Features

* add audio helpers ([ea1b6b4](https://github.com/openai/openai-node/commit/ea1b6b4ef38813af568b3662037519da9404b80e))
* **api:** new models for TTS, STT, + new audio features for Realtime ([#1407](https://github.com/openai/openai-node/issues/1407)) ([142933a](https://github.com/openai/openai-node/commit/142933ae70d06045dbf4661cd72c7fa35ae7903d))


### Chores

* **internal:** version bump ([#1400](https://github.com/openai/openai-node/issues/1400)) ([6838ab4](https://github.com/openai/openai-node/commit/6838ab4268c7c0e083e7be21ef1a51bdea0f0b57))

## 4.88.0 (2025-03-19)

Full Changelog: [v4.87.4...v4.88.0](https://github.com/openai/openai-node/compare/v4.87.4...v4.88.0)
18 changes: 18 additions & 0 deletions api.md
@@ -142,7 +142,11 @@ Types:
Types:

- <code><a href="./src/resources/audio/transcriptions.ts">Transcription</a></code>
- <code><a href="./src/resources/audio/transcriptions.ts">TranscriptionInclude</a></code>
- <code><a href="./src/resources/audio/transcriptions.ts">TranscriptionSegment</a></code>
- <code><a href="./src/resources/audio/transcriptions.ts">TranscriptionStreamEvent</a></code>
- <code><a href="./src/resources/audio/transcriptions.ts">TranscriptionTextDeltaEvent</a></code>
- <code><a href="./src/resources/audio/transcriptions.ts">TranscriptionTextDoneEvent</a></code>
- <code><a href="./src/resources/audio/transcriptions.ts">TranscriptionVerbose</a></code>
- <code><a href="./src/resources/audio/transcriptions.ts">TranscriptionWord</a></code>
- <code><a href="./src/resources/audio/transcriptions.ts">TranscriptionCreateResponse</a></code>
@@ -306,7 +310,9 @@ Types:
- <code><a href="./src/resources/beta/realtime/realtime.ts">ConversationItemDeleteEvent</a></code>
- <code><a href="./src/resources/beta/realtime/realtime.ts">ConversationItemDeletedEvent</a></code>
- <code><a href="./src/resources/beta/realtime/realtime.ts">ConversationItemInputAudioTranscriptionCompletedEvent</a></code>
- <code><a href="./src/resources/beta/realtime/realtime.ts">ConversationItemInputAudioTranscriptionDeltaEvent</a></code>
- <code><a href="./src/resources/beta/realtime/realtime.ts">ConversationItemInputAudioTranscriptionFailedEvent</a></code>
- <code><a href="./src/resources/beta/realtime/realtime.ts">ConversationItemRetrieveEvent</a></code>
- <code><a href="./src/resources/beta/realtime/realtime.ts">ConversationItemTruncateEvent</a></code>
- <code><a href="./src/resources/beta/realtime/realtime.ts">ConversationItemTruncatedEvent</a></code>
- <code><a href="./src/resources/beta/realtime/realtime.ts">ConversationItemWithReference</a></code>
@@ -343,6 +349,8 @@ Types:
- <code><a href="./src/resources/beta/realtime/realtime.ts">SessionCreatedEvent</a></code>
- <code><a href="./src/resources/beta/realtime/realtime.ts">SessionUpdateEvent</a></code>
- <code><a href="./src/resources/beta/realtime/realtime.ts">SessionUpdatedEvent</a></code>
- <code><a href="./src/resources/beta/realtime/realtime.ts">TranscriptionSessionUpdate</a></code>
- <code><a href="./src/resources/beta/realtime/realtime.ts">TranscriptionSessionUpdatedEvent</a></code>

### Sessions

@@ -355,6 +363,16 @@ Methods:

- <code title="post /realtime/sessions">client.beta.realtime.sessions.<a href="./src/resources/beta/realtime/sessions.ts">create</a>({ ...params }) -> SessionCreateResponse</code>

### TranscriptionSessions

Types:

- <code><a href="./src/resources/beta/realtime/transcription-sessions.ts">TranscriptionSession</a></code>

Methods:

- <code title="post /realtime/transcription_sessions">client.beta.realtime.transcriptionSessions.<a href="./src/resources/beta/realtime/transcription-sessions.ts">create</a>({ ...params }) -> TranscriptionSession</code>

## Assistants

Types:
19 changes: 19 additions & 0 deletions examples/speech-to-text.ts
@@ -0,0 +1,19 @@
import OpenAI from 'openai';
import { recordAudio } from 'openai/helpers/audio';

const openai = new OpenAI();

async function main(): Promise<void> {
console.log('Recording for 5 seconds...');
const response = await recordAudio({ timeout: 5000, device: 4 });

console.log('Transcribing...');
const transcription = await openai.audio.transcriptions.create({
file: response,
model: 'whisper-1',
});

console.log(transcription.text);
}

main().catch(console.error);
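Alongside this non-streaming example, the PR splits the transcription params into `TranscriptionCreateParamsNonStreaming` and `TranscriptionCreateParamsStreaming` (see the exports in `src/resources/audio/index.ts` below). A hedged sketch of what a streaming payload could look like as a plain object — the field names mirror the example above plus `stream`, which is the discriminator between the two new param types; this is an illustration, not SDK code:

```typescript
// Sketch only: a streaming transcription payload. Passing an object like
// this (plus a `file`) to openai.audio.transcriptions.create would select
// the new TranscriptionCreateParamsStreaming overload and yield
// TranscriptionStreamEvent values instead of a single Transcription.
const streamingParams = {
  model: 'gpt-4o-mini-transcribe', // one of the new AudioModel values in this PR
  stream: true,                    // discriminator for the streaming overload
} as const;

console.log(streamingParams.stream); // true
```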
23 changes: 23 additions & 0 deletions examples/text-to-speech.ts
@@ -0,0 +1,23 @@
import OpenAI from 'openai';
import { playAudio } from 'openai/helpers/audio';

const openai = new OpenAI();

const exampleText = `
I see skies of blue and clouds of white
The bright blessed days, the dark sacred nights
And I think to myself
What a wonderful world
`.trim();

async function main(): Promise<void> {
const response = await openai.audio.speech.create({
model: 'tts-1',
voice: 'nova',
input: exampleText,
});

await playAudio(response);
}

main().catch(console.error);
2 changes: 1 addition & 1 deletion jsr.json
@@ -1,6 +1,6 @@
{
"name": "@openai/openai",
"version": "4.88.0",
"version": "4.89.0",
"exports": {
".": "./index.ts",
"./helpers/zod": "./helpers/zod.ts",
2 changes: 1 addition & 1 deletion package.json
@@ -1,6 +1,6 @@
{
"name": "openai",
"version": "4.88.0",
"version": "4.89.0",
"description": "The official TypeScript library for the OpenAI API",
"author": "OpenAI <support@openai.com>",
"types": "dist/index.d.ts",
145 changes: 145 additions & 0 deletions src/helpers/audio.ts
@@ -0,0 +1,145 @@
import { File } from 'formdata-node';
import { spawn } from 'node:child_process';
import { Readable } from 'node:stream';
import { platform, versions } from 'node:process';
import { Response } from 'openai/_shims';

const DEFAULT_SAMPLE_RATE = 24000;
const DEFAULT_CHANNELS = 1;

const isNode = Boolean(versions?.node);

const recordingProviders: Record<NodeJS.Platform, string> = {
win32: 'dshow',
darwin: 'avfoundation',
linux: 'alsa',
aix: 'alsa',
android: 'alsa',
freebsd: 'alsa',
haiku: 'alsa',
sunos: 'alsa',
netbsd: 'alsa',
openbsd: 'alsa',
cygwin: 'dshow',
};

function isResponse(stream: NodeJS.ReadableStream | Response | File): stream is Response {
return typeof (stream as any).body !== 'undefined';
}

function isFile(stream: NodeJS.ReadableStream | Response | File): stream is File {
return stream instanceof File;
}

async function nodejsPlayAudio(stream: NodeJS.ReadableStream | Response | File): Promise<void> {
return new Promise((resolve, reject) => {
try {
const ffplay = spawn('ffplay', ['-autoexit', '-nodisp', '-i', 'pipe:0']);

if (isResponse(stream)) {
stream.body.pipe(ffplay.stdin);
} else if (isFile(stream)) {
Readable.from(stream.stream()).pipe(ffplay.stdin);
} else {
stream.pipe(ffplay.stdin);
}

ffplay.on('close', (code: number) => {
if (code !== 0) {
reject(new Error(`ffplay process exited with code ${code}`));
return; // don't also resolve after rejecting
}
resolve();
});
} catch (error) {
reject(error);
}
});
}

export async function playAudio(input: NodeJS.ReadableStream | Response | File): Promise<void> {
if (isNode) {
return nodejsPlayAudio(input);
}

throw new Error(
'Play audio is not supported in the browser yet. Check out https://npm.im/wavtools as an alternative.',
);
}

type RecordAudioOptions = {
signal?: AbortSignal;
device?: number;
timeout?: number;
};

function nodejsRecordAudio({ signal, device, timeout }: RecordAudioOptions = {}): Promise<File> {
return new Promise((resolve, reject) => {
const data: Buffer[] = [];
const provider = recordingProviders[platform];
try {
const ffmpeg = spawn(
'ffmpeg',
[
'-f',
provider,
'-i',
`:${device ?? 0}`, // default audio input device; adjust as needed
'-ar',
DEFAULT_SAMPLE_RATE.toString(),
'-ac',
DEFAULT_CHANNELS.toString(),
'-f',
'wav',
'pipe:1',
],
{
stdio: ['ignore', 'pipe', 'pipe'],
},
);

ffmpeg.stdout.on('data', (chunk) => {
data.push(chunk);
});

ffmpeg.on('error', (error) => {
console.error(error);
reject(error);
});

ffmpeg.on('close', (code) => {
returnData();
});

function returnData() {
const audioBuffer = Buffer.concat(data);
const audioFile = new File([audioBuffer], 'audio.wav', { type: 'audio/wav' });
resolve(audioFile);
}

if (typeof timeout === 'number' && timeout > 0) {
const internalSignal = AbortSignal.timeout(timeout);
internalSignal.addEventListener('abort', () => {
ffmpeg.kill('SIGTERM');
});
}

if (signal) {
signal.addEventListener('abort', () => {
ffmpeg.kill('SIGTERM');
});
}
} catch (error) {
reject(error);
}
});
}

export async function recordAudio(options: RecordAudioOptions = {}): Promise<File> {
if (isNode) {
return nodejsRecordAudio(options);
}

throw new Error(
'Record audio is not supported in the browser. Check out https://npm.im/wavtools as an alternative.',
);
}
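The new `recordAudio` helper above chooses an ffmpeg capture backend from `process.platform` via the `recordingProviders` table. A standalone sketch of that lookup — `pickRecordingProvider` is an illustrative name, not an SDK export; the table itself mirrors the map in this file:

```typescript
// Sketch of the platform → ffmpeg input-format lookup used by recordAudio.
const recordingProviders: Record<string, string> = {
  win32: 'dshow', // DirectShow capture on Windows
  cygwin: 'dshow',
  darwin: 'avfoundation', // AVFoundation capture on macOS
  // every other supported platform falls back to ALSA
  linux: 'alsa',
  aix: 'alsa',
  android: 'alsa',
  freebsd: 'alsa',
  haiku: 'alsa',
  sunos: 'alsa',
  netbsd: 'alsa',
  openbsd: 'alsa',
};

function pickRecordingProvider(platform: string): string {
  const provider = recordingProviders[platform];
  if (!provider) throw new Error(`Unsupported platform: ${platform}`);
  return provider;
}

console.log(pickRecordingProvider('darwin')); // avfoundation
```

The chosen string becomes the `-f` argument to ffmpeg, which is why the device index passed as `-i :${device}` is interpreted differently per backend.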
17 changes: 15 additions & 2 deletions src/resources/audio/audio.ts
@@ -7,8 +7,14 @@ import * as TranscriptionsAPI from './transcriptions';
import {
Transcription,
TranscriptionCreateParams,
TranscriptionCreateParamsNonStreaming,
TranscriptionCreateParamsStreaming,
TranscriptionCreateResponse,
TranscriptionInclude,
TranscriptionSegment,
TranscriptionStreamEvent,
TranscriptionTextDeltaEvent,
TranscriptionTextDoneEvent,
TranscriptionVerbose,
TranscriptionWord,
Transcriptions,
@@ -28,11 +34,12 @@ export class Audio extends APIResource {
speech: SpeechAPI.Speech = new SpeechAPI.Speech(this._client);
}

export type AudioModel = 'whisper-1';
export type AudioModel = 'whisper-1' | 'gpt-4o-transcribe' | 'gpt-4o-mini-transcribe';

/**
* The format of the output, in one of these options: `json`, `text`, `srt`,
* `verbose_json`, or `vtt`.
* `verbose_json`, or `vtt`. For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`,
* the only supported format is `json`.
*/
export type AudioResponseFormat = 'json' | 'text' | 'srt' | 'verbose_json' | 'vtt';
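The amended doc comment narrows the output formats for the new transcribe models. A small illustrative check (my own helper, not SDK code) capturing that documented rule:

```typescript
// Illustrative only: encode the documented constraint that
// gpt-4o-transcribe and gpt-4o-mini-transcribe support only `json`,
// while whisper-1 accepts all five AudioResponseFormat values.
type AudioResponseFormat = 'json' | 'text' | 'srt' | 'verbose_json' | 'vtt';

function isSupportedResponseFormat(model: string, format: AudioResponseFormat): boolean {
  if (model === 'gpt-4o-transcribe' || model === 'gpt-4o-mini-transcribe') {
    return format === 'json';
  }
  return true; // whisper-1: json, text, srt, verbose_json, vtt
}

console.log(isSupportedResponseFormat('gpt-4o-transcribe', 'vtt')); // false
```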

@@ -46,11 +53,17 @@ export declare namespace Audio {
export {
Transcriptions as Transcriptions,
type Transcription as Transcription,
type TranscriptionInclude as TranscriptionInclude,
type TranscriptionSegment as TranscriptionSegment,
type TranscriptionStreamEvent as TranscriptionStreamEvent,
type TranscriptionTextDeltaEvent as TranscriptionTextDeltaEvent,
type TranscriptionTextDoneEvent as TranscriptionTextDoneEvent,
type TranscriptionVerbose as TranscriptionVerbose,
type TranscriptionWord as TranscriptionWord,
type TranscriptionCreateResponse as TranscriptionCreateResponse,
type TranscriptionCreateParams as TranscriptionCreateParams,
type TranscriptionCreateParamsNonStreaming as TranscriptionCreateParamsNonStreaming,
type TranscriptionCreateParamsStreaming as TranscriptionCreateParamsStreaming,
};

export {
6 changes: 6 additions & 0 deletions src/resources/audio/index.ts
@@ -5,11 +5,17 @@ export { Speech, type SpeechModel, type SpeechCreateParams } from './speech';
export {
Transcriptions,
type Transcription,
type TranscriptionInclude,
type TranscriptionSegment,
type TranscriptionStreamEvent,
type TranscriptionTextDeltaEvent,
type TranscriptionTextDoneEvent,
type TranscriptionVerbose,
type TranscriptionWord,
type TranscriptionCreateResponse,
type TranscriptionCreateParams,
type TranscriptionCreateParamsNonStreaming,
type TranscriptionCreateParamsStreaming,
} from './transcriptions';
export {
Translations,
10 changes: 8 additions & 2 deletions src/resources/audio/speech.ts
@@ -18,7 +18,7 @@ export class Speech extends APIResource {
}
}

export type SpeechModel = 'tts-1' | 'tts-1-hd';
export type SpeechModel = 'tts-1' | 'tts-1-hd' | 'gpt-4o-mini-tts';

export interface SpeechCreateParams {
/**
@@ -28,7 +28,7 @@ export interface SpeechCreateParams {

/**
* One of the available [TTS models](https://platform.openai.com/docs/models#tts):
* `tts-1` or `tts-1-hd`
* `tts-1`, `tts-1-hd`, or `gpt-4o-mini-tts`.
*/
model: (string & {}) | SpeechModel;

@@ -40,6 +40,12 @@
*/
voice: 'alloy' | 'ash' | 'coral' | 'echo' | 'fable' | 'onyx' | 'nova' | 'sage' | 'shimmer';

/**
* Control the voice of your generated audio with additional instructions. Does not
* work with `tts-1` or `tts-1-hd`.
*/
instructions?: string;

/**
* The format to return audio in. Supported formats are `mp3`, `opus`, `aac`, `flac`,
* `wav`, and `pcm`.
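The new `instructions` field pairs with the `gpt-4o-mini-tts` model added above. A hedged sketch of request bodies as plain objects — the field names mirror `SpeechCreateParams` in this file, and per the doc comment, `instructions` has no effect with `tts-1` or `tts-1-hd`:

```typescript
// Sketch of a SpeechCreateParams payload for the new model; the
// `instructions` text steers delivery (tone, pacing) for gpt-4o-mini-tts.
const speechParams = {
  model: 'gpt-4o-mini-tts',
  voice: 'coral',
  input: 'Thanks for calling! How can I help you today?',
  instructions: 'Speak in a warm, upbeat customer-service tone.',
} as const;

// For tts-1 / tts-1-hd, omit `instructions` entirely:
const legacyParams = {
  model: 'tts-1',
  voice: 'nova',
  input: 'Thanks for calling!',
} as const;

console.log(speechParams.model); // gpt-4o-mini-tts
```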