Skip to content

Commit

Permalink
✨ feat: Add OpenAI STT
Browse files Browse the repository at this point in the history
  • Loading branch information
canisminor1990 committed Nov 8, 2023
1 parent 10592aa commit e6b1916
Show file tree
Hide file tree
Showing 41 changed files with 853 additions and 311 deletions.
1 change: 1 addition & 0 deletions .eslintrc.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,6 @@ const config = require('@lobehub/lint').eslint;
config.rules['no-param-reassign'] = 0;
config.rules['unicorn/no-array-callback-reference'] = 0;
config.rules['unicorn/no-array-for-each'] = 0;
config.rules['unicorn/no-useless-undefined'] = 0;

module.exports = config;
2 changes: 1 addition & 1 deletion src/const/api.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ export const AZURE_SPEECH_REGION = process.env.AZURE_SPEECH_REGION || '';
export const OPENAI_API_KEY = process.env.OPENAI_API_KEY || '';
export const OPENAI_PROXY_URL = process.env.OPENAI_PROXY_URL || 'https://api.openai.com/v1';
export const OPENAI_TTS_URL = (api?: string) => urlJoin(api || OPENAI_PROXY_URL, 'audio/speech');
export const OPENAI_STT_URL = (api: string) =>
export const OPENAI_STT_URL = (api?: string) =>
urlJoin(api || OPENAI_PROXY_URL, 'audio/transcriptions');
export const EDDGE_PROXY_URL =
process.env.EDDGE_PROXY_URL ||
Expand Down
2 changes: 2 additions & 0 deletions src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,11 @@ export { type EdgeSpeechOptions, fetchEdgeSpeech } from './services/fetchEdgeSpe
export { fetchMicrosoftSpeech, type MicrosoftSpeechOptions } from './services/fetchMicrosoftSpeech';
export { fetchOpenaiSTT, type OpenaiSttOptions } from './services/fetchOpenaiSTT';
export { fetchOpenaiTTS, type OpenaiTtsOptions } from './services/fetchOpenaiTTS';
export { useAudioRecorder } from './useAudioRecorder';
export { useAzureSpeech } from './useAzureSpeech';
export { useEdgeSpeech } from './useEdgeSpeech';
export { useMicrosoftSpeech } from './useMicrosoftSpeech';
export { useOpenaiSTT, useOpenaiSTTWithPSR, useOpenaiSTTWithSR } from './useOpenaiSTT';
export { useOpenaiTTS } from './useOpenaiTTS';
export { usePersistedSpeechRecognition } from './useSpeechRecognition/usePersistedSpeechRecognition';
export { useSpeechRecognition } from './useSpeechRecognition/useSpeechRecognition';
Expand Down
2 changes: 1 addition & 1 deletion src/server/handleAzureSpeechRequest.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ import {
SpeechSynthesizer,
} from 'microsoft-cognitiveservices-speech-sdk';

import { AZURE_SPEECH_KEY, AZURE_SPEECH_REGION } from '@/const/api';
import { AZURE_SPEECH_KEY, AZURE_SPEECH_REGION } from '../const/api';

const fetchAzureSpeech = async (ssml: string, { api }: any): Promise<ArrayBuffer> => {
const key = api.key || AZURE_SPEECH_KEY;
Expand Down
2 changes: 1 addition & 1 deletion src/server/handleMicrosoftSpeechRequest.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import { v4 as uuidv4 } from 'uuid';

import { MICROSOFT_SPPECH_URL } from '@/const/api';
import { MICROSOFT_SPPECH_URL } from '../const/api';

export const handleMicrosoftSpeechRequest = async (req: Request) => {
const DEFAULT_HEADERS = new Headers({
Expand Down
11 changes: 4 additions & 7 deletions src/services/fetchAzureSpeech.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import { AZURE_SPEECH_PROXY_URL } from '@/const/api';
import { arrayBufferConvert } from '@/utils/arrayBufferConvert';
import { type SsmlOptions, genSSML } from '@/utils/genSSML';

export interface AzureSpeechOptions extends SsmlOptions {
Expand All @@ -11,7 +12,7 @@ export interface AzureSpeechOptions extends SsmlOptions {
export const fetchAzureSpeech = async (
text: string,
{ api, ...options }: AzureSpeechOptions,
): Promise<AudioBufferSourceNode> => {
): Promise<Blob> => {
const data = JSON.stringify({
api,
ssml: genSSML(text, options),
Expand All @@ -28,10 +29,6 @@ export const fetchAzureSpeech = async (
throw new Error('Network response was not ok');
}

const audioData = await response.arrayBuffer();
const audioContext = new AudioContext();
const audioBufferSource = audioContext.createBufferSource();
audioBufferSource.buffer = await audioContext.decodeAudioData(audioData);
audioBufferSource.connect(audioContext.destination);
return audioBufferSource;
const arrayBuffer = await response.arrayBuffer();
return await arrayBufferConvert(arrayBuffer);
};
14 changes: 7 additions & 7 deletions src/services/fetchEdgeSpeech.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ import qs from 'query-string';
import { v4 as uuidv4 } from 'uuid';

import { EDDGE_API_TOKEN, EDDGE_PROXY_URL } from '@/const/api';
import { arrayBufferConvert } from '@/utils/arrayBufferConvert';
import { type SsmlOptions, genSSML } from '@/utils/genSSML';
import { genSendContent } from '@/utils/genSendContent';
import { getHeadersAndData } from '@/utils/getHeadersAndData';
Expand All @@ -12,11 +13,12 @@ export interface EdgeSpeechOptions extends Pick<SsmlOptions, 'name'> {
proxy: string;
};
}
export const fetchEdgeSpeech = async (text: string, { api, ...options }: EdgeSpeechOptions) => {
export const fetchEdgeSpeech = async (
text: string,
{ api, ...options }: EdgeSpeechOptions,
): Promise<Blob> => {
const connectId = uuidv4().replaceAll('-', '');
const date = new Date().toString();
const audioContext = new AudioContext();
const audioBufferSource = audioContext.createBufferSource();

const ws = new WebSocket(
qs.stringifyUrl({
Expand Down Expand Up @@ -61,7 +63,7 @@ export const fetchEdgeSpeech = async (text: string, { api, ...options }: EdgeSpe
);
});

return new Promise<AudioBufferSourceNode>((resolve) => {
return new Promise((resolve) => {
let audioData = new ArrayBuffer(0);
let downloadAudio = false;

Expand All @@ -77,9 +79,7 @@ export const fetchEdgeSpeech = async (text: string, { api, ...options }: EdgeSpe
case 'turn.end': {
downloadAudio = false;
if (!audioData.byteLength) return;
audioBufferSource.buffer = await audioContext.decodeAudioData(audioData);
audioBufferSource.connect(audioContext.destination);
resolve(audioBufferSource);
resolve(await arrayBufferConvert(audioData));
break;
}
}
Expand Down
11 changes: 4 additions & 7 deletions src/services/fetchMicrosoftSpeech.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import { MICROSOFT_SPEECH_PROXY_URL } from '@/const/api';
import { arrayBufferConvert } from '@/utils/arrayBufferConvert';
import { type SsmlOptions } from '@/utils/genSSML';
import { genSSML } from '@/utils/genSSML';

Expand All @@ -9,7 +10,7 @@ export interface MicrosoftSpeechOptions extends SsmlOptions {
export const fetchMicrosoftSpeech = async (
text: string,
{ api, ...options }: MicrosoftSpeechOptions,
): Promise<AudioBufferSourceNode> => {
): Promise<Blob> => {
const data = JSON.stringify({
offsetInPlainText: 0,
properties: {
Expand All @@ -30,10 +31,6 @@ export const fetchMicrosoftSpeech = async (
throw new Error('Network response was not ok');
}

const audioData = await response.arrayBuffer();
const audioContext = new AudioContext();
const audioBufferSource = audioContext.createBufferSource();
audioBufferSource.buffer = await audioContext.decodeAudioData(audioData);
audioBufferSource.connect(audioContext.destination);
return audioBufferSource;
const arrayBuffer = await response.arrayBuffer();
return await arrayBufferConvert(arrayBuffer);
};
18 changes: 11 additions & 7 deletions src/services/fetchOpenaiSTT.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@ import { v4 as uuidv4 } from 'uuid';
import { OPENAI_API_KEY, OPENAI_STT_URL } from '@/const/api';

export interface OpenaiSttOptions {
api: {
api?: {
key: string;
proxy: string;
proxy?: string;
};
model?: 'whisper-1';
}
Expand All @@ -15,16 +15,18 @@ export const fetchOpenaiSTT = async (
speech: Blob,
{ api, model = 'whisper-1' }: OpenaiSttOptions,
): Promise<string> => {
const key = api.key || OPENAI_API_KEY;
const url = OPENAI_STT_URL(api.proxy);
const key = api?.key || OPENAI_API_KEY;
const url = OPENAI_STT_URL(api?.proxy);

const headers = new Headers({
'Authorization': `Bearer ${key}`,
'Content-Type': 'multipart/form-data',
Authorization: `Bearer ${key}`,
});

const filename = `${uuidv4()}.webm`;
const file = new File([speech], filename, { type: 'audio/webm' });

const body = new FormData();
body.append('file', speech, `${uuidv4()}.webm`);
body.append('file', file);
body.append('model', model);

const response: Response = await fetch(url, { body, headers, method: 'POST' });
Expand All @@ -35,5 +37,7 @@ export const fetchOpenaiSTT = async (

const json = await response.json();

console.log(json);

return json?.text;
};
13 changes: 4 additions & 9 deletions src/services/fetchOpenaiTTS.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import { OPENAI_API_KEY, OPENAI_TTS_URL } from '@/const/api';
import { arrayBufferConvert } from '@/utils/arrayBufferConvert';
import { type SsmlOptions } from '@/utils/genSSML';

export type OpenaiVoice = 'alloy' | 'echo' | 'fable' | 'onyx' | 'nova' | 'shimmer';
Expand All @@ -11,12 +12,10 @@ export interface OpenaiTtsOptions extends Pick<SsmlOptions, 'name'> {
model?: 'tts-1' | 'tts-1-hd';
name: OpenaiVoice;
}

// 纯文本生成语音
export const fetchOpenaiTTS = async (
text: string,
{ api, model = 'tts-1', ...options }: OpenaiTtsOptions,
): Promise<AudioBufferSourceNode> => {
): Promise<Blob> => {
const key = api.key || OPENAI_API_KEY;
const url = OPENAI_TTS_URL(api.proxy);

Expand All @@ -37,10 +36,6 @@ export const fetchOpenaiTTS = async (
throw new Error('Network response was not ok');
}

const audioData = await response.arrayBuffer();
const audioContext = new AudioContext();
const audioBufferSource = audioContext.createBufferSource();
audioBufferSource.buffer = await audioContext.decodeAudioData(audioData);
audioBufferSource.connect(audioContext.destination);
return audioBufferSource;
const arrayBuffer = await response.arrayBuffer();
return await arrayBufferConvert(arrayBuffer);
};
23 changes: 23 additions & 0 deletions src/useAudioRecorder/demos/index.tsx
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import { useAudioRecorder } from '@lobehub/tts';
import { Icon } from '@lobehub/ui';
import { Button } from 'antd';
import { Mic, StopCircle } from 'lucide-react';
import { Flexbox } from 'react-layout-kit';

/**
 * Demo: capture microphone audio with `useAudioRecorder`, then play back
 * the recorded clip through a plain `<audio>` element.
 */
export default () => {
  const recorder = useAudioRecorder();

  // While recording, show a stop button with the elapsed time; otherwise
  // show the primary record button.
  const actionButton = recorder.isRecording ? (
    <Button block icon={<Icon icon={StopCircle} />} onClick={recorder.stop}>
      Stop {recorder.formattedTime}
    </Button>
  ) : (
    <Button block icon={<Icon icon={Mic} />} onClick={recorder.start} type={'primary'}>
      Record
    </Button>
  );

  return (
    <Flexbox gap={8}>
      {actionButton}
      {recorder.url && <audio controls src={recorder.url} />}
    </Flexbox>
  );
};
9 changes: 9 additions & 0 deletions src/useAudioRecorder/index.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
---
nav: Components
group: STT
title: useAudioRecorder
---

## hooks

<code src="./demos/index.tsx" center></code>
74 changes: 74 additions & 0 deletions src/useAudioRecorder/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import { useCallback, useEffect, useMemo, useState } from 'react';

import { secondsToMinutesAndSeconds } from '@/utils/secondsToMinutesAndSeconds';

export const useAudioRecorder = (onBolbAvailable?: (blob: Blob) => void) => {
const [isRecording, setIsRecording] = useState(false);

const [time, setTime] = useState(0);
const [mediaRecorder, setMediaRecorder] = useState<MediaRecorder>();
// eslint-disable-next-line no-undef
const [timerInterval, setTimerInterval] = useState<NodeJS.Timer>();
const [blob, setBlob] = useState<Blob>();

const _startTimer = useCallback(() => {
const interval = setInterval(() => {
setTime((time) => time + 1);
}, 1000);
setTimerInterval(interval);
}, []);

const _stopTimer = useCallback(() => {
// @ts-ignore
// eslint-disable-next-line @typescript-eslint/no-unused-expressions
timerInterval !== undefined && clearInterval(timerInterval);
// @ts-ignore
setTimerInterval();
}, [timerInterval]);

const start = useCallback(() => {
setBlob(undefined);
if (timerInterval !== undefined) return;

navigator.mediaDevices
.getUserMedia({ audio: true })
.then((stream) => {
setIsRecording(true);
const recorder: MediaRecorder = new MediaRecorder(stream, { mimeType: 'audio/webm' });
setMediaRecorder(recorder);
recorder.start();
_startTimer();

recorder.addEventListener('dataavailable', (event) => {
setBlob(event.data);
onBolbAvailable?.(event.data);
recorder.stream.getTracks().forEach((t) => t.stop());
// @ts-ignore
setMediaRecorder();
});
})
.catch((error: DOMException) => {
console.log(error.name, error.message, error.cause);
});
}, [timerInterval, _startTimer]);

const stop = useCallback(() => {
mediaRecorder?.stop();
_stopTimer();
setTime(0);
setIsRecording(false);
}, [mediaRecorder, _stopTimer]);

const url = useMemo(() => blob && URL.createObjectURL(blob), [blob]);

return {
blob,
formattedTime: secondsToMinutesAndSeconds(time),
isRecording,
mediaRecorder,
start,
stop,
time,
url,
};
};
3 changes: 2 additions & 1 deletion src/useAzureSpeech/demos/index.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ export default () => {
},
{ store },
);
const { setText, isLoading, isPlaying, start, stop } = useAzureSpeech(defaultText, {
const { setText, isLoading, isPlaying, start, stop, url } = useAzureSpeech(defaultText, {
api,
...options,
});
Expand All @@ -81,6 +81,7 @@ export default () => {
</Button>
)}
<Input.TextArea defaultValue={defaultText} onChange={(e) => setText(e.target.value)} />
{url && <audio controls src={url} />}
</Flexbox>
</StoryBook>
);
Expand Down
Loading

0 comments on commit e6b1916

Please sign in to comment.