Skip to content

Commit

Permalink
✨ feat: Add OpenAI STT
Browse files Browse the repository at this point in the history
  • Loading branch information
canisminor1990 committed Nov 8, 2023
1 parent 10592aa commit e6b1916
Show file tree
Hide file tree
Showing 41 changed files with 853 additions and 311 deletions.
1 change: 1 addition & 0 deletions .eslintrc.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,6 @@ const config = require('@lobehub/lint').eslint;
config.rules['no-param-reassign'] = 0;
config.rules['unicorn/no-array-callback-reference'] = 0;
config.rules['unicorn/no-array-for-each'] = 0;
config.rules['unicorn/no-useless-undefined'] = 0;

module.exports = config;
2 changes: 1 addition & 1 deletion src/const/api.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ export const AZURE_SPEECH_REGION = process.env.AZURE_SPEECH_REGION || '';
export const OPENAI_API_KEY = process.env.OPENAI_API_KEY || '';
export const OPENAI_PROXY_URL = process.env.OPENAI_PROXY_URL || 'https://api.openai.com/v1';
export const OPENAI_TTS_URL = (api?: string) => urlJoin(api || OPENAI_PROXY_URL, 'audio/speech');
export const OPENAI_STT_URL = (api: string) =>
export const OPENAI_STT_URL = (api?: string) =>
urlJoin(api || OPENAI_PROXY_URL, 'audio/transcriptions');
export const EDDGE_PROXY_URL =
process.env.EDDGE_PROXY_URL ||
Expand Down
2 changes: 2 additions & 0 deletions src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,11 @@ export { type EdgeSpeechOptions, fetchEdgeSpeech } from './services/fetchEdgeSpe
export { fetchMicrosoftSpeech, type MicrosoftSpeechOptions } from './services/fetchMicrosoftSpeech';
export { fetchOpenaiSTT, type OpenaiSttOptions } from './services/fetchOpenaiSTT';
export { fetchOpenaiTTS, type OpenaiTtsOptions } from './services/fetchOpenaiTTS';
export { useAudioRecorder } from './useAudioRecorder';
export { useAzureSpeech } from './useAzureSpeech';
export { useEdgeSpeech } from './useEdgeSpeech';
export { useMicrosoftSpeech } from './useMicrosoftSpeech';
export { useOpenaiSTT, useOpenaiSTTWithPSR, useOpenaiSTTWithSR } from './useOpenaiSTT';
export { useOpenaiTTS } from './useOpenaiTTS';
export { usePersistedSpeechRecognition } from './useSpeechRecognition/usePersistedSpeechRecognition';
export { useSpeechRecognition } from './useSpeechRecognition/useSpeechRecognition';
Expand Down
2 changes: 1 addition & 1 deletion src/server/handleAzureSpeechRequest.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ import {
SpeechSynthesizer,
} from 'microsoft-cognitiveservices-speech-sdk';

import { AZURE_SPEECH_KEY, AZURE_SPEECH_REGION } from '@/const/api';
import { AZURE_SPEECH_KEY, AZURE_SPEECH_REGION } from '../const/api';

const fetchAzureSpeech = async (ssml: string, { api }: any): Promise<ArrayBuffer> => {
const key = api.key || AZURE_SPEECH_KEY;
Expand Down
2 changes: 1 addition & 1 deletion src/server/handleMicrosoftSpeechRequest.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import { v4 as uuidv4 } from 'uuid';

import { MICROSOFT_SPPECH_URL } from '@/const/api';
import { MICROSOFT_SPPECH_URL } from '../const/api';

export const handleMicrosoftSpeechRequest = async (req: Request) => {
const DEFAULT_HEADERS = new Headers({
Expand Down
11 changes: 4 additions & 7 deletions src/services/fetchAzureSpeech.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import { AZURE_SPEECH_PROXY_URL } from '@/const/api';
import { arrayBufferConvert } from '@/utils/arrayBufferConvert';
import { type SsmlOptions, genSSML } from '@/utils/genSSML';

export interface AzureSpeechOptions extends SsmlOptions {
Expand All @@ -11,7 +12,7 @@ export interface AzureSpeechOptions extends SsmlOptions {
export const fetchAzureSpeech = async (
text: string,
{ api, ...options }: AzureSpeechOptions,
): Promise<AudioBufferSourceNode> => {
): Promise<Blob> => {
const data = JSON.stringify({
api,
ssml: genSSML(text, options),
Expand All @@ -28,10 +29,6 @@ export const fetchAzureSpeech = async (
throw new Error('Network response was not ok');
}

const audioData = await response.arrayBuffer();
const audioContext = new AudioContext();
const audioBufferSource = audioContext.createBufferSource();
audioBufferSource.buffer = await audioContext.decodeAudioData(audioData);
audioBufferSource.connect(audioContext.destination);
return audioBufferSource;
const arrayBuffer = await response.arrayBuffer();
return await arrayBufferConvert(arrayBuffer);
};
14 changes: 7 additions & 7 deletions src/services/fetchEdgeSpeech.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ import qs from 'query-string';
import { v4 as uuidv4 } from 'uuid';

import { EDDGE_API_TOKEN, EDDGE_PROXY_URL } from '@/const/api';
import { arrayBufferConvert } from '@/utils/arrayBufferConvert';
import { type SsmlOptions, genSSML } from '@/utils/genSSML';
import { genSendContent } from '@/utils/genSendContent';
import { getHeadersAndData } from '@/utils/getHeadersAndData';
Expand All @@ -12,11 +13,12 @@ export interface EdgeSpeechOptions extends Pick<SsmlOptions, 'name'> {
proxy: string;
};
}
export const fetchEdgeSpeech = async (text: string, { api, ...options }: EdgeSpeechOptions) => {
export const fetchEdgeSpeech = async (
text: string,
{ api, ...options }: EdgeSpeechOptions,
): Promise<Blob> => {
const connectId = uuidv4().replaceAll('-', '');
const date = new Date().toString();
const audioContext = new AudioContext();
const audioBufferSource = audioContext.createBufferSource();

const ws = new WebSocket(
qs.stringifyUrl({
Expand Down Expand Up @@ -61,7 +63,7 @@ export const fetchEdgeSpeech = async (text: string, { api, ...options }: EdgeSpe
);
});

return new Promise<AudioBufferSourceNode>((resolve) => {
return new Promise((resolve) => {
let audioData = new ArrayBuffer(0);
let downloadAudio = false;

Expand All @@ -77,9 +79,7 @@ export const fetchEdgeSpeech = async (text: string, { api, ...options }: EdgeSpe
case 'turn.end': {
downloadAudio = false;
if (!audioData.byteLength) return;
audioBufferSource.buffer = await audioContext.decodeAudioData(audioData);
audioBufferSource.connect(audioContext.destination);
resolve(audioBufferSource);
resolve(await arrayBufferConvert(audioData));
break;
}
}
Expand Down
11 changes: 4 additions & 7 deletions src/services/fetchMicrosoftSpeech.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import { MICROSOFT_SPEECH_PROXY_URL } from '@/const/api';
import { arrayBufferConvert } from '@/utils/arrayBufferConvert';
import { type SsmlOptions } from '@/utils/genSSML';
import { genSSML } from '@/utils/genSSML';

Expand All @@ -9,7 +10,7 @@ export interface MicrosoftSpeechOptions extends SsmlOptions {
export const fetchMicrosoftSpeech = async (
text: string,
{ api, ...options }: MicrosoftSpeechOptions,
): Promise<AudioBufferSourceNode> => {
): Promise<Blob> => {
const data = JSON.stringify({
offsetInPlainText: 0,
properties: {
Expand All @@ -30,10 +31,6 @@ export const fetchMicrosoftSpeech = async (
throw new Error('Network response was not ok');
}

const audioData = await response.arrayBuffer();
const audioContext = new AudioContext();
const audioBufferSource = audioContext.createBufferSource();
audioBufferSource.buffer = await audioContext.decodeAudioData(audioData);
audioBufferSource.connect(audioContext.destination);
return audioBufferSource;
const arrayBuffer = await response.arrayBuffer();
return await arrayBufferConvert(arrayBuffer);
};
18 changes: 11 additions & 7 deletions src/services/fetchOpenaiSTT.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@ import { v4 as uuidv4 } from 'uuid';
import { OPENAI_API_KEY, OPENAI_STT_URL } from '@/const/api';

export interface OpenaiSttOptions {
api: {
api?: {
key: string;
proxy: string;
proxy?: string;
};
model?: 'whisper-1';
}
Expand All @@ -15,16 +15,18 @@ export const fetchOpenaiSTT = async (
speech: Blob,
{ api, model = 'whisper-1' }: OpenaiSttOptions,
): Promise<string> => {
const key = api.key || OPENAI_API_KEY;
const url = OPENAI_STT_URL(api.proxy);
const key = api?.key || OPENAI_API_KEY;
const url = OPENAI_STT_URL(api?.proxy);

const headers = new Headers({
'Authorization': `Bearer ${key}`,
'Content-Type': 'multipart/form-data',
Authorization: `Bearer ${key}`,
});

const filename = `${uuidv4()}.webm`;
const file = new File([speech], filename, { type: 'audio/webm' });

const body = new FormData();
body.append('file', speech, `${uuidv4()}.webm`);
body.append('file', file);
body.append('model', model);

const response: Response = await fetch(url, { body, headers, method: 'POST' });
Expand All @@ -35,5 +37,7 @@ export const fetchOpenaiSTT = async (

const json = await response.json();

console.log(json);

return json?.text;
};
13 changes: 4 additions & 9 deletions src/services/fetchOpenaiTTS.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import { OPENAI_API_KEY, OPENAI_TTS_URL } from '@/const/api';
import { arrayBufferConvert } from '@/utils/arrayBufferConvert';
import { type SsmlOptions } from '@/utils/genSSML';

export type OpenaiVoice = 'alloy' | 'echo' | 'fable' | 'onyx' | 'nova' | 'shimmer';
Expand All @@ -11,12 +12,10 @@ export interface OpenaiTtsOptions extends Pick<SsmlOptions, 'name'> {
model?: 'tts-1' | 'tts-1-hd';
name: OpenaiVoice;
}

// 纯文本生成语音
export const fetchOpenaiTTS = async (
text: string,
{ api, model = 'tts-1', ...options }: OpenaiTtsOptions,
): Promise<AudioBufferSourceNode> => {
): Promise<Blob> => {
const key = api.key || OPENAI_API_KEY;
const url = OPENAI_TTS_URL(api.proxy);

Expand All @@ -37,10 +36,6 @@ export const fetchOpenaiTTS = async (
throw new Error('Network response was not ok');
}

const audioData = await response.arrayBuffer();
const audioContext = new AudioContext();
const audioBufferSource = audioContext.createBufferSource();
audioBufferSource.buffer = await audioContext.decodeAudioData(audioData);
audioBufferSource.connect(audioContext.destination);
return audioBufferSource;
const arrayBuffer = await response.arrayBuffer();
return await arrayBufferConvert(arrayBuffer);
};
23 changes: 23 additions & 0 deletions src/useAudioRecorder/demos/index.tsx
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import { useAudioRecorder } from '@lobehub/tts';
import { Icon } from '@lobehub/ui';
import { Button } from 'antd';
import { Mic, StopCircle } from 'lucide-react';
import { Flexbox } from 'react-layout-kit';

/**
 * Demo: capture microphone audio with `useAudioRecorder`, then play back
 * the recorded clip through a plain `<audio>` element.
 */
export default () => {
  const recorder = useAudioRecorder();

  // While recording, show a stop button with the elapsed time; otherwise
  // show the primary record button.
  const actionButton = recorder.isRecording ? (
    <Button block icon={<Icon icon={StopCircle} />} onClick={recorder.stop}>
      Stop {recorder.formattedTime}
    </Button>
  ) : (
    <Button block icon={<Icon icon={Mic} />} onClick={recorder.start} type={'primary'}>
      Record
    </Button>
  );

  return (
    <Flexbox gap={8}>
      {actionButton}
      {recorder.url && <audio controls src={recorder.url} />}
    </Flexbox>
  );
};
9 changes: 9 additions & 0 deletions src/useAudioRecorder/index.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
---
nav: Components
group: STT
title: useAudioRecorder
---

## hooks

<code src="./demos/index.tsx" center></code>
74 changes: 74 additions & 0 deletions src/useAudioRecorder/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import { useCallback, useEffect, useMemo, useState } from 'react';

import { secondsToMinutesAndSeconds } from '@/utils/secondsToMinutesAndSeconds';

export const useAudioRecorder = (onBolbAvailable?: (blob: Blob) => void) => {
const [isRecording, setIsRecording] = useState(false);

const [time, setTime] = useState(0);
const [mediaRecorder, setMediaRecorder] = useState<MediaRecorder>();
// eslint-disable-next-line no-undef
const [timerInterval, setTimerInterval] = useState<NodeJS.Timer>();
const [blob, setBlob] = useState<Blob>();

const _startTimer = useCallback(() => {
const interval = setInterval(() => {
setTime((time) => time + 1);
}, 1000);
setTimerInterval(interval);
}, []);

const _stopTimer = useCallback(() => {
// @ts-ignore
// eslint-disable-next-line @typescript-eslint/no-unused-expressions
timerInterval !== undefined && clearInterval(timerInterval);
// @ts-ignore
setTimerInterval();
}, [timerInterval]);

const start = useCallback(() => {
setBlob(undefined);
if (timerInterval !== undefined) return;

navigator.mediaDevices
.getUserMedia({ audio: true })
.then((stream) => {
setIsRecording(true);
const recorder: MediaRecorder = new MediaRecorder(stream, { mimeType: 'audio/webm' });
setMediaRecorder(recorder);
recorder.start();
_startTimer();

recorder.addEventListener('dataavailable', (event) => {
setBlob(event.data);
onBolbAvailable?.(event.data);
recorder.stream.getTracks().forEach((t) => t.stop());
// @ts-ignore
setMediaRecorder();
});
})
.catch((error: DOMException) => {
console.log(error.name, error.message, error.cause);
});
}, [timerInterval, _startTimer]);

const stop = useCallback(() => {
mediaRecorder?.stop();
_stopTimer();
setTime(0);
setIsRecording(false);
}, [mediaRecorder, _stopTimer]);

const url = useMemo(() => blob && URL.createObjectURL(blob), [blob]);

return {
blob,
formattedTime: secondsToMinutesAndSeconds(time),
isRecording,
mediaRecorder,
start,
stop,
time,
url,
};
};
3 changes: 2 additions & 1 deletion src/useAzureSpeech/demos/index.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ export default () => {
},
{ store },
);
const { setText, isLoading, isPlaying, start, stop } = useAzureSpeech(defaultText, {
const { setText, isLoading, isPlaying, start, stop, url } = useAzureSpeech(defaultText, {
api,
...options,
});
Expand All @@ -81,6 +81,7 @@ export default () => {
</Button>
)}
<Input.TextArea defaultValue={defaultText} onChange={(e) => setText(e.target.value)} />
{url && <audio controls src={url} />}
</Flexbox>
</StoryBook>
);
Expand Down
Loading

0 comments on commit e6b1916

Please sign in to comment.