Skip to content

Commit

Permalink
feat: add realtimeAudioMinSec option
Browse files Browse the repository at this point in the history
  • Loading branch information
jhen0409 committed Dec 15, 2023
1 parent 8d5c325 commit 85066fc
Show file tree
Hide file tree
Showing 6 changed files with 25 additions and 8 deletions.
9 changes: 7 additions & 2 deletions android/src/main/java/com/rnwhisper/WhisperContext.java
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,9 @@ public int startRealtimeTranscribe(int jobId, ReadableMap options) {
final int audioSliceSec = realtimeAudioSliceSec > 0 && realtimeAudioSliceSec < audioSec ? realtimeAudioSliceSec : audioSec;
isUseSlices = audioSliceSec < audioSec;

double realtimeAudioMinSec = options.hasKey("realtimeAudioMinSec") ? options.getDouble("realtimeAudioMinSec") : 0;
final double audioMinSec = realtimeAudioMinSec > 0.5 && realtimeAudioMinSec <= audioSliceSec ? realtimeAudioMinSec : 1;

createRealtimeTranscribeJob(jobId, context, options);

sliceNSamples = new ArrayList<Integer>();
Expand Down Expand Up @@ -144,7 +147,8 @@ public void run() {
) {
finishRealtimeTranscribe(Arguments.createMap());
} else if (!isTranscribing) {
if (!vad(sliceIndex, nSamples, 0)) {
boolean isSamplesEnough = nSamples / SAMPLE_RATE >= audioMinSec;
if (!isSamplesEnough || !vad(sliceIndex, nSamples, 0)) {
finishRealtimeTranscribe(Arguments.createMap());
break;
}
Expand All @@ -169,7 +173,8 @@ public void run() {
nSamples += n;
sliceNSamples.set(sliceIndex, nSamples);

if (!isSpeech) continue;
boolean isSamplesEnough = nSamples / SAMPLE_RATE >= audioMinSec;
if (!isSamplesEnough || !isSpeech) continue;

if (!isTranscribing && nSamples > SAMPLE_RATE / 2) {
isTranscribing = true;
Expand Down
5 changes: 3 additions & 2 deletions android/src/main/jni.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ struct whisper_full_params createFullParams(JNIEnv *env, jobject options) {
params.print_progress = false;
params.print_timestamps = false;
params.print_special = false;

int max_threads = std::thread::hardware_concurrency();
// Use 2 threads by default on 4-core devices, 4 threads on more cores
int default_n_threads = max_threads == 4 ? 2 : min(4, max_threads);
Expand Down Expand Up @@ -307,7 +307,7 @@ Java_com_rnwhisper_WhisperContext_fullWithNewJob(
// whisper_print_timings(context);
}
env->ReleaseFloatArrayElements(audio_data, audio_data_arr, JNI_ABORT);

if (job->is_aborted()) code = -999;
rnwhisper::job_remove(job_id);
return code;
Expand Down Expand Up @@ -339,6 +339,7 @@ Java_com_rnwhisper_WhisperContext_createRealtimeTranscribeJob(
vad,
readablemap::getInt(env, options, "realtimeAudioSec", 0),
readablemap::getInt(env, options, "realtimeAudioSliceSec", 0),
readablemap::getFloat(env, options, "realtimeAudioMinSec", 0),
audio_output_path_str
);
}
Expand Down
2 changes: 2 additions & 0 deletions cpp/rn-whisper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,12 +63,14 @@ void job::set_realtime_params(
vad_params params,
int sec,
int slice_sec,
float min_sec,
const char* output_path
) {
// Configure this job for realtime transcription, normalizing each
// caller-supplied value into the range the job accepts.
vad = params;
// Enforce a floor of 2000 on vad_ms.
if (vad.vad_ms < 2000) vad.vad_ms = 2000;
// A non-positive `sec` falls back to DEFAULT_MAX_AUDIO_SEC.
audio_sec = sec > 0 ? sec : DEFAULT_MAX_AUDIO_SEC;
// `slice_sec` is honored only when it is positive and strictly less than
// audio_sec; otherwise the slice length equals audio_sec.
audio_slice_sec = slice_sec > 0 && slice_sec < audio_sec ? slice_sec : audio_sec;
// `min_sec` is accepted only within [0.5, audio_slice_sec] (both inclusive);
// any other value, including the 0 passed when the option is absent,
// becomes 1.0.
audio_min_sec = min_sec >= 0.5 && min_sec <= audio_slice_sec ? min_sec : 1.0f;
// The pointer is stored as-is; no copy of the string is made here.
// NOTE(review): presumably the caller must keep it alive for the job's lifetime — confirm.
audio_output_path = output_path;
}

Expand Down
5 changes: 3 additions & 2 deletions cpp/rn-whisper.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,10 @@ struct job {
vad_params vad;
int audio_sec = 0;
int audio_slice_sec = 0;
float audio_min_sec = 0;
const char* audio_output_path = nullptr;
std::vector<short *> pcm_slices;
void set_realtime_params(vad_params vad, int sec, int slice_sec, const char* output_path);
void set_realtime_params(vad_params vad, int sec, int slice_sec, float min_sec, const char* output_path);
bool vad_simple(int slice_index, int n_samples, int n);
void put_pcm_data(short* pcm, int slice_index, int n_samples, int n);
float* pcm_slice_to_f32(int slice_index, int size);
Expand All @@ -46,4 +47,4 @@ job* job_get(int job_id);

} // namespace rnwhisper

#endif // RNWHISPER_H
#endif // RNWHISPER_H
7 changes: 5 additions & 2 deletions ios/RNWhisperContext.mm
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ - (void)prepareRealtime:(int)jobId options:(NSDictionary *)options {
},
options[@"realtimeAudioSec"] != nil ? [options[@"realtimeAudioSec"] intValue] : 0,
options[@"realtimeAudioSliceSec"] != nil ? [options[@"realtimeAudioSliceSec"] intValue] : 0,
options[@"realtimeAudioMinSec"] != nil ? [options[@"realtimeAudioMinSec"] floatValue] : 0,
options[@"audioOutputPath"] != nil ? [options[@"audioOutputPath"] UTF8String] : nullptr
);
self->recordState.isUseSlices = self->recordState.job->audio_slice_sec < self->recordState.job->audio_sec;
Expand Down Expand Up @@ -181,7 +182,8 @@ void AudioInputCallback(void * inUserData,
!state->isTranscribing &&
nSamples != state->nSamplesTranscribing
) {
if (!vad(state, state->sliceIndex, nSamples, 0)) {
bool isSamplesEnough = nSamples / WHISPER_SAMPLE_RATE >= state->job->audio_min_sec;
if (!isSamplesEnough || !vad(state, state->sliceIndex, nSamples, 0)) {
[state->mSelf finishRealtimeTranscribe:state result:@{}];
return;
}
Expand Down Expand Up @@ -210,7 +212,8 @@ void AudioInputCallback(void * inUserData,

AudioQueueEnqueueBuffer(state->queue, inBuffer, 0, NULL);

if (!isSpeech) return;
bool isSamplesEnough = nSamples / WHISPER_SAMPLE_RATE >= state->job->audio_min_sec;
if (!isSamplesEnough || !isSpeech) return;

if (!state->isTranscribing) {
state->isTranscribing = true;
Expand Down
5 changes: 5 additions & 0 deletions src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,11 @@ export type TranscribeRealtimeOptions = TranscribeOptions & {
* (Default: Equal to `realtimeAudioSec`)
*/
realtimeAudioSliceSec?: number
/**
* Minimum duration of audio, in seconds, required before transcription starts for each slice.
* The minimum value is 0.5 seconds and the maximum value is `realtimeAudioSliceSec` (Default: 1)
*/
realtimeAudioMinSec?: number
/**
* Output path for audio file. If not set, the audio file will not be saved
* (Default: Undefined)
Expand Down

0 comments on commit 85066fc

Please sign in to comment.