Bad language detection on code switch and diarization in multi-language mode #874
-
|
Hi, I just used your new multi-language model, kudos. I have a 12-second file with two speakers: a male speaker talks in English and stops, then a female speaker speaks Spanish. I tried Nova-2 with diarization in multi-language mode. Is there a way to get better performance on this? My STT request was: The response was (note how all the words are attributed to speaker 0): {
"metadata": {
"transaction_key": "deprecated",
"request_id": "***",
"sha256": "539e08b6250d9ccdc960261a80fc20a32cb07830080edb57f8141dea581134cf",
"created": "2024-07-30T15:53:37.149Z",
"duration": 12.223,
"channels": 1,
"models": [
"dc8a3fe5-a395-4b75-a8b1-71c9a5a87526"
],
"model_info": {
"dc8a3fe5-a395-4b75-a8b1-71c9a5a87526": {
"name": "2-general-nova",
"version": "1999-06-13.21385",
"arch": "nova-2"
}
}
},
"results": {
"channels": [
{
"alternatives": [
{
"transcript": "Hi, how are you? No, puedo hablar mucho, pero cuando hablo, sé lo mejor.",
"confidence": 0.96240234,
"languages": [
"en",
"es"
],
"words": [
{
"word": "hi",
"start": 0.96,
"end": 1.04,
"confidence": 0.88964844,
"speaker": 0,
"speaker_confidence": 0.7109375,
"punctuated_word": "Hi,",
"language": "en"
},
{
"word": "how",
"start": 1.04,
"end": 1.36,
"confidence": 0.9902344,
"speaker": 0,
"speaker_confidence": 0.7109375,
"punctuated_word": "how",
"language": "en"
},
{
"word": "are",
"start": 1.36,
"end": 1.4399999,
"confidence": 0.9980469,
"speaker": 0,
"speaker_confidence": 0.7109375,
"punctuated_word": "are",
"language": "en"
},
{
"word": "you",
"start": 1.4399999,
"end": 1.9399999,
"confidence": 0.99316406,
"speaker": 0,
"speaker_confidence": 0.7109375,
"punctuated_word": "you?",
"language": "en"
},
{
"word": "no",
"start": 3.6,
"end": 4,
"confidence": 0.7141113,
"speaker": 0,
"speaker_confidence": 0.7109375,
"punctuated_word": "No,",
"language": "en"
},
{
"word": "puedo",
"start": 4,
"end": 4.24,
"confidence": 0.9941406,
"speaker": 0,
"speaker_confidence": 0.7109375,
"punctuated_word": "puedo",
"language": "en"
},
{
"word": "hablar",
"start": 4.24,
"end": 4.56,
"confidence": 0.99902344,
"speaker": 0,
"speaker_confidence": 0.7109375,
"punctuated_word": "hablar",
"language": "en"
},
{
"word": "mucho",
"start": 4.56,
"end": 5.06,
"confidence": 0.76708984,
"speaker": 0,
"speaker_confidence": 0.7109375,
"punctuated_word": "mucho,",
"language": "en"
},
{
"word": "pero",
"start": 6.5879374,
"end": 7.0879374,
"confidence": 0.9951172,
"speaker": 0,
"speaker_confidence": 0.7109375,
"punctuated_word": "pero",
"language": "es"
},
{
"word": "cuando",
"start": 7.5479374,
"end": 8.047937,
"confidence": 0.94189453,
"speaker": 0,
"speaker_confidence": 0.7109375,
"punctuated_word": "cuando",
"language": "es"
},
{
"word": "hablo",
"start": 9.147938,
"end": 9.647938,
"confidence": 0.82421875,
"speaker": 0,
"speaker_confidence": 0.7109375,
"punctuated_word": "hablo,",
"language": "es"
},
{
"word": "sé",
"start": 10.267937,
"end": 10.667937,
"confidence": 0.29248047,
"speaker": 0,
"speaker_confidence": 0.7109375,
"punctuated_word": "sé",
"language": "es"
},
{
"word": "lo",
"start": 10.667937,
"end": 10.987937,
"confidence": 0.96240234,
"speaker": 0,
"speaker_confidence": 0.7109375,
"punctuated_word": "lo",
"language": "es"
},
{
"word": "mejor",
"start": 10.987937,
"end": 11.487937,
"confidence": 0.94677734,
"speaker": 0,
"speaker_confidence": 0.7109375,
"punctuated_word": "mejor.",
"language": "es"
}
],
"paragraphs": {
"transcript": "\nSpeaker 0: Hi, how are you? No, puedo hablar mucho, pero cuando hablo, sé lo mejor.",
"paragraphs": [
{
"sentences": [
{
"text": "Hi, how are you?",
"start": 0.96,
"end": 1.9399999
},
{
"text": "No, puedo hablar mucho, pero cuando hablo, sé lo mejor.",
"start": 3.6,
"end": 11.487937
}
],
"speaker": 0,
"num_words": 14,
"start": 0.96,
"end": 11.487937
}
]
}
}
]
}
],
"utterances": [
{
"start": 0.96,
"end": 1.9399999,
"confidence": 0.96777344,
"channel": 0,
"transcript": "Hi, how are you?",
"words": [
{
"word": "hi",
"start": 0.96,
"end": 1.04,
"confidence": 0.88964844,
"speaker": 0,
"speaker_confidence": 0.7109375,
"punctuated_word": "Hi,",
"language": "en"
},
{
"word": "how",
"start": 1.04,
"end": 1.36,
"confidence": 0.9902344,
"speaker": 0,
"speaker_confidence": 0.7109375,
"punctuated_word": "how",
"language": "en"
},
{
"word": "are",
"start": 1.36,
"end": 1.4399999,
"confidence": 0.9980469,
"speaker": 0,
"speaker_confidence": 0.7109375,
"punctuated_word": "are",
"language": "en"
},
{
"word": "you",
"start": 1.4399999,
"end": 1.9399999,
"confidence": 0.99316406,
"speaker": 0,
"speaker_confidence": 0.7109375,
"punctuated_word": "you?",
"language": "en"
}
],
"speaker": 0,
"id": "e82f0ae3-b3c0-453b-bcb4-66b77096abdc"
},
{
"start": 3.6,
"end": 5.06,
"confidence": 0.8685913,
"channel": 0,
"transcript": "No, puedo hablar mucho,",
"words": [
{
"word": "no",
"start": 3.6,
"end": 4,
"confidence": 0.7141113,
"speaker": 0,
"speaker_confidence": 0.7109375,
"punctuated_word": "No,",
"language": "en"
},
{
"word": "puedo",
"start": 4,
"end": 4.24,
"confidence": 0.9941406,
"speaker": 0,
"speaker_confidence": 0.7109375,
"punctuated_word": "puedo",
"language": "en"
},
{
"word": "hablar",
"start": 4.24,
"end": 4.56,
"confidence": 0.99902344,
"speaker": 0,
"speaker_confidence": 0.7109375,
"punctuated_word": "hablar",
"language": "en"
},
{
"word": "mucho",
"start": 4.56,
"end": 5.06,
"confidence": 0.76708984,
"speaker": 0,
"speaker_confidence": 0.7109375,
"punctuated_word": "mucho,",
"language": "en"
}
],
"speaker": 0,
"id": "d7307887-2776-45c3-a7ec-867c88904f49"
},
{
"start": 6.5879374,
"end": 7.0879374,
"confidence": 0.9951172,
"channel": 0,
"transcript": "pero",
"words": [
{
"word": "pero",
"start": 6.5879374,
"end": 7.0879374,
"confidence": 0.9951172,
"speaker": 0,
"speaker_confidence": 0.7109375,
"punctuated_word": "pero",
"language": "es"
}
],
"speaker": 0,
"id": "05fae306-c957-4007-9c65-9e02926af5d6"
},
{
"start": 7.5479374,
"end": 8.047937,
"confidence": 0.94189453,
"channel": 0,
"transcript": "cuando",
"words": [
{
"word": "cuando",
"start": 7.5479374,
"end": 8.047937,
"confidence": 0.94189453,
"speaker": 0,
"speaker_confidence": 0.7109375,
"punctuated_word": "cuando",
"language": "es"
}
],
"speaker": 0,
"id": "6aa8490c-c929-44f7-9ac1-bd67320d5994"
},
{
"start": 9.147938,
"end": 9.647938,
"confidence": 0.82421875,
"channel": 0,
"transcript": "hablo,",
"words": [
{
"word": "hablo",
"start": 9.147938,
"end": 9.647938,
"confidence": 0.82421875,
"speaker": 0,
"speaker_confidence": 0.7109375,
"punctuated_word": "hablo,",
"language": "es"
}
],
"speaker": 0,
"id": "494dc86b-e605-48d2-825a-0688a04be680"
},
{
"start": 10.267937,
"end": 11.487937,
"confidence": 0.7338867,
"channel": 0,
"transcript": "sé lo mejor.",
"words": [
{
"word": "sé",
"start": 10.267937,
"end": 10.667937,
"confidence": 0.29248047,
"speaker": 0,
"speaker_confidence": 0.7109375,
"punctuated_word": "sé",
"language": "es"
},
{
"word": "lo",
"start": 10.667937,
"end": 10.987937,
"confidence": 0.96240234,
"speaker": 0,
"speaker_confidence": 0.7109375,
"punctuated_word": "lo",
"language": "es"
},
{
"word": "mejor",
"start": 10.987937,
"end": 11.487937,
"confidence": 0.94677734,
"speaker": 0,
"speaker_confidence": 0.7109375,
"punctuated_word": "mejor.",
"language": "es"
}
],
"speaker": 0,
"id": "82e8a29a-5655-48af-929d-db17c1655410"
}
]
}
} |
Beta Was this translation helpful? Give feedback.
Replies: 2 comments 1 reply
-
|
Thanks for asking your question about Deepgram! If you didn't already include it in your post, please be sure to add as much detail as possible so we can assist you efficiently, such as:
|
Beta Was this translation helpful? Give feedback.
-
|
Hi @shenberg, there's an edge case where the English/Spanish words can get assigned to the opposite language. We're aware and have a plan for the fix, but it'll be a bit before it's updated. The diarization is also something we're working on. Thanks for the report and example! |
Beta Was this translation helpful? Give feedback.
Hi @shenberg, there's an edge case where the English/Spanish words can get assigned to the opposite language. We're aware and have a plan for the fix, but it'll be a bit before it's updated. The diarization is also something we're working on. Thanks for the report and example!