translate to english option for whisper

mkiol · Dec 9, 2023 · 2d52f56 · 2d52f56
1 parent 74e6c77
commit 2d52f56
Show file tree

Hide file tree

Showing 11 changed files with 84 additions and 12 deletions.
diff --git a/desktop/dsnote.metainfo.xml b/desktop/dsnote.metainfo.xml
@@ -104,6 +104,7 @@
       <li>New version of Faster Whisper Large model: 'FasterWhisper Large-v3'</li>
       <li>Whisper and Faster Whisper enabled for Chinese-Cantonese language</li>
       <li>Support for Speex audio codec in 'Transcribe a file'</li>
+      <li>Translate to English option for Whisper and Faster Whisper models.</li>
       </ul>
       <p>Text to Speech:</p>
       <ul>

diff --git a/desktop/qml/ChangelogPage.qml b/desktop/qml/ChangelogPage.qml
@@ -33,6 +33,9 @@ DialogPage {
         <li>New version of Faster Whisper Large model: <i>FasterWhisper Large-v3</i></li>
         <li>Whisper and Faster Whisper enabled for Chinese-Cantonese language (广东话)</li>
         <li>Support for Speex audio codec in <i>Transcribe a file</i></li>
+        <li>Translate to English option for Whisper and Faster Whisper models.
+            To automatically translate to English, use the switch to the right of the model selection box.
+            The option is only visible if you select a non-English Whisper or Faster Whisper model.</li>
         </ul>
         <p>" + qsTr("Text to Speech") + ":</p>
         <ul>

diff --git a/desktop/qml/ComboButton.qml b/desktop/qml/ComboButton.qml
@@ -18,10 +18,12 @@ RowLayout {
     property alias combo: _combo
     property alias combo2: _combo2
     property alias combo3: _combo3
+    property alias check: _check
     property alias frame: _frame
     property string comboToolTip: ""
     property string combo2ToolTip: ""
     property string combo3ToolTip: ""
+    property string checkToolTip: ""
     property string buttonToolTip: ""
     readonly property bool off: combo.model.length === 0
     property string comboPlaceholderText: ""
@@ -96,6 +98,17 @@ RowLayout {
                 ToolTip.text: root.combo3ToolTip
             }
 
+            Switch {  // optional toggle placed before the button; exposed to users of this component as 'check'
+                id: _check
+
+                visible: false  // hidden by default; a user of the component must override 'check.visible' to show it
+                Layout.alignment: Qt.AlignVCenter | Qt.AlignRight
+                enabled: !root.off  // 'off' is true when the first combo has no model entries
+                ToolTip.visible: hovered  // show the tooltip while the pointer hovers the switch
+                //ToolTip.delay: Qt.styleHints.mousePressAndHoldInterval
+                ToolTip.text: root.checkToolTip  // tooltip text is supplied through the 'checkToolTip' property
+            }
+
             Button {
                 id: _button
 

diff --git a/desktop/qml/Notepad.qml b/desktop/qml/Notepad.qml
@@ -39,6 +39,7 @@ ColumnLayout {
 
         if (app.stt_configured) {
             listenReadCombos.first.combo.currentIndex = app.active_stt_model_idx
+            listenReadCombos.first.check.checked = _settings.whisper_translate
         }
         if (app.tts_configured) {
             listenReadCombos.second.combo.currentIndex = app.active_tts_model_idx
@@ -93,11 +94,19 @@ ColumnLayout {
                                             app.state === DsnoteApp.StateListeningManual)
             comboToolTip: qsTr("Speech to Text model")
             comboPlaceholderText: qsTr("No Speech to Text model")
+            checkToolTip: qsTr("Translate to English")
             combo {
                 model: app.available_stt_models
                 onActivated: app.set_active_stt_model_idx(index)
                 currentIndex: app.active_stt_model_idx
             }
+            check {  // configures the "Translate to English" switch of the Speech to Text row
+                visible: app.stt_translate_needed  // shown only while the app reports that the active STT model needs it
+                checked: _settings.whisper_translate  // initial state mirrors the stored 'whisper_translate' setting
+                onClicked: {  // a click flips the stored setting
+                    _settings.whisper_translate = !_settings.whisper_translate  // NOTE(review): a click presumably also changes 'checked' on the Switch itself - confirm both stay in sync
+                }
+            }
             button {
                 text: qsTr("Listen")
                 onClicked: {
@@ -180,7 +189,6 @@ ColumnLayout {
                 }
                 onActivated: _settings.speech_speed = index + 1
             }
-
             button {
                 enabled: listenReadCombos.second.enabled &&
                          !listenReadCombos.second.off &&

diff --git a/src/dsnote_app.cpp b/src/dsnote_app.cpp
@@ -1794,12 +1794,15 @@ void dsnote_app::transcribe_file(const QString &source_file, bool replace) {
     int new_task = 0;
 
     if (s->launch_mode() == settings::launch_mode_t::app_stanalone) {
-        new_task = speech_service::instance()->stt_transcribe_file(source_file,
-                                                                   {}, {});
+        new_task = speech_service::instance()->stt_transcribe_file(
+            source_file, {},
+            s->whisper_translate() ? QStringLiteral("en") : QString{});
     } else {
         qDebug() << "[app => dbus] call SttTranscribeFile";
 
-        new_task = m_dbus_service.SttTranscribeFile(source_file, {}, {});
+        new_task = m_dbus_service.SttTranscribeFile(
+            source_file, {},
+            s->whisper_translate() ? QStringLiteral("en") : QString{});
     }
 
     m_side_task.set(new_task);
@@ -1832,11 +1835,12 @@ void dsnote_app::listen_internal() {
     if (s->launch_mode() == settings::launch_mode_t::app_stanalone) {
         new_task = speech_service::instance()->stt_start_listen(
             static_cast<speech_service::speech_mode_t>(s->speech_mode()), {},
-            {});
+            s->whisper_translate() ? QStringLiteral("en") : QString{});
     } else {
         qDebug() << "[app => dbus] call SttStartListen:" << s->speech_mode();
         new_task = m_dbus_service.SttStartListen(
-            static_cast<int>(s->speech_mode()), {}, {});
+            static_cast<int>(s->speech_mode()), {},
+            s->whisper_translate() ? QStringLiteral("en") : QString{});
     }
 
     m_primary_task.set(new_task);
@@ -3157,6 +3161,15 @@ QString dsnote_app::download_content(const QUrl &url) {
     return text;
 }
 
+bool dsnote_app::stt_translate_needed_by_id(const QString &id) const {
+    // an id that is not among the available STT models never needs translate
+    if (!m_available_stt_models_map.contains(id)) return false;
+
+    // entry at index 2 is scanned for 't' (presumably the model's option flags)
+    const auto fields = m_available_stt_models_map.value(id).toStringList();
+    return fields.size() > 2 ? fields.at(2).contains('t') : false;
+}
+
 bool dsnote_app::tts_ref_voice_needed_by_id(const QString &id) const {
     if (m_available_tts_models_map.contains(id)) {
         auto model = m_available_tts_models_map.value(id).toStringList();
@@ -3166,6 +3179,10 @@ bool dsnote_app::tts_ref_voice_needed_by_id(const QString &id) const {
     return false;
 }
 
+bool dsnote_app::stt_translate_needed() const {  // READ accessor of the 'stt_translate_needed' Q_PROPERTY
+    return stt_translate_needed_by_id(m_active_stt_model);  // answer for the currently active STT model
+}
+
 bool dsnote_app::tts_ref_voice_needed() const {
     return tts_ref_voice_needed_by_id(m_active_tts_model);
 }

diff --git a/src/dsnote_app.h b/src/dsnote_app.h
@@ -68,6 +68,8 @@ class dsnote_app : public QObject {
                    active_stt_model_changed)
     Q_PROPERTY(QVariantList available_stt_models READ available_stt_models
                    NOTIFY available_stt_models_changed)
+    Q_PROPERTY(bool stt_translate_needed READ stt_translate_needed NOTIFY
+                   active_stt_model_changed)
 
     // tts ref voices
     Q_PROPERTY(int active_tts_ref_voice_idx READ active_tts_ref_voice_idx NOTIFY
@@ -657,7 +659,9 @@ class dsnote_app : public QObject {
     bool feature_text_active_window() const;
     bool feature_coqui_tts() const;
     void request_reload();
+    bool stt_translate_needed_by_id(const QString &id) const;
     bool tts_ref_voice_needed_by_id(const QString &id) const;
+    bool stt_translate_needed() const;
     bool tts_ref_voice_needed() const;
     bool tts_for_in_mnt_ref_voice_needed() const;
     bool tts_for_out_mnt_ref_voice_needed() const;

diff --git a/src/fasterwhisper_engine.cpp b/src/fasterwhisper_engine.cpp
@@ -272,7 +272,9 @@ void fasterwhisper_engine::decode_speech(const whisper_buf_t& buf) {
 
                          auto seg_tuple = m_model->attr("transcribe")(
                              "audio"_a = array, "beam_size"_a = 5,
-                             "language"_a = m_config.lang);
+                             "language"_a = m_config.lang,
+                             "task"_a = m_config.translate ? "translate"
+                                                           : "transcribe");
 
                          auto segments = *seg_tuple.cast<py::list>().begin();
 

diff --git a/src/models_manager.cpp b/src/models_manager.cpp
@@ -1608,6 +1608,7 @@ models_manager::feature_flags models_manager::add_explicit_feature_flags(
                 existing_features, feature_flags::medium_processing);
             existing_features = add_new_feature(existing_features,
                                                 feature_flags::medium_quality);
+            break;
         case model_engine_t::tts_piper:
             existing_features =
                 add_new_feature(existing_features, feature_flags::high_quality);
@@ -1947,10 +1948,15 @@ auto models_manager::extract_models(
             }
         }
 
-        // add char replacement option for all coqui tts models
         if (model.engine == model_engine_t::tts_coqui &&
             !model.options.contains('c')) {
+            // add char replacement option for all coqui tts models
             model.options.push_back('c');
+        } else if ((model.engine == model_engine_t::stt_whisper ||
+                    model.engine == model_engine_t::stt_fasterwhisper) &&
+                   !model.options.contains('t') && model.lang_id != "en") {
+            // add translate option for all whisper stt models
+            model.options.push_back('t');
         }
 
         models.emplace(model_id, std::move(model));

diff --git a/src/settings.cpp b/src/settings.cpp
@@ -1437,6 +1437,17 @@ void settings::set_mnt_clean_text(bool value) {
     }
 }
 
+bool settings::whisper_translate() const {  // READ accessor of the 'whisper_translate' Q_PROPERTY
+    return value(QStringLiteral("whisper_translate"), false).toBool();  // 'false' is passed as the default value for the key
+}
+
+void settings::set_whisper_translate(bool value) {
+    // nothing to store and nothing to announce when the value is unchanged
+    if (value == whisper_translate()) return;
+
+    setValue(QStringLiteral("whisper_translate"), value);
+    emit whisper_translate_changed();
+}
+
 bool settings::gpu_scan_hip() const {
     return value(QStringLiteral("gpu_scan_hip"), true).toBool();
 }

diff --git a/src/settings.h b/src/settings.h
@@ -141,6 +141,8 @@ class settings : public QSettings, public singleton<settings> {
                            active_tts_for_out_mnt_ref_voice_changed)
     Q_PROPERTY(bool mnt_clean_text READ mnt_clean_text WRITE set_mnt_clean_text
                    NOTIFY mnt_clean_text_changed)
+    Q_PROPERTY(bool whisper_translate READ whisper_translate WRITE
+                   set_whisper_translate NOTIFY whisper_translate_changed)
 
     // service
     Q_PROPERTY(QString models_dir READ models_dir WRITE set_models_dir NOTIFY
@@ -372,6 +374,8 @@ class settings : public QSettings, public singleton<settings> {
     void set_active_tts_for_out_mnt_ref_voice(const QString &value);
     bool mnt_clean_text() const;
     void set_mnt_clean_text(bool value);
+    bool whisper_translate() const;
+    void set_whisper_translate(bool value);
 
     Q_INVOKABLE QUrl app_icon() const;
     Q_INVOKABLE bool py_supported() const;
@@ -505,6 +509,7 @@ class settings : public QSettings, public singleton<settings> {
     void active_tts_for_in_mnt_ref_voice_changed();
     void active_tts_for_out_mnt_ref_voice_changed();
     void mnt_clean_text_changed();
+    void whisper_translate_changed();
 
     // service
     void models_dir_changed();

diff --git a/src/speech_service.cpp b/src/speech_service.cpp
@@ -1023,9 +1023,9 @@ static std::optional<typename Engine::gpu_device_t> make_gpu_device(
     return std::nullopt;
 }
 
-QString speech_service::restart_stt_engine(
-    speech_mode_t speech_mode, const QString &model_id,
-    [[maybe_unused]] const QString &out_lang_id) {
+QString speech_service::restart_stt_engine(speech_mode_t speech_mode,
+                                           const QString &model_id,
+                                           const QString &out_lang_id) {
     auto model_config = choose_model_config(engine_t::stt, model_id);
     if (model_config && model_config->stt) {
         stt_engine::config_t config;
@@ -1041,7 +1041,8 @@ QString speech_service::restart_stt_engine(
         config.lang_code = model_config->stt->lang_code.toStdString();
         config.speech_mode =
             static_cast<stt_engine::speech_mode_t>(speech_mode);
-        config.translate = false;
+        config.translate = !out_lang_id.isEmpty() && out_lang_id == "en" &&
+                           config.lang != "en";
         config.options = model_config->options.toStdString();
 
         if (settings::instance()->stt_use_gpu() &&
@@ -1081,6 +1082,7 @@ QString speech_service::restart_stt_engine(
 
             if (m_stt_engine->model_files() != config.model_files) return true;
             if (m_stt_engine->lang() != config.lang) return true;
+            if (m_stt_engine->translate() != config.translate) return true;
             if (config.use_gpu != m_stt_engine->use_gpu() ||
                 config.gpu_device != m_stt_engine->gpu_device())
                 return true;