In [1]:
import pyaudio

# PyAudio 객체 생성
p = pyaudio.PyAudio()

# 연결된 모든 오디오 입력 장치 정보 출력
print("Available audio input devices:")
for i in range(p.get_device_count()):
    device_info = p.get_device_info_by_index(i)
    if device_info["maxInputChannels"] > 0:  # 입력 채널이 있는 장치만 출력
        print(f"Device ID: {i}")
        print(f"  Name: {device_info['name']}")
        print(f"  Max Input Channels: {device_info['maxInputChannels']}")
        print(f"  Default Sample Rate: {device_info['defaultSampleRate']}")
        print()

# PyAudio 종료
p.terminate()



Available audio input devices:
Device ID: 0
  Name: Microsoft Sound Mapper - Input
  Max Input Channels: 2
  Default Sample Rate: 44100.0

Device ID: 1
  Name: 마이크(2- MATA STUDIO C10)
  Max Input Channels: 2
  Default Sample Rate: 44100.0

Device ID: 2
  Name: Britz 마이크(Britz BR-MICBAR)
  Max Input Channels: 1
  Default Sample Rate: 44100.0

Device ID: 8
  Name: 주 사운드 캡처 드라이버
  Max Input Channels: 2
  Default Sample Rate: 44100.0

Device ID: 9
  Name: 마이크(2- MATA STUDIO C10)
  Max Input Channels: 2
  Default Sample Rate: 44100.0

Device ID: 10
  Name: Britz 마이크(Britz BR-MICBAR)
  Max Input Channels: 1
  Default Sample Rate: 44100.0

Device ID: 20
  Name: 마이크(2- MATA STUDIO C10)
  Max Input Channels: 2
  Default Sample Rate: 48000.0

Device ID: 21
  Name: Britz 마이크(Britz BR-MICBAR)
  Max Input Channels: 1
  Default Sample Rate: 48000.0

Device ID: 24
  Name: 마이크 (MATA STUDIO C10)
  Max Input Channels: 2
  Default Sample Rate: 44100.0

Device ID: 27
  Name: 마이크 (Britz BR-MICBAR)
  Max In

MODEL 1

In [1]:
from PyQt5 import QtCore, QtGui, QtWidgets
import webrtcvad
import pyaudio
import numpy as np
import wave
import noisereduce as nr
from faster_whisper import WhisperModel
import threading
import time

class STTWorker(QtCore.QThread):
    text_update = QtCore.pyqtSignal(str)  # Signal to update text in the UI
    stt_started = QtCore.pyqtSignal()     # Signal to indicate STT processing started
    stt_finished = QtCore.pyqtSignal(str)  # Signal to indicate STT processing finished with the result

    def __init__(self, model_size="small"):
        super(STTWorker, self).__init__()
        self.vad = webrtcvad.Vad(3)
        self.model = WhisperModel(model_size, device="cpu")
        self.sample_rate = 16000
        self.frame_duration = 20
        self.frame_size = int(self.sample_rate * self.frame_duration / 1000)
        self.channels = 1
        self.running = False

        self.p = pyaudio.PyAudio()
        self.stream = self.p.open(format=pyaudio.paInt16, channels=self.channels,
                                  rate=self.sample_rate, input=True,
                                  frames_per_buffer=self.frame_size)

    def run(self):
        self.running = True
        while self.running:
            for segment in self.vad_segments():
                self.stt_started.emit()  # Emit signal to indicate STT processing started
                start_time = time.time()  # Record start time for this segment
                clean_segment = self.reduce_noise(segment)
                stt_result = self.transcribe_audio(clean_segment)
                duration = time.time() - start_time  # Calculate elapsed time for this segment
                for text_segment in stt_result:
                    result_with_time = f"{text_segment.text} ({duration:.2f}s)"
                    self.stt_finished.emit(result_with_time)  # Emit signal with duration and text

    def stop(self):
        self.running = False
        self.stream.stop_stream()
        self.stream.close()
        self.p.terminate()
        self.quit()  # Stop the thread

    def vad_segments(self):
        frames = []
        while self.running:
            audio_frame = self.stream.read(self.frame_size)
            is_speech = self.vad.is_speech(audio_frame, self.sample_rate)
            if is_speech:
                frames.append(audio_frame)
            elif frames:
                yield b''.join(frames)
                frames = []

    def reduce_noise(self, audio_data):
        audio_array = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32)
        reduced_noise = nr.reduce_noise(y=audio_array, sr=self.sample_rate)
        return reduced_noise.astype(np.int16).tobytes()

    def transcribe_audio(self, audio_data):
        audio_array = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0
        segments, _ = self.model.transcribe(audio_array, language='ko')
        return segments

class Ui_STT(object):
    def setupUi(self, STT):
        STT.setObjectName("STT")
        STT.resize(600, 400)  # Increased the window size
        STT.setCursor(QtGui.QCursor(QtCore.Qt.ArrowCursor))
        
        # TextEdit
        self.textEdit = QtWidgets.QTextEdit(STT)
        self.textEdit.setGeometry(QtCore.QRect(20, 30, 560, 280))  # Adjusted for the new window size
        self.textEdit.setObjectName("textEdit")

        # Buttons
        self.pushButton = QtWidgets.QPushButton(STT)
        self.pushButton.setGeometry(QtCore.QRect(20, 330, 101, 61))
        self.pushButton.setObjectName("pushButton")
        
        self.pushButton_2 = QtWidgets.QPushButton(STT)
        self.pushButton_2.setGeometry(QtCore.QRect(140, 330, 101, 61))
        self.pushButton_2.setObjectName("pushButton_2")
        
        self.pushButton_3 = QtWidgets.QPushButton(STT)
        self.pushButton_3.setGeometry(QtCore.QRect(260, 330, 101, 61))
        self.pushButton_3.setObjectName("pushButton_3")

        self.pushButton_small = QtWidgets.QPushButton(STT)
        self.pushButton_small.setGeometry(QtCore.QRect(380, 330, 101, 61))
        self.pushButton_small.setObjectName("pushButton_small")
        
        self.pushButton_medium = QtWidgets.QPushButton(STT)
        self.pushButton_medium.setGeometry(QtCore.QRect(500, 330, 101, 61))
        self.pushButton_medium.setObjectName("pushButton_medium")

        self.retranslateUi(STT)
        QtCore.QMetaObject.connectSlotsByName(STT)

        # Connect buttons to their functions
        self.pushButton.clicked.connect(self.prepare_recording)
        self.pushButton_2.clicked.connect(self.stop_recording)
        self.pushButton_3.clicked.connect(self.clear_text)
        self.pushButton_small.clicked.connect(self.set_small_model)
        self.pushButton_medium.clicked.connect(self.set_medium_model)

        self.stt_worker = None
        self.model_size = "small"  # Default model size

    def retranslateUi(self, STT):
        _translate = QtCore.QCoreApplication.translate
        STT.setWindowTitle(_translate("STT", "STT"))
        self.pushButton.setText(_translate("STT", "녹음"))
        self.pushButton_2.setText(_translate("STT", "종료"))
        self.pushButton_3.setText(_translate("STT", "출력 초기화"))
        self.pushButton_small.setText(_translate("STT", "small"))
        self.pushButton_medium.setText(_translate("STT", "medium"))

    def prepare_recording(self):
        self.textEdit.append("Wait...")
        QtCore.QTimer.singleShot(3000, self.start_recording)  # 3초 후 녹음 시작

    def start_recording(self):
        if self.stt_worker is None or not self.stt_worker.isRunning():
            self.textEdit.append(f"녹음을 시작합니다... (모델: {self.model_size})")
            self.stt_worker = STTWorker(model_size=self.model_size)
            self.stt_worker.text_update.connect(self.update_text)
            self.stt_worker.stt_started.connect(self.show_processing)
            self.stt_worker.stt_finished.connect(self.update_text)
            self.stt_worker.start()

    def stop_recording(self):
        if self.stt_worker is not None:
            self.textEdit.append("녹음을 종료합니다...")
            self.stt_worker.stop()
            self.stt_worker = None

    def clear_text(self):
        self.textEdit.clear()

    def set_small_model(self):
        self.model_size = "small"
        self.textEdit.append("모델이 'small'로 설정되었습니다.")

    def set_medium_model(self):
        self.model_size = "medium"
        self.textEdit.append("모델이 'medium'로 설정되었습니다.")

    def update_text(self, text):
        self.textEdit.append(f"Speech: {text}")

    def show_processing(self):
        self.textEdit.append("번역중입니다...")

if __name__ == "__main__":
    import sys
    app = QtWidgets.QApplication(sys.argv)
    STT = QtWidgets.QWidget()
    ui = Ui_STT()
    ui.setupUi(STT)
    STT.show()
    sys.exit(app.exec_())


  from .autonotebook import tqdm as notebook_tqdm


SystemExit: 0

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


MODEL 2 (버튼눌러서 녹음)

In [1]:
from PyQt5 import QtCore, QtGui, QtWidgets
import webrtcvad
import pyaudio
import numpy as np
import wave
import noisereduce as nr
from faster_whisper import WhisperModel
import time

class STTWorker(QtCore.QThread):
    text_update = QtCore.pyqtSignal(str)  # Signal to update text in the UI
    stt_finished = QtCore.pyqtSignal(str)  # Signal to indicate STT processing finished with the result

    def __init__(self, model_size="small"):
        super(STTWorker, self).__init__()
        self.vad = webrtcvad.Vad(3)
        self.model = WhisperModel(model_size, device="cpu")
        self.sample_rate = 16000
        self.frame_duration = 20
        self.frame_size = int(self.sample_rate * self.frame_duration / 1000)
        self.channels = 1
        self.running = False
        self.frames = []

        self.p = pyaudio.PyAudio()
        self.stream = self.p.open(format=pyaudio.paInt16, channels=self.channels,
                                  rate=self.sample_rate, input=True,
                                  frames_per_buffer=self.frame_size)

    def run(self):
        self.running = True
        self.frames = []
        while self.running:
            audio_frame = self.stream.read(self.frame_size)
            self.frames.append(audio_frame)

    def stop(self):
        self.running = False
        self.process_audio()

    def process_audio(self):
        audio_data = b''.join(self.frames)
        start_time = time.time()
        clean_segment = self.reduce_noise(audio_data)
        stt_result = self.transcribe_audio(clean_segment)
        duration = time.time() - start_time
        for text_segment in stt_result:
            result_with_time = f"{text_segment.text} ({duration:.2f}s)"
            self.stt_finished.emit(result_with_time)

    def reduce_noise(self, audio_data):
        audio_array = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32)
        reduced_noise = nr.reduce_noise(y=audio_array, sr=self.sample_rate)
        return reduced_noise.astype(np.int16).tobytes()

    def transcribe_audio(self, audio_data):
        audio_array = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0
        segments, _ = self.model.transcribe(audio_array, language='ko')
        return segments

class Ui_STT(object):
    def setupUi(self, STT):
        STT.setObjectName("STT")
        STT.resize(600, 400)  # Increased the window size
        STT.setCursor(QtGui.QCursor(QtCore.Qt.ArrowCursor))
        
        # TextEdit
        self.textEdit = QtWidgets.QTextEdit(STT)
        self.textEdit.setGeometry(QtCore.QRect(20, 30, 560, 280))  # Adjusted for the new window size
        self.textEdit.setObjectName("textEdit")

        # Buttons
        self.pushButton = QtWidgets.QPushButton(STT)
        self.pushButton.setGeometry(QtCore.QRect(20, 330, 101, 61))
        self.pushButton.setObjectName("pushButton")
        
        self.pushButton_2 = QtWidgets.QPushButton(STT)
        self.pushButton_2.setGeometry(QtCore.QRect(140, 330, 101, 61))
        self.pushButton_2.setObjectName("pushButton_2")
        
        self.pushButton_3 = QtWidgets.QPushButton(STT)
        self.pushButton_3.setGeometry(QtCore.QRect(260, 330, 101, 61))
        self.pushButton_3.setObjectName("pushButton_3")

        self.pushButton_small = QtWidgets.QPushButton(STT)
        self.pushButton_small.setGeometry(QtCore.QRect(380, 330, 101, 61))
        self.pushButton_small.setObjectName("pushButton_small")
        
        self.pushButton_medium = QtWidgets.QPushButton(STT)
        self.pushButton_medium.setGeometry(QtCore.QRect(500, 330, 101, 61))
        self.pushButton_medium.setObjectName("pushButton_medium")

        self.retranslateUi(STT)
        QtCore.QMetaObject.connectSlotsByName(STT)

        # Connect buttons to their functions
        self.pushButton.clicked.connect(self.start_recording)
        self.pushButton_2.clicked.connect(self.stop_recording)
        self.pushButton_3.clicked.connect(self.clear_text)
        self.pushButton_small.clicked.connect(self.set_small_model)
        self.pushButton_medium.clicked.connect(self.set_medium_model)

        self.stt_worker = None
        self.model_size = "small"  # Default model size

    def retranslateUi(self, STT):
        _translate = QtCore.QCoreApplication.translate
        STT.setWindowTitle(_translate("STT", "STT"))
        self.pushButton.setText(_translate("STT", "녹음"))
        self.pushButton_2.setText(_translate("STT", "종료"))
        self.pushButton_3.setText(_translate("STT", "출력 초기화"))
        self.pushButton_small.setText(_translate("STT", "small"))
        self.pushButton_medium.setText(_translate("STT", "medium"))

    def start_recording(self):
        self.textEdit.append(f"녹음을 시작합니다... (모델: {self.model_size})")
        self.stt_worker = STTWorker(model_size=self.model_size)
        self.stt_worker.text_update.connect(self.update_text)
        self.stt_worker.stt_finished.connect(self.update_text)
        self.stt_worker.start()

    def stop_recording(self):
        if self.stt_worker is not None and self.stt_worker.isRunning():
            self.stt_worker.stop()

    def clear_text(self):
        self.textEdit.clear()

    def set_small_model(self):
        self.model_size = "small"
        self.textEdit.append("모델이 'small'로 설정되었습니다.")

    def set_medium_model(self):
        self.model_size = "medium"
        self.textEdit.append("모델이 'medium'로 설정되었습니다.")

    def update_text(self, text):
        self.textEdit.append(f"Speech: {text}")

if __name__ == "__main__":
    import sys
    app = QtWidgets.QApplication(sys.argv)
    STT = QtWidgets.QWidget()
    ui = Ui_STT()
    ui.setupUi(STT)
    STT.show()
    sys.exit(app.exec_())


  from .autonotebook import tqdm as notebook_tqdm


SystemExit: 0

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
