# Cascaded SDS 시스템 테스트 입니다.
Whisper - GPT2 - googleTTS

In [None]:
!pip install torch transformers gtts sounddevice soundfile numpy pygame librosa

Collecting torch
  Downloading torch-2.5.1-cp312-cp312-manylinux1_x86_64.whl.metadata (28 kB)
Collecting transformers
  Downloading transformers-4.46.3-py3-none-any.whl.metadata (44 kB)
Collecting gtts
  Downloading gTTS-2.5.4-py3-none-any.whl.metadata (4.1 kB)
Collecting sounddevice
  Downloading sounddevice-0.5.1-py3-none-any.whl.metadata (1.4 kB)
Collecting soundfile
  Using cached soundfile-0.12.1-py2.py3-none-manylinux_2_31_x86_64.whl.metadata (14 kB)
Collecting numpy
  Downloading numpy-2.1.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
Collecting pygame
  Downloading pygame-2.6.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting librosa
  Using cached librosa-0.10.2.post1-py3-none-any.whl.metadata (8.6 kB)
Collecting filelock (from torch)
  Downloading filelock-3.16.1-py3-none-any.whl.metadata (2.9 kB)
Collecting networkx (from torch)
  Downloading networkx-3.4.2-py3-none-any.whl.metadata (6.3 kB)
Collecting jin

In [5]:
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from gtts import gTTS
# import sounddevice as sd
import soundfile as sf
import numpy as np
# import pygame
import time
import os
import librosa

In [7]:
print(torch.cuda.is_available())
print(torch.cuda.device_count())

True
3


In [18]:
class SpeechAIAssistant:
    def __init__(self, force_cpu=False):
        print("Initializing Speech AI Assistant...")

        # Set up device
        self.device = self._setup_device(force_cpu)
        print(f"Using device: {self.device}")

        # Initialize Whisper for STT
        print("Loading Whisper model...")
        self.whisper_processor = WhisperProcessor.from_pretrained("openai/whisper-base")
        self.whisper_model = WhisperForConditionalGeneration.from_pretrained(
            "openai/whisper-base"
        )
        self.whisper_model = self.whisper_model.to(self.device)

        # Initialize GPT-2 for text generation
        print("Loading GPT-2 model...")
        self.gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
        self.gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2")
        self.gpt2_model = self.gpt2_model.to(self.device)

        # Initialize pygame for audio playback
        # pygame.mixer.init()

        # Print model memory usage if using CUDA
        if self.device.type == "cuda":
            self._print_gpu_memory_usage()

        print("Initialization complete!")

    def _setup_device(self, force_cpu):
        """Setup the device (CPU/GPU) for model inference"""
        if force_cpu:
            return torch.device("cpu")

        if torch.cuda.is_available():
            # Get the GPU with the most free memory
            gpu_id = 0
            if torch.cuda.device_count() > 1:
                free_memory = []
                for i in range(torch.cuda.device_count()):
                    torch.cuda.set_device(i)
                    torch.cuda.empty_cache()
                    free_memory.append(
                        torch.cuda.get_device_properties(i).total_memory
                        - torch.cuda.memory_allocated(i)
                    )
                gpu_id = free_memory.index(max(free_memory))

            device = torch.device(f"cuda:{gpu_id}")
            print(
                f"Found {torch.cuda.device_count()} GPU(s), using GPU {gpu_id}: {torch.cuda.get_device_name(gpu_id)}"
            )
            return device
        else:
            print("No GPU found, using CPU")
            return torch.device("cpu")

    def _print_gpu_memory_usage(self):
        """Print current GPU memory usage"""
        if self.device.type == "cuda":
            print("\nGPU Memory Usage:")
            print(
                f"Allocated: {torch.cuda.memory_allocated(self.device) / 1024**2:.2f} MB"
            )
            print(f"Cached: {torch.cuda.memory_reserved(self.device) / 1024**2:.2f} MB")

    def read_audio_file(self, file_path, target_sr=16000):
        """
        Read audio file and preprocess it for the Whisper model
        Supports various audio formats (wav, mp3, etc.)

        Args:
            file_path (str): Path to the audio file
            target_sr (int): Target sampling rate (Whisper expects 16kHz)

        Returns:
            numpy.ndarray: Processed audio array
        """
        try:
            # Load audio file and resample if necessary
            print(f"Reading audio file: {file_path}")
            audio, sr = librosa.load(file_path, sr=target_sr)

            # Convert to mono if stereo
            if len(audio.shape) > 1:
                audio = librosa.to_mono(audio)

            # Normalize audio
            audio = librosa.util.normalize(audio)

            print(
                f"Successfully loaded audio file: duration = {len(audio)/target_sr:.2f}s"
            )
            return audio

        except Exception as e:
            print(f"Error reading audio file: {str(e)}")
            raise

    def process_audio_file(self, file_path):
        """
        Process an audio file through the entire pipeline

        Args:
            file_path (str): Path to the audio file

        Returns:
            tuple: (original_text, assistant_response)
        """
        try:
            # Read and process audio file
            audio = self.read_audio_file(file_path)

            # Convert speech to text
            text = self.speech_to_text(audio)
            print(f"Transcription: [{text}]")

            # Generate response
            response = self.generate_response(text)
            print(f"Assistant: [{response}]")

            # Convert response to speech
            self.text_to_speech(response)

            return text, response

        except Exception as e:
            print(f"Error processing audio file: {str(e)}")
            raise

    # def record_audio(self, duration=5, sample_rate=16000):
    #     """Record audio from microphone"""
    #     print("Recording... Speak now!")
    #     recording = sd.rec(
    #         int(duration * sample_rate), samplerate=sample_rate, channels=1
    #     )
    #     sd.wait()
    #     print("Recording complete!")
    #     return recording

    def speech_to_text(self, audio):
        """Convert speech to text using Whisper"""
        # Convert the numpy array to the correct format and move to device
        input_features = self.whisper_processor(
            audio.squeeze(), sampling_rate=16000, return_tensors="pt"
        ).input_features

        input_features = input_features.to(self.device)

        # Generate token ids
        with torch.no_grad():
            predicted_ids = self.whisper_model.generate(input_features)

        # Decode token ids to text
        transcription = self.whisper_processor.batch_decode(
            predicted_ids, skip_special_tokens=True
        )[0]

        return transcription

    def generate_response(self, text):
        """Generate response using GPT-2"""
        # Encode the input text and move to device
        inputs = self.gpt2_tokenizer.encode(
            "me:" + text + " Response:", return_tensors="pt"
        ).to(self.device)

        # Generate response
        with torch.no_grad():
            outputs = self.gpt2_model.generate(
                inputs,
                max_length=100,
                num_return_sequences=1,
                no_repeat_ngram_size=2,
                temperature=0.5,
            )

        response = self.gpt2_tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Extract only the assistant's response
        response = response.split("Response:")[-1].strip()

        return response

    def text_to_speech(self, text):
        """Convert text to speech using gTTS"""
        # Generate speech
        tts = gTTS(text=text, lang="en")

        # Save to temporary file
        temp_file = "temp_speech.mp3"
        tts.save(temp_file)

        # Play the audio
        # pygame.mixer.music.load(temp_file)
        # pygame.mixer.music.play()

        # # Wait for audio to finish
        # while pygame.mixer.music.get_busy():
        #     time.sleep(0.1)

        # # Clean up
        # pygame.mixer.music.unload()
        # os.remove(temp_file)

    def process_single_interaction(self):
        """Process a single interaction with the assistant"""
        # Record audio
        audio = self.record_audio()

        # Convert speech to text
        text = self.speech_to_text(audio)
        print(f"You said: {text}")

        # Generate response
        response = self.generate_response(text)
        print(f"Assistant: {response}")

        # Convert response to speech
        self.text_to_speech(response)

        return text, response

    def start_conversation(self, num_turns=3):
        """Start a conversation with the specified number of turns"""
        print("Starting conversation...")
        for i in range(num_turns):
            print(f"\nTurn {i+1}/{num_turns}")
            self.process_single_interaction()
        print("\nConversation ended.")

In [19]:
# Example usage
# Create assistant instance with optional GPU selection
assistant = SpeechAIAssistant(force_cpu=False)  # Set to True to force CPU usage

try:
    # Optional: Monitor GPU memory before processing
    if assistant.device.type == "cuda":
        assistant._print_gpu_memory_usage()

    # Test with an audio file
    print("\nTesting with audio file...")
    test_file = "assets/00006.wav"  # Replace with your audio file path
    text, response = assistant.process_audio_file(test_file)
    assistant.text_to_speech(response)

    # Optional: Monitor GPU memory after processing
    if assistant.device.type == "cuda":
        assistant._print_gpu_memory_usage()

    # print("\nStarting live conversation...")
    # # Start conversation with 3 turns
    # assistant.start_conversation(num_turns=3)
except KeyboardInterrupt:
    print("\nConversation interrupted by user.")
except Exception as e:
    print(f"An error occurred: {str(e)}")
finally:
    # Clean up GPU memory
    if assistant.device.type == "cuda":
        torch.cuda.empty_cache()

Initializing Speech AI Assistant...
Found 3 GPU(s), using GPU 0: NVIDIA A100 80GB PCIe
Using device: cuda:0
Loading Whisper model...
Loading GPT-2 model...


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



GPU Memory Usage:
Allocated: 774.22 MB
Cached: 814.00 MB
Initialization complete!

GPU Memory Usage:
Allocated: 774.22 MB
Cached: 814.00 MB

Testing with audio file...
Reading audio file: assets/00006.wav
Successfully loaded audio file: duration = 4.96s
Transcription: [ It's also, I mean that helps a lot with the scenes because you're very much alive]
Assistant: [I think it's a little bit of a relief to have that. I'm not sure if it helps you in the sense that you don't have to be a hero to get what you want. But I do think that it makes you feel more alive. And I don' think you have a problem with that, because it doesn't feel like you've been killed. It feels]

GPU Memory Usage:
Allocated: 774.22 MB
Cached: 878.00 MB
