In [None]:
import functools
import os

import gradio as gr
import numpy as np
import soundfile as sf
import torch
import torchaudio
import whisper
from pyannote.audio import Pipeline, Audio as PyannoteAudio
from pyannote.core import Segment
from pydub import AudioSegment
from turkish_lm_tuner import TextPredictor

# Function to save audio array to a temporary WAV file
def save_audio_to_wav(audio, sample_rate):
    """Write *audio* to ``temp_audio.wav`` and return that path.

    The samples and *sample_rate* are handed straight to ``sf.write``. The
    output location is fixed, so every call writes to the same file.
    """
    wav_path = "temp_audio.wav"
    sf.write(wav_path, audio, sample_rate)
    return wav_path

# Function to convert to WAV format if necessary
def convert_to_wav(audio_file_path):
    """Return a path to a WAV version of *audio_file_path*.

    A path whose extension is already ``.wav`` (compared case-insensitively)
    is returned untouched. Anything else is loaded with pydub and exported
    as WAV under the same base name with the extension swapped for ``.wav``.
    """
    stem, extension = os.path.splitext(audio_file_path)
    if extension.lower() == ".wav":
        # Already WAV: nothing to convert.
        return audio_file_path

    target_path = f"{stem}.wav"
    AudioSegment.from_file(audio_file_path).export(target_path, format="wav")
    return target_path

# Function to convert stereo to mono
def stereo_to_mono(waveform):
    """Collapse a two-channel waveform to a single channel.

    Only a 2-D array whose first axis has length 2 is treated as stereo; its
    two rows are averaged element-wise. Any other input is returned unchanged.
    """
    is_stereo = waveform.ndim == 2 and waveform.shape[0] == 2
    return np.mean(waveform, axis=0) if is_stereo else waveform

# Text classifiers used by analyze_text; both are created once, when this
# module-level code runs.
# Sentiment classifier.
sentiment_predictor = TextPredictor(
    model_name='boun-tabi-LMG/turna_classification_17bintweet_sentiment',
    task='sentiment',
)
# Topic (categorization) classifier.
topic_predictor = TextPredictor(
    model_name='boun-tabi-LMG/turna_classification_ttc4900',
    task='categorization',
)

# Function to analyze text and return sentiment and topic
def analyze_text(text):
    """Run both module-level predictors on *text*.

    Returns a ``(sentiment, topic)`` tuple holding whatever
    ``sentiment_predictor.predict`` and ``topic_predictor.predict`` return,
    called in that order.
    """
    return sentiment_predictor.predict(text), topic_predictor.predict(text)

# Function to process audio and return transcriptions
def process_audio(audio_file_path, num_speakers=3):
    """Diarize, transcribe and analyse the speech in an audio file.

    The file is converted to WAV if needed, split into speaker turns with
    the pyannote diarization pipeline, and each turn is transcribed with
    Whisper (language fixed to Turkish). The text of every speaker is then
    concatenated and passed to ``analyze_text``.

    Args:
        audio_file_path: Path to the recording. Non-WAV files are converted
            with ``convert_to_wav`` first.
        num_speakers: Value forwarded to the diarization pipeline as its
            ``num_speakers`` argument. Defaults to 3.

    Returns:
        A newline-joined string: one ``start - end - speaker: text`` line per
        diarized segment, followed by one sentiment/topic line per speaker.

    Raises:
        RuntimeError: If ``audio_file_path`` does not exist.
    """
    # Validate the input *before* conversion; otherwise a missing non-WAV
    # file would fail inside the converter and never reach this check.
    if not os.path.exists(audio_file_path):
        raise RuntimeError(f"File does not exist: {audio_file_path}")

    audio_file_path = convert_to_wav(audio_file_path)

    # Both models are loaded once and reused across calls.
    speaker_diarization = _load_diarization_pipeline()
    model = _load_whisper_model()

    # Determine who speaks when.
    who_speaks_when = speaker_diarization(
        audio_file_path, num_speakers=num_speakers
    )

    transcriptions = []
    # Per-speaker text pieces; dict order keeps speakers in the order they
    # first appear in the diarization output.
    speaker_texts = {}
    audio = PyannoteAudio(sample_rate=16000, mono=True)

    for segment, _, speaker in who_speaks_when.itertracks(yield_label=True):
        # Cut out just this speaker turn and hand it to Whisper as a
        # NumPy array.
        waveform, _sample_rate = audio.crop(audio_file_path, segment)
        waveform = stereo_to_mono(waveform.squeeze().numpy())

        text = model.transcribe(waveform, language="tr")["text"]
        transcriptions.append(
            f"{segment.start:06.1f}s - {segment.end:06.1f}s - {speaker}: {text}"
        )
        speaker_texts.setdefault(speaker, []).append(text)

    # Analyse sentiment and topic of each speaker's combined text.
    for speaker, texts in speaker_texts.items():
        # Each piece is followed by a single space.
        combined_text = "".join(f"{piece} " for piece in texts)
        sentiment, topic = analyze_text(combined_text)
        transcriptions.append(
            f"Speaker {speaker} sentiment: {sentiment[0]}, topic: {topic[0]}"
        )

    return "\n".join(transcriptions)


@functools.lru_cache(maxsize=None)
def _load_diarization_pipeline():
    """Load the pyannote speaker-diarization pipeline once and cache it."""
    return Pipeline.from_pretrained(
        "pyannote/speaker-diarization-3.1", use_auth_token="YOUR_AUTH_TOKEN"
    )


@functools.lru_cache(maxsize=None)
def _load_whisper_model():
    """Load Whisper "large-v3" once (on CUDA when available) and cache it."""
    device = "cuda" if torch.cuda.is_available() else "cpu"
    return whisper.load_model("large-v3", device=device)

# Function to handle the audio input and return the transcription result
def transcribe(audio):
    """Handle an audio input and return the transcription report.

    Args:
        audio: ``None`` when nothing was provided, otherwise a
            ``(sample_rate, samples)`` pair where ``samples`` is a NumPy
            array.

    Returns:
        The string produced by ``process_audio`` for the submitted audio, or
        ``"No audio input received."`` when there is no audio or it contains
        no samples.
    """
    if audio is None:
        return "No audio input received."

    sample_rate, audio_data = audio
    # An empty recording has no peak to normalise by (np.max would raise)
    # and nothing to transcribe.
    if audio_data.size == 0:
        return "No audio input received."

    # Convert audio data to float32 format
    audio_data = audio_data.astype(np.float32)
    # Peak-normalise; skip when the signal is pure silence so we do not
    # divide by zero and fill the buffer with NaNs.
    peak = np.max(np.abs(audio_data))
    if peak > 0:
        audio_data /= peak

    # Save the audio data to a WAV file
    audio_file_path = save_audio_to_wav(audio_data, sample_rate)

    return process_audio(audio_file_path)

# Build the Gradio UI: audio in (microphone or upload, delivered to the
# callback in "numpy" form), plain text out.
audio_input = gr.Audio(sources=["microphone", "upload"], type="numpy")
result_box = gr.Textbox()
iface = gr.Interface(
    fn=transcribe,
    inputs=audio_input,
    outputs=result_box,
    title="Speaker Diarization, Transcription, Sentiment, and Topic Analysis",
)

# Start serving the interface.
iface.launch()
