In [None]:
# importing packages
import os
import torch
import whisper
import pandas as pd
from pytubefix import YouTube
from transformers import MarianTokenizer, MarianMTModel, AutoTokenizer, AutoModelForSequenceClassification
import torch.nn.functional as F

In [None]:
def download_audio(youtube_url, output_path="."):
    """Download the first audio-only stream of a YouTube video.

    The downloaded file is renamed so that its extension becomes ``.mp3``.
    Note that this is a rename only: the audio is not transcoded, so the
    file keeps whatever container/codec the selected stream was served in.

    Args:
        youtube_url: URL of the video to fetch.
        output_path: Directory the file is downloaded into.

    Returns:
        Path of the renamed ``.mp3`` file.

    Raises:
        ValueError: If the video exposes no audio-only stream.
    """
    yt = YouTube(youtube_url)
    audio_stream = yt.streams.filter(only_audio=True).first()
    if audio_stream is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError(f"No audio-only stream available for {youtube_url!r}")

    audio_file = audio_stream.download(output_path=output_path)
    mp3_file = os.path.splitext(audio_file)[0] + ".mp3"
    # os.replace overwrites an existing target on every platform, so the
    # cell can be re-run for the same video (os.rename fails on Windows
    # when the destination already exists).
    os.replace(audio_file, mp3_file)
    return mp3_file

def format_timestamp(seconds):
    """Format a duration given in seconds as ``HH:MM:SS,mmm``.

    The value is rounded to the nearest millisecond once, and every field
    is then derived from that integer. This avoids float truncation
    artefacts (e.g. ``2.3`` becoming ``...,299``) and keeps the fields
    consistent when rounding carries into the next second.

    Args:
        seconds: Non-negative duration in seconds (int or float).

    Returns:
        The formatted timestamp string.
    """
    total_millis = int(round(seconds * 1000))
    hours, remainder = divmod(total_millis, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    secs, millis = divmod(remainder, 1_000)
    return f"{hours:02}:{minutes:02}:{secs:02},{millis:03}"

def transcribe_audio(audio_path):
    """Transcribe an audio file with Whisper into a table of timed segments.

    Loads the ``large-v3-turbo`` Whisper model on every call and runs it
    with the language fixed to Dutch (``"nl"``).

    Args:
        audio_path: Path of the audio file to transcribe.

    Returns:
        A ``(df, language)`` tuple. ``df`` has one row per Whisper segment
        with the columns ``Start_Time``, ``End_Time`` (both formatted by
        ``format_timestamp``) and ``Sentence`` (segment text, stripped).
        ``language`` is the ``"language"`` entry of Whisper's result.
    """
    print("Transcribing audio...")

    model = whisper.load_model("large-v3-turbo")
    result = model.transcribe(
        audio=audio_path,
        language="nl",
        verbose=True,
        condition_on_previous_text=False
    )

    transcript_data = [
        {
            "Start_Time": format_timestamp(segment["start"]),
            "End_Time": format_timestamp(segment["end"]),
            "Sentence": segment["text"].strip()
        }
        for segment in result.get("segments", [])
    ]

    # Naming the columns explicitly keeps the schema stable even when no
    # segments were produced; otherwise the frame would have no columns.
    df = pd.DataFrame(transcript_data, columns=["Start_Time", "End_Time", "Sentence"])
    print("Transcription completed.")
    return df, result["language"]

def load_translation_model(model_path):
    """Load a Marian tokenizer and translation model from ``model_path``.

    The model is moved to CUDA when ``torch.cuda.is_available()`` reports
    a device, otherwise it stays on the CPU.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    marian_tokenizer = MarianTokenizer.from_pretrained(model_path)
    marian_model = MarianMTModel.from_pretrained(model_path)

    if torch.cuda.is_available():
        target_device = "cuda"
    else:
        target_device = "cpu"

    return marian_tokenizer, marian_model.to(target_device)

def translate(text, tokenizer, model):
    """Translate ``text`` with the given tokenizer/model pair.

    Args:
        text: Text to translate. A blank (empty or whitespace-only) string
            is returned as ``""`` without invoking the model.
        tokenizer: Tokenizer used to encode ``text`` and decode the output.
        model: Generation model; inputs are moved to ``model.device``.

    Returns:
        The decoded first generated sequence, with special tokens removed.
    """
    # There is nothing to translate in a blank string, so skip generation.
    # NOTE(review): generation models may emit arbitrary text for empty
    # input - confirm that an empty translation is the desired result.
    if isinstance(text, str) and not text.strip():
        return ""

    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(model.device)
    # Inference only: no gradients are needed.
    with torch.no_grad():
        translated_ids = model.generate(**inputs)
    # Only the first sequence of the generated batch is decoded.
    return tokenizer.decode(translated_ids[0], skip_special_tokens=True)

def load_emotion_model(emotion_model_path):
    """Load the tokenizer and sequence-classification model for emotions.

    Both are loaded from ``emotion_model_path``. The model is moved to
    CUDA when ``torch.cuda.is_available()`` reports a device, otherwise it
    stays on the CPU.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    emotion_tokenizer = AutoTokenizer.from_pretrained(emotion_model_path)
    emotion_model = AutoModelForSequenceClassification.from_pretrained(emotion_model_path)

    if torch.cuda.is_available():
        target_device = "cuda"
    else:
        target_device = "cpu"

    return emotion_tokenizer, emotion_model.to(target_device)

def classify_emotion(sentence, tokenizer, model):
    """Return the label the classifier predicts for ``sentence``.

    The sentence is tokenized, moved to ``model.device`` and run through
    the model without gradient tracking. The index with the highest
    softmax probability is mapped to a label via ``model.config.id2label``.
    """
    encoded = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True)
    encoded = encoded.to(model.device)

    with torch.no_grad():
        logits = model(**encoded).logits
        class_probs = F.softmax(logits, dim=1)
        label_id = torch.argmax(class_probs, dim=1).item()

    id_to_label = model.config.id2label
    return id_to_label[label_id]

def save_transcription(sentences_df, translations, emotions, output_file="final_output.csv"):
    """Write the transcript with its translations and emotions to a CSV file.

    Args:
        sentences_df: Transcript DataFrame; it is not modified.
        translations: Values for the ``Translation`` column, one per row.
        emotions: Values for the ``Emotion`` column, one per row.
        output_file: Destination of the CSV. When it is a path, missing
            parent directories are created first.

    Returns:
        ``output_file``, unchanged.
    """
    # assign() builds a new frame, so the caller's DataFrame keeps its
    # original columns and the function can be called repeatedly on it.
    annotated_df = sentences_df.assign(Translation=translations, Emotion=emotions)

    # Only path-like targets have a parent directory to create; anything
    # else (e.g. an open buffer) is handed to to_csv untouched.
    if isinstance(output_file, (str, os.PathLike)):
        parent_dir = os.path.dirname(output_file)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    annotated_df.to_csv(output_file, index=False)
    return output_file

def pipeline(youtube_url, translation_model_path, emotion_model_path, output_dir="."):
    """Run the full flow: download, transcribe, translate, classify, save.

    Args:
        youtube_url: Video whose audio is downloaded and transcribed.
        translation_model_path: Passed to ``load_translation_model``.
        emotion_model_path: Passed to ``load_emotion_model``.
        output_dir: Directory for the audio file and ``final_output.csv``.

    Prints the locations of the audio file and the CSV when finished.
    """
    audio_file = download_audio(youtube_url, output_path=output_dir)
    transcription_df, _detected_language = transcribe_audio(audio_file)

    # Both models are loaded only after transcription has finished.
    mt_tokenizer, mt_model = load_translation_model(translation_model_path)
    emo_tokenizer, emo_model = load_emotion_model(emotion_model_path)

    # First pass: translate every sentence.
    translations = []
    for record in transcription_df.itertuples(index=False):
        translations.append(translate(record.Sentence, mt_tokenizer, mt_model))

    # Second pass: classify the emotion of every (untranslated) sentence.
    emotions = []
    for record in transcription_df.itertuples(index=False):
        emotions.append(classify_emotion(record.Sentence, emo_tokenizer, emo_model))

    output_file = os.path.join(output_dir, "final_output.csv")
    save_transcription(transcription_df, translations, emotions, output_file=output_file)

    print("Pipeline Completed Successfully!")
    print(f"Audio saved at: {audio_file}")
    print(f"CSV saved at: {output_file}")

In [None]:
# Inputs for this run. The two model paths are handed to `from_pretrained`
# by the loader functions (presumably local directories - confirm they
# exist relative to the kernel's working directory).
youtube_url = "https://www.youtube.com/watch?v=mNOksBRpT9g"
translation_model_path = "translation_model"
emotion_model_path = "pretrained_bert_model_w_metrics"

# Outputs go to the default output directory (".").
pipeline(
    youtube_url=youtube_url,
    translation_model_path=translation_model_path,
    emotion_model_path=emotion_model_path,
)