In [13]:
# importing packages
import os
import re
import json
import torch
import whisper
import requests
import pandas as pd
from pytubefix import YouTube
from transformers import MarianTokenizer, MarianMTModel

In [14]:
# Connection settings for the emotion-classification chat API.
# Both values are read from the environment so that no credential lives in the notebook.
API_BASE = os.environ.get("EMOTION_API_BASE", "http://194.171.191.227:30080")
# SECURITY: a bearer token used to be hardcoded on this line. Treat that key as
# leaked (rotate it) and export EMOTION_API_TOKEN before starting the kernel.
TOKEN = os.environ.get("EMOTION_API_TOKEN", "")
if not TOKEN:
    print("Warning: EMOTION_API_TOKEN is not set; emotion requests will be sent without a valid bearer token.")

# Closed set of labels the classifier is asked to choose from.
EMOTIONS = ["sadness", "happiness", "anger", "fear", "disgust", "surprise", "neutral"]
# System message sent with every classification request; the label list is
# interpolated using the Python list repr of EMOTIONS.
SYSTEM_PROMPT = f'''You are an expert in inferring emotions from text. 
You will receive a string that contains a sentence as an input and you will have to predict the emotion behind that sentence.
You will return one emotion as a string, for that sentence. 
The emotions must only be one of the following: {EMOTIONS}.
If the emotion is not clear or there is some doubt, return neutral. Please only return an emotion I do not need any other context, write the full emotion name, 
for example sadness, happiness, anger, fear, disgust, surprise or neutral. The output must be a single word. Fully written out, I do NOT need anything else.
'''

def download_audio(youtube_url, output_path="."):
    """Download the first audio-only stream of a YouTube video and give it an ``.mp3`` name.

    The downloaded file is only renamed, not transcoded, so its contents stay in
    whatever format the selected stream was delivered in.

    Args:
        youtube_url: URL of the video to fetch.
        output_path: Directory the audio file is downloaded into.

    Returns:
        Path of the renamed ``.mp3`` file.

    Raises:
        ValueError: If the video exposes no audio-only stream.
    """
    yt = YouTube(youtube_url)
    audio_stream = yt.streams.filter(only_audio=True).first()
    if audio_stream is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError(f"No audio-only stream available for {youtube_url}")

    audio_file = audio_stream.download(output_path=output_path)
    mp3_file = os.path.splitext(audio_file)[0] + ".mp3"
    # os.replace overwrites a leftover .mp3 from an earlier run on every
    # platform; os.rename raises FileExistsError on Windows in that case.
    os.replace(audio_file, mp3_file)
    return mp3_file

def transcribe_audio(file_path, transcript_path="transcription.json"):
    """Transcribe an audio file with Whisper, caching the result as JSON.

    If ``transcript_path`` already exists and holds valid JSON, that content is
    returned and no transcription is run. An unreadable cache file (for example
    one left behind by an interrupted write) is ignored and regenerated.

    NOTE(review): the cache is looked up by ``transcript_path`` only, not by
    ``file_path``; pass a distinct ``transcript_path`` per audio file to avoid
    reusing another file's transcription.

    Args:
        file_path: Path of the audio file to transcribe.
        transcript_path: JSON file used to cache the transcription.

    Returns:
        The transcription dictionary, either loaded from the cache or freshly
        produced by ``model.transcribe``.
    """
    if os.path.exists(transcript_path):
        print("Loading existing transcription...")
        try:
            with open(transcript_path, "r", encoding="utf-8") as f:
                return json.load(f)
        except (json.JSONDecodeError, UnicodeDecodeError):
            # A truncated or corrupt cache must not block the pipeline forever.
            print("Existing transcription is unreadable; transcribing again...")

    print("Transcribing audio...")
    model = whisper.load_model("large-v2")
    result = model.transcribe(file_path, word_timestamps=True)

    # Write to a sibling file first and swap it in, so a failure while dumping
    # never leaves a half-written cache at transcript_path.
    tmp_path = f"{transcript_path}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(result, f)
    os.replace(tmp_path, transcript_path)

    return result

def split_sentences_with_timestamps(transcription):
    """Flatten a transcription into ``(start, end, text)`` tuples.

    One tuple is produced per entry of ``transcription['segments']``, in order;
    the text has leading and trailing whitespace stripped.
    """
    def _as_row(segment):
        cleaned = segment['text'].strip()
        return segment['start'], segment['end'], cleaned

    return [_as_row(segment) for segment in transcription['segments']]

def load_translation_model(model_path):
    """Load the Marian tokenizer and model stored at ``model_path``.

    The model is moved to ``"cuda"`` when ``torch.cuda.is_available()`` is
    true, otherwise to ``"cpu"``.

    Returns:
        A ``(tokenizer, model)`` pair.
    """
    tokenizer = MarianTokenizer.from_pretrained(model_path)
    model = MarianMTModel.from_pretrained(model_path)
    if torch.cuda.is_available():
        target_device = "cuda"
    else:
        target_device = "cpu"
    return tokenizer, model.to(target_device)

def translate(text, tokenizer, model):
    """Translate ``text`` with the given tokenizer/model pair.

    The input is tokenized with padding and truncation enabled, moved to the
    model's device, and passed to ``model.generate`` with gradients disabled.
    Only the first generated sequence is decoded and returned, with special
    tokens skipped.
    """
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    encoded = encoded.to(model.device)
    with torch.no_grad():
        generated = model.generate(**encoded)
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

def classify_emotion(sentence, timeout=120):
    """Ask the chat-completions API for the emotion expressed by ``sentence``.

    The reply is normalised to one of the labels in ``EMOTIONS``: the first
    lowercase word of the reply that is a known label is returned, so answers
    such as ``" Happiness."`` still map to ``"happiness"``.

    Args:
        sentence: Text to classify; sent as the user message.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A label from ``EMOTIONS``. ``'neutral'`` is returned when the request
        fails, the status code is not 200, the payload has an unexpected shape,
        or the reply contains no known label.
    """
    url = f'{API_BASE}/api/chat/completions'
    headers = {'Authorization': f'Bearer {TOKEN}', 'Content-Type': 'application/json'}
    prompt = [{"role": "system", "content": SYSTEM_PROMPT},
              {"role": "user", "content": sentence}]
    payload = {"model": "llama3.3:latest", "messages": prompt}

    try:
        # Without a timeout a stalled server would block the whole pipeline.
        response = requests.post(url, headers=headers, json=payload, timeout=timeout)
    except requests.RequestException as exc:
        # Same best-effort fallback as for non-200 responses below.
        print(f"Emotion API request failed ({exc}); defaulting to 'neutral'.")
        return 'neutral'

    if response.status_code != 200:
        return 'neutral'

    try:
        # `or [{}]` also covers an empty "choices" list, which would
        # otherwise raise IndexError on [0].
        choices = response.json().get('choices') or [{}]
        content = choices[0].get('message', {}).get('content') or ''
    except (ValueError, AttributeError, TypeError, IndexError, KeyError):
        # Body was not JSON or did not have the expected nesting.
        return 'neutral'

    # Accept only labels from the closed set, ignoring case and punctuation.
    for word in re.findall(r"[a-z]+", str(content).lower()):
        if word in EMOTIONS:
            return word
    return 'neutral'

def save_transcription(sentences, translations, emotions, output_file="final_output.csv"):
    """Write the sentences, their translations and emotions to a CSV file.

    ``sentences`` supplies the ``Start_Time``, ``End_Time`` and ``Sentence``
    columns; ``translations`` and ``emotions`` are added as the ``Translation``
    and ``Emotion`` columns. The index is not written.

    Returns:
        ``output_file``, unchanged.
    """
    table = pd.DataFrame(
        sentences, columns=["Start_Time", "End_Time", "Sentence"]
    ).assign(
        Translation=translations,
        Emotion=emotions,
    )
    table.to_csv(output_file, index=False)
    return output_file

def pipeline(youtube_url, model_path):
    """Run the full chain: download, transcribe, translate, classify, save.

    Every segment text is translated first; emotion classification then runs
    on the original (untranslated) texts. The combined table is written by
    ``save_transcription`` with its default output file. Returns nothing.
    """
    audio_path = download_audio(youtube_url)
    segments = split_sentences_with_timestamps(transcribe_audio(audio_path))
    texts = [row[2] for row in segments]

    tokenizer, model = load_translation_model(model_path)

    translations = []
    for text in texts:
        translations.append(translate(text, tokenizer, model))

    emotions = []
    for text in texts:
        emotions.append(classify_emotion(text))

    save_transcription(segments, translations, emotions)
    print("Pipeline Completed Successfully!")

In [15]:
# Inputs for this run: the video to process and the directory holding the
# translation model files.
youtube_url = "https://www.youtube.com/watch?v=mNOksBRpT9g"
model_path = "translation_model"

# Execute the whole pipeline for the video above.
pipeline(youtube_url=youtube_url, model_path=model_path)

Transcribing audio...


  checkpoint = torch.load(fp, map_location=device)


Pipeline Completed Successfully!


In [16]:
import os

# Sanity check: confirm the translation model directory is present and show its files.
model_path = "translation_model"  # or provide the absolute path if necessary
path_found = os.path.exists(model_path)
print("Model path exists:", path_found)
if path_found:
    listing = os.listdir(model_path)
else:
    listing = "Path not found"
print("Contents of model path:", listing)


Model path exists: True
Contents of model path: ['config.json', 'generation_config.json', 'model.safetensors', 'optimizer.pt', 'rng_state.pth', 'scheduler.pt', 'source.spm', 'special_tokens_map.json', 'target.spm', 'tokenizer_config.json', 'trainer_state.json', 'training_args.bin', 'vocab.json']
