In [54]:
model = "qwen/qwen3-30b-a3b"
tailscale_server = "https://desktop-3oeimac.tail3b962f.ts.net"
chat_completion_api = tailscale_server + "/api/v0/chat/completions"
embedding_model = "text-embedding-qwen3-embedding-8b@q5_0"

In [4]:
# step 1: we look for existing transcript for the selected videos.
import glob 
from langchain_community.document_loaders import YoutubeLoader
from pytube import YouTube


media_dir = "./media/it"
loaded_videos = []
for doc in glob.glob(pathname=f"{media_dir}/*.txt", recursive=True):
    with open(doc, "r") as file:
        lines = file.readlines()
        
        
        for line in lines:
            url = line.strip()
            yt = YouTube(url)
            try:
                title = yt.title
            except Exception:
                title = None
                print(f'title not found for {url}')

            try:  
                loader = YoutubeLoader.from_youtube_url(
                    url, language=['en', 'it'], continue_on_failure=True
                )
            
                transcript = loader.load()
            except Exception:
                print(f'Error loading the transcript for {url}')
                transcript = None
            finally:
                loaded_videos.append({"url": url, "title": title, "transcript": transcript })
            


print(loaded_videos)

title not found for https://www.youtube.com/watch?v=NyjXMMBPvSA
Error loading the transcript for https://www.youtube.com/watch?v=NyjXMMBPvSA
title not found for https://www.youtube.com/watch?v=mQENVePdT5A&t=6s
Error loading the transcript for https://www.youtube.com/watch?v=mQENVePdT5A&t=6s
[{'url': 'https://www.youtube.com/watch?v=NyjXMMBPvSA', 'title': None, 'transcript': None}, {'url': 'https://www.youtube.com/watch?v=mQENVePdT5A&t=6s', 'title': None, 'transcript': None}]


In [None]:
import logging
import io
from typing import Any, Callable, Dict, Iterator, Literal, Optional, Tuple, Union, List, Iterable
from langchain_community.document_loaders.parsers.audio import OpenAIWhisperParserLocal

from langchain_core.documents import Document
from langchain_community.document_loaders.base import BaseBlobParser
from langchain_community.document_loaders.parsers.audio import _get_audio_from_blob

from langchain_community.document_loaders.blob_loaders import Blob

from langchain_community.utils.openai import is_openai_v1

import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from langchain.document_loaders.generic import GenericLoader
from langchain_community.document_loaders.blob_loaders import (
    BlobLoader,
    FileSystemBlobLoader,
)

logger = logging.getLogger()
class OpenAIWhisperParserLocalCustom(OpenAIWhisperParserLocal):
    """Custom Parser for using turbo model from Whisper. Transcribe and parse audio files with OpenAI Whisper model.

    Audio transcription with OpenAI Whisper model locally from transformers.

    Parameters:
    device - device to use
        NOTE: By default uses the gpu if available,
        if you want to use cpu, please set device = "cpu"
    lang_model - whisper model to use, for example "openai/whisper-medium"
    forced_decoder_ids - id states for decoder in multilanguage model,
        usage example:
        from transformers import WhisperProcessor
        processor = WhisperProcessor.from_pretrained("openai/whisper-medium")
        forced_decoder_ids = WhisperProcessor.get_decoder_prompt_ids(language="french",
          task="transcribe")
        forced_decoder_ids = WhisperProcessor.get_decoder_prompt_ids(language="french",
        task="translate")



    """

    def __init__(
        self,
        lang_model: Optional[str] = None,
        batch_size: int = 8,
        chunk_length: int = 30,
        forced_decoder_ids: Optional[Tuple[Dict]] = None,
    ):
        """Initialize the parser.

        Args:
            device: device to use.
            lang_model: whisper model to use, for example "openai/whisper-medium".
              Defaults to None.
            forced_decoder_ids: id states for decoder in a multilanguage model.
              Defaults to None.
            batch_size: batch size used for decoding
              Defaults to 8.
            chunk_length: chunk length used during inference.
              Defaults to 30s.
        """
        try:
            from transformers import pipeline
        except ImportError:
            raise ImportError(
                "transformers package not found, please install it with "
                "`pip install transformers`"
            )
        try:
            import torch
        except ImportError:
            raise ImportError(
                "torch package not found, please install it with `pip install torch`"
            )

        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
        torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
        self.batch_size = batch_size
        model_id = "openai/whisper-large-v3"

        model = AutoModelForSpeechSeq2Seq.from_pretrained(
            model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
        )
        model.to(self.device)

        processor = AutoProcessor.from_pretrained(model_id)

        self.pipe = pipeline(
            "automatic-speech-recognition",
            model=model,
            tokenizer=processor.tokenizer,
            feature_extractor=processor.feature_extractor,
            torch_dtype=torch_dtype,
            device=self.device,
            chunk_length_s=chunk_length        )
        
        if forced_decoder_ids is not None:
            try:
                self.pipe.model.config.forced_decoder_ids = forced_decoder_ids
            except Exception as exception_text:
                logger.info(
                    "Unable to set forced_decoder_ids parameter for whisper model"
                    f"Text of exception: {exception_text}"
                    "Therefore whisper model will use default mode for decoder"
                )



In [35]:
#step 2: we load audio and convert them if no transcript is available yet. We assume that all the urls we have need still to be processed.
from langchain_community.document_loaders import YoutubeAudioLoader
from langchain.document_loaders.generic import GenericLoader
import os

# # todo: this needs to be filled in with urls not yet processed.
urls_to_process = ["https://www.youtube.com/shorts/QtJNskFMZpY", "https://www.youtube.com/shorts/OxQzgN8i6QQ"]
save_dir = './test'

loader = GenericLoader(
        YoutubeAudioLoader(
            urls_to_process, 
            save_dir
        ),
        OpenAIWhisperParserLocalCustom())
docs = loader.load()

# we keep the folder clean to avoid the re-processing of previously downloaded videos (look at YoutubeAudioLoader lazy_load method to understand.)
for filename in os.listdir(save_dir):
   file_path = os.path.join(save_dir, filename)
   if os.path.isfile(file_path):
      os.remove(file_path)
      print(filename, "is removed")

Device set to use cpu


[youtube] Extracting URL: https://www.youtube.com/shorts/QtJNskFMZpY
[youtube] QtJNskFMZpY: Downloading webpage
[youtube] QtJNskFMZpY: Downloading tv client config
[youtube] QtJNskFMZpY: Downloading tv player API JSON
[youtube] QtJNskFMZpY: Downloading ios player API JSON
[youtube] QtJNskFMZpY: Downloading m3u8 information
[info] QtJNskFMZpY: Downloading 1 format(s): 140
[download] Destination: test\Are Push-Ups Worth Doing On Rings？.m4a
[download] 100% of  639.19KiB in 00:00:00 at 1.97MiB/s   
[FixupM4a] Correcting container of "test\Are Push-Ups Worth Doing On Rings？.m4a"
[ExtractAudio] Not converting audio test\Are Push-Ups Worth Doing On Rings？.m4a; file is already in target format m4a
[youtube] Extracting URL: https://www.youtube.com/shorts/OxQzgN8i6QQ
[youtube] OxQzgN8i6QQ: Downloading webpage
[youtube] OxQzgN8i6QQ: Downloading tv client config
[youtube] OxQzgN8i6QQ: Downloading tv player API JSON
[youtube] OxQzgN8i6QQ: Downloading ios player API JSON
[youtube] OxQzgN8i6QQ: Downl



Transcribing part test\Calisthenics Workouts Explained.m4a!
Are Push-Ups Worth Doing On Rings？.m4a is removed
Calisthenics Workouts Explained.m4a is removed


In [50]:

# step 3: split each transcription into chunks and insert them in DB
processed_transcript_files = []
for doc in docs:
    # we save the content of the transcript inside a txt file
    transcript_file_path = os.path.join(media_dir, doc.metadata.get("source").split("\\")[-1]) 
    processed_transcript_files.append(transcript_file_path)
    with open(transcript_file_path, "w") as file:
        # doc is the output of the Whisper Models
        file.write(doc.page_content)


In [None]:
# step 4: we chunck and summarize each document
import json
with open("./prompt_catalog.json") as catalog_file:
    prompt_catalog = json.load(catalog_file)
    
print(prompt_catalog.get('system'))

{'transcript_summarizer': 'You are part of a system whose focus is giving practical strategies for increasing performance in workouts relating to bodyweight and gymnastics. \n <ROLE> \n Your primary role is to assist users by summing up transcripts videos given by the user as input, focusing on finding practices which have been useful in workout plans. Summing up means that the generated summary should not be longer of the provided input. Therefore try being more concise, not too verbose and straight to the point. \n </ROLE>'}


In [55]:
def format_reasoning_model_prompt(prompt:str,no_think=True):
    if no_think:
        return prompt + " /no_think"
    return prompt

In [None]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import TokenTextSplitter

file_chunks = []
for doc in docs: 
    # in the future we expect to know the context window of the model from somewhere else.
    text_splitter = TokenTextSplitter(chunk_size=3200, chunk_overlap=40)
    file_chunks.append(text_splitter.split_text(doc.page_content))

In [61]:
len(file_chunks[1])

1

In [None]:
from langchain_openai import ChatOpenAI
from langchain_core.messages import HumanMessage, SystemMessage


# Initialize the LLM pointing to your LM Studio server
llm = ChatOpenAI(
    base_url=f"{tailscale_server}/v1",  # LM Studio server URL
    api_key="lm-studio",  # Can be any string for local models
    model=model,  # Can be any string for local models
)


messages = [
    SystemMessage(json_data.get('system').get('transcript_summarizer')),
    HumanMessage(format_reasoning_model_prompt(docs[0])),
]

response = llm.invoke(messages)



TypeError: can only concatenate list (not "str") to list

In [None]:
!pip install langchain_community
!pip install
!pip install yt_dlp
!pip install librosa
!pip install pydub
!pip install pytube
!pip install ipywidgets




[notice] A new release of pip is available: 23.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip
ERROR: You must give at least one requirement to install (see "pip help install")

[notice] A new release of pip is available: 23.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 23.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 23.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 23.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting pytube


[notice] A new release of pip is available: 23.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip



  Downloading pytube-15.0.0-py3-none-any.whl (57 kB)
     ---------------------------------------- 0.0/57.6 kB ? eta -:--:--
     ---------------------------------------- 57.6/57.6 kB 3.0 MB/s eta 0:00:00
Installing collected packages: pytube
Successfully installed pytube-15.0.0


In [47]:
docs[0].metadata.get("source").split("\\")[-1]

'Are Push-Ups Worth Doing On Rings？.m4a'

In [45]:
docs[0].metadata.get("source").split("\\")[-1]

'Are Push-Ups Worth Doing On Rings？.m4a'