In [29]:
model = "qwen/qwen3-30b-a3b"
tailscale_server = "https://desktop-3oeimac.tail3b962f.ts.net"
chat_completion_api = tailscale_server + "/api/v0/chat/completions"
embedding_model = "text-embedding-qwen3-embedding-8b@q5_0"

In [4]:
# step 1: we look for existing transcript for the selected videos.
import glob 
from langchain_community.document_loaders import YoutubeLoader
from pytube import YouTube


media_dir = "./media/it"
loaded_videos = []
for doc in glob.glob(pathname=f"{media_dir}/*.txt", recursive=True):
    with open(doc, "r") as file:
        lines = file.readlines()
        
        
        for line in lines:
            url = line.strip()
            yt = YouTube(url)
            try:
                title = yt.title
            except Exception:
                title = None
                print(f'title not found for {url}')

            try:  
                loader = YoutubeLoader.from_youtube_url(
                    url, language=['en', 'it'], continue_on_failure=True
                )
            
                transcript = loader.load()
            except Exception:
                print(f'Error loading the transcript for {url}')
                transcript = None
            finally:
                loaded_videos.append({"url": url, "title": title, "transcript": transcript })
            


print(loaded_videos)

title not found for https://www.youtube.com/watch?v=NyjXMMBPvSA
Error loading the transcript for https://www.youtube.com/watch?v=NyjXMMBPvSA
title not found for https://www.youtube.com/watch?v=mQENVePdT5A&t=6s
Error loading the transcript for https://www.youtube.com/watch?v=mQENVePdT5A&t=6s
[{'url': 'https://www.youtube.com/watch?v=NyjXMMBPvSA', 'title': None, 'transcript': None}, {'url': 'https://www.youtube.com/watch?v=mQENVePdT5A&t=6s', 'title': None, 'transcript': None}]


In [None]:
import logging
import io
from typing import Any, Callable, Dict, Iterator, Literal, Optional, Tuple, Union, List, Iterable
from langchain_community.document_loaders.parsers.audio import OpenAIWhisperParserLocal

from langchain_core.documents import Document
from langchain_community.document_loaders.base import BaseBlobParser
from langchain_community.document_loaders.parsers.audio import _get_audio_from_blob

from langchain_community.document_loaders.blob_loaders import Blob

from langchain_community.utils.openai import is_openai_v1

import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from langchain.document_loaders.generic import GenericLoader
from langchain_community.document_loaders.blob_loaders import (
    BlobLoader,
    FileSystemBlobLoader,
)

logger = logging.getLogger()
class OpenAIWhisperParserLocalCustom(OpenAIWhisperParserLocal):
    """Custom Parser for using turbo model from Whisper. Transcribe and parse audio files with OpenAI Whisper model.

    Audio transcription with OpenAI Whisper model locally from transformers.

    Parameters:
    device - device to use
        NOTE: By default uses the gpu if available,
        if you want to use cpu, please set device = "cpu"
    lang_model - whisper model to use, for example "openai/whisper-medium"
    forced_decoder_ids - id states for decoder in multilanguage model,
        usage example:
        from transformers import WhisperProcessor
        processor = WhisperProcessor.from_pretrained("openai/whisper-medium")
        forced_decoder_ids = WhisperProcessor.get_decoder_prompt_ids(language="french",
          task="transcribe")
        forced_decoder_ids = WhisperProcessor.get_decoder_prompt_ids(language="french",
        task="translate")



    """

    def __init__(
        self,
        lang_model: Optional[str] = None,
        batch_size: int = 8,
        chunk_length: int = 30,
        forced_decoder_ids: Optional[Tuple[Dict]] = None,
    ):
        """Initialize the parser.

        Args:
            device: device to use.
            lang_model: whisper model to use, for example "openai/whisper-medium".
              Defaults to None.
            forced_decoder_ids: id states for decoder in a multilanguage model.
              Defaults to None.
            batch_size: batch size used for decoding
              Defaults to 8.
            chunk_length: chunk length used during inference.
              Defaults to 30s.
        """
        try:
            from transformers import pipeline
        except ImportError:
            raise ImportError(
                "transformers package not found, please install it with "
                "`pip install transformers`"
            )
        try:
            import torch
        except ImportError:
            raise ImportError(
                "torch package not found, please install it with `pip install torch`"
            )

        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
        torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
        self.batch_size = batch_size
        model_id = "openai/whisper-large-v3"

        model = AutoModelForSpeechSeq2Seq.from_pretrained(
            model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
        )
        model.to(self.device)

        processor = AutoProcessor.from_pretrained(model_id)

        self.pipe = pipeline(
            "automatic-speech-recognition",
            model=model,
            tokenizer=processor.tokenizer,
            feature_extractor=processor.feature_extractor,
            torch_dtype=torch_dtype,
            device=self.device,
            chunk_length_s=chunk_length        )
        
        if forced_decoder_ids is not None:
            try:
                self.pipe.model.config.forced_decoder_ids = forced_decoder_ids
            except Exception as exception_text:
                logger.info(
                    "Unable to set forced_decoder_ids parameter for whisper model"
                    f"Text of exception: {exception_text}"
                    "Therefore whisper model will use default mode for decoder"
                )



In [35]:
#step 2: we load audio and convert them if no transcript is available yet. We assume that all the urls we have need still to be processed.
from langchain_community.document_loaders import YoutubeAudioLoader
from langchain.document_loaders.generic import GenericLoader
import os

# # todo: this needs to be filled in with urls not yet processed.
urls_to_process = ["https://www.youtube.com/shorts/QtJNskFMZpY", "https://www.youtube.com/shorts/OxQzgN8i6QQ"]
save_dir = './test'

loader = GenericLoader(
        YoutubeAudioLoader(
            urls_to_process, 
            save_dir
        ),
        OpenAIWhisperParserLocalCustom())
docs = loader.load()

# we keep the folder clean to avoid the re-processing of previously downloaded videos (look at YoutubeAudioLoader lazy_load method to understand.)
for filename in os.listdir(save_dir):
   file_path = os.path.join(save_dir, filename)
   if os.path.isfile(file_path):
      os.remove(file_path)
      print(filename, "is removed")

Device set to use cpu


[youtube] Extracting URL: https://www.youtube.com/shorts/QtJNskFMZpY
[youtube] QtJNskFMZpY: Downloading webpage
[youtube] QtJNskFMZpY: Downloading tv client config
[youtube] QtJNskFMZpY: Downloading tv player API JSON
[youtube] QtJNskFMZpY: Downloading ios player API JSON
[youtube] QtJNskFMZpY: Downloading m3u8 information
[info] QtJNskFMZpY: Downloading 1 format(s): 140
[download] Destination: test\Are Push-Ups Worth Doing On Rings？.m4a
[download] 100% of  639.19KiB in 00:00:00 at 1.97MiB/s   
[FixupM4a] Correcting container of "test\Are Push-Ups Worth Doing On Rings？.m4a"
[ExtractAudio] Not converting audio test\Are Push-Ups Worth Doing On Rings？.m4a; file is already in target format m4a
[youtube] Extracting URL: https://www.youtube.com/shorts/OxQzgN8i6QQ
[youtube] OxQzgN8i6QQ: Downloading webpage
[youtube] OxQzgN8i6QQ: Downloading tv client config
[youtube] OxQzgN8i6QQ: Downloading tv player API JSON
[youtube] OxQzgN8i6QQ: Downloading ios player API JSON
[youtube] OxQzgN8i6QQ: Downl



Transcribing part test\Calisthenics Workouts Explained.m4a!
Are Push-Ups Worth Doing On Rings？.m4a is removed
Calisthenics Workouts Explained.m4a is removed


In [None]:

# step 3: split each transcription into chunks and insert them in DB
processed_transcript_files = []
for doc in docs:
    # we save the content of the transcript inside a txt file
    transcript_file_path = doc.metadata.get("source").split("\\")[-1]
    processed_transcript_files.append(transcript_file_path)
    with open(os.path.join(media_dir, transcript_file_path), "w") as file:
        # doc is the output of the Whisper Models
        file.write(doc.page_content)


In [63]:
# step 4: we chunck and summarize each document
import json
with open("./prompt_catalog.json") as catalog_file:
    prompt_catalog = json.load(catalog_file)
    
print(prompt_catalog.get('system'))

{'transcript_summarizer': 'You are part of a system whose focus is giving practical strategies for increasing performance in workouts relating to bodyweight and gymnastics. \n <ROLE> \n Your primary role is to assist users by summing up transcripts videos given by the user as input, focusing on finding practices which have been useful in workout plans. Summing up means that the generated summary should not be longer of the provided input. Therefore try being more concise, not too verbose and straight to the point. \n </ROLE>'}


In [94]:

import hashlib
import re

def remove_think_tags(text):
    """
    Remove <think>...</think> blocks from text using regex.
    Handles multiline content and whitespace.
    """
    # Pattern explanation:
    # <think>     - matches opening tag
    # .*?         - matches any characters (non-greedy)
    # </think>    - matches closing tag
    # re.DOTALL   - makes . match newlines too
    pattern = r'<think>.*?</think>'
    cleaned_text = re.sub(pattern, '', text, flags=re.DOTALL)
    return cleaned_text.strip()

def format_reasoning_model_prompt(prompt:str,no_think=True):
    if no_think:
        return prompt + " /no_think"
    return prompt

def create_id_from_string(input_string):
    # Use SHA-256 for strong hashing
    hash_object = hashlib.sha256(input_string.encode())
    # Get the hex digest (a string representation)
    return hash_object.hexdigest()


In [None]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import TokenTextSplitter

file_chunks_dict = {}
for file_path, doc, url in zip(transcript_file_path,docs, urls_to_process):
    # in the future we expect to know the context window of the model from somewhere else.
    text_splitter = TokenTextSplitter(chunk_size=3200, chunk_overlap=40)
    file_chunks_dict[create_id_from_string(file_path)] = {"chunks" : (text_splitter.split_text(doc.page_content)), "url": url}
    

In [89]:
file_chunks_dict

{'6b23c0d5f35d1b11f9b683f0b0a617355deb11277d91ae091d399c655b87940d': {'chunks': [" Ring push-ups are better than floor push-ups for a few reasons. The increased range of motion is where the magic lies. The latest science on bodybuilding shows that an emphasis on the stretch is really important for getting best results. Rings apply the science in practice. Your chest is going to get really sore after a workout, which is a good proxy for growth. Once again, rings feel better on your body. Because of that freedom of movement, you can use a technique that suits your body type, including your flexibility, your limb lengths, your strengths. It's all going to change the way your push-up looks. Contrast this with floor push-ups where your hands are fixed and your body's forced into this rigid technique. No one tells you how to exercise, not even the floor."]},
 'ca978112ca1bbdcafac231b39a23dc4da786eff8147c4e72b9807785afee48bb': {'chunks': [" The most popular workout splits for calisthenics inc

In [95]:
from langchain_openai import ChatOpenAI
from langchain_core.messages import HumanMessage, SystemMessage


# Initialize the LLM pointing to your LM Studio server
llm = ChatOpenAI(
    base_url=f"{tailscale_server}/v1",  # LM Studio server URL
    api_key="lm-studio",  # Can be any string for local models
    model=model,  # Can be any string for local models
)


for k, v in file_chunks_dict.items():
    v['summaries'] = []
    for file_chunk in v['chunks']:
        messages = [
            SystemMessage(prompt_catalog.get('system').get('transcript_summarizer')),
            HumanMessage(format_reasoning_model_prompt(file_chunk)),
        ]

        response = llm.invoke(messages)
        v['summaries'].append(remove_think_tags(response.content))




In [96]:
file_chunks_dict

{'6b23c0d5f35d1b11f9b683f0b0a617355deb11277d91ae091d399c655b87940d': {'chunks': [" Ring push-ups are better than floor push-ups for a few reasons. The increased range of motion is where the magic lies. The latest science on bodybuilding shows that an emphasis on the stretch is really important for getting best results. Rings apply the science in practice. Your chest is going to get really sore after a workout, which is a good proxy for growth. Once again, rings feel better on your body. Because of that freedom of movement, you can use a technique that suits your body type, including your flexibility, your limb lengths, your strengths. It's all going to change the way your push-up looks. Contrast this with floor push-ups where your hands are fixed and your body's forced into this rigid technique. No one tells you how to exercise, not even the floor."],
  'summaries': ['Ring push-ups offer a greater range of motion, enhancing muscle stretch for better growth. They allow personalized move

In [None]:
# Step 5: we embed each chunk of the documents.
for k, v in file_chunks_dict.items():
    v['embedding'] = []
    for summary in v['summaries']:

        embedder_model = CustomEmbeddingModel("text-embedding-nomic-embed-text-v1.5")
        embedded_text = embedder_model.embed_query(summary)
        v['embedding'].append(embedded_text)

https://desktop-3oeimac.tail3b962f.ts.net/v1
{'model': 'text-embedding-nomic-embed-text-v1.5', 'input': ['Ring push-ups offer a greater range of motion, enhancing muscle stretch for better growth. They allow personalized movement based on individual flexibility and body type, unlike fixed-floor push-ups. Rings also make workouts more comfortable and effective.']}
https://desktop-3oeimac.tail3b962f.ts.net/v1
{'model': 'text-embedding-nomic-embed-text-v1.5', 'input': ['Popular calisthenics workout splits include:\n\n- **Full Body**: Works all muscles in each session.\n- **Upper/Lower**: Separates upper body and leg days.\n- **Bent Arm/Straight Arm**: Focuses on scapular strength for front levers, planches.\n- **Push/Pull/Legs**: Classic split for upper body emphasis.\n\nWork out 3–5 days/week with 24–48 hours rest between same-muscle sessions. Structure is key for progressive overload and injury prevention.']}


In [102]:
file_chunks_dict

{'6b23c0d5f35d1b11f9b683f0b0a617355deb11277d91ae091d399c655b87940d': {'chunks': [" Ring push-ups are better than floor push-ups for a few reasons. The increased range of motion is where the magic lies. The latest science on bodybuilding shows that an emphasis on the stretch is really important for getting best results. Rings apply the science in practice. Your chest is going to get really sore after a workout, which is a good proxy for growth. Once again, rings feel better on your body. Because of that freedom of movement, you can use a technique that suits your body type, including your flexibility, your limb lengths, your strengths. It's all going to change the way your push-up looks. Contrast this with floor push-ups where your hands are fixed and your body's forced into this rigid technique. No one tells you how to exercise, not even the floor."],
  'summaries': ['Ring push-ups offer a greater range of motion, enhancing muscle stretch for better growth. They allow personalized move

In [3]:
from pymilvus import Collection, MilvusException, connections, db, utility

conn = connections.connect(host="127.0.0.1", port=19530)

# Check if the database exists
db_name = "milvus_demo"
try:
    existing_databases = db.list_database()
    if db_name in existing_databases:
        print(f"Database '{db_name}' already exists.")

        # Use the database context
        db.using_database(db_name)

        # Drop all collections in the database
        collections = utility.list_collections()
        for collection_name in collections:
            collection = Collection(name=collection_name)
            collection.drop()
            print(f"Collection '{collection_name}' has been dropped.")

        db.drop_database(db_name)
        print(f"Database '{db_name}' has been deleted.")
    else:
        print(f"Database '{db_name}' does not exist.")
        database = db.create_database(db_name)
        print(f"Database '{db_name}' created successfully.")
except MilvusException as e:
    print(f"An error occurred: {e}")

Database 'milvus_demo' already exists.
Database 'milvus_demo' has been deleted.


In [69]:
first_item = json_data.popitem()


In [70]:
first_key, first_value = first_item


In [71]:
# dense embeddings.

import requests
import logging

from langchain_core.embeddings import Embeddings
from typing import List, Optional

class CustomEmbeddingModel(Embeddings):
    def __init__(self, 
                 model_name: str,
                 base_url: str = f"{tailscale_server}/v1",
                 api_key: str = "lm-studio",
                 timeout: int = 30):
        self.model_name = model_name
        self.base_url = base_url.rstrip('/')
        self.api_key = api_key
        self.timeout = timeout
        print(self.base_url)
        self.headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json"
        }
        
        # Test connection
        self._test_connection()
    
    def _test_connection(self):
        """Test if LM Studio is accessible."""
        try:
            response = requests.get(f"{self.base_url}/models", 
                                  headers=self.headers, 
                                  timeout=5)
            response.raise_for_status()
            logging.info("Successfully connected to LM Studio")
        except Exception as e:
            logging.warning(f"Could not connect to LM Studio: {e}")
    
    def _get_text_embeddings(self, texts: List[str]) -> List[List[float]]:
        url = f"{self.base_url}/embeddings"
        
        payload = {
            "model": self.model_name,
            "input": texts
        }
        
        try:
            print(payload)
            response = requests.post(url, 
                                   headers=self.headers, 
                                   json=payload,
                                   timeout=self.timeout)
            response.raise_for_status()
            
            data = response.json()
            return [item["embedding"] for item in data["data"]]
            
        except requests.exceptions.RequestException as e:
            raise Exception(f"LM Studio API error: {e}")
        except KeyError as e:
            raise Exception(f"Unexpected API response format: {e}")
    
    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        # Process in batches if needed
        batch_size = 10  # Adjust based on your needs
        all_embeddings = []
        
        all_embeddings = self._get_text_embeddings(texts)
        # for i in range(0, len(texts), batch_size):
        #     batch = texts[i:i + batch_size]
        #     batch_embeddings = self._get_embeddings(batch)
        #     all_embeddings.extend(batch_embeddings)
        
        return all_embeddings
    
    def embed_query(self, text: str) -> List[float]:
        embeddings = self._get_text_embeddings([text])
        return embeddings[0]
    
# class CustomEmbeddingFunction(EmbeddingFunction):
    
#     def __init__(self, embedding_model: CustomEmbeddingModel):
#         super().__init__()
#         self.embedding_model = embedding_model
        
#     def __call__(self, input: list[str]) -> Embeddings:
#         # embed the documents somehow
#         embeddings_list = []
#         for doc in input:
#             embeddings_list.append(self.embedding_model.embed_query(doc))
#         return embeddings_list
    
    
    
dense_embedder_model = CustomEmbeddingModel("text-embedding-nomic-embed-text-v1.5")

dense_embedded_text = dense_embedder_model.embed_query(first_value['summaries'][0])


https://desktop-3oeimac.tail3b962f.ts.net/v1
{'model': 'text-embedding-nomic-embed-text-v1.5', 'input': ['Popular calisthenics workout splits include:\n\n- **Full Body**: Works all muscles in each session.\n- **Upper/Lower**: Separates upper body and leg days.\n- **Bent Arm/Straight Arm**: Focuses on scapular strength for front levers, planches.\n- **Push/Pull/Legs**: Classic split for upper body emphasis.\n\nWork out 3–5 days/week with 24–48 hours rest between same-muscle sessions. Structure is key for progressive overload and injury prevention.']}


In [72]:
len(dense_embedded_text)

768

In [None]:
from pymilvus import (
    connections,
    utility,
    FieldSchema,
    CollectionSchema,
    DataType,
    Collection,
)
from pymilvus.model.hybrid import BGEM3EmbeddingFunction


# Connect to Milvus given URI
conn = connections.connect(host="127.0.0.1", port=19530)


ef = BGEM3EmbeddingFunction(use_fp16=False, device="cpu")
dense_dim = ef.dim["dense"]

# Generate embeddings using BGE-M3 model
docs_embeddings = ef(first_value['chunks'])

# Specify the data schema for the new Collection
fields = [
    # Use auto generated id as primary key
    FieldSchema(
        name="pk", dtype=DataType.VARCHAR, is_primary=True, auto_id=True, max_length=100
    ),
    # Store the original text to retrieve based on semantically distance
    FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=512),
    FieldSchema(name="hashed_title", dtype=DataType.VARCHAR, max_length=512),
    FieldSchema(name="url", dtype=DataType.VARCHAR, max_length=512),
    # Milvus now supports both sparse and dense vectors,
    # we can store each in a separate field to conduct hybrid search on both vectors
    FieldSchema(name="sparse_vector", dtype=DataType.SPARSE_FLOAT_VECTOR),
    FieldSchema(name="dense_vector", dtype=DataType.FLOAT_VECTOR, dim=len(dense_embedded_text)),
]
schema = CollectionSchema(fields)

# Create collection (drop the old one if exists)
col_name = "hybrid_demo"
if utility.has_collection(col_name):
    Collection(col_name).drop()
col = Collection(col_name, schema, consistency_level="Strong")

# To make vector search efficient, we need to create indices for the vector fields
sparse_index = {"index_type": "SPARSE_INVERTED_INDEX", "metric_type": "IP"}
col.create_index("sparse_vector", sparse_index)
dense_index = {"index_type": "AUTOINDEX", "metric_type": "IP"}
col.create_index("dense_vector", dense_index)
col.load()

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [None]:
for k,v in json_data.items():
    for el in v['summaries']:
        sparse_docs_embedding = ef([el])    
        dense_docs_embedding = dense_embedder_model.embed_query(el)
        col.insert({"text": el, "hashed_title": k, "sparse_vector": sparse_docs_embedding.get('sparse'), "dense_vector": dense_docs_embedding, "url": v.get('url',None)})


{'model': 'text-embedding-nomic-embed-text-v1.5', 'input': ['Ring push-ups offer a greater range of motion, enhancing muscle stretch for better growth. They allow personalized movement based on individual flexibility and body type, unlike fixed-floor push-ups. Rings also make workouts more comfortable and effective.']}
{'model': 'text-embedding-nomic-embed-text-v1.5', 'input': ['Popular calisthenics workout splits include:\n\n- **Full Body**: Works all muscles in each session.\n- **Upper/Lower**: Separates upper body and leg days.\n- **Bent Arm/Straight Arm**: Focuses on scapular strength for front levers, planches.\n- **Push/Pull/Legs**: Classic split for upper body emphasis.\n\nWork out 3–5 days/week with 24–48 hours rest between same-muscle sessions. Structure is key for progressive overload and injury prevention.']}


In [123]:
from pymilvus import AnnSearchRequest

query = "best way to train the chest"


search_param_1 = {
    "data": ef([query])['sparse'], # Query vector
    "anns_field": "sparse_vector", # Vector field name
    "param": {
        "metric_type": "IP", # This parameter value must be identical to the one used in the collection schema
        "params": {"nprobe": 10}
    },
    "limit": 2 # Number of search results to return in this AnnSearchRequest
}
request_1 = AnnSearchRequest(**search_param_1)


search_param_2 = {
    "data": [embedder_model.embed_query(query)], # Query vector
    "anns_field": "dense_vector", # Vector field name
    "param": {
        "metric_type": "IP", # This parameter value must be identical to the one used in the collection schema
        "params": {"nprobe": 10}
    },
    "limit": 2 # Number of search results to return in this AnnSearchRequest
}
request_2 = AnnSearchRequest(**search_param_2)
reqs = [request_1, request_2]

{'model': 'text-embedding-nomic-embed-text-v1.5', 'input': ['best way to train the chest']}


In [124]:
from pymilvus import RRFRanker

rerank = RRFRanker()

In [125]:
# Before conducting hybrid search, load the collection into memory.
col.load()

res = col.hybrid_search(
    reqs, # List of AnnSearchRequests created in step 1
    rerank, # Reranking strategy specified in step 2
    limit=2 # Number of final search results to return
)

print(res)


data: [[{'pk': '459267883574287319', 'distance': 0.016393441706895828, 'entity': {}}, {'pk': '459267883574287317', 'distance': 0.016129031777381897, 'entity': {}}]]


None


In [136]:
from pymilvus import MilvusClient
client = MilvusClient(
    uri="http://localhost:19530",
    token="root:Milvus"
)

search_res = client.search(
    collection_name=col,
    data=[
        ef([query])['sparse']
    ],  # Use the `emb_text` function to convert the question to an embedding vector
    limit=3,  # Return top 3 results
    search_params={"metric_type": "IP", "params": {}},  # Inner product distance
    output_fields=["text"],  # Return the text field
)

2025-07-09 10:52:25,799 [ERROR][handler]: Unexpected error: [search], bad argument type for built-in operation, <Time: {'RPC start': '2025-07-09 10:52:25.798802', 'Exception': '2025-07-09 10:52:25.799779'}> (decorators.py:158)
2025-07-09 10:52:25,801 [ERROR][search]: Failed to search collection: <Collection>:
-------------
<name>: hybrid_demo
<description>: 
<schema>: {'auto_id': True, 'description': '', 'fields': [{'name': 'pk', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 100}, 'is_primary': True, 'auto_id': True}, {'name': 'text', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 512}}, {'name': 'hashed_title', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 512}}, {'name': 'url', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 512}}, {'name': 'sparse_vector', 'description': '', 'type': <DataType.SPARSE_FLOAT_VECTOR: 104>}, {'name': 'dense_vector', 'description': '', 'typ

MilvusException: <MilvusException: (code=1, message=Unexpected error, message=<bad argument type for built-in operation>)>

In [132]:
embedder_model.embed_query(query)

{'model': 'text-embedding-nomic-embed-text-v1.5', 'input': ['best way to train the chest']}


[-0.007403921335935593,
 0.06478507071733475,
 -0.17964208126068115,
 0.006472209934145212,
 0.012696708552539349,
 0.014536282047629356,
 0.05450385808944702,
 0.006805232260376215,
 -0.011155310086905956,
 -0.010250093415379524,
 -0.05575506016612053,
 0.012975500896573067,
 0.05085764452815056,
 -0.02891538478434086,
 0.01724827289581299,
 -0.029981272295117378,
 0.038042981177568436,
 -0.07529429346323013,
 -0.0020745955407619476,
 -0.04420102387666702,
 -0.0003279711527284235,
 -0.011187771335244179,
 -0.0014194258255884051,
 -0.003572396468371153,
 0.03945126011967659,
 -0.06282556056976318,
 0.01200133003294468,
 0.023427199572324753,
 -0.020257074385881424,
 0.05677046999335289,
 0.03473464399576187,
 -0.06915283203125,
 0.015543552115559578,
 -0.02046482264995575,
 0.0035623067524284124,
 -0.017131373286247253,
 0.005112999118864536,
 0.022169901058077812,
 0.009738877415657043,
 -0.03499673679471016,
 0.040328674018383026,
 -0.07368399202823639,
 0.01966382935643196,
 -0.0201

In [74]:
import json 

with open("results.json") as file: 
    json_data = json.load(file)


In [1]:
!pip install langchain_community
!pip install
!pip install yt_dlp
!pip install librosa
!pip install pydub
!pip install pytube
!pip install ipywidgets
!pip install langchain_milvus
!pip install pymilvus
!pip install pymilvus[model]
!pip install torch
!pip install FlagEmbedding

Collecting langchain_community
  Using cached langchain_community-0.3.27-py3-none-any.whl.metadata (2.9 kB)
Collecting langchain-core<1.0.0,>=0.3.66 (from langchain_community)
  Using cached langchain_core-0.3.68-py3-none-any.whl.metadata (5.8 kB)
Collecting langchain<1.0.0,>=0.3.26 (from langchain_community)
  Using cached langchain-0.3.26-py3-none-any.whl.metadata (7.8 kB)
Collecting SQLAlchemy<3,>=1.4 (from langchain_community)
  Downloading sqlalchemy-2.0.41-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Collecting requests<3,>=2 (from langchain_community)
  Using cached requests-2.32.4-py3-none-any.whl.metadata (4.9 kB)
Collecting PyYAML>=5.3 (from langchain_community)
  Downloading PyYAML-6.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.1 kB)
Collecting aiohttp<4.0.0,>=3.8.3 (from langchain_community)
  Downloading aiohttp-3.12.13-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.6 kB)
Collecting tenac

In [10]:
docs_embeddings = ef.encode_documents(first_value['chunks'])

In [47]:
docs[0].metadata.get("source").split("\\")[-1]

'Are Push-Ups Worth Doing On Rings？.m4a'

In [45]:
docs[0].metadata.get("source").split("\\")[-1]

'Are Push-Ups Worth Doing On Rings？.m4a'

In [114]:
import json 
with open('results.json', "w") as f:
        json.dump(file_chunks_dict, f)
