In [1]:
#%pip install --quiet --upgrade langchain chromadb pydub faster-whisper pandas

In [2]:
#%pip install --quiet --upgrade langchain-community

In [3]:
#%pip install -qU "langchain-chroma>=0.1.2"

In [4]:
from pathlib import Path
import pandas as pd
from faster_whisper import WhisperModel
from langchain_core.documents import Document
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.llms import Ollama
from langchain_chroma import Chroma

  import pkg_resources
  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Directory the episode audio files are moved into and read from.
AUDIO_DIR = Path("mp3_files")
# Directory holding one cached "<audio stem>.txt" transcript per audio file.
TRANSCRIPT_CACHE = Path("transcripts")
# NOTE(review): presumably the on-disk location of the Chroma vector store;
# not used in the cells visible here -- confirm against the indexing cell.
CHROMA_DIR = Path("chroma_db")

# NOTE(review): presumably the faster-whisper model size passed to
# WhisperModel(...); not used in the cells visible here -- confirm.
WHISPER_MODEL_SIZE = "base"
# NOTE(review): presumably the Ollama model name used for embeddings -- confirm.
EMBEDDING_MODEL = "nomic-embed-text"
# NOTE(review): presumably the Ollama model name used for chat/answering -- confirm.
CHAT_MODEL = "gemma3:1b"

In [6]:
audio_file = "theHealthcareFightAtTheHeartOfTheShutdown.mp3"

# Move the episode from the working directory into AUDIO_DIR.
# The destination directory is created first: Path.rename() raises
# FileNotFoundError when the target's parent directory does not exist,
# which is the case on a fresh checkout.
# Re-running this cell after the move is a no-op because the source file
# is no longer present in the working directory.
if Path(audio_file).exists():
  AUDIO_DIR.mkdir(parents=True, exist_ok=True)
  Path(audio_file).rename(AUDIO_DIR / audio_file)

In [7]:
def transcribe_audio(audio_file, whisper_model):
  """Transcribe one audio file and return its text as a single string.

  Args:
    audio_file: Path (or string) of the audio file; converted with ``str()``
      before being handed to the model.
    whisper_model: Object whose ``transcribe(path, beam_size=...)`` returns a
      two-item result; the first item is an iterable of segments exposing a
      ``text`` attribute.

  Returns:
    The text of every segment that is not empty or whitespace-only,
    joined with single spaces.
  """
  segment_stream, _info = whisper_model.transcribe(str(audio_file), beam_size=5)

  spoken_parts = []
  for piece in segment_stream:
    # Drop segments that contain nothing but whitespace.
    if piece.text.strip():
      spoken_parts.append(piece.text)
  return " ".join(spoken_parts)

In [8]:
def transcibe_episode(audio_path, whisper_model):
  """Return the transcript for one audio file, backed by an on-disk cache.

  The transcript is stored as ``TRANSCRIPT_CACHE / "<audio stem>.txt"``.
  When that file already exists its content is returned and the model is
  not invoked; otherwise the audio is transcribed and the result is cached.

  Note: the function name is misspelled ("transcibe"); it is kept as-is
  because create_documents calls it by this name.

  Args:
    audio_path: Path (or string) of the audio file to transcribe.
    whisper_model: Model object forwarded unchanged to ``transcribe_audio``.

  Returns:
    The transcript text.
  """
  audio_path = Path(audio_path)  # accept plain strings as well as Path objects
  cache_path = TRANSCRIPT_CACHE / (audio_path.stem + ".txt")
  if cache_path.exists():
    print(f"Loading cached transcript for {audio_path.name}")
    return cache_path.read_text(encoding="utf-8")

  print(f"Transcribing {audio_path.name}...")
  transcript = transcribe_audio(audio_path, whisper_model)
  # Create the cache directory on first use; without it write_text() raises
  # FileNotFoundError and the (expensive) transcription result is lost.
  cache_path.parent.mkdir(parents=True, exist_ok=True)
  cache_path.write_text(transcript, encoding="utf-8")
  return transcript

In [9]:
def create_documents(df, whisper_model):
  """Build one ``Document`` per row of ``df`` from that row's transcript.

  Args:
    df: DataFrame with at least ``filename`` and ``episode_id`` columns.
      ``filename`` is resolved relative to ``AUDIO_DIR``. An optional
      ``filepath`` column, when present, is used as the document source.
    whisper_model: Model object forwarded to ``transcibe_episode``.

  Returns:
    A list of ``Document`` objects in row order (empty for an empty frame).
    Each has ``page_content`` of the form ``"Episode: <id>\\n\\n<transcript>"``
    and metadata keys ``episode_id`` and ``source``.
  """
  documents = []
  for row in df.itertuples():
    # itertuples() yields namedtuples: fields are attributes, not dict keys.
    audio_path = AUDIO_DIR / row.filename
    # Transcribe the file inside AUDIO_DIR, not the bare filename.
    transcript = transcibe_episode(audio_path, whisper_model)
    # Fall back to the resolved audio path when the frame has no
    # ``filepath`` column.
    source = getattr(row, "filepath", audio_path)
    documents.append(Document(
      page_content=f"Episode: {row.episode_id}\n\n{transcript}",
      metadata={"episode_id": row.episode_id, "source": str(source)}
    ))
  # Return only after every row is processed (not from inside the loop).
  return documents