# 1. Setup Your Google Colab Environment

In [49]:
# Core RAG libraries
!pip install langchain pypdf unstructured chromadb sentence-transformers

# For Image processing
!pip install Pillow pytesseract # Python Imaging Library and Tesseract wrapper
!sudo apt-get install tesseract-ocr # Install Tesseract OCR engine (system-wide)

# For Video processing
!pip install moviepy SpeechRecognition # Extract audio from video, perform speech-to-text
!pip install openai-whisper # For a more advanced, local Whisper transcription
!sudo apt-get install ffmpeg # Install FFmpeg (system-wide)

# Installing LangChain
!pip install -U langchain-community

# For OpenRouter Deepseek LLM
!pip install langchain_openai openai

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.


# 2. Prepare Your Data Directories

In [50]:
import os

# Define main data directory
media_data_dir = 'multimedia_rag_data'
images_dir = os.path.join(media_data_dir, 'images')
videos_dir = os.path.join(media_data_dir, 'videos')

# Create directories if they don't exist
for d in [media_data_dir, images_dir, videos_dir]:
    if not os.path.exists(d):
        os.makedirs(d)
        print(f"Directory '{d}' created.")
    else:
        print(f"Directory '{d}' already exists.")

# Example: Place your .jpg, .png, .mp4, .avi files into these directories
# You can upload them manually in Colab or mount Google Drive.

Directory 'multimedia_rag_data' already exists.
Directory 'multimedia_rag_data/images' already exists.
Directory 'multimedia_rag_data/videos' already exists.


# 3. Extract Text Content from Media Files

3.1. Image Processing (OCR & Captioning)

In [52]:
from PIL import Image
import pytesseract
from langchain.docstore.document import Document

def extract_text_from_image(image_path: str) -> str:
    """Extracts text from an image using Tesseract OCR."""
    try:
        img = Image.open(image_path)
        text = pytesseract.image_to_string(img)
        return text.strip()
    except Exception as e:
        print(f"Error processing image {image_path} with OCR: {e}")
        return ""

def process_image_for_rag(image_path: str) -> Document:
    """Processes an image, extracts text, and returns a LangChain Document."""
    extracted_text = extract_text_from_image(image_path)
    # You could add a simple prompt to Deepseek here for image captioning
    # if you don't find enough text via OCR and want a description.
    # For now, we rely on OCR text.

    # Create a descriptive content string for the document
    content = f"Image file: {os.path.basename(image_path)}\n"
    if extracted_text:
        content += f"Extracted text from image: {extracted_text}\n"
    else:
        content += "No readable text found via OCR in this image.\n"
        # Placeholder for where image captioning would go if you had a VLM
        content += "Consider this image's content based on its visual appearance if a description were available."

    return Document(page_content=content, metadata={"source": image_path, "type": "image"})

all_image_documents = []
for filename in os.listdir(images_dir):
    if filename.lower().endswith(('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif')):
        image_path = os.path.join(images_dir, filename)
        doc = process_image_for_rag(image_path)
        all_image_documents.append(doc)
        print(f"Processed image: {filename}")

print(f"Total image documents created: {len(all_image_documents)}")

Processed image: 9.jpg
Processed image: 10.jpg
Processed image: 11.jpg
Total image documents created: 3


3.2. Video Processing (Audio Transcription)

In [53]:
import moviepy.editor as mp
import speech_recognition as sr
# If you want to use OpenAI Whisper (local, better quality but larger model download)
# !pip install -q transformers # for whisper-large-v3, if you use it
# from transformers import pipeline

def extract_audio_from_video(video_path: str, audio_output_path: str):
    """Extracts audio from a video file."""
    try:
        video = mp.VideoFileClip(video_path)
        video.audio.write_audiofile(audio_output_path, verbose=False, logger=None)
        return True
    except Exception as e:
        print(f"Error extracting audio from {video_path}: {e}")
        return False

def transcribe_audio(audio_path: str) -> str:
    """Transcribes audio to text using Google Web Speech API (free, online) or local Whisper."""
    # Option A: Google Web Speech API (requires internet, rate limits apply)
    r = sr.Recognizer()
    try:
        with sr.AudioFile(audio_path) as source:
            audio_data = r.record(source)
        text = r.recognize_google(audio_data)
        return text
    except sr.UnknownValueError:
        print("Google Speech Recognition could not understand audio")
        return ""
    except sr.RequestError as e:
        print(f"Could not request results from Google Speech Recognition service; {e}")
        return ""
    except Exception as e:
        print(f"Error during audio transcription: {e}")
        return ""

    # Option B: Local Whisper (larger download, slower on CPU, but higher quality and no API key)
    # To use Whisper:
    # 1. Uncomment the !pip install transformers and from transformers.pipeline lines above.
    # 2. Uncomment the code below and comment out Option A.
    # try:
    #     print("Loading Whisper model (first time may take a while)...")
    #     # Use a smaller model like 'tiny.en' or 'base.en' for faster local processing if needed
    #     transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-tiny.en")
    #     result = transcriber(audio_path)
    #     return result["text"]
    # except Exception as e:
    #     print(f"Error with Whisper transcription: {e}")
    #     return ""

def process_video_for_rag(video_path: str) -> Document:
    """Processes a video, transcribes audio, and returns a LangChain Document."""
    audio_output_path = video_path + ".wav"
    transcribed_text = ""
    if extract_audio_from_video(video_path, audio_output_path):
        transcribed_text = transcribe_audio(audio_output_path)
        os.remove(audio_output_path) # Clean up audio file

    content = f"Video file: {os.path.basename(video_path)}\n"
    if transcribed_text:
        content += f"Transcribed audio from video: {transcribed_text}\n"
    else:
        content += "No audio or could not transcribe audio from this video.\n"
        content += "Consider the visual content of the video if a description were available."

    return Document(page_content=content, metadata={"source": video_path, "type": "video"})

all_video_documents = []
for filename in os.listdir(videos_dir):
    if filename.lower().endswith(('.mp4', '.avi', '.mov', '.mkv')):
        video_path = os.path.join(videos_dir, filename)
        doc = process_video_for_rag(video_path)
        all_video_documents.append(doc)
        print(f"Processed video: {filename}")

print(f"Total video documents created: {len(all_video_documents)}")

Could not request results from Google Speech Recognition service; recognition request failed: Bad Request
Processed video: overview.mp4
Processed video: cricket.mp4
Total video documents created: 2


3.3. Combine All Documents and Split into Chunks

In [54]:
# Assuming 'documents' from your previous PDF/text loading step is available.
# If not, you'd load PDFs/text here again, similar to previous guides.
# For this example, let's just combine:
all_rag_documents = all_image_documents + all_video_documents
# If you have original PDF/text documents from a previous run:
# all_rag_documents = all_image_documents + all_video_documents + documents # Add your previously loaded PDF/text docs

# Split the combined documents into chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200
)
chunks = text_splitter.split_documents(all_rag_documents)

print(f"Total chunks from all multimedia data: {len(chunks)}")
print(f"Example chunk metadata: {chunks[0].metadata}") # Check 'type' and 'source'

Total chunks from all multimedia data: 5
Example chunk metadata: {'source': 'multimedia_rag_data/images/9.jpg', 'type': 'image'}


# 4. Create Embeddings with Hugging Face Model

In [55]:
from langchain.embeddings import HuggingFaceEmbeddings

model_name = "sentence-transformers/all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=model_name)

print(f"Hugging Face embedding model '{model_name}' configured for all data types.")

Hugging Face embedding model 'sentence-transformers/all-MiniLM-L6-v2' configured for all data types.


# 5. Store Embeddings in ChromaDB

In [56]:
from langchain.vectorstores import Chroma

# Create a ChromaDB vector store
vector_store = Chroma.from_documents(
    chunks,
    embeddings
)

print("All multimedia chunks successfully embedded and stored in ChromaDB.")

All multimedia chunks successfully embedded and stored in ChromaDB.


# 6. Query and Retrieve from Embeddings

6.1. Set up the Retriever

In [57]:
retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 5} # Retrieve top 5 most similar chunks
)

print("Retriever configured for multimedia RAG.")

Retriever configured for multimedia RAG.


6.2. Integrate with Your Deepseek LLM

In [58]:
from openai import OpenAI
from google.colab import userdata
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI

# Initialize the OpenRouter client for Deepseek
llm = ChatOpenAI(
    openai_api_base="https://openrouter.ai/api/v1",
    openai_api_key=userdata.get("OPENROUTER_API_KEY"),
    model_name="deepseek/deepseek-chat-v3-0324:free",
    temperature=0
)

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True
)

print("QA chain with Deepseek LLM ready for multimedia queries.")

QA chain with Deepseek LLM ready for multimedia queries.


# 7. Test and Validate the Multimedia Pipeline

In [59]:
# --- Test Queries ---

# Query 1: Ask about content that might be in an image (e.g., text from a sign)
query1 = "What is written in the image files?"
response1 = qa_chain.invoke({"query": query1})
print(f"\n--- Query 1 (Image-focused) ---")
print(f"Question: {query1}")
print(f"Answer: {response1['result']}")
print("\nSource documents:")
for i, doc in enumerate(response1["source_documents"]):
    source_info = doc.metadata.get('source', 'Unknown Source')
    type_info = doc.metadata.get('type', 'Unknown Type')
    print(f"- Doc {i+1} (Source: {source_info}, Type: {type_info}): {doc.page_content[:200]}...")

# Query 2: Ask about content that might be in a video (e.g., spoken dialogue)
query2 = "What type of discussion is present in the video files?"
response2 = qa_chain.invoke({"query": query2})
print(f"\n--- Query 2 (Video-focused) ---")
print(f"Question: {query2}")
print(f"Answer: {response2['result']}")
print("\nSource documents:")
for i, doc in enumerate(response2["source_documents"]):
    source_info = doc.metadata.get('source', 'Unknown Source')
    type_info = doc.metadata.get('type', 'Unknown Type')
    print(f"- Doc {i+1} (Source: {source_info}, Type: {type_info}): {doc.page_content[:200]}...")

# Query 3: A mixed query (if you also have PDFs/text files in your 'multimedia_rag_data')
# For this to work, make sure you combined all_image_documents, all_video_documents, and your PDF/text 'documents' earlier.
# query3 = "Summarize the points made about the new product launch across all documents, including presentations and video announcements."
# response3 = qa_chain.invoke({"query": query3})
# print(f"\n--- Query 3 (Mixed) ---")
# print(f"Question: {query3}")
# print(f"Answer: {response3['result']}")
# print("\nSource documents:")
# for i, doc in enumerate(response3["source_documents"]):
#     source_info = doc.metadata.get('source', 'Unknown Source')
#     type_info = doc.metadata.get('type', 'Unknown Type')
#     print(f"- Doc {i+1} (Source: {source_info}, Type: {type_info}): {doc.page_content[:200]}...")


--- Query 1 (Image-focused) ---
Question: What is written in the image files?
Answer: The only image file with readable text is **11.jpg**, which contains the following text:  

```
Neel  
as ee  

| Zsa abies  

   

ort  

May  
```  

The other images (**1.jpg** and **5.jpg**) do not have any readable text extracted via OCR. Let me know if you'd like further analysis or interpretation of the text from **11.jpg**.

Source documents:
- Doc 1 (Source: multimedia_rag_data/images/1.jpg, Type: image): Image file: 1.jpg
No readable text found via OCR in this image.
Consider this image's content based on its visual appearance if a description were available....
- Doc 2 (Source: multimedia_rag_data/images/5.jpg, Type: image): Image file: 5.jpg
No readable text found via OCR in this image.
Consider this image's content based on its visual appearance if a description were available....
- Doc 3 (Source: multimedia_rag_data/images/11.jpg, Type: image): Image file: 11.jpg
Extracted text from ima