In [10]:
!pip install --force-reinstall --upgrade transformers accelerate bitsandbytes gradio langchain pypdf qdrant-client sentence-transformers torch torchaudio openai-whisper TTS flask langchain_community whisperspeech webdataset whisper torchvision

Collecting transformers (from -r requirements.txt (line 1))
  Downloading transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Collecting accelerate (from -r requirements.txt (line 2))
  Downloading accelerate-1.6.0-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes (from -r requirements.txt (line 3))
  Using cached bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting gradio (from -r requirements.txt (line 4))
  Using cached gradio-5.29.0-py3-none-any.whl.metadata (16 kB)
Collecting langchain (from -r requirements.txt (line 5))
  Downloading langchain-0.3.25-py3-none-any.whl.metadata (7.8 kB)
Collecting pypdf (from -r requirements.txt (line 6))
  Using cached pypdf-5.5.0-py3-none-any.whl.metadata (7.2 kB)
Collecting qdrant-client (from -r requirements.txt (line 7))
  Using cached qdrant_client-1.14.2-py3-none-any.whl.metadata (10 kB)
Collecting sentence-transformers (from -r requirements.txt (line 8))
  Downloading sentence_transformers-4.1.0-py3-

In [4]:
# Install Intel Extension for PyTorch, then an editable build of bitsandbytes taken from a
# shallow clone (`--depth 1`) of its `multi-backend-refactor` branch.
# NOTE(review): `git clone` reports an error if ./bitsandbytes already exists, e.g. when this cell is re-run.
!pip install intel_extension_for_pytorch
!git clone --depth 1 -b multi-backend-refactor https://github.com/bitsandbytes-foundation/bitsandbytes.git
# Editable install of the checkout created by the clone on the previous line.
!pip install -e ./bitsandbytes/

Collecting intel_extension_for_pytorch
  Downloading intel_extension_for_pytorch-2.7.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (7.7 kB)
Downloading intel_extension_for_pytorch-2.7.0-cp311-cp311-manylinux_2_28_x86_64.whl (104.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.7/104.7 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: intel_extension_for_pytorch
Successfully installed intel_extension_for_pytorch-2.7.0
Cloning into 'bitsandbytes'...
remote: Enumerating objects: 198, done.[K
remote: Counting objects: 100% (198/198), done.[K
remote: Compressing objects: 100% (187/187), done.[K
remote: Total 198 (delta 20), reused 98 (delta 8), pack-reused 0 (from 0)[K
Receiving objects: 100% (198/198), 315.33 KiB | 9.55 MiB/s, done.
Resolving deltas: 100% (20/20), done.
Obtaining file:///content/bitsandbytes
  Installing build dependencies ... [?25l[?25hdone
  Checking if build backend supports build_editable ... [?25l[?25h

In [15]:
# Print the version of the `python` executable found on the shell PATH.
# NOTE(review): this is presumably the same interpreter as the kernel — confirm if the environment is customised.
!python --version

from transformers import (
AutoTokenizer,
AutoModelForCausalLM,
BitsAndBytesConfig,
pipeline
)

import transformers

import torch
import torch.nn.functional as F

import os

from langchain.llms import HuggingFacePipeline
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Qdrant
from langchain.embeddings import HuggingFaceEmbeddings

import gradio as gr

import whisper
import uuid

# Report the versions of the core libraries, one per line, for reproducibility.
print(
    f"Transformers version: {transformers.__version__}",
    f"Torch version: {torch.__version__}",
    f"Gradio version: {gr.__version__}",
    sep="\n",
)

Python 3.11.12
Transformers version: 4.51.3
Torch version: 2.7.0+cu126
Gradio version: 5.29.0


In [2]:
from whisperspeech.pipeline import Pipeline

In [3]:
def load_llm():
    """Load Mistral-7B-Instruct in 4-bit precision and wrap it in a text-generation pipeline.

    Returns:
        A ``transformers`` text-generation pipeline built around the quantized
        model and its tokenizer. Calling it with a prompt returns a list of
        dicts whose ``"generated_text"`` contains the prompt followed by the
        completion (``return_full_text=True``).
    """
    model_name = 'mistralai/Mistral-7B-Instruct-v0.2'

    # `trust_remote_code` is not needed here: the checkpoint uses classes that ship with
    # transformers, and leaving it off avoids executing code downloaded from the Hub.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # The tokenizer has no dedicated padding token, so the end-of-sequence token is reused.
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    # 4-bit NF4 quantization with double quantization; matmuls are computed in bfloat16.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

    # Device placement must be requested while the weights are loaded. Passing
    # `device_map` to `pipeline()` together with an already-instantiated model has no effect.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto",
    )

    # `temperature` is only honoured in sampling mode, so sampling is enabled explicitly;
    # without `do_sample=True` generation is greedy and the temperature is ignored.
    return pipeline(
        task="text-generation",
        model=model,
        tokenizer=tokenizer,
        do_sample=True,
        temperature=0.2,
        repetition_penalty=1.1,
        return_full_text=True,
        max_new_tokens=1024,
    )

In [4]:
def text_splitter():
    """Create the splitter used to cut loaded documents into chunks.

    Returns:
        A ``RecursiveCharacterTextSplitter`` configured for chunks of 512
        characters (measured with ``len``) with an overlap of 20 characters;
        separators are treated as plain strings, not regular expressions.
    """
    return RecursiveCharacterTextSplitter(
        is_separator_regex=False,
        length_function=len,
        chunk_overlap=20,
        chunk_size=512,
    )

In [5]:
def add_pdfs_to_vectorstore(files):
    """Index the uploaded PDF files in a fresh in-memory Qdrant vector store.

    Every path ending in ``.pdf`` (case-insensitive) is loaded, split with the
    module-level ``textsplitter`` and has its newlines replaced by spaces. The
    resulting chunks are embedded and stored in the module-level ``qdrant``
    variable, replacing any store built by a previous call.

    Args:
        files: Iterable of file paths, or ``None`` / empty when nothing was
            uploaded.

    Returns:
        A human-readable status message. When no PDF (or no text) is found the
        existing ``qdrant`` store, if any, is left untouched.
    """
    global qdrant

    saved_files_count = 0
    documents = []
    # `files` is None when the upload component is cleared, so treat it as empty.
    for file_path in files or []:
        file_name = os.path.basename(file_path)
        if not file_name.lower().endswith('.pdf'):
            print(f"Skipping non-PDF file: {file_name}")
            continue

        saved_files_count += 1
        docs_temp = PyPDFLoader(file_path).load_and_split(text_splitter=textsplitter)
        for doc in docs_temp:
            # Flatten line breaks so each chunk reads as continuous text.
            doc.page_content = doc.page_content.replace('\n', ' ')
        documents.extend(docs_temp)

    if saved_files_count == 0:
        return "No PDF files were added. Upload at least one PDF to begin voice chat."
    if not documents:
        # Building a vector store from an empty document list fails, so stop here.
        return "No text could be extracted from the uploaded PDF file(s)."

    qdrant = Qdrant.from_documents(
        documents,
        HuggingFaceEmbeddings(),
        location=":memory:",
    )

    return f"Added {saved_files_count} PDF file(s) to vectorstore. You can begin voice chat"

In [6]:
def answer_query(message):
    """Answer a question using the indexed PDF chunks as context.

    The ten chunks most similar to ``message`` are fetched from the
    module-level ``qdrant`` store, joined into one context string and sent to
    the module-level ``llm`` pipeline together with the question.

    Args:
        message: The user's question as text.

    Returns:
        The model's answer, or an explanatory message when the question is
        empty or no vector store has been built yet.
    """
    if not message or not message.strip():
        return "I did not catch a question. Please record your question and try again."

    # `qdrant` only exists after a successful PDF upload; look it up defensively
    # instead of failing with a NameError.
    vectorstore = globals().get("qdrant")
    if vectorstore is None:
        return "Please upload at least one PDF before asking a question."

    context_docs = vectorstore.similarity_search(message, k=10)
    context = ' '.join(doc.page_content for doc in context_docs)

    template = f"""Answer the question based only on the following context:
        {context}

        Question: {message}
    """

    # Ask the pipeline for the completion only, rather than stripping the prompt
    # from the full text afterwards.
    result = llm(template, return_full_text=False)

    return result[0]["generated_text"].strip()

In [10]:
import os
from getpass import getpass

from huggingface_hub import login

# SECURITY: never store an access token in the notebook. The token that was previously
# hardcoded in this cell must be considered leaked and should be revoked.
# The token is read from the HF_TOKEN environment variable, falling back to an
# interactive prompt whose input is not echoed or saved in the notebook.
login(token=os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: "))

In [11]:
# Instantiate the shared models once; functions in the other cells read these module-level names.
# Whisper "base" checkpoint, used by transcribe() for speech-to-text.
whisper_model = whisper.load_model("base")
# WhisperSpeech pipeline, used by generate_and_play_audio() for text-to-speech.
whisper_speech_model = Pipeline(s2a_ref='collabora/whisperspeech:s2a-q4-tiny-en+pl.model')
# Text-generation pipeline returned by load_llm(), called by answer_query().
llm = load_llm()
# Splitter instance that add_pdfs_to_vectorstore() reads under the name `textsplitter`.
textsplitter = text_splitter()

tokenizer_config.json:   0%|          | 0.00/2.10k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

Device set to use cuda:0


In [12]:
def generate_and_play_audio(text):
    """Synthesize ``text`` to a WAV file and return the file's path.

    Args:
        text: The text to speak.

    Returns:
        Path of the generated ``audio.wav`` inside a freshly created temporary
        directory, or ``None`` when ``text`` is empty so that no audio is
        produced.
    """
    import tempfile  # local import keeps this cell runnable on its own

    if not text or not text.strip():
        return None

    # A unique directory under the system temp location replaces the hardcoded
    # '/var/tmp/gradio/' path, which is not portable.
    # NOTE(review): Gradio 5 appears to serve returned files only from the working
    # directory, the system temp directory or `allowed_paths` — confirm against the docs.
    output_dir = tempfile.mkdtemp(prefix="gradio_tts_")
    file_location = os.path.join(output_dir, "audio.wav")

    # Generate the audio file from text and save it to the chosen location.
    whisper_speech_model.generate_to_file(file_location, text, lang='en', cps=15)

    return file_location

In [13]:
def transcribe(audio):
    """Transcribe a recorded audio file to text with the module-level Whisper model.

    Args:
        audio: Path of the audio file, or ``None`` / empty when nothing was
            recorded.

    Returns:
        The transcribed text with surrounding whitespace removed, or an empty
        string when there is no audio to transcribe.
    """
    # Nothing was recorded: return an empty transcription instead of failing.
    if not audio:
        return ""

    result = whisper_model.transcribe(audio)
    return result["text"].strip()




In [14]:
# Gradio UI: upload PDFs, record a question, then show the transcription, the model's
# answer, and the answer as synthesized speech.
with gr.Blocks() as demo:

    with gr.Row():
        upload_files = gr.File(label="Upload pdf files only", file_count='multiple')
        success_msg = gr.Text(value="")

    with gr.Row():
        audio_inp = gr.Audio(sources="microphone", type='filepath')
        trans_out = gr.Textbox()

    with gr.Row():
        btn_audio = gr.Button("Submit Audio")

    with gr.Row():
        model_response = gr.Textbox(label= "Model Response", lines = 20)
        audio_out = gr.Audio(label="AI response in Voice")

    with gr.Row():
        clear_btn = gr.Button("Clear All")

    # Uploading files (re)builds the vector store and reports the outcome.
    upload_files.upload(add_pdfs_to_vectorstore, upload_files, success_msg)

    # Chain: speech-to-text -> answer generation -> text-to-speech.
    # The event handles get their own names so the `transcribe` function is not
    # rebound to an event object, which made re-running this cell unreliable.
    transcribe_event = btn_audio.click(fn=transcribe, inputs=audio_inp, outputs=trans_out)
    answer_event = transcribe_event.then(fn=answer_query, inputs=trans_out, outputs=model_response)
    answer_event.then(fn=generate_and_play_audio, inputs=model_response, outputs=audio_out)

    # Reset the recording, the transcription, the answer and the spoken answer.
    clear_btn.click(
        fn=lambda: (None, "", "", None),
        inputs=None,
        outputs=[audio_inp, trans_out, model_response, audio_out],
    )

# NOTE(review): share=True publishes the app on a public gradio.live URL — confirm this is intended.
demo.queue().launch(server_name= "0.0.0.0", share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://0a5d00507e6f9b301b.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


