In [8]:
!pip install bitsandbytes-cuda118

ERROR: Could not find a version that satisfies the requirement bitsandbytes-cuda118 (from versions: none)
ERROR: No matching distribution found for bitsandbytes-cuda118


In [None]:
!pip install langchain_community whisperspeech

In [None]:
!pip install numpy==1.22

In [None]:
# Note, if you don't want to reinstall BNBs dependencies, append the `--no-deps` flag!
!pip install --force-reinstall https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_multi-backend-refactor/bitsandbytes-0.44.1.dev0-py3-none-win_amd64.whl

In [None]:
!pip install intel_extension_for_pytorch
!git clone --depth 1 -b multi-backend-refactor https://github.com/bitsandbytes-foundation/bitsandbytes.git && cd bitsandbytes/
!pip install -e .

In [None]:
!pip install --force-reinstall -r requirements.txt

In [1]:
from transformers import (
AutoTokenizer,
AutoModelForCausalLM,
BitsAndBytesConfig,
pipeline
)

import torch
import torch.nn.functional as F

import os

from langchain.llms import HuggingFacePipeline
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Qdrant
from langchain.embeddings import HuggingFaceEmbeddings

import gradio as gr

import whisper
from whisperspeech.pipeline import Pipeline

import uuid

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def load_llm(model_name='mistralai/Mistral-7B-Instruct-v0.2'):
    """Load a 4-bit quantized causal LM and wrap it in a text-generation pipeline.

    Args:
        model_name: Hugging Face model id to load. Defaults to the Mistral 7B
            Instruct v0.2 checkpoint used throughout this notebook.

    Returns:
        A ``transformers`` text-generation pipeline. Because
        ``return_full_text=True``, the generated text includes the prompt.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    # Reuse the end-of-sequence token for padding and pad on the right.
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    # 4-bit NF4 weights with double quantization; matmuls are computed in bfloat16.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16
    )

    # Device placement has to be requested when the weights are loaded; passing
    # `device_map` to `pipeline()` together with an already-built model has no effect.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map='auto',
    )

    # Building a LLM text-generation pipeline
    text_generation_pipeline = pipeline(
        model=model,
        tokenizer=tokenizer,
        task="text-generation",
        # `temperature` is only honoured when sampling is enabled.
        do_sample=True,
        temperature=0.2,
        repetition_penalty=1.1,
        return_full_text=True,
        max_new_tokens=1024,
    )

    return text_generation_pipeline

In [3]:
def text_splitter():
    """Create the splitter used to chunk loaded documents.

    Returns:
        A ``RecursiveCharacterTextSplitter`` configured with ``chunk_size=512``,
        ``chunk_overlap=20``, ``len`` as the length function and separators that
        are not interpreted as regular expressions.
    """
    splitter_options = {
        "chunk_size": 512,
        "chunk_overlap": 20,
        "length_function": len,
        "is_separator_regex": False,
    }
    return RecursiveCharacterTextSplitter(**splitter_options)

In [4]:
def add_pdfs_to_vectorstore(files):
    """Index the uploaded PDF files in a fresh in-memory Qdrant vector store.

    Every PDF is loaded and chunked with the module-level ``textsplitter``, newlines
    inside each chunk are replaced by spaces, and all chunks are embedded with
    ``HuggingFaceEmbeddings`` into a new store that is bound to the global ``qdrant``.

    Args:
        files: Iterable of file paths; may be ``None`` or empty. Entries whose file
            name does not end in ``.pdf`` (case-insensitive) are skipped.

    Returns:
        A status message for the UI. When no PDF content was found, the existing
        ``qdrant`` store (if any) is left untouched.
    """
    global qdrant

    saved_files_count = 0
    documents = []
    for file_path in files or []:
        file_name = os.path.basename(file_path)  # Extract the filename from the full path
        if not file_name.lower().endswith('.pdf'):
            print(f"Skipping non-PDF file: {file_name}")
            continue

        saved_files_count += 1
        docs_temp = PyPDFLoader(file_path).load_and_split(text_splitter=textsplitter)
        for doc in docs_temp:
            # Replace all occurrences of '\n' with a space ' '
            doc.page_content = doc.page_content.replace('\n', ' ')
        documents.extend(docs_temp)

    # Building a store from an empty document list fails, so stop early instead.
    if not documents:
        return "No PDF content found. Please upload at least one PDF file."

    qdrant = Qdrant.from_documents(
        documents,
        HuggingFaceEmbeddings(),
        location=":memory:",
    )

    return f"Added {saved_files_count} PDF file(s) to vectorstore. You can begin voice chat"

In [5]:
def answer_query(message):
    """Answer ``message`` using context retrieved from the global ``qdrant`` store.

    The 10 most similar chunks are joined into a context string, inserted into a
    prompt together with the question, and sent to the global ``llm`` pipeline.
    The prompt is then removed from the generated text.

    Args:
        message: The user's question as text.

    Returns:
        The model's answer, or a short instruction for the user when the question
        is empty or no documents have been indexed yet.
    """
    if not message or not message.strip():
        return "I could not hear a question. Please record your question and try again."

    # `qdrant` only exists after a successful upload; look it up defensively so an
    # early question yields a helpful message instead of a NameError.
    vectorstore = globals().get("qdrant")
    if vectorstore is None:
        return "Please upload at least one PDF file before asking a question."

    context_docs = vectorstore.similarity_search(message, k=10)
    context = ' '.join(doc.page_content for doc in context_docs)

    template = f"""Answer the question based only on the following context:
        {context}

        Question: {message}
    """

    result = llm(template)

    # The generated text includes the prompt; strip it to keep only the answer.
    answer = result[0]["generated_text"].replace(template, '')

    return answer

In [6]:
from huggingface_hub import login

# Never store an access token in the notebook. Read it from the HF_TOKEN environment
# variable; when the variable is unset, `token=None` makes `login` ask for it
# interactively.
# NOTE: a token was previously hardcoded in this cell and must be considered leaked;
# revoke it in the Hugging Face account settings.
login(token=os.environ.get("HF_TOKEN"))

In [8]:
# Chunking helper used when PDFs are ingested.
textsplitter = text_splitter()

# Speech-to-text model (Whisper "base" checkpoint).
whisper_model = whisper.load_model("base")

# Text-to-speech pipeline.
whisper_speech_model = Pipeline(
    s2a_ref='collabora/whisperspeech:s2a-q4-tiny-en+pl.model',
)

# Text-generation pipeline that answers the questions.
llm = load_llm()

Fetching 3 files: 100%|██████████| 3/3 [23:30<00:00, 470.30s/it] 
Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]


AttributeError: function 'cquantize_blockwise_fp16_nf4' not found

In [None]:
def generate_and_play_audio(text):
    """Synthesize ``text`` to a WAV file and return the file's path.

    The file is written to ``<system temp dir>/gradio/<uuid4>/audio.wav`` so every
    call gets its own directory. The system temp directory is used instead of a
    hardcoded ``/var/tmp`` path so the function also works on non-POSIX systems.

    Args:
        text: The text to speak.

    Returns:
        Path of the generated audio file, or ``None`` when ``text`` is empty or
        only whitespace (nothing to synthesize).
    """
    import tempfile  # local import: only this function needs it

    if not text or not text.strip():
        return None

    # Construct a unique output directory and make sure it exists
    output_dir = os.path.join(tempfile.gettempdir(), "gradio", str(uuid.uuid4()))
    os.makedirs(output_dir, exist_ok=True)
    file_location = os.path.join(output_dir, "audio.wav")

    # Generate the audio file from text and save to the specified location
    whisper_speech_model.generate_to_file(file_location, text, lang='en', cps=15)

    # Return the location of the saved audio file for playback
    return file_location

In [None]:
def transcribe(audio):
    """Transcribe a recorded audio file with the global ``whisper_model``.

    Args:
        audio: Path of the audio file to transcribe, or ``None``/empty when
            nothing was recorded.

    Returns:
        The transcribed text, or an empty string when no audio was provided.
    """
    # Nothing recorded: skip the model call instead of failing inside Whisper.
    if not audio:
        return ""

    result = whisper_model.transcribe(audio)
    return result["text"]


                    

In [None]:
# Gradio UI: upload PDFs, record a question, then show and speak the model's answer.
with gr.Blocks() as demo:

    with gr.Row():
        upload_files = gr.File(label="Upload pdf files only", file_count='multiple')
        success_msg = gr.Text(value="")

    with gr.Row():
        audio_inp = gr.Audio(sources="microphone", type='filepath')
        trans_out = gr.Textbox()

    with gr.Row():
        btn_audio = gr.Button("Submit Audio")

    with gr.Row():
        model_response = gr.Textbox(label= "Model Response", lines = 20)
        audio_out = gr.Audio(label="AI response in Voice")

    with gr.Row():
        clear_btn = gr.Button("Clear All")

    # Index the PDFs as soon as they are uploaded.
    upload_files.upload(add_pdfs_to_vectorstore, upload_files, success_msg)

    # Event chain: transcribe the recording -> answer the question -> speak the answer.
    # The event handle gets its own name: rebinding `transcribe` here would shadow the
    # transcription function and break this cell the next time it is run.
    transcribe_event = btn_audio.click(fn=transcribe, inputs=audio_inp, outputs=trans_out)
    answer_gen = transcribe_event.then(fn=answer_query, inputs= trans_out, outputs= model_response)
    answer_gen.then(fn=generate_and_play_audio, inputs= model_response, outputs= audio_out)

    # Reset the recording, the transcript, the answer text and the answer audio.
    clear_btn.click(fn= lambda: (None,"","",None), inputs= None, outputs= [audio_inp, trans_out, model_response, audio_out])

# NOTE(review): `server_name="0.0.0.0"` together with `share=True` makes the app
# reachable from other machines; confirm that this exposure is intended.
demo.queue().launch(server_name= "0.0.0.0", share=True)