In [22]:
import os
import time
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader
from PyPDF2 import PdfReader
from langchain_groq import ChatGroq
from langchain.chains import RetrievalQA
from faster_whisper import WhisperModel
from gtts import gTTS
from groq import Groq
from playsound import playsound 

class RAGVoiceBot:
    """Voice assistant that answers spoken questions from a PDF knowledge base.

    The pipeline is: audio file -> faster-whisper transcription ->
    RetrievalQA chain (Groq chat model + FAISS retriever) -> gTTS speech,
    which is played back and then deleted.
    """

    def __init__(self, vector_db_path, knowldge_path, groq_token_path, whisper_size='base', model_name='llama-3.1-70b-versatile'):
        """Load credentials and build every model used by the pipeline.

        Args:
            vector_db_path: Directory of a saved FAISS index. It is loaded
                when it exists, otherwise created from ``knowldge_path``.
            knowldge_path: Path of the PDF used to build a new index.
            groq_token_path: Text file whose first line is the Groq API key.
            whisper_size: Model size/name handed to ``WhisperModel``.
            model_name: Groq chat model identifier handed to ``ChatGroq``.
        """
        # The key has to be exported before the LLM client is constructed,
        # because no key is passed to ChatGroq explicitly.
        self.load_groq_token(groq_token_path)

        self.embedding_model = HuggingFaceEmbeddings(model_name='sentence-transformers/paraphrase-MiniLM-L12-v2')
        self.whisper_model = self.initialize_STT_model(whisper_size)
        self.llm = self.initialize_llm(model_name)
        self.qa_pipeline = self.initialize_qa_pipeline(vector_db_path, knowldge_path)

    def load_groq_token(self, groq_token_path: str) -> None:
        """Export the first line of ``groq_token_path`` as ``GROQ_API_KEY``.

        Raises:
            OSError: If the token file cannot be opened.
            ValueError: If the first line of the file is empty.
        """
        with open(groq_token_path, 'r', encoding='utf-8') as token_file:
            token = token_file.readline().strip()
        if not token:
            # Fail here rather than with an opaque authentication error later.
            raise ValueError(f'No Groq API token found in {groq_token_path!r}')
        os.environ["GROQ_API_KEY"] = token

    @staticmethod
    def _default_cpu_threads() -> int:
        """Return half of the reported CPU count, but never less than one."""
        # os.cpu_count() returns None when the count cannot be determined,
        # and halving a single core would otherwise give zero threads.
        return max(1, (os.cpu_count() or 2) // 2)

    def initialize_STT_model(self, whisper_size):
        """Create the int8 CPU Whisper model used for transcription."""
        return WhisperModel(
            whisper_size,
            device="cpu",
            compute_type="int8",
            cpu_threads=self._default_cpu_threads()
        )

    def initialize_llm(self, model_name):
        """Create the Groq chat model client for ``model_name``."""
        return ChatGroq(
            model=model_name
        )

    def initialize_qa_pipeline(self, vector_db_path, knowldge_path):
        """Load or build the FAISS index and wrap it in a RetrievalQA chain.

        A newly built index is saved to ``vector_db_path`` so later runs can
        load it instead of re-embedding the PDF.
        """
        if os.path.exists(vector_db_path):
            print('Loading exisiting vector database')
            # SECURITY: this flag enables pickle-based loading. Only point
            # vector_db_path at an index this application wrote itself.
            vec_db = FAISS.load_local(vector_db_path, self.embedding_model, allow_dangerous_deserialization=True)
        else:
            print('Creating new vector database')
            vec_db = self.create_vector_database_docs(knowldge_path)
            vec_db.save_local(vector_db_path)

        # The RAG pipeline
        return RetrievalQA.from_chain_type(
            llm=self.llm,
            chain_type='stuff',
            retriever=vec_db.as_retriever(),
            return_source_documents=True
        )

    def create_vector_database_docs(self, knowldge_path):
        """Build a FAISS index from the PDF at ``knowldge_path``.

        The PDF is split into 200-character chunks with a 50-character
        overlap before being embedded.
        """
        loader = PyPDFLoader(knowldge_path)
        documents = loader.load()
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=50)
        docs = text_splitter.split_documents(documents)
        return FAISS.from_documents(docs, self.embedding_model)

    def speach_to_text(self, audio_path) -> str:
        """Transcribe ``audio_path`` and return the concatenated segment text."""
        segments, _ = self.whisper_model.transcribe(audio_path)
        return ''.join(segment.text for segment in segments)

    def generate_response(self, prompt):
        """Run ``prompt`` through the QA chain and return its ``'result'``."""
        return self.qa_pipeline.invoke(prompt)['result']

    def text_to_speech(self, text, output_path='./output_voices/speech.mp3', lang='ar'):
        """Synthesize ``text`` with gTTS, play it, then delete the audio file.

        Args:
            text: Text to speak. Empty or whitespace-only text is skipped.
            output_path: Where the temporary MP3 is written.
            lang: gTTS language code; defaults to Arabic.
        """
        if not text or not text.strip():
            # Nothing to synthesize, so skip the TTS request entirely.
            return

        print(output_path)
        # Make sure the target folder exists before gTTS writes into it.
        os.makedirs(os.path.dirname(output_path) or '.', exist_ok=True)

        start = time.time()
        tts = gTTS(text, lang=lang)
        tts.save(output_path)
        print(f'Text to sound time: {time.time() - start}')

        try:
            # Use the already-imported playsound rather than the unimported
            # pyglet. An absolute path is passed so playback does not depend
            # on the current working directory.
            playsound(os.path.abspath(output_path))
        finally:
            # Remove the clip even when playback fails.
            os.remove(output_path)

    def process_audio_file(self, audio_path) -> tuple:
        """Answer the question recorded in ``audio_path`` and speak the reply.

        Prints the time taken by the transcription and LLM stages.

        Returns:
            A ``(transcription, response)`` tuple of strings.
        """
        start = time.time()
        transcription = self.speach_to_text(audio_path)
        print(f'Sound to text time: {time.time() - start}')

        start = time.time()
        response = self.generate_response(transcription)
        print(f'Model response time: {time.time() - start}')

        self.text_to_speech(response)

        return transcription, response

# Recorded question to run through the pipeline (machine-specific path).
audio_path = r"C:\Users\zmlka\Documents\Sound Recordings\Recording (3).m4a"

# Build the bot: the FAISS index is loaded from 'vector_db' when that
# directory exists, otherwise it is created from the PDF and saved there.
voice_bot = RAGVoiceBot(
    vector_db_path='vector_db',
    knowldge_path='./knowledge_base/Mohamed Hassan.pdf',
    groq_token_path='groq_token.txt',
)

# Transcribe the recording, answer it, and speak the answer.
transcription, response = voice_bot.process_audio_file(audio_path)

# Show both sides of the exchange beneath a divider line.
print('=' * 50)
print("USER:", transcription)
print("Assistant:", response)

Loading exisiting vector database
Sound to text time: 1.308748722076416
Model response time: 0.9331905841827393
./output_voices/speech.mp3
Text to sound time: 1.5953500270843506
USER:  السلام عليكم. كيف الحالة.
Assistant: وعليكم السلام ورحمة الله وبركاته. الحالة جيدة، شكراً لطرح السؤال. كيف أستطيع مساعدتك؟
