In [1]:
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader


def generate_embeddings():
    embedding_model = HuggingFaceEmbeddings(model_name='sentence-transformers/paraphrase-MiniLM-L12-v2')
    return embedding_model

from PyPDF2 import PdfReader

def extract_text_from_pdf(pdf_file):
    reader = PdfReader(pdf_file)
    raw_text = ""
    for i, page in enumerate(reader.pages):
        text = page.extract_text()
        if text:
            raw_text += text
    return raw_text
    
def create_vector_database_texts(raw_text):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1024, chunk_overlap=100
    )

    texts = text_splitter.split_text(raw_text)
    vec_db = FAISS.from_texts(texts, generate_embeddings())
    return vec_db

def create_vector_database_docs(url):
    loader = PyPDFLoader(url)
    documents = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=50)
    docs = text_splitter.split_documents(documents)

    vec_db = FAISS.from_documents(docs, generate_embeddings())
    return vec_db

In [1]:
# !pip install -r requirements.txt

In [3]:
db = create_vector_database_docs('Mohamed Hassan.pdf')

  embedding_model = HuggingFaceEmbeddings(model_name='sentence-transformers/paraphrase-MiniLM-L12-v2')
  from tqdm.autonotebook import tqdm, trange
  attn_output = torch.nn.functional.scaled_dot_product_attention(


In [4]:
import os

with open('groq_token.txt', 'r') as f:
    groq_token = f.readline()
    
os.environ["GROQ_API_KEY"] = groq_token

In [5]:
from langchain_groq import ChatGroq

llm = ChatGroq(
    model="llama-3.1-70b-versatile",
)


from langchain.chains import RetrievalQA

qa_pipeline = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type='stuff',
    retriever=db.as_retriever(),
    return_source_documents=True
)

In [40]:
response = qa_pipeline.invoke("اخبرني عن مناطق جميلة بجازان السعودية")
resutl = response['result']
# print(response)
print('='*50)
print(resutl)

جازان هي منطقة ساحلية في جنوب غرب المملكة العربية السعودية، تشتهر بالمناظر الطبيعية الخلابة والجزر الجميلة. بعض من أجمل المناطق السياحية في جازان تشمل:

1. جزر فرسان: مجموعة من الجزر الجميلة التي تقع قبالة الساحل الغربي لجازان. تشتهر الجزر بالشواطئ الرملية البيضاء والمياه الصافية والغوص.
2. وادي لجب: وادي جميل يقع في vùng جازان، يشتهر بالطبيعة الخلابة والمناظر الجبلية.
3. خوربارك: شاطئ رملية جميل يقع في مدينة جازان، يعرف بشواطئه الرملية البيضاء والمياه الصافية.
4. منتزه جبل دودا: منتزه جميل يقع على قمة جبل دودا، يعرف بالمناظر الخلابة والهواء النقي.
5. سوق جازان القديم: سوق تقليدي يقع في مدينة جازان، يعرف بالسلع التقليدية والمأكولات المحلية.

هذه بعض من المناطق الجميلة في جازان، هناك الكثير من الأماكن الأخرى التي يمكنك زيارتها أيضاً.


In [7]:
import os
from faster_whisper import WhisperModel
from gtts import gTTS
from groq import Groq
import time

# Set up the model configuration
whisper_size = 'base'  # Model size; options include 'tiny', 'small', 'medium', etc.
num_cores = os.cpu_count()  # Use half of available CPU cores if needed

# Initialize the Whisper model
whisper_model = WhisperModel(
    whisper_size,
    device="cpu",        # Use "cuda" for GPU, "cpu" for CPU
    compute_type="int8",   # Change to "float16" or "int8" as per your GPU capability
    cpu_threads=num_cores // 2  # Adjust based on your requirements
)

def groq_prompt(prompt, model='llama-3.1-70b-versatile'):
    grok_client = Groq(
        api_key='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'
    )
    chat_completion = grok_client.chat.completions.create(
        messages=[
            {
                'role': 'user',
                'content': prompt
            }
        ],
        model=model
    )
    response = chat_completion.choices[0].message.content
    return response
    
def wav_to_text(audio_path):
    segments, _ = whisper_model.transcribe(audio_path, language='ar')
    text = ''.join(segment.text for segment in segments)
    return text

# Example: Transcribe an audio file
audio_path = r"C:\Users\zmlka\Documents\Sound Recordings\Recording (3).m4a"  # Replace with the path to your audio file
start = time.time()
transcription = wav_to_text(audio_path)
print(f'Sound to text time: {time.time() - start}')

print("USER:", transcription)
start = time.time()
# response = groq_prompt(transcription)
response = qa_pipeline.invoke(transcription)['result']
print(f'Model response time: {time.time() - start}')
print('Assistant: ', response)

start = time.time()
tts = gTTS(response, lang='ar')
print(f'text to sound time: {time.time() - start}')
tts.save('speech.mp3')

Sound to text time: 1.2131483554840088
USER:  السلام عليكم كيف الحالة
Model response time: 1.6860651969909668
Assistant:  وعليكم السلام ورحمة الله وبركاته. الحالة جيدة، شكراً. كيف يمكنني مساعدتك؟
text to sound time: 0.0


In [1]:
import os
import time
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader
from PyPDF2 import PdfReader
from langchain_groq import ChatGroq
from langchain.chains import RetrievalQA
from faster_whisper import WhisperModel
from gtts import gTTS
from groq import Groq
from playsound import playsound 

class RAGVoiceBot:
    def __init__(self, vector_db_path, knowldge_path, groq_token_path, whisper_size='base', model_name='llama-3.1-70b-versatile'):

        self.load_groq_token(groq_token_path)
        
        self.embedding_model = HuggingFaceEmbeddings(model_name='sentence-transformers/paraphrase-MiniLM-L12-v2')
        self.whisper_model = self.initialize_STT_model(whisper_size)
        self.llm = self.initialize_llm(model_name)
        self.qa_pipeline = self.initialize_qa_pipeline(vector_db_path, knowldge_path)

    
    def load_groq_token(self, groq_token_path):
        with open(groq_token_path, 'r') as f:
            os.environ["GROQ_API_KEY"] = f.readline().strip()

    def initialize_STT_model(self, whisper_size):
        num_cores = os.cpu_count() // 2
        return WhisperModel(
            whisper_size,
            device="cpu",
            compute_type="int8",
            cpu_threads=num_cores
        )

    def initialize_llm(self, model_name):
        return ChatGroq(
            model=model_name
        )

    def initialize_qa_pipeline(self, vector_db_path, knowldge_path):
        if os.path.exists(vector_db_path):
            print('Loading exisiting vector database')
            vec_db = FAISS.load_local(vector_db_path, self.embedding_model, allow_dangerous_deserialization=True)
        else:
            print('Creating new vector database')
            vec_db = self.create_vector_database_docs(knowldge_path)
            vec_db.save_local(vector_db_path)


        # The RAG pipeline
        return RetrievalQA.from_chain_type(
            llm=self.llm,
            chain_type='stuff',
            retriever=vec_db.as_retriever(),
            return_source_documents=True
        )

    def create_vector_database_docs(self, knowldge_path):
        loader = PyPDFLoader(knowldge_path)
        documents = loader.load()
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=50)
        docs = text_splitter.split_documents(documents)
        return FAISS.from_documents(docs, self.embedding_model)
        

    def speach_to_text(self, audio_path):
        segments, _ = self.whisper_model.transcribe(audio_path)
        return ''.join(segment.text for segment in segments)

    def generate_response(self, prompt):
        return self.qa_pipeline.invoke(prompt)['result']

    def text_to_speech(self, text, output_path='speech.mp3'):
        start = time.time()
        tts = gTTS(text, lang='ar')
        tts.save(output_path)
        print(f'Text to sound time: {time.time() - start}')
        playsound('speech.mp3')
        os.remove('speech.mp3')
        
    def process_audio_file(self, audio_path):
        start = time.time()
        transcription = self.speach_to_text(audio_path)
        print(f'Sound to text time: {time.time() - start}')

        start = time.time()
        response = self.generate_response(transcription)
        print(f'Model response time: {time.time() - start}')
        
        
        self.text_to_speech(response)

        return transcription, response



audio_path = r"C:\Users\zmlka\Documents\Sound Recordings\Recording (3).m4a"
voice_bot = RAGVoiceBot(
    knowldge_path='Mohamed Hassan.pdf',
    groq_token_path='groq_token.txt',
    vector_db_path='vector_db'
)
transcription, response = voice_bot.process_audio_file(audio_path)
print("USER:", transcription)
print("Assistant:", response)

  self.embedding_model = HuggingFaceEmbeddings(model_name='sentence-transformers/paraphrase-MiniLM-L12-v2')


Loading exisiting vector database
Sound to text time: 1.276045560836792


  attn_output = torch.nn.functional.scaled_dot_product_attention(


Model response time: 3.2341606616973877



    Error 259 for command:
        play speech.mp3 wait
    The driver cannot recognize the specified command parameter.


Text to sound time: 1.4821882247924805


PlaysoundException: 
    Error 259 for command:
        play speech.mp3 wait
    The driver cannot recognize the specified command parameter.