In [2]:
import re
import copy
import faiss
import numpy as np
from gpt4all import GPT4All
from pdfminer.high_level import extract_pages
from sentence_transformers import SentenceTransformer
from pdfminer.layout import LTTextContainer, LTChar,LTTextLine

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
data_path = 'E:/pdf folder for RAG/samsung-galaxy-z-flip-5-user-manual-SAM-F946-F731-UM-OS13-US-English.pdf'
max_len = 500
max_tokens=200

In [4]:
def retrieve_data(path):
    Extract_Data=[]
    font_sizes=[]

    for page_layout in extract_pages(path):
        for element in page_layout:
            if isinstance(element, LTTextContainer):
                for text_line in element:
                    if isinstance(text_line , LTTextLine):
                        for character in text_line:
                            if isinstance(character, LTChar):
                                Font_size=character.size
                                font_name=character.fontname
                Extract_Data.append([Font_size,font_name,(element.get_text()),len(element.get_text())])
                font_sizes.append(Font_size)

    appended_data = copy.deepcopy(Extract_Data)
    corpus_of_documents = [lis[2] for lis in appended_data]
    return corpus_of_documents

In [5]:
def merge_texts(texts, max_len):
    merged_text = ''
    for text in texts:
        if len(merged_text + text) <= max_len:
            merged_text += text
        else:
            break
    return merged_text

In [6]:
def preprocess_extracted_data(extracted_text):
    merged_texts = []
    current_text = []
    current_length = 0

    for text in extracted_text:
        if current_length + len(text) <= max_len:
            current_text.append(text)
            current_length += len(text)
        else:
            merged_texts.append(merge_texts(current_text, max_len))
            current_text = [text]
            current_length = len(text)

    merged_texts.append(merge_texts(current_text, max_len))

    filtered_list = [re.sub(r'\s+', ' ', text.replace('\n',' ').strip()) for text in merged_texts if text.strip()]
    return filtered_list

In [7]:
extracted_text = retrieve_data(data_path)
corpus_of_documents = preprocess_extracted_data(extracted_text)

In [8]:
model = SentenceTransformer('paraphrase-mpnet-base-v2')
vectors = model.encode(corpus_of_documents)
vector_dimension = vectors.shape[1]
index = faiss.IndexFlatL2(vector_dimension)
faiss.normalize_L2(vectors)
index.add(vectors)

In [9]:
def search(user_input):
    search_vector = model.encode(user_input)
    _vector = np.array([search_vector])
    faiss.normalize_L2(_vector)
    k = index.ntotal
    distances, ann = index.search(_vector, k=k)
    dis , anns = distances[0] , ann[0]
    return dis , anns 

In [10]:
user_input = "Tell me about camera"
dis , ann = search(user_input)
relevant_document = corpus_of_documents[ann[0]]
print("relevant data >>>>>>> ",relevant_document)


relevant data >>>>>>>  • About Camera: Display app and software information. • Contact us: Contact Samsung support through Samsung Members. 62 Camera and Gallery Gallery You can edit and manage pictures and videos. Go to the Gallery to access the visual media stored on your device. ◌ From Apps, tap Gallery. Devices and software are constantly evolving — illustrations are for reference only. 63 Camera and Gallery Open pictures Pictures stored on your device are accessible in the Gallery app.


In [11]:
prompt = """
[INST] 
Answer the question based only on the following relevant document in {max_tokens} words:
{relevant_document}

Question: {user_input}
[/INST]
"""

In [12]:

text = prompt.format(user_input=user_input, relevant_document=relevant_document,max_tokens=max_tokens)

print("input text to llm >> ", text)

gpt_model = GPT4All("mistral-7b-instruct-v0.1.Q4_K_M", allow_download= False)
llm_output = gpt_model.generate(prompt=text, max_tokens=max_tokens, temp=0.1)
print("Mistral Output : ",llm_output)


input text to llm >>  
[INST] 
Answer the question based only on the following relevant document in 200 words:
• About Camera: Display app and software information. • Contact us: Contact Samsung support through Samsung Members. 62 Camera and Gallery Gallery You can edit and manage pictures and videos. Go to the Gallery to access the visual media stored on your device. ◌ From Apps, tap Gallery. Devices and software are constantly evolving — illustrations are for reference only. 63 Camera and Gallery Open pictures Pictures stored on your device are accessible in the Gallery app.

Question: Tell me about camera
[/INST]

Mistral Output :  The Samsung Camera app is a tool that allows users to capture, edit, and manage photos and videos on their devices. To access the camera app, go to "Apps" on your device and tap "Camera." The app provides various features such as zoom, flash, and filters for enhancing your photos. Additionally, you can use the camera app to record videos and switch betwee