In [21]:
# python 3.10
! pip install faiss-cpu==1.8.0 gpt4all==2.2.1.post1 pdfminer-six==20231228
! pip install -U sentence-transformers==2.5.1



In [22]:
# Print the installed packages and their versions (output recorded below).
! pip freeze

absl-py==2.1.0
asttokens==2.4.1
astunparse==1.6.3
cachetools==5.3.2
certifi==2024.2.2
cffi==1.16.0
charset-normalizer==3.3.2
colorama==0.4.6
comm==0.2.1
contourpy==1.2.0
cryptography==42.0.5
cycler==0.12.1
debugpy==1.8.1
decorator==5.1.1
exceptiongroup==1.2.0
executing==2.0.1
faiss-cpu==1.8.0
filelock==3.13.1
flatbuffers==23.5.26
fonttools==4.48.1
fsspec==2024.2.0
gast==0.5.4
google-auth==2.27.0
google-auth-oauthlib==1.2.0
google-pasta==0.2.0
gpt4all==2.2.1.post1
grpcio==1.60.1
h5py==3.10.0
huggingface-hub==0.21.4
idna==3.6
install==1.3.5
ipykernel==6.29.3
ipython==8.22.2
jedi==0.19.1
Jinja2==3.1.3
joblib==1.3.2
jupyter_client==8.6.0
jupyter_core==5.7.1
keras==2.15.0
kiwisolver==1.4.5
libclang==16.0.6
Markdown==3.5.2
MarkupSafe==2.1.5
matplotlib==3.8.2
matplotlib-inline==0.1.6
ml-dtypes==0.2.0
mpmath==1.3.0
nest-asyncio==1.6.0
networkx==3.2.1
numpy==1.26.4
oauthlib==3.2.2
opencv-python==4.9.0.80
opt-einsum==3.3.0
packaging==23.2
pandas==2.2.0
parso==0.8.3
pdfminer==20191125
pdfminer.si

In [25]:
import re
import copy
import faiss
import numpy as np
from gpt4all import GPT4All
from pdfminer.high_level import extract_pages
from sentence_transformers import SentenceTransformer
from pdfminer.layout import LTTextContainer, LTChar,LTTextLine

In [26]:
# PDF that retrieve_data() is run on.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
data_path = "C:/Users/Admin/Downloads/ATT-F700U-EN-UM-TN-TAH-020420-FINAL-WEB.pdf"
# Upper bound, in characters (compared against len() of the text), for one merged chunk.
max_len = 500
# Passed to GPT4All.generate() as max_tokens and also substituted into the prompt text.
max_tokens=200

In [27]:
def retrieve_data(path):
    """Collect the text of every text container in a PDF.

    Args:
        path: Location of the PDF; handed unchanged to pdfminer's
            ``extract_pages``.

    Returns:
        list[str]: One entry per ``LTTextContainer`` element, in the order the
        pages and their elements are yielded, each holding that element's
        ``get_text()`` result.
    """
    # Font size/name are deliberately not collected: nothing in the returned
    # value uses them, and reading them character by character fails with
    # UnboundLocalError for a container that holds no LTChar (or silently
    # reuses the previous container's values).
    return [
        element.get_text()
        for page_layout in extract_pages(path)
        for element in page_layout
        if isinstance(element, LTTextContainer)
    ]

In [28]:
def merge_texts(texts, max_len):
    """Concatenate leading strings of ``texts`` without exceeding ``max_len``.

    Strings are taken in order; as soon as adding the next one would push the
    total length past ``max_len``, that string and everything after it are
    left out.

    Args:
        texts: Iterable of strings to concatenate.
        max_len: Maximum length, in characters, of the result.

    Returns:
        str: The concatenation of the accepted prefix (possibly empty).
    """
    accepted = []
    used = 0
    for piece in texts:
        if used + len(piece) > max_len:
            break
        accepted.append(piece)
        used += len(piece)
    return ''.join(accepted)

In [29]:
def preprocess_extracted_data(extracted_text):
    """Group extracted text blocks into chunks of at most ``max_len`` characters.

    Consecutive blocks are concatenated while their combined length stays
    within the module-level ``max_len``. A single block longer than
    ``max_len`` is cut into ``max_len``-sized slices rather than being
    discarded. Each chunk is then stripped, every run of whitespace
    (newlines included) is collapsed to one space, and blank chunks are
    omitted. Lengths are measured before the whitespace clean-up.

    Args:
        extracted_text: Iterable of strings, e.g. the list returned by
            ``retrieve_data``.

    Returns:
        list[str]: The cleaned, non-empty chunks in input order.

    Raises:
        ValueError: If ``max_len`` is smaller than 1.
    """
    limit = max_len
    if limit < 1:
        raise ValueError("max_len must be at least 1")

    chunks = []
    pending = []
    pending_length = 0

    for text in extracted_text:
        if pending_length + len(text) <= limit:
            pending.append(text)
            pending_length += len(text)
            continue

        # The pending group is full: emit it and start a new one with `text`.
        chunks.append(''.join(pending))

        if len(text) > limit:
            # An oversized block can never fit a chunk as a whole. Emit all
            # full-sized slices now and keep the remainder pending so it can
            # still be merged with the blocks that follow.
            slices = [text[start:start + limit] for start in range(0, len(text), limit)]
            chunks.extend(slices[:-1])
            text = slices[-1]

        pending = [text]
        pending_length = len(text)

    chunks.append(''.join(pending))

    # \s+ also matches newlines, so one substitution both removes line breaks
    # and collapses repeated spaces.
    return [re.sub(r'\s+', ' ', chunk.strip()) for chunk in chunks if chunk.strip()]

In [38]:
# Read the raw text blocks from the PDF at data_path and print all of them.
extracted_text = retrieve_data(data_path)
print(extracted_text)


# Merge and clean the raw blocks; the result is the corpus that gets embedded and searched.
corpus_of_documents = preprocess_extracted_data(extracted_text)



In [39]:
# Display the prepared chunks.
corpus_of_documents

['User manual Contents Features Mobile continuity | Wireless PowerShare | Bixby | Biometric security | Dark mode Get started Galaxy Z Flip: Folded | Unfolded Set up your device: Charge the battery | Wireless PowerShare',
 'Start using your device: Turn on your device | Use the Setup Wizard | Transfer data from an old device | Lock or unlock your device | Cover screen | Side key settings | Accounts | Set up voicemail | Navigation | Navigation bar | Customize your home screen | Samsung Daily | Bixby | Digital wellbeing and parental controls | Always On Display | Biometric security | Mobile continuity | Multi window | Enter text | Emergency mode',
 'Customize your home screen: App icons | Create and use folders | Wallpaper | Themes | Icons | Widgets | Home screen settings | Easy mode | Status bar | Notification panel Camera and Gallery Camera: Navigate the camera screen | Configure shooting mode | AR Zone | Live focus | Scene optimizer | Record videos | Live focus video | Zoom-in mic | Su

In [31]:
# Embed every chunk of the corpus with the sentence-transformers model.
model = SentenceTransformer('paraphrase-mpnet-base-v2')
vectors = model.encode(corpus_of_documents)
# The second axis of the embedding matrix is used as the index dimensionality.
vector_dimension = vectors.shape[1]
index = faiss.IndexFlatL2(vector_dimension)
# normalize_L2 is called for its in-place effect (the return value is not used).
# With unit-length vectors, ranking by L2 distance gives the same order as
# ranking by cosine similarity; search() normalises the query the same way.
faiss.normalize_L2(vectors)
index.add(vectors)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [32]:
def search(user_input):
    """Rank every indexed chunk against a query.

    The query is embedded with the module-level ``model``, wrapped into a
    one-row matrix, L2-normalised in place and searched against the
    module-level ``index`` with ``k`` equal to the number of stored vectors,
    so a result is returned for every vector in the index.

    Args:
        user_input: Query passed to ``model.encode``.

    Returns:
        tuple: ``(distances, ids)`` — the first row of each array returned by
        ``index.search``; ``ids`` are positions in the indexed corpus.
    """
    query_matrix = np.array([model.encode(user_input)])
    faiss.normalize_L2(query_matrix)
    all_distances, all_ids = index.search(query_matrix, k=index.ntotal)
    return all_distances[0], all_ids[0]

In [33]:
# Example query: rank the chunks, then use the chunk whose id comes first in the result.
user_input = "Tell me about camera"
dis , ann = search(user_input)
relevant_document = corpus_of_documents[ann[0]]
print("relevant data >>>>>>> ",relevant_document)


relevant data >>>>>>>  You can capture high-quality pictures and videos using the Camera app. Images and videos are stored in the Gallery, where you can view and edit them. Camera Navigate the camera screen | Configure shooting mode | AR Zone | Live focus | Scene optimizer | Record videos | Live focus video | Zoom-in mic | Super Slow-mo | Super steady | Camera settings Gallery


In [34]:
# Prompt template filled in later with str.format(); placeholders are
# max_tokens, relevant_document and user_input.
# NOTE(review): the text asks for an answer "in {max_tokens} words", while the
# same max_tokens value is passed to generate() as a token limit — words and
# tokens are different units, so confirm that this is intended.
prompt = """
[INST] 
Answer the question based only on the following relevant document in {max_tokens} words:
{relevant_document}

Question: {user_input}
[/INST]
"""

In [37]:

# Fill the template with the query, the retrieved chunk and the length limit.
text = prompt.format(user_input=user_input, relevant_document=relevant_document,max_tokens=max_tokens)

print("input text to llm >> ", text)

# Load the model from a local GGUF file.
# NOTE(review): absolute, machine-specific path; allow_download= False presumably
# stops gpt4all from fetching the model when the file is missing — confirm.
gpt_model = GPT4All("C:/Users/Admin/Downloads/mistral-7b-instruct-v0.1.Q4_K_M.gguf", allow_download= False)
# Generation is capped at max_tokens and run with temp=0.1.
llm_output = gpt_model.generate(prompt=text, max_tokens=max_tokens, temp=0.1)
print("Mistral Output : ",llm_output)


input text to llm >>  
[INST] 
Answer the question based only on the following relevant document in 200 words:
You can capture high-quality pictures and videos using the Camera app. Images and videos are stored in the Gallery, where you can view and edit them. Camera Navigate the camera screen | Configure shooting mode | AR Zone | Live focus | Scene optimizer | Record videos | Live focus video | Zoom-in mic | Super Slow-mo | Super steady | Camera settings Gallery

Question: Tell me about camera
[/INST]

Mistral Output :  The document describes a camera app that allows users to capture high-quality pictures and videos. The images and videos are stored in the gallery, where they can be viewed and edited. The camera has several features such as shooting mode configuration, AR zone, live focus, scene optimizer, video recording, live focus video recording, zoom-in mic, super slow-mo, and super steady. These features allow users to customize their photography and videography experience accor

: 

: 