<a href="https://colab.research.google.com/github/mehrdadhz-77/SearchEngine-Summarizer/blob/main/Search_engine_summarizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Loading the necessary packages

In [3]:
!pip install openai PyMuPDF # to read pdf
!pip install nltk # for natural language processing to clean the text
!pip install faiss-cpu
!pip install faiss # to perform the similarity check using the facebook similarity search
import nltk
import fitz  # PyMuPDF
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from openai import OpenAI

# pretrained embedding model
from sentence_transformers import SentenceTransformer
import faiss, numpy as np

# term frequency per page
from sklearn.feature_extraction.text import TfidfVectorizer
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# set the api key and creating the gpt client
api_key = '<your_api>'
client = OpenAI(api_key=api_key)

[31mERROR: Could not find a version that satisfies the requirement faiss (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for faiss[0m[31m
[0m

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Loading the data

In [4]:
from google.colab import drive
drive.mount('/content/drive')

# setting the root path
root_path = '/content/drive/MyDrive/Python_Students_SuperProf/Darren/files/'

# file 1 path
file1_path = '2024 Contractor Safety Manual v2024.1.pdf'

Mounted at /content/drive


## Load content function

This function given the file path, will load the file and create a dictionary containing the content and metadata of the pages within the document

In [5]:
# this function given the file_path to the file
# will return the text/content of the file
def create_corpus(file_path):

  # extract the file name from the path
  file_name = file_path.split('/')[-1]

  # keeping two lists, one for the content and the other one
  # for the metadata regarding that content
  pages, meta_data = [], []

  # open the document
  doc = fitz.open(file_path)

  # go over each page in the document
  for page_num, page in enumerate(doc):

    # read the content of the whole page
    page_text = page.get_text()

    # add the content of this page to the list
    pages.append(page_text)

    # add the metadata for this page to the list
    meta_data.append({"document": file_name , "page_num": page_num})

  # return the list of pages contents and the metadata
  # related to each page
  return pages, meta_data


# get the pages and the meta data of the first file
file1_pages, file1_metadata = create_corpus(root_path + file1_path)

# Creating tf-idf and embeddings

At this point given the pages content we will create two presentations

1. tf-idf which is the term frequency and existence per page
2. creating embedding for each page using a pre-trained model

In [13]:
# creating the tfidf_obj
tfidf_obj = TfidfVectorizer()

# create the tfidf vectors
file1_tfidf = tfidf_obj.fit_transform(file1_pages)

# a pretrained model to create page embeddings based on the tokens
# appeared in the pages
embedding_model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1", trust_remote_code=True)

# get the embeddings from the model given the pages content
page_embeddings  = embedding_model.encode(file1_pages, convert_to_numpy=True, normalize_embeddings=True)

## FAISS indexing

creating the indexes based on the page embeddings for matching to the query

In [14]:
# creating the indexes based on the page embeddings
# for the comparison
index = faiss.IndexFlatIP(page_embeddings.shape[1])
index.add(page_embeddings)

## Search function + relevant retrieval

This function given the query and the objects and information necessary will return the pages that were relevant to the query

In [44]:
# given the query and additional information, will return the pages tha tare relevant to the query
def hybrid_search(query, pages_metadata, tfidf_mat, tfidf_obj, page_embeddings, embedding_model, faiss_index, alpha=0.5, k=10):

    # create the tfidf for the query - lexical part
    q_bow = tfidf_obj.transform([query])

    # create the embeddings for the query - semantic part
    q_emb = embedding_model.encode([query], normalize_embeddings=True).astype('float32')

    # find the top k*5 most related documents lexically
    bow_scores = (q_bow @ tfidf_mat.T).toarray().ravel()
    bow_idx    = bow_scores.argsort()[::-1][:k * 5]

    # find the top k*5 top most related documents semantically
    _, dense_idx = faiss_index.search(q_emb, k * 5)
    dense_idx = dense_idx[0]

    # taking the candidates proposed by these 2 approaches
    candidates = np.unique(np.concatenate([dense_idx, bow_idx]))

    # compute the final score for the proposed candidates, a weightening approach
    # to prefer semantic or lexical retrieval
    dense_scores = (page_embeddings[candidates] @ q_emb.T).ravel()     # cosine/IP
    scores = alpha * bow_scores[candidates] + (1 - alpha) * dense_scores

    # find the top k most relevant pages to the query
    order = scores.argsort()[::-1][:k]
    top_ids = candidates[order] # get their page number
    top_scores = scores[order] # get their scores

    # return the socre and the pages that are relevant to the query
    results = [pages_metadata[i] | {"score": float(s)} for i, s in zip(top_ids, top_scores)]
    return results


Passing the necessary information, retrieving the most relevant pages to the query

In [59]:
# set the query of the user
user_query = "prohibited materials"

# get the most relevant queries
relevant_pages = hybrid_search(query= user_query, pages_metadata=file1_metadata,
                        tfidf_mat=file1_tfidf,tfidf_obj=tfidf_obj,
                        page_embeddings=page_embeddings,faiss_index=index,
                        embedding_model=embedding_model)
relevant_pages

[{'document': '2024 Contractor Safety Manual v2024.1.pdf',
  'page_num': 45,
  'score': 0.47815319456145944},
 {'document': '2024 Contractor Safety Manual v2024.1.pdf',
  'page_num': 51,
  'score': 0.3584377305109834},
 {'document': '2024 Contractor Safety Manual v2024.1.pdf',
  'page_num': 2,
  'score': 0.3550077685604948},
 {'document': '2024 Contractor Safety Manual v2024.1.pdf',
  'page_num': 39,
  'score': 0.3529007066313828},
 {'document': '2024 Contractor Safety Manual v2024.1.pdf',
  'page_num': 28,
  'score': 0.3473269376948872},
 {'document': '2024 Contractor Safety Manual v2024.1.pdf',
  'page_num': 27,
  'score': 0.33917852127782},
 {'document': '2024 Contractor Safety Manual v2024.1.pdf',
  'page_num': 17,
  'score': 0.33883731327023703},
 {'document': '2024 Contractor Safety Manual v2024.1.pdf',
  'page_num': 22,
  'score': 0.33492576352679143},
 {'document': '2024 Contractor Safety Manual v2024.1.pdf',
  'page_num': 35,
  'score': 0.33136773066964464},
 {'document': '202

# ChatGPT api

These functions given the text or the file, will get the summary of the text

In [60]:
# this functin will send the text to the chatgpt and get the summary of the document
# in a structured manner
def summarize_text(text):
    response = client.chat.completions.create(
        model="gpt-4.1-nano",
        messages=[
            {"role": "system", "content": "You are a document summarization assistant."},
            {"role": "user", "content": f"""Write me the summary of the following document and point out the key points.

              Return the output in the following structured JSON format:
                {{"summary": "<brief summary>",
                  "key_points": ["<point 1>", "<point 2>", "..."],
                  "section_titles": ["<optional section titles if available>"]}}

              Here is the document:
              {text}
             """}
        ],
        temperature=0.5,
    )
    return response.choices[0].message.content


# this function given the two text of documents will compare
# the content of the two documents
def compare_documents(text1, text2, file_name_1, file_name_2):
    response = client.chat.completions.create(
        model="gpt-4.1-nano",
        messages=[
            {"role": "system", "content": "You are a document comparison assistant."},
            {"role": "user", "content": f"""Compare the following two documents and summarize their similarities and differences. Use the full name of the file name when you want to return to each of the documents in ''. Return the result in structured JSON format with the following fields:
                                         {{ "similarities": ["<point 1>", "<point 2>", "..."],
                                            "differences": ["<point 1>", "<point 2>", "..."],
                                            "overlapping_themes": ["<theme 1>", "..."]}}

                                            --- filename: {file_name_1} ---
                                            content: {text1}

                                            --- filename: {file_name_2} ---
                                            content: {text2}"""}
        ],
        temperature=0.5,
    )
    return response.choices[0].message.content


# this function given a text will remove the stopwords from the text in order to
# clean the text and reduce the size of it
def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)
    #filtered_text = [word for word in words if word.lower() not in stop_words and word.isalpha()]
    filtered_text = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_text)


# Summary of the relevant pages

In [61]:
# this function given the list of pages content
# and the most relevant contents details will return
# the text of all of those pages
def relevant_pages_content(pages_content, relevant_pages):

  # put together the text of the most relevant queris
  relevant_pages_text = """ """

  # go over each page and put together the content of the page
  for relevant_page in relevant_pages:

    # get the page number
    page_num = relevant_page['page_num']

    # get the text of that page
    page_text = pages_content[page_num]

    # add the text to the previous content of the text
    relevant_pages_text += page_text + '\n'

    return remove_stopwords(relevant_pages_text)


# get the text of all of the relevant pages
relevant_text = relevant_pages_content(file1_pages, relevant_pages)


# get the summary of the relevant text
print(summarize_text(relevant_text))

{
  "summary": "The Contractor Safety Manual outlines safety and conduct policies for contractors working with SoCalGas. It prohibits animals, dangerous reptiles, and the use of certain equipment in facilities. The manual bans illegal drugs, firearms, explosives, and alcohol on company property and reserves the right to inspect for prohibited materials. Photography and recording devices are allowed with prior approval.",
  "key_points": [
    "Prohibition of animals, dangerous reptiles, and certain equipment in facilities.",
    "Strict ban on illegal drugs, firearms, explosives, and alcohol on SoCalGas property.",
    "SoCalGas reserves the right to inspect brought-in materials for prohibited items.",
    "Photography and audio/video recording devices require prior approval."
  ],
  "section_titles": ["Prohibited Materials", "Photography/Camera Devices"]
}
