In [40]:
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import faiss
import numpy as np
import re

## Functions for loading the book and splitting by page

In [41]:
def load_book(file_path):
    """Loads in the book into a string

    Args:
        file_path (str): filepath of the book to be loaded

    Returns:
        str: the entire book loaded in as a str object
    """
    # Load the book
    with open(file_path, 'r') as f:
        text = f.read()
        
    #Return the text
    return text

def split_into_chapters(book):
    """Split a book into chapters by dividing up str by Chapter

    Args:
        book (str): book to be split up

    Returns:
        list: chapters of the book
    """
    #Split the string up by the word "CHAPTER"
    chapters = re.split(r"CHAPTER", book)  # Split by chapters
    chapters = [chap.strip() for chap in chapters if chap.strip()]
    
    #Return the chapters
    return chapters

def split_into_pages(chapter):
    """Splits the chapter up into pages

    Args:
        chapter (str): chapter that needs to be divided into pages

    Returns:
        list: pages (str) that the chapter has been divided into
    """
    #Split the chapter up by 3 newlines
    pages = chapter.split('\n\n\n')
    new_pages = []
    
    #Iterate through the pages and only take the first line of each page
    for page in pages:
        splitted = page.split('\n')
        new_pages.append(splitted[0])
    return pages

def split_entire_book_into_pages(book):
    """Takes in an entire book and splits it into pages with is associated chapter

    Args:
        book (_type_): the text to be read in and parsed

    Returns:
        dict: dictionary with the chapter number, page number, and content of the page
    """
    #Split the book into chapters
    chapters = split_into_chapters(book)[2:]
    
    #Contents start on page 9
    pages = []
    page_num = 9
    
    #Iterate through the chapters and split them into pages
    for chapter_number, chapter in enumerate(chapters):
        chapter_pages = split_into_pages(chapter)
        
        # Create a dictionary for each page that includes chapter and page information
        for page_number, page_content in enumerate(chapter_pages):
            pages.append({
                'chapter': chapter_number + 1,  # Chapter starts from 1 (adjusted from zero)
                'page': page_num,               # Page starts from 9 (adjusted from zero)
                'content': page_content
            })
            page_num += 1

    return pages, chapters

In [42]:
def clean_data(page_data):
    """Clean up the sentences and remove any extra spaces

    Args:
        page_data (str): page of data to be cleaned

    Returns:
        list: list of strings that are sentences
    """
    
    #Split the data on consecutive newlines (by page)
    clean_content = page_data['content'].split('\n\n')
    
    #Remove any extra spaces and the chapter headings
    if len(clean_content[0]) < 5:
        clean_content = clean_content[2:]
    else:
        clean_content = clean_content[1:]

    # Remove newlines and extra spaces and then rejoin the text
    clean_content = "".join(clean_content).replace("\n", "").strip()
    clean_content = re.sub(r'\s+', ' ', clean_content)

    # Split the text into sentences
    split_text = re.split(r'([.!?])', clean_content)
    split_text = [split_text[i] + split_text[i + 1] for i in range(0, len(split_text) - 1, 2)]

    # Include chapter and page info
    return [(sentence, page_data['chapter'], page_data['page']) for sentence in split_text]

def captialize_first_letter(text):
    """Capitalize the first letter of the text

    Args:
        text (str): text to be capitalized

    Returns:
        str: text with the first letter capitalized
    """
    return text[0].upper() + text[1:]

## Generate Embeddings

In [43]:
#Load the model
model = SentenceTransformer('all-mpnet-base-v2')
qa_pipeline = pipeline("question-answering")

# Generate embeddings for each segment (chapter or page)
def generate_embeddings(texts):
    return model.encode(texts)

def create_faiss_index(embeddings):
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)
    return index

No model was supplied, defaulted to distilbert/distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [44]:
def feed_through_pipeline(context, question):
    return qa_pipeline(question=question, context=context)

def search_pages(query, page_contents, top_k=3, chapter_number=None, page_range=None):

    #Filter by either chapter number of page range or just return all of the pages
    if chapter_number:
        filtered_pages = [page for page in page_contents if int(page['chapter']) == chapter_number]
    elif page_range:
        start_page, end_page = page_range
        filtered_pages = [page for page in page_contents if start_page <= page['page'] <= end_page]
    else:
        filtered_pages = page_contents

    print(f"Filtered pages: {len(filtered_pages)}")

    # Extract content from filtered pages by cleaning and then adding to the filtered sentences to be used for search
    filtered_sentences = []
    for page in filtered_pages:
        sentences = clean_data(page)
        filtered_sentences.extend(sentences)

    # Encode query and search for the most relevant sentences
    query_embedding = model.encode([query]).astype(np.float32)
    sentence_embeddings = model.encode([s[0] for s in filtered_sentences]).astype(np.float32)
    
    # Create FAISS index for the sentences
    # faiss.normalize_L2(sentence_embeddings)
    index = faiss.IndexFlatIP(sentence_embeddings.shape[1])    
    index.add(sentence_embeddings)
    distances, indices = index.search(query_embedding, top_k)

    # Check if no relevant results are found
    if distances[0][0] == np.inf:
        print("No relevant results found!")
        return []

    # Return sentence + citation
    relevant_results = [{
                        "sentence": filtered_sentences[idx][0],
                        "chapter": filtered_sentences[idx][1],
                        "page": filtered_sentences[idx][2]
                        } for idx in indices[0]]

    return relevant_results

In [None]:
book = load_book('problems_with_philosophy.txt')
pages, chapters = split_entire_book_into_pages(book) 

def ask_me_anything(query, chapter=None, page_range=None):
    """Ask the model anything about the book Problems With Philosophy

    Args:
        query (str): question to be answered
        chapter (int, optional): chatper to search. Defaults to None, whole book is searched if none.
        page_range (tuple, optional): pages to search. Defaults to None, whole book is searched if none.
    """
    print(f"Query: {query}")
    relevant_results = search_pages(query, pages, top_k=5, chapter_number=chapter, page_range=page_range)

    if not relevant_results:
        print("No relevant pages found.")
    else:
        context = " ".join([res["sentence"] for res in relevant_results])
        answer = feed_through_pipeline(context, query)
        actual_answer = captialize_first_letter(answer['answer'])
        
        print("Answer: ", actual_answer, "\n")
        print("\nCitations:")
        for res in relevant_results:
            print(f"Chapter {res['chapter']}, Page {res['page']}: \"{res['sentence']}\"")
    return actual_answer
            

In [47]:
query = "What is the first answer to the difficulty of fallacious memory?"
chapter = 2
# page_range_input = (1, 10)

ask_me_anything(query, chapter)
pass

Query: What is the first answer to the difficulty of fallacious memory?
Filtered pages: 12
Answer:  Memory has degrees of self-evidence 


Citations:
Chapter 11, Page 183: " Thus the first answer to the difficulty of fallacious memory is to say that memory has degrees of self-evidence, and that these correspond to the degrees of its trustworthi- ness, reaching a limit of perfect self-evidence and perfect trustworthiness in our memory of events which are recent and vivid."
Chapter 11, Page 182: " The case of memory, however, raises a difficulty, for it is notoriously fallacious, and thus throws doubt on the trustworthiness of intuitive judgments in general."
Chapter 11, Page 184: " It would seem that cases of fallacious memory can probably all be dealt with in this way, i."
Chapter 11, Page 183: " It would seem, however, that there are cases of very firm belief in a memory which is wholly false."
Chapter 11, Page 184: " In this case, what was immediately remembered was his repeated asse