In [118]:
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import faiss
import numpy as np
import re

## Functions for loading the book and splitting by page

In [119]:

def load_book(file_path):
    with open(file_path, 'r') as f:
        text = f.read()
    return text

def split_into_chapters(book):
    chapters = re.split(r"CHAPTER", book)  # Split by chapters
    chapters = [chap.strip() for chap in chapters if chap.strip()]
    return chapters

def split_into_pages(chapter):
    pages = chapter.split('\n\n\n')
    new_pages = []
    for page in pages:
        splitted = page.split('\n')
        new_pages.append(splitted[0])
    return pages

def split_entire_book_into_pages(book):
    chapters = split_into_chapters(book)[2:]
    pages = []
    page_num = 9
    for chapter_number, chapter in enumerate(chapters):
        chapter_pages = split_into_pages(chapter)
        
        for page_number, page_content in enumerate(chapter_pages):
            # Create a dictionary for each page that includes chapter and page information
            pages.append({
                'chapter': chapter_number + 1,  # Chapter starts from 1 (adjusted from zero)
                'page': page_num, # Page starts from 9 (adjusted from zero)
                'content': page_content
            })
            page_num += 1

    return pages, chapters
    

In [120]:
def clean_data(page_data):
    
    #Split the data on consecutive newlines (by page)
    clean_content = page_data['content'].split('\n\n')
    
    #Account for starting a new chapter
    if len(clean_content[0]) < 5:
        clean_content = clean_content[2:]
    else:
        clean_content = clean_content[1:]
        
    #Remove any newlines and extra spaces and then rejoin the text
    clean_content = "".join(clean_content).replace("\n", "").strip()  
    clean_content = re.sub(r'\s+', ' ', clean_content)
    
    # Capture sentence-ending punctuation
    split_text = re.split(r'([.!?])', clean_content)
    split_text = [split_text[i] + split_text[i+1] for i in range(0, len(split_text)-1, 2)]

    return split_text


## Generate Embeddings

In [121]:
#Load the model
model = SentenceTransformer('all-MiniLM-L6-v2')

#Load the book and split it into pages
book = load_book('smaller_problems_with_philosophy.txt')
pages, chapters = split_entire_book_into_pages(book)
page_data = pages[0]

#Clean the data and put it into a list
all_sentences = []
for page in pages:
    all_sentences += clean_data(page)

# Generate embeddings for each segment (chapter or page)
def generate_embeddings(texts):
    return model.encode(texts)

def create_faiss_index(embeddings):
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)
    return index

embeddings = model.encode(all_sentences)
embeddings = np.array(embeddings, dtype=np.float32)
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)



In [123]:
qa_pipeline = pipeline("question-answering")

def answer_question(context, question):
    return qa_pipeline(question=question, context=context)

def search_pages(query, page_contents, index, top_k=3, chapter_number=None, page_range=None):
    # Filter pages by chapter or page range if specified
    if chapter_number:
        filtered_pages = [page for page in page_contents if page['chapter'] == chapter_number]
    elif page_range:
        start_page, end_page = page_range
        filtered_pages = [page for page in page_contents if start_page <= page['page'] <= end_page]
    else:
        filtered_pages = page_contents  # No filter, return all pages
    
    # Extract content from filtered pages
    filtered_texts = [page['content'] for page in filtered_pages]

    # Encode query and search for the most relevant pages
    query_embedding = model.encode([query]).astype(np.float32)
    distances, indices = index.search(query_embedding, top_k)
    
    # Get the relevant content based on the indices returned from the search
    relevant_pages = [filtered_texts[idx] for idx in indices[0]]
    return relevant_pages

query = "What is the main theme of the chapter?"
relevant_pages = search_pages(query, pages, index, top_k=3, chapter_number=1, page_range=[19, 20])
context = " ".join(relevant_pages)
answer = answer_question(context, query)
print("Answer: ", answer['answer'])
print("Context: ", context)


No model was supplied, defaulted to distilbert/distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


IndexError: list index out of range