# GPT with open source models

In [1]:
import os
import glob
import logging
from PyPDF2 import PdfReader
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.chains.question_answering import load_qa_chain
from tqdm import tqdm
from langchain.document_loaders import PyPDFLoader
from langchain.schema import Document
from langchain.chat_models import ChatOpenAI
from dotenv import load_dotenv
from tqdm import tqdm
from langchain_core.documents import Document
from fpdf import FPDF

In [2]:
def initialize_embeddings(model_name):
    """Build the HuggingFace embedding model identified by ``model_name``.

    Args:
        model_name: Model identifier, forwarded unchanged to
            ``HuggingFaceEmbeddings``.

    Returns:
        A newly constructed ``HuggingFaceEmbeddings`` instance.
    """
    embedder = HuggingFaceEmbeddings(model_name=model_name)
    return embedder


In [3]:


def create_embeddings(embeddings_dir, model_name, pdf_path='crime-and-punishment.pdf'):
    """Create and save a FAISS database from a single PDF file (book).

    Every stored chunk carries a 1-based ``page`` entry in its metadata so
    that answers can cite where in the book the text came from.

    Args:
        embeddings_dir: Directory the FAISS database is written to.
        model_name: Embedding model name passed to ``initialize_embeddings``.
        pdf_path: Path of the PDF to index. Defaults to the book this
            notebook was written for.

    Returns:
        The newly created FAISS vector store.

    Raises:
        ValueError: If no text could be extracted from ``pdf_path``.
    """
    hf_embed = initialize_embeddings(model_name)

    # Load the single PDF file
    loader = PyPDFLoader(pdf_path)
    pages = loader.load_and_split()
    if not pages:
        raise ValueError(f"No text could be extracted from '{pdf_path}'.")

    documents = []

    for idx, page in enumerate(tqdm(pages, desc="Processing pages", ncols=100)):
        # load_and_split() may cut one PDF page into several chunks, so a
        # running counter drifts away from the real page. Use the 0-based
        # page index the loader recorded, and fall back to the chunk position
        # only when it is missing.
        source_page = (page.metadata or {}).get("page", idx)
        metadata = {
            "page": int(source_page) + 1
        }
        documents.append(Document(page_content=page.page_content, metadata=metadata))

    # Create the FAISS database from the documents and embeddings
    print("Creating FAISS database from pages...")
    db = FAISS.from_documents(documents, embedding=hf_embed)

    # Save the FAISS database locally
    db.save_local(embeddings_dir)
    print("Creating embeddings with metadata completed.")
    return db




In [4]:
def load_embeddings(embeddings_dir, model_name):
    """Load the FAISS database previously saved in ``embeddings_dir``.

    The store is loaded with ``allow_dangerous_deserialization=True``, so
    only point this at directories you created yourself.

    Args:
        embeddings_dir: Directory containing the saved FAISS database.
        model_name: Embedding model name passed to ``initialize_embeddings``.

    Returns:
        The loaded FAISS vector store.

    Raises:
        Exception: Any error raised while loading is printed and re-raised.
    """
    embedder = initialize_embeddings(model_name)
    try:
        store = FAISS.load_local(
            embeddings_dir,
            embedder,
            allow_dangerous_deserialization=True,
        )
        print("Loaded the embeddings with metadata.")
    except Exception as err:
        print(f"Failed to load embeddings: {err}")
        raise
    return store


In [5]:
def initialize_vector_store(embeddings_dir, model_name):
    """Load the FAISS database from ``embeddings_dir``, creating it if needed.

    Args:
        embeddings_dir: Directory the FAISS database lives in.
        model_name: Embedding model name used for both creating and loading.

    Returns:
        The FAISS vector store.
    """
    # Look for the saved index file rather than just the directory: an empty
    # or half-written directory would otherwise be treated as a valid store
    # and make the load fail instead of triggering a rebuild.
    index_file = os.path.join(embeddings_dir, "index.faiss")
    if os.path.isfile(index_file):
        print("Loading existing embeddings...")
        return load_embeddings(embeddings_dir, model_name)

    print("Creating new embeddings from PDF files...")
    return create_embeddings(embeddings_dir, model_name)


In [6]:
def initialize_qa_chain(llm, prompt_template):
    """Create a "stuff"-type question-answering chain.

    Args:
        llm: Language model the chain should call.
        prompt_template: Prompt template the chain should fill in.

    Returns:
        The chain object returned by ``load_qa_chain``.
    """
    qa_chain = load_qa_chain(
        prompt=prompt_template,
        chain_type="stuff",
        llm=llm,
    )
    return qa_chain


In [7]:
%%time
# Directory (under the current working directory) that the FAISS database is
# saved to and loaded from.
embeddings_dir = os.path.join(os.getcwd(), "Crime_and_Punishment")
# Embedding model name handed to initialize_vector_store(); alternative model
# ids are left commented out below.
model_name = "sentence-transformers/all-mpnet-base-v2"
# model_name = "BAAI/bge-small-en"
# model_name = "thenlper/gte-base"

# Initialize vector store (create it on the first run, load it afterwards).
# `db` is the module-level store used by the summarization cell further down.
print("Initializing vector store...")
db = initialize_vector_store(embeddings_dir, model_name)


Initializing vector store...
Loading existing embeddings...


  return HuggingFaceEmbeddings(model_name=model_name)
  from .autonotebook import tqdm as notebook_tqdm


Loaded the embeddings with metadata.
CPU times: total: 9.41 s
Wall time: 23.2 s


In [8]:
# db.similarity_search('who is Fyodor Dostoevsky')

In [8]:

# Instruction block placed at the start of every QA prompt. It is concatenated
# with the "Context"/"Question" placeholders when the PromptTemplate is built,
# so it should not contain literal curly braces (NOTE(review): assumes the
# default f-string template format — confirm if the format is changed).
instruction_text = (""" You are an expert literature guide helping students understand the novel Crime and Punishment by Fyodor Dostoevsky. Your task is to summarize the provided text in a clear, concise, and engaging manner, keeping in mind the following points:
Main Ideas: Clearly outline the key events, characters, and philosophical themes in the text. Ensure the summary highlights critical aspects like internal conflicts, moral dilemmas, and character motivations.
Tone and Style: Maintain the depth and seriousness of Dostoevsky’s narrative while making it accessible for students. Use simple but precise language to explain complex ideas.
Key Themes: Highlight important themes, such as morality, justice, redemption, suffering, and the social issues of poverty and isolation. Relate these themes to the actions and choices of the characters.
Relevance to Students: Provide insights into how the characters' struggles and themes connect to broader human experiences, encouraging reflection and engagement.
Symbolism and Analysis: If applicable, briefly explain significant symbols or imagery (e.g., dreams, the cityscape) and their contribution to the novel’s message.
Your summary length should be 300 words easy to read, and help students grasp both the plot and the deeper meaning of the text. Avoid unnecessary details or overly complex language. Instead, focus on creating a summary that sparks curiosity and encourages further exploration of the novel"""
            )

In [9]:
# Prompt for the QA chain: the fixed instructions followed by the retrieved
# context and the question, both filled in when the chain runs.
qa_prompt_template = PromptTemplate(
    template=instruction_text + "\n\nContext: {context}\n\nQuestion: {question}",
    input_variables=["context", "question"],
)

In [10]:
qa_prompt_template

PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template=" You are an expert literature guide helping students understand the novel Crime and Punishment by Fyodor Dostoevsky. Your task is to summarize the provided text in a clear, concise, and engaging manner, keeping in mind the following points:\nMain Ideas: Clearly outline the key events, characters, and philosophical themes in the text. Ensure the summary highlights critical aspects like internal conflicts, moral dilemmas, and character motivations.\nTone and Style: Maintain the depth and seriousness of Dostoevsky’s narrative while making it accessible for students. Use simple but precise language to explain complex ideas.\nKey Themes: Highlight important themes, such as morality, justice, redemption, suffering, and the social issues of poverty and isolation. Relate these themes to the actions and choices of the characters.\nRelevance to Students: Provide insights into how the character

In [11]:
# Read the OpenAI key from the environment (optionally populated from a local
# .env file by load_dotenv()).
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
# Do not blank os.environ["OPENAI_API_KEY"] here: load_dotenv() does not
# override variables that already exist, so re-running this cell would then
# read back an empty key.
if not api_key:
    # Fail here with an actionable message rather than on the first API call.
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it or add it to a .env file."
    )

# Instantiate the model
llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0.7,
    max_tokens=None,
    timeout=None,
    api_key=api_key,
)


  llm = ChatOpenAI(


In [12]:
# Build the module-level QA chain from the chat model and the prompt template
# defined in the earlier cells; the summarization cell reads it as `chain`.
chain = initialize_qa_chain(llm, qa_prompt_template)


stuff: https://python.langchain.com/docs/versions/migrating_chains/stuff_docs_chain
map_reduce: https://python.langchain.com/docs/versions/migrating_chains/map_reduce_chain
refine: https://python.langchain.com/docs/versions/migrating_chains/refine_chain
map_rerank: https://python.langchain.com/docs/versions/migrating_chains/map_rerank_docs_chain

See also guides on retrieval and question-answering here: https://python.langchain.com/docs/how_to/#qa-with-rag
  return load_qa_chain(llm=llm, chain_type="stuff", prompt=prompt_template)


In [16]:
import os
from fpdf import FPDF
import logging

# Constants
MAX_PAGES = 20  # Upper bound on the number of pages in the generated PDF
WORDS_PER_PAGE = 300  # Rough words-per-page estimate; MAX_PAGES * WORDS_PER_PAGE is the total word budget
FONT_PATH = "./arial-unicode-ms.ttf"  # TrueType font file, resolved relative to the working directory

# Configure logging: INFO and above, each record prefixed with timestamp and level
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


def ask_question(db, chain, page_range, num_references=1):
    """Summarize the content of a page range with the QA chain.

    Retrieval is restricted to chunks whose ``page`` metadata lies inside
    ``page_range``; without that filter the vector search returns whatever
    chunk is closest to the question text, regardless of the pages asked for.

    Args:
        db: FAISS vector store whose documents carry a ``page`` metadata entry.
        chain: QA chain, called with ``input_documents`` and ``question``.
        page_range: ``(first_page, last_page)``, both inclusive.
        num_references: Maximum number of chunks handed to the chain.

    Returns:
        The chain's summary text, or a "No summary available ..." message
        when nothing was retrieved or the chain produced no usable answer.
    """
    first_page, last_page = page_range[0], page_range[1]
    question = f"Summarize the content of pages {first_page} to {last_page} of the book."
    no_summary = f"No summary available for pages {first_page} to {last_page}."

    # A list value in the filter means "metadata value is one of these".
    page_filter = {"page": list(range(first_page, last_page + 1))}
    # The filter is applied after the nearest-neighbour lookup, so fetch
    # every vector; otherwise in-range chunks outside the default candidate
    # window would be dropped before filtering.
    index_size = getattr(getattr(db, "index", None), "ntotal", 0)
    fetch_k = max(index_size, num_references, 20)

    docs_db = db.similarity_search(
        question,
        k=num_references,
        filter=page_filter,
        fetch_k=fetch_k,
    )
    if not docs_db:
        logging.warning("No similar documents found for pages %s.", page_range)
        return no_summary

    # The "stuff" chain builds the {context} prompt variable from
    # input_documents itself, so a hand-built context string (and the
    # refine-only "existing_answer" key) never reached the prompt.
    response = chain(
        {"input_documents": docs_db, "question": question},
        return_only_outputs=True,
    )

    output_text = response.get('output_text')
    if not output_text or output_text == "The answer is not in the knowledge base.":
        logging.warning("Failed to summarize pages %s.", page_range)
        return no_summary

    return output_text


def truncate_or_split_summaries(summaries, max_pages=None, words_per_page=None):
    """Trim summaries so that together they fit the allowed word budget.

    The budget is ``max_pages * words_per_page`` words. When the summaries
    exceed it, each one gets an equal share of the budget and is cut to that
    many words. Summaries already within their share are returned unchanged,
    so their paragraph breaks survive.

    Args:
        summaries: List of summary strings.
        max_pages: Page limit; defaults to the module-level ``MAX_PAGES``.
        words_per_page: Words per page; defaults to ``WORDS_PER_PAGE``.

    Returns:
        ``summaries`` itself when no trimming is needed, otherwise a new list
        of the same length.
    """
    if max_pages is None:
        max_pages = MAX_PAGES
    if words_per_page is None:
        words_per_page = WORDS_PER_PAGE
    max_words = max_pages * words_per_page

    # Split once and reuse the word lists for both counting and truncation.
    word_lists = [summary.split() for summary in summaries]
    total_words = sum(len(words) for words in word_lists)

    if not summaries or total_words <= max_words:
        return summaries

    logging.info("Summaries exceed maximum allowed words. Adjusting...")
    per_summary_budget = max_words // len(summaries)
    return [
        summary if len(words) <= per_summary_budget
        else " ".join(words[:per_summary_budget])
        for summary, words in zip(summaries, word_lists)
    ]


def configure_font(pdf):
    """Select the font for ``pdf``, preferring the custom font file.

    If ``FONT_PATH`` exists, its font is registered as 'ArialUnicode' (regular
    and bold) and selected at size 12. If the file is missing, or registering
    it fails, 'Arial' at size 12 is selected instead.

    Args:
        pdf: The FPDF document to configure.
    """
    if not os.path.exists(FONT_PATH):
        logging.warning("Custom font file not found. Using default font.")
        pdf.set_font('Arial', size=12)
        return

    try:
        # Register the same file for both the regular and the bold style.
        for font_style in ('', 'B'):
            pdf.add_font('ArialUnicode', font_style, FONT_PATH, uni=True)
        pdf.set_font('ArialUnicode', size=12)
        logging.info("Custom font loaded successfully.")
    except Exception as err:
        logging.error(f"Font loading failed: {err}. Falling back to default.")
        pdf.set_font('Arial', size=12)


def write_answers_to_pdf(answers, output_filename):
    """Write the answers into a PDF file with a title on the first page.

    Summaries stop being added once the document has grown past ``MAX_PAGES``
    pages. The check runs before each summary, so a long final summary can
    still spill over the limit.

    Args:
        answers: Iterable of summary strings, written in order.
        output_filename: Path of the PDF file to create.
    """
    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=16)
    pdf.add_page()

    configure_font(pdf)
    # configure_font() may have fallen back to the built-in font, so reuse
    # whichever family it selected instead of assuming "ArialUnicode" exists.
    font_family = pdf.font_family
    has_unicode_font = font_family.lower() == "arialunicode"

    pdf.set_font(font_family, style='B', size=16)
    # Width 0 spans the printable area, so the title is centred between the
    # margins rather than running off the right edge.
    pdf.cell(0, 10, txt="Summary of Crime and Punishment", ln=True, align='C')
    pdf.ln(10)

    for answer in answers:
        # page_no() also counts pages added by automatic page breaks, which a
        # manual counter would miss.
        if pdf.page_no() > MAX_PAGES:
            break

        # Blank spacer line between summaries.
        pdf.cell(0, 10, ln=True)
        pdf.ln(5)

        pdf.set_font(font_family, size=11)
        if has_unicode_font:
            text = answer
        else:
            # The built-in fonts only cover latin-1; replace anything else
            # so that writing the text cannot fail.
            text = answer.encode('latin1', 'replace').decode('latin1')
        pdf.multi_cell(0, 10, text)
        pdf.ln(10)

        if pdf.get_y() > 250:  # Add a new page if nearing the bottom
            pdf.add_page()
            configure_font(pdf)

    pdf.output(output_filename)
    logging.info("PDF saved as '%s'", output_filename)


def calculate_page_ranges(total_pages, chunk_size):
    """Split pages ``1..total_pages`` into consecutive inclusive ranges.

    Args:
        total_pages: Number of the last page.
        chunk_size: Number of pages per range; the final range may be shorter.

    Returns:
        A list of ``(first_page, last_page)`` tuples, in ascending order.
    """
    ranges = []
    for first in range(1, total_pages + 1, chunk_size):
        last = first + chunk_size - 1
        if last > total_pages:
            # The final chunk is clipped to the end of the book.
            last = total_pages
        ranges.append((first, last))
    return ranges


def generate_summaries(db, chain, total_pages=767, page_chunk=40,
                       output_filename="Summarized_Crime Report.pdf"):
    """Summarize the book chunk by chunk and write the result to a PDF.

    Args:
        db: Vector store handed to ``ask_question``.
        chain: QA chain handed to ``ask_question``.
        total_pages: Number of pages to cover.
        page_chunk: Number of pages summarized per request.
        output_filename: Path of the PDF to write. Defaults to the report
            name this notebook has always used.

    Returns:
        The list of summaries that was written to the PDF, after trimming to
        the page budget.
    """
    page_ranges = calculate_page_ranges(total_pages, page_chunk)
    summaries = []

    for position, page_range in enumerate(page_ranges, start=1):
        logging.info(
            "Summarizing pages %s to %s (%s/%s)...",
            page_range[0], page_range[1], position, len(page_ranges),
        )
        summaries.append(ask_question(db, chain, page_range))

    summaries = truncate_or_split_summaries(summaries)
    write_answers_to_pdf(summaries, output_filename)
    return summaries


def main():
    """Main function to execute the summarization and PDF generation.

    Takes no arguments: it reads the module-level ``db`` (vector store) and
    ``chain`` (QA chain) created by earlier cells and passes them to
    ``generate_summaries`` with its default page settings.
    """

    generate_summaries(db, chain)


# Run the pipeline when this code is executed as the main module.
if __name__ == "__main__":
    main()


2024-11-21 21:51:20,497 - INFO - Summarizing pages 1 to 40 (1/20)...
2024-11-21 21:51:34,547 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-21 21:51:34,551 - INFO - Summarizing pages 41 to 80 (2/20)...
2024-11-21 21:51:39,398 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-21 21:51:39,404 - INFO - Summarizing pages 81 to 120 (3/20)...
2024-11-21 21:51:44,870 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-21 21:51:44,876 - INFO - Summarizing pages 121 to 160 (4/20)...
2024-11-21 21:51:50,488 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-21 21:51:50,493 - INFO - Summarizing pages 161 to 200 (5/20)...
2024-11-21 21:52:00,472 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-21 21:52:00,477 - INFO - Summarizing pages 201 to 240 (6/20)...
2024-11-2