In [None]:
%pip install langchain langchain_openai pypdf langchain-community --quiet --upgrade

In [17]:
import getpass
import os

def _set_env(var: str):
    if not os.environ.get(var):
        os.environ[var] = getpass.getpass(f"{var}: ")

_set_env("OPENAI_API_KEY")

In [18]:
from langchain_openai.chat_models import ChatOpenAI
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.summarize import load_summarize_chain
import requests

In [19]:
# Download the PDF from blob storage:
url = (
    "https://storage.googleapis.com/strapi_cms_assets/principles_of_marketing_book.pdf"
)
response = requests.get(url)
with open("principles_of_marketing_book.pdf", "wb") as f:
    f.write(response.content)

In [20]:
# Create a text splitter, load the PDF and split it:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=200)
loader = PyPDFLoader("principles_of_marketing_book.pdf")
pages = loader.load_and_split()

In [21]:
pages[0]

Document(metadata={'producer': 'Prince 12.5 (www.princexml.com)', 'creator': 'Pressbooks 5.18.1', 'creationdate': '2021-02-19T19:06:37+00:00', 'moddate': '2021-02-19T19:06:37+00:00', 'title': 'Principles of Marketing', 'gts_pdfxversion': 'PDF/X-4', 'trapped': '/False', 'source': 'principles_of_marketing_book.pdf', 'total_pages': 497, 'page': 0, 'page_label': '1'}, page_content='Principles of Marketing')

In [28]:
import PyPDF2
import tiktoken

def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ""
        for page in reader.pages:
            text += page.extract_text() or ""
    return text

def count_tokens(text, model="gpt-4o-mini"):
    encoding = tiktoken.encoding_for_model(model)
    tokens = encoding.encode(text)
    return len(tokens)

# Fő rész
pdf_path = "principles_of_marketing_book.pdf"
text = extract_text_from_pdf(pdf_path)
num_tokens = count_tokens(text)

print(f"The PDF contains {num_tokens} token.")

The PDF contains 229005 token.


In [22]:
llm = ChatOpenAI(model="gpt-4o-mini")

```python
def load_summarize_chain(
    llm: BaseLanguageModel,
    chain_type: str = "stuff",
    verbose: bool | None = None,
    **kwargs: Any
) -> BaseCombineDocumentsChain
```

The chain type can be either:

- stuff: Combine all of the documents into a single LLM call and then summarize.
- map_reduce: Summarize each document individually and then combine the summaries.
- refine: Summarize each document sequentially, refining the previous summary with each new document.

In [23]:
summarization_chain = load_summarize_chain(llm=llm, chain_type="map_reduce")

In [None]:
# Summarize the document using the summarization chain:
result = summarization_chain.invoke(pages)

In [29]:
# Doing a smaller summarization using the same chain:
smaller_amount_of_pages = pages[:10]
second_result = summarization_chain.invoke(smaller_amount_of_pages)

In [30]:
second_result

{'input_documents': [Document(metadata={'producer': 'Prince 12.5 (www.princexml.com)', 'creator': 'Pressbooks 5.18.1', 'creationdate': '2021-02-19T19:06:37+00:00', 'moddate': '2021-02-19T19:06:37+00:00', 'title': 'Principles of Marketing', 'gts_pdfxversion': 'PDF/X-4', 'trapped': '/False', 'source': 'principles_of_marketing_book.pdf', 'total_pages': 497, 'page': 0, 'page_label': '1'}, page_content='Principles of Marketing'),
  Document(metadata={'producer': 'Prince 12.5 (www.princexml.com)', 'creator': 'Pressbooks 5.18.1', 'creationdate': '2021-02-19T19:06:37+00:00', 'moddate': '2021-02-19T19:06:37+00:00', 'title': 'Principles of Marketing', 'gts_pdfxversion': 'PDF/X-4', 'trapped': '/False', 'source': 'principles_of_marketing_book.pdf', 'total_pages': 497, 'page': 1, 'page_label': '2'}, page_content='Principles of Marketing\n[Author remo ved at request of original publisher]\nUNIVERSITY OF MINNESO TA LIBRARIES PUBLISHING EDITION, 2015. THIS EDITION AD APTED\nFR OM A WORK ORIGINALL Y PR

# Doing a custom `MapReduceChain` to generate a summary in Hungarian:

In [40]:
from langchain.chains.combine_documents.map_reduce import MapReduceDocumentsChain
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains import LLMChain
from langchain_core.prompts import PromptTemplate
from langchain.chains.mapreduce import MapReduceChain
from langchain.text_splitter import CharacterTextSplitter


map_template_string = """Give the following pages of a marketing book. Generate a summary in Hungarian:
Pages:
{pages}

Summary:
"""

reduce_template_string = """Given the following Hungarian summaries of pages of a marketing book, generate a high level description of the book in Hungarian:
Summaries:
{summaries}
"""

# Prompt to use in map and reduce stages
MAP_PROMPT = PromptTemplate(input_variables=["pages"], template=map_template_string)
REDUCE_PROMPT = PromptTemplate(
    input_variables=["summaries"], template=reduce_template_string
)

# LLM to use in map and reduce stages
map_llm_chain = LLMChain(llm=llm, prompt=MAP_PROMPT)
reduce_llm_chain = LLMChain(llm=llm, prompt=REDUCE_PROMPT)

# Takes a list of documents and combines them into a single string
combine_documents_chain = StuffDocumentsChain(
    llm_chain=reduce_llm_chain,
    document_variable_name="summaries",
)

# Combining documents by mapping a chain over them, then combining results with reduce chain
combine_documents = MapReduceDocumentsChain(
    # Map chain
    llm_chain=map_llm_chain,
    # Reduce chain
    # The variable name in the llm_chain to put the documents in:
    document_variable_name="pages",
    combine_document_chain=combine_documents_chain,
)

map_reduce = MapReduceChain(
    combine_documents_chain=combine_documents,
    text_splitter=CharacterTextSplitter(
        separator="\n##\n", chunk_size=100, chunk_overlap=0
    ),
)

In [36]:
map_reduce_result = map_reduce.invoke(
    {"input_text": "\n".join([doc.page_content for doc in pages])[0:100]}
)

In [37]:
map_reduce_result

{'output_text': 'A "Marketing Elvei" című könyv átfogó áttekintést nyújt a marketing alapjairól, amely elengedhetetlen a vállalatok sikeres termék- és szolgáltatásértékesítéséhez. Részletesen elemzi a piackutatás szerepét, a célcsoportok azonosítását, valamint a marketing-mix négy fő elemét: terméket, árat, helyszínt és promóciót. A könyv hangsúlyozza a fogyasztói magatartás megértésének fontosságát, továbbá a márkaépítés jelentőségét a versenyképes piacon. Praktikus tanácsokat ad a hatékony marketingkommunikációhoz és a digitális marketing lehetőségeinek kihasználásához, így ideális útmutató mindazok számára, akik szeretnék elmélyíteni marketingismereteiket és fejleszteni stratégiáikat.'}