Install necessary libraries

In [None]:
!pip install -q langchain -qU langchain-groq langchain-text-splitters tiktoken langchain_community pypdf

Import required packages

In [None]:
import getpass
import os
import re

import requests
import tiktoken
from google.colab import files
# NOTE(review): recent LangChain releases expose the next two names from
# `langchain_community.document_loaders` and `langchain_text_splitters`;
# confirm these legacy paths still resolve for the installed version.
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.prompts import PromptTemplate
from langchain_groq import ChatGroq

Set the Groq API key

In [None]:
# Credentials must never be stored in the notebook itself. Reuse the key when
# the environment already provides it; otherwise prompt for it without echoing
# so it does not end up in the saved file or its outputs.
# Any key that was previously hard-coded in this cell should be treated as
# leaked and revoked.
if not os.environ.get("GROQ_API_KEY"):
    os.environ["GROQ_API_KEY"] = getpass.getpass("Enter your Groq API key: ")

Initialize the model

In [None]:
# Chat model shared by the map, combine and final reduce steps of this notebook.
groq_settings = dict(
    model="llama-3.1-8b-instant",
    temperature=0,    # zero temperature, presumably to keep summaries repeatable
    max_tokens=None,  # no explicit completion cap is passed
    timeout=None,     # no explicit request timeout is passed
    max_retries=2,
)
model = ChatGroq(**groq_settings)

Upload PDF

In [None]:
# Ask the user for a file through Colab's upload helper. Iterating the result
# yields the uploaded file names, which are used as the path for the loader.
uploaded = files.upload()
if not uploaded:
    # Without this guard, next(iter(...)) fails with a bare StopIteration.
    raise ValueError("No file was uploaded. Re-run this cell and choose a PDF.")

# Only the first uploaded file is processed.
pdf_path = next(iter(uploaded))

Load PDF and clean content

In [None]:
# Load the PDF into a list of documents and normalise the whitespace of every
# one of them. All entries of `data` are split and summarised later, so the
# clean-up must not be limited to the first entry.
loader = PyPDFLoader(pdf_path)
data = loader.load()
if not data:
    raise ValueError(f"No pages could be read from {pdf_path!r}.")

for page in data:
    # Collapse each run of two or more newlines into a single newline.
    page.page_content = re.sub(r"\n\n+", "\n", page.page_content)

Token estimation

In [None]:
# Rough size check before splitting. The 4/3 factor is the common rule of thumb
# that one token is about three quarters of an English word, so it is applied
# to the word count (not the character count) of the whole document.
total_words = sum(len(page.page_content.split()) for page in data)
print(f"Estimated tokens: {int(total_words * 4 / 3)}")

Split into chunks

In [None]:
# Chunking parameters passed to the splitter: target chunk size and the overlap
# kept between neighbouring chunks.
chunk_size, chunk_overlap = 2000, 100

text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=chunk_overlap,
    chunk_size=chunk_size,
)

# One entry per chunk; these are the inputs of the map step.
splits = text_splitter.split_documents(data)

Map prompt

In [None]:
# Map step prompt: turns a single chunk into at most five bullet points.
# `{text}` is replaced with the chunk content.
map_prompt = PromptTemplate(
    input_variables=['text'],
    template=(
        "Write a concise summary of the following text. "
        "The summary should be a list of bullet points. "
        "The summary cannot be more than 5 bullet points. "
        "The text is:\n"
        "{text}\n"
        "\n"
        "Summary: "
    ),
)

Summarize each chunk

In [None]:
from tqdm import tqdm

# Map step: summarise every chunk on its own.
# - `invoke(...).content` is used, as in the combine and reduce steps, instead
#   of the deprecated `predict` helper.
# - A failing chunk is reported and skipped. Appending a placeholder string
#   would feed that placeholder into the later reduce prompts as if it were a
#   real summary.
summaries = []
failed_chunks = []
for index, split in enumerate(tqdm(splits)):
    try:
        response = model.invoke(map_prompt.format(text=split.page_content))
    except Exception as exc:
        failed_chunks.append(index)
        print(f"Chunk {index} could not be summarized: {exc!r}")
        continue
    summaries.append(response.content)

if failed_chunks:
    print(f"Skipped {len(failed_chunks)} of {len(splits)} chunks: {failed_chunks}")
if not summaries:
    # Stop here with a clear message instead of an IndexError further down.
    raise RuntimeError("No chunk could be summarized; see the errors printed above.")

Group summaries for reduction

In [None]:
def group_summaries(summaries, max_summaries):
  """Partition `summaries` into consecutive batches, preserving order.

  A batch is closed as soon as it holds at least `max_summaries` items; any
  leftover items form a final, shorter batch.

  Args:
    summaries: iterable of summaries to batch.
    max_summaries: number of items at which a batch is closed.

  Returns:
    A list of lists; empty when `summaries` is empty.
  """
  batches = []
  pending = []
  for item in summaries:
    pending.append(item)
    if len(pending) < max_summaries:
      continue
    # The batch is full: store it and start a new one.
    batches.append(pending)
    pending = []
  return (batches + [pending]) if pending else batches

# Batch the chunk summaries ten at a time for the combine step.
groups = group_summaries(summaries, max_summaries=10)

Combine prompt for mid-level summaries

In [None]:
# Combine step prompt: merges one batch of bullet-point summaries into a single
# list of at most seven bullets. `{docs}` is replaced with the batch text.
combine_prompt = PromptTemplate(
    input_variables=['docs'],
    template=(
        "The following is set of bullet-point summaries:\n"
        "{docs}\n"
        "Take these and distill it into a consolidated bullet-point summary of the main themes. "
        "Remove the bullet points that are not relevant to the whole text. "
        "The consolidated summary cannot be more than 7 bullet points.\n"
        "Helpful Answer: "
    ),
)

Combine group summaries iteratively

In [None]:
# Keep condensing until one batch is left: every batch is merged into a single
# summary, and the merged summaries are then re-batched ten at a time.
while len(groups) > 1:
  new_summaries = [
    model.invoke(combine_prompt.format(docs="\n".join(group))).content
    for group in groups
  ]
  groups = group_summaries(new_summaries, 10)

In [None]:
# Display the first summary of the first group as a quick sanity check
# (raises IndexError when `groups` is empty).
groups[0][0]

Final reduction prompt

In [None]:
# Final reduce prompt: asks for one prose paragraph built from the summaries
# substituted for `{docs}`.
reduce_prompt = PromptTemplate(
    input_variables=['docs'],
    template=(
        "\n"
        "You are an expert summarizer.\n"
        "\n"
        "Below are multiple summaries of different sections of a document. "
        "Please combine them into a single, cohesive paragraph summary. "
        "Make sure the final summary is **15 complete sentences**, "
        "written in a fluent and readable academic tone. "
        "Do not use bullet points.\n"
        "\n"
        "Summaries:\n"
        "{docs}\n"
        "\n"
        "Final Summary:\n"
    ),
)
# Join the summaries of the remaining group, one per line, and run the final
# reduce prompt over them.
remaining_summaries = "\n".join(groups[0])
response = model.invoke(reduce_prompt.format(docs=remaining_summaries))
final_summary = response.content

Final summary

In [None]:
import textwrap

# Print the final summary under a heading, wrapped at 200 columns.
print("\nFinal Summary:\n")
wrapper = textwrap.TextWrapper(width=200)
print(wrapper.fill(final_summary))