In [3]:
!pip install langchain google-cloud-aiplatform google-auth PyPDF2 ratelimit backoff reportlab >/dev/null

In [4]:
import re
import vertexai
import warnings
import backoff
from langchain.chat_models import ChatVertexAI
import PyPDF2
import ratelimit
from google.api_core import exceptions
from tqdm import tqdm
from vertexai.language_models import TextGenerationModel
from reportlab.lib.styles import ParagraphStyle
from reportlab.lib.pagesizes import letter
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.platypus import SimpleDocTemplate, Paragraph
from langchain.chat_models import ChatVertexAI
import os

In [6]:
from google.colab import auth
auth.authenticate_user()

In [7]:
import vertexai

PROJECT_ID = "your-project-id-here"  # @param {type:"string"}
vertexai.init(project=PROJECT_ID, location="us-central1")

In [8]:
import warnings
warnings.filterwarnings("ignore")

In [9]:
generation_model = TextGenerationModel.from_pretrained("text-bison@002")

In [10]:
def multiple_replace(string, rep_dict):
    pattern = re.compile("|".join([re.escape(k) for k in sorted(rep_dict,key=len,reverse=True)]), flags=re.DOTALL)
    return pattern.sub(lambda x: rep_dict[x.group(0)], string)

In [186]:
def create_pdf(input_text, output_filename):
    pdf = SimpleDocTemplate(output_filename, pagesize=letter)
    styles = getSampleStyleSheet()
    justified_style = ParagraphStyle(
        "Justified",
        parent=styles["Normal"],
        alignment=0,
        spaceBefore=12,
        spaceAfter=12,
    )
    story = []
    # paragraphs = input_text.split('\n')
    for paragraph in input_text:
        story.append(Paragraph(paragraph, justified_style))
    pdf.build(story)


In [12]:
book_path = input('input book path here (local): ')
reader = PyPDF2.PdfReader(book_path)
pages = reader.pages
page_texts = []
replacements = {"Crime and Punishment":"", "Free eBooks at Planet eBook.com":""}
concatenated_text = ""

for i in range(5,len(pages)):
    text = multiple_replace(pages[i].extract_text().strip(),replacements)
    page_texts.append(text)
    concatenated_text += text

In [13]:
llm = ChatVertexAI(temperature=0)
num_tokens = llm.get_num_tokens(" ".join(page_texts[0:10]))
print (f"Our prompt has {num_tokens} tokens")

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (3940 > 1024). Running this sequence through the model will result in indexing errors


Our prompt has 3940 tokens


In [79]:
sum=0
for p in page_texts[1:21]:
  sum+=llm.get_num_tokens(p)

print(f'average tokens on a page: {sum/20}')

average tokens on a page: 438.7


In [23]:
print(f"There are {len(concatenated_text)} characters in the pdf")

There are 1147238 characters in the pdf


In [24]:
CALL_LIMIT = 50
ONE_MINUTE = 60
FIVE_MINUTE = 10 * ONE_MINUTE
def backoff_hdlr(details):
    print(
        "Backing off {} seconds after {} tries".format(
            details["wait"], details["tries"]
        )
    )
@backoff.on_exception(
    backoff.expo,
    (
            exceptions.ResourceExhausted,
            ratelimit.RateLimitException,
    ),  # Exceptions to retry on
    max_time=FIVE_MINUTE,
    on_backoff=backoff_hdlr,
)
@ratelimit.limits(
    calls=CALL_LIMIT, period=ONE_MINUTE
)
def model_with_limit_and_backoff(**kwargs):
    return generation_model.predict(**kwargs)

In [31]:
initial_prompt_template = """
    This task is to summarize a book.
    Write a concise summary of the following text. The text is being taken in chunks from the book 'crime and punishment'.

    {text}

    SUMMARY:
"""

In [32]:
CHUNK_SIZE = 2
initial_summary = []

for i in tqdm(range(len(page_texts))):
    pages_to_merge = [x for x in range(i, i + CHUNK_SIZE) if x < len(page_texts)]
    extracted_texts = [page_texts[x] for x in pages_to_merge]
    text = "\n".join(extracted_texts)
    prompt = initial_prompt_template.format(text=text)
    summary = model_with_limit_and_backoff(prompt=prompt, max_output_tokens=1024).text
    initial_summary.append(summary)
    if pages_to_merge[-1] == len(reader.pages):
        break

100%|██████████| 762/762 [29:40<00:00,  2.34s/it]


In [80]:
s=0
for x in initial_summary:
  s+=llm.get_num_tokens(x)
print(f'total tokens in initial generated summaries: {s}')
print(f'possible number of pages with these many tokens: {s/450}')

total tokens in initial generated summaries: 98259
possible number of pages with these many tokens: 218.35333333333332


In [72]:
s=0
for x in initial_summary[75:90]:
  s+=llm.get_num_tokens(x)

print(f'number of tokens in 15 summaries: {s}')

number of tokens in 15 initial summaries: 1939


In [129]:
chunk_size = 30
overlap = 2

result_chunks = [" ".join(initial_summary[i:i + chunk_size]) for i in range(0, len(initial_summary), chunk_size - overlap)]


In [132]:
prompt_template = """
    This task is to summarize the already generated summaries of chunks the book 'crime and punishment'.
    Write a concise summary of the following text.

    {text}

    CONCISE SUMMARY:
"""

In [133]:
summary_list = []
for s in tqdm(result_chunks):
    prompt = prompt_template.format(text=s)
    summary = model_with_limit_and_backoff(prompt=prompt, max_output_tokens=1000).text
    summary_list.append(summary)


100%|██████████| 28/28 [02:22<00:00,  5.09s/it]


In [134]:
s=0
for x in summary_list:
  s+=llm.get_num_tokens(x)
print(f'total tokens in finally generated summaries: {s}')
print(f'possible number of pages with these many tokens: {s/440}')

total tokens in finally generated summaries: 7865
possible number of pages with these many tokens: 17.875


In [192]:
final_summary = " ".join(summary_list)
final_summary = final_summary.split()
final_summary = [final_summary[i:i+250] for i in range(0, len(final_summary), 250)]
final_summary = [" ".join(x) for x in final_summary]

In [193]:
create_pdf(final_summary, "summary.pdf")