In [None]:
!pip install transformers sentencepiece PyPDF2

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
import PyPDF2
from google.colab import files

In [None]:
# Ask the user to upload the PDF through google.colab's `files` helper.
# The result is treated later in the notebook as a dict keyed by uploaded
# filename (presumably mapping each name to the file's bytes — confirm against
# the google.colab documentation).
uploaded = files.upload()

In [None]:
def extract_text_from_pdf(file):
    """Return the text of every page of a PDF as one string.

    Args:
        file: Whatever ``PyPDF2.PdfReader`` accepts as its source; this
            notebook passes the filename of the uploaded PDF.

    Returns:
        str: The pages' text in page order, joined with newlines. A page whose
        ``extract_text()`` yields nothing (``None`` or ``""``) contributes an
        empty string, so an image-only PDF produces only newlines.
    """
    reader = PyPDF2.PdfReader(file)
    # Join once instead of growing a string page by page. The newline separator
    # keeps the last word of one page from fusing with the first word of the
    # next, and `or ""` tolerates pages for which no text could be extracted.
    return "\n".join(page.extract_text() or "" for page in reader.pages)

# Get the filename from the uploaded dict. Fail with a clear message when the
# upload was cancelled, instead of an opaque IndexError on an empty dict.
if not uploaded:
    raise ValueError("No file was uploaded. Re-run the upload cell and choose a PDF.")

pdf_filename = next(iter(uploaded))  # first (normally the only) uploaded file
text = extract_text_from_pdf(pdf_filename)

if text.strip():
    print("✅ PDF uploaded and text extracted.")
    print("📄 Preview of extracted text:\n")
    print(text[:1000])  # preview only
else:
    # Nothing extractable (e.g. the pages are images): do not claim success.
    print("⚠️ PDF uploaded, but no text could be extracted from it.")

In [None]:
# Checkpoint identifier handed to both `from_pretrained` calls below.
model_name = "t5-small"

# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Load the tokenizer and the model, placing the model on the chosen device.
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)

In [None]:
def summarize_text(text, max_len=150, min_len=70, max_chars=1000):
    """Summarize ``text`` with the notebook's T5 model.

    Relies on the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        text: The document text to summarize.
        max_len: ``max_length`` passed to ``model.generate``.
        min_len: ``min_length`` passed to ``model.generate``.
        max_chars: Character budget applied to the cleaned text before
            tokenization (demo-sized by default). ``None`` disables the
            character cut; the tokenizer's 512-token truncation still applies.

    Returns:
        str: The decoded summary, or ``""`` when ``text`` is empty or contains
        only whitespace (the model is not called in that case).
    """
    # Collapse every whitespace run (newlines included) to a single space
    # *before* truncating, so the character budget is spent on content rather
    # than on layout whitespace left over from PDF extraction.
    cleaned = " ".join((text or "").split())
    if not cleaned:
        # A bare "summarize: " prompt would only make the model invent text.
        return ""

    if max_chars is not None:
        cleaned = cleaned[:max_chars]  # truncate long docs for demo

    prompt = "summarize: " + cleaned
    inputs = tokenizer.encode(
        prompt, return_tensors="pt", truncation=True, max_length=512
    ).to(device)

    summary_ids = model.generate(
        inputs,
        max_length=max_len,
        min_length=min_len,
        num_beams=4,
        length_penalty=2.0,
        early_stopping=True,
    )

    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [None]:
# Summarize the extracted PDF text, then print it beneath a heading
# (the newline separator reproduces the blank line between heading and body).
summary = summarize_text(text)
print("\n Summary:\n", summary, sep="\n")