In [1]:
!pip install PyMuPDF

Collecting PyMuPDF
  Downloading PyMuPDF-1.24.7-cp310-none-manylinux2014_x86_64.whl (3.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting PyMuPDFb==1.24.6 (from PyMuPDF)
  Downloading PyMuPDFb-1.24.6-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (15.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.7/15.7 MB[0m [31m52.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDFb, PyMuPDF
Successfully installed PyMuPDF-1.24.7 PyMuPDFb-1.24.6


In [2]:
!pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m225.3/232.6 kB[0m [31m7.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [5]:
import os
import fitz  # PyMuPDF for PDF handling
from nltk import sent_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import defaultdict
from heapq import nlargest
import nltk
from transformers import BartForConditionalGeneration, BartTokenizer

# Download the NLTK data packages needed by the tokenizers and the
# stop-word filter used further down.
for nltk_package in ('punkt', 'stopwords'):
    nltk.download(nltk_package)

# Directory that is scanned for input PDF files
pdf_folder = '/content/drive/MyDrive/Pdf'

# Text file that receives the generated report
output_file = 'summaries.txt'

# Function to read PDF files and extract text from all pages
def read_pdf_files(folder_path):
    """Extract the text of every PDF file directly inside ``folder_path``.

    Files are visited in sorted name order so that repeated runs produce the
    same ordering. The ``.pdf`` extension is matched case-insensitively.
    Each document is closed as soon as its text has been read.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        A ``(texts, filenames)`` tuple of two parallel lists: the concatenated
        text of all pages of each readable PDF, and the matching file names.
        PDFs that fail to open or contain only whitespace are reported with
        ``print`` and left out of both lists.

    Raises:
        OSError: if ``folder_path`` cannot be listed (propagated from
            ``os.listdir``).
    """
    texts = []
    filenames = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # numbering of the files stable between runs.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.lower().endswith('.pdf'):
            continue
        filepath = os.path.join(folder_path, filename)
        try:
            # The context manager closes the document even when reading a
            # page raises, so no file handle is leaked per PDF.
            with fitz.open(filepath) as doc:
                text = ''.join(page.get_text() for page in doc)
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole batch.
            print(f"Error reading {filename}: {str(e)}")
            continue

        if text.strip():  # Ensure extracted text is not empty
            texts.append(text)
            filenames.append(filename)
        else:
            print(f"Warning: {filename} does not contain readable text.")
    return texts, filenames

# Function to summarize text using NLTK
def summarize_text_nltk(text, max_words=None, num_sentences=5):
    """Build an extractive summary of ``text``.

    Two modes are supported:

    * ``max_words`` truthy: sentences are taken from the start of the text, in
      order, until the running token count (as counted by ``word_tokenize``,
      so punctuation tokens are included) reaches ``max_words``. The sentence
      that crosses the limit is kept, so the result may exceed ``max_words``
      by up to one sentence. Word-frequency scores are not used in this mode.
    * otherwise: every distinct sentence is scored by the sum of the
      whole-text frequencies of its alphanumeric, non-stop-word tokens, and
      the ``num_sentences`` best-scoring sentences are returned, highest
      score first. Sentences without any scored word are never selected.

    Args:
        text: The text to summarize.
        max_words: Optional token budget; ``None`` or ``0`` selects the
            frequency-ranked mode.
        num_sentences: Number of sentences to keep in the ranked mode.

    Returns:
        The selected sentences joined with single spaces (``''`` if none).
    """
    sentence_list = sent_tokenize(text)

    if max_words:
        # Leading-sentences mode; scoring would be wasted work here.
        current_words = 0
        summary_sentences = []
        for sentence in sentence_list:
            if current_words >= max_words:
                break
            summary_sentences.append(sentence)
            current_words += len(word_tokenize(sentence))
        return ' '.join(summary_sentences)

    stop_words = set(stopwords.words('english'))
    word_frequencies = defaultdict(int)
    for word in word_tokenize(text.lower()):
        if word not in stop_words and word.isalnum():
            word_frequencies[word] += 1

    sentence_scores = {}
    for sentence in sentence_list:
        # Score each distinct sentence once. Accumulating per occurrence would
        # multiply the score of repeated sentences (e.g. a running header that
        # appears on every page) and let them crowd out real content.
        if sentence in sentence_scores:
            continue
        score = sum(
            word_frequencies.get(word, 0)
            for word in word_tokenize(sentence.lower())
        )
        if score:
            sentence_scores[sentence] = score

    summary_sentences = nlargest(num_sentences, sentence_scores, key=sentence_scores.get)
    return ' '.join(summary_sentences)

# Function to summarize text using BART model from Transformers
_BART_CHECKPOINT = 'facebook/bart-large-cnn'
_BART_CACHE = {}


def _load_bart():
    """Return the ``(tokenizer, model)`` pair, loading the checkpoint only once."""
    if 'pair' not in _BART_CACHE:
        _BART_CACHE['pair'] = (
            BartTokenizer.from_pretrained(_BART_CHECKPOINT),
            BartForConditionalGeneration.from_pretrained(_BART_CHECKPOINT),
        )
    return _BART_CACHE['pair']


def summarize_text_bart(text, max_length=150):
    """Generate an abstractive summary of ``text`` with BART.

    The tokenizer and model are loaded on the first call and reused
    afterwards instead of being re-created for every document.

    ``max_length`` limits the length of the *generated summary*. The input is
    truncated to the model's own limit (``model.config.max_position_embeddings``,
    expected to be 1024 tokens for this checkpoint - verify against the model
    config), so only the beginning of a long document is summarized.

    Args:
        text: The text to summarize.
        max_length: Maximum number of tokens in the generated summary.

    Returns:
        The decoded summary string, or ``''`` when ``text`` is empty or
        whitespace only.
    """
    if not text or not text.strip():
        return ''

    tokenizer, model = _load_bart()

    # Truncate the *input* to what the model accepts, not to the summary length.
    max_input_tokens = model.config.max_position_embeddings
    inputs = tokenizer([text], max_length=max_input_tokens, return_tensors='pt', truncation=True)

    summary_ids = model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        num_beams=4,
        # min_length must not exceed max_length when a short summary is requested.
        min_length=min(30, max_length),
        max_length=max_length,
        length_penalty=2.0,
        early_stopping=True,
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    return summary

# Function to build a summary sized for a given reading time
def time_accurancy_summary(text, words_per_minute, minutes):
    """Return the leading sentences of ``text`` that fit a reading-time budget.

    The budget is ``words_per_minute * minutes`` tokens. Sentences are taken
    in their original order while the running token count (from
    ``word_tokenize``) is still below the budget, so the sentence that crosses
    the budget is included. The chosen sentences are joined with single
    spaces.
    """
    budget = words_per_minute * minutes
    picked = []
    used = 0

    for candidate in sent_tokenize(text):
        if used >= budget:
            break
        picked.append(candidate)
        used += len(word_tokenize(candidate))

    return ' '.join(picked)

# Function to reduce every sentence of a text to its keywords using NLTK
def summarize_each_page_nltk(text):
    """Return one keyword string per sentence of ``text``.

    Despite its name, this function works on sentences, not pages: ``text`` is
    split with ``sent_tokenize`` and each sentence is lower-cased, tokenized,
    and stripped of English stop words and non-alphanumeric tokens. The
    remaining tokens are joined with single spaces.

    Returns:
        A list with one (possibly empty) string per sentence, in order.
    """
    ignored = set(stopwords.words('english'))

    def keywords(sentence):
        tokens = word_tokenize(sentence.lower())
        return ' '.join(tok for tok in tokens if tok.isalnum() and tok not in ignored)

    return [keywords(sentence) for sentence in sent_tokenize(text)]

# Read PDF files and extract texts
texts, filenames = read_pdf_files(pdf_folder)

# Maps filename -> size of the reading-time summary as a percentage of the
# document's token count (only filled in for documents that get summarized)
time_accuracies = {}

# Reading-time budget used for the time-based summary:
# 30 minutes at 200 words per minute
words_per_minute = 200
minutes = 30

# Write the whole report in a single pass over one open file handle
with open(output_file, 'w', encoding='utf-8') as f:
    # Summarize each PDF file
    for count, (text, filename) in enumerate(zip(texts, filenames), start=1):
        f.write(f"Count of PDF file: {count}\n")
        f.write(f"Filename: {filename}\n\n")
        f.write(f"Full text from {filename}:\n")
        f.write(text + '\n\n')

        # Determine if the document is a book (over 200 pages).
        # The document is opened only to read its page count and is closed
        # again immediately so that no file handle is leaked per PDF.
        with fitz.open(os.path.join(pdf_folder, filename)) as doc:
            num_pages = len(doc)

        if num_pages > 200:
            f.write(f"\nSummarizing all pages for {filename} using NLTK:\n")
            nltk_summary = summarize_text_nltk(text)
            f.write(nltk_summary + '\n\n')  # Using NLTK for summarization

            # Summarize using BART model
            f.write(f"\nSummarizing all pages for {filename} using BART model:\n")
            bart_summary = summarize_text_bart(text)
            f.write(bart_summary + '\n\n')

            # Reading-time summarization
            f.write(f"\nTime-accurancy Summarization for {filename}:\n")
            time_summary = time_accurancy_summary(text, words_per_minute, minutes)
            f.write(time_summary + '\n\n')

            # NOTE(review): summarize_each_page_nltk returns one entry per
            # *sentence* of the whole text, so the "Page {i}" labels written
            # below do not correspond to PDF pages. Page boundaries are not
            # available here because the pages are concatenated on extraction.
            f.write(f"\nSummarizing each page for {filename} using NLTK:\n")
            page_summaries = summarize_each_page_nltk(text)
            for i, summary in enumerate(page_summaries, start=1):
                f.write(f"Page {i} summary: {summary}\n")

            # Share of the document's tokens kept by the reading-time summary,
            # in percent. Guarded so an empty token list cannot divide by zero.
            word_count = len(word_tokenize(text))
            if word_count:
                time_accuracy = len(word_tokenize(time_summary)) / word_count * 100
            else:
                time_accuracy = 0.0

            # Store the percentage for the closing overview
            time_accuracies[filename] = time_accuracy

            # Print separator
            f.write("\n--------------------------------------------------\n\n")

        else:
            # NOTE(review): this branch is also taken for exactly 200 pages,
            # although the message says "less than 200".
            f.write(f"\n{filename} is less than 200 pages, skipping summarization.\n")
            f.write("\n--------------------------------------------------\n\n")

    # Write total count of PDF files processed
    f.write(f"Total PDF files processed: {len(filenames)}\n\n")

    # Closing overview: one entry per PDF, with the percentage where available
    f.write("Overall Accuracies:\n")
    for filename in filenames:
        f.write(f"{filename}:\n")
        if filename in time_accuracies:
            f.write(f"Time-accurancy Summary: {time_accuracies[filename]:.2f}%\n")
        f.write("\n--------------------------------------------------\n\n")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


