# Comparison CPU

In [1]:
import os
import pandas as pd
import time
import re
from language_tool_python import LanguageTool
from transformers import AutoTokenizer, BartForConditionalGeneration
import PyPDF2

In [2]:
# Model locations. `paper_model_path` feeds the summarization model loader and
# `tokenizer_path` feeds the tokenizer loader; `text_model_path` is not
# referenced in the visible cells and happens to share the tokenizer's value.
paper_model_path = "../iter_trained_model"
text_model_path = tokenizer_path = "facebook/bart-large-cnn"

# Folder scanned for input papers, and the CSV that receives the timing rows.
directory, output_file = "../Test Paper Data/", "../comparision_results.csv"

In [3]:
def remove_section_headers(text):
    """Delete numbered section labels such as "1. Intro" or "IV. Results".

    A label is a run of digits (or of the letters I, V, X, L, C, D, M), a dot,
    whitespace and the single word that follows. Every occurrence is removed
    wherever it appears in the text, so a sentence that ends in a number and is
    followed by another sentence (e.g. "2020. The") is matched as well.

    Args:
        text: The text to clean.

    Returns:
        The text with every match removed; surrounding whitespace is kept.
    """
    arabic_label = r'\b\d+\.\s+\w+\b'
    roman_label = r'\b[IVXLCDM]+\.\s+\w+\b'

    # Arabic alternative first, then Roman, combined into one alternation.
    header_regex = re.compile(arabic_label + '|' + roman_label)
    return header_regex.sub('', text)

def exclude_header_footer(text):
    """Drop the leading 5% and trailing 10% of a page's sentences.

    The text is split into sentences at whitespace that follows "." or "?",
    except after patterns that look like abbreviations (e.g. "e.g." or "Dr.").
    The sentence counts to drop are truncated towards zero, so short pages may
    lose no sentences at all.

    Args:
        text: The text of one page.

    Returns:
        The remaining sentences joined with single spaces.
    """
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
    total = len(sentences)

    header_count = int(total * 0.05)
    footer_count = int(total * 0.1)

    # Use an explicit end index: a negative-index slice `[h:-f]` collapses to
    # `[h:0]` (empty) when f == 0, which discarded every page with fewer than
    # ten sentences.
    kept_sentences = sentences[header_count:total - footer_count]

    return ' '.join(kept_sentences)

def remove_citations(text):
    """Strip common inline citation markers from text.

    Three shapes are removed:
      * bracketed numbers: "[1]", "[12]", "[123]"
      * parenthesised author/year: "(Author, 2020)", "(Author et al., 2020)"
      * bracketed author/year: "[Author, 2020]", "[Author et al., 2020]"

    The author/year shapes require a single-word author followed by ", " and a
    four-digit year, so a comma-less form such as "(Author 2020)" is left
    untouched.

    Args:
        text: The text to clean.

    Returns:
        The text with every match removed; surrounding whitespace is kept.
    """
    numeric_ref = r'\[\d+\]'
    paren_author_year = r'\(\w+(?: et al.)?, \d{4}\)'
    bracket_author_year = r'\[\w+(?: et al.)?, \d{4}\]'

    citation_regex = re.compile(
        '|'.join([numeric_ref, paren_author_year, bracket_author_year])
    )
    return citation_regex.sub('', text)

def replace_multiple_whitespace(text):
    """Replace every run of whitespace characters with one space.

    A single non-space whitespace character (tab, newline, ...) is also turned
    into a space. Leading and trailing runs are collapsed, not stripped.

    Args:
        text: The text to normalize.

    Returns:
        The normalized text.
    """
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', text)

def fix_grammar(text):
    """Return `text` with LanguageTool's suggested corrections applied.

    A fresh 'en-US' LanguageTool instance is created for the call and closed
    before returning, so its backing process does not linger until garbage
    collection.

    Args:
        text: The text to correct.

    Returns:
        The corrected text as produced by `LanguageTool.correct`.
    """
    tool = LanguageTool('en-US')
    try:
        # The result of a separate `tool.check(text)` was never used, and per the
        # language_tool_python docs `correct` runs its own check, so that extra
        # pass is dropped.
        return tool.correct(text)
    finally:
        tool.close()

def preprocess_text(text):
    """Run the text-cleaning pipeline on a piece of extracted text.

    Newlines are turned into spaces first; then, in order, whitespace runs are
    collapsed, citation markers are removed and section headers are removed.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned text.
    """
    cleaned = text.replace("\n", " ")

    # Order matters: each step receives the previous step's output.
    cleaning_steps = (
        replace_multiple_whitespace,
        remove_citations,
        remove_section_headers,
    )
    for step in cleaning_steps:
        cleaned = step(cleaned)

    return cleaned

def get_paper_content(pdf_file_path):
    """Extract and clean the text of every page of a PDF.

    Each page's text is passed through `preprocess_text` and then
    `exclude_header_footer`; the per-page results are joined with newlines.

    Args:
        pdf_file_path: Path of the PDF file to read.

    Returns:
        The cleaned text of all pages, one page per line.
    """
    with open(pdf_file_path, 'rb') as pdf_handle:
        reader = PyPDF2.PdfReader(pdf_handle)
        # Pages are read while the file handle is still open.
        cleaned_pages = [
            exclude_header_footer(preprocess_text(page.extract_text()))
            for page in reader.pages
        ]

    return '\n'.join(cleaned_pages)

def _chunk_windows(total_tokens, max_tokens, stride):
    """Yield (start, end) token windows that together cover every token."""
    start = 0
    while True:
        end = min(start + max_tokens, total_tokens)
        yield start, end
        if end >= total_tokens:
            return
        start += stride


def generate_summary_paper(text, min_summary_length=128, max_summary_length=1024, overlap_percentage=35):
    """Summarize a paper by summarizing overlapping token windows of its text.

    The text is cleaned and grammar-corrected, tokenized without truncation and
    cut into windows of at most 1024 tokens that overlap by
    `overlap_percentage` percent. Each window is summarized independently; the
    partial summaries are joined with spaces and grammar-corrected once more.

    Args:
        text: Full text of the paper.
        min_summary_length: `min_length` passed to `generate` for each window.
        max_summary_length: `max_length` passed to `generate` for each window.
        overlap_percentage: Share of a window (in percent) re-used by the next
            window. Must leave a positive stride, i.e. be below 100.

    Returns:
        The combined summary, or an empty string when the cleaned text is blank.

    Raises:
        ValueError: If `overlap_percentage` leaves no forward progress between
            consecutive windows.
    """
    max_tokens = 1024
    overlap_size = int(max_tokens * overlap_percentage / 100)
    stride = max_tokens - overlap_size
    if stride <= 0:
        # Previously this surfaced as a cryptic range() error (== 100%) or a
        # silently empty summary (> 100%).
        raise ValueError("overlap_percentage must be below 100")

    text = preprocess_text(text)
    text = fix_grammar(text)

    # Nothing to summarize: skip loading the model entirely.
    if not text.strip():
        return ""

    model = BartForConditionalGeneration.from_pretrained(paper_model_path)
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

    tokenized_text = tokenizer.encode(text, return_tensors="pt", truncation=False)
    total_tokens = tokenized_text.size(1)

    # The windows cover the whole sequence: texts shorter than one window get a
    # single short window, and the tail after the last full window gets its own
    # (shorter) window instead of being dropped.
    summaries = []
    for start, end in _chunk_windows(total_tokens, max_tokens, stride):
        chunk = tokenized_text[:, start:end]
        summary_ids = model.generate(chunk, min_length=min_summary_length, max_length=max_summary_length, num_beams=10, early_stopping=True, length_penalty=0.8, repetition_penalty=1.5)
        summaries.append(tokenizer.decode(summary_ids[0], skip_special_tokens=True))

    # Concatenate the partial summaries into the final summary.
    final_summary = " ".join(summaries).replace("\n", " ")
    return fix_grammar(final_summary)

In [6]:
def process_file(file_path):
    """Extract one paper's text and run the summarizer on it.

    The generated summary is discarded; nothing is returned.

    Args:
        file_path: Path of the paper to process.
    """
    generate_summary_paper(get_paper_content(file_path))

def process_files_in_directory(directory):
    """Time `process_file` on every PDF in a directory.

    Regular files whose name ends in ".pdf" (any letter case) are processed in
    sorted name order; other entries are skipped. After each file the timings
    collected so far are written to the module-level `output_file`, so partial
    results survive an interrupted run.

    Args:
        directory: Folder containing the papers.

    Returns:
        A list of `(filename, seconds)` tuples in processing order.
    """
    results = []
    # os.listdir returns entries in arbitrary order; sort for reproducible runs.
    for filename in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, filename)
        # Skip sub-directories and non-PDF files instead of crashing on them.
        if not (filename.lower().endswith('.pdf') and os.path.isfile(file_path)):
            continue

        # perf_counter is a monotonic, high-resolution clock meant for intervals;
        # time.time() can jump when the system clock is adjusted.
        started = time.perf_counter()
        process_file(file_path)
        processing_time = time.perf_counter() - started

        results.append((filename, processing_time))
        print(filename + " Processed")
        save_results_to_csv(results, output_file)
    return results

def save_results_to_csv(results, output_file):
    """Write timing rows to a CSV file and report where they were saved.

    Args:
        results: Iterable of `(filename, processing_time)` rows.
        output_file: Destination path of the CSV; it is written without an
            index column.
    """
    column_names = ['Filename', 'Processing Time (s)']
    timing_table = pd.DataFrame(data=results, columns=column_names)
    timing_table.to_csv(output_file, index=False)
    print(f"Results saved to {output_file}")

In [None]:
# Driver: time every paper in the configured folder, or report a bad path.
if not os.path.isdir(directory):
    print(f"Error: {directory} is not a valid directory.")
else:
    print(f"Processing files in directory: {directory}")
    results = process_files_in_directory(directory)

Processing files in directory: ../Test Paper Data/


Token indices sequence length is longer than the specified maximum sequence length for this model (7979 > 1024). Running this sequence through the model will result in indexing errors


10005108.pdf Processed
Results saved to ../comparision_results.csv


Token indices sequence length is longer than the specified maximum sequence length for this model (5238 > 1024). Running this sequence through the model will result in indexing errors


10018187.pdf Processed
Results saved to ../comparision_results.csv


Token indices sequence length is longer than the specified maximum sequence length for this model (11011 > 1024). Running this sequence through the model will result in indexing errors


10032547.pdf Processed
Results saved to ../comparision_results.csv


Token indices sequence length is longer than the specified maximum sequence length for this model (11545 > 1024). Running this sequence through the model will result in indexing errors


10044683.pdf Processed
Results saved to ../comparision_results.csv


Token indices sequence length is longer than the specified maximum sequence length for this model (7761 > 1024). Running this sequence through the model will result in indexing errors


10054384.pdf Processed
Results saved to ../comparision_results.csv


Token indices sequence length is longer than the specified maximum sequence length for this model (18643 > 1024). Running this sequence through the model will result in indexing errors
