# Comparison GPU

In [1]:
!pip install pandas torch PyPDF2 language-tool-python transformers

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting language-tool-python
  Downloading language_tool_python-2.7.1-py3-none-any.whl.metadata (12 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading language_tool_python-2.7.1-py3-none-any.whl (34 kB)
Installing collected packages: PyPDF2, language-tool-python
Successfully installed PyPDF2-3.0.1 language-tool-python-2.7.1


In [2]:
import os
import pandas as pd
import time
import PyPDF2
import re
import torch
import gc
import threading
from language_tool_python import LanguageTool
from transformers import AutoTokenizer, BartForConditionalGeneration

In [3]:
# Location of the BART checkpoint that a later cell loads with
# BartForConditionalGeneration.from_pretrained.
paper_model_path = "/kaggle/input/iter-papers-trained/other/gen1/1"
# Identifier the tokenizer is loaded from.
tokenizer_path = "facebook/bart-large-cnn"

# Folder whose files are summarised one by one.
directory = "/kaggle/input/test-paper-data"
# CSV that accumulates one row of timing/size statistics per processed file.
output_file = "/kaggle/working/comparision_results_gpu.csv"

# Device the model and the token tensors are moved to. It is hard-coded, so the
# print below only reports whether CUDA is available; it does not select a fallback.
device = "cuda"
print(torch.cuda.is_available())

True


In [4]:
def remove_section_headers(text):
    """Delete numbered heading fragments such as "1. Introduction" or "IV. Results".

    A fragment is a run of digits (or of the letters I, V, X, L, C, D, M),
    followed by a dot, whitespace and the single word after it. Every such
    match is removed from ``text`` and the remaining string is returned.
    """
    arabic_heading = r'\b\d+\.\s+\w+\b'          # e.g. "2. Methods"
    roman_heading = r'\b[IVXLCDM]+\.\s+\w+\b'    # e.g. "II. Background"

    heading_re = re.compile(arabic_heading + '|' + roman_heading)
    return heading_re.sub('', text)

def exclude_header_footer(text):
    """Drop the first 5% and the last 10% of the sentences of a page.

    The text is split into sentences at whitespace that follows "." or "?"
    (with look-behinds that skip patterns like "e.g." and "Mr."). The leading
    and trailing shares are meant to discard running headers and footers.

    Args:
        text: Text of a single page.

    Returns:
        The remaining sentences joined with single spaces. Pages too short
        for either share to reach one whole sentence are returned unchanged
        apart from the re-joining.
    """
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
    total = len(sentences)

    header_count = int(total * 0.05)
    footer_count = int(total * 0.1)

    # Slice with an explicit end index. A negative end such as [a:-footer_count]
    # turns into [a:0] when footer_count is 0 and would discard the whole page.
    kept_sentences = sentences[header_count:total - footer_count]

    return ' '.join(kept_sentences)

def remove_citations(text):
    """Remove inline citation markers from ``text``.

    Three shapes are deleted:
      * numeric brackets, e.g. "[12]"
      * parenthesised author/year, e.g. "(Smith, 2020)" or "(Smith et al., 2020)"
      * bracketed author/year, e.g. "[Smith, 2020]" or "[Smith et al., 2020]"

    Surrounding whitespace is left as it is.
    """
    # Shared "Author, Year" / "Author et al., Year" core of the last two shapes.
    author_year = r'\w+(?: et al.)?, \d{4}'

    numeric_bracket = r'\[\d+\]'
    parenthesised = r'\(' + author_year + r'\)'
    bracketed = r'\[' + author_year + r'\]'

    citation_re = '|'.join((numeric_bracket, parenthesised, bracketed))
    return re.sub(citation_re, '', text)

def replace_multiple_whitespace(text):
    """Collapse every run of whitespace characters into one space character."""
    return re.sub(r'\s+', ' ', text)

def fix_grammar(text):
    """Return ``text`` with LanguageTool's suggested en-US corrections applied.

    A LanguageTool instance is created for the call and closed again before
    returning, so its backing resources do not linger until garbage collection.

    Args:
        text: Text to correct.

    Returns:
        The corrected text as produced by ``LanguageTool.correct``.
    """
    tool = LanguageTool('en-US')
    try:
        # correct() runs the check itself, so a separate check() call whose
        # result is unused would only repeat the same work.
        return tool.correct(text)
    finally:
        tool.close()

def preprocess_text(text):
    """Normalise raw text before summarisation.

    Steps, in order: turn newlines into spaces, collapse whitespace runs,
    strip citation markers, strip numbered section-heading fragments.
    """
    single_line = text.replace("\n", " ")
    collapsed = replace_multiple_whitespace(single_line)
    without_citations = remove_citations(collapsed)
    return remove_section_headers(without_citations)

def get_paper_content(pdf_file_path):
    """Extract and clean the text of every page of a PDF.

    Each page is run through ``preprocess_text`` and then
    ``exclude_header_footer`` before the pages are combined.

    Args:
        pdf_file_path: Path of the PDF file to read.

    Returns:
        The cleaned page texts joined with newline characters.
    """
    page_texts = []

    # The context manager closes the file; no explicit close() is needed.
    with open(pdf_file_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        for page in pdf_reader.pages:
            # Defensive: fall back to an empty string if a page yields no text.
            page_content = page.extract_text() or ""
            cleaned_content = exclude_header_footer(preprocess_text(page_content))
            page_texts.append(cleaned_content)

    return '\n'.join(page_texts)

def generate_summary_paper(text,model,tokenizer, min_summary_length=128, max_summary_length=1024, overlap_percentage=35, max_input_tokens=1024):
    """Summarise a long text by summarising overlapping token windows.

    The text is preprocessed and grammar-corrected, tokenised without
    truncation, and cut into windows of at most ``max_input_tokens`` tokens
    whose starts are ``max_input_tokens - overlap`` tokens apart. Each window
    is summarised with ``model.generate``; the decoded summaries are joined
    with spaces and grammar-corrected once more.

    Inputs shorter than one window are summarised as a single window. If the
    regularly spaced windows do not reach the end of the input, one extra
    window aligned to the end is added so that no trailing tokens are skipped.

    Args:
        text: Document text to summarise.
        model: Model exposing ``generate`` (moved to ``device`` by the caller).
        tokenizer: Tokenizer exposing ``encode`` and ``decode``.
        min_summary_length: ``min_length`` passed to ``generate`` per window.
        max_summary_length: ``max_length`` passed to ``generate`` per window.
        overlap_percentage: Share of a window, in percent, that it has in
            common with the following window. Must be below 100.
        max_input_tokens: Maximum number of tokens per window.

    Returns:
        Tuple ``(summary, input_token_count, summary_token_count)`` where the
        last item is the summed length of the generated id sequences.

    Raises:
        ValueError: If ``overlap_percentage`` is 100 or more, or
            ``max_input_tokens`` is smaller than 1.
    """
    if max_input_tokens < 1:
        raise ValueError("max_input_tokens must be at least 1")
    if overlap_percentage >= 100:
        raise ValueError("overlap_percentage must be below 100")

    text = preprocess_text(text)
    text = fix_grammar(text)

    tokenized_text = tokenizer.encode(text, return_tensors="pt", truncation=False).to(device)
    total_tokens = tokenized_text.size(1)

    # Distance between the starts of two consecutive windows.
    overlap_size = int(max_input_tokens * overlap_percentage / 100)
    stride = max_input_tokens - overlap_size

    # Regularly spaced windows; always contains start 0, even for short inputs.
    window_starts = list(range(0, max(total_tokens - max_input_tokens, 0) + 1, stride))
    # Cover the tail with a full-size window aligned to the end of the input.
    if window_starts[-1] + max_input_tokens < total_tokens:
        window_starts.append(total_tokens - max_input_tokens)

    summaries = []
    summary_tokens = 0
    for start in window_starts:
        end = min(start + max_input_tokens, total_tokens)
        chunk = tokenized_text[:, start:end]
        summary_ids = model.generate(chunk, min_length=min_summary_length, max_length=max_summary_length, num_beams=10, early_stopping=True, length_penalty=0.8, repetition_penalty=1.5)
        summary_tokens += summary_ids.size(1)
        summaries.append(tokenizer.decode(summary_ids[0], skip_special_tokens=True))

    # Concatenate the window summaries into the final summary.
    final_summary = " ".join(summaries).replace("\n"," ")
    final_summary = fix_grammar(final_summary)

    return (final_summary, total_tokens, summary_tokens)

In [5]:
def count_words(text):
    """Return how many whitespace-separated tokens ``text`` contains."""
    return len(text.split())

In [6]:
def process_file(file_path,model,tokenizer):
    """Summarise one PDF and collect timing and size statistics for it.

    The measured time spans text extraction and summary generation.

    Args:
        file_path: Path of the PDF to process.
        model: Summarisation model handed to ``generate_summary_paper``.
        tokenizer: Tokenizer handed to ``generate_summary_paper``.

    Returns:
        Dict of single-element lists with the keys ``filename``, ``gpu_time``,
        ``paper_words``, ``summary_words``, ``paper_tokens`` and
        ``summary_tokens``; it can be passed directly to ``pd.DataFrame``.
    """
    filename = os.path.basename(file_path)

    # perf_counter is a monotonic clock intended for measuring elapsed time.
    start_time = time.perf_counter()
    paper_content = get_paper_content(file_path)
    summary, paper_tokens, summary_tokens = generate_summary_paper(paper_content,model,tokenizer)
    processing_time = time.perf_counter() - start_time

    res = {
        "filename": [filename],
        "gpu_time": [processing_time],
        "paper_words": [count_words(paper_content)],
        "summary_words": [count_words(summary)],
        "paper_tokens": [paper_tokens],
        "summary_tokens": [summary_tokens],
    }

    gc.collect()

    print("{} file processed successfully".format(filename))
    return res

def process_files_in_directory(df,directory,model,tokenizer):
    """Process every not-yet-recorded file in ``directory`` and checkpoint results.

    A file counts as already processed only when its exact name appears in
    ``df["filename"]``. After each processed file the accumulated table is
    written to ``output_file`` so an interrupted run can be resumed.

    Args:
        df: DataFrame of earlier results; must have a ``filename`` column.
        directory: Folder containing the files to process.
        model: Summarisation model handed to ``process_file``.
        tokenizer: Tokenizer handed to ``process_file``.

    Returns:
        The DataFrame including the rows added during this call.
    """
    # Exact-name lookup: a substring/regex match would treat "919.pdf" as done
    # as soon as e.g. "8606919.pdf" has been recorded.
    processed_names = set(df["filename"].astype(str))

    # Sorted so that the processing order is the same on every run.
    for name in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, name)
        if name in processed_names or not os.path.isfile(file_path):
            continue

        result = process_file(file_path,model,tokenizer)
        df = pd.concat([df,pd.DataFrame(result)],ignore_index=True)
        df.to_csv(output_file,index=False)
        processed_names.add(name)
        print("Total {} files Processed".format(len(df) ) )

        # Triggering Garbage Collector
        gc.collect()

    return df

In [7]:
# First run only: write a header-only results file so that the read below
# always finds a CSV with the expected columns.
if not os.path.exists(output_file):
    pd.DataFrame(
        columns=["filename", "gpu_time", "paper_words", "summary_words", "paper_tokens", "summary_tokens"]
    ).to_csv(output_file, index=False)

# Results recorded so far (empty on the first run).
df = pd.read_csv(output_file)

In [8]:
# Load the model and tokenizer once, then summarise every pending file.
if os.path.isdir(directory):
    print(f"Processing files in directory: {directory}")
    model = BartForConditionalGeneration.from_pretrained(paper_model_path).to(device)
    # Only the model and the token tensors are moved to `device`; the tokenizer
    # takes no device option, so none is passed.
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    process_files_in_directory(df,directory,model,tokenizer)
else:
    print(f"Error: {directory} is not a valid directory.")

Processing files in directory: /kaggle/input/test-paper-data


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading LanguageTool 5.7: 100%|██████████| 225M/225M [00:04<00:00, 52.9MB/s] 
Token indices sequence length is longer than the specified maximum sequence length for this model (9665 > 1024). Running this sequence through the model will result in indexing errors


10113620.pdf file processed successfully
Total 95 files Processed
8606919.pdf file processed successfully
Total 96 files Processed
9745159.pdf file processed successfully
Total 97 files Processed
9107114.pdf file processed successfully
Total 98 files Processed
9328413.pdf file processed successfully
Total 99 files Processed
9490211.pdf file processed successfully
Total 100 files Processed
