In [None]:
# PDF Extractor
# Obinna Kalu

In [15]:
import os
from collections import Counter
from PIL import ImageFont, ImageDraw
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import nltk.data
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.util import ngrams
import PyPDF2
from PyPDF2 import PdfReader

import termcolor
from termcolor import colored


[nltk_data] Downloading package stopwords to /Users/new/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
# Define dataset directory
data_dir = "files"

In [17]:
files = os.listdir(data_dir)
print (files)

['1450', '1457', '1468', '1461', '1466', '1459', '1467', '1458', '1460', '1456', '1469', '1451', '1523', '.DS_Store', '1343', '1513', '1335', '1508', '1530', '1506', '1501', '1487', '1473', '1474', '1480', '1488', '1481', '1475', '1472', '1486', '1454', '1453', '1465', '1496', '1462', '1463', '1464', '1452', '1455', '1439', '1527', '1516', '1511', '1510', '1528', '1738', '1339', '1737', '1448', '1477', '1483', '1484', '1470', '1479', '1446', '1478', '1447', '1471', '1485', '1449', '1482', '1476']


In [18]:

# Download NLTK resources (download only once)
nltk.download('punkt')  # Download sentence tokenizer

[nltk_data] Downloading package punkt to /Users/new/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [23]:

def process_pdf(pdf_path, password=None):
  """
  Extracts text, word counts, and phrase counts from a PDF.

  Args:
      pdf_path (str): Path to the PDF file.
      password (str, optional): Password for the PDF (if password-protected). Defaults to None.

  Returns:
      tuple: A tuple containing (text, word_counts, phrase_counts) or (None, None, None) if an error occurs.
          - text (str): Extracted text from the PDF.
          - word_counts (Counter): Counter object containing word frequencies.
          - phrase_counts (Counter): Counter object containing bigram (2-word phrase) frequencies.
  """

  try:
      text = ""
      # Extract text from PDF
      with open(pdf_path, 'rb') as f:
          reader = PdfReader(f, password=password)  # Provide password if needed
          for page in reader.pages:
              text += page.extract_text()

      # Preprocess text (optional)
      sentences = nltk.sent_tokenize(text)  # Tokenize into sentences
      words = [w.lower() for w in nltk.word_tokenize(text) if w.isalpha()]  # Tokenize, lowercase, filter alphanumeric

      # Remove stop words (optional)
      stop_words = set(stopwords.words('english'))
      words = [w for w in words if w not in stop_words]

      # Create word and phrase frequency counts
      word_counts = Counter(words)

      # Consider using stemming/lemmatization for broader model coverage (optional)
      # stemmed_words = [nltk.PorterStemmer().stem(w) for w in words]  # Example using Porter stemmer
      # word_counts = Counter(stemmed_words)

      phrase_counts = Counter(ngrams(words, 2))  # Count bigrams (2-word phrases)

      return text, word_counts, phrase_counts
  except (IOError, PyPDF2.errors.PdfReaderError) as e:
      print(f"Error processing PDF {pdf_path}: {e}")
      return None, None, None

def generate_wordcloud(text, filename, colormap='bright'):
    """
    Generates a colorized word frequency list using termcolor.

    Args:
        text (str): Text content for word frequency analysis.
        filename (str): Filename for reference (used for display title).
        colormap (str, optional): Colormap for termcolor (default: 'bright').
    """

    # Create word frequency dictionary
    word_counts = Counter(nltk.word_tokenize(text.lower()))
    max_count = max(word_counts.values())

    # Display word cloud with filename reference
    print(f"\n**Word Cloud for {filename}**\n")

    for word, count in word_counts.most_common():
        # Color weighting based on frequency (higher frequency = brighter color)
        color_weight = int(count * 255 / max_count)
        colored_word = colored(word, color='red', attrs=['bold'])  # Example using 'red'
        print(f"{colored_word:<20} - {count}")  # Align words and format output

def find_pdfs_with_phrase(phrase, pdf_data):
  """
  Finds all PDFs containing a specific phrase.

  Args:
      phrase (str): The phrase to search for.
      pdf_data (dict): Dictionary containing processed PDF data.

  Returns:
      list: A list of tuples containing (pdf_path, phrase_count) for all PDFs with the phrase.
  """

  matching_pdfs = []
  for pdf_path, data in pdf_data.items():
    if phrase in data[2]:  # Check if phrase exists in phrase_counts dictionary
      phrase_count = data[2][phrase]
      matching_pdfs.append((pdf_path, phrase_count))
  return matching_pdfs

def find_top_phrases(phrase, pdf_data):
    """
    Finds the top 10 PDFs based on the frequency of a specific phrase.

    Args:
        phrase (str): The phrase to search for.
        pdf_data (dict): Dictionary containing processed PDF data.

    Returns:
        list: A list of tuples containing (pdf_path, phrase_count) for the top 10 PDFs.
    """

    phrase_counts = [data[2][phrase] for data in pdf_data.values() if phrase in data[2]]  # Extract phrase count for each PDF
    top_10_pdfs = sorted(zip(pdf_data.keys(), phrase_counts), key=lambda x: x[1], reverse=True)[:10]  # Sort and get top 10
    return top_10_pdfs


def load_dataset(data_dir="files"):
    """
    Loads the PDF dataset from a specified directory and prints the path for each file.

    Args:
        data_dir (str, optional): The directory containing the PDF files. Defaults to "files".

    Returns:
        list: A list of tuples containing (pdf_path, label).
    """

    images = []
    labels = []

    # Iterate over the PDF files in the dataset path
    for file in os.listdir(data_dir):
        subfolder = os.path.join(data_dir, file)
        if os.path.isdir(subfolder):
            # Iterate over images in the subfolder
            for pdf_file in os.listdir(subfolder):
                if pdf_file.endswith('.pdf'):
                    # Provide the full path to the PDF file
                    pdf_path = os.path.join(subfolder, pdf_file)
                    label = file  # Assuming label is based on subfolder name

                    # Print the file path
                    print(pdf_path)

                    images.append(pdf_path)
                    labels.append(label)

    return list(zip(images, labels))  # Return a list of tuples (pdf_path, label)

def find_relevant_pdfs(pdf_paths, search_terms):
  """
  Finds PDFs in the list that are most relevant to the given search terms.

  Args:
      pdf_paths (list): List of paths to PDF files.
      search_terms (list): List of search terms (words or phrases).

  Returns:
      list: List of dictionaries containing information about relevant PDFs.
          Each dictionary has the following keys:
              - path (str): Path to the PDF file.
              - score (float): Relevancy score based on search term frequency.
  """

  relevant_pdfs = []
  for pdf_path in pdf_paths:
      text, word_counts, _ = process_pdf(pdf_path)  # Phrase counts not used here, optional for future use
      if text:
          score = 0
          # Search for all search terms (consider stemming/lemmatization if used earlier)
          for term in search_terms:
              score += word_counts.get(term.lower(), 0)  # Count occurrences of each search term (lowercase)
          relevant_pdfs.append({
              "path": pdf_path,
              "score": score
          })
  # Sort PDFs by score in descending order (most relevant first)
  relevant_pdfs.sort(key=lambda x: x["score"], reverse=True)
  return relevant_pdfs

In [20]:

# Load the PDF dataset
pdf_data = {}
for pdf_path, label in load_dataset():
    text, word_counts, phrase_counts = process_pdf(pdf_path)
    pdf_data[pdf_path] = (text, word_counts, phrase_counts, label)

    # Generate and display word cloud
    #generate_wordcloud(text, label)  # Use `label` for filename reference

files/1450/Demertzis et al_2023_Federated Auto-Meta-Ensemble Learning Framework for AI-Enabled Military.pdf
files/1457/Mao et al_2022_Trustworthy AI Solutions for Cyberbiosecurity Challenges in Water Supply Systems.pdf
files/1468/Paul_2023_A survey of technologies supporting design of a multimodal interactive robot.pdf
files/1461/Xiao et al_2022_Guest Editorial.pdf
files/1466/Miljković_Beriša_2023_Application of artificial intelligence in modern warfare.pdf
files/1459/Schuett_2022_Three lines of defense against risks from AI.pdf
files/1467/Nalin_Tripodi_2023_Future Warfare and Responsibility Management in the AI-based Military.pdf
files/1458/Moreno et al_2022_The ethics of AI-assisted warfighter enhancement research and experimentation.pdf
files/1460/Wang_Chapman_2022_Risk-averse autonomous systems.pdf
files/1456/Lee_Jang_2022_Necessity of establishing an open source military R&D platform to promote AI.pdf
files/1469/Rashid et al_2023_Artificial Intelligence in the Military.pdf
files/1

In [21]:
user_phrases = input("Enter key phrases (comma-separated) to search for: ")
user_phrases = user_phrases.split(",")  # Split the input string into a list of phrases

relevant_pdfs = find_relevant_pdfs(user_phrases, pdf_data)

if relevant_pdfs:
  print(f"\nPDFs most relevant to '{user_phrases}':")
  sorted_results = sorted(relevant_pdfs.items(), key=lambda x: x[1], reverse=True)
  for pdf_path, score in sorted_results:
    print(f"- {pdf_path} (Score: {score})")
else:
  print(f"No PDFs found containing any of the phrases: {user_phrases}")

Enter key phrases (comma-separated) to search for:  model


AttributeError: module 'PyPDF2.errors' has no attribute 'PdfReaderError'