In [39]:
import os
import nltk
from nltk.corpus import stopwords
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer
from PyPDF2 import PdfReader
from collections import Counter
import pandas as pd
import math


In [2]:
# Download NLTK resources
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /Users/new/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [20]:
def process_pdf(pdf_path, password=None):
  """
  Extracts text from a PDF and performs basic cleaning.

  Args:
    pdf_path (str): Path to the PDF file.
    password (str, optional): Password for the PDF (if password-protected). Defaults to None.

  Returns:
    str: Cleaned text extracted from the PDF, or None if there's an issue.
  """

  # Check if PDF path exists
  if not os.path.exists(pdf_path):
    print(f"Error: PDF file not found: {pdf_path}")
    return None

  text = ""
  try:
    with open(pdf_path, 'rb') as f:
      reader = PdfReader(f, password=password)  # Provide password if needed
      for page in reader.pages:
        text += page.extract_text()

    # Basic validation on extracted text
    if not text:
      print(f"Warning: Empty text extracted from PDF: {pdf_path}")
      return None

  except (IOError, PdfReaderError) as e:
    print(f"Error processing PDF {pdf_path}: {e}")
    return None

  # Preprocess text (optional)
  sentences = nltk.sent_tokenize(text)  # Tokenize into sentences
  words = [w.lower() for w in nltk.word_tokenize(text) if w.isalpha()]  # Tokenize, lowercase, filter alphanumeric

  # Remove stop words (optional)
  stop_words = set(stopwords.words('english'))
  words = [w for w in words if w not in stop_words]

  # Join words back into a string
  cleaned_text = " ".join(words)

  return cleaned_text



def process_pdfs(data_dir, password=None):
  """
  Processes all PDFs within a directory structure, recursively searching subfolders.

  Args:
    data_dir (str): Path to the top-level directory containing PDFs.
    password (str, optional): Password for the PDFs (if password-protected). Defaults to None.

  Returns:
    list: List of tuples, where each tuple contains (PDF path, processed text).
  """
  numPDF = 0
  corpus = []
  for root, _, files in os.walk(data_dir):
    for filename in files:
      if filename.endswith(".pdf"):
        pdf_path = os.path.join(root, filename)
        numPDF = numPDF + 1
        print(f"Processing PDF: {pdf_path}")
        print(f"There are this many PDFs:{numPDF}")
        text = process_pdf(pdf_path, password)
        if text:
          corpus.append((pdf_path, text))

  return corpus


In [21]:
# data_dir path points to top-level directory containing PDFs
data_dir = "files"

# Process PDFs and create corpus
pdf_data = process_pdfs(data_dir)

Processing PDF: files/1450/Demertzis et al_2023_Federated Auto-Meta-Ensemble Learning Framework for AI-Enabled Military.pdf
There are this many PDFs:1
Processing PDF: files/1457/Mao et al_2022_Trustworthy AI Solutions for Cyberbiosecurity Challenges in Water Supply Systems.pdf
There are this many PDFs:2
Processing PDF: files/1468/Paul_2023_A survey of technologies supporting design of a multimodal interactive robot.pdf
There are this many PDFs:3
Processing PDF: files/1461/Xiao et al_2022_Guest Editorial.pdf
There are this many PDFs:4
Processing PDF: files/1466/Miljković_Beriša_2023_Application of artificial intelligence in modern warfare.pdf
There are this many PDFs:5
Processing PDF: files/1459/Schuett_2022_Three lines of defense against risks from AI.pdf
There are this many PDFs:6
Processing PDF: files/1467/Nalin_Tripodi_2023_Future Warfare and Responsibility Management in the AI-based Military.pdf
There are this many PDFs:7
Processing PDF: files/1458/Moreno et al_2022_The ethics of A

In [55]:

"""
vectorizer = TfidfVectorizer(max_features=1000)

    This line creates a TfidfVectorizer object from the sklearn.feature_extraction.text library.
    
    TfidfVectorizer is used to convert text data into a numerical representation suitable for machine learning algorithms like TF-IDF.
    
    The max_features parameter (set to 1000 here) specifies the maximum number of features (words) the vectorizer will consider.
    
    This helps control the dimensionality of the resulting TF-IDF matrix.
    
    You can adjust this value based on your specific needs and the size of your corpus.

tfidf_matrix = vectorizer.fit_transform([text for path, text in pdf_data])

    Fit: It calls the fit method on the vectorizer.
    
    This method analyzes the text data provided (the list comprehension) to build the vocabulary. 
    
    The vocabulary is essentially a list of unique words encountered across all your PDFs.
    
    Transform: After fitting, the transform method is called. This method uses the created vocabulary to convert the text data from your pdf_data list into a numerical representation suitable for TF-IDF analysis. 
    
    The output is stored in the tfidf_matrix variable.
"""

vectorizer = TfidfVectorizer(max_features=100)  # Adjust max_features as needed
tfidf_matrix = vectorizer.fit_transform([text for path, text in pdf_data])

In [56]:
print(type(tfidf_matrix), tfidf_matrix.shape)

<class 'scipy.sparse._csr.csr_matrix'> (61, 100)


In [57]:
"""
    This list of feature names represents the vocabulary created by the TF-IDF vectorizer for the entire corpus.
"""
feature_names = vectorizer.get_feature_names_out()
print(feature_names)
print(tfidf_matrix)

['actions' 'adversarial' 'ai' 'al' 'also' 'analysis' 'applications'
 'approach' 'article' 'artificial' 'attack' 'attacks' 'autonomous'
 'autonomy' 'aws' 'based' 'challenges' 'conference' 'control' 'could'
 'data' 'decision' 'decisions' 'defense' 'development' 'different'
 'environment' 'et' 'ethical' 'ethics' 'even' 'example' 'framework'
 'future' 'harm' 'however' 'https' 'human' 'humans' 'ieee' 'information'
 'intelligence' 'international' 'journal' 'law' 'laws' 'learning' 'legal'
 'lethal' 'level' 'machine' 'make' 'making' 'may' 'methods' 'might'
 'military' 'model' 'models' 'moral' 'need' 'new' 'nuclear' 'one'
 'operations' 'point' 'potential' 'pp' 'process' 'research'
 'responsibility' 'risk' 'risks' 'robot' 'robots' 'security' 'see' 'state'
 'states' 'strategic' 'system' 'systems' 'technologies' 'technology'
 'time' 'training' 'two' 'university' 'us' 'use' 'used' 'using' 'vol'
 'war' 'warfare' 'weapon' 'weapons' 'within' 'work' 'would']
  (0, 70)	0.008096227640261737
  (0, 27)	0.0

In [58]:
tf_idf_array = tfidf_matrix.toarray()
print(tfidf_array)

[[0.06934346 0.0075204  0.46512884 0.04093931 0.01386869 0.16329141
  0.85460348 0.09161629 0.10401279 0.02642602]
 [0.78131857 0.         0.34031035 0.         0.         0.
  0.         0.         0.52319228 0.        ]
 [0.06376682 0.23710648 0.26385505 0.0448178  0.62400392 0.48801814
  0.12071825 0.3610648  0.30317007 0.06075207]
 [0.69018011 0.         0.49100315 0.         0.         0.
  0.27438469 0.35071653 0.25881154 0.13150998]
 [0.63826492 0.02768831 0.         0.32657888 0.07659179 0.66131971
  0.         0.         0.19147504 0.0729707 ]
 [0.58149128 0.         0.02037835 0.37487163 0.00802057 0.0094435
  0.71743927 0.02852969 0.05263377 0.04966893]
 [0.76567761 0.19215604 0.06431087 0.07471806 0.28475614 0.20116488
  0.0503139  0.20579478 0.43305659 0.11454614]
 [0.42744625 0.0189213  0.20390931 0.08583595 0.26170179 0.81140997
  0.         0.06205936 0.0981359  0.1662194 ]
 [0.         0.45679112 0.06824166 0.04204506 0.02441705 0.
  0.26694589 0.53972952 0.6409326  0.

In [59]:
df_tf_idf = pd.DataFrame(tf_idf_array, columns = feature_names)

df_tf_idf

Unnamed: 0,actions,adversarial,ai,al,also,analysis,applications,approach,article,artificial,...,used,using,vol,war,warfare,weapon,weapons,within,work,would
0,0.000000,0.015893,0.051981,0.006435,0.025574,0.116224,0.045100,0.049917,0.017764,0.000000,...,0.047546,0.095837,0.000000,0.007389,0.019969,0.000000,0.000000,0.000000,0.022185,0.022185
1,0.000000,0.000000,0.627556,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,0.025316,0.017077,0.031280,0.271052,0.024183,0.059038,0.092078,0.033375,0.002545,0.019039,...,0.086287,0.118732,0.161968,0.000000,0.002861,0.006240,0.011641,0.023287,0.021456,0.026223
3,0.000000,0.147290,0.240879,0.119275,0.165913,0.000000,0.130618,0.025702,0.027439,0.029322,...,0.122403,0.000000,0.068478,0.000000,0.000000,0.000000,0.000000,0.000000,0.128508,0.000000
4,0.000000,0.000000,0.374949,0.000000,0.014758,0.015242,0.016265,0.000000,0.000000,0.419914,...,0.000000,0.000000,0.042637,0.063955,0.038411,0.020943,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56,0.005744,0.034875,0.059316,0.014121,0.069590,0.039416,0.061855,0.048684,0.046779,0.000000,...,0.057965,0.098968,0.000000,0.000000,0.008764,0.000000,0.000000,0.029062,0.058421,0.000000
57,0.060979,0.000000,0.475125,0.000000,0.034042,0.021096,0.007504,0.012305,0.000000,0.030884,...,0.046881,0.035019,0.006557,0.157364,0.035442,0.000000,0.012017,0.098828,0.063985,0.036914
58,0.022989,0.000000,0.649538,0.000000,0.023101,0.068929,0.011316,0.097417,0.011886,0.050807,...,0.103394,0.022632,0.007416,0.000000,0.000000,0.000000,0.000000,0.111775,0.080717,0.061233
59,0.040544,0.024615,0.005367,0.039866,0.021125,0.000000,0.017463,0.034361,0.000000,0.019601,...,0.005455,0.029105,0.007629,0.000000,0.000000,0.007495,0.000000,0.093237,0.005727,0.005727


In [25]:
#keywords = ["autonomy"]

In [8]:
"""keyword_scores = {}
for keyword in keywords:
  # Check if keyword exists in feature_names before accessing the index
  if keyword in feature_names:
    keyword_scores[keyword] = tfidf_matrix.sum(axis=0)[feature_names.tolist().index(keyword)]
  else:
    # Assign 0 score if the keyword is not found
    keyword_scores[keyword] = 0


pdf_scores = {}
for i, (pdf_path, _) in enumerate(pdf_data):
  # Iterate through keyword scores and accumulate for each PDF
  pdf_scores[pdf_path] = 0
  for keyword, score in keyword_scores.items():
    pdf_scores[pdf_path] += score

# Print or analyze PDF scores based on keyword importance
print(pdf_scores)  # This will print a dictionary with scores for each PDF"""



'keyword_scores = {}\nfor keyword in keywords:\n  # Check if keyword exists in feature_names before accessing the index\n  if keyword in feature_names:\n    keyword_scores[keyword] = tfidf_matrix.sum(axis=0)[feature_names.tolist().index(keyword)]\n  else:\n    # Assign 0 score if the keyword is not found\n    keyword_scores[keyword] = 0\n\n\npdf_scores = {}\nfor i, (pdf_path, _) in enumerate(pdf_data):\n  # Iterate through keyword scores and accumulate for each PDF\n  pdf_scores[pdf_path] = 0\n  for keyword, score in keyword_scores.items():\n    pdf_scores[pdf_path] += score\n\n# Print or analyze PDF scores based on keyword importance\nprint(pdf_scores)  # This will print a dictionary with scores for each PDF'

In [60]:
# Calculate term frequency (TF) for each word in the corpus
term_frequency = {}
for path, text in pdf_data:
  for word in text.split():
    term_frequency[word] = term_frequency.get(word, 0) + 1

In [61]:
print(term_frequency)



In [62]:
# Calculate Inverse Document Frequency (IDF) for each word
document_frequency = len(pdf_data)
inverse_document_frequency = {}
for word, count in term_frequency.items():
  inverse_document_frequency[word] = math.log(document_frequency / sum(1 for path, text in pdf_data if word in text))

In [63]:
print(inverse_document_frequency)



In [64]:
# Calculate TF-IDF score for each word in the corpus
tf_idf = {}
for path, text in pdf_data:
  words = text.split()
  for word in words:
    tf_idf[(path, word)] = term_frequency[word] * inverse_document_frequency[word]

In [65]:
# Identify most unique PDFs based on TF-IDF scores
unique_pdfs = []
for path, text in pdf_data:
  # Calculate average TF-IDF score for each PDF
  pdf_score = sum(score for word, score in tf_idf.items() if word in text.split()) / len(text.split())
  unique_pdfs.append((path, pdf_score))

In [66]:
# Sort PDFs by their scores in descending order (most unique first)
unique_pdfs.sort(key=lambda x: x[1], reverse=True)

In [68]:
# Print names of the top N most unique PDFs (adjust N as desired)
num_unique_to_print = 10
for i in range(min(num_unique_to_print, len(unique_pdfs))):
  path, score = unique_pdfs[i]
  print(f"Unique PDF: {path} (TF-IDF Score: {score:.2f})")

Unique PDF: files/1450/Demertzis et al_2023_Federated Auto-Meta-Ensemble Learning Framework for AI-Enabled Military.pdf (TF-IDF Score: 0.00)
Unique PDF: files/1457/Mao et al_2022_Trustworthy AI Solutions for Cyberbiosecurity Challenges in Water Supply Systems.pdf (TF-IDF Score: 0.00)
Unique PDF: files/1468/Paul_2023_A survey of technologies supporting design of a multimodal interactive robot.pdf (TF-IDF Score: 0.00)
Unique PDF: files/1461/Xiao et al_2022_Guest Editorial.pdf (TF-IDF Score: 0.00)
Unique PDF: files/1466/Miljković_Beriša_2023_Application of artificial intelligence in modern warfare.pdf (TF-IDF Score: 0.00)
Unique PDF: files/1459/Schuett_2022_Three lines of defense against risks from AI.pdf (TF-IDF Score: 0.00)
Unique PDF: files/1467/Nalin_Tripodi_2023_Future Warfare and Responsibility Management in the AI-based Military.pdf (TF-IDF Score: 0.00)
Unique PDF: files/1458/Moreno et al_2022_The ethics of AI-assisted warfighter enhancement research and experimentation.pdf (TF-IDF