In [1]:
import os

import nltk
import spacy
from sklearn.feature_extraction.text import CountVectorizer

from utils.clean_texts import clean_EU_legal_text, clean_US_legal_text
from utils.extract_pdf_contents import process_pdfs
from utils.TFIDF_model import load_text, compute_tfidf, save_tfidf_values


## Research paper NLP Pipeline

### 1. Extract the text from the two PDF documents

In [2]:
# Where the source PDFs are read from, and where their extracted text is written
input_directory, output_directory = "data/raw", "data/extracted_text"

# Hand both directories to the project helper that performs the extraction
process_pdfs(input_directory, output_directory)

Extracted text saved to: data/extracted_text/EU_AI_Act_English.txt
Extracted text saved to: data/extracted_text/USA_AI_Executive_Order_English.txt


### 2. Clean up the extracted text

In [3]:
def clean_and_save(input_path, output_path, cleaner):
    """Read a text file, clean its content and write the result to disk.

    Parameters
    ----------
    input_path : str
        Path of the UTF-8 text file to read.
    output_path : str
        Path the cleaned text is written to (UTF-8). Its parent directory is
        created if it does not exist yet.
    cleaner : callable
        Called with the raw text; its return value is written to `output_path`,
        so it has to be a string.

    Returns
    -------
    The value returned by `cleaner`.
    """
    with open(input_path, "r", encoding="utf-8") as file:
        raw_text = file.read()

    cleaned_text = cleaner(raw_text)

    # open(..., "w") does not create missing directories, so make sure the
    # target directory exists (matters on a fresh checkout / first run).
    os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as file:
        file.write(cleaned_text)

    return cleaned_text


# First, the EU AI Act (cleaned with clean_EU_legal_text)
clean_EU_text = clean_and_save(
    "data/extracted_text/EU_AI_Act_English.txt",
    "data/cleaned_text/EU_AI_Act_English_Cleaned.txt",
    clean_EU_legal_text,
)

# Second, the USA AI Executive Order (cleaned with clean_US_legal_text)
clean_US_text = clean_and_save(
    "data/extracted_text/USA_AI_Executive_Order_English.txt",
    "data/cleaned_text/USA_AI_Executive_Order_English_Cleaned.txt",
    clean_US_legal_text,
)

### 3. Tokenization

In [4]:
# Paths to the cleaned text files
text_file_1 = "data/cleaned_text/EU_AI_Act_English_Cleaned.txt"
text_file_2 = "data/cleaned_text/USA_AI_Executive_Order_English_Cleaned.txt"

# Load the cleaned texts
text1 = load_text(text_file_1)
text2 = load_text(text_file_2)


# TODO: justify the choices below (spaCy model, disabled components, stop-word list)
spacy_model_name = "en_core_web_sm"
# Only spaCy's tokenizer is used in this notebook, so every other pipeline
# component is switched off. Note the spelling "attribute_ruler": a misspelled
# name is not matched against the pipeline and leaves that component enabled.
spacy_disabled_components = ["tok2vec", "tagger", "parser", "ner",
                             "lemmatizer", "attribute_ruler"]

try:
    nlp = spacy.load(spacy_model_name, disable=spacy_disabled_components)
except OSError:
    # spacy.load raises OSError when the model package is not installed;
    # download it once instead of re-downloading on every run of this cell.
    spacy.cli.download(spacy_model_name)
    nlp = spacy.load(spacy_model_name, disable=spacy_disabled_components)

nltk.download("stopwords")
stop_words_en = nltk.corpus.stopwords.words("english")

# Custom tokenization function (thin wrapper around the spaCy pipeline `nlp`)
def custom_tokenizer(text):
    """Tokenize `text` with the notebook-level spaCy pipeline `nlp`.

    Returns a list with the whitespace-stripped text of every token, leaving
    out tokens that are empty after stripping.
    """
    stripped_tokens = (tok.text.strip() for tok in nlp(text))
    # An empty string is falsy, so this keeps exactly the non-empty tokens
    return [piece for piece in stripped_tokens if piece]

# Manual tokenization using the custom tokenizer (for inspection purposes)
tokens_EU_AI = custom_tokenizer(text1)
tokens_US_AI = custom_tokenizer(text2)

# Saving the tokens to files for inspection, one token per line.
# The directory is created first because open(..., "w") fails if it is missing.
os.makedirs("data/tokens", exist_ok=True)
for token_path, tokens in (
    ("data/tokens/EU_AI_Act_Tokens.txt", tokens_EU_AI),
    ("data/tokens/USA_AI_Executive_Order_Tokens.txt", tokens_US_AI),
):
    with open(token_path, "w", encoding="utf-8") as f:
        f.writelines(f"{token}\n" for token in tokens)

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m4.7 MB/s[0m  [33m0:00:02[0m eta [36m0:00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nicolasreichardt/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### 4. Feature extraction

#### 4.1. Bag of Words (BoW)

In [5]:
# Vectorizer configuration for BoW (required for LDA)
def make_bow_vectorizer():
    """Return a new, unfitted CountVectorizer with the notebook's BoW settings.

    A separate instance is used per document so that each fitted vocabulary
    stays available; re-fitting a single instance would replace the vocabulary
    learned from the previous document.
    """
    return CountVectorizer(
        analyzer="word",
        tokenizer=custom_tokenizer,
        # token_pattern is ignored when a tokenizer is supplied; passing None
        # avoids scikit-learn's "token_pattern will not be used" warning.
        token_pattern=None,
        lowercase=True,
        stop_words=stop_words_en,
        # max_df=0.80,  # Ignore terms that appear in more than 80% of the documents
        # min_df=0.01,  # Ignore terms that appear in less than 1% of the documents
    )


bow_vectorizer_eu = make_bow_vectorizer()
text1_bow = bow_vectorizer_eu.fit_transform([text1])
print(f"BoW Matrix Dimensions EU AI Act: {text1_bow.shape}")

bow_vectorizer_us = make_bow_vectorizer()
text2_bow = bow_vectorizer_us.fit_transform([text2])
print(f"BoW Matrix Dimensions USA AI Executive Order: {text2_bow.shape}")

# `tokenizer_bow` stays defined and refers to the vectorizer fitted last
# (the USA AI Executive Order).
tokenizer_bow = bow_vectorizer_us



BoW Matrix Dimensions EU AI Act: (1, 4823)
BoW Matrix Dimensions USA AI Executive Order: (1, 2933)


#### 4.2. TF-IDF

In [6]:
# TF-IDF is computed over both documents together, so both texts must be non-empty.
if text1 and text2:
    # Compute TF-IDF
    feature_names, tfidf_matrix = compute_tfidf([text1, text2])

    # Save TF-IDF values. A dedicated variable name is used so that the
    # `output_directory` of the PDF-extraction step is not overwritten.
    tfidf_output_directory = "data/tfidf_values"
    save_tfidf_values(tfidf_output_directory, feature_names, tfidf_matrix)
else:
    # Make the skipped step visible instead of silently doing nothing.
    print("Skipping TF-IDF: at least one of the cleaned texts is empty or could not be loaded.")

TF-IDF values saved to: data/tfidf_values/tfidf_doc_1.txt
TF-IDF values saved to: data/tfidf_values/tfidf_doc_2.txt


#### 4.3. Embeddings (Word2Vec)

In [7]:
# Import the Word2VecModel class
from utils.word2vec_module import Word2VecModel

# Create and train the Word2Vec model on the two token lists (one per document)
embedding_documents = [tokens_EU_AI, tokens_US_AI]
word2vec = Word2VecModel(vector_size=100, # size of the embedding vectors
                         window=5, # context window size
                         min_count=2, # minimum frequency for a word to be included
                         workers=4) # number of CPU cores to use

word2vec.train(embedding_documents)

# Save the trained model. The target directory is created up front in case
# Word2VecModel.save does not create it itself (not visible from this notebook).
os.makedirs("data/embeddings", exist_ok=True)
word2vec.save("data/embeddings/word2vec")

# Example: Get embedding for a specific word.
# Stored as `ai_vector` so the result is not confused with the `get_vector` method.
ai_vector = word2vec.get_vector("AI")
print(f"Vector for 'AI': {ai_vector}")


Vector for 'AI': [-0.864503    0.5312424   0.2557926   0.09263273  0.03348938 -0.7565792
  1.0187244   1.208634   -0.7137521  -0.57220083 -0.11228784 -0.77105045
  0.35022008  0.37116998 -0.06001113 -0.00551844  0.44737014 -0.67138547
 -0.5925652  -1.5441421   0.747752    0.2583962   0.9695456  -0.6705056
 -0.04810711  0.2889923  -0.561146    0.3145709  -0.87775874  0.7470655
  0.94286495  0.15985511  0.16406439 -1.275775    0.10138124  0.6998651
  0.22416256 -0.581004   -0.8349781  -0.36822549 -0.09820789 -0.71208465
 -0.13170435  0.44897392  1.2698581  -0.24045841 -0.5374426   0.17332175
 -0.01256452  0.34176955  0.751049   -0.18348399 -0.46109664 -0.07608254
 -0.9116585  -0.1416835   0.6660579  -0.35826257 -0.64866215  0.3111708
 -0.4033421  -0.5010293   0.59806955 -0.03618123 -0.7581635   0.66951174
  0.29917938  0.9757586  -1.1362847   0.66910064 -0.39193165  0.7553046
  0.79703945  0.20851555  0.45631436 -0.08683646  0.34469447  0.15794097
 -0.21731193  0.35287058 -0.84959173  0.

#### 4.4. (Optional) Text representation through OpenAI contextualised embeddings

### 5. Mathematical analysis

#### 5.1. Similarity Measurement (Cosine Similarity)

In [8]:
# WORK IN PROGRESS
# NOTE(review): the outputs recorded for this cell show similarities of ~0.99
# and nearest neighbours made up of function words and punctuation; consider
# filtering stop words / punctuation before training -- TODO confirm.

# Nearest neighbours of the word "AI" in the trained embedding space
similar_words = word2vec.most_similar("AI", topn=5)
print(f"Words similar to 'AI': {similar_words}")

# Word-to-word similarity
similarity = word2vec.similarity("AI", "artificial")
print(f"Similarity between 'AI' and 'artificial': {similarity}")

# Document-to-document similarity: one vector per token list, then cosine similarity
doc1_vector, doc2_vector = (
    word2vec.document_vector(doc_tokens)
    for doc_tokens in (tokens_EU_AI, tokens_US_AI)
)
doc_similarity = word2vec.cosine_similarity(doc1_vector, doc2_vector)
print(f"Similarity between EU AI Act and USA AI Executive Order: {doc_similarity}")

# Words closest to the EU AI Act as a whole
most_similar_to_doc = word2vec.most_similar_to_document(tokens_EU_AI, topn=5)
print(f"Words most similar to EU AI Act: {most_similar_to_doc}")


Words similar to 'AI': [('or', 0.9998688697814941), ('to', 0.9998674988746643), ('that', 0.9998657703399658), ('-', 0.9998598098754883), ('are', 0.999848484992981)]
Similarity between 'AI' and 'artificial': 0.9851690530776978
Similarity between EU AI Act and USA AI Executive Order: 0.9999975562095642
Words most similar to EU AI Act: [('and', 0.9999430775642395), ('the', 0.9999364614486694), (',', 0.9999351501464844), ('.', 0.9999253153800964), ('-', 0.9999250173568726)]


#### 5.2. Dimensionality Reduction (t-SNE)

In [8]:
# t-SNE visualization of the trained word embeddings
from utils.tsne_visualization import plot_tsne_embeddings
import numpy as np

# Every vocabulary word, in the order given by the model's index_to_key
words = [*word2vec.model.wv.index_to_key]
# One embedding per word, collected into a single NumPy array for the plotting helper
embeddings = np.array(list(map(word2vec.get_vector, words)))

# Plot the t-SNE projection and save it under plots/
plot_tsne_embeddings(embeddings, words, "plots/tsne_embeddings.png")


#### 5.3. Dimensionality Reduction (PCA) - Optional?

#### 5.4. Other techniques?

### 6. Topic modeling

#### 6.1. Latent Dirichlet Allocation (LDA)

#### 6.2. BERTopic