## Exploring Themes in AI Regulation: A Comparative NLP Study

### Setup and Imports

Loading the libraries needed for tokenization, stopword removal, and topic modeling.


In [1]:
from utils.extract_pdf_contents import process_pdfs
from utils.clean_texts import clean_EU_legal_text, clean_US_legal_text
from utils.TFIDF_model import load_text, compute_tfidf, save_tfidf_values
from sklearn.feature_extraction.text import CountVectorizer
import spacy
import nltk
import spacy


import warnings
warnings.filterwarnings("ignore", category=UserWarning)

### 1. Text Extraction

In [2]:
# Define the input and output directories
input_directory = "data/raw" 
output_directory = "data/extracted_text" 


# Call the function to process the PDFs
process_pdfs(input_directory, output_directory)

Extracted text saved to: data/extracted_text/EU_AI_Act_English.txt
Extracted text saved to: data/extracted_text/USA_AI_Executive_Order_English.txt


### 2. Data Wrangling and Loading

In [3]:
# First the EU Act

# Read the file content and pass it to clean_legal_text
with open("data/extracted_text/EU_AI_Act_English.txt", "r", encoding="utf-8") as file:
    text = file.read()

clean_EU_text = clean_EU_legal_text(text)

# Save cleaned text to /data/cleaned_text directory
with open("data/cleaned_text/EU_AI_Act_English_Cleaned.txt", "w", encoding="utf-8") as file:
    file.write(clean_EU_text)


# Second, the USA AI Executive Order

# Read the file content and pass it to clean_legal_text
with open("data/extracted_text/USA_AI_Executive_Order_English.txt", "r", encoding="utf-8") as file:
    text = file.read()
    clean_US_text = clean_US_legal_text(text)

# Save cleaned text to /data/cleaned_text directory
with open("data/cleaned_text/USA_AI_Executive_Order_English_Cleaned.txt", "w", encoding="utf-8") as file:
    file.write(clean_US_text)

### 3. Tokenization

In [5]:
# Paths to the cleaned text files
text_file_1 = "data/cleaned_text/EU_AI_Act_English_Cleaned.txt"
text_file_2 = "data/cleaned_text/USA_AI_Executive_Order_English_Cleaned.txt"

# Load the cleaned texts
text1 = load_text(text_file_1)
text2 = load_text(text_file_2)

if not spacy.util.is_package("en_core_web_sm"):
    silent_spacy_download("en_core_web_sm")
nlp = spacy.load("en_core_web_sm", disable=["tok2vec", "tagger", "parser", "ner",
                                                "lemmatizer", "attibute_ruler"]) 
nltk.download("stopwords", quiet=True)
stop_words_en = nltk.corpus.stopwords.words("english")

# Custom tokenization function
def custom_tokenizer(text):
    tokenized_text = nlp(text)
    return [tok.text.strip() for tok in tokenized_text if tok.text.strip() != '' and not tok.is_punct]

# Manual tokenization using the custom tokenizer (for inspection purposes)
tokens_EU_AI = custom_tokenizer(text1)
tokens_US_AI = custom_tokenizer(text2)

# Saving the tokens to files for inspection
with open("data/tokens/EU_AI_Act_Tokens.txt", "w", encoding="utf-8") as f:
    for token in tokens_EU_AI:
        f.write(f"{token}\n")

with open("data/tokens/USA_AI_Executive_Order_Tokens.txt", "w", encoding="utf-8") as f:
    for token in tokens_US_AI:
        f.write(f"{token}\n")


# Load NLTK stopwords
stop_words_en = nltk.corpus.stopwords.words("english")

# Add missing tokenized forms
additional_stopwords = ["'d", "'ll", "'m", "'re", "'s", "'ve", 
                        "could", "might", "must", "n't", "need", 
                        "sha", "wo", "would"]
stop_words_en = list(set(stop_words_en + additional_stopwords))

### 4. Feature extraction

#### 4.1. Bag of Words (BoW)

In [6]:
# Vectorizer Configuration for BoW (required for LDA)
tokenizer_bow = CountVectorizer(analyzer="word",
                                tokenizer=custom_tokenizer,
                                lowercase=True,
                                stop_words=stop_words_en)
text1_bow = tokenizer_bow.fit_transform([text1])
print(f"BoW Matrix Dimensions EU AI Act: {text1_bow.shape}")

text2_bow = tokenizer_bow.fit_transform([text2])
print(f"BoW Matrix Dimensions USA AI Executive Order: {text2_bow.shape}")

BoW Matrix Dimensions EU AI Act: (1, 3842)
BoW Matrix Dimensions USA AI Executive Order: (1, 2912)


#### 4.2. TF-IDF

In [7]:
if text1 and text2:
    # Compute TF-IDF
    feature_names, tfidf_matrix = compute_tfidf([text1, text2])
    
    # Save TF-IDF tokens
    output_directory = "data/tfidf_values"
    save_tfidf_values(output_directory, feature_names, tfidf_matrix)

TF-IDF values saved to: data/tfidf_values/tfidf_doc_1.txt
TF-IDF values saved to: data/tfidf_values/tfidf_doc_2.txt


#### 4.3. Embeddings (Word2Vec)

In [8]:
# Import the Word2VecModel class
from utils.word2vec_module import Word2VecModel

# Create and train the Word2Vec model
embedding_documents = [tokens_EU_AI, tokens_US_AI]
word2vec = Word2VecModel(vector_size=100, # size of the embedding vectors
                         window=5, # context window size
                         min_count=2, # minimum frequency for a word to be included
                         workers=4) # number of CPU cores to use

word2vec.train(embedding_documents)

# Save the trained model
word2vec.save("data/embeddings/word2vec")

# Example: Get embedding for a specific word
get_vector = word2vec.get_vector("AI")
print(f"Vector for 'AI': {get_vector}")


Vector for 'AI': [-0.14866187  0.5967451  -0.05086897 -0.10289579  0.436231   -0.6889783
  0.18889795  1.3001897  -0.279441   -0.28250644 -0.21833727 -0.8097837
  0.067187    0.025595   -0.10132525 -0.16360286  0.6211519  -0.60986346
 -0.24158323 -1.1999272   0.53285295  0.03784047  1.3550559  -0.5035113
  0.04096508 -0.23053542 -0.6991654   0.11641812 -0.18338339  0.25613323
  1.1824374   0.03249398  0.08220202 -0.91154486 -0.35080025  0.61015403
  0.16533157 -0.6529913  -0.10011443 -0.69825554  0.4912114  -0.8590844
 -0.28641987  0.10619823  0.40091848 -0.3081667   0.01335941 -0.2026129
  0.46878043  0.3251462   0.46861973 -0.73733926 -0.3925527  -0.26308906
 -0.59892565  0.17427142  0.43829775  0.22641993 -0.61892223  0.39143327
 -0.00461397 -0.07669893  0.3320811   0.0182785  -0.42427236  0.8801778
  0.46869823  0.997699   -0.9337702   0.82464653 -0.48314092  0.38475496
  1.178694   -0.01402633  0.39689547  0.12208439  0.22020152 -0.14543808
 -0.26899588  0.43343204 -0.3602049  -0.

### 5. Mathematical analysis

#### 5.1. Similarity Measurement (Cosine Similiarity)

In [9]:
import numpy as np

# WORK IN PROGRESS


# Example: Find similar words
similar_words = word2vec.most_similar("AI", topn=5)

print(f"Words similar to 'AI': {similar_words}")

# Example: Compute similarity between two words
similarity = word2vec.similarity("AI", "artificial")

print(f"Similarity between 'AI' and 'artificial': {similarity}")

# Example: Compute similarity between two documents
doc1_vector = word2vec.document_vector(tokens_EU_AI)
doc2_vector = word2vec.document_vector(tokens_US_AI)
doc_similarity = word2vec.cosine_similarity(doc1_vector, doc2_vector)
print(f"Similarity between EU AI Act and USA AI Executive Order: {doc_similarity}")

# Example: Find most similar words to a document
most_similar_to_doc = word2vec.most_similar_to_document(tokens_EU_AI, topn=5)
print(f"Words most similar to EU AI Act: {most_similar_to_doc}")

### TESTING 
# Arithmetic operations with word embeddings
# Example: "regulation" - "law" + "policy" = ?
try:
    result = word2vec.model.wv.most_similar(
        positive=["regulation", "policy"],
        negative=["law"],
        topn=5
    )
    print(f"\n'regulation' - 'law' + 'policy' = {result}")
except KeyError as e:
    print(f"Word not found in vocabulary: {e}")

# Analogy: "risk" is to "assessment" as "data" is to ?
try:
    analogy_result = word2vec.model.wv.most_similar(
        positive=["data", "assessment"],
        negative=["risk"],
        topn=3
    )
    print(f"\nAnalogy - 'risk':'assessment' :: 'data':? = {analogy_result}")
except KeyError as e:
    print(f"Word not found in vocabulary: {e}")

# Find outlier word in a list
try:
    outlier = word2vec.model.wv.doesnt_match(["AI", "algorithm", "technology", "banana"])
    print(f"\nOutlier in ['AI', 'algorithm', 'technology', 'banana']: {outlier}")
except KeyError as e:
    print(f"Word not found in vocabulary: {e}")

# Compute centroid of multiple words (semantic center)
concept_words = ["AI", "data", "model", "system"]
valid_vectors = [word2vec.get_vector(word) for word in concept_words 
                 if word in word2vec.model.wv]
if valid_vectors:
    centroid = np.mean(valid_vectors, axis=0)
    similar_to_centroid = word2vec.model.wv.similar_by_vector(centroid, topn=5)
    print(f"\nWords closest to centroid of {concept_words}: {similar_to_centroid}")

# Measure semantic distance between two documents (Euclidean distance)
euclidean_dist = np.linalg.norm(doc1_vector - doc2_vector)
print(f"\nEuclidean distance between documents: {euclidean_dist:.4f}")

# Find words unique to each document (low cosine similarity)
eu_unique = []
us_unique = []
for word in tokens_EU_AI[:100]:  # Sample first 100 tokens
    if word in word2vec.model.wv:
        sim = word2vec.model.wv.n_similarity([word], tokens_US_AI[:100])
        if sim < 0.3:  # Low similarity threshold
            eu_unique.append((word, sim))

print(f"\nWords more unique to EU document (sample): {eu_unique[:5]}")

Words similar to 'AI': [('to', 0.9998730421066284), ('or', 0.999854564666748), ('including', 0.999853789806366), ('for', 0.9998424053192139), ('in', 0.9998355507850647)]
Similarity between 'AI' and 'artificial': 0.9890989065170288
Similarity between EU AI Act and USA AI Executive Order: 0.999997615814209
Words most similar to EU AI Act: [('and', 0.9999622702598572), ('or', 0.9999439716339111), ('in', 0.9999430775642395), ('a', 0.9999312162399292), ('including', 0.9999299049377441)]

'regulation' - 'law' + 'policy' = [('30', 0.9132196307182312), ('European', 0.9121867418289185), ('Parliament', 0.9120994806289673), ('nologies', 0.9118667244911194), ('Council', 0.9118284583091736)]

Analogy - 'risk':'assessment' :: 'data':? = [('those', 0.9983376860618591), ('including', 0.9983199238777161), ('related', 0.9983078241348267)]

Outlier in ['AI', 'algorithm', 'technology', 'banana']: AI

Words closest to centroid of ['AI', 'data', 'model', 'system']: [('AI', 0.9999215006828308), ('to', 0.9999

#### 5.2. Dimensionality Reduction (t-SNE)

In [10]:
# # t-SNE Visualization
# from utils.tsne_visualization import plot_tsne_embeddings
# import numpy as np

# # Prepare embeddings for visualization
# words = list(word2vec.model.wv.index_to_key) # Get all words in the vocabulary
# embeddings = np.array([word2vec.get_vector(word) for word in words])  # Convert to NumPy array for t-SNE function to work

# # Plot t-SNE embeddings and save
# plot_tsne_embeddings(embeddings, words, "plots/tsne_embeddings.png")


### 6. Topic Modeling

#### 6.1. Latent Dirichlet Allocation (LDA)

In [11]:
from sklearn.decomposition import LatentDirichletAllocation

# Combine both documents for topic modeling
combined_bow = tokenizer_bow.fit_transform([text1, text2])
feature_names = tokenizer_bow.get_feature_names_out()

# Initialize and fit LDA model
n_topics = 5  # Number of topics to extract
lda_model = LatentDirichletAllocation(n_components=n_topics,
                                       max_iter=10,
                                       learning_method='online',
                                       random_state=42)
lda_model.fit(combined_bow)

# Display top words for each topic
def display_topics(model, feature_names, n_top_words=10):
    for topic_idx, topic in enumerate(model.components_):
        top_words = [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
        print(f"Topic {topic_idx + 1}: {', '.join(top_words)}")

print("LDA Topics:")
display_topics(lda_model, feature_names)

# Get topic distribution for each document
doc_topics = lda_model.transform(combined_bow)
print(f"\nTopic distribution for EU AI Act: {doc_topics[0]}")
print(f"Topic distribution for USA AI Executive Order: {doc_topics[1]}")

LDA Topics:
Topic 1: ai, shall, systems, system, regulation, data, article, use, union, including
Topic 2: ai, systems, regulation, system, article, shall, eu, union, higherisk, data
Topic 3: ai, shall, secretary, use, order, including, appropriate, security, within, days
Topic 4: ai, regulation, systems, shall, eu, article, law, system, authorities, union
Topic 5: ai, shall, systems, use, system, including, article, regulation, secretary, appropriate

Topic distribution for EU AI Act: [4.49848979e-06 9.99981954e-01 4.55218449e-06 4.48661941e-06
 4.50873182e-06]
Topic distribution for USA AI Executive Order: [1.46590352e-05 1.48643754e-05 9.99941155e-01 1.46203212e-05
 1.47016379e-05]


#### 6.2. LDA with TF-IDF

In [12]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer

# Create TF-IDF vectorizer with custom tokenizer
tfidf_vectorizer = TfidfVectorizer(tokenizer=custom_tokenizer,
                                    lowercase=True,
                                    stop_words=stop_words_en)

# Fit and transform the texts
tfidf_for_lda = tfidf_vectorizer.fit_transform([text1, text2])
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

# Initialize and fit LDA model with TF-IDF
n_topics_tfidf = 5
lda_tfidf_model = LatentDirichletAllocation(n_components=n_topics_tfidf,
                                             max_iter=10,
                                             learning_method='online',
                                             random_state=42)
lda_tfidf_model.fit(tfidf_for_lda)

# Display top words for each topic
print("LDA Topics (using TF-IDF):")
display_topics(lda_tfidf_model, tfidf_feature_names)

# Get topic distribution for each document
doc_topics_tfidf = lda_tfidf_model.transform(tfidf_for_lda)
print(f"\nTopic distribution for EU AI Act (TF-IDF): {doc_topics_tfidf[0]}")
print(f"Topic distribution for USA AI Executive Order (TF-IDF): {doc_topics_tfidf[1]}")

LDA Topics (using TF-IDF):
Topic 1: deciding, 1560, presence, equivalent, infor-, improvements, societies, considered, inves-, managed
Topic 2: ai, shall, systems, including, regulation, secretary, data, article, system, eu
Topic 3: wheel, actors, overfitting, states, fitting, infringe, want, facilitating, person, strict
Topic 4: helps, 2006, substantiated, navigating, rather, gagements, essary, explain, entrepreneurs, 5.2017
Topic 5: obligation, last, illustrations, abusive, december, accelerate, 14111, work-, advisors, appeal

Topic distribution for EU AI Act (TF-IDF): [0.01122038 0.95512512 0.01121686 0.01121364 0.011224  ]
Topic distribution for USA AI Executive Order (TF-IDF): [0.01032565 0.95871889 0.01031669 0.01031425 0.01032452]
