## Exploring Themes in AI Regulation: A Comparative NLP Study

### Setup and Imports

Loading the libraries needed for tokenization, stopword removal, and topic modeling.


In [1]:
from utils.extract_pdf_contents import process_pdfs
from utils.clean_texts import clean_EU_legal_text, clean_US_legal_text
from utils.TFIDF_model import load_text, compute_tfidf, save_tfidf_values
from sklearn.feature_extraction.text import CountVectorizer
import spacy
import nltk
import spacy


### 1. Text Extraction

In [2]:
# Define the input and output directories
input_directory = "data/raw" 
output_directory = "data/extracted_text" 


# Call the function to process the PDFs
process_pdfs(input_directory, output_directory)

Extracted text saved to: data/extracted_text/EU_AI_Act_English.txt
Extracted text saved to: data/extracted_text/USA_AI_Executive_Order_English.txt


### 2. Data Wrangling and Loading

In [3]:
# First the EU Act

# Read the file content and pass it to clean_legal_text
with open("data/extracted_text/EU_AI_Act_English.txt", "r", encoding="utf-8") as file:
    text = file.read()

clean_EU_text = clean_EU_legal_text(text)

# Save cleaned text to /data/cleaned_text directory
with open("data/cleaned_text/EU_AI_Act_English_Cleaned.txt", "w", encoding="utf-8") as file:
    file.write(clean_EU_text)


# Second, the USA AI Executive Order

# Read the file content and pass it to clean_legal_text
with open("data/extracted_text/USA_AI_Executive_Order_English.txt", "r", encoding="utf-8") as file:
    text = file.read()
    clean_US_text = clean_US_legal_text(text)

# Save cleaned text to /data/cleaned_text directory
with open("data/cleaned_text/USA_AI_Executive_Order_English_Cleaned.txt", "w", encoding="utf-8") as file:
    file.write(clean_US_text)

### 3. Tokenization

In [4]:
# Paths to the cleaned text files
text_file_1 = "data/cleaned_text/EU_AI_Act_English_Cleaned.txt"
text_file_2 = "data/cleaned_text/USA_AI_Executive_Order_English_Cleaned.txt"

# Load the cleaned texts
text1 = load_text(text_file_1)
text2 = load_text(text_file_2)

spacy.cli.download("en_core_web_sm")
nlp = spacy.load("en_core_web_sm", disable=["tok2vec", "tagger", "parser", "ner",
                                                "lemmatizer", "attibute_ruler"]) 
nltk.download("stopwords")
stop_words_en = nltk.corpus.stopwords.words("english")

# Custom tokenization function
def custom_tokenizer(text):
    tokenized_text = nlp(text)
    return [tok.text.strip() for tok in tokenized_text if tok.text.strip() != '' and not tok.is_punct]

# Manual tokenization using the custom tokenizer (for inspection purposes)
tokens_EU_AI = custom_tokenizer(text1)
tokens_US_AI = custom_tokenizer(text2)

# Saving the tokens to files for inspection
with open("data/tokens/EU_AI_Act_Tokens.txt", "w", encoding="utf-8") as f:
    for token in tokens_EU_AI:
        f.write(f"{token}\n")

with open("data/tokens/USA_AI_Executive_Order_Tokens.txt", "w", encoding="utf-8") as f:
    for token in tokens_US_AI:
        f.write(f"{token}\n")

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m3.0 MB/s[0m  [33m0:00:04[0mm0:00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nicolasreichardt/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### 4. Feature extraction

#### 4.1. Bag of Words (BoW)

In [5]:
# Vectorizer Configuration for BoW (required for LDA)
tokenizer_bow = CountVectorizer(analyzer="word",
                                tokenizer=custom_tokenizer,
                                lowercase=True,
                                stop_words=stop_words_en)
text1_bow = tokenizer_bow.fit_transform([text1])
print(f"BoW Matrix Dimensions EU AI Act: {text1_bow.shape}")

text2_bow = tokenizer_bow.fit_transform([text2])
print(f"BoW Matrix Dimensions USA AI Executive Order: {text2_bow.shape}")



BoW Matrix Dimensions EU AI Act: (1, 3847)
BoW Matrix Dimensions USA AI Executive Order: (1, 2916)


#### 4.2. TF-IDF

In [6]:
if text1 and text2:
    # Compute TF-IDF
    feature_names, tfidf_matrix = compute_tfidf([text1, text2])
    
    # Save TF-IDF tokens
    output_directory = "data/tfidf_values"
    save_tfidf_values(output_directory, feature_names, tfidf_matrix)

TF-IDF values saved to: data/tfidf_values/tfidf_doc_1.txt
TF-IDF values saved to: data/tfidf_values/tfidf_doc_2.txt


#### 4.3. Embeddings (Word2Vec)

In [7]:
# Import the Word2VecModel class
from utils.word2vec_module import Word2VecModel

# Create and train the Word2Vec model
embedding_documents = [tokens_EU_AI, tokens_US_AI]
word2vec = Word2VecModel(vector_size=100, # size of the embedding vectors
                         window=5, # context window size
                         min_count=2, # minimum frequency for a word to be included
                         workers=4) # number of CPU cores to use

word2vec.train(embedding_documents)

# Save the trained model
word2vec.save("data/embeddings/word2vec")

# Example: Get embedding for a specific word
get_vector = word2vec.get_vector("AI")
print(f"Vector for 'AI': {get_vector}")


Vector for 'AI': [-0.14315392  0.5928339  -0.04445674 -0.10517053  0.43440977 -0.706105
  0.18408835  1.300686   -0.28596613 -0.28029504 -0.21671234 -0.81291133
  0.06605431  0.02548042 -0.10773135 -0.1624678   0.62166744 -0.6045129
 -0.23169054 -1.1981966   0.54091907  0.03514658  1.3500053  -0.49564657
  0.03897101 -0.21780694 -0.6909766   0.13097717 -0.18357953  0.25638697
  1.186942    0.0257459   0.0734195  -0.9116043  -0.35132793  0.6168098
  0.17191288 -0.6434044  -0.09544375 -0.69761413  0.49529818 -0.86432093
 -0.2885197   0.10693899  0.40967798 -0.2996861   0.02132068 -0.20597439
  0.46947056  0.32513988  0.46586987 -0.737623   -0.3926331  -0.2627701
 -0.60077554  0.17687061  0.4329845   0.22261077 -0.62416184  0.39036953
  0.00199227 -0.08014946  0.32926685  0.01535535 -0.41735122  0.8843592
  0.477445    0.9956641  -0.92986685  0.8244599  -0.48314843  0.38430703
  1.1822354  -0.01122706  0.3967893   0.12589954  0.20575415 -0.1413472
 -0.27066198  0.4374104  -0.35997605 -0.3

### 5. Mathematical analysis

#### 5.1. Similarity Measurement (Cosine Similiarity)

In [8]:
import numpy as np

# WORK IN PROGRESS


# Example: Find similar words
similar_words = word2vec.most_similar("AI", topn=5)

print(f"Words similar to 'AI': {similar_words}")

# Example: Compute similarity between two words
similarity = word2vec.similarity("AI", "artificial")

print(f"Similarity between 'AI' and 'artificial': {similarity}")

# Example: Compute similarity between two documents
doc1_vector = word2vec.document_vector(tokens_EU_AI)
doc2_vector = word2vec.document_vector(tokens_US_AI)
doc_similarity = word2vec.cosine_similarity(doc1_vector, doc2_vector)
print(f"Similarity between EU AI Act and USA AI Executive Order: {doc_similarity}")

# Example: Find most similar words to a document
most_similar_to_doc = word2vec.most_similar_to_document(tokens_EU_AI, topn=5)
print(f"Words most similar to EU AI Act: {most_similar_to_doc}")

### TESTING 
# Arithmetic operations with word embeddings
# Example: "regulation" - "law" + "policy" = ?
try:
    result = word2vec.model.wv.most_similar(
        positive=["regulation", "policy"],
        negative=["law"],
        topn=5
    )
    print(f"\n'regulation' - 'law' + 'policy' = {result}")
except KeyError as e:
    print(f"Word not found in vocabulary: {e}")

# Analogy: "risk" is to "assessment" as "data" is to ?
try:
    analogy_result = word2vec.model.wv.most_similar(
        positive=["data", "assessment"],
        negative=["risk"],
        topn=3
    )
    print(f"\nAnalogy - 'risk':'assessment' :: 'data':? = {analogy_result}")
except KeyError as e:
    print(f"Word not found in vocabulary: {e}")

# Find outlier word in a list
try:
    outlier = word2vec.model.wv.doesnt_match(["AI", "algorithm", "technology", "banana"])
    print(f"\nOutlier in ['AI', 'algorithm', 'technology', 'banana']: {outlier}")
except KeyError as e:
    print(f"Word not found in vocabulary: {e}")

# Compute centroid of multiple words (semantic center)
concept_words = ["AI", "data", "model", "system"]
valid_vectors = [word2vec.get_vector(word) for word in concept_words 
                 if word in word2vec.model.wv]
if valid_vectors:
    centroid = np.mean(valid_vectors, axis=0)
    similar_to_centroid = word2vec.model.wv.similar_by_vector(centroid, topn=5)
    print(f"\nWords closest to centroid of {concept_words}: {similar_to_centroid}")

# Measure semantic distance between two documents (Euclidean distance)
euclidean_dist = np.linalg.norm(doc1_vector - doc2_vector)
print(f"\nEuclidean distance between documents: {euclidean_dist:.4f}")

# Find words unique to each document (low cosine similarity)
eu_unique = []
us_unique = []
for word in tokens_EU_AI[:100]:  # Sample first 100 tokens
    if word in word2vec.model.wv:
        sim = word2vec.model.wv.n_similarity([word], tokens_US_AI[:100])
        if sim < 0.3:  # Low similarity threshold
            eu_unique.append((word, sim))

print(f"\nWords more unique to EU document (sample): {eu_unique[:5]}")

Words similar to 'AI': [('to', 0.9998752474784851), ('including', 0.9998546242713928), ('or', 0.9998533725738525), ('for', 0.9998430609703064), ('in', 0.9998379945755005)]
Similarity between 'AI' and 'artificial': 0.9891317486763
Similarity between EU AI Act and USA AI Executive Order: 0.9999977946281433
Words most similar to EU AI Act: [('and', 0.999962329864502), ('or', 0.9999434947967529), ('in', 0.9999428987503052), ('a', 0.9999321103096008), ('including', 0.9999312162399292)]

'regulation' - 'law' + 'policy' = [('30', 0.9151983261108398), ('European', 0.9141279458999634), ('nologies', 0.9138852953910828), ('Council', 0.9138129949569702), ('Parliament', 0.9137921929359436)]

Analogy - 'risk':'assessment' :: 'data':? = [('those', 0.9983172416687012), ('including', 0.9983012676239014), ('related', 0.9982726573944092)]

Outlier in ['AI', 'algorithm', 'technology', 'banana']: technology

Words closest to centroid of ['AI', 'data', 'model', 'system']: [('AI', 0.9999222755432129), ('to',

#### 5.2. Dimensionality Reduction (t-SNE)

In [9]:
# # t-SNE Visualization
# from utils.tsne_visualization import plot_tsne_embeddings
# import numpy as np

# # Prepare embeddings for visualization
# words = list(word2vec.model.wv.index_to_key) # Get all words in the vocabulary
# embeddings = np.array([word2vec.get_vector(word) for word in words])  # Convert to NumPy array for t-SNE function to work

# # Plot t-SNE embeddings and save
# plot_tsne_embeddings(embeddings, words, "plots/tsne_embeddings.png")


### 6. Topic Modeling

#### 6.1. Latent Dirichlet Allocation (LDA)

In [10]:
from sklearn.decomposition import LatentDirichletAllocation

# Combine both documents for topic modeling
combined_bow = tokenizer_bow.fit_transform([text1, text2])
feature_names = tokenizer_bow.get_feature_names_out()

# Initialize and fit LDA model
n_topics = 5  # Number of topics to extract
lda_model = LatentDirichletAllocation(n_components=n_topics,
                                       max_iter=10,
                                       learning_method='online',
                                       random_state=42)
lda_model.fit(combined_bow)

# Display top words for each topic
def display_topics(model, feature_names, n_top_words=10):
    for topic_idx, topic in enumerate(model.components_):
        top_words = [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
        print(f"Topic {topic_idx + 1}: {', '.join(top_words)}")

print("LDA Topics:")
display_topics(lda_model, feature_names)

# Get topic distribution for each document
doc_topics = lda_model.transform(combined_bow)
print(f"\nTopic distribution for EU AI Act: {doc_topics[0]}")
print(f"Topic distribution for USA AI Executive Order: {doc_topics[1]}")



LDA Topics:
Topic 1: ai, shall, systems, article, including, system, use, data, market, regulation
Topic 2: ai, systems, regulation, system, article, shall, eu, union, higherisk, data
Topic 3: ai, systems, regulation, shall, system, article, higherisk, eu, data, union
Topic 4: ai, shall, secretary, use, order, including, appropriate, security, within, agencies
Topic 5: ai, regulation, article, system, shall, union, systems, data, eu, law

Topic distribution for EU AI Act: [4.49007183e-06 9.99982005e-01 4.48528245e-06 4.53894054e-06
 4.48116483e-06]
Topic distribution for USA AI Executive Order: [1.46274961e-05 1.48343713e-05 1.46035381e-05 9.99941343e-01
 1.45915004e-05]


#### 6.2. LDA with TF-IDF

In [11]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer

# Create TF-IDF vectorizer with custom tokenizer
tfidf_vectorizer = TfidfVectorizer(tokenizer=custom_tokenizer,
                                    lowercase=True,
                                    stop_words=stop_words_en)

# Fit and transform the texts
tfidf_for_lda = tfidf_vectorizer.fit_transform([text1, text2])
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

# Initialize and fit LDA model with TF-IDF
n_topics_tfidf = 5
lda_tfidf_model = LatentDirichletAllocation(n_components=n_topics_tfidf,
                                             max_iter=10,
                                             learning_method='online',
                                             random_state=42)
lda_tfidf_model.fit(tfidf_for_lda)

# Display top words for each topic
print("LDA Topics (using TF-IDF):")
display_topics(lda_tfidf_model, tfidf_feature_names)

# Get topic distribution for each document
doc_topics_tfidf = lda_tfidf_model.transform(tfidf_for_lda)
print(f"\nTopic distribution for EU AI Act (TF-IDF): {doc_topics_tfidf[0]}")
print(f"Topic distribution for USA AI Executive Order (TF-IDF): {doc_topics_tfidf[1]}")



LDA Topics (using TF-IDF):
Topic 1: decides, 1560, preparing, equity, influencing, improvement, smooth, considered, inventory, manageability
Topic 2: ai, article, eu, regulation, systems, system, higherisk, union, shall, data
Topic 3: watermarks, act, others, fines, standardised, inform, voice, extraordinary, stopping, period
Topic 4: ai, secretary, shall, agencies, federal, appropriate, days, director, order, security
Topic 5: notifying, label, identical, 98(2, data-, aa, 131, whether, advance, annexed

Topic distribution for EU AI Act (TF-IDF): [0.0112092  0.95511206 0.0112093  0.01125828 0.01121116]
Topic distribution for USA AI Executive Order (TF-IDF): [0.01030532 0.01033629 0.01030087 0.95875435 0.01030316]
