In [3]:
from pathlib import Path

import nltk
import spacy
from sklearn.feature_extraction.text import CountVectorizer

from utils.clean_texts import clean_EU_legal_text, clean_US_legal_text
from utils.extract_pdf_contents import process_pdfs
from utils.tokenize_TFIDF import load_text, compute_tfidf, save_tfidf_values




## Research paper NLP Pipeline

### 1. Extract the text from the two pdf documents

In [4]:
# Source directory holding the PDF files, and target directory for the extracted text files
input_directory, output_directory = "data/raw", "data/extracted_text"

# Run the PDF extraction helper on the input directory; results go to the output directory
process_pdfs(input_directory, output_directory)

Extracted text saved to: data/extracted_text/EU_AI_Act_English.txt
Extracted text saved to: data/extracted_text/USA_AI_Executive_Order_English.txt


### 2. Clean up the extracted text

In [5]:
# Clean each extracted document with its jurisdiction-specific cleaner and save the
# result under data/cleaned_text.
cleaned_text_directory = Path("data/cleaned_text")
# open(..., "w") raises FileNotFoundError when the parent directory is missing,
# so create it up front (no-op when it already exists).
cleaned_text_directory.mkdir(parents=True, exist_ok=True)

# First, the EU AI Act
with open("data/extracted_text/EU_AI_Act_English.txt", "r", encoding="utf-8") as file:
    text = file.read()

clean_EU_text = clean_EU_legal_text(text)

with open(cleaned_text_directory / "EU_AI_Act_English_Cleaned.txt", "w", encoding="utf-8") as file:
    file.write(clean_EU_text)


# Second, the USA AI Executive Order
with open("data/extracted_text/USA_AI_Executive_Order_English.txt", "r", encoding="utf-8") as file:
    text = file.read()

# Cleaning happens after the input file is closed, mirroring the EU branch
clean_US_text = clean_US_legal_text(text)

with open(cleaned_text_directory / "USA_AI_Executive_Order_English_Cleaned.txt", "w", encoding="utf-8") as file:
    file.write(clean_US_text)

### 3. Tokenization

In [6]:
# Paths to the cleaned text files
text_file_1 = "data/cleaned_text/EU_AI_Act_English_Cleaned.txt"
text_file_2 = "data/cleaned_text/USA_AI_Executive_Order_English_Cleaned.txt"

# Load the cleaned texts
text1 = load_text(text_file_1)
text2 = load_text(text_file_2)

# Download the small English spaCy model only when it is not installed yet, so that
# re-running the notebook does not fetch and reinstall it every time.
if not spacy.util.is_package("en_core_web_sm"):
    spacy.cli.download("en_core_web_sm")

# Only the token texts are consumed downstream (custom_tokenizer reads tok.text and nothing
# else), so every other pipeline component is disabled and spaCy does no extra work.
# TODO: justify these choices (model and disabled components) in the write-up.
nlp = spacy.load("en_core_web_sm", disable=["tok2vec", "tagger", "parser", "ner",
                                            "lemmatizer", "attribute_ruler"])

# English stop-word list from NLTK (the download is skipped by NLTK when already up to date)
nltk.download("stopwords")
stop_words_en = nltk.corpus.stopwords.words("english")

# Custom tokenization function
def custom_tokenizer(text):
    """Tokenize ``text`` with the module-level spaCy pipeline ``nlp``.

    Returns a list holding the whitespace-stripped text of every token,
    leaving out tokens that are empty once stripped.
    """
    kept_tokens = []
    for tok in nlp(text):
        stripped = tok.text.strip()
        if stripped != '':
            kept_tokens.append(stripped)
    return kept_tokens

# Manual tokenization using the custom tokenizer (for inspection purposes)
tokens_EU_AI = custom_tokenizer(text1)
tokens_US_AI = custom_tokenizer(text2)

# Saving the tokens to files for inspection, one token per line
tokens_directory = Path("data/tokens")
# Opening a file for writing fails when its directory is missing, so create it first
tokens_directory.mkdir(parents=True, exist_ok=True)

for tokens_filename, tokens in (("EU_AI_Act_Tokens.txt", tokens_EU_AI),
                                ("USA_AI_Executive_Order_Tokens.txt", tokens_US_AI)):
    with open(tokens_directory / tokens_filename, "w", encoding="utf-8") as f:
        f.writelines(f"{token}\n" for token in tokens)

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m30.9 MB/s[0m  [33m0:00:00[0m eta [36m0:00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nicolasreichardt/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### 4. Feature extraction

#### 4.1. Bag of Words (BoW)

In [7]:
# Vectorizer Configuration for BoW (required for LDA)
def make_bow_vectorizer():
    """Return a fresh, unfitted CountVectorizer carrying the shared BoW configuration.

    A separate instance is needed per document: calling fit_transform again on one
    instance replaces its vocabulary, after which the columns of the first matrix can
    no longer be mapped back to their terms.
    """
    return CountVectorizer(analyzer="word",
                           tokenizer=custom_tokenizer,
                           # The default token_pattern is unused once a tokenizer is given;
                           # None avoids scikit-learn's unused-parameter warning.
                           token_pattern=None,
                           lowercase=True,
                           stop_words=stop_words_en)
    # max_df / min_df are deliberately left unset: each vectorizer is fitted on a single
    # document, so every term has a document frequency of 100%.


# One vectorizer per document, so each keeps the vocabulary matching its own matrix
tokenizer_bow_eu = make_bow_vectorizer()
text1_bow = tokenizer_bow_eu.fit_transform([text1])
print(f"BoW Matrix Dimensions EU AI Act: {text1_bow.shape}")

tokenizer_bow_us = make_bow_vectorizer()
text2_bow = tokenizer_bow_us.fit_transform([text2])
print(f"BoW Matrix Dimensions USA AI Executive Order: {text2_bow.shape}")

# Backward-compatible name: refers to the vectorizer fitted on the USA AI Executive Order
tokenizer_bow = tokenizer_bow_us



BoW Matrix Dimensions EU AI Act: (1, 4823)
BoW Matrix Dimensions USA AI Executive Order: (1, 2933)


#### 4.2. TF-IDF

In [12]:
# TF-IDF is computed over both documents together, so both texts must be non-empty.
if text1 and text2:
    # Compute TF-IDF
    feature_names, tfidf_matrix = compute_tfidf([text1, text2])

    # Save TF-IDF tokens
    # NOTE: this rebinds output_directory, which earlier held the extracted-text directory.
    output_directory = "data/tfidf_values"
    save_tfidf_values(output_directory, feature_names, tfidf_matrix)
else:
    # Report the skip instead of silently producing no output files
    print("Skipping TF-IDF: at least one of the cleaned texts is empty or failed to load.")

TF-IDF values saved to: data/tfidf_values/tfidf_doc_1.txt
TF-IDF values saved to: data/tfidf_values/tfidf_doc_2.txt


#### 4.3. Embeddings (Word2Vec)

In [9]:
# Create word2vec embeddings


#### 4.4. (Optional) Text representation through OpenAI contextualised embeddings

### 5. Mathematical analysis

#### 5.1. Similarity Measurement (Cosine Similarity)

#### 5.2. Dimensionality Reduction (t-SNE and/or PCA)

#### 5.3. Other techniques?

### 6. Topic modeling

#### 6.1. Latent Dirichlet Allocation (LDA)

#### 6.2. BERTopic