# Importing AGORA dataset

In [2]:
import pandas as pd

In [198]:
df = pd.read_csv('agora/documents.csv')

In [199]:
print(f"This datasets consists of {len(df)} AI-related bills in the USA.")

This datasets consists of 650 AI-related bills in the USA.


In [200]:
def add_full_text(agora_id):
    text = None
    try:
        with open(f'agora/fulltext/{agora_id}.txt', encoding='utf-8') as file:
            text = file.read()
    except FileNotFoundError as e:
        print(e)
    return text

In [201]:
df["full_text"] = df["AGORA ID"].apply(add_full_text)

[Errno 2] No such file or directory: 'agora/fulltext/207.txt'
[Errno 2] No such file or directory: 'agora/fulltext/494.txt'
[Errno 2] No such file or directory: 'agora/fulltext/402.txt'
[Errno 2] No such file or directory: 'agora/fulltext/29.txt'


In [None]:
print("Here is a preview of the first 5 bills in the dataset for reference:\n")
print(df.head(5))

In [204]:
df.to_csv('agora_raw.csv')

# Word Embeddings

In [3]:
df = pd.read_csv('agora_raw.csv')

In [4]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk import sent_tokenize
from string import punctuation
translator = str.maketrans('','',punctuation) 
from nltk.corpus import stopwords
stoplist = set(stopwords.words('english'))
from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\emilx\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\emilx\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
def normalize_text(doc):
    "Input doc and return clean list of tokens"
    doc = doc.replace('\r', ' ').replace('\n', ' ')
    lower = doc.lower() # all lower case
    nopunc = lower.translate(translator) # remove punctuation
    words = nopunc.split() # split into tokens
    nostop = [w for w in words if w not in stoplist] # remove stopwords
    no_numbers = [w if not w.isdigit() else '#' for w in nostop] # normalize numbers
    # stemmed = [stemmer.stem(w) for w in no_numbers] # stem each word
    return no_numbers

In [6]:
def get_sentences(doc):
    sent=[]
    for raw in sent_tokenize(doc):
        raw2 = normalize_text(raw)
        sent.append(raw2)
    return sent

In [7]:
sample = list(df["full_text"])

In [8]:
sentences = []
for doc in sample:
    try:
        sentences += get_sentences(doc)
    except:
        pass

In [9]:
from random import shuffle

shuffle(sentences)

In [10]:
# train the model
from gensim.models import Word2Vec
w2v = Word2Vec(sentences,  # list of tokenized sentences
               workers = 8, # Number of threads to run in parallel
               vector_size=300,  # Word vector dimensionality     
               min_count =  25, # Minimum word count  
               window = 5, # Context window size      
               sample = 1e-3, # Downsample setting for frequent words
               )

In [15]:
with open('legal_words.txt') as file:
    common_law_terms = file.read()
common_law_terms = common_law_terms.split(',')

In [17]:
common_law_similar = list()
for term in common_law_terms:
    try:
        for similar_term, _ in w2v.wv.most_similar(term)[:5]:
            common_law_similar.append(similar_term)
    except:
        pass

In [19]:
common_law_terms = common_law_terms + common_law_similar

# Pre-processing dataset

In [21]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
from gensim.models import Phrases
from gensim.models.phrases import Phraser
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\emilx\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\emilx\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [25]:
def preprocess_text(text):
    if isinstance(text, str):
        text = text.lower()
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        tokens = word_tokenize(text)
        stop_words = set(stopwords.words('english'))
        tokens = [token for token in tokens if token not in list(stop_words) + common_law_terms]
        bigram = Phrases(tokens, min_count=5, threshold=10)
        bigram_mod = Phraser(bigram)
        tokens_with_bigrams = bigram_mod[tokens]
        return ' '.join(tokens_with_bigrams)
    return ''

In [26]:
df["full_text_preprocessed"] = df["full_text"].apply(preprocess_text)

In [28]:
df.to_csv("agora_processed.csv")

### LDA

In [4]:
df = pd.read_csv("agora_processed.csv")

In [29]:
vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=1000)
doc_term_matrix = vectorizer.fit_transform(df["full_text_preprocessed"])

In [30]:
vectorizer.get_feature_names_out()

array(['ability', 'academia', 'academic', 'accelerate', 'access',
       'accessible', 'accordance', 'account', 'accountability',
       'accuracy', 'achieve', 'acquisition', 'across', 'acting', 'action',
       'actions', 'activities', 'activity', 'actors', 'acts', 'actual',
       'added', 'adding', 'addition', 'additional', 'address',
       'administration', 'administrative', 'administrator', 'adopt',
       'adoption', 'ads', 'advance', 'advanced', 'advancing', 'adverse',
       'affairs', 'affect', 'affected', 'age', 'agencies', 'agency',
       'agencys', 'agreement', 'agreements', 'agriculture', 'ai', 'air',
       'aircraft', 'algorithm', 'algorithmic', 'algorithms', 'allow',
       'alternative', 'amended', 'america', 'american', 'among', 'amount',
       'amounts', 'analysis', 'annex', 'annual', 'another', 'applicable',
       'applicant', 'application', 'applications', 'applied', 'apply',
       'appointed', 'approach', 'approaches', 'appropriate',
       'appropriated', 'a

In [31]:
doc_term_matrix.toarray()

array([[0, 1, 0, ..., 0, 5, 0],
       [5, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 6, 1, 1],
       ...,
       [2, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 3, 0]], dtype=int64)

In [32]:
n_topics = 3
lda = LatentDirichletAllocation(
    n_components=n_topics,
    max_iter=200,
    learning_method='online',
    random_state=42,
    batch_size=128,
    verbose=0
)

In [33]:
lda_output = lda.fit_transform(doc_term_matrix)

In [34]:
feature_names = vectorizer.get_feature_names_out()

In [35]:
print("\nTop words in each topic:")
for topic_idx, topic in enumerate(lda.components_):
    top_words_idx = topic.argsort()[:-10-1:-1]
    top_words = [feature_names[i] for i in top_words_idx]
    print(f"\nTopic {topic_idx + 1}:")
    print(", ".join(top_words))


Top words in each topic:

Topic 1:
information, covered, state, person, commission, means, including, individual, service, data

Topic 2:
ai, systems, data, use, system, risks, security, development, including, safety

Topic 3:
secretary, national, including, subsection, director, research, technology, program, defense, paragraph


In [42]:
topic_columns = [f'Topic_{i+1}' for i in range(n_topics)]
df_topics = pd.DataFrame(lda_output, columns=topic_columns)
df = pd.concat([df, df_topics], axis=1)

In [43]:
print("\nExample documents for each topic:")
for topic_idx in range(n_topics):
    print(f"\nTop documents for Topic {topic_idx + 1}:")
    top_docs = df.nlargest(3, f'Topic_{topic_idx+1}')
    for idx, row in top_docs.iterrows():
        print(f"\Text: {row['full_text_preprocessed'][:200]}...")
        print(f"Topic {topic_idx + 1} probability: {row[f'Topic_{topic_idx+1}']:.3f}")


Example documents for each topic:

Top documents for Topic 1:
\Text: enacted state utah b amended read b definitions used chapter child sexual abuse material means visual depiction including live performance photograph film video picture computer computergenerated imag...
Topic 1 probability: 0.999
\Text: relating elections amending enacting sections campaign reporting adding disclaimer requirements advertisements containing materially deceptive media creating crime distributing entering agreement anot...
Topic 1 probability: 0.999
\Text: short title cited tools address known exploitation immobilizing technological deepfakes websites networks take sec criminal prohibition intentional disclosure nonconsensual intimate visual depictions ...
Topic 1 probability: 0.999

Top documents for Topic 2:
\Text: hiroshima process international code conduct organizations developing advanced ai systems basis international guiding principles organizations developing advanced ai systems international 

### BERTopic

In [36]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN

  from .autonotebook import tqdm as notebook_tqdm


In [55]:
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')
hdbscan_model = HDBSCAN(min_cluster_size=20, metric='euclidean', prediction_data=True)
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    min_topic_size=10,     # Merges tiny topics into larger ones, when set to 20 only 2 topics
    verbose=True,
    calculate_probabilities=True
)

In [56]:
topics, probs = topic_model.fit_transform(df['full_text_preprocessed'])

2025-05-28 12:20:26,124 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|█████████████████████████████████████████████████████████████████████████| 21/21 [01:27<00:00,  4.14s/it]
2025-05-28 12:21:54,834 - BERTopic - Embedding - Completed ✓
2025-05-28 12:21:54,837 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-05-28 12:21:58,363 - BERTopic - Dimensionality - Completed ✓
2025-05-28 12:21:58,366 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-05-28 12:21:58,557 - BERTopic - Cluster - Completed ✓
2025-05-28 12:21:58,572 - BERTopic - Representation - Extracting topics from clusters using representation models.
2025-05-28 12:22:00,691 - BERTopic - Representation - Completed ✓


In [63]:
print("\nTop words for each topic:")
for topic_id in topic_model.get_topics():
    if topic_id != -1:  # Skip the outlier topic (-1)
        words = topic_model.get_topic(topic_id)
        print(f"\nTopic {topic_id}:")
        print(", ".join([word for word, _ in words[:10]]))  # Show top 10 words


Top words for each topic:

Topic 0:
defense, education, director, program, research, secretary, subsection, national, including, paragraph

Topic 1:
ai, systems, system, data, use, article, security, risks, model, development

Topic 2:
energy, research, secretary, national, program, including, weather, development, technologies, data

Topic 3:
person, individual, election, media, image, audio, means, visual, sexual, video

Topic 4:
automated, system, data, decision, information, use, state, employer, used, systems

Topic 5:
covered, foreign, entity, president, secretary, security, regulations, term, national, activity

Topic 6:
health, care, plan, services, medical, patient, ai, use, program, benefits

Topic 7:
commission, agency, council, data, digital, criticalimpact, chief, government, state, including


In [97]:
BERT_probs = pd.DataFrame(probs, columns=['BERT_topic0', 'BERT_topic1', 'BERT_topic2', 'BERT_topic3', 'BERT_topic4', 'BERT_topic5', 'BERT_topic6', 'BERT_topic7'])
df = pd.concat([df, BERT_probs], axis=1)

In [98]:
dummy_topic_indices = np.where(np.array(topics) == -1)[0]
dummy_topic_indices

array([  4,   9,  20,  22,  25,  26,  34,  35,  36,  37,  38,  39,  40,
        41,  42,  43,  44,  45,  53,  54,  56,  82,  83,  91,  92,  93,
       101, 105, 107, 116, 117, 123, 132, 137, 143, 149, 167, 174, 175,
       176, 178, 180, 181, 182, 183, 184, 185, 187, 188, 189, 195, 198,
       210, 218, 220, 222, 225, 229, 232, 234, 238, 239, 243, 246, 248,
       251, 252, 253, 261, 269, 271, 272, 283, 290, 291, 293, 296, 297,
       298, 301, 302, 306, 307, 310, 315, 316, 317, 318, 321, 324, 325,
       337, 340, 341, 343, 344, 353, 357, 358, 361, 373, 374, 381, 386,
       390, 399, 400, 402, 411, 414, 416, 417, 423, 425, 427, 430, 432,
       437, 442, 443, 448, 450, 452, 454, 456, 460, 462, 464, 467, 470,
       472, 474, 476, 478, 479, 486, 488, 490, 495, 498, 499, 502, 503,
       504, 505, 507, 508, 509, 514, 515, 516, 517, 518, 519, 527, 532,
       533, 534, 535, 537, 547, 548, 552, 554, 555, 558, 559, 566, 568,
       569, 570, 582, 583, 584, 585, 586, 588, 589, 600, 602, 60

In [108]:
df = df.drop(dummy_topic_indices)
df = df.reset_index(drop=True)

In [123]:
df.to_csv('agora_topic_probabilities.csv')