# Importing AGORA dataset

In [1]:
import pandas as pd

In [13]:
df = pd.read_csv('agora/documents.csv')

In [14]:
print(f"This datasets consists of {len(df)} AI-related bills in the USA.")

This datasets consists of 650 AI-related bills in the USA.


In [49]:
def add_full_text(agora_id):
    text = None
    try:
        with open(f'agora/fulltext/{agora_id}.txt', encoding='utf-8') as file:
            text = file.read()
    except FileNotFoundError as e:
        print(e)
    return text

In [50]:
df["full_text"] = df["AGORA ID"].apply(add_full_text)

[Errno 2] No such file or directory: 'agora/fulltext/207.txt'
[Errno 2] No such file or directory: 'agora/fulltext/494.txt'
[Errno 2] No such file or directory: 'agora/fulltext/402.txt'
[Errno 2] No such file or directory: 'agora/fulltext/29.txt'


In [51]:
print("Here is a preview of the first 5 bills in the dataset for reference:\n")
print(df.head(5))

Here is a preview of the first 5 bills in the dataset for reference:

   AGORA ID                                      Official name  \
0       444                              CREATE AI Act of 2023   
1      1723  National Standard of the People’s Republic of ...   
2      1721                              Idaho House Bill 2472   
3       281  A Bill in the District of Columbia to prohibit...   
4       175  Limited Applicability of Consumer Financial Pr...   

                                         Casual name  \
0                                                NaN   
1  Basic Safety Requirements for Generative Artif...   
2  Idaho HB 2472 (Managed Care Reform and Patient...   
3      Stop Discrimination by Algorithms Act of 2023   
4  Limited Applicability of Consumer Financial Pr...   

                                    Link to document  \
0  https://www.congress.gov/bill/118th-congress/s...   
1  https://cset.georgetown.edu/wp-content/uploads...   
2  https://www.ilga.gov/legi

# Pre-processing dataset for LDA/BERTTopic

In [62]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
from gensim.models import Phrases
from gensim.models.phrases import Phraser
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\emilx\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\emilx\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [86]:
def preprocess_text(text):
    if isinstance(text, str):
        text = text.lower()
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        tokens = word_tokenize(text)
        stop_words = set(stopwords.words('english'))
        tokens = [token for token in tokens if token not in stop_words]
        bigram = Phrases(tokens, min_count=5, threshold=10)
        bigram_mod = Phraser(bigram)
        tokens_with_bigrams = bigram_mod[tokens]
        return ' '.join(tokens_with_bigrams)
    return ''

In [88]:
df["full_text_preprocessed"] = df["full_text"].apply(preprocess_text)

In [90]:
vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=1000)
doc_term_matrix = vectorizer.fit_transform(df["full_text_preprocessed"])

In [93]:
vectorizer.get_feature_names_out()

array(['aa', 'ability', 'academic', 'accelerate', 'access', 'accessible',
       'accordance', 'account', 'accountability', 'accuracy', 'achieve',
       'acquisition', 'across', 'act', 'acting', 'action', 'actions',
       'activities', 'activity', 'actors', 'acts', 'actual', 'adding',
       'addition', 'additional', 'address', 'administration',
       'administrative', 'administrator', 'adopt', 'adoption', 'ads',
       'advance', 'advanced', 'advancing', 'adverse', 'advisory',
       'affairs', 'affect', 'affected', 'age', 'agencies', 'agency',
       'agencys', 'agreement', 'agriculture', 'ai', 'air', 'aircraft',
       'algorithm', 'algorithmic', 'algorithms', 'allow', 'also',
       'alternative', 'amended', 'america', 'american', 'among', 'amount',
       'amounts', 'analysis', 'annex', 'annual', 'annually', 'another',
       'applicable', 'application', 'applications', 'applied', 'apply',
       'appointed', 'approach', 'approaches', 'appropriate',
       'appropriated', 'appr

In [96]:
doc_term_matrix.toarray()

array([[0, 0, 0, ..., 0, 5, 0],
       [0, 5, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 6, 1, 1],
       ...,
       [0, 2, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 3, 0]], dtype=int64)

In [97]:
n_topics = 5
lda = LatentDirichletAllocation(
    n_components=n_topics,
    max_iter=200,
    learning_method='online',
    random_state=42,
    batch_size=128,
    verbose=0
)

In [98]:
lda_output = lda.fit_transform(doc_term_matrix)

In [99]:
feature_names = vectorizer.get_feature_names_out()

In [101]:
print("\nTop words in each topic:")
for topic_idx, topic in enumerate(lda.components_):
    top_words_idx = topic.argsort()[:-10-1:-1]
    top_words = [feature_names[i] for i in top_words_idx]
    print(f"\nTopic {topic_idx + 1}:")
    print(", ".join(top_words))


Top words in each topic:

Topic 1:
section, shall, act, covered, person, information, may, commission, means, individual

Topic 2:
shall, secretary, national, research, section, united, states, including, technology, subsection

Topic 3:
intelligence, artificial, shall, agency, section, defense, use, system, department, data

Topic 4:
ai, data, systems, use, risks, development, system, security, including, model

Topic 5:
shall, ai, article, system, systems, highrisk, regulation, information, providers, relevant


In [102]:
topic_columns = [f'Topic_{i+1}' for i in range(n_topics)]
df_topics = pd.DataFrame(lda_output, columns=topic_columns)
df = pd.concat([df, df_topics], axis=1)

In [105]:
print("\nExample documents for each topic:")
for topic_idx in range(n_topics):
    print(f"\nTop documents for Topic {topic_idx + 1}:")
    top_docs = df.nlargest(3, f'Topic_{topic_idx+1}')
    for idx, row in top_docs.iterrows():
        print(f"\Text: {row['full_text_preprocessed'][:200]}...")
        print(f"Topic {topic_idx + 1} probability: {row[f'Topic_{topic_idx+1}']:.3f}")


Example documents for each topic:

Top documents for Topic 1:
\Text: sec state sports wagering program standards generalthe attorney general shall approve application section unless attorney general determines proposed state sports wagering program meet standards set f...
Topic 1 probability: 0.999
\Text: bill protect intellectual property rights voice visual likeness individuals purposes enacted senate house representatives united states america congress assembled section short title act may cited nur...
Topic 1 probability: 0.999
\Text: section short title act may cited cooper davis actsec reporting requirements electronic communication service providers remote computing services certain controlled substances violationsa amendments c...
Topic 1 probability: 0.999

Top documents for Topic 2:
\Text: h r improve publicprivate partnerships increase federal research development demonstration related evolution next generation pipeline systems purposes house representatives january mr webe