# Importing AGORA dataset

In [65]:
import pandas as pd

In [66]:
df = pd.read_csv('agora/documents.csv')

In [67]:
print(f"This datasets consists of {len(df)} AI-related bills in the USA.")

This datasets consists of 650 AI-related bills in the USA.


In [68]:
def add_full_text(agora_id):
    text = None
    try:
        with open(f'agora/fulltext/{agora_id}.txt', encoding='utf-8') as file:
            text = file.read()
    except FileNotFoundError as e:
        print(e)
    return text

In [69]:
df["full_text"] = df["AGORA ID"].apply(add_full_text)

[Errno 2] No such file or directory: 'agora/fulltext/207.txt'
[Errno 2] No such file or directory: 'agora/fulltext/494.txt'
[Errno 2] No such file or directory: 'agora/fulltext/402.txt'
[Errno 2] No such file or directory: 'agora/fulltext/29.txt'


In [70]:
print("Here is a preview of the first 5 bills in the dataset for reference:\n")
print(df.head(5))

Here is a preview of the first 5 bills in the dataset for reference:

   AGORA ID                                      Official name  \
0       444                              CREATE AI Act of 2023   
1      1723  National Standard of the People’s Republic of ...   
2      1721                              Idaho House Bill 2472   
3       281  A Bill in the District of Columbia to prohibit...   
4       175  Limited Applicability of Consumer Financial Pr...   

                                         Casual name  \
0                                                NaN   
1  Basic Safety Requirements for Generative Artif...   
2  Idaho HB 2472 (Managed Care Reform and Patient...   
3      Stop Discrimination by Algorithms Act of 2023   
4  Limited Applicability of Consumer Financial Pr...   

                                    Link to document  \
0  https://www.congress.gov/bill/118th-congress/s...   
1  https://cset.georgetown.edu/wp-content/uploads...   
2  https://www.ilga.gov/legi

In [71]:
df.to_csv('agora_raw.csv')

# Word Embeddings

In [72]:
df = pd.read_csv('agora_raw.csv')

In [73]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk import sent_tokenize
from string import punctuation
translator = str.maketrans('','',punctuation) 
from nltk.corpus import stopwords
stoplist = set(stopwords.words('english'))
from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/noahfehr/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/noahfehr/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [74]:
def normalize_text(doc):
    "Input doc and return clean list of tokens"
    doc = doc.replace('\r', ' ').replace('\n', ' ')
    lower = doc.lower() # all lower case
    nopunc = lower.translate(translator) # remove punctuation
    words = nopunc.split() # split into tokens
    nostop = [w for w in words if w not in stoplist] # remove stopwords
    no_numbers = [w if not w.isdigit() else '#' for w in nostop] # normalize numbers
    # stemmed = [stemmer.stem(w) for w in no_numbers] # stem each word
    return no_numbers

In [75]:
def get_sentences(doc):
    sent=[]
    for raw in sent_tokenize(doc):
        raw2 = normalize_text(raw)
        sent.append(raw2)
    return sent

In [76]:
sample = list(df["full_text"])

In [77]:
sentences = []
for doc in sample:
    try:
        sentences += get_sentences(doc)
    except:
        pass

In [78]:
from random import shuffle

shuffle(sentences)

In [79]:
# train the model
from gensim.models import Word2Vec
w2v = Word2Vec(sentences,  # list of tokenized sentences
               workers = 8, # Number of threads to run in parallel
               vector_size=300,  # Word vector dimensionality     
               min_count =  25, # Minimum word count  
               window = 5, # Context window size      
               sample = 1e-3, # Downsample setting for frequent words
               )

In [80]:
with open('legal_words.txt') as file:
    common_law_terms = file.read()
common_law_terms = common_law_terms.split(',')

In [81]:
common_law_similar = list()
for term in common_law_terms:
    try:
        for similar_term, _ in w2v.wv.most_similar(term)[:5]:
            common_law_similar.append(similar_term)
    except:
        pass

In [82]:
common_law_terms = common_law_terms + common_law_similar

# Pre-processing dataset

In [83]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
from gensim.models import Phrases
from gensim.models.phrases import Phraser
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/noahfehr/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/noahfehr/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [84]:
def preprocess_text(text):
    if isinstance(text, str):
        text = text.lower()
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        tokens = word_tokenize(text)
        stop_words = set(stopwords.words('english'))
        tokens = [token for token in tokens if token not in list(stop_words) + common_law_terms]
        bigram = Phrases(tokens, min_count=5, threshold=10)
        bigram_mod = Phraser(bigram)
        tokens_with_bigrams = bigram_mod[tokens]
        return ' '.join(tokens_with_bigrams)
    return ''

In [85]:
df["full_text_preprocessed"] = df["full_text"].apply(preprocess_text)

KeyboardInterrupt: 

In [42]:
df.to_csv("agora_processed.csv")

### LDA

In [43]:
df = pd.read_csv("agora_processed.csv")

In [44]:
vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=1000)
doc_term_matrix = vectorizer.fit_transform(df["full_text_preprocessed"])

ValueError: np.nan is an invalid document, expected byte or unicode string.

In [30]:
vectorizer.get_feature_names_out()

array(['ability', 'academia', 'academic', 'accelerate', 'access',
       'accessible', 'accordance', 'account', 'accountability',
       'accuracy', 'achieve', 'acquisition', 'across', 'acting', 'action',
       'actions', 'activities', 'activity', 'actors', 'acts', 'actual',
       'added', 'adding', 'addition', 'additional', 'address',
       'administration', 'administrative', 'administrator', 'adopt',
       'adoption', 'ads', 'advance', 'advanced', 'advancing', 'adverse',
       'affairs', 'affect', 'affected', 'age', 'agencies', 'agency',
       'agencys', 'agreement', 'agreements', 'agriculture', 'ai', 'air',
       'aircraft', 'algorithm', 'algorithmic', 'algorithms', 'allow',
       'alternative', 'amended', 'america', 'american', 'among', 'amount',
       'amounts', 'analysis', 'annex', 'annual', 'another', 'applicable',
       'applicant', 'application', 'applications', 'applied', 'apply',
       'appointed', 'approach', 'approaches', 'appropriate',
       'appropriated', 'a

In [31]:
doc_term_matrix.toarray()

array([[0, 1, 0, ..., 0, 5, 0],
       [5, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 6, 1, 1],
       ...,
       [2, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 3, 0]], dtype=int64)

In [32]:
n_topics = 3
lda = LatentDirichletAllocation(
    n_components=n_topics,
    max_iter=200,
    learning_method='online',
    random_state=42,
    batch_size=128,
    verbose=0
)

In [33]:
lda_output = lda.fit_transform(doc_term_matrix)

In [34]:
feature_names = vectorizer.get_feature_names_out()

In [35]:
print("\nTop words in each topic:")
for topic_idx, topic in enumerate(lda.components_):
    top_words_idx = topic.argsort()[:-10-1:-1]
    top_words = [feature_names[i] for i in top_words_idx]
    print(f"\nTopic {topic_idx + 1}:")
    print(", ".join(top_words))


Top words in each topic:

Topic 1:
information, covered, state, person, commission, means, including, individual, service, data

Topic 2:
ai, systems, data, use, system, risks, security, development, including, safety

Topic 3:
secretary, national, including, subsection, director, research, technology, program, defense, paragraph


In [42]:
topic_columns = [f'Topic_{i+1}' for i in range(n_topics)]
df_topics = pd.DataFrame(lda_output, columns=topic_columns)
df = pd.concat([df, df_topics], axis=1)

In [43]:
print("\nExample documents for each topic:")
for topic_idx in range(n_topics):
    print(f"\nTop documents for Topic {topic_idx + 1}:")
    top_docs = df.nlargest(3, f'Topic_{topic_idx+1}')
    for idx, row in top_docs.iterrows():
        print(f"\Text: {row['full_text_preprocessed'][:200]}...")
        print(f"Topic {topic_idx + 1} probability: {row[f'Topic_{topic_idx+1}']:.3f}")


Example documents for each topic:

Top documents for Topic 1:
\Text: enacted state utah b amended read b definitions used chapter child sexual abuse material means visual depiction including live performance photograph film video picture computer computergenerated imag...
Topic 1 probability: 0.999
\Text: relating elections amending enacting sections campaign reporting adding disclaimer requirements advertisements containing materially deceptive media creating crime distributing entering agreement anot...
Topic 1 probability: 0.999
\Text: short title cited tools address known exploitation immobilizing technological deepfakes websites networks take sec criminal prohibition intentional disclosure nonconsensual intimate visual depictions ...
Topic 1 probability: 0.999

Top documents for Topic 2:
\Text: hiroshima process international code conduct organizations developing advanced ai systems basis international guiding principles organizations developing advanced ai systems international 

### BERTopic

In [36]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN

  from .autonotebook import tqdm as notebook_tqdm


In [55]:
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')
hdbscan_model = HDBSCAN(min_cluster_size=20, metric='euclidean', prediction_data=True)
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    min_topic_size=10,     # Merges tiny topics into larger ones, when set to 20 only 2 topics
    verbose=True,
    calculate_probabilities=True
)

In [56]:
topics, probs = topic_model.fit_transform(df['full_text_preprocessed'])

2025-05-28 12:20:26,124 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|█████████████████████████████████████████████████████████████████████████| 21/21 [01:27<00:00,  4.14s/it]
2025-05-28 12:21:54,834 - BERTopic - Embedding - Completed ✓
2025-05-28 12:21:54,837 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-05-28 12:21:58,363 - BERTopic - Dimensionality - Completed ✓
2025-05-28 12:21:58,366 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-05-28 12:21:58,557 - BERTopic - Cluster - Completed ✓
2025-05-28 12:21:58,572 - BERTopic - Representation - Extracting topics from clusters using representation models.
2025-05-28 12:22:00,691 - BERTopic - Representation - Completed ✓


In [63]:
print("\nTop words for each topic:")
for topic_id in topic_model.get_topics():
    if topic_id != -1:  # Skip the outlier topic (-1)
        words = topic_model.get_topic(topic_id)
        print(f"\nTopic {topic_id}:")
        print(", ".join([word for word, _ in words[:10]]))  # Show top 10 words


Top words for each topic:

Topic 0:
defense, education, director, program, research, secretary, subsection, national, including, paragraph

Topic 1:
ai, systems, system, data, use, article, security, risks, model, development

Topic 2:
energy, research, secretary, national, program, including, weather, development, technologies, data

Topic 3:
person, individual, election, media, image, audio, means, visual, sexual, video

Topic 4:
automated, system, data, decision, information, use, state, employer, used, systems

Topic 5:
covered, foreign, entity, president, secretary, security, regulations, term, national, activity

Topic 6:
health, care, plan, services, medical, patient, ai, use, program, benefits

Topic 7:
commission, agency, council, data, digital, criticalimpact, chief, government, state, including


In [17]:
BERT_probs = pd.DataFrame(probs, columns=['BERT_topic0', 'BERT_topic1', 'BERT_topic2', 'BERT_topic3', 'BERT_topic4', 'BERT_topic5', 'BERT_topic6', 'BERT_topic7'])
df = pd.concat([df, BERT_probs], axis=1)

NameError: name 'probs' is not defined

In [98]:
dummy_topic_indices = np.where(np.array(topics) == -1)[0]
dummy_topic_indices

array([  4,   9,  20,  22,  25,  26,  34,  35,  36,  37,  38,  39,  40,
        41,  42,  43,  44,  45,  53,  54,  56,  82,  83,  91,  92,  93,
       101, 105, 107, 116, 117, 123, 132, 137, 143, 149, 167, 174, 175,
       176, 178, 180, 181, 182, 183, 184, 185, 187, 188, 189, 195, 198,
       210, 218, 220, 222, 225, 229, 232, 234, 238, 239, 243, 246, 248,
       251, 252, 253, 261, 269, 271, 272, 283, 290, 291, 293, 296, 297,
       298, 301, 302, 306, 307, 310, 315, 316, 317, 318, 321, 324, 325,
       337, 340, 341, 343, 344, 353, 357, 358, 361, 373, 374, 381, 386,
       390, 399, 400, 402, 411, 414, 416, 417, 423, 425, 427, 430, 432,
       437, 442, 443, 448, 450, 452, 454, 456, 460, 462, 464, 467, 470,
       472, 474, 476, 478, 479, 486, 488, 490, 495, 498, 499, 502, 503,
       504, 505, 507, 508, 509, 514, 515, 516, 517, 518, 519, 527, 532,
       533, 534, 535, 537, 547, 548, 552, 554, 555, 558, 559, 566, 568,
       569, 570, 582, 583, 584, 585, 586, 588, 589, 600, 602, 60

In [108]:
df = df.drop(dummy_topic_indices)
df = df.reset_index(drop=True)

In [123]:
df.to_csv('agora_topic_probabilities.csv')

In [147]:
df = pd.read_csv('intermediate_data/agora_topic_probabilities.csv')
cols_to_drop = [
    'Unnamed: 0', 'AGORA ID', 'Official name', 'Casual name', 'Link to document', 'Collections',
    'full_text_preprocessed', 'full_text', 'Tags', 'Short summary', 'Long summary', 'Summaries and tags may include unreviewed machine output',
    'Official plaintext retrieved', 'Official plaintext source', 'Official plaintext unavailable/infeasible', 'Official pdf source', 'Official pdf retrieved'
]
df = df.drop(columns=[col for col in cols_to_drop if col in df.columns])
print(df['Authority'].unique())

authorities_to_drop = [
    'Government of Israel', 'Government of New Zealand', 'Government of Canada',
    'Government of Australia', 'Government of the United Kingdom',
    'Chinese central government', 'Chinese provincial and local governments',
    'European Union', 'United Nations', 'OECD', 'Other multinational'
]
df = df[~df['Authority'].isin(authorities_to_drop)]

federal_legislative = ['United States Congress']
federal_executive = [
    'Executive Office of the President', 'Department of Defense',
    'Department of Commerce', 'Department of Agriculture',
    'Department of Health and Human Services', 'Department of Education',
    'Office of Management and Budget', 'Federal Election Commission',
    'National Institute of Standards and Technology',
    'Copyright Office, Library of Congress'
]
us_states = [
    'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado',
    'Connecticut', 'Delaware', 'Florida', 'Georgia', 'Hawaii', 'Idaho',
    'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana',
    'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota',
    'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada',
    'New Hampshire', 'New Jersey', 'New Mexico', 'New York',
    'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon',
    'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota',
    'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington',
    'West Virginia', 'Wisconsin', 'Wyoming', 'District of Columbia'
]
def categorize(entity):
    if entity in federal_legislative:
        return 'federal_legislative'
    elif entity in federal_executive:
        return 'federal_executive'
    elif entity in us_states:
        return entity
    else:
        return None


df['gov_category'] = df['Authority'].apply(categorize)
df = df[df['gov_category'].notna()]
df['gov_category'].value_counts()



['United States Congress' 'Other authorities' 'Idaho'
 'District of Columbia' 'Government of Israel' 'Government of New Zealand'
 'United Nations' 'California' 'Illinois' 'Utah' 'Arizona'
 'Other multinational' 'Indiana' 'Mississippi' 'New York' 'West Virginia'
 'Tennessee' 'European Union' 'Chinese central government'
 'Private-sector companies' 'Executive Office of the President' 'Colorado'
 'Washington' 'Nebraska' 'Maryland'
 'Chinese provincial and local governments' 'Government of Canada'
 'South Dakota' 'Alabama' 'New Hampshire' 'Connecticut' 'Oregon' 'Hawaii'
 'Iowa' 'Florida' 'Louisiana' 'Department of Defense'
 'Government of Australia' 'Office of Management and Budget' 'Wisconsin'
 'New Mexico' 'Minnesota' 'Michigan'
 'National Institute of Standards and Technology'
 'Government of the United Kingdom' 'Department of Commerce'
 'Federal Election Commission' 'OECD' 'Texas' 'Massachusetts'
 'Rhode Island' 'Arkansas' 'Virginia' 'New Jersey' 'North Carolina'
 'Pennsylvania' 'Copyr

gov_category
federal_legislative     294
federal_executive        19
California               14
New York                  9
New Jersey                4
Utah                      4
Arizona                   3
Massachusetts             3
Tennessee                 3
Pennsylvania              3
Idaho                     3
New Hampshire             3
Rhode Island              2
Alabama                   2
Texas                     2
Michigan                  2
Florida                   2
Hawaii                    2
Connecticut               2
Colorado                  2
Illinois                  2
Indiana                   2
Mississippi               2
West Virginia             1
North Dakota              1
District of Columbia      1
North Carolina            1
Virginia                  1
Arkansas                  1
South Dakota              1
Maryland                  1
Minnesota                 1
New Mexico                1
Wisconsin                 1
Louisiana                 1
Washing

In [148]:
blue_states = [
    'California', 'Connecticut', 'Delaware', 'Hawaii', 'Illinois', 'Maine',
    'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'New Jersey',
    'New Mexico', 'New York', 'Oregon', 'Rhode Island', 'Vermont',
    'Washington', 'Colorado', 'Nevada', 'District of Columbia'
]

red_states = [
    'Alabama', 'Arkansas', 'Idaho', 'Indiana', 'Iowa', 'Kansas', 'Kentucky',
    'Louisiana', 'Mississippi', 'Missouri', 'Montana', 'Nebraska',
    'North Dakota', 'Oklahoma', 'South Carolina', 'South Dakota',
    'Tennessee', 'Texas', 'Utah', 'West Virginia', 'Wyoming'
]

purple_states = [
    'Arizona', 'Florida', 'Georgia', 'New Hampshire', 'North Carolina',
    'Ohio', 'Pennsylvania', 'Virginia', 'Wisconsin'  
]

def color_label(entity):
    if entity in blue_states:
        return 'blue_state'
    elif entity in red_states:
        return 'red_state'
    elif entity in purple_states:
        return 'purple_state'
    else:
        return entity 

df['gov_category_consolidated'] = df['gov_category'].apply(color_label)
df['gov_category_consolidated'].value_counts()


gov_category_consolidated
federal_legislative    294
blue_state              48
red_state               26
federal_executive       19
purple_state            14
Name: count, dtype: int64

In [149]:
df = df.drop(columns=['Authority', 'LDA_topic1',
       'LDA_topic2', 'LDA_topic3','gov_category'], errors='ignore')

df = df.drop(columns=['Proposed date'], errors='ignore')
df['enacted_binary'] = (df['Most recent activity'] == 'Enacted').astype(int)
df = df.drop(columns=['Most recent activity', 'Validated?'], errors='ignore')
df.columns





Index(['Most recent activity date', 'Annotated?',
       'Primarily applies to the government',
       'Primarily applies to the private sector', 'Number of segments created',
       'Applications: Agriculture and resource extraction',
       'Applications: Arts, sports, leisure, travel, and lifestyle',
       'Applications: Broadcasting and media production',
       'Applications: Business services and analytics',
       'Applications: Construction and field services',
       'Applications: Consumer goods', 'Applications: Education',
       'Applications: Energy and utilities',
       'Applications: Finance and investment',
       'Applications: Government: benefits and welfare',
       'Applications: Government: judicial and law enforcement',
       'Applications: Government: military and public safety',
       'Applications: Government: other applications/unspecified',
       'Applications: Manufacturing and process automation',
       'Applications: Medicine, life sciences and publ

In [151]:
# Separate the column you want to exclude
date_col = df[['Most recent activity date']]

# Apply get_dummies to the rest
df_encoded = pd.get_dummies(df.drop(columns=['Most recent activity date']))

# Concatenate back the excluded column
df = pd.concat([df_encoded, date_col], axis=1)
# Convert all bool columns to 0/1 integers
for col in df_encoded.select_dtypes(include='bool').columns:
    df_encoded[col] = df_encoded[col].astype(int)
df = pd.concat([df_encoded, date_col], axis=1)
df['enacted_binary'].value_counts()

enacted_binary
0    229
1    172
Name: count, dtype: int64

In [164]:
# Calculate days after 2019-02-11 for each date in 'Most recent activity date'
df['datedummy'] = (df['Most recent activity date'].astype('datetime64[ns]') - pd.to_datetime('2019-02-11')).dt.days
df.columns

Index(['Annotated?', 'Primarily applies to the government',
       'Primarily applies to the private sector', 'Number of segments created',
       'Applications: Agriculture and resource extraction',
       'Applications: Arts, sports, leisure, travel, and lifestyle',
       'Applications: Broadcasting and media production',
       'Applications: Business services and analytics',
       'Applications: Construction and field services',
       'Applications: Consumer goods', 'Applications: Education',
       'Applications: Energy and utilities',
       'Applications: Finance and investment',
       'Applications: Government: benefits and welfare',
       'Applications: Government: judicial and law enforcement',
       'Applications: Government: military and public safety',
       'Applications: Government: other applications/unspecified',
       'Applications: Manufacturing and process automation',
       'Applications: Medicine, life sciences and public health',
       'Applications: Ne

In [175]:
df = df.drop(columns=['Most recent activity date'], errors='ignore')
df

Unnamed: 0,Primarily applies to the government,Primarily applies to the private sector,Applications: Agriculture and resource extraction,"Applications: Arts, sports, leisure, travel, and lifestyle",Applications: Broadcasting and media production,Applications: Business services and analytics,Applications: Construction and field services,Applications: Consumer goods,Applications: Education,Applications: Energy and utilities,...,BERT_topic5,BERT_topic6,BERT_topic7,enacted_binary,gov_category_consolidated_blue_state,gov_category_consolidated_federal_executive,gov_category_consolidated_federal_legislative,gov_category_consolidated_purple_state,gov_category_consolidated_red_state,datedummy
0,0,0,0,0,0,0,0,0,0,0,...,1.493676e-01,7.192617e-02,1.225086e-01,0,0,0,1,0,0,1627
2,0,0,0,0,0,0,0,0,0,0,...,1.251081e-308,1.000000e+00,1.897701e-308,1,0,0,0,0,1,1985
3,0,1,0,0,0,1,0,0,0,0,...,1.766464e-02,3.240372e-02,3.911828e-02,0,1,0,0,0,0,1460
7,0,0,0,0,0,0,0,0,0,0,...,3.692844e-02,5.720213e-02,6.529173e-02,1,1,0,0,0,0,2057
8,0,0,0,0,0,0,0,0,0,0,...,2.029906e-308,2.304330e-308,1.000000e+00,1,1,0,0,0,0,1962
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
450,1,0,0,0,0,0,0,0,0,0,...,6.847546e-02,3.780430e-02,9.176968e-02,1,0,0,1,0,0,1050
451,1,0,0,0,0,0,0,0,0,0,...,1.031788e-01,5.254648e-02,1.384755e-01,1,0,0,1,0,0,1050
452,1,0,0,0,0,0,0,0,0,0,...,1.205962e-01,6.213679e-02,1.266515e-01,1,0,0,1,0,0,1050
453,1,0,0,0,0,0,0,0,0,0,...,1.266922e-01,7.054527e-02,1.175328e-01,1,0,0,1,0,0,1050


In [179]:
df = df.drop(columns=['Annotated?', 'Number of segments created'], errors='ignore')
# Drop the 'federal_executive' dummy to use as baseline
if 'gov_category_consolidated_federal_executive' in df.columns:
    df = df.drop(columns=['gov_category_consolidated_federal_executive'])
# Drop primarily applies to the government (dummy with private secotr)
if 'Primarily applies to the government' in df.columns:
    df = df.drop(columns=['Primarily applies to the government'])
df


Unnamed: 0,Primarily applies to the private sector,Applications: Agriculture and resource extraction,"Applications: Arts, sports, leisure, travel, and lifestyle",Applications: Broadcasting and media production,Applications: Business services and analytics,Applications: Construction and field services,Applications: Consumer goods,Applications: Education,Applications: Energy and utilities,Applications: Finance and investment,...,BERT_topic4,BERT_topic5,BERT_topic6,BERT_topic7,enacted_binary,gov_category_consolidated_blue_state,gov_category_consolidated_federal_legislative,gov_category_consolidated_purple_state,gov_category_consolidated_red_state,datedummy
0,0,0,0,0,0,0,0,0,0,0,...,7.718589e-02,1.493676e-01,7.192617e-02,1.225086e-01,0,0,1,0,0,1627
2,0,0,0,0,0,0,0,0,0,0,...,2.261265e-308,1.251081e-308,1.000000e+00,1.897701e-308,1,0,0,0,1,1985
3,1,0,0,0,1,0,0,0,0,1,...,8.199923e-01,1.766464e-02,3.240372e-02,3.911828e-02,0,1,0,0,0,1460
7,0,0,0,0,0,0,0,0,0,0,...,1.148070e-01,3.692844e-02,5.720213e-02,6.529173e-02,1,1,0,0,0,2057
8,0,0,0,0,0,0,0,0,0,0,...,4.104355e-308,2.029906e-308,2.304330e-308,1.000000e+00,1,1,0,0,0,1962
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
450,0,0,0,0,0,0,0,0,0,0,...,3.702525e-02,6.847546e-02,3.780430e-02,9.176968e-02,1,0,1,0,0,1050
451,0,0,0,0,0,0,0,0,0,0,...,5.121321e-02,1.031788e-01,5.254648e-02,1.384755e-01,1,0,1,0,0,1050
452,0,0,0,0,0,0,0,0,0,0,...,6.821957e-02,1.205962e-01,6.213679e-02,1.266515e-01,1,0,1,0,0,1050
453,0,0,0,0,0,0,0,0,0,0,...,6.611540e-02,1.266922e-01,7.054527e-02,1.175328e-01,1,0,1,0,0,1050


In [190]:
import statsmodels.api as sm
import numpy as np

# Define y and X
y = df['enacted_binary']

# Drop y and any non-numeric columns from X
X = df.drop(columns=['enacted_binary'])

# Ensure all data is numeric and no object dtype remains
X = X.apply(pd.to_numeric, errors='coerce')
y = pd.to_numeric(y, errors='coerce')

# # Fit a simple linear regression model
# X_with_const = sm.add_constant(X)
# model = sm.OLS(y, X_with_const, missing='drop')
# results = model.fit()
# print(results.summary())

# # Print columns with |t| > 2 (excluding the constant)
# t_stats = results.tvalues.drop('const', errors='ignore')
# cols_over_2 = t_stats[abs(t_stats) > 2].index.tolist()
# print("Columns with |t| > 2:")
# for col in cols_over_2:
#     print(col)

# Run a linear regression excluding columns that start with 'Strategies' or 'Applications'
exclude_prefixes = ('Strategies', 'Applications', 'Risk factors', 'Incentives', 'Harms')
cols_to_exclude = [col for col in X.columns if col.startswith(exclude_prefixes)]
X_no_strat_app = X.drop(columns=cols_to_exclude, errors='ignore')

# Add constant and fit the model
X_no_strat_app_const = sm.add_constant(X_no_strat_app)
model_no_strat_app = sm.OLS(y, X_no_strat_app_const, missing='drop')
results_no_strat_app = model_no_strat_app.fit()
print("\nLinear regression excluding all of the content-related columns:")
print(results_no_strat_app.summary())



Linear regression excluding all of the content-related columns:
                            OLS Regression Results                            
Dep. Variable:         enacted_binary   R-squared:                       0.528
Model:                            OLS   Adj. R-squared:                  0.511
Method:                 Least Squares   F-statistic:                     30.87
Date:                Thu, 29 May 2025   Prob (F-statistic):           1.91e-54
Time:                        12:38:57   Log-Likelihood:                -136.32
No. Observations:                 401   AIC:                             302.6
Df Residuals:                     386   BIC:                             362.5
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
                                                    coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------

In [193]:
# Add interaction terms between 'datedummy' and all gov_category variables
import re

logit_model = sm.Logit(y, X_no_strat_app_const, missing='drop')
logit_results = logit_model.fit()
print("\nLogit regression excluding all of the content-related columns, with interaction terms for datedummy on all gov_category vars:")
print(logit_results.summary())

# Identify significant variables (p < 0.05) from the previous logit regression
signif_vars = logit_results.pvalues[logit_results.pvalues < 0.1].index.tolist()
# Remove 'const' if present
signif_vars = [var for var in signif_vars if var != 'const']

if signif_vars:
    X_signif = sm.add_constant(X_no_strat_app[signif_vars])
    logit_model_signif = sm.Logit(y, X_signif, missing='drop')
    logit_results_signif = logit_model_signif.fit()
    print("\nLogit regression using only significant variables (p < 0.05):")
    print(logit_results_signif.summary())
else:
    print("No significant variables (p < 0.05) found in the previous logit regression.")


Optimization terminated successfully.
         Current function value: 0.318874
         Iterations 8

Logit regression excluding all of the content-related columns, with interaction terms for datedummy on all gov_category vars:
                           Logit Regression Results                           
Dep. Variable:         enacted_binary   No. Observations:                  401
Model:                          Logit   Df Residuals:                      386
Method:                           MLE   Df Model:                           14
Date:                Thu, 29 May 2025   Pseudo R-squ.:                  0.5331
Time:                        12:46:37   Log-Likelihood:                -127.87
converged:                       True   LL-Null:                       -273.89
Covariance Type:            nonrobust   LLR p-value:                 5.396e-54
                                                    coef    std err          z      P>|z|      [0.025      0.975]
-------------------------