# Convert US Congressional bills dataset to Annif corpora

This notebook converts the [bill_labels_us](https://huggingface.co/datasets/dreamproit/bill_labels_us) dataset to corpora suitable for use with the [Annif](https://annif.org) automated subject indexing tool. With these corpora, Annif can be trained to predict either policy areas or legislative subjects based on the title and text of a bill.

In [24]:
from datasets import load_dataset

# Upstream publishes only a "train" split, so that is the one requested here.
dataset = load_dataset(path="dreamproit/bill_labels_us", split="train")
dataset

Dataset({
    features: ['id', 'congress', 'bill_type', 'bill_number', 'bill_version', 'title', 'sections', 'sections_length', 'text', 'text_length', 'policy_area', 'legislative_subjects'],
    num_rows: 119569
})

In [25]:
# Convert policy areas to Annif CSV vocabulary

import csv

FIELDS = ('uri', 'label_en')
OUTFILE = 'policy_area.csv'

# Distinct policy areas in alphabetical order; the 1-based position of each
# label in this list becomes the numeric suffix of its URI.
policy_areas = sorted(set(dataset['policy_area']))
# Maps each policy area label to its URI; filled in by the loop below.
policy_area_uri = {}

# This page lists the policy areas. It doesn't have any IDs or HTML anchors to each individual policy area.
# Using a hash URI is the best we can do here, although it won't point to the specific item on the page.
URIBASE = 'https://www.congress.gov/help/field-values/policy-area#'

# Pass encoding='utf-8' explicitly so the vocabulary file is written the same
# way regardless of the platform's default locale encoding.
with open(OUTFILE, 'w', newline='', encoding='utf-8') as outfile:
    writer = csv.DictWriter(outfile, fieldnames=FIELDS)
    writer.writeheader()

    for idx, policy_area in enumerate(policy_areas, start=1):
        uri = URIBASE + str(idx)
        item = {
            'uri': uri,
            'label_en': policy_area,
        }
        print(f"{uri}\t{policy_area}")
        policy_area_uri[policy_area] = uri
        writer.writerow(item)

    # Exactly one row is written per policy area, so no running counter is needed.
    count = len(policy_areas)
    print(f"wrote {count} policy areas into {OUTFILE}")

https://www.congress.gov/help/field-values/policy-area#1	Agriculture and Food
https://www.congress.gov/help/field-values/policy-area#2	Animals
https://www.congress.gov/help/field-values/policy-area#3	Armed Forces and National Security
https://www.congress.gov/help/field-values/policy-area#4	Arts, Culture, Religion
https://www.congress.gov/help/field-values/policy-area#5	Civil Rights and Liberties, Minority Issues
https://www.congress.gov/help/field-values/policy-area#6	Commemorations
https://www.congress.gov/help/field-values/policy-area#7	Commerce
https://www.congress.gov/help/field-values/policy-area#8	Congress
https://www.congress.gov/help/field-values/policy-area#9	Crime and Law Enforcement
https://www.congress.gov/help/field-values/policy-area#10	Economics and Public Finance
https://www.congress.gov/help/field-values/policy-area#11	Education
https://www.congress.gov/help/field-values/policy-area#12	Emergency Management
https://www.congress.gov/help/field-values/policy-area#13	Ener

In [26]:
# Convert legislative subjects to Annif CSV vocabulary

import csv

FIELDS = ('uri', 'label_en')
OUTFILE = 'legislative_subjects.csv'

# Every row carries a list of subject terms; flatten all rows into one set of
# distinct terms and sort it so the numbering below is stable for a given dataset.
subjects = sorted({term for row in dataset['legislative_subjects'] for term in row})
# Maps each subject term to its URI; filled in by the loop below.
subject_uri = {}

# This page lists the legislative subject terms. It doesn't have any IDs or HTML anchors to each individual subject.
# Using a hash URI is the best we can do here, although it won't point to the specific item on the page.
URIBASE = 'https://www.congress.gov/help/field-values/legislative-subject-terms#'

# Pass encoding='utf-8' explicitly so the vocabulary file is written the same
# way regardless of the platform's default locale encoding.
with open(OUTFILE, 'w', newline='', encoding='utf-8') as outfile:
    writer = csv.DictWriter(outfile, fieldnames=FIELDS)
    writer.writeheader()

    for idx, subject_term in enumerate(subjects, start=1):
        uri = URIBASE + str(idx)
        item = {
            'uri': uri,
            'label_en': subject_term,
        }
        # Only echo the first ten terms to keep the cell output short.
        if idx <= 10:
            print(f"{uri}\t{subject_term}")
        subject_uri[subject_term] = uri
        writer.writerow(item)

    # Exactly one row is written per subject term, so no running counter is needed.
    count = len(subjects)
    print(f"wrote {count} subjects into {OUTFILE}")


https://www.congress.gov/help/field-values/legislative-subject-terms#1	AIDS (Disease)
https://www.congress.gov/help/field-values/legislative-subject-terms#2	ASEAN countries
https://www.congress.gov/help/field-values/legislative-subject-terms#3	Abandonment of family
https://www.congress.gov/help/field-values/legislative-subject-terms#4	Abortion
https://www.congress.gov/help/field-values/legislative-subject-terms#5	Absentee voting
https://www.congress.gov/help/field-values/legislative-subject-terms#6	Academic freedom
https://www.congress.gov/help/field-values/legislative-subject-terms#7	Academic performance
https://www.congress.gov/help/field-values/legislative-subject-terms#8	Academic performance and assessments
https://www.congress.gov/help/field-values/legislative-subject-terms#9	Access Board
https://www.congress.gov/help/field-values/legislative-subject-terms#10	Access to airports
wrote 4658 subjects into legislative_subjects.csv


In [28]:
# Split the dataset into 90% train, 10% test subsets.
# The split shuffles the rows, so a fixed seed is passed to make the same
# train/test partition come out on every run of the notebook.
ds_split = dataset.train_test_split(test_size=0.1, seed=42)
ds_split

DatasetDict({
    train: Dataset({
        features: ['id', 'congress', 'bill_type', 'bill_number', 'bill_version', 'title', 'sections', 'sections_length', 'text', 'text_length', 'policy_area', 'legislative_subjects'],
        num_rows: 107612
    })
    test: Dataset({
        features: ['id', 'congress', 'bill_type', 'bill_number', 'bill_version', 'title', 'sections', 'sections_length', 'text', 'text_length', 'policy_area', 'legislative_subjects'],
        num_rows: 11957
    })
})

In [46]:
%%time

# create the policy area corpora from the train and test sets

import gzip

def normalize_text(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Leading and trailing whitespace is dropped as well, so the result never
    contains tabs or newlines.

    Returns an empty string when ``text`` cannot be normalized as a string
    (for example ``None``, a number, or bytes).
    """
    try:
        return ' '.join(text.split())
    except (AttributeError, TypeError):
        # AttributeError: the value has no .split() (e.g. None or an int).
        # TypeError: .split() produced non-str pieces (e.g. bytes input).
        # Anything else (KeyboardInterrupt included) is allowed to propagate.
        return ''

def to_policy_area_corpus(ds, outfile):
    """Write one corpus line to ``outfile`` for every record in ``ds``.

    A line consists of the normalized title and text joined by `` ¤ ``,
    followed by a tab and the record's policy area URI in angle brackets.
    The URI is looked up in the module-level ``policy_area_uri`` mapping.
    """
    for record in ds:
        document = ' ¤ '.join((
            normalize_text(record['title']),
            normalize_text(record['text']),
        ))
        area_uri = policy_area_uri[record['policy_area']]
        print(document, f"<{area_uri}>", sep='\t', file=outfile)

# Write one gzipped TSV corpus per split. Every line contains the non-ASCII
# '¤' separator, so the text encoding is set to UTF-8 explicitly instead of
# being left to the platform's default locale encoding.
for split_name in ('train', 'test'):
    with gzip.open(f'policy_area-{split_name}.tsv.gz', 'wt', encoding='utf-8') as outfile:
        to_policy_area_corpus(ds_split[split_name], outfile)

CPU times: user 2min 51s, sys: 458 ms, total: 2min 51s
Wall time: 2min 51s


In [48]:
%%time

# create the legislative subject corpora from the train and test sets

import gzip

def normalize_text(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Leading and trailing whitespace is dropped as well, so the result never
    contains tabs or newlines.

    Returns an empty string when ``text`` cannot be normalized as a string
    (for example ``None``, a number, or bytes).
    """
    try:
        return ' '.join(text.split())
    except (AttributeError, TypeError):
        # AttributeError: the value has no .split() (e.g. None or an int).
        # TypeError: .split() produced non-str pieces (e.g. bytes input).
        # Anything else (KeyboardInterrupt included) is allowed to propagate.
        return ''

def to_legislative_subject_corpus(ds, outfile):
    """Write one corpus line to ``outfile`` for every record in ``ds``.

    A line consists of the normalized title and text joined by `` ¤ ``,
    followed by a tab and the space-separated, angle-bracketed URIs of the
    record's legislative subjects. URIs are looked up in the module-level
    ``subject_uri`` mapping.
    """
    for record in ds:
        document = ' ¤ '.join((
            normalize_text(record['title']),
            normalize_text(record['text']),
        ))
        uri_field = ' '.join(
            f"<{subject_uri[term]}>" for term in record['legislative_subjects']
        )
        print(document, uri_field, sep='\t', file=outfile)

# Write one gzipped TSV corpus per split. Every line contains the non-ASCII
# '¤' separator, so the text encoding is set to UTF-8 explicitly instead of
# being left to the platform's default locale encoding.
for split_name in ('train', 'test'):
    with gzip.open(f'legislative_subject-{split_name}.tsv.gz', 'wt', encoding='utf-8') as outfile:
        to_legislative_subject_corpus(ds_split[split_name], outfile)

CPU times: user 3min 52s, sys: 583 ms, total: 3min 53s
Wall time: 3min 53s
