Import some data from govtrack.us.

We will create a file on Colab's hard drive. There are more places to load data from: https://neptune.ai/blog/google-colab-dealing-with-files

In [None]:
import csv
import os
from typing import Tuple

import bs4
import requests
# "https://www.govtrack.us/congress/bills/116/hr221/summary"
# "https://www.govtrack.us/congress/bills/116/hr221"

NO_SUCH_BILL = "NO SUCH BILL"


def get_summary(bill_id: int) -> str:
    summary_url = f"https://www.govtrack.us/congress/bills/116/hr{bill_id}/summary"
    summary_html = requests.get(summary_url)
    if summary_html.status_code == 404:
        return NO_SUCH_BILL

    soup = bs4.BeautifulSoup(summary_html.text, 'html.parser')
    summary = soup.find("div", {"id": "libraryofcongress"})
    summary_parts = []
    if not hasattr(summary, "contents"):
        print()
    if len(summary.contents) < 4:
        if "No summary available." in summary.text:
            return "No summary available."
        print(summary.text)
        raise TypeError('uh oh')

    for element in summary.contents[3]:
        if "<script>" in str(element):
            continue
        if not hasattr(element, "text"):
            continue
        text = element.text
        if text.strip():
            summary_parts.append(text)
    return "\n".join(summary_parts)


def get_status(bill_id: int) -> str:
    status_url = f"https://www.govtrack.us/congress/bills/116/hr{bill_id}"
    status_html = requests.get(status_url)
    if status_html.status_code == 404:
        return NO_SUCH_BILL

    soup = bs4.BeautifulSoup(status_html.text, 'html.parser')
    # oh this is so fragile
    rows = soup.findAll("div", {"class": ["row"]})
    status = None
    for row in rows:
        status = row.findAll("div", {"class": ["col-sm-9", "col-md-10"]})
        if status:
            break
    if not status:
        raise TypeError("Can't find it")
    summary_parts = []
    if "Died in a previous Congress" in str(status[1].contents):
        return "Died in a previous Congress"
    if "incorporated" in str(status[1].contents):
        return "Incorporated into another bill"
    for element in status[1].contents:
        if "<script>" in str(element):
            continue
        if not hasattr(element, "text"):
            continue
        text = element.text
        if text.strip() and "—" in text:
            summary_parts.append(text.split("—")[0].strip())
    if not summary_parts:
        print("Uh oh")
        print(status[1].contents)
    return "\n".join(summary_parts)


def locate_file(file_name: str, executing_file: str) -> str:
    """
    Find file relative to a source file, e.g.
    locate("foo/bar.txt", __file__)

    Succeeds regardless to context of execution
    """
    file_path = os.path.join(
        os.path.dirname(os.path.abspath(executing_file)), file_name
    )
    return file_path


def load_all(start: int, end: int) -> Tuple[int, int]:
    # file_name = locate_file("data2.csv", __file__)
    file_name = "data.csv"
    try:
      os.remove(file_name)
    except FileNotFoundError:
      pass
    success = 0
    errors = 0
    with open(file_name, 'a', newline='') as csvfile:
        for bill_id in range(start, end):
            try:
                summary = get_summary(bill_id=bill_id)
                if summary == NO_SUCH_BILL:
                    continue
                status = get_status(bill_id=bill_id)
                writer = csv.writer(csvfile)
                writer.writerow([bill_id, status, summary])
                print(bill_id, "status", status, "summary",
                      summary.split("\n")[0][0:80] + "...")
                success += 1
            except Exception as ex:
                errors += 1
                print(ex)
    return success, errors


Load data from original source. This is slow, consider using the prebuilt dataset in next step.

In [None]:
load_all(1, 20)

1 status Died in a previous Congress summary For the People Act of 2019...
2 status Died in a previous Congress summary Investing in a New Vision for the Environment and Surface Transportation in Amer...
3 status Died in a previous Congress summary Elijah E. Cummings Lower Drug Costs Now Act...
4 status Died in a previous Congress summary Voting Rights Advancement Act of 2019...
5 status Died in a previous Congress summary Equality Act...
6 status Died in a previous Congress summary American Dream and Promise Act of 2019...
7 status Died in a previous Congress summary Paycheck Fairness Act...
8 status Died in a previous Congress summary Bipartisan Background Checks Act of 2019...
9 status Died in a previous Congress summary Climate Action Now Act...
11 status Died in a previous Congress summary No summary available....
12 status Died in a previous Congress summary China Task Force Act or the CTF Act...
13 status Died in a previous Congress summary No summary available....
14 status Die

(14, 0)

Here we load the data from Github. 1000 rows of data.


In [1]:
import itertools

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import pandas as pd
import io
import requests

# Use a file that you download
# input_file = "data.csv"
# comma delimited is the default
# df = pd.read_csv(input_file,
#                  names=["bill_id", "status", "summary"],
#                  encoding='cp1252')

# use a premade file
url="https://raw.githubusercontent.com/matthewdeanmartin/bills_with_nlp/main/data.csv"
s=requests.get(url).content
df=pd.read_csv(io.StringIO(s.decode('cp1252')),
              names=["bill_id", "status", "summary"])

df.head()



ModuleNotFoundError: No module named 'sklearn'

Stemming removes some noise from the data, in particular, we don't want to be so strict about what words are the same. "Apple", "Apples" should be treated as the same work. Also remove noise like numbers and things that are numberlike.

In [48]:
# show stemmer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import string

ps = PorterStemmer()
 
# choose some words to be stemmed
words = ["program", "programs", "programmer", "programming", "programmers"]
def stem(text):
  stems = (ps.stem(w.translate(str.maketrans('', '', string.punctuation))) for w in text.split())
  word_stems = filter(lambda x: x.isalpha(), stems)
  return " ".join(word_stems)

df["summary"] = df["summary"].apply(stem)

df.head()

Unnamed: 0,bill_id,status,summary
0,1,Died in a previous Congress,for the peopl act of thi bill address voter ac...
1,2,Died in a previous Congress,invest in a new vision for the environ and sur...
2,3,Died in a previous Congress,elijah E cum lower drug cost now act thi bill ...
3,4,Died in a previous Congress,vote right advanc act of thi bill establish ne...
4,5,Died in a previous Congress,equal act thi bill prohibit discrimin base on ...


Now we get the data into a format that sklearn can use-- we spit columns into X's (predictors) and Y's (things to be predicted)


In [58]:
# Getting features of dataframe
# Same as SELECT feature1, feature2, feature3 FROM data
# X = data[["feature1", "feature2", "feature3"]]
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df["summary"])
print(vectorizer.get_feature_names())

y = df['status']  # Labels

pd.DataFrame(X).head()
pd.DataFrame(y).head()

# Split dataset into training set and test set
# Set random state to make runs repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.25, 
                                                    random_state=42)
print("Training Xs")
pd.DataFrame(X_train).head()

print("Testing Xs")
pd.DataFrame(X_train).head()


['abandon', 'abat', 'abba', 'abbi', 'abduct', 'abil', 'abingdon', 'abkhazia', 'abl', 'aboard', 'abolish', 'abort', 'abortionrel', 'about', 'abov', 'abovethelin', 'abraham', 'abridg', 'abroad', 'absenc', 'absent', 'abu', 'abut', 'academ', 'academi', 'acadia', 'accel', 'accept', 'access', 'accessori', 'accid', 'accommod', 'accompani', 'accomplish', 'accord', 'account', 'accredit', 'accru', 'accur', 'accuraci', 'achiev', 'acidif', 'acknowledg', 'acl', 'aco', 'acoronaviru', 'acoverag', 'acquir', 'acquisit', 'acr', 'acreag', 'acreforacr', 'across', 'act', 'action', 'activ', 'activeduti', 'actor', 'actthi', 'actual', 'acut', 'ad', 'adam', 'adapt', 'add', 'addit', 'additiondo', 'address', 'adequ', 'adequaci', 'adher', 'adjac', 'adjud', 'adjudicatori', 'adjust', 'administ', 'administr', 'admiss', 'admit', 'admitt', 'adopt', 'adrenoleukodystrophi', 'adult', 'advanc', 'advancededuc', 'advantag', 'adver', 'adversari', 'adverti', 'advi', 'advic', 'advisor', 'advisori', 'advoc', 'advocaci', 'aemplo

Unnamed: 0,0
0,"(0, 1742)\t9\n (0, 4526)\t10\n (0, 53)\t1\..."
1,"(0, 1742)\t5\n (0, 4526)\t15\n (0, 53)\t2\..."
2,"(0, 1742)\t4\n (0, 4526)\t4\n (0, 53)\t2\n..."
3,"(0, 1742)\t4\n (0, 4526)\t4\n (0, 3350)\t1..."
4,"(0, 1742)\t2\n (0, 4526)\t24\n (0, 53)\t1\..."


In [None]:

# Random forests are a good model for when you have many features and relatively
# few data points (e.g. 10,000s of predictive words, yet only 100s of bills)
# Set random state to make runs repeatable
clf = RandomForestClassifier(n_estimators=250, random_state=42)

# Train the model using the training sets
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

pd.DataFrame(y_pred).head()

# Model Accuracy, how often is the classifier correct?
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print(clf.feature_importances_)
i = 0

# should use pandas way...
importances = list(zip(vectorizer.get_feature_names(), clf.feature_importances_))
importances.sort(key=lambda x:x[1])
importances = filter(lambda score: score[1] > 0, importances)

i = 0
for feat, importance in itertools.islice(importances, 20):
    if importance == 0:
        continue
    print('feature: {f}, importance: {i}'.format(f=feat, i=importance))
    i +=1 

print(f"Features scored: {i}")

# new_bill_text =
# new_bill = vectorizer.fit_transform()
# ad_hoc = clf.predict([""])


