In [79]:
"""
This notebook will be hard for anyone else to use because I didn't package in the source files (bill analyses) I 
downloaded from TLO. The commands I used to get them were: 

wget -r -nH -nd -np -R index.html* ftp://ftp.legis.state.tx.us/bills/85R/analysis/html/senate_bills/
wget -r -nH -nd -np -R index.html* ftp://ftp.legis.state.tx.us/bills/85R/analysis/html/house_bills/

The code in the notebook scrapes out the sections labeled "BACKGROUND AND PURPOSE" or 
"AUTHOR'S / SPONSOR'S STATEMENT OF INTENT", and collects them in a JSON file.

See the issue at https://github.com/open-austin/influence-texas/issues/10
"""

# This part is just for testing

from bs4 import BeautifulSoup

# Two sample analyses used to try the scraping helpers out by hand.
# NOTE(review): both paths point at HB (house bill) files; the trailing letter looks
# like the stage code that getRecord decodes later in this notebook rather than the
# chamber -- confirm that the senate_doc / house_doc names are what was intended.
senate_doc = "../../TXJournal/data/analysis/HB00004S.htm"
house_doc = "../../TXJournal/data/analysis/HB00053H.htm"

# Parse inside a `with` block so the file handle is closed as soon as the markup
# has been handed to BeautifulSoup, instead of being left open for the kernel's lifetime.
with open(house_doc) as markup:
    soup = BeautifulSoup(markup, 'html.parser')

# Dump the parsed document so its structure can be inspected.
print(soup.prettify())

<!-- saved from url=(0098)file:///Users/Matt/Documents/OpenAustin/TXJournal/bills/85R/analysis/html/house_bills/HB00053H.htm -->
<html>
 <head>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <meta content="Microsoft Word 14 (filtered)" name="Generator"/>
  <title>
   85(R) HB 53 - Committee Report (Substituted) version - Bill Analysis
  </title>
  <style>
   <!--
 /* Font Definitions */
 @font-face
	{font-family:Shruti;
	panose-1:2 11 5 2 4 2 4 2 2 3;}
@font-face
	{font-family:Shruti;
	panose-1:2 11 5 2 4 2 4 2 2 3;}
@font-face
	{font-family:Tahoma;
	panose-1:2 11 6 4 3 5 4 4 2 4;}
 /* Style Definitions */
 p.MsoNormal, li.MsoNormal, div.MsoNormal
	{margin:0in;
	margin-bottom:.0001pt;
	font-size:12.0pt;
	font-family:"Times New Roman","serif";}
h1
	{mso-style-link:"Heading 1 Char";
	margin:0in;
	margin-bottom:.0001pt;
	text-align:center;
	page-break-after:avoid;
	font-size:12.0pt;
	font-family:"Times New Roman","serif";
	text-decoration:underline;}
p.MsoComment

In [123]:
# Heading texts that getHeading searches for inside <u> tags and that getAnalysis
# leaves out of the collected text. The "AUTHOR'S / SPONSOR'S" entries differ only
# in where (or whether) a line break falls inside the heading.
commonHeadings = [
    "BACKGROUND AND PURPOSE",
    "AUTHOR'S /\nSPONSOR'S STATEMENT OF INTENT",
    "AUTHOR'S / SPONSOR'S\nSTATEMENT OF INTENT",
    "AUTHOR'S / SPONSOR'S STATEMENT OF INTENT",
]

def getHeading(soup, headings=None):
    """Find the underlined heading that the analysis text should be read after.

    Args:
        soup: parsed document; must provide ``find`` and ``find_all`` the way a
            BeautifulSoup object does.
        headings: heading texts to look for inside ``<u>`` tags. Defaults to the
            notebook-level ``commonHeadings`` list.

    Returns:
        The first ``<u>`` element matched against ``headings``. When nothing
        matches, the first ``<u>`` in the document if its ``<h1>`` reads
        "BILL ANALYSIS", otherwise the second ``<u>``.

    Raises:
        IndexError: if the document has too few ``<u>`` elements for the fallback.
    """
    if headings is None:
        headings = commonHeadings

    heading = soup.find(name = "u", text = headings)
    if heading is not None:
        return heading

    # Fallback for documents that use none of the known headings. A missing <h1>
    # is checked for explicitly instead of being caught by a blanket `except`,
    # so unrelated errors are no longer silently swallowed.
    underlined = soup.find_all("u")
    title = soup.find("h1")
    if title is not None and title.string == "BILL ANALYSIS":
        return underlined[0]
    return underlined[1]

In [124]:
# Locate the section heading in the sample document parsed in the first cell.
heading = getHeading(soup)

In [163]:
import re

def _cleanAnalysis(text):
    """Turn non-breaking spaces into spaces, collapse runs of spaces, and trim the ends."""
    return re.sub(" {2,}", " ", text.replace("\xa0", " ")).strip()


def getAnalysis(heading, skipHeadings=None):
    """Collect the text that follows ``heading`` up to the next section heading.

    Walks ``heading.next_elements`` and concatenates the ``.string`` of each
    element until it reaches either one of the known closing headings or any
    ``<u>`` element.

    Args:
        heading: element whose ``next_elements`` are scanned (as returned by
            getHeading).
        skipHeadings: texts that are never added to the result. Defaults to the
            notebook-level ``commonHeadings`` list.

    Returns:
        The collected text as a single string. Every exit path now applies the
        same clean-up; previously one path deleted non-breaking spaces outright,
        one returned the text uncleaned, and runs of three or more spaces were
        only partly collapsed.
    """
    if skipHeadings is None:
        skipHeadings = commonHeadings
    closingHeadings = ('RULEMAKING\nAUTHORITY', "CRIMINAL JUSTICE IMPACT")

    response = ""
    for element in heading.next_elements:
        # getattr with a default replaces the former blanket try/except blocks:
        # an element lacking these attributes is skipped instead of triggering an
        # error handler that could itself fail.
        text = getattr(element, "string", None)
        if text in closingHeadings or getattr(element, "name", None) == "u":
            break
        if text is None or text in skipHeadings:
            continue
        # Drops the characters ï, ½ and ¿ -- presumably mojibake left over from
        # decoding the files as windows-1252; TODO confirm against the raw files.
        fragment = re.sub("[ï½¿]", "", text.strip().replace("\n", " ") + " ")
        # The same text can be reported by a tag and again by the string inside
        # it, so skip a fragment that the result already ends with.
        if not response.endswith(fragment):
            response += fragment
    return _cleanAnalysis(response)

# Try the extractor on the sample heading; as the cell's last expression, its result is displayed.
getAnalysis(heading)

"Interested parties are  concerned that when a governmental entity enters into a settlement agreement paired  with a non-disclosure agreement, the public's interest is adversely affected due  to a lack of transparency regarding the basic allegations or facts of the  case, even though taxpayer dollars are being used to investigate and  prosecute the case and to pay the settlement. C.S.H.B. 53 seeks to increase  transparency by prohibiting a governmental unit from entering into certain settlement  agreements if the aggrieved party is required to consent to a non-disclosure  agreement as a condition of the settlement."

In [164]:
# One-letter code found in each analysis filename -> description of the bill stage.
stages = {"E": "Engrossed", "F": "Enrolled", "H": "Committee Report (Unamended)",
          "I": "As Filed", "S": "Committee Report (Substituted)"}

def getRecord(filename):
    """Split an analysis filename into its bill number and stage description.

    For a name such as "HB00053H.htm", characters 2-6 are read as the bill
    number and character 7 as the stage code.

    Args:
        filename: bare file name (no directory part).

    Returns:
        A ``(number, stage)`` tuple: the bill number as an int and the stage text
        from ``stages``, or "Unknown" (after printing a notice) for a code that is
        not in ``stages``.

    Raises:
        ValueError: if characters 2-6 of ``filename`` are not an integer.
    """
    symbol = filename[7:8]
    # An explicit lookup with a default replaces the former blanket `except`.
    stage = stages.get(symbol)
    if stage is None:
        print("Failed to match code to stage")
        stage = "Unknown"
    return int(filename[2:7]), stage


In [165]:
import os
from bs4 import BeautifulSoup

# Root folder of the downloaded analyses; getBills appends "<chamber>_bills/" to it.
folderName = "../../TXJournal/bills/85R/analysis/html/"

from collections import defaultdict
def tree():
    """Return a defaultdict whose missing keys are filled with further trees.

    Needed to allow assignment to a nested dict (``bills[name][stage] = ...``)
    without creating the intermediate level first.
    """
    nested = defaultdict(tree)
    return nested

def getBills(folderName, chamber):
    """Extract the analysis text from every file under one chamber's folder.

    Args:
        folderName: root folder of the downloaded analyses (with trailing slash).
        chamber: "senate" selects the "SB " prefix; any other value gets "HB ".
            The files are read from ``folderName + chamber + "_bills/"``.

    Returns:
        A nested mapping ``bills[name][stage] = analysis`` where ``name`` is the
        prefix plus the bill number, e.g. "HB 53".
    """
    prefix = "SB " if chamber == "senate" else "HB "
    subFolderName = folderName + chamber + "_bills/"
    bills = tree()
    # The walk variable gets its own name so it no longer overwrites the
    # `folderName` parameter.
    for dirPath, _subfolders, filenames in os.walk(subFolderName):
        for filename in filenames: # can be limited to 10 for testing
            if filename == ".DS_Store":
                continue
            number, stage = getRecord(filename)
            name = prefix + str(number)
            # Join with the directory actually being walked, so a file found in a
            # nested folder is opened from where it really is.
            source = os.path.join(dirPath, filename)
            # `with` closes each file once parsed. The text is already decoded by
            # open(), so BeautifulSoup's from_encoding argument (which only applies
            # to byte input) is not passed.
            with open(source, encoding = "windows-1252") as markup:
                soup = BeautifulSoup(markup, "html.parser")
            heading = getHeading(soup)
            bills[name][stage] = getAnalysis(heading)
    return bills

# Parse every analysis under "<folderName>house_bills/".
houseBills = getBills(folderName, "house")

In [166]:
# Parse every analysis under "<folderName>senate_bills/".
senateBills = getBills(folderName, "senate")

In [167]:
# Combine both chambers into one plain dict: house entries first, then senate
# entries (a senate entry would replace a house entry that had the same key).
allBills = dict(houseBills)
allBills.update(senateBills)

In [168]:
import json

# Write the combined analyses to a JSON file; 'w' mode replaces any existing file.
with open('../data/billAnalysis.json', 'w') as fp:
    json.dump(allBills, fp)