In [1]:
# I used wget to collect ftp.legis.state.tx.us/bills/85R/witlistbill/html/senate_bills/
# and put them all together in a directory called '/bills/85R/witlistbill/html/senate_bills/'

from bs4 import BeautifulSoup
import re

In [2]:
endWithState = re.compile(r'.*,?\s?(AL|AK|AS|AZ|AR|CA|CO|CT|DE|DC|FM|FL|GA|GU|HI|ID|IL\
                        |IN|IA|KS|KY|LA|ME|MH|MD|MA|MI|MN|MS|MO|MT|NE|NV|NH|NJ|NM|NY|NC\
                        |ND|MP|OH|OK|OR|PW|PA|PR|RI|SC|SD|TN|TX|UT|VT|VI|VA|WA|WV|WI|WY)\n?$')

In [3]:
# TODO: consolidate these into one function

def addName(line):
    nameRe = re.compile(r'^(.+),\n(.+)\xa0\xa0 ')
    nameRe2 = re.compile(r'^(.+),\n(.+)\s+\(')
    nameRe3 = re.compile(r'^(.+),\s+(.+) ')
    if re.search(nameRe, line[0]):
        match = re.search(nameRe, line[0])
        extension = [match.group(1).strip(), match.group(2).strip()]
    elif re.search(nameRe2, line[0]): 
        match = re.search(nameRe2, line[0])
        extension = [match.group(1).strip(), match.group(2).strip()]
    elif re.search(nameRe3, line[0]): 
        match = re.search(nameRe3, line[0])
        extension = [match.group(1).strip(), match.group(2).strip()]
    else: 
        extension = [None, None]
    line.extend(extension)
    return line

def addTitle(line):
    extension = [None]
    # titleRe = re.compile(r',\n?.+\xa0\xa0 (.+)\s*(\(also providing written testimony\))?\s+\(')
    titleRe = re.compile(r'^.+,\n.+\s+(.+)\s+\(also providing')
    match = re.search(titleRe, line[0])
    if match:
        extension = [match.group(1).strip()]
    else: 
        altTitleRe = re.compile(r'^.+,\n.+\s+(.+)\s+\(')
        match = re.search(altTitleRe, line[0])
        if match:
            extension = [match.group(1).strip()]
    line.extend(extension)
    return line

def addOrg(line):
    extension = [None]
    orgRe = re.compile(r'written testimony\)\s+\((.*\n?.*)\)(,|$)')
    match = re.search(orgRe, line[0])
    if match:
        extension = [match.group(1).strip()]
    else: 
        altOrgRe = re.compile(r'\((.*\n?.*)\)(\s?,\s.*,\s.*)?$')
        match = re.search(altOrgRe, line[0])
        if match:
            extension = [match.group(1).strip()]
    line.extend(extension)
    return line

def addCity(line):
    extension = [None, None]
    cityRe = re.compile(r'\)\s?,\s(.*),\s(.*)$')
    match = re.search(cityRe, line[0])
    if match:
        extension = [match.group(1).strip(), match.group(2).strip()]
    line.extend(extension)
    return line

In [4]:
def HBWitness(witnessList):
    fp = open(witnessList, encoding = 'cp1252')
    soup = BeautifulSoup(fp.read(), "html.parser")
    
    # each printed page has a class with Wordsection plus a number
    pages = soup.select("div[class^=WordSection]") 
    
    indent = re.compile(r"^\s{30}") # whitespace should mean it's a listing, page number, or the heading "WITNESS LIST"
    forStance = re.compile(r'^\s+FOR\s?:', re.IGNORECASE) # i flag means case-insensitive
    againstStance = re.compile(r'^\s+AGAINST\s?:', re.IGNORECASE)
    onStance = re.compile(r'^\s+ON\s?:', re.IGNORECASE)

    # Suspiciously high number of lines labeled "FOR"
    wit = []
    stance = "For"
    for page in pages:
        witpage = []
        pp = page.select("p")
        for line in pp:
            text = line.get_text()
            if re.match(forStance, text):
                stance = "For"
            if re.match(againstStance, text):
                stance = "Against"
            if re.match(onStance, text):
                stance = "On"
            if re.match(indent, text):
                witpage.append([line.get_text(), stance])
        wit.extend(witpage[1:-1])

    bill = soup.find('span', {'style':"color:windowtext;text-decoration:none"}) # identifies the bill
    bill = bill.get_text()
    for line in wit:
        line.extend([bill.strip()])
    
    newList = []
    badList = []
    oneWord = re.compile("^\w+$") # line shouldn't have just one word
    
    # trying to rejoin entries split across lines
    for line in wit[:-1]: # no longer iterating backwards to avoid breaking list during iteration
        lineIndex = wit.index(line)
        if line != None:
            if not (re.search(endWithState, wit[lineIndex + 1][0].strip()) \
                    and not re.search(endWithState, line[0].strip())) \
                    and line[0].count(')') >= line[0].count('(') and wit[lineIndex + 1][0][0] != '(' \
                    and not re.search(oneWord, wit[lineIndex + 1][0].strip()):
                newList.append([line[0].strip(), line[1], line[2]])
            else: 
                newList.append([line[0].strip() + " " + wit[lineIndex + 1][0].strip(), line[1], line[2]])
                badList.append([wit[lineIndex + 1][0].strip(), line[1], line[2]])
    
    newList = [x for x in newList if x not in badList]
    
    for line in newList:
        line = addName(line)
        line = addTitle(line)
        line = addOrg(line)
        line = addCity(line)
    
    return newList


In [5]:

import shutil, os

houseWit = []

folderName = 'bills/85R/witlistbill/html/senate_bills/'

for folderName, subfolders, filenames in os.walk(folderName):
    for filename in filenames:
        if filename != ".DS_Store":
            source = folderName + filename
            wit = HBWitness(source)
            houseWit.extend(wit)

# houseWit[:5]

[['Keyton,\nSarah (Legislative Budget Board)',
  'On',
  'SB 1',
  'Keyton',
  'Sarah',
  None,
  'Legislative Budget Board',
  None,
  None],
 ['Keyton,\nSarah (Legislative Bufget Board)',
  'On',
  'SB 1',
  'Keyton',
  'Sarah',
  None,
  'Legislative Bufget Board',
  None,
  None],
 ['Aleman,\nSteven\xa0\xa0 policy specialist\xa0 (also providing written testimony)\xa0 (Disability\nRights Texas),\nAustin, TX',
  'On',
  'SB 1',
  'Aleman',
  'Steven',
  'specialist',
  'Disability\nRights Texas',
  'Austin',
  'TX'],
 ['Anger,\nMartha\xa0\xa0 Self-Employed (Filmmaker)\xa0 (as an Alumni), Austin, TX',
  'On',
  'SB 1',
  'Anger',
  'Martha',
  '(Filmmaker)',
  'Filmmaker)\xa0 (as an Alumni',
  'Austin',
  'TX'],
 ['Banks,\nYannis\xa0\xa0\xa0 (also providing written testimony)\xa0 (Texas NAACP)',
  'On',
  'SB 1',
  'Banks',
  'Yannis',
  '',
  'Texas NAACP',
  None,
  None]]

In [6]:
noText = []
for line in houseWit:
    noText.append(line[1:])

def changeN(cell):
    if cell != None:
        cell = cell.replace("\n", " ")
    return cell
    
noText = [[changeN(cell) for cell in line] for line in noText]
    

In [7]:
# noText[10]

['On',
 'SB 1',
 'Carpenter',
 'Dr. Clint',
 'Dist.',
 'Windham School District',
 None,
 None]

In [10]:
# houseWit[10:20]

[['Carpenter,\nDr. Clint\xa0\xa0 Superintendent of Windham School Dist.\xa0 (also providing written testimony)\xa0\n(Windham School District)',
  'On',
  'SB 1',
  'Carpenter',
  'Dr. Clint',
  'Dist.',
  'Windham School District',
  None,
  None],
 ['testimony)\xa0\n(Windham School District) Clarke,\nDebbie\xa0\xa0 Mom, engineer\xa0 (also providing written testimony)\xa0 (Parent of Austin\nISD',
  'On',
  'SB 1',
  None,
  None,
  None,
  None,
  None,
  None],
 ['Clarke,\nDebbie\xa0\xa0 Mom, engineer\xa0 (also providing written testimony)\xa0 (Parent of Austin\nISD students)',
  'On',
  'SB 1',
  'Clarke',
  'Debbie',
  'engineer',
  'Parent of Austin\nISD students',
  None,
  None],
 ['Cowan,\nJulie\xa0\xa0 homemaker / school district trustee\xa0 (also providing written\ntestimony) (myself\n/ AISD), Austin, TX',
  'On',
  'SB 1',
  'Cowan',
  'Julie',
  'trustee',
  'myself\n/ AISD',
  'Austin',
  'TX'],
 ['Daugherty,\nWilliam\xa0\xa0 Superintendent\xa0 (also providing written testi

In [9]:
import csv

with open('..\data\witness-lists\SenateWitness.csv','w') as f:
    writer = csv.writer(f)
    writer.writerow(['Position', 'Bill', 'LastName', 'FirstName', 'Role', 'Organization', 'City', 'State'])
    writer.writerows(noText)