In [1]:
# I used wget to collect ftp.legis.state.tx.us/bills/85R/witlistbill/html/senate_bills/
# and put them all together in a directory called '/bills/85R/witlistbill/html/senate_bills/'

from bs4 import BeautifulSoup
import re

In [2]:
endWithState = re.compile(r',?\s?(AL|AK|AS|AZ|AR|CA|CO|CT|DE|DC|FM|FL|GA|GU|HI|ID|IL|IN|IA|KS|KY|LA|ME|MH|MD|MA|MI|MN|MS|MO|MT|NE|NV|NH|NJ|NM|NY|NC|ND|MP|OH|OK|OR|PW|PA|PR|RI|SC|SD|TN|TX|Texas|UT|VT|VI|VA|WA|WV|WI|WY)\n?$')

In [3]:
def addName(line):
    flags = [re.IGNORECASE, re.DOTALL]
    regexes = [r'^((?:\w+\s)*\w+),\n((?:\w+\s)*\w+)\xa0\xa0 ', r'^((?:\w+\s)*\w+),\n((?:\w+\s)*\w+)\s+\(', 
              r'^((?:\w+\s)*\w+),\s+([^\(]+) ', r'^((?:\w+\s)*\w+),\s+((?:\w+\s)*\w+)  '] 
    partial = r'^(\w+)\s+\n\('  # Matches where someone put just a surname
    for f in flags:
        for r in regexes:
            nameRe = re.compile(r, f)
            match = re.search(nameRe, line[0])
            if match:
                extension = [match.group(1).strip(), match.group(2).strip()]
                return extension
    
    # A separate case for the partial match
    nameRe = re.compile(partial)
    if re.search(nameRe, line[0]):
        match = re.search(nameRe, line[0])
        extension = [match.group(1).strip(), None]
        return extension
    
    # And a third case to leave the fields blank
    extension = [None, None]
    return extension

def addTitle(line):
    regexes = [r',\s(?:(?:\w+\s)*\w+)\s+(.+)\s*\(also providing', r',\s(?:(?:\w+\s)*\w+)\s+(.+)\s*\(',
               '^(?:(?:\w+\s)*\w+),\s+(?:(?:\w+\s)*\w+)\s\s(.+)\(also providing',
               '^(?:(?:\w+\s)*\w+),\s+(?:(?:\w+\s)*\w+)\s\s(.+)\('
              #  r'^.+,\n.+\s+(.+)\s+\(also providing', r'^.+,\n.+\s+(.+)\s+\(',
              # ',\n?.+\xa0\xa0 ((?:\w+\s)*\w+)\s+(\(also providing written testimony\))?\s+\('
              ]
    for r in regexes:
        titleRe = re.compile(r)
        match = re.search(titleRe, line[0])
        if match:
            extension = [match.group(1).strip()]
            return extension
    extension = [None]
    return extension

def addOrg(line):
    extension = [None]
    orgRe = re.compile(r'written testimony\)\s+\((.*\n?.*)\)(,|$)')
    match = re.search(orgRe, line[0])
    if match:
        extension = [match.group(1).strip()]
    else: 
        altOrgRe = re.compile(r'\((.*\n?.*)\)(\s?,\s.*,\s.*)?$')
        match = re.search(altOrgRe, line[0])
        if match:
            extension = [match.group(1).strip()]
    return extension

def addCity(line):
    extension = [None, None]
    cityRe = re.compile(r'\)\s?,\s(.*),\s(.*)$')
    match = re.search(cityRe, line[0])
    if match:
        extension = [match.group(1).strip(), match.group(2).strip()]
    return extension

addName(["Castro,\nAmy   Asst. Speech Pathology  (also providing written testimony), (Child's Play","HB 2","For"])

['Castro', 'Amy   Asst. Speech Pathology']

In [4]:
def HBWitness(witnessList):
    fp = open(witnessList, encoding = 'cp1252')
    soup = BeautifulSoup(fp.read(), "html.parser")
    
    # each printed page has a class with Wordsection plus a number
    pages = soup.select("div[class^=WordSection]") 
    
    indent = re.compile(r"^\s{30}") # whitespace should mean it's a listing, page number, or the heading "WITNESS LIST"
    forStance = re.compile(r'^\s+FOR\s?:', re.IGNORECASE) # i flag means case-insensitive
    againstStance = re.compile(r'^\s+AGAINST\s?:', re.IGNORECASE)
    onStance = re.compile(r'^\s+ON\s?:', re.IGNORECASE)

    wit = []
    stance = "For"
    for page in pages:
        witpage = []
        pp = page.select("p")
        for line in pp:
            text = line.get_text()
            if re.match(forStance, text):
                stance = "For"
            if re.match(againstStance, text):
                stance = "Against"
            if re.match(onStance, text):
                stance = "On"
            if re.match(indent, text):
                witpage.append([line.get_text(), stance])
        wit.extend(witpage[1:-1])

    bill = soup.find('span', {'style':"color:windowtext;text-decoration:none"}) # identifies the bill
    bill = bill.get_text()
    for line in wit:
        line.extend([bill.strip()])
    
    wit = [x for x in wit if x != None]
    
    return wit

wit = HBWitness('bills/85R/witlistbill/html/house_bills/HB00002S.HTM')
print(wit)

[['\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0 Carter,\nBobbie\xa0\xa0 Physical Therapist\xa0 (Countryside Therapy Group, Inc.), Rising Star, TX', 'For', 'HB 2'], ["\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0 Castro,\nAmy\xa0\xa0 Asst. Speech Pathology\xa0 (also providing written testimony)\xa0 (Child's Play\n", 'For', 'HB 2'], ['\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0 Home\nHealth)', 'For', 'HB 2'], ['\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0 Hammon,\nRachel\xa0\xa0 TX Association for Home Care & Hospice

In [5]:
"""
wit = [["Castro,\nAmy   Asst. Speech Pathology  (also providing written testimony), (Child's Play","HB 2","For"],
       ["Home\nHealth)","HB 2","For"],
        ["hi","HB 2","For"]]
mergelines(wit)
"""


"""
wit = [["\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0 Castro,\nAmy\xa0\xa0 Asst. Speech Pathology\xa0 (also providing written testimony)\xa0 (Child's Play\n",
        "HB 2","For"],
       ["\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0 Castro,\nAmy\xa0\xa0 Asst. Speech Pathology\xa0 (also providing written testimony)\xa0 (Child's Play\n",
        "HB 2","For"],
        ["hi","HB 2","For"]]
"""

def mergelines(wit):
    changed = 0    
    newList = []
    badList = []
    tooShort = re.compile("^\w+,?\s?(AL|AK|AS|AZ|AR|CA|CO|CT|DE|DC|FM|FL|GA|GU|HI|ID|IL\
                        |IN|IA|KS|KY|LA|ME|MH|MD|MA|MI|MN|MS|MO|MT|NE|NV|NH|NJ|NM|NY|NC\
                        |ND|MP|OH|OK|OR|PW|PA|PR|RI|SC|SD|TN|TX|Texas|UT|VT|VI|VA|WA|WV|WI|WY)?\n?$") # line shouldn't have just one word
    
    for line in wit[:-1]:
        lineIndex = wit.index(line)
        
        # Lines will get merged if they pass any of the three tests. Only if not will a line be accepted.
        
        if re.search(endWithState, wit[lineIndex + 1][0].strip()) \
            and not re.search(endWithState, line[0].strip())\
            and addName(line) != [None, None] \
            and addName(wit[lineIndex + 1]) == [None, None]:
                newList.append([line[0].strip() + " " + wit[lineIndex + 1][0].strip(), line[1], line[2]])
                badList.append([wit[lineIndex + 1][0].strip(), line[1], line[2]])
                changed += 1
        
        elif re.search(tooShort, wit[lineIndex + 1][0].strip()):
            newList.append([line[0].strip() + " " + wit[lineIndex + 1][0].strip(), line[1], line[2]])
            badList.append([wit[lineIndex + 1][0].strip(), line[1], line[2]])
            changed += 1
            
        elif line[0].count(')') != line[0].count('(') \
            and wit[lineIndex + 1][0].count(')') != wit[lineIndex + 1][0].count('(') \
            and addName(line) != [None, None] \
            and not re.search(endWithState, line[0].strip()):
                newList.append([line[0].strip() + " " + wit[lineIndex + 1][0].strip(), line[1], line[2]])
                badList.append([wit[lineIndex + 1][0].strip(), line[1], line[2]])
                changed += 1
        else:
            newList.append([line[0].strip(), line[1], line[2]])
    
    newList = [x for x in newList if x not in badList]
    
    return newList, badList

# TK: this function is working on the dummy list, but not on the real data in the dataframe below. 
# What's wrong with it?

# Okay, now it's not working on the dummy data either. How can it ever spit out the same line twice?



In [None]:
addName(newList[2])

In [None]:
print(re.search(endWithState, newList[0][0].strip()))

In [6]:
wit

[['\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0 Carter,\nBobbie\xa0\xa0 Physical Therapist\xa0 (Countryside Therapy Group, Inc.), Rising Star, TX',
  'For',
  'HB 2'],
 ["\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0 Castro,\nAmy\xa0\xa0 Asst. Speech Pathology\xa0 (also providing written testimony)\xa0 (Child's Play\n",
  'For',
  'HB 2'],
 ['\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0 Home\nHealth)',
  'For',
  'HB 2'],
 ['\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0 Hammon,\nRachel\xa0\xa0 TX Association for Home

In [7]:
# trying to rejoin entries split across lines
newList, badList = mergelines(wit)
print(badList)
    
for line in wit:
    line.extend(addName(line))
    line.extend(addTitle(line))
    line.extend(addOrg(line))
    line.extend(addCity(line))

[['Austin', 'For', 'HB 2'], ['Austin,\nTX', 'On', 'HB 2']]


In [9]:
newList, badList = mergelines(newList)
newList

[['Carter,\nBobbie\xa0\xa0 Physical Therapist\xa0 (Countryside Therapy Group, Inc.), Rising Star, TX',
  'For',
  'HB 2'],
 ["Castro,\nAmy\xa0\xa0 Asst. Speech Pathology\xa0 (also providing written testimony)\xa0 (Child's Play Home\nHealth)",
  'For',
  'HB 2'],
 ['Hammon,\nRachel\xa0\xa0 TX Association for Home Care & Hospice (TAHCH)\xa0 (TAHCH), Austin',
  'For',
  'HB 2'],
 ['Montgomery,\nLaura\xa0\xa0 Occupational Therapist\xa0 (Countryside Therapy Group & Home Health),\nStephenville',
  'For',
  'HB 2'],
 ['West,\nBarbara\xa0\xa0 Director of Marketing -\xa0 (also providing written testimony)\xa0 (Circle\nof Care -\nPediatric Therapy), Bulverde, TX',
  'For',
  'HB 2'],
 ['Adams,\nJosh\xa0\xa0 Atlas Pediatric Therapy, Small Business Owner Home Health Agency (Speech,\nPhysical, Occupational Therapy Industry & Families), Arlington, TX',
  'On',
  'HB 2'],
 ['Colbert,\nPaul\xa0\xa0 consultant\xa0 (also providing written testimony)\xa0 (Lee College), Houston,\nTX',
  'On',
  'HB 2'],
 

In [10]:
badList

[['Home\nHealth)', 'For', 'HB 2'],
 ['Health),\nStephenville', 'For', 'HB 2'],
 ['-\nPediatric Therapy), Bulverde, TX', 'For', 'HB 2'],
 ['(Speech,\nPhysical, Occupational Therapy Industry & Families), Arlington, TX',
  'On',
  'HB 2'],
 ['medically\nfragile children), Flower Mound, TX', 'On', 'HB 2'],
 ['2000),\nDallas, TX', 'On', 'HB 2']]

In [None]:
import shutil, os

houseWit = []

folderName = 'bills/85R/witlistbill/html/house_bills/'

for folderName, subfolders, filenames in os.walk(folderName):
    for filename in filenames:
        if filename != ".DS_Store":
            source = folderName + filename
            wit = HBWitness(source)
            # print(str(changed) + " lines changed.")
            houseWit.extend(wit)


In [None]:
houseWit[:5]

In [None]:
import pandas as pd

df = pd.DataFrame(houseWit, columns=list(['raw', 'position', 'bill', 'last','first','title', 'org','city','state']))

In [None]:
df[df['bill'] == "HB 3025"]

In [None]:
pd.options.display.max_colwidth = 500
df[pd.isnull(df['org'])]
# df.first.isnull()

In [None]:

df[df['bill'] == "HB 2"]

In [None]:
noText = []
for line in houseWit:
    noText.append(line[1:])

def changeN(cell):
    if cell != None:
        cell = cell.replace("\n", " ")
    return cell
    
noText = [[changeN(cell) for cell in line] for line in noText]
    

In [None]:
# noText[10]

In [None]:
houseWit[10:20]

In [None]:
import csv

with open('..\data\witness-lists\HouseWitness.csv','w') as f:
    writer = csv.writer(f)
    writer.writerow(['Position', 'Bill', 'LastName', 'FirstName', 'Role', 'Organization', 'City', 'State'])
    writer.writerows(noText)