In [1]:
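# I used wget to collect ftp.legis.state.tx.us/bills/85R/witlistbill/html/senate_bills/
# and put the files together in a local directory, 'bills/85R/witlistbill/html/senate_bills/'.
# The cells below read that path (and the sibling 'house_bills/' folder) relative to the
# notebook's working directory.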

from bs4 import BeautifulSoup
import re

In [2]:
# Matches text that ends (optionally before one trailing newline) with a US
# state/territory postal code or the word "Texas". The code must be preceded
# by a character other than "(".
endWithState = re.compile(
    r'[^\(]('
    r'AL|AK|AS|AZ|AR|CA|CO|CT|DE|DC|FM|FL|GA|GU|HI|ID|IL|IN|IA|KS|KY|LA|'
    r'ME|MH|MD|MA|MI|MN|MS|MO|MT|NE|NV|NH|NJ|NM|NY|NC|ND|MP|OH|OK|OR|PW|'
    r'PA|PR|RI|SC|SD|TN|TX|Texas|UT|VT|VI|VA|WA|WV|WI|WY'
    r')\n?$'
)

In [3]:
def addName(line):
    """Extract ``[last_name, first_name]`` from the start of a witness entry.

    ``line`` is a row whose first element is the entry text; only ``line[0]``
    is read. The text is expected to start with "<surname>," followed by the
    first name.

    Returns a two-element list:
      * ``[last, first]`` when one of the full-name patterns matches,
      * ``[last, None]`` when only a bare surname followed by a newline and
        "(" is found,
      * ``[None, None]`` when nothing matches.
    """
    text = line[0]

    # Tried in order; the first hit wins.
    patterns = (
        r'^((?:\w+\s)*\w+),\n((?:\w+\s)*\w+)\xa0\xa0 ',
        r'^((?:\w+\s)*\w+),\n((?:\w+\s)*\w+)\s+\(',
        r'^((?:\w+\s)*\w+),[\n\s]+((?:[A-Z]\.)+)\s\s',  # initials as first name
        r'^((?:\w+\s)*\w+),\s+([^\(]+) ',
        r'^((?:\w+\s)*\w+),\s+((?:\w+\s)*\w+)  ',
    )
    # One case-insensitive pass is enough. None of these patterns contains
    # ".", so a DOTALL-only retry cannot match anything this pass missed.
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            return [match.group(1).strip(), match.group(2).strip()]

    # A separate case where someone put just a surname.
    match = re.search(r'^(\w+)\s+\n\(', text)
    if match:
        return [match.group(1).strip(), None]

    # Otherwise leave both fields blank.
    return [None, None]

def addTitle(line):
    """Extract the witness's title (role) from an entry, if one is present.

    Only ``line[0]`` is read. The title is the text captured between the name
    and the first "(" (or the "(also providing" note). Returns a one-element
    list: ``[title]`` with surrounding whitespace stripped, or ``[None]`` when
    no pattern matches.
    """
    # All patterns are raw strings so that \w, \s and \( reach the regex
    # engine untouched instead of being invalid string escapes.
    patterns = (
        r',\s(?:(?:\w+\s)*\w+)\s+(.+)\s*\(also providing',
        r',\s(?:(?:\w+\s)*\w+)\s+(.+)\s*\(',
        r'^(?:(?:\w+\s)*\w+),\s+(?:(?:\w+\s)*\w+)\s\s+(.+)\(also providing',
        r'^(?:(?:\w+\s)*\w+),\s+(?:(?:\w+\s)*\w+)\s\s+(.+)\(',
    )
    text = line[0]
    for pattern in patterns:
        match = re.search(pattern, text)
        if match:
            return [match.group(1).strip()]
    return [None]

def addOrg(line):
    """Pull the organization (the parenthesised text) out of a witness entry.

    Only ``line[0]`` is read. Returns ``[organization]`` with surrounding
    whitespace stripped, or ``[None]`` when no parenthesised group followed by
    a comma or the end of the text is found.
    """
    text = line[0]
    patterns = (
        r'written testimony\)\s+\((.+)\)\s*(,|$)',
        r'\s+\((.+)\)\s*(,|$)',
    )
    # Every pattern is tried case-insensitively first; only if all of those
    # fail is each retried with "." allowed to span newlines.
    attempts = [(pattern, flag)
                for flag in (re.IGNORECASE, re.DOTALL)
                for pattern in patterns]
    for pattern, flag in attempts:
        found = re.search(pattern, text, flag)
        if found is not None:
            return [found.group(1).strip()]
    return [None]

def addCity(line):
    """Extract ``[city, state]`` from the tail of a witness entry.

    Only ``line[0]`` is read. The pattern looks for ")" followed by
    ", <city>, <state>" running to the end of the text. Returns both parts
    stripped, or ``[None, None]`` when the pattern is absent.
    """
    found = re.search(r'\)\s?,\s(.*),\s(.*)$', line[0])
    if found is None:
        return [None, None]
    return [part.strip() for part in found.groups()]

# Quick sanity check of addOrg on a known row: the displayed result should be
# the text inside the parentheses.
addOrg(["Parks,\nUrsula (Legislative Budget Board)","HB 2","For"])

['Legislative Budget Board']

In [4]:
# I haven't made a corresponding function for Senate bills. Instead I
# used this same function, which means the Senate data probably has
# more errors.

def HBWitness(witnessList):
    """Parse one witness-list HTML file into ``[text, stance, bill]`` rows.

    ``witnessList`` is the path of the HTML file, which is decoded as cp1252.

    Each ``div`` whose class starts with "WordSection" is treated as one
    printed page. Within a page, every ``<p>`` whose text begins with at least
    30 whitespace characters is collected together with the most recent
    stance heading ("For", "Against" or "On"; "For" until a heading is seen).
    The first and last collected paragraph of each page are discarded
    (presumably the "WITNESS LIST" heading and the page number, per the
    original author's note — confirm against the source files).

    Returns a list of ``[paragraph_text, stance, bill]`` lists, where ``bill``
    is the stripped text of the first ``<span>`` styled
    ``color:windowtext;text-decoration:none``.

    Raises AttributeError if that ``<span>`` is missing (``soup.find`` returns
    None).
    """
    # `with` closes the file even if reading or parsing raises.
    with open(witnessList, encoding='cp1252') as fp:
        soup = BeautifulSoup(fp.read(), "html.parser")

    # each printed page has a class with WordSection plus a number
    pages = soup.select("div[class^=WordSection]")

    # whitespace should mean it's a listing, page number, or the heading "WITNESS LIST"
    indent = re.compile(r"^\s{30}")
    # Heading patterns are case-insensitive and mutually exclusive.
    stanceMarkers = (
        (re.compile(r'^\s+FOR\s?:', re.IGNORECASE), "For"),
        (re.compile(r'^\s+AGAINST\s?:', re.IGNORECASE), "Against"),
        (re.compile(r'^\s+ON\s?:', re.IGNORECASE), "On"),
    )

    wit = []
    stance = "For"  # carried across pages until the next heading appears
    for page in pages:
        witpage = []
        for paragraph in page.select("p"):
            text = paragraph.get_text()
            for marker, label in stanceMarkers:
                if marker.match(text):
                    stance = label
            if indent.match(text):
                witpage.append([text, stance])
        wit.extend(witpage[1:-1])

    # identifies the bill
    bill = soup.find('span', {'style': "color:windowtext;text-decoration:none"}).get_text().strip()
    for row in wit:
        row.append(bill)

    return wit

# wit = HBWitness('bills/85R/witlistbill/html/house_bills/HB00002S.HTM')
# print(wit)

In [5]:
def mergelines(wit):
    """Rejoin witness entries that were split across two consecutive rows.

    ``wit`` is a list of rows ``[text, stance, bill, ...]``. As a side effect,
    ``row[0]`` of every input row is stripped in place. Relies on the
    module-level ``endWithState`` pattern and ``addName`` function.

    Each row is compared with the row that follows it. The pair is merged
    (texts joined with a single space, stance and bill taken from the first
    row) when any of these holds:
      1. the next row ends with a state, this row does not, this row has a
         parsable name and the next row has none;
      2. the next row is just one word, optionally followed by a state;
      3. both rows have unbalanced parentheses, this row has a name and does
         not end with a state;
      4. both rows have unbalanced parentheses, this row has a name and the
         next row has none.

    Returns a new list of three-element rows. Rows that were absorbed into
    their predecessor are removed by value. An absorbed row is still evaluated
    as a possible head for the row after it, so one call joins pairs only.
    """
    stateCodes = (
        "AL", "AK", "AS", "AZ", "AR", "CA", "CO", "CT", "DE", "DC", "FM", "FL",
        "GA", "GU", "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MH",
        "MD", "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ", "NM",
        "NY", "NC", "ND", "MP", "OH", "OK", "OR", "PW", "PA", "PR", "RI", "SC",
        "SD", "TN", "TX", "Texas", "UT", "VT", "VI", "VA", "WA", "WV", "WI", "WY",
    )
    # line shouldn't have just one word. The alternation is built with join so
    # no stray indentation whitespace ends up inside an alternative.
    tooShort = re.compile(r"^\w+,?\s?(" + "|".join(stateCodes) + r")?\n?$")

    for line in wit:
        line[0] = line[0].strip()

    if not wit:
        return []

    # Per-row facts, computed once and looked up by position so duplicate rows
    # are always compared with their own successor.
    noName = [None, None]
    hasName = [addName(line) != noName for line in wit]
    stateEnd = [bool(re.search(endWithState, line[0])) for line in wit]
    unbalanced = [line[0].count(')') != line[0].count('(') for line in wit]

    newList = []
    absorbed = set()
    for i in range(len(wit) - 1):
        line, following = wit[i], wit[i + 1]
        bothUnbalanced = unbalanced[i] and unbalanced[i + 1]
        merge = (
            (stateEnd[i + 1] and not stateEnd[i] and hasName[i] and not hasName[i + 1])
            or bool(re.search(tooShort, following[0]))
            or (bothUnbalanced and hasName[i] and not stateEnd[i])
            or (bothUnbalanced and hasName[i] and not hasName[i + 1])
        )
        if merge:
            newList.append([line[0] + " " + following[0], line[1], line[2]])
            absorbed.add(tuple(following[0:3]))
        else:
            newList.append(line[0:3])

    # The final row has no successor to test against; keep it unless it was
    # absorbed into the row before it (the filter below handles that).
    newList.append(wit[-1][0:3])

    return [row for row in newList if tuple(row) not in absorbed]


In [8]:
import shutil, os

def extractRows(folderName):
    """Run ``HBWitness`` on every file under ``folderName`` and pool the rows.

    The folder is walked recursively with ``os.walk``. Files named
    ".DS_Store" are skipped. Returns one flat list containing the rows
    returned by ``HBWitness`` for each file, in walk order.
    """
    houseWit = []
    for dirpath, _subfolders, filenames in os.walk(folderName):
        for filename in filenames:
            if filename == ".DS_Store":
                continue
            # os.path.join supplies the separator, so this works for
            # subfolders and for a root given without a trailing slash.
            houseWit.extend(HBWitness(os.path.join(dirpath, filename)))
    return houseWit
            
# Parse every House witness-list file under this folder (path is relative to
# the notebook's working directory) into one flat list of rows.
folderName = 'bills/85R/witlistbill/html/house_bills/'
houseWit = extractRows(folderName)

In [9]:
# Peek at the first five raw rows: [text, stance, bill].
houseWit[:5]

[['\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0 Parks,\nUrsula (Legislative Budget Board)',
  'On',
  'HB 2'],
 ['\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0 Pulver,\nWayne (Texas Legislative Budget Board)',
  'On',
  'HB 2'],
 ['\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0 Rodell,\nLeora (Legislative Budget Board)',
  'On',
  'HB 2'],
 ['\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0 Ashley,\nPhillip (Comptroller)',
  'On',
  'HB 2'],
 ['\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0 Borgstedte,\nAmy (Legislative Budget Board)',
  'On',
  'HB 2']]

In [10]:
# First merge pass: try to rejoin entries that were split across lines.
# Note that mergelines also strips the text of each houseWit row in place.
newList = mergelines(houseWit)


# TODO: check HB 51 Shepperd,\nJohn

In [11]:
# Second merge pass over the already-merged rows.
# NOTE: this rebinds newList from itself, so every re-run of this cell applies
# one more pass; run it exactly once after the cell above.
newList = mergelines(newList) # Will doing it twice catch 3-line entries?


In [13]:
# Append the parsed fields (last, first, title, org, city, state) to each row.
# Every row is rebuilt from its first three columns (raw text, position, bill)
# so re-running this cell yields the same nine columns instead of appending
# the parsed fields a second time.
newList = [
    line[:3] + addName(line) + addTitle(line) + addOrg(line) + addCity(line)
    for line in newList
]

In [15]:
# pandas is only used for data exploration. 
# There are definitely still errors.

import pandas as pd
# Show long raw entries in full instead of truncating them.
pd.set_option("display.max_colwidth", 500)

# One column per element of the enriched rows.
witness_columns = ['raw', 'position', 'bill', 'last', 'first', 'title', 'org', 'city', 'state']
df = pd.DataFrame(newList, columns=witness_columns)

In [16]:
# Spot-check the parsed rows for a single bill.
df[df['bill'] == "HB 3025"]

Unnamed: 0,raw,position,bill,last,first,title,org,city,state
20415,"Ellis,\nGregory (Self; Bandera County River Authority and Groundwater District,\nGonzales County\nUWCD and other clients of the firm.)",For,HB 3025,Ellis,Gregory,,"Self; Bandera County River Authority and Groundwater District,\nGonzales County\nUWCD and other clients of the firm.",,
20416,"Mauk,\nDavid (Self; Bandera County River Authority and Groundwater District)",For,HB 3025,Mauk,David,,Self; Bandera County River Authority and Groundwater District,,
20417,"Schlessinger,\nSarah (Texas Alliance of Groundwater Districts)",For,HB 3025,Schlessinger,Sarah,,Texas Alliance of Groundwater Districts,,
20418,"Landwehr,\nMartha (Texas Chemical Council)",On,HB 3025,Landwehr,Martha,,Texas Chemical Council,,
20419,"Adams,\nLeah (Panola County Groundwater Conservation District)",For,HB 3025,Adams,Leah,,Panola County Groundwater Conservation District,,
20420,"Conkwright,\nJim (Prairielands Groundwater Conservation District)",For,HB 3025,Conkwright,Jim,,Prairielands Groundwater Conservation District,,
20421,"Embrey,\nTy (Real Edwards Conservation and Reclamation District)",For,HB 3025,Embrey,Ty,,Real Edwards Conservation and Reclamation District,,
20422,"Escobar,\nVanessa (Self)",For,HB 3025,Escobar,Vanessa,,Self,,
20423,"Flatten,\nCharles (Self; Hill Country Alliance)",For,HB 3025,Flatten,Charles,,Self; Hill Country Alliance,,
20424,"Glass,\nTom (League of Independent Voters)",For,HB 3025,Glass,Tom,,League of Independent Voters,,


In [17]:
# Rows where no organization was parsed; these are mostly entries that are
# still split across lines or have unusual name formats.
df[pd.isnull(df['org'])]
# df.first.isnull()
# NOTE(review): `df.first` resolves to the DataFrame method, not the 'first'
# column; use df['first'].isnull() if this check is revived.

Unnamed: 0,raw,position,bill,last,first,title,org,city,state
2073,"O'Ryan,\nAnne (also providing written testimony) (AAA TX, Interinsurance Exchange of",For,HB 62,,,,,,
2074,"the\nAuto Club, and Auto Club Co. Mutual), Austin, TX",For,HB 62,the\nAuto Club,"and Auto Club Co. Mutual), Austin,",,,Austin,TX
2635,Services),On,HB 122,,,,,,
4169,Association),For,HB 351,,,,,,
5232,"Houston,\nTx, TX",For,HB 490,Houston,"Tx,",,,,
5260,"Gear\nJr., Bob Director, Texas Veterans Leadership Program (Texas Workforce",On,HB 493,,,"Director, Texas Veterans Leadership Program",,,
5261,"Commission),\nAustin, TX",On,HB 493,,,,,Austin,TX
9242,"Austin,\nTexas, TX",On,HB 1260,Austin,"Texas,",,,,
10708,"Foundation),\nFredericksburg",For,HB 1492,,,,,,
12130,"Hall-Barrow,\nJulie Vice President of Virtual Health and Innovation (also providing",For,HB 1697,,,Vice President of Virtual Health and Innovation,,,


In [18]:
# Spot-check the parsed rows for HB 2 (note the repeated witnesses).
df[df['bill'] == "HB 2"]

Unnamed: 0,raw,position,bill,last,first,title,org,city,state
0,"Parks,\nUrsula (Legislative Budget Board)",On,HB 2,Parks,Ursula,,Legislative Budget Board,,
1,"Pulver,\nWayne (Texas Legislative Budget Board)",On,HB 2,Pulver,Wayne,,Texas Legislative Budget Board,,
2,"Rodell,\nLeora (Legislative Budget Board)",On,HB 2,Rodell,Leora,,Legislative Budget Board,,
3,"Ashley,\nPhillip (Comptroller)",On,HB 2,Ashley,Phillip,,Comptroller,,
4,"Borgstedte,\nAmy (Legislative Budget Board)",On,HB 2,Borgstedte,Amy,,Legislative Budget Board,,
5,"Keyton,\nSarah (Legislative Budget Board)",On,HB 2,Keyton,Sarah,,Legislative Budget Board,,
6,"Wales,\nMichael (Legislative Budget Board)",On,HB 2,Wales,Michael,,Legislative Budget Board,,
7,"Keyton,\nSarah (Legislative Budget Board)",On,HB 2,Keyton,Sarah,,Legislative Budget Board,,
8,"Parks,\nUrsula (Legislative Budget Board)",On,HB 2,Parks,Ursula,,Legislative Budget Board,,
9,"Pulver,\nWayne (Legislative Budget Board)",On,HB 2,Pulver,Wayne,,Legislative Budget Board,,


In [19]:
# Random spot-check of 40 rows. No random_state is passed, so the rows shown
# change on every run.
df.sample(n = 40)

Unnamed: 0,raw,position,bill,last,first,title,org,city,state
11437,"Holt,\nLynn (Self; Texas Justice Court Judges Association)",For,HB 1575,Holt,Lynn,,Self; Texas Justice Court Judges Association,,
18567,"Dudensing,\nJamie (Texas Association of Health Plans)",On,HB 2697,Dudensing,Jamie,,Texas Association of Health Plans,,
16855,"Archer,\nJimmy (Texas Department of Motor Vehicles)",On,HB 2433,Archer,Jimmy,,Texas Department of Motor Vehicles,,
4896,"Rocap,\nBlake (NARAL Pro-Choice Texas)",For,HB 443,Rocap,Blake,,NARAL Pro-Choice Texas,,
23606,"Kahn,\nSuzanne (Self)",Against,HB 3587,Kahn,Suzanne,,Self,,
9736,"Donahue,\nLaura (Self; Texas Humane Legislation Network)",For,HB 1357,Donahue,Laura,,Self; Texas Humane Legislation Network,,
18107,"Mills,\nWilliam (Self; Sheriff Association)",For,HB 2612,Mills,William,,Self; Sheriff Association,,
6097,"Mills,\nSarah (Texas Association for Home Care and Hospice)",For,HB 630,Mills,Sarah,,Texas Association for Home Care and Hospice,,
21186,"Berry,\nTraci (GoodwillCentral Texas)",For,HB 3130,Berry,Traci,,GoodwillCentral Texas,,
5241,"Sabo,\nJason (Children at Risk)",For,HB 491,Sabo,Jason,,Children at Risk,,


In [20]:
# Every houseWit row without its first element (the raw text).
noText = [line[1:] for line in houseWit]

def changeN(cell):
    """Return ``cell`` with every newline replaced by a single space.

    Anything that is not a string (including None) is returned unchanged.
    """
    if isinstance(cell, str):
        return cell.replace("\n", " ")
    return cell
    
# Replace the newlines in every cell of every row before export.
newList = [list(map(changeN, line)) for line in newList]
    

In [22]:
import csv

def export(dir, witList):
    """Write witness rows to a CSV file at path ``dir``.

    A fixed nine-column header row is written first, followed by every row of
    ``witList`` (None cells become empty fields). An existing file at ``dir``
    is overwritten. Returns None.
    """
    # newline='' hands line-ending control to the csv module, as its docs
    # require; without it, platforms that translate "\n" emit "\r\r\n".
    with open(dir, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['FullText', 'Position', 'Bill', 'LastName', 'FirstName', 'Role', 'Organization', 'City', 'State'])
        writer.writerows(witList) # better just to include the text maybe.
    return None

# Write the enriched House rows to CSV. The path is relative to the notebook's
# working directory and is overwritten if it already exists.
houseDir = '../data/HouseWitness.csv'
export(houseDir, newList)

In [25]:
# duplicating the House process for the Senate

folderName = 'bills/85R/witlistbill/html/senate_bills/'
senateWit = extractRows(folderName)

# Two merge passes, mirroring the House cells above.
senList = mergelines(mergelines(senateWit))

# Append last/first name, title, organization and city/state to each
# three-column row, then replace the newlines in every cell.
senList = [
    [changeN(cell)
     for cell in line + addName(line) + addTitle(line) + addOrg(line) + addCity(line)]
    for line in senList
]



In [26]:
# Write the enriched Senate rows to CSV. The path is relative to the
# notebook's working directory and is overwritten if it already exists.
senateDir = '../data/SenateWitness.csv'
export(senateDir, senList)