In [1]:
# The purpose of this notebook is to get a CSV of tabular data based on the witness lists.

# I used wget to collect ftp.legis.state.tx.us/bills/85R/witlistbill/html/senate_bills/
# and put them all together in a directory called '/bills/85R/witlistbill/html/senate_bills/'
# These files are not being stored in the github repo because they're too big.

# The published lists are in HTML exported from Microsoft Word, so the markup reflects 
# intended page layout rather than content.

from bs4 import BeautifulSoup
import re

In [2]:
endWithState = re.compile(r'[^\(](AL|AK|AS|AZ|AR|CA|CO|CT|DE|DC|FM|FL|GA|GU|HI|ID|IL|IN|IA|KS|KY|LA|ME|MH|MD|MA|MI|MN|MS|MO|MT|NE|NV|NH|NJ|NM|NY|NC|ND|MP|OH|OK|OR|PW|PA|PR|RI|SC|SD|TN|TX|Texas|UT|VT|VI|VA|WA|WV|WI|WY)\n?$')

In [3]:
def addName(line):
    flags = [re.IGNORECASE, re.DOTALL]
    regexes = [r"^((?:[\w\'\-]+\s)*[\w\'\-]+),\n((?:\w+\s)*\w+)\xa0\xa0 ", 
              r"^((?:[\w\'\-]+\s)*[\w\'\-]+),\n((?:\w+\s)*\w+)\s+\(", 
              r"^((?:[\w\'\-]+\s)*[\w\'\-]+),[\n\s]+((?:[A-Z]\.)+)\s\s",
              r'^((?:\w+\s)*\w+),\s+([^\(\s]+) ', 
              r'^((?:\w+\s)*\w+),\s+((?:\w+\s)*\w+)  '] # third one for initials as first name
    partial = r'^(\w+)\s+\n\('  # Matches where someone put just a surname
    for f in flags:
        for r in regexes:
            nameRe = re.compile(r, f)
            match = re.search(nameRe, line[0])
            if match:
                extension = [match.group(1).strip(), match.group(2).strip()]
                return extension
    
    # A separate case for the partial match
    nameRe = re.compile(partial)
    if re.search(nameRe, line[0]):
        match = re.search(nameRe, line[0])
        extension = [match.group(1).strip(), None]
        return extension
    
    # And a third case to leave the fields blank
    extension = [None, None]
    return extension

def addTitle(line):
    regexes = [r',\s(?:(?:\w+\s)*\w+)\s+(.+)\s*\(also providing', r',\s(?:(?:\w+\s)*\w+)\s+(.+)\s*\(',
               '^(?:(?:\w+\s)*\w+),\s+(?:(?:\w+\s)*\w+)\s\s+(.+)\(also providing',
               '^(?:(?:\w+\s)*\w+),\s+(?:(?:\w+\s)*\w+)\s\s+(.+)\('
              #  r'^.+,\n.+\s+(.+)\s+\(also providing', r'^.+,\n.+\s+(.+)\s+\(',
              # ',\n?.+\xa0\xa0 ((?:\w+\s)*\w+)\s+(\(also providing written testimony\))?\s+\('
              ]
    for r in regexes:
        titleRe = re.compile(r)
        match = re.search(titleRe, line[0])
        if match:
            extension = [match.group(1).strip()]
            return extension
    extension = [None]
    return extension

def addOrg(line):
    flags = [re.IGNORECASE, re.DOTALL]
    regexes = [r'written testimony\)\s+\((.+)\)\s*(,|$)', 
              r'\s+\((.+)\)\s*(,|$)',]
    for f in flags:
        for r in regexes:
            orgRe = re.compile(r, f)
            match = re.search(orgRe, line[0])
            if match:
                extension = [match.group(1).strip()]
                return extension
    extension = [None]
    return extension

def addCity(line):
    extension = [None, None]
    cityRe = re.compile(r'\)\s?,\s(.*),\s(.*)$')
    match = re.search(cityRe, line[0])
    if match:
        extension = [match.group(1).strip(), match.group(2).strip()]
    return extension

addName(["Parks,\nUrsula (Legislative Budget Board)","HB 2","For"])

['Parks', 'Ursula']

In [4]:
# I haven't made a corresponding function for Senate bills. Instead I
# reused this same function, which means the Senate data probably has
# more errors.

def HBWitness(witnessList):
    fp = open(witnessList, encoding = 'cp1252')
    soup = BeautifulSoup(fp.read(), "html.parser")
    
    # each printed page has a class with Wordsection plus a number
    pages = soup.select("div[class^=WordSection]") 
    
    indent = re.compile(r"^\s{30}") # whitespace should mean it's a listing, page number, or the heading "WITNESS LIST"
    forStance = re.compile(r'^\s+FOR\s?:', re.IGNORECASE) # i flag means case-insensitive
    againstStance = re.compile(r'^\s+AGAINST\s?:', re.IGNORECASE)
    onStance = re.compile(r'^\s+ON\s?:', re.IGNORECASE)

    wit = []
    stance = "For"
    for page in pages:
        witpage = []
        pp = page.select("p")
        for line in pp:
            text = line.get_text()
            if re.match(forStance, text):
                stance = "For"
            if re.match(againstStance, text):
                stance = "Against"
            if re.match(onStance, text):
                stance = "On"
            if re.match(indent, text):
                witpage.append([line.get_text(), stance])
        wit.extend(witpage[1:-1])

    bill = soup.find('span', {'style':"color:windowtext;text-decoration:none"}) # identifies the bill
    bill = bill.get_text()
    for line in wit:
        line.extend([bill.strip()])
    
    wit = [x for x in wit if x != None]
    
    return wit


In [5]:
def mergelines(wit):
    changed = 0    
    newList = []
    badList = []
    tooShort = re.compile("^\w+\)?,?\s?(AL|AK|AS|AZ|AR|CA|CO|CT|DE|DC|FM|FL|GA|GU|HI|ID|IL\
                        |IN|IA|KS|KY|LA|ME|MH|MD|MA|MI|MN|MS|MO|MT|NE|NV|NH|NJ|NM|NY|NC\
                        |ND|MP|OH|OK|OR|PW|PA|PR|RI|SC|SD|TN|TX|Texas|UT|VT|VI|VA|WA|WV|WI|WY)?\n?$") # line shouldn't have just one word
    for line in wit:
        line[0] = line[0].strip()
    
    for line in wit[:-1]:
        lineIndex = wit.index(line)
        
        # Lines will get merged if they pass any of the three tests. Only if not will a line be accepted.
        
        if re.search(endWithState, wit[lineIndex + 1][0]) and not re.search(endWithState, line[0]) \
        and addName(line) != [None, None] and addName(wit[lineIndex + 1]) == [None, None]:
            newList.append([line[0] + " " + wit[lineIndex + 1][0], line[1], line[2]])
            badList.append(wit[lineIndex + 1][0:3])
            changed += 1
        
        elif re.search(tooShort, wit[lineIndex + 1][0].strip()):
            newList.append([line[0] + " " + wit[lineIndex + 1][0], line[1], line[2]])
            badList.append(wit[lineIndex + 1][0:3])
            changed += 1
            
        elif line[0].count(')') != line[0].count('(') \
        and wit[lineIndex + 1][0].count(')') != wit[lineIndex + 1][0].count('(') \
        and addName(line) != [None, None] and not re.search(endWithState, line[0]):
            newList.append([line[0] + " " + wit[lineIndex + 1][0], line[1], line[2]])
            badList.append(wit[lineIndex + 1][0:3])
            changed += 1
            
        elif line[0].count(')') != line[0].count('(') \
        and wit[lineIndex + 1][0].count(')') != wit[lineIndex + 1][0].count('(') \
        and addName(line) != [None, None] \
        and addName(wit[lineIndex + 1]) == [None,None]:
            newList.append([line[0] + " " + wit[lineIndex + 1][0], line[1], line[2]])
            badList.append(wit[lineIndex + 1][0:3])
            changed += 1
            
        else:
            newList.append(line[0:3])
    
    answer = [x for x in newList if x not in badList]
    
    return answer


In [6]:
import shutil, os

def extractRows(folderName):
    
    houseWit = []
    for folderName, subfolders, filenames in os.walk(folderName):
        for filename in filenames:
            if filename != ".DS_Store":
                source = folderName + filename
                wit = HBWitness(source)
                # print(str(changed) + " lines changed.")
                houseWit.extend(wit)
    return houseWit
            
folderName = 'bills/85R/witlistbill/html/house_bills/'
houseWit = extractRows(folderName)

In [7]:
# houseWit[:5]

In [8]:
# trying to rejoin entries split across lines
newList = mergelines(houseWit)

newList = mergelines(newList) # Will doing it twice catch 3-line entries?

In [9]:
# Adding the last six fields to each row

for line in newList:
    line.extend(addName(line))
    line.extend(addTitle(line))
    line.extend(addOrg(line))
    line.extend(addCity(line))

In [10]:
# pandas is only used for data exploration. 
# There are definitely still errors.

import pandas as pd
pd.options.display.max_colwidth = 500


df = pd.DataFrame(newList, columns=list(['raw', 'position', 'bill', 'last','first','title', 'org','city','state']))

In [11]:
# df[df['bill'] == "HB 3025"]

In [12]:
# These are the lines most likely to have problems

df[pd.isnull(df['org'])]
# df.first.isnull()

Unnamed: 0,raw,position,bill,last,first,title,org,city,state
24,"Van\nDen Bent, Jerre Physical therapist (also providing written testimony) \n(Therapy",On,HB 2,,,Physical therapist,,,
25,"2000),\nDallas, TX",On,HB 2,,,,,Dallas,TX
296,"Roman\nMD, Heidi Ambulatory Medical Director (Rees-Jones Center for Foster Care",On,HB 7,,,Ambulatory Medical Director,,,
297,"Excellence\nat Children's Health and UT Southwestern Medical Center), Dallas, TX",On,HB 7,,,,,Dallas,TX
982,"Ramon\nButts, Edna Director, Intergovernmental Relations & Policy Oversight \n(Austin",On,HB 21,,,"Director, Intergovernmental Relations & Policy Oversight",,,
983,"ISD),\nAustin, TX",On,HB 21,,,,,Austin,TX
5232,"Houston,\nTx, TX",For,HB 490,Houston,"Tx,",,,,
5260,"Gear\nJr., Bob Director, Texas Veterans Leadership Program (Texas Workforce",On,HB 493,,,"Director, Texas Veterans Leadership Program",,,
5261,"Commission),\nAustin, TX",On,HB 493,,,,,Austin,TX
5436,"Morrow\nPhD, Rosemary Clinical Asst. Professor; UTeach-Liberal Arts Program, The",On,HB 515,,,,,,


In [13]:
# df[df['bill'] == "HB 2"]

In [14]:
# Looking at a random sample

df.sample(n = 40)

Unnamed: 0,raw,position,bill,last,first,title,org,city,state
16692,"Allred,\nAnna (Texas Society of Anesthesiologists)",For,HB 2397,Allred,Anna,,Texas Society of Anesthesiologists,,
1041,"White,\nMichael (Texas Construction Association)",For,HB 22,White,Michael,,Texas Construction Association,,
15185,"Parkinson,\nThomas (Self)",For,HB 2115,Parkinson,Thomas,,Self,,
7323,"Suhm,\nVic (Self; Tarrant Regional Transportation Coalition)",For,HB 890,Suhm,Vic,,Self; Tarrant Regional Transportation Coalition,,
23304,"Wiggins,\nMark (Association of Texas Professional Educators)",For,HB 3526,Wiggins,Mark,,Association of Texas Professional Educators,,
17153,"Francis,\nWill (National Association of Social Workers - Texas Chapter), Austin, TX",For,HB 2466,Francis,Will,,National Association of Social Workers - Texas Chapter,Austin,TX
11142,"Travis,\nClayton (Texas Pediatric Society)",For,HB 1549,Travis,Clayton,,Texas Pediatric Society,,
20413,"Conkwright,\nJim (Prairielands Groundwater Conservation District)",For,HB 3025,Conkwright,Jim,,Prairielands Groundwater Conservation District,,
5537,"Green,\nJoseph (Travis County Commissioners Court)",Against,HB 532,Green,Joseph,,Travis County Commissioners Court,,
9796,"Sanford,\nTiana (Montgomery County District Attorney's Office)",For,HB 1357,Sanford,Tiana,,Montgomery County District Attorney's Office,,


In [15]:
def changeN(cell):
    if cell != None:
        cell = cell.replace("\n", " ")
    return cell
    
newList = [[changeN(cell) for cell in line] for line in newList]
    

In [16]:
import csv

def export(dir, witList):
    with open(dir,'w') as f:
        writer = csv.writer(f)
        writer.writerow(['FullText', 'Position', 'Bill', 'LastName', 'FirstName', 'Role', 'Organization', 'City', 'State'])
        writer.writerows(witList) # better just to include the text maybe.
    return None

houseDir = '../data/witness-lists/HouseWitness.csv'
export(houseDir, newList)

In [17]:
# duplicating the House process for the Senate

folderName = 'bills/85R/witlistbill/html/senate_bills/'
senateWit = extractRows(folderName)
senList = mergelines(senateWit)
senList = mergelines(senList)
for line in senList:
    line.extend(addName(line))
    line.extend(addTitle(line))
    line.extend(addOrg(line))
    line.extend(addCity(line))
senList = [[changeN(cell) for cell in line] for line in senList]



In [18]:
senateDir = '../data/witness-lists/SenateWitness.csv'
export(senateDir, senList)

In [19]:
# Just looking at a sample of the data. Maybe 95% correct.

sen = pd.DataFrame(senList, columns=list(['raw', 'position', 'bill', 'last','first','title', 'org','city','state']))
sen.sample(n = 40)

Unnamed: 0,raw,position,bill,last,first,title,org,city,state
17242,"Herring, Megan (First 3 Years)",For,SB 1839,Herring,Megan,,First 3 Years,,
16897,"Garcia, Imelda Director, Infectious Disease Prevention (Dept. of State Health Services), Austin, TX",On,SB 1683,Garcia,Imelda,"Director, Infectious Disease Prevention",Dept. of State Health Services,Austin,TX
15876,"Bostur, Jill (Self) , Austin, TX",Against,SB 1443,Bostur,Jill,,Self,Austin,TX
7909,"Eaton, Holly Director of Professional Development & Advocacy (TX Classroom Teachers Assoc.), Austin, TX",For,SB 160,Eaton,Holly,Director of Professional Development & Advocacy,TX Classroom Teachers Assoc.,Austin,TX
14090,"Bowden, Rachel (Texas Department of Insurance), Austin, TX",On,SB 1076,Bowden,Rachel,,Texas Department of Insurance,Austin,TX
6796,"Gibson, Judson (Self) , Austin, TX",Against,SB 13,Gibson,Judson,,Self,Austin,TX
4894,"Dallas, James (Self) , Austin, TX",Against,SB 6,Dallas,James,,Self,Austin,TX
14028,"Kennedy, Ted Deputy Head Of State Governmental Affairs / AIG (AIG), Houston, TX",For,SB 1071,Kennedy,Ted,Deputy Head Of State Governmental Affairs / AIG,AIG,Houston,TX
7540,"Kimberly, Knox SVP - Advocacy and Stakeholder Relations (Upbring), Austin, TX",For,SB 74,Kimberly,Knox,SVP - Advocacy and Stakeholder Relations,Upbring,Austin,TX
10045,"Lovatos, Amber (Self; TDHA), Houston, TX",For,SB 430,Lovatos,Amber,,Self; TDHA,Houston,TX


In [20]:
sen.describe()

Unnamed: 0,raw,position,bill,last,first,title,org,city,state
count,18899,18899,18899,18485,18453,16724.0,18745,15811,15811
unique,13439,3,978,5498,2668,3265.0,4974,815,34
top,"Banks, Yannis (Texas NAACP), Austin, TX",For,SB 4,Smith,John,,Self,Austin,TX
freq,34,8116,2358,184,307,7289.0,6119,9378,15703
