In [1]:
# The purpose of this notebook is to get a CSV of tabular data based on the witness lists.

# I used wget to collect ftp.legis.state.tx.us/bills/85R/witlistbill/html/
# and put them all together in a directory called '/bills/85R/witlistbill/html/'
# These files are not being stored in the github repo because they're too big.

# The published lists are in HTML exported from Microsoft Word, so the markup reflects 
# intended page layout rather than content.

from bs4 import BeautifulSoup
import re

In [2]:
endWithState = re.compile(r'[^\(](AL|AK|AS|AZ|AR|CA|CO|CT|DE|DC|FM|FL|GA|GU|HI|ID|IL|IN|IA|KS|KY|LA|ME|MH|MD|MA|MI|MN|MS|MO|MT|NE|NV|NH|NJ|NM|NY|NC|ND|MP|OH|OK|OR|PW|PA|PR|RI|SC|SD|TN|TX|Texas|UT|VT|VI|VA|WA|WV|WI|WY)\n?$')

In [3]:
def addName(line):
    """
    Takes in a string of text representing a line from a witness list file and, if it's able to identify
    the witness's name, returns a list containing a string for the last name and a string for the
    first name. The list contains "None" values if no name is found, or one string and one None if only
    a surname is found.
    """
    
    flags = [re.IGNORECASE, re.DOTALL]
    regexes = [r"^((?:[\w\'\-]+\s)*[\w\'\-]+),\n((?:\w+\s)*\w+)\xa0\xa0 ", 
              r"^((?:[\w\'\-]+\s)*[\w\'\-]+),\n((?:\w+\s)*\w+)\s+\(", 
              r"^((?:[\w\'\-]+\s)*[\w\'\-]+),[\n\s]+((?:[A-Z]\.)+)\s\s",
              r'^((?:\w+\s)*\w+),\s+([^\(\s]+) ', 
              r'^((?:\w+\s)*\w+),\s+((?:\w+\s)*\w+)  '] # third one for initials as first name
    partial = r'^(\w+)\s+\n\('  # Matches where someone put just a surname
    for f in flags:
        for r in regexes:
            nameRe = re.compile(r, f)
            match = re.search(nameRe, line)
            if match:
                return [match.group(1).strip(), match.group(2).strip()]
    
    # A separate case for the partial match
    nameRe = re.compile(partial)
    if re.search(nameRe, line):
        match = re.search(nameRe, line)
        return [match.group(1).strip(), None]
    
    # And a third case to leave the fields blank
    return [None, None]

def addTitle(line):
    """
    Takes in a string of text representing a line from a witness list file and, if it's able to identify
    the witness's job title, returns it as a string. Returns None otherwise.
    """
    
    regexes = [r',\s(?:(?:\w+\s)*\w+)\s+(.+)\s*\(also providing', r',\s(?:(?:\w+\s)*\w+)\s+(.+)\s*\(',
               '^(?:(?:\w+\s)*\w+),\s+(?:(?:\w+\s)*\w+)\s\s+(.+)\(also providing',
               '^(?:(?:\w+\s)*\w+),\s+(?:(?:\w+\s)*\w+)\s\s+(.+)\('
              #  r'^.+,\n.+\s+(.+)\s+\(also providing', r'^.+,\n.+\s+(.+)\s+\(',
              # ',\n?.+\xa0\xa0 ((?:\w+\s)*\w+)\s+(\(also providing written testimony\))?\s+\('
              ]
    for r in regexes:
        titleRe = re.compile(r)
        match = re.search(titleRe, line)
        if match:
            return match.group(1).strip()
    return None

def addOrg(line):
    """
    Takes in a string of text representing a line from a witness list file and, if it's able to identify
    the witness's organization, returns it as a string. Returns None otherwise.
    """
    
    flags = [re.IGNORECASE, re.DOTALL]
    regexes = [r'written testimony\)\s+\((.+)\)\s*(,|$)', 
              r'\s+\((.+)\)\s*(,|$)',]
    for f in flags:
        for r in regexes:
            orgRe = re.compile(r, f)
            match = re.search(orgRe, line)
            if match:
                return match.group(1).strip()
    return None

def addCity(line):
    """
    Takes in a string of text representing a line from a witness list file and, if it's able to identify
    the witness's city and state, returns a list containing a string for the last name and a string for the
    first name. The list contains "None" values otherwise.
    """
    
    extension = []
    cityRe = re.compile(r'\)\s?,\s(.*),\s(.*)$')
    match = re.search(cityRe, line)
    if match:
        return [match.group(1).strip(), match.group(2).strip()]
    return [None, None]


In [4]:
# I haven't made a corresponding function for Senate bills. Instead I
# reused this same function, which means the Senate data probably has
# more errors.

def HBWitness(witnessList):
    """
    Input is an HTML file from the Texas Legislature representing a witness list. The function iterates through
    the lines of HTML and populates a list with the lines representing witnesses. Each item in the list includes
    the text of the line of HTML, the word "For" "Against" or "On" depending on which section of the list the line
    came from, and the bill number. The function return the list.
    """
    
    fp = open(witnessList, encoding = 'cp1252')
    soup = BeautifulSoup(fp.read(), "html.parser")
    
    # each printed page has a class with Wordsection plus a number
    pages = soup.select("div[class^=WordSection]") 
    
    indent = re.compile(r"^\s{30}") # whitespace should mean it's a listing, page number, or the heading "WITNESS LIST"
    forStance = re.compile(r'^\s+FOR\s?:', re.IGNORECASE) # i flag means case-insensitive
    againstStance = re.compile(r'^\s+AGAINST\s?:', re.IGNORECASE)
    onStance = re.compile(r'^\s+ON\s?:', re.IGNORECASE)

    wit = []
    stance = "For"
    for page in pages:
        witpage = []
        pp = page.select("p")
        for line in pp:
            text = line.get_text()
            if re.match(forStance, text):
                stance = "For"
            if re.match(againstStance, text):
                stance = "Against"
            if re.match(onStance, text):
                stance = "On"
            if re.match(indent, text):
                witpage.append([line.get_text(), stance])
        wit.extend(witpage[1:-1])

    bill = soup.find('span', {'style':"color:windowtext;text-decoration:none"}) # identifies the bill
    bill = bill.get_text()
    for line in wit:
        line.extend([bill.strip()])
    
    wit = [x for x in wit if x != None]
    
    return wit


In [5]:
def mergelines(wit):
    """
    Input is a list of lists representing lines of text in a witness list document from the Legislature. 
    This function tries to determine whether whether consecutive lines refer to the same witness and need
    to be merged into one row of data. It returns a new list of lists, with those lines merged together.
    """
    
    changed = 0    
    newList = []
    badList = []
    tooShort = re.compile("^\w+\)?,?\s?(AL|AK|AS|AZ|AR|CA|CO|CT|DE|DC|FM|FL|GA|GU|HI|ID|IL\
                        |IN|IA|KS|KY|LA|ME|MH|MD|MA|MI|MN|MS|MO|MT|NE|NV|NH|NJ|NM|NY|NC\
                        |ND|MP|OH|OK|OR|PW|PA|PR|RI|SC|SD|TN|TX|Texas|UT|VT|VI|VA|WA|WV|WI|WY)?\n?$") # line shouldn't have just one word
    for row in wit:
        row[0] = row[0].strip()
    
    for row in wit[:-1]:
        lineIndex = wit.index(row)
        
        # Lines will get merged if they pass any of the three tests. Only if not will a line be accepted.
        
        if re.search(endWithState, wit[lineIndex + 1][0]) and not re.search(endWithState, row[0]) \
        and addName(row[0]) != [None, None] and addName(wit[lineIndex + 1][0]) == [None, None]:
            newList.append([row[0] + " " + wit[lineIndex + 1][0], row[1], row[2]])
            badList.append(wit[lineIndex + 1][0:3])
            changed += 1
        
        elif re.search(tooShort, wit[lineIndex + 1][0].strip()):
            newList.append([row[0] + " " + wit[lineIndex + 1][0], row[1], row[2]])
            badList.append(wit[lineIndex + 1][0:3])
            changed += 1
            
        elif row[0].count(')') != row[0].count('(') \
        and wit[lineIndex + 1][0].count(')') != wit[lineIndex + 1][0].count('(') \
        and addName(row[0]) != [None, None] and not re.search(endWithState, row[0]):
            newList.append([row[0] + " " + wit[lineIndex + 1][0], row[1], row[2]])
            badList.append(wit[lineIndex + 1][0:3])
            changed += 1
            
        elif row[0].count(')') != row[0].count('(') \
        and wit[lineIndex + 1][0].count(')') != wit[lineIndex + 1][0].count('(') \
        and addName(row[0]) != [None, None] \
        and addName(wit[lineIndex + 1][0]) == [None,None]:
            newList.append([row[0] + " " + wit[lineIndex + 1][0], row[1], row[2]])
            badList.append(wit[lineIndex + 1][0:3])
            changed += 1
            
        else:
            newList.append(row[0:3])
    
    answer = [x for x in newList if x not in badList]
    
    return answer


In [6]:
import shutil, os

def extractRows(folderName):
    """
    Input is the name of a folder full of witness list files. The function extracts the lines from each file,
    attempts to merge them into lists representing rows of data referring to one witness each, and returns 
    one long list of lists.
    """
    
    houseWit = []
    for folderName, subfolders, filenames in os.walk(folderName):
        for filename in filenames:
            if filename != ".DS_Store":
                source = folderName + filename
                wit = HBWitness(source)
                # trying to rejoin entries split across lines
                new = mergelines(wit)
                new = mergelines(new) # Will doing it twice catch 3-line entries?
                houseWit.extend(new)
    return houseWit
            
folderName = 'bills/85R/witlistbill/html/house_bills/'
houseWit = extractRows(folderName)

In [7]:
# Adding the last six fields to each row

def extendRow(row):
    """
    Input is a list representing a row of data about a witness. The function extends the list 
    with six more items and returns the extended list.
    """
    
    row.extend(addName(row[0]))
    row.append(addTitle(row[0]))
    row.append(addOrg(row[0]))
    row.extend(addCity(row[0]))
    return row

houseRows = [extendRow(row) for row in houseWit]

In [8]:
houseRows[:5]

[['Colyandro,\nJohn (Texas Conservative Coalition)',
  'For',
  'HB 3081',
  'Colyandro',
  'John',
  None,
  'Texas Conservative Coalition',
  None,
  None],
 ['Minick,\nStephen (Texas Asssociation of Business)',
  'For',
  'HB 3081',
  'Minick',
  'Stephen',
  None,
  'Texas Asssociation of Business',
  None,
  None],
 ['Norcross,\nRob (Consumer service alliance of texas)',
  'For',
  'HB 3081',
  'Norcross',
  'Rob',
  None,
  'Consumer service alliance of texas',
  None,
  None],
 ['Scurlock,\nStephen (Independent Bankers Association of Texas)',
  'For',
  'HB 3081',
  'Scurlock',
  'Stephen',
  None,
  'Independent Bankers Association of Texas',
  None,
  None],
 ['Allmon,\nJennifer (The Texas Catholic Conference of Bishops)',
  'Against',
  'HB 3081',
  'Allmon',
  'Jennifer',
  None,
  'The Texas Catholic Conference of Bishops',
  None,
  None]]

In [9]:
# pandas is only used for data exploration. 
# There are definitely still errors.

import pandas as pd
pd.options.display.max_colwidth = 500


df = pd.DataFrame(houseRows, columns=list(['raw', 'position', 'bill', 'last','first','title', 'org','city','state']))

In [10]:
# df[df['bill'] == "HB 3025"]

In [11]:
# These are the lines most likely to have problems

df[pd.isnull(df['org'])]
# df.first.isnull()

Unnamed: 0,raw,position,bill,last,first,title,org,city,state
1289,"Neavel,\nMD, Celia (Self; Texas Pediatric Society, Texas Medical Association, Texas",For,HB 2604,Neavel,"MD,",,,,
1290,"Association\nof Obstretics and Gynecology, Texas Academy of Family Practice, March of Dimes)",For,HB 2604,Association\nof Obstretics and Gynecology,Texas,,,,
1309,"Academy\nof Family Physicians, MOD, TX Care for Childre)",For,HB 2604,Academy\nof Family Physicians,"MOD,",,,,
2710,"Neavel,\nCelia Dr (also providing written testimony) (Self; Texas Pediatric Society,",For,HB 2466,Neavel,Celia,Dr,,,
2711,"Texas\nMedical Asocciation, Texas Academy of Family Physicians, March of Dimes, Texas",For,HB 2466,Texas\nMedical Asocciation,Texas,,,,
2712,"Association\nof Obstetrics and Gynecology), Austin, TX",For,HB 2466,,,,,Austin,TX
4004,"Roman\nMD, Heidi Ambulatory Medical Director (Rees-Jones Center for Foster Care",On,HB 7,,,Ambulatory Medical Director,,,
4005,"Excellence\nat Children's Health and UT Southwestern Medical Center), Dallas, TX",On,HB 7,,,,,Dallas,TX
8683,"Morrow\nPhD, Rosemary Clinical Asst. Professor; UTeach-Liberal Arts Program, The",On,HB 515,,,,,,
10662,"Hopper,\nCraig Chair, Estate & Trust Legislative Affairs Committee, Real Estate\nProbate",For,HB 1974,Hopper,Craig,,,,


In [12]:
# df[df['bill'] == "HB 2"]

In [13]:
# Looking at a random sample

df.sample(n = 40)

Unnamed: 0,raw,position,bill,last,first,title,org,city,state
6957,"Noble,\nShannon (Texas Counseling Association)",Against,HB 3859,Noble,Shannon,,Texas Counseling Association,,
978,"Lee,\nDonald (Texas Conference of Urban Counties)",For,HB 382,Lee,Donald,,Texas Conference of Urban Counties,,
17954,"Russell,\nClaudia (El Paso County)",Against,HB 1417,Russell,Claudia,,El Paso County,,
9783,"Frandsen,\nChris (Self)",For,HB 1486,Frandsen,Chris,,Self,,
15549,"Skaggs,\nJason (Texas and Southwestern Cattle Raisers Association)",For,HB 3417,Skaggs,Jason,,Texas and Southwestern Cattle Raisers Association,,
14688,"Wachs,\nMichael (Self) , Houston, TX",For,HB 490,Wachs,Michael,,Self,Houston,TX
2300,"Stevens,\nAlan (Baylor scott and white health and Texas hospital association)",For,HB 2425,Stevens,Alan,,Baylor scott and white health and Texas hospital association,,
1819,"Raun,\nLowell (Self; Texas rice producers group, coastal bend gcd)",For,HB 3166,Raun,Lowell,,"Self; Texas rice producers group, coastal bend gcd",,
13603,"Arellano,\nRobert (Self; Tejano music awards)",For,HB 2495,Arellano,Robert,,Self; Tejano music awards,,
21297,"Gustafson,\nLindsay (Texas Classroom Teachers Association)",For,HB 3684,Gustafson,Lindsay,,Texas Classroom Teachers Association,,


In [14]:
def changeN(cell):
    if cell != None:
        cell = cell.replace("\n", " ")
    return cell
    
houseRows = [[changeN(cell) for cell in line] for line in houseRows]
    

In [15]:
import csv

def export(dir, witList):
    with open(dir,'w') as f:
        writer = csv.writer(f)
        writer.writerow(['FullText', 'Position', 'Bill', 'LastName', 'FirstName', 'Role', 'Organization', 'City', 'State'])
        writer.writerows(witList) # better just to include the FullText field.
    return None

houseDir = '../data/witness-lists/HouseWitness.csv'
export(houseDir, houseRows)

In [16]:
# duplicating the House process for the Senate

folderName = 'bills/85R/witlistbill/html/senate_bills/'
senateWit = extractRows(folderName)
senList = [extendRow(row) for row in senateWit]
senList = [[changeN(cell) for cell in row] for row in senList]

In [17]:
senateDir = '../data/witness-lists/SenateWitness.csv'
export(senateDir, senList)

In [18]:
# Just looking at a sample of the data. Maybe 95% correct.

sen = pd.DataFrame(senList, columns=list(['raw', 'position', 'bill', 'last','first','title', 'org','city','state']))
sen.sample(n = 40)

Unnamed: 0,raw,position,bill,last,first,title,org,city,state
5456,"Stewart, Kevin (Texas Nurses Association), Austin, TX",For,SB 922,Stewart,Kevin,,Texas Nurses Association,Austin,TX
7832,"Freeman, Ray Executive Director (Equity Center), Austin, TX",For,SB 2142,Freeman,Ray,Executive Director,Equity Center,Austin,TX
16906,"Gore, Rex Citizen/Charter board member (Self) , Austin, TX",For,SB 1480,Gore,Rex,Citizen/Charter board member,Self,Austin,TX
15850,"Sanders, Cynthia (Self) , Austin, TX",Against,SB 4,Sanders,Cynthia,,Self,Austin,TX
15588,"Moehnke, Amy (Self) , Austin, TX",Against,SB 4,Moehnke,Amy,,Self,Austin,TX
6445,"Kelley, Scott Executive Vice Chancellor (UT System Administration), Austin, TX",On,SB 1,Kelley,Scott,Executive Vice Chancellor,UT System Administration,Austin,TX
11524,"Banks, Yannis (Texas NAACP), Austin, TX",For,SB 1849,Banks,Yannis,,Texas NAACP,Austin,TX
6734,"Dell'Antonio, Andrew University Professor (Myself and my daughter), Round Rock",On,SB 1,Dell'Antonio,Andrew,University Professor,Myself and my daughter,,
12738,"Lee, Donald Executive Director (Texas Conference of Urban Counties), Austin, TX",For,SB 674,Lee,Donald,Executive Director,Texas Conference of Urban Counties,Austin,TX
13331,"Rau, Seth Legislative Coordinator (San Antonio ISD), San Antonio, TX",On,SB 1398,Rau,Seth,Legislative Coordinator,San Antonio ISD,San Antonio,TX


In [19]:
sen.describe()

Unnamed: 0,raw,position,bill,last,first,title,org,city,state
count,17035,17035,17035,16641,16610,15141.0,16883,14267,14267
unique,12500,3,827,5304,2607,3014.0,4623,799,33
top,"Banks, Yannis (Texas NAACP), Austin, TX",Against,SB 4,Smith,John,,Self,Austin,TX
freq,34,7825,2354,154,278,6904.0,5990,8272,14165
