In [1]:
"""
legislators_tlo.html was copied from the source of
http://www.legis.state.tx.us/Search/BillSearchLegislatorList.aspx?ID=usrLegislatorsFolder$cboAuthor&Leg=85
"""

import html
import re

import numpy as np
import pandas as pd

# from bs4 import BeautifulSoup

# One legislator per <option> line of the saved TLO dropdown, e.g.
#   <TAB><option value="A2100">Allen (H-A2100)</option>
# Groups: 1 = TLO id, 2 = last name, 4 = first name (None when the entry has no
# ", First" part), 5 = chamber letter (H or S).
# Raw string so the regex escapes (\t, \( and \)) reach `re` unchanged.
pattern = r"""\t<option value="(A[0-9]{3,5})">([^,]+)(, (.+))? \((H|S)-A[0-9]{3,5}\)</option>"""

# Context manager: the handle is closed even if reading fails.
with open('../data/inputs/legislators_tlo.html', encoding = 'utf-8') as fp:
    lines = fp.readlines()

# Rows of [TLO_id, last_name, first_name, office code].
leg = []

for line in lines:
    match = re.search(pattern, line)
    if match is None:
        if "<option" in line:
            # An option row we cannot parse would silently drop a legislator;
            # fail loudly and show the offending line instead.
            raise ValueError("Unrecognised legislator option line: %r" % line)
        # Blank lines or surrounding markup carry no legislator; skip them.
        continue
    office = "STATEREP" if match.group(5) == "H" else "STATESEN"
    leg.append([match.group(1), match.group(2), match.group(4), office])
        

In [2]:
def substitute(string):
    """Decode HTML character references in a scraped name.

    The TLO page source encodes accented letters and quotes as references such
    as ``&#233;`` or ``&quot;``. Every named or numeric reference is decoded,
    not only the five that were previously listed by hand.

    Parameters
    ----------
    string : object
        Name text, or a non-string placeholder (e.g. ``None`` for a missing
        first name).

    Returns
    -------
    object
        The decoded text for ``str`` input; any other value is returned
        unchanged.
    """
    if isinstance(string, str):
        return html.unescape(string)
    return string

# Decode the last-name (index 1) and first-name (index 2) fields of every row.
# The entry listed as plain "Lucio" is relabelled "Lucio Jr."; parsesuffix later
# moves the " Jr." part into its own column.
for row in leg:
    row[1], row[2] = substitute(row[1]), substitute(row[2])
    if row[1] == "Lucio":
        row[1] = "Lucio Jr."


In [3]:
# Turn the scraped rows into a table; ctaSeekOfficeCd holds the
# STATEREP / STATESEN code assigned while parsing.
leg = pd.DataFrame(
    leg,
    columns=["TLO_id", "last_name", "first_name", "ctaSeekOfficeCd"],
)

In [4]:
# splitting off nicknames and suffixes in the data from TLO

def parsenickname(first):
    """Split a trailing quoted or parenthesised nickname off a first name.

    A nickname is a single run of word characters wrapped in double quotes or
    parentheses at the very end of the text and preceded by a space, e.g.
    ``Charles "Doc"`` or ``Eduardo A. (Eddie)``.

    Parameters
    ----------
    first : object
        First-name text; may be ``None``/NaN when the source has no first name.

    Returns
    -------
    tuple
        ``(first_without_nickname, nickname)``. When ``first`` is not a string
        or carries no nickname, ``(first, np.nan)`` is returned.
    """
    nicknamePattern = r' ["(](\w+)[")]$'
    # Missing values (None / NaN) have nothing to split.
    if not isinstance(first, str):
        return first, np.nan
    match = re.search(nicknamePattern, first)
    if match is None:
        return first, np.nan
    # Cut exactly where the match starts; splitting on the matched text would
    # cut at its first occurrence, which need not be the trailing one.
    return first[:match.start()], match.group(1)
    
def parsesuffix(last):
    """Split a trailing generational suffix off a last name.

    Recognised suffixes are ``Jr.``, ``Sr.`` and ``III``, preceded by a space
    at the very end of the text (e.g. ``Lucio Jr.``).

    Parameters
    ----------
    last : object
        Last-name text; may be ``None``/NaN.

    Returns
    -------
    tuple
        ``(last_without_suffix, suffix)``. When ``last`` is not a string or
        has no recognised suffix, ``(last, np.nan)`` is returned.
    """
    suffixPattern = r" (Jr\.|Sr\.|III)$"
    # Missing values (None / NaN) have nothing to split.
    if not isinstance(last, str):
        return last, np.nan
    match = re.search(suffixPattern, last)
    if match is None:
        return last, np.nan
    # Cut exactly where the match starts rather than splitting on its text.
    return last[:match.start()], match.group(1)

# Each parser returns a (remaining name, extracted part) pair per row;
# zip(*...) regroups those pairs into one sequence per output column.
first_names, nicknames = zip(*leg["first_name"].map(parsenickname))
leg["first_name"] = first_names
leg["filerNameShort"] = nicknames

last_names, name_suffixes = zip(*leg["last_name"].map(parsesuffix))
leg["last_name"] = last_names
leg["suffixes"] = name_suffixes

leg.head(5)

Unnamed: 0,TLO_id,last_name,first_name,ctaSeekOfficeCd,filerNameShort,suffixes
0,A2100,Allen,,STATEREP,,
1,A2125,Alonzo,,STATEREP,,
2,A2135,Alvarado,,STATEREP,,
3,A2150,Anchia,,STATEREP,,
4,A2155,Anderson,Charles,STATEREP,Doc,


In [5]:
# Legislator table from an Open States bulk download, indexed by its leg_id.
# "district" is read as object: the author wanted integer-like values with NaN
# allowed and was not sure this is the best dtype for that.
openstates = pd.read_csv(
    "../data/inputs/openstates/2017-06-02-tx-csv/tx_legislators.csv",
    index_col="leg_id",
    dtype={"district": object},
)

# Same nickname split as for the TLO names: the nickname goes to filerNameShort.
os_first_names, os_nicknames = zip(*openstates["first_name"].map(parsenickname))
openstates["first_name"] = os_first_names
openstates["filerNameShort"] = os_nicknames

In [6]:
# some of the openstates rows include the TLO id as part of the URL to the legislator's photo.

import re

def parseTLO(url):
    """Extract the TLO legislator id from a photo URL.

    The id is recognised only when the URL ends in ``small/<id>.jpg``, where
    ``<id>`` is ``A`` followed by 3 to 5 digits.

    Parameters
    ----------
    url : object
        Photo URL text; may be NaN/None when no photo is recorded.

    Returns
    -------
    str or float
        The TLO id, or ``np.nan`` when ``url`` is not a string or does not end
        in the expected pattern.
    """
    urlPattern = r"small/(A[0-9]{3,5})\.jpg$"
    # Missing URLs cannot carry an id.
    if not isinstance(url, str):
        return np.nan
    match = re.search(urlPattern, url)
    return match.group(1) if match is not None else np.nan
    
# Derive the TLO id from each photo URL; rows without a recognisable URL get NaN.
openstates["TLO_id"] = openstates["photo_url"].map(parseTLO)

In [7]:
# Upper-chamber rows whose TLO id could not be read from photo_url.
# Both conditions are combined into one mask: chaining two boolean filters
# applies a mask built on the full frame to an already filtered frame, which
# makes pandas reindex the mask and emit a UserWarning.
openstates[openstates["TLO_id"].isnull() & (openstates["chamber"] == "upper")]

  if __name__ == '__main__':


Unnamed: 0_level_0,full_name,first_name,middle_name,last_name,suffixes,nickname,active,state,chamber,district,party,photo_url,created_at,updated_at,filerNameShort,TLO_id
leg_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
TXL000198,"Eddie Lucio, Jr.",Eduardo A.,,Lucio,Jr.,,True,tx,upper,27,Democrat,http://www.house.state.tx.us/photos/members/36...,2010-06-19 03:51:42.083000,2017-06-01 10:38:26.283000,Eddie,
TXL000312,José Menéndez,José,,Menéndez,,,True,tx,upper,26,Democrat,http://www.legdir.legis.state.tx.us/FlashCardD...,2010-06-19 03:51:42.467000,2017-06-01 10:38:26.641000,,
TXL000705,Dawn Buckingham,Dawn,,Buckingham,,,True,tx,upper,24,Republican,,2017-01-18 03:03:18.301000,2017-06-01 10:38:26.639000,,
TXL000707,Borris L. Miles,Borris L.,,Miles,,,True,tx,upper,13,Democrat,,2017-01-18 03:03:18.403000,2017-06-01 10:38:26.414000,,
TXL000716,Bryan Hughes,Bryan,,Hughes,,,True,tx,upper,1,Republican,,2017-01-27 05:42:08.204000,2017-06-01 10:38:26.497000,,


In [8]:
# The five senators listed above have no TLO id in their photo URL, so their
# ids are filled in by hand, keyed by Open States leg_id.
tloID = {"TXL000198": "A1300", 
        "TXL000312": "A1110",
        "TXL000705": "A1125",
        "TXL000707": "A1115",
        "TXL000716": "A1135"}

# DataFrame.set_value was deprecated and later removed from pandas;
# .at is the supported scalar setter for a (row label, column) pair.
for leg_id, tlo_id in tloID.items():
    openstates.at[leg_id, "TLO_id"] = tlo_id


In [9]:
# Check the upper-chamber rows now that the missing TLO ids are filled in.
openstates.loc[openstates["chamber"] == "upper"]

Unnamed: 0_level_0,full_name,first_name,middle_name,last_name,suffixes,nickname,active,state,chamber,district,party,photo_url,created_at,updated_at,filerNameShort,TLO_id
leg_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
TXL000190,Craig Estes,Craig,,Estes,,,True,tx,upper,30,Republican,http://www.legdir.legis.state.tx.us/FlashCardD...,2010-06-19 03:51:42.063000,2017-06-01 10:38:26.268000,,A1180
TXL000195,Juan Hinojosa,Juan,,Hinojosa,,,True,tx,upper,20,Democrat,http://www.legdir.legis.state.tx.us/FlashCardD...,2010-06-19 03:51:42.076000,2017-06-01 10:38:26.808000,Chuy,A1250
TXL000196,Joan Huffman,Joan,,Huffman,,,True,tx,upper,17,Republican,http://www.legdir.legis.state.tx.us/FlashCardD...,2010-06-19 03:51:42.078000,2017-06-01 10:38:26.752000,,A1260
TXL000198,"Eddie Lucio, Jr.",Eduardo A.,,Lucio,Jr.,,True,tx,upper,27,Democrat,http://www.house.state.tx.us/photos/members/36...,2010-06-19 03:51:42.083000,2017-06-01 10:38:26.283000,Eddie,A1300
TXL000199,Jane Nelson,Jane,,Nelson,,,True,tx,upper,12,Republican,http://www.legdir.legis.state.tx.us/FlashCardD...,2010-06-19 03:51:42.086000,2017-06-01 10:38:26.740000,,A1450
TXL000200,Robert Nichols,Robert,,Nichols,,,True,tx,upper,3,Republican,http://www.legdir.legis.state.tx.us/FlashCardD...,2010-06-19 03:51:42.088000,2017-06-01 10:38:26.798000,,A1400
TXL000203,Kel Seliger,Kel,,Seliger,,,True,tx,upper,31,Republican,http://www.legdir.legis.state.tx.us/FlashCardD...,2010-06-19 03:51:42.095000,2017-06-01 10:38:26.762000,,A1460
TXL000206,Carlos Uresti,Carlos I.,,Uresti,,,True,tx,upper,19,Democrat,http://www.legdir.legis.state.tx.us/FlashCardD...,2010-06-19 03:51:42.103000,2017-06-01 10:38:26.767000,,A1605
TXL000208,Kirk Watson,Kirk,,Watson,,,True,tx,upper,14,Democrat,http://www.legdir.legis.state.tx.us/FlashCardD...,2010-06-19 03:51:42.107000,2017-06-01 10:38:26.769000,,A1610
TXL000210,Royce West,Royce,,West,,,True,tx,upper,23,Democrat,http://www.legdir.legis.state.tx.us/FlashCardD...,2010-06-19 03:51:42.112000,2017-06-01 10:38:26.772000,,A1625


In [10]:
# Left join: keep every TLO legislator and attach the Open States row that shares
# its TLO_id. reset_index() turns leg_id back into an ordinary column first.
tloOpen = leg.merge(openstates.reset_index(), how='left', on='TLO_id')

In [11]:
# Restrict the merged table to the rows tagged as senators in the TLO scrape.
senate = tloOpen.loc[tloOpen["ctaSeekOfficeCd"] == "STATESEN"]

In [12]:
# Filer list from the Texas Ethics Commission (TEC) campaign-finance CSV export.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, so no column ends up with mixed types.
# filerHoldOfficeDistrict is read as text because it is later renamed to
# "district" and merged against the Open States district, which is also read
# as object.
tx = pd.read_csv(
    "../data/inputs/TEC_CF_CSV/filers.csv",
    dtype={"filerHoldOfficeDistrict": object},
    low_memory=False,
)

  interactivity=interactivity, compiler=compiler, result=result)


In [13]:
# Keep only filers of type COH (presumably candidate/officeholder -- confirm
# against the TEC code list).
tx = tx.loc[tx["filerTypeCd"] == 'COH']

In [14]:
# Senate only: 'STATEREP' was dropped from the list, and the filter uses the
# office held (filerHoldOfficeCd) instead of the office sought (ctaSeekOfficeCd).
# Despite its name, txRep therefore holds senate filers.
# The two conditions are combined into a single mask; chaining two boolean
# filters makes pandas reindex the second mask and emit a UserWarning.
holds_senate_seat = tx['filerHoldOfficeCd'].isin(['STATESEN'])
not_flagged_out_of_office = tx['filerFilerpersStatusCd'] != "NOT_OFFICEHOLDER"
txRep = tx[holds_senate_seat & not_flagged_out_of_office]



In [15]:
# First five senators from the TLO + Open States merge.
senate.head(5)

Unnamed: 0,TLO_id,last_name_x,first_name_x,ctaSeekOfficeCd,filerNameShort_x,suffixes_x,leg_id,full_name,first_name_y,middle_name,...,nickname,active,state,chamber,district,party,photo_url,created_at,updated_at,filerNameShort_y
150,A1055,Bettencourt,,STATESEN,,,TXL000482,Paul Bettencourt,Paul,,...,,True,tx,upper,7,Republican,http://www.legdir.legis.state.tx.us/FlashCardD...,2015-01-14 00:53:27.393000,2017-06-01 10:38:26.745000,
151,A1080,Birdwell,,STATESEN,,,TXL000364,Brian Birdwell,Brian,,...,,True,tx,upper,22,Republican,http://www.legdir.legis.state.tx.us/FlashCardD...,2010-07-17 08:38:47.948000,2017-06-01 10:38:26.747000,
152,A1125,Buckingham,,STATESEN,,,TXL000705,Dawn Buckingham,Dawn,,...,,True,tx,upper,24,Republican,,2017-01-18 03:03:18.301000,2017-06-01 10:38:26.639000,
153,A1090,Burton,,STATESEN,,,TXL000487,Konni Burton,Konni,,...,,True,tx,upper,10,Republican,http://www.legdir.legis.state.tx.us/FlashCardD...,2015-01-14 00:53:27.536000,2017-06-01 10:38:26.492000,
154,A1010,Campbell,,STATESEN,,,TXL000419,Donna Campbell,Donna,,...,,True,tx,upper,25,Republican,http://www.legdir.legis.state.tx.us/FlashCardD...,2013-01-09 00:29:35.053000,2017-06-01 10:38:26.288000,


In [16]:
# First five TEC senate filers.
txRep.head(5)

Unnamed: 0,recordType,filerIdent,filerTypeCd,filerName,unexpendContribFilerFlag,modifiedElectCycleFlag,filerJdiCd,committeeStatusCd,ctaSeekOfficeCd,ctaSeekOfficeDistrict,...,chairMailingAddr2,chairMailingCity,chairMailingStateCd,chairMailingCountyCd,chairMailingCountryCd,chairMailingPostalCode,chairMailingRegion,chairPrimaryUsaPhoneFlag,chairPrimaryPhoneNumber,chairPrimaryPhoneExt
40,FILER,13805,COH,"Hinojosa, Juan (The Honorable)",N,N,,,STATESEN,20,...,,,,,,,,,,
1772,FILER,19581,COH,"Whitmire, John (The Honorable)",N,N,,,STATESEN,15,...,,,,,,,,,,
2306,FILER,20257,COH,"Lucio Jr., Eduardo A. (The Honorable)",N,N,,,STATESEN,27,...,,,,,,,,,,
2691,FILER,20673,COH,"Nelson, Jane (The Honorable)",N,N,,,STATESEN,12,...,,,,,,,,,,
2974,FILER,20971,COH,"Zaffirini, Judith (The Honorable)",N,N,,,STATESEN,21,...,,,,,,,,,,


In [17]:
# Give the TEC held-office district the same column name as the Open States
# district so the two tables can be merged on it.
txRep = txRep.rename(columns=dict(filerHoldOfficeDistrict="district"))

# txRep["district"].describe()



In [18]:
# Left join on district: every senator row is kept, with the TEC filer(s)
# recorded for the same district attached.
allThree = senate.merge(txRep, how='left', on='district')

In [19]:
# Name fields from the three sources side by side: *_x come from the TLO scrape,
# full_name and *_y from Open States, filerName from TEC.
name_columns = [
    "first_name_x",
    "last_name_x",
    "full_name",
    "first_name_y",
    "last_name_y",
    "filerName",
]
allThree[name_columns]

Unnamed: 0,first_name_x,last_name_x,full_name,first_name_y,last_name_y,filerName
0,,Bettencourt,Paul Bettencourt,Paul,Bettencourt,"Bettencourt, Paul (The Honorable)"
1,,Birdwell,Brian Birdwell,Brian,Birdwell,"Birdwell, Brian D. (The Honorable)"
2,,Buckingham,Dawn Buckingham,Dawn,Buckingham,"Buckingham, Dawn C. (The Honorable)"
3,,Burton,Konni Burton,Konni,Burton,"Burton, Konni L. (The Honorable)"
4,,Campbell,Donna Campbell,Donna,Campbell,"Campbell M.D., Donna (The Honorable)"
5,,Creighton,Brandon Creighton,Brandon,Creighton,"Creighton, C. Brandon (The Honorable)"
6,,Estes,Craig Estes,Craig,Estes,"Estes, Craig (The Honorable)"
7,,Garcia,Sylvia R. Garcia,Sylvia R.,Garcia,"Garcia, Sylvia R. (The Honorable)"
8,,Hall,Bob Hall,Bob,Hall,"Hall III, Robert L. (The Honorable)"
9,,Hancock,Kelly Hancock,Kelly,Hancock,"Hancock, Kelly G. (The Honorable)"


In [20]:
def s(string):
    """Return ``string`` with leading and trailing whitespace removed."""
    trimmed = string.strip()
    return trimmed

# Trim stray whitespace around the Open States full name. The vectorised .str
# accessor leaves missing values (possible after the left merges) as NaN,
# whereas calling .strip() on each element would raise on them.
allThree["full_name"] = allThree["full_name"].str.strip()

In [21]:
# Drop senators for whom no TEC filer was found in the district merge.
answer = allThree.loc[allThree['filerName'].notnull()]

In [22]:
# Peek at one matched row to see the full set of merged columns.
answer.head(1)

Unnamed: 0,TLO_id,last_name_x,first_name_x,ctaSeekOfficeCd_x,filerNameShort_x,suffixes_x,leg_id,full_name,first_name_y,middle_name,...,chairMailingAddr2,chairMailingCity,chairMailingStateCd,chairMailingCountyCd,chairMailingCountryCd,chairMailingPostalCode,chairMailingRegion,chairPrimaryUsaPhoneFlag,chairPrimaryPhoneNumber,chairPrimaryPhoneExt
0,A1055,Bettencourt,,STATESEN,,,TXL000482,Paul Bettencourt,Paul,,...,,,,,,,,,,


In [23]:
# Columns kept for the crosswalk: the three source ids plus name, party and district.
crosswalk_columns = [
    "TLO_id",
    "leg_id",
    "filerIdent",
    "full_name",
    "first_name_y",
    "filerNameShort",
    "last_name_x",
    "suffixes_x",
    "party",
    "district",
]
answerShort = answer[crosswalk_columns]

In [24]:
# Drop the merge suffixes and give the short name a plainer label.
crosswalk_names = {
    "first_name_y": "first_name",
    "last_name_x": "last_name",
    "filerNameShort": "nickname",
    "suffixes_x": "suffixes",
}
answerShort = answerShort.rename(columns=crosswalk_names)

In [25]:
# Store the TEC filer id as an integer.
answerShort = answerShort.astype({"filerIdent": int})

In [26]:
# Index the crosswalk by TLO id so it becomes the first column of the CSV.
answerShort = answerShort.set_index(keys="TLO_id")

In [27]:
# Write the senate crosswalk (TLO id -> Open States leg_id -> TEC filerIdent).
answerShort.to_csv(path_or_buf="../data/senateCrosswalk.csv")