Revising the Senate file to cover both groups of legislators.

This notebook links together three datasets: Texas Legislature Online (TLO), the Texas Ethics Commission (TEC),
and OpenStates.

Loading TLO first.

legislators_tlo.html was copied from the source of
http://www.legis.state.tx.us/Search/BillSearchLegislatorList.aspx?ID=usrLegislatorsFolder$cboAuthor&Leg=85

In [1]:
import pandas as pd
import re
import numpy as np

# Each legislator is one <option> line in the copied page source. Capture groups:
#   1 = TLO id, 2 = last name, 4 = first name (None when only a last name is listed),
#   5 = chamber letter (H or S).
# Raw string so the regex escapes (\t, \(, \)) reach the re module unchanged.
pattern = r"""\t<option value="(A[0-9]{3,5})">([^,]+)(, (.+))? \((H|S)-A[0-9]{3,5}\)</option>"""

# Read inside a context manager so the file handle is always closed.
with open('../data/inputs/legislators_tlo.html', encoding = 'utf-8') as fp:
    lines = fp.readlines()

leg = []

for line in lines:
    # Regex applied to each line
    match = re.search(pattern, line)
    if match is None:
        # Blank lines carry no data; anything else means the page format changed,
        # so fail loudly with the offending line instead of an AttributeError.
        if line.strip():
            raise ValueError("Unrecognized line in legislators_tlo.html: %r" % line)
        continue
    legislator = [match.group(1), match.group(2), match.group(4), match.group(5)]
    # Translate the chamber letter into the office code stored in filerHoldOfficeCd
    if legislator[3] == "H":
        legislator[3] = "STATEREP"
    else:
        legislator[3] = "STATESEN"
    leg.append(legislator)
        

In [2]:
# Cleaning text in TLO dataset

def substitute(string):
    """Decode HTML character references in a name and remove tab characters.

    Handles every named or numeric reference (e.g. ``&#233;`` -> ``é``,
    ``&quot;`` -> ``"``), not just a fixed list, so names with other accented
    letters are cleaned as well.

    Parameters
    ----------
    string : object
        The value to clean. Anything that is not a ``str`` (None, NaN, ...)
        is returned unchanged.

    Returns
    -------
    object
        The cleaned string, or the original value if it was not a string.
    """
    # Local import keeps this cell self-contained; html is part of the standard library.
    import html

    if isinstance(string, str):
        string = html.unescape(string).replace("\t", "")
    return string

# Clean the last-name (index 1) and first-name (index 2) fields of every TLO record.
for entry in leg:
    entry[1:3] = [substitute(entry[1]), substitute(entry[2])]
    # A bare "Lucio" is relabelled "Lucio Jr."
    # NOTE(review): presumably so the later suffix parsing yields a "Jr." suffix - confirm.
    if entry[1] == "Lucio":
        entry[1] = "Lucio Jr."


In [3]:
# Turning TLO dataset into a dataframe

leg = pd.DataFrame(leg, columns = ["TLO_id","last_name","first_name", "filerHoldOfficeCd"])

In [4]:
pd.options.display.max_rows = 200
pd.options.display.max_columns = 200
leg

Unnamed: 0,TLO_id,last_name,first_name,filerHoldOfficeCd
0,A2100,Allen,,STATEREP
1,A2125,Alonzo,,STATEREP
2,A2135,Alvarado,,STATEREP
3,A2150,Anchia,,STATEREP
4,A2155,Anderson,"Charles ""Doc""",STATEREP
5,A2215,Anderson,Rodney,STATEREP
6,A3555,Arévalo,,STATEREP
7,A2330,Ashby,,STATEREP
8,A3200,Bailes,,STATEREP
9,A2335,Bell,,STATEREP


In [5]:
# splitting off nicknames and suffixes in the data from TLO

def parsenickname(first):
    """Split a trailing nickname off a first-name string.

    A nickname is a single word wrapped in double quotes or parentheses at the
    very end of the string, e.g. ``Charles "Doc"`` or ``Juan (Chuy)``.

    Parameters
    ----------
    first : object
        The first-name value; may be None/NaN when no first name is known.

    Returns
    -------
    tuple
        ``(first name without the nickname, nickname)``. When there is no
        nickname, or ``first`` is not a string, returns ``(first, np.nan)``.
    """
    nicknamePattern = r'["(](\w+)[")]$'
    # Missing first names are passed through untouched
    if not isinstance(first, str):
        return first, np.nan
    match = re.search(nicknamePattern, first)
    if match is None:
        return first, np.nan
    # Keep only what precedes the quoted/parenthesized nickname
    newFirst = first.split(match.group(0))[0].strip()
    return newFirst, match.group(1)

"""   
def parsesuffix(name):
    suffixPattern = "(Jr\.|Sr\.|III)"
    if re.match(suffixPattern, name):
        match = re.search(suffixPattern, name)
        newName = name.strip(match.group(1))
        return newName.strip(), match.group(1)
    else:
        return name, np.nan
""" 

def parsesuffix(row):
    """Move a generational suffix (Jr., Sr., III) out of the name fields.

    The last name is checked first; the first name is only checked when the
    last name carries no suffix.

    Parameters
    ----------
    row : mapping
        Must provide ``"first_name"`` and ``"last_name"`` (a DataFrame row or
        a dict). Either value may be None/NaN.

    Returns
    -------
    tuple
        ``(first_name, last_name, suffix)``. ``suffix`` is ``np.nan`` when no
        suffix is found; a first name left empty after removing the suffix is
        returned as ``np.nan``.
    """
    suffixPattern = r"(Jr\.|Sr\.|III)"

    def _split(name):
        # Return (name without the suffix, suffix), or None when there is no suffix.
        if not isinstance(name, str):
            return None
        match = re.search(suffixPattern, name)
        if match is None:
            return None
        # Cut out exactly the matched text. str.strip(suffix) would treat the
        # suffix as a set of characters and also eat leading letters such as
        # the "J" in "Johnson Jr.".
        remainder = name[:match.start()] + name[match.end():]
        return remainder.strip(), match.group(1)

    first, last, suffix = row["first_name"], row["last_name"], np.nan
    found = _split(last)
    if found is not None:
        last, suffix = found
    else:
        found = _split(first)
        if found is not None:
            first, suffix = found
    # A first name that consisted only of the suffix is treated as missing
    if isinstance(first, str) and len(first) == 0:
        first = np.nan
    return first, last, suffix

# Step 1: peel the nickname off each first name; it is stored as filerNameShort.
nicknameSplit = leg["first_name"].map(parsenickname)
leg["first_name"], leg["filerNameShort"] = zip(*nicknameSplit)

# Step 2: pull a suffix out of either name field into its own "suffixes" column.
suffixSplit = leg.apply(parsesuffix, axis = 1)
leg["first_name"], leg["last_name"], leg["suffixes"] = zip(*suffixSplit)

leg

Unnamed: 0,TLO_id,last_name,first_name,filerHoldOfficeCd,filerNameShort,suffixes
0,A2100,Allen,,STATEREP,,
1,A2125,Alonzo,,STATEREP,,
2,A2135,Alvarado,,STATEREP,,
3,A2150,Anchia,,STATEREP,,
4,A2155,Anderson,Charles,STATEREP,Doc,
5,A2215,Anderson,Rodney,STATEREP,,
6,A3555,Arévalo,,STATEREP,,
7,A2330,Ashby,,STATEREP,,
8,A3200,Bailes,,STATEREP,,
9,A2335,Bell,,STATEREP,,


In [6]:
# This loads a file from an Open States bulk download
# Not sure that {"district": object} is the right choice, but I wanted integers with NaN allowed.

openstates = pd.read_csv(
    "../data/inputs/openstates/2017-06-02-tx-csv/tx_legislators.csv",
    index_col = "leg_id",
    dtype = {"district": object},
)

# parsenickname yields (first name, nickname) pairs; unzip them into two columns.
openNicknames = openstates["first_name"].map(parsenickname)
openstates["first_name"], openstates["filerNameShort"] = zip(*openNicknames)

In [7]:
# some of the openstates rows are easy to link because they include the TLO id as part of the URL 
# to the legislator's photo.

import re

def parseTLO(url):
    """Extract the TLO id from a legislator photo URL.

    Only URLs ending in ``small/<TLO id>.jpg`` (id = ``A`` followed by 3-5
    digits) carry the id.

    Parameters
    ----------
    url : object
        The photo URL; may be NaN/None when there is no photo.

    Returns
    -------
    str or float
        The TLO id, or ``np.nan`` when ``url`` is not a string or does not
        match the expected pattern.
    """
    urlPattern = r"small/(A[0-9]{3,5})\.jpg$"
    # Rows without a photo URL cannot be matched
    if not isinstance(url, str):
        return np.nan
    match = re.search(urlPattern, url)
    return match.group(1) if match else np.nan
    
openstates["TLO_id"] = (openstates["photo_url"].map(parseTLO))

In [8]:
# Finding the senators whose photo file names didn't include their TLO id.

# Combine both conditions into one mask. Chaining two [] filters applies a mask
# built on the full frame to an already-filtered frame, which makes pandas
# reindex the mask and emit a UserWarning.
openstates[openstates["TLO_id"].isnull() & (openstates["chamber"] == "upper")]

  app.launch_new_instance()


Unnamed: 0_level_0,full_name,first_name,middle_name,last_name,suffixes,nickname,active,state,chamber,district,party,photo_url,created_at,updated_at,filerNameShort,TLO_id
leg_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
TXL000198,"Eddie Lucio, Jr.",Eduardo A.,,Lucio,Jr.,,True,tx,upper,27,Democrat,http://www.house.state.tx.us/photos/members/36...,2010-06-19 03:51:42.083000,2017-06-01 10:38:26.283000,Eddie,
TXL000312,José Menéndez,José,,Menéndez,,,True,tx,upper,26,Democrat,http://www.legdir.legis.state.tx.us/FlashCardD...,2010-06-19 03:51:42.467000,2017-06-01 10:38:26.641000,,
TXL000705,Dawn Buckingham,Dawn,,Buckingham,,,True,tx,upper,24,Republican,,2017-01-18 03:03:18.301000,2017-06-01 10:38:26.639000,,
TXL000707,Borris L. Miles,Borris L.,,Miles,,,True,tx,upper,13,Democrat,,2017-01-18 03:03:18.403000,2017-06-01 10:38:26.414000,,
TXL000716,Bryan Hughes,Bryan,,Hughes,,,True,tx,upper,1,Republican,,2017-01-27 05:42:08.204000,2017-06-01 10:38:26.497000,,


In [9]:
# just setting the remaining 5 senators' TLO_id directly.

# Open States leg_id -> TLO id for the senators whose photo URL did not contain it.
tloID = {"TXL000198": "A1300", 
        "TXL000312": "A1110",
        "TXL000705": "A1125",
        "TXL000707": "A1115",
        "TXL000716": "A1135"}

# DataFrame.set_value was deprecated and later removed from pandas;
# .at is the supported way to set a single cell by row/column label.
for leg_id, tlo in tloID.items():
    openstates.at[leg_id, "TLO_id"] = tlo


In [10]:
openstates[openstates["chamber"] == "upper"]

Unnamed: 0_level_0,full_name,first_name,middle_name,last_name,suffixes,nickname,active,state,chamber,district,party,photo_url,created_at,updated_at,filerNameShort,TLO_id
leg_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
TXL000190,Craig Estes,Craig,,Estes,,,True,tx,upper,30,Republican,http://www.legdir.legis.state.tx.us/FlashCardD...,2010-06-19 03:51:42.063000,2017-06-01 10:38:26.268000,,A1180
TXL000195,Juan Hinojosa,Juan,,Hinojosa,,,True,tx,upper,20,Democrat,http://www.legdir.legis.state.tx.us/FlashCardD...,2010-06-19 03:51:42.076000,2017-06-01 10:38:26.808000,Chuy,A1250
TXL000196,Joan Huffman,Joan,,Huffman,,,True,tx,upper,17,Republican,http://www.legdir.legis.state.tx.us/FlashCardD...,2010-06-19 03:51:42.078000,2017-06-01 10:38:26.752000,,A1260
TXL000198,"Eddie Lucio, Jr.",Eduardo A.,,Lucio,Jr.,,True,tx,upper,27,Democrat,http://www.house.state.tx.us/photos/members/36...,2010-06-19 03:51:42.083000,2017-06-01 10:38:26.283000,Eddie,A1300
TXL000199,Jane Nelson,Jane,,Nelson,,,True,tx,upper,12,Republican,http://www.legdir.legis.state.tx.us/FlashCardD...,2010-06-19 03:51:42.086000,2017-06-01 10:38:26.740000,,A1450
TXL000200,Robert Nichols,Robert,,Nichols,,,True,tx,upper,3,Republican,http://www.legdir.legis.state.tx.us/FlashCardD...,2010-06-19 03:51:42.088000,2017-06-01 10:38:26.798000,,A1400
TXL000203,Kel Seliger,Kel,,Seliger,,,True,tx,upper,31,Republican,http://www.legdir.legis.state.tx.us/FlashCardD...,2010-06-19 03:51:42.095000,2017-06-01 10:38:26.762000,,A1460
TXL000206,Carlos Uresti,Carlos I.,,Uresti,,,True,tx,upper,19,Democrat,http://www.legdir.legis.state.tx.us/FlashCardD...,2010-06-19 03:51:42.103000,2017-06-01 10:38:26.767000,,A1605
TXL000208,Kirk Watson,Kirk,,Watson,,,True,tx,upper,14,Democrat,http://www.legdir.legis.state.tx.us/FlashCardD...,2010-06-19 03:51:42.107000,2017-06-01 10:38:26.769000,,A1610
TXL000210,Royce West,Royce,,West,,,True,tx,upper,23,Democrat,http://www.legdir.legis.state.tx.us/FlashCardD...,2010-06-19 03:51:42.112000,2017-06-01 10:38:26.772000,,A1625


In [11]:
# Trying to move Jr. to the suffix field when it occurs at the beginning of the first_name field.
# And not to overwrite suffix fields when there's no suffix in the name field.
# Probably need to use map command to take in the contents of both column and then write the contents of both.

# AFAIK, the only row affected by this is TXL000509

def parsefirstsuffix(row):
    """Move a suffix that leads the first-name field into the suffix field.

    Handles first names of the form ``"Jr., Rene"`` (suffix, comma, name).

    Parameters
    ----------
    row : mapping
        Must provide ``"first_name"`` and ``"suffixes"`` (a DataFrame row or
        a dict).

    Returns
    -------
    tuple
        ``(first_name, suffix)``. When the first name starts with
        ``Jr.,``/``Sr.,``/``III,`` the suffix is removed from the name (and
        surrounding whitespace trimmed) and returned as the suffix; otherwise
        both existing values are returned unchanged, so an existing suffix is
        never overwritten.
    """
    suffixPattern = r"^(Jr\.|Sr\.|III),"
    first = row['first_name']
    # Non-string first names (NaN/None) cannot carry a suffix
    if isinstance(first, str):
        match = re.match(suffixPattern, first)
        if match:
            # Strip so that "Jr., Rene" yields "Rene" rather than " Rene";
            # the leading space would break later joins on first_name.
            return first[match.end():].strip(), match.group(1)
    return first, row['suffixes']

openstates["first_name"], openstates["suffixes"] = zip(*openstates.apply(parsefirstsuffix, axis=1))

In [12]:
# This will overwrite any existing middle name if the first name contains a space

def parsefirstmiddle(row):
    """Split a multi-word first name into first and middle name.

    Parameters
    ----------
    row : mapping
        Must provide ``"first_name"``; ``"middle_name"`` is optional
        (a DataFrame row or a dict).

    Returns
    -------
    tuple
        ``(first_name, middle_name)``. When the first name contains more than
        one whitespace-separated word, the first word becomes the first name
        and the second word becomes the middle name, replacing any existing
        middle name; further words are dropped. Otherwise the first name and
        the existing middle name (``np.nan`` if the field is absent) are
        returned unchanged.
    """
    # The TLO frame has no middle_name column yet, so a missing key is expected
    try:
        middle = row['middle_name']
    except KeyError:
        middle = np.nan
    first = row['first_name']
    if isinstance(first, str):
        parts = first.split()
        if len(parts) > 1:
            return parts[0], parts[1]
    return first, middle

openstates["first_name"], openstates["middle_name"] = zip(*openstates.apply(parsefirstmiddle, axis=1))
leg["first_name"], leg["middle_name"] = zip(*leg.apply(parsefirstmiddle, axis=1))    

In [13]:
leg

Unnamed: 0,TLO_id,last_name,first_name,filerHoldOfficeCd,filerNameShort,suffixes,middle_name
0,A2100,Allen,,STATEREP,,,
1,A2125,Alonzo,,STATEREP,,,
2,A2135,Alvarado,,STATEREP,,,
3,A2150,Anchia,,STATEREP,,,
4,A2155,Anderson,Charles,STATEREP,Doc,,
5,A2215,Anderson,Rodney,STATEREP,,,
6,A3555,Arévalo,,STATEREP,,,
7,A2330,Ashby,,STATEREP,,,
8,A3200,Bailes,,STATEREP,,,
9,A2335,Bell,,STATEREP,,,


In [14]:
# affects TXL000414

openstates["last_name"] = openstates["last_name"].apply(substitute)
openstates["full_name"] = openstates["full_name"].apply(substitute)

In [15]:
openSenate = openstates[openstates["chamber"] == "upper"]
legSenate = leg[leg["filerHoldOfficeCd"] == "STATESEN"]

In [16]:
senate = pd.merge(legSenate, openSenate.reset_index(), how='left', on='TLO_id')

In [17]:
openHouse = openstates[openstates["chamber"] == "lower"]
legHouse = leg[leg["filerHoldOfficeCd"] == "STATEREP"]

In [18]:
legHouse = legHouse.fillna(np.nan)

In [19]:
legLastOnly = leg[leg["first_name"].isnull()]

In [20]:
openLastOnly = pd.merge(legLastOnly, openHouse.reset_index(), how='left', on=['last_name'])

In [21]:
indexed_df = openLastOnly.set_index('TLO_id_x')

In [22]:
indexed_df[:5]

Unnamed: 0_level_0,last_name,first_name_x,filerHoldOfficeCd,filerNameShort_x,suffixes_x,middle_name_x,leg_id,full_name,first_name_y,middle_name_y,suffixes_y,nickname,active,state,chamber,district,party,photo_url,created_at,updated_at,filerNameShort_y,TLO_id_y
TLO_id_x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
A2100,Allen,,STATEREP,,,,TXL000214,Alma Allen,Alma,A.,,,True,tx,lower,131,Democratic,http://www.house.state.tx.us/photos/members/21...,2010-06-19 03:51:42.133000,2017-06-01 10:38:26.476000,,
A2125,Alonzo,,STATEREP,,,,TXL000215,Roberto Alonzo,Roberto,R.,,,True,tx,lower,104,Democratic,http://www.house.state.tx.us/photos/members/21...,2010-06-19 03:51:42.135000,2017-06-01 10:38:26.825000,,
A2135,Alvarado,,STATEREP,,,,TXL000216,Carol Alvarado,Carol,,,,True,tx,lower,145,Democratic,http://www.house.state.tx.us/photos/members/21...,2010-06-19 03:51:42.137000,2017-06-01 10:38:25.930000,,
A2150,Anchia,,STATEREP,,,,TXL000217,Rafael Anchia,Rafael,,,,True,tx,lower,103,Democratic,http://www.house.state.tx.us/photos/members/21...,2010-06-19 03:51:42.140000,2017-06-01 10:38:26.780000,,
A3555,Arévalo,,STATEREP,,,,TXL000683,"Arévalo, Diana",Diana,,,,True,tx,lower,116,Democratic,http://www.house.state.tx.us/photos/members/35...,2017-01-07 05:53:03.970000,2017-06-01 10:38:26.361000,,


In [23]:
legHouse = legHouse.set_index("TLO_id")

In [24]:
# Fill first names that are missing in the TLO data with the Open States first name.
# fillna aligns the two Series on their index, so this relies on both frames
# being indexed by TLO id at this point.
legHouse["first_name"] = legHouse["first_name"].fillna(indexed_df["first_name_y"])

In [25]:
house = pd.merge(legHouse.reset_index(), openHouse.reset_index(), how='left', on=['first_name', 'last_name'])

In [26]:
senate[:1]

Unnamed: 0,TLO_id,last_name_x,first_name_x,filerHoldOfficeCd,filerNameShort_x,suffixes_x,middle_name_x,leg_id,full_name,first_name_y,middle_name_y,last_name_y,suffixes_y,nickname,active,state,chamber,district,party,photo_url,created_at,updated_at,filerNameShort_y
0,A1055,Bettencourt,,STATESEN,,,,TXL000482,Paul Bettencourt,Paul,,Bettencourt,,,True,tx,upper,7,Republican,http://www.legdir.legis.state.tx.us/FlashCardD...,2015-01-14 00:53:27.393000,2017-06-01 10:38:26.745000,


In [27]:
senate = senate.rename(index=str, columns={"last_name_x": "last_name", "first_name_y": "first_name"})
senate = senate.drop(['first_name_x', 'last_name_y'], axis=1)

In [28]:
house[:1]

Unnamed: 0,TLO_id_x,last_name,first_name,filerHoldOfficeCd,filerNameShort_x,suffixes_x,middle_name_x,leg_id,full_name,middle_name_y,suffixes_y,nickname,active,state,chamber,district,party,photo_url,created_at,updated_at,filerNameShort_y,TLO_id_y
0,A2100,Allen,Alma,STATEREP,,,,TXL000214,Alma Allen,A.,,,True,tx,lower,131,Democratic,http://www.house.state.tx.us/photos/members/21...,2010-06-19 03:51:42.133000,2017-06-01 10:38:26.476000,,


In [29]:
house = house.rename(index=str, columns={"TLO_id_x": "TLO_id"})
house = house.drop(['TLO_id_y'], axis=1)

In [30]:
# openTLOouter = pd.merge(legHouse.reset_index(), openHouse.reset_index(), how='outer', on=['first_name', 'last_name'])

In [31]:
senate

Unnamed: 0,TLO_id,last_name,filerHoldOfficeCd,filerNameShort_x,suffixes_x,middle_name_x,leg_id,full_name,first_name,middle_name_y,suffixes_y,nickname,active,state,chamber,district,party,photo_url,created_at,updated_at,filerNameShort_y
0,A1055,Bettencourt,STATESEN,,,,TXL000482,Paul Bettencourt,Paul,,,,True,tx,upper,7.0,Republican,http://www.legdir.legis.state.tx.us/FlashCardD...,2015-01-14 00:53:27.393000,2017-06-01 10:38:26.745000,
1,A1080,Birdwell,STATESEN,,,,TXL000364,Brian Birdwell,Brian,,,,True,tx,upper,22.0,Republican,http://www.legdir.legis.state.tx.us/FlashCardD...,2010-07-17 08:38:47.948000,2017-06-01 10:38:26.747000,
2,A1125,Buckingham,STATESEN,,,,TXL000705,Dawn Buckingham,Dawn,,,,True,tx,upper,24.0,Republican,,2017-01-18 03:03:18.301000,2017-06-01 10:38:26.639000,
3,A1090,Burton,STATESEN,,,,TXL000487,Konni Burton,Konni,,,,True,tx,upper,10.0,Republican,http://www.legdir.legis.state.tx.us/FlashCardD...,2015-01-14 00:53:27.536000,2017-06-01 10:38:26.492000,
4,A1010,Campbell,STATESEN,,,,TXL000419,Donna Campbell,Donna,,,,True,tx,upper,25.0,Republican,http://www.legdir.legis.state.tx.us/FlashCardD...,2013-01-09 00:29:35.053000,2017-06-01 10:38:26.288000,
5,A1040,Creighton,STATESEN,,,,TXL000240,Brandon Creighton,Brandon,,,,True,tx,upper,4.0,Republican,http://www.legdir.legis.state.tx.us/FlashCardD...,2010-06-19 03:51:42.210000,2017-06-01 10:38:26.803000,
6,A1180,Estes,STATESEN,,,,TXL000190,Craig Estes,Craig,,,,True,tx,upper,30.0,Republican,http://www.legdir.legis.state.tx.us/FlashCardD...,2010-06-19 03:51:42.063000,2017-06-01 10:38:26.268000,
7,A1035,Garcia,STATESEN,,,,TXL000473,Sylvia R. Garcia,Sylvia,R.,,,True,tx,upper,6.0,Democrat,http://www.legdir.legis.state.tx.us/FlashCardD...,2013-03-12 18:18:45.435000,2017-06-01 10:35:57.613000,
8,A1050,Hall,STATESEN,,,,TXL000503,Bob Hall,Bob,,,,True,tx,upper,2.0,Republican,http://www.legdir.legis.state.tx.us/FlashCardD...,2015-01-14 00:53:27.965000,2017-06-01 10:38:26.801000,
9,A1015,Hancock,STATESEN,,,,TXL000271,Kelly Hancock,Kelly,,,,True,tx,upper,9.0,Republican,http://www.legdir.legis.state.tx.us/FlashCardD...,2010-06-19 03:51:42.331000,2017-06-01 10:38:26.805000,


In [32]:
# senate = tloOpen[tloOpen["ctaSeekOfficeCd"] == "STATESEN"]

In [33]:
# This loads a file from the Texas Ethics Commission (TEC)

tx = pd.read_csv("../data/inputs/TEC_CF_CSV/filers.csv", dtype = {"filerIdent": int}, 
                 parse_dates = ['filerEffStartDt','filerEffStopDt', 'treasEffStartDt', 'treasEffStopDt'])

  interactivity=interactivity, compiler=compiler, result=result)


In [34]:
tx = tx[tx["filerTypeCd"] == 'COH']
tx = tx.rename(columns={"filerHoldOfficeDistrict": "district"})

In [35]:
# Select filers by the office they hold (filerHoldOfficeCd) rather than the
# office sought (ctaSeekOfficeCd), one frame per chamber, and drop filers whose
# status is NOT_OFFICEHOLDER.
# Both conditions are combined into a single mask: chaining two [] filters
# applies a mask built on the full frame to an already-filtered frame, which
# makes pandas reindex the mask and emit a UserWarning.
txSenate = tx[tx['filerHoldOfficeCd'].isin(['STATESEN']) & (tx['filerFilerpersStatusCd'] != "NOT_OFFICEHOLDER")]
txHouse = tx[tx['filerHoldOfficeCd'].isin(['STATEREP']) & (tx['filerFilerpersStatusCd'] != "NOT_OFFICEHOLDER")]



In [36]:
txSenate.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31 entries, 40 to 13012
Columns: 132 entries, recordType to chairPrimaryPhoneExt
dtypes: datetime64[ns](4), float64(24), int64(1), object(103)
memory usage: 32.2+ KB


In [37]:
txHouse.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 150 entries, 1757 to 14069
Columns: 132 entries, recordType to chairPrimaryPhoneExt
dtypes: datetime64[ns](4), float64(24), int64(1), object(103)
memory usage: 155.9+ KB


In [38]:
# Attach the TEC filer record to each legislator by matching on district.
# NOTE(review): this assumes one remaining TEC filer per district and that both
# "district" columns hold comparable values (same dtype) - confirm before
# trusting the match; a left join keeps legislators with no TEC match.
senateAll = pd.merge(senate, txSenate, how='left', on = 'district')
houseAll = pd.merge(house, txHouse, how='left', on = 'district')

In [39]:
houseAll[:1]

Unnamed: 0,TLO_id,last_name,first_name,filerHoldOfficeCd_x,filerNameShort_x,suffixes_x,middle_name_x,leg_id,full_name,middle_name_y,suffixes_y,nickname,active,state,chamber,district,party,photo_url,created_at,updated_at,filerNameShort_y,recordType,filerIdent,filerTypeCd,filerName,unexpendContribFilerFlag,modifiedElectCycleFlag,filerJdiCd,committeeStatusCd,ctaSeekOfficeCd,ctaSeekOfficeDistrict,ctaSeekOfficePlace,ctaSeekOfficeDescr,ctaSeekOfficeCountyCd,ctaSeekOfficeCountyDescr,filerPersentTypeCd,filerNameOrganization,filerNameLast,filerNameSuffixCd,filerNameFirst,filerNamePrefixCd,filerNameShort,filerStreetAddr1,filerStreetAddr2,filerStreetCity,filerStreetStateCd,filerStreetCountyCd,filerStreetCountryCd,filerStreetPostalCode,filerStreetRegion,filerMailingAddr1,filerMailingAddr2,filerMailingCity,filerMailingStateCd,filerMailingCountyCd,filerMailingCountryCd,filerMailingPostalCode,filerMailingRegion,filerPrimaryUsaPhoneFlag,filerPrimaryPhoneNumber,filerPrimaryPhoneExt,filerHoldOfficeCd_y,filerHoldOfficePlace,filerHoldOfficeDescr,filerHoldOfficeCountyCd,filerHoldOfficeCountyDescr,filerFilerpersStatusCd,filerEffStartDt,filerEffStopDt,contestSeekOfficeCd,contestSeekOfficeDistrict,contestSeekOfficePlace,contestSeekOfficeDescr,contestSeekOfficeCountyCd,contestSeekOfficeCountyDescr,treasPersentTypeCd,treasNameOrganization,treasNameLast,treasNameSuffixCd,treasNameFirst,treasNamePrefixCd,treasNameShort,treasStreetAddr1,treasStreetAddr2,treasStreetCity,treasStreetStateCd,treasStreetCountyCd,treasStreetCountryCd,treasStreetPostalCode,treasStreetRegion,treasMailingAddr1,treasMailingAddr2,treasMailingCity,treasMailingStateCd,treasMailingCountyCd,treasMailingCountryCd,treasMailingPostalCode,treasMailingRegion,treasPrimaryUsaPhoneFlag,treasPrimaryPhoneNumber,treasPrimaryPhoneExt,treasAppointorNameLast,treasAppointorNameFirst,treasFilerpersStatusCd,treasEffStartDt,treasEffStopDt,assttreasPersentTypeCd,assttreasNameOrganization,assttreasNameLast,assttreasNameSuffixCd,assttreasNameFirs
t,assttreasNamePrefixCd,assttreasNameShort,assttreasStreetAddr1,assttreasStreetAddr2,assttreasStreetCity,assttreasStreetStateCd,assttreasStreetCountyCd,assttreasStreetCountryCd,assttreasStreetPostalCode,assttreasStreetRegion,assttreasPrimaryUsaPhoneFlag,assttreasPrimaryPhoneNumber,assttreasPrimaryPhoneExt,assttreasAppointorNameLast,assttreasAppointorNameFirst,chairPersentTypeCd,chairNameOrganization,chairNameLast,chairNameSuffixCd,chairNameFirst,chairNamePrefixCd,chairNameShort,chairStreetAddr1,chairStreetAddr2,chairStreetCity,chairStreetStateCd,chairStreetCountyCd,chairStreetCountryCd,chairStreetPostalCode,chairStreetRegion,chairMailingAddr1,chairMailingAddr2,chairMailingCity,chairMailingStateCd,chairMailingCountyCd,chairMailingCountryCd,chairMailingPostalCode,chairMailingRegion,chairPrimaryUsaPhoneFlag,chairPrimaryPhoneNumber,chairPrimaryPhoneExt
0,A2100,Allen,Alma,STATEREP,,,,TXL000214,Alma Allen,A.,,,True,tx,lower,131,Democratic,http://www.house.state.tx.us/photos/members/21...,2010-06-19 03:51:42.133000,2017-06-01 10:38:26.476000,,FILER,19673,COH,"Allen, Alma A. (The Honorable)",N,N,,,STATEREP,131,,,,,INDIVIDUAL,,Allen,,Alma A.,HONORABLE,,"3401 Louisiana St., Ste. 250",,Houston,TX,,USA,77002-9546,,10709 Marsha Ln.,,Houston,TX,,USA,77024,,Y,2816423426,,STATEREP,,,,,CURRENT_OFFICEHOLDER,2005-01-01,NaT,STATEREP,131,,,,,INDIVIDUAL,,Jackson,,Alfred,MR,,10709 Marsha Ln.,,Houston,TX,,USA,77024,,"7670 Woodway, Ste. 110",,Houston,TX,,USA,77063,,Y,7139080000.0,,,,CURRENT_OFFICEHOLDER,2005-01-01,NaT,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [40]:
houseAll[["last_name", "full_name", "filerName", "filerIdent"]][:5]

Unnamed: 0,last_name,full_name,filerName,filerIdent
0,Allen,Alma Allen,"Allen, Alma A. (The Honorable)",19673
1,Alonzo,Roberto Alonzo,"Alonzo, Roberto R. (The Honorable)",19681
2,Alvarado,Carol Alvarado,"Alvarado, Carol (The Honorable)",24376
3,Anchia,Rafael Anchia,"Anchia, Rafael M. (The Honorable)",54808
4,Anderson,Charles Anderson,"Anderson, Charles (The Honorable)",51449


In [41]:
senateAll[["last_name", "full_name", "filerName", "filerIdent"]][:5]

Unnamed: 0,last_name,full_name,filerName,filerIdent
0,Bettencourt,Paul Bettencourt,"Bettencourt, Paul (The Honorable)",69337.0
1,Birdwell,Brian Birdwell,"Birdwell, Brian D. (The Honorable)",62137.0
2,Buckingham,Dawn Buckingham,"Buckingham, Dawn C. (The Honorable)",69001.0
3,Burton,Konni Burton,"Burton, Konni L. (The Honorable)",69199.0
4,Campbell,Donna Campbell,"Campbell M.D., Donna (The Honorable)",67809.0


In [42]:
senateAll["filerIdent"] = senateAll["filerIdent"].astype(object)

In [47]:
senateAll[:1]

Unnamed: 0_level_0,last_name,filerHoldOfficeCd_x,filerNameShort_x,suffixes_x,middle_name_x,leg_id,full_name,first_name,middle_name_y,suffixes_y,nickname,active,state,chamber,district,party,photo_url,created_at,updated_at,filerNameShort_y,recordType,filerIdent,filerTypeCd,filerName,unexpendContribFilerFlag,modifiedElectCycleFlag,filerJdiCd,committeeStatusCd,ctaSeekOfficeCd,ctaSeekOfficeDistrict,ctaSeekOfficePlace,ctaSeekOfficeDescr,ctaSeekOfficeCountyCd,ctaSeekOfficeCountyDescr,filerPersentTypeCd,filerNameOrganization,filerNameLast,filerNameSuffixCd,filerNameFirst,filerNamePrefixCd,filerNameShort,filerStreetAddr1,filerStreetAddr2,filerStreetCity,filerStreetStateCd,filerStreetCountyCd,filerStreetCountryCd,filerStreetPostalCode,filerStreetRegion,filerMailingAddr1,filerMailingAddr2,filerMailingCity,filerMailingStateCd,filerMailingCountyCd,filerMailingCountryCd,filerMailingPostalCode,filerMailingRegion,filerPrimaryUsaPhoneFlag,filerPrimaryPhoneNumber,filerPrimaryPhoneExt,filerHoldOfficeCd_y,filerHoldOfficePlace,filerHoldOfficeDescr,filerHoldOfficeCountyCd,filerHoldOfficeCountyDescr,filerFilerpersStatusCd,filerEffStartDt,filerEffStopDt,contestSeekOfficeCd,contestSeekOfficeDistrict,contestSeekOfficePlace,contestSeekOfficeDescr,contestSeekOfficeCountyCd,contestSeekOfficeCountyDescr,treasPersentTypeCd,treasNameOrganization,treasNameLast,treasNameSuffixCd,treasNameFirst,treasNamePrefixCd,treasNameShort,treasStreetAddr1,treasStreetAddr2,treasStreetCity,treasStreetStateCd,treasStreetCountyCd,treasStreetCountryCd,treasStreetPostalCode,treasStreetRegion,treasMailingAddr1,treasMailingAddr2,treasMailingCity,treasMailingStateCd,treasMailingCountyCd,treasMailingCountryCd,treasMailingPostalCode,treasMailingRegion,treasPrimaryUsaPhoneFlag,treasPrimaryPhoneNumber,treasPrimaryPhoneExt,treasAppointorNameLast,treasAppointorNameFirst,treasFilerpersStatusCd,treasEffStartDt,treasEffStopDt,assttreasPersentTypeCd,assttreasNameOrganization,assttreasNameLast,assttreasNameSuffixCd,assttreasNameFir
st,assttreasNamePrefixCd,assttreasNameShort,assttreasStreetAddr1,assttreasStreetAddr2,assttreasStreetCity,assttreasStreetStateCd,assttreasStreetCountyCd,assttreasStreetCountryCd,assttreasStreetPostalCode,assttreasStreetRegion,assttreasPrimaryUsaPhoneFlag,assttreasPrimaryPhoneNumber,assttreasPrimaryPhoneExt,assttreasAppointorNameLast,assttreasAppointorNameFirst,chairPersentTypeCd,chairNameOrganization,chairNameLast,chairNameSuffixCd,chairNameFirst,chairNamePrefixCd,chairNameShort,chairStreetAddr1,chairStreetAddr2,chairStreetCity,chairStreetStateCd,chairStreetCountyCd,chairStreetCountryCd,chairStreetPostalCode,chairStreetRegion,chairMailingAddr1,chairMailingAddr2,chairMailingCity,chairMailingStateCd,chairMailingCountyCd,chairMailingCountryCd,chairMailingPostalCode,chairMailingRegion,chairPrimaryUsaPhoneFlag,chairPrimaryPhoneNumber,chairPrimaryPhoneExt
TLO_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 
100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1
A1055,Bettencourt,STATESEN,,,,TXL000482,Paul Bettencourt,Paul,,,,True,tx,upper,7,Republican,http://www.legdir.legis.state.tx.us/FlashCardD...,2015-01-14 00:53:27.393000,2017-06-01 10:38:26.745000,,FILER,69337,COH,"Bettencourt, Paul (The Honorable)",N,N,,,STATESEN,7,,,,,INDIVIDUAL,,Bettencourt,,Paul,HONORABLE,,"1 E. Greenway Plz., Ste. 225",,Houston,TX,,USA,77046,,"1 E. Greenway Plz., Ste. 225",,Houston,TX,,USA,77046,,Y,7135260000.0,,STATESEN,,,,,CURRENT_OFFICEHOLDER,2015-01-01,NaT,,,,,,,INDIVIDUAL,,Boylan,,Michael,MR,,"1 E. Greenway Plz., Ste. 225",,Houston,TX,,USA,77046,,"1 E. Greenway Plz., Ste. 225",,Houston,TX,,USA,77046,,Y,7135260000.0,,,,CURRENT_OFFICEHOLDER,2015-01-01,NaT,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [44]:
senateAll = senateAll.set_index('TLO_id')
houseAll = houseAll.set_index('TLO_id')

In [45]:
# These have a lot of duplicative columns that will only be useful for error checking.

senateAll.to_csv("../data/senateCrosswalk.csv")
houseAll.to_csv("../data/houseCrosswalk.csv")

In [48]:
# These extra redundant files have just the ID numbers and nothing else.

senateAll[["leg_id", "filerIdent"]].to_csv("../data/senateIDs.csv")
houseAll[["leg_id", "filerIdent"]].to_csv("../data/houseIDs.csv")