In [1]:
import pandas as pd
from datetime import datetime
import re

In [107]:
house = pd.read_csv("./loren_data/house_2022-07-15.csv")
house.head()

Unnamed: 0,disclosure_year,disclosure_date,transaction_date,owner,ticker,asset_description,type,amount,representative,district,ptr_link,cap_gains_over_200_usd
0,2021,10/04/2021,2021-09-27,joint,BP,BP plc,purchase,"$1,001 - $15,000",Hon. Virginia Foxx,NC05,https://disclosures-clerk.house.gov/public_dis...,False
1,2021,10/04/2021,2021-09-13,joint,XOM,Exxon Mobil Corporation,purchase,"$1,001 - $15,000",Hon. Virginia Foxx,NC05,https://disclosures-clerk.house.gov/public_dis...,False
2,2021,10/04/2021,2021-09-10,joint,ILPT,Industrial Logistics Properties Trust - Common...,purchase,"$15,001 - $50,000",Hon. Virginia Foxx,NC05,https://disclosures-clerk.house.gov/public_dis...,False
3,2021,10/04/2021,2021-09-28,joint,PM,Phillip Morris International Inc,purchase,"$15,001 - $50,000",Hon. Virginia Foxx,NC05,https://disclosures-clerk.house.gov/public_dis...,False
4,2021,10/04/2021,2021-09-17,self,BLK,BlackRock Inc,sale_partial,"$1,001 - $15,000",Hon. Alan S. Lowenthal,CA47,https://disclosures-clerk.house.gov/public_dis...,False


In [108]:
house.dtypes

disclosure_year            int64
disclosure_date           object
transaction_date          object
owner                     object
ticker                    object
asset_description         object
type                      object
amount                    object
representative            object
district                  object
ptr_link                  object
cap_gains_over_200_usd      bool
dtype: object

In [109]:
house.isnull().sum()

disclosure_year              0
disclosure_date              0
transaction_date             0
owner                     5614
ticker                       0
asset_description            4
type                         0
amount                       0
representative               0
district                     0
ptr_link                     0
cap_gains_over_200_usd       0
dtype: int64

In [110]:
house[house['owner'].isnull()].head()

Unnamed: 0,disclosure_year,disclosure_date,transaction_date,owner,ticker,asset_description,type,amount,representative,district,ptr_link,cap_gains_over_200_usd
6,2021,12/01/2021,2021-11-30,,KPLTW,Katapult Holdings Inc - Warrant,purchase,"$1,001 - $15,000",Hon. Austin Scott,GA08,https://disclosures-clerk.house.gov/public_dis...,False
7,2021,12/01/2021,2021-11-18,,AMD,Advanced Micro Devices Inc,sale_full,"$50,001 - $100,000",Hon. Thomas Suozzi,NY03,https://disclosures-clerk.house.gov/public_dis...,False
8,2021,12/01/2021,2021-11-18,,AAPL,Apple Inc,sale_full,"$50,001 - $100,000",Hon. Thomas Suozzi,NY03,https://disclosures-clerk.house.gov/public_dis...,False
9,2021,12/01/2021,2021-11-24,,MSFT,Microsoft Corporation,purchase,"$50,001 - $100,000",Hon. Thomas Suozzi,NY03,https://disclosures-clerk.house.gov/public_dis...,False
10,2021,12/01/2021,2021-11-24,,MS,Morgan Stanley,purchase,"$100,001 - $250,000",Hon. Thomas Suozzi,NY03,https://disclosures-clerk.house.gov/public_dis...,False


In [111]:
percent_null = house.owner.isnull().sum()/len(house)
percent_null

0.3779198922921575

In [112]:
house['owner'].value_counts(dropna = False)

NaN          5614
joint        4635
self         2897
--           1315
dependent     394
Name: owner, dtype: int64

We decided to drop the `'owner'` column because there were so many nulls and '--'. For our purposes, to see if congresspeople are taking advantage of insider trading, the distinction of who in the congressperson's family technically owns the equity is unimportant. It would be interesting to explore ownership, but there's just too many nulls.

In [113]:
house.drop(columns = ['owner'], inplace = True)

In [114]:
house['disclosure_date'] = pd.to_datetime(house['disclosure_date'], yearfirst=True)

We discovered some wonky years in the transaction dates. Fortunately, it was highly likely the disclosure year was the year of the transaction, so we substituted that for the year

This [stackoverflow answer](https://stackoverflow.com/a/56968849) helped to understand how to pull the first item from within the list.

In [115]:
house['transaction_date'] = house['transaction_date'].str.split('-')

weird_years = house[(house['transaction_date'].str[0] != '2017') &
      (house['transaction_date'].str[0] != '2018') &
      (house['transaction_date'].str[0] != '2019') &
      (house['transaction_date'].str[0] != '2020') &
      (house['transaction_date'].str[0] != '2021') &
      (house['transaction_date'].str[0] != '2022')]

for i in weird_years.index:
    house.loc[i, 'transaction_date'][0] = str(house.loc[i, 'disclosure_year'])

In [116]:
#verifying the years have been fixed
house[(house['transaction_date'].str[0] != '2017') &
      (house['transaction_date'].str[0] != '2018') &
      (house['transaction_date'].str[0] != '2019') &
      (house['transaction_date'].str[0] != '2020') &
      (house['transaction_date'].str[0] != '2021') &
      (house['transaction_date'].str[0] != '2022')]

Unnamed: 0,disclosure_year,disclosure_date,transaction_date,ticker,asset_description,type,amount,representative,district,ptr_link,cap_gains_over_200_usd


In [117]:
house['transaction_date'] = house['transaction_date'].str.join('-')
house['transaction_date'] = pd.to_datetime(house['transaction_date'])
house.dtypes

disclosure_year                    int64
disclosure_date           datetime64[ns]
transaction_date          datetime64[ns]
ticker                            object
asset_description                 object
type                              object
amount                            object
representative                    object
district                          object
ptr_link                          object
cap_gains_over_200_usd              bool
dtype: object

In [118]:
#verifying the range of dates is what we expect
house['transaction_date'].describe(datetime_is_numeric=True)

count                            14855
mean     2020-11-29 17:24:47.256815616
min                2017-09-05 00:00:00
25%                2020-04-22 00:00:00
50%                2020-11-19 00:00:00
75%                2021-06-08 00:00:00
max                2022-12-31 00:00:00
Name: transaction_date, dtype: object

In [119]:
house['amount'].value_counts()

$1,001 - $15,000            10447
$15,001 - $50,000            2362
$50,001 - $100,000            750
$100,001 - $250,000           572
$250,001 - $500,000           243
$1,001 -                      242
$500,001 - $1,000,000         150
$1,000,001 - $5,000,000        41
$1,000,000 +                   30
$5,000,001 - $25,000,000        9
$1,000 - $15,000                4
$15,000 - $50,000               3
$50,000,000 +                   1
$1,000,000 - $5,000,000         1
Name: amount, dtype: int64

Review of .replace() from [here](https://www.symbiosisacademy.org/tutorial-index/pandas-search-replace-values-columns/)

In [120]:
house = house.replace(to_replace={'amount':['$1,001 -', '$1,000 - $15,000']}, value= '$1,001 - $15,000')
house = house.replace(to_replace={'amount':['$1,000,000 +', '$1,000,000 - $5,000,000']}, value= '$1,000,001 - $5,000,000')
house = house.replace(to_replace={'amount':['$15,000 - $50,000']}, value= '$15,001 - $50,000')
house['amount'].value_counts()

$1,001 - $15,000            10693
$15,001 - $50,000            2365
$50,001 - $100,000            750
$100,001 - $250,000           572
$250,001 - $500,000           243
$500,001 - $1,000,000         150
$1,000,001 - $5,000,000        72
$5,000,001 - $25,000,000        9
$50,000,000 +                   1
Name: amount, dtype: int64

In [121]:
house['amount'] = house['amount'].str.split(' ') 

for i in range(0, len(house)):
    if len(house.loc[i, 'amount']) == 3:
        house.loc[i, 'amount'] = house.loc[i, 'amount'][2]
    else:
        house.loc[i, 'amount'] = house.loc[i, 'amount'][0]
        
house['amount'] = house['amount'].map(lambda x: x.replace('$', '').replace(',', ''))
house['amount'] = house['amount'].astype(int)
house['amount'].value_counts()

15000       10693
50000        2365
100000        750
250000        572
500000        243
1000000       150
5000000        72
25000000        9
50000000        1
Name: amount, dtype: int64

In [122]:
house.dtypes

disclosure_year                    int64
disclosure_date           datetime64[ns]
transaction_date          datetime64[ns]
ticker                            object
asset_description                 object
type                              object
amount                             int64
representative                    object
district                          object
ptr_link                          object
cap_gains_over_200_usd              bool
dtype: object

In [123]:
house['ticker'].value_counts()

--       1208
MSFT      244
AAPL      181
NTAP      130
TDDXX     122
         ... 
MSTR        1
MC          1
MPWR        1
HEI         1
FLCB        1
Name: ticker, Length: 2123, dtype: int64

In [124]:
house['ticker'].value_counts().values[0]/len(house)

0.08131942107034669

In [125]:
house[house['ticker'] == '--'].head()

Unnamed: 0,disclosure_year,disclosure_date,transaction_date,ticker,asset_description,type,amount,representative,district,ptr_link,cap_gains_over_200_usd
56,2020,2020-09-22,2020-08-17,--,Metallic Minerals Corp.,sale_partial,250000,Mr. TJ John (Tj) Cox,CA21,https://disclosures-clerk.house.gov/public_dis...,True
57,2021,2021-03-23,2021-01-27,--,Zimmer Biomet Holdings,sale_partial,15000,Hon. Alan S. Lowenthal,CA47,https://disclosures-clerk.house.gov/public_dis...,False
58,2021,2021-03-23,2021-02-08,--,Zimmer Biomet Holdings,sale_full,15000,Hon. Alan S. Lowenthal,CA47,https://disclosures-clerk.house.gov/public_dis...,False
59,2021,2021-03-23,2021-02-08,--,Zimmer Biomet Holdings,purchase,15000,Hon. Alan S. Lowenthal,CA47,https://disclosures-clerk.house.gov/public_dis...,False
61,2021,2021-03-23,2021-02-19,--,Celegene Corp,sale_full,50000,Hon. Alan S. Lowenthal,CA47,https://disclosures-clerk.house.gov/public_dis...,False


In [126]:
tickers = pd.read_excel('./loren_data/US-Stock-Symbols.xlsx')
tickers = tickers[['Symbol', 'Name']]
tickers.columns = tickers.columns.map(lambda x: x.lower())

tickers.head()

Unnamed: 0,symbol,name
0,AAPL,Apple Inc.
1,XOM,Exxon Mobil Corporation
2,MSFT,Microsoft Corporation
3,BAC^I,Bank of America Corporation
4,IBM,International Business Machines Corporation


In [127]:
for i in house.loc[house['ticker'] == '--']['asset_description']:
    symbol = tickers.loc[tickers['name'] == i, 'symbol']
    house.loc[house['asset_description'] == i, 'ticker'] = symbol

In [128]:
house.ticker.value_counts()

MSFT     244
AAPL     181
NTAP     130
TDDXX    122
FB       115
        ... 
MGPI       1
MSTR       1
MC         1
MPWR       1
FLCB       1
Name: ticker, Length: 2117, dtype: int64

In [129]:
house[house['ticker'] == '--']

Unnamed: 0,disclosure_year,disclosure_date,transaction_date,ticker,asset_description,type,amount,representative,district,ptr_link,cap_gains_over_200_usd


In [130]:
house.describe()

Unnamed: 0,disclosure_year,amount
count,14855.0,14855.0
mean,2020.634938,94456.08
std,0.703684,822743.8
min,2020.0,15000.0
25%,2020.0,15000.0
50%,2021.0,15000.0
75%,2021.0,50000.0
max,2022.0,50000000.0


We need our columns to agree with the Senate dataframe columns, so we made the following changes. While we erred on the side of keeping information for now, we dropped the 'ptr_link' because we don't anticipate doing any analysis on it. If we need it, we'll come back and reference it from the csv or from prior to this point.

This [stackoverflow answer](https://stackoverflow.com/a/54195568) helped us to find the differences between the lists of columns 

In [131]:
house_cols = list(house.columns)

senate_cols = ['transaction_date', 'owner', 'ticker', 'asset_description',
       'asset_type', 'type', 'amount', 'comment', 'senator', 'ptr_link',
       'disclosure_date', 'year'] 

house_different_cols = list(set(house_cols).difference(senate_cols))
print(f'In House, not in Senate: {house_different_cols}')

senate_different_cols = list(set(senate_cols).difference(house_cols))
print(f'In Senate, not in House: {senate_different_cols}')

In House, not in Senate: ['representative', 'district', 'disclosure_year', 'cap_gains_over_200_usd']
In Senate, not in House: ['senator', 'owner', 'asset_type', 'year', 'comment']


In [132]:
house['chamber'] = 'house'
house.drop(columns = ['ptr_link', 'cap_gains_over_200_usd', 'disclosure_year'], inplace = True)
house.rename(columns = {'representative': 'name', 'district': 'represents'}, inplace = True)

In [133]:
house_cols = list(house.columns)

senate_cols = ['transaction_date', 'owner', 'ticker', 'asset_description',
       'asset_type', 'type', 'amount', 'comment', 'senator', 'ptr_link',
       'disclosure_date', 'year'] 

house_different_cols = list(set(house_cols).difference(senate_cols))
print(f'In House, not in Senate: {house_different_cols}')

senate_different_cols = list(set(senate_cols).difference(house_cols))
print(f'In Senate, not in House: {senate_different_cols}')

In House, not in Senate: ['name', 'chamber', 'represents']
In Senate, not in House: ['senator', 'owner', 'ptr_link', 'asset_type', 'year', 'comment']


The following came from [this github](https://github.com/unitedstates/congress-legislators/).

In [134]:
#NOT IN .py FILE AS OF R2022-07-19 15:00

current_legislators = pd.read_csv('./loren_data/legislators-current.csv')
current_legislators.shape

(537, 34)

In [135]:
current_legislators.head()

Unnamed: 0,last_name,first_name,middle_name,suffix,nickname,full_name,birthday,gender,type,state,...,opensecrets_id,lis_id,fec_ids,cspan_id,govtrack_id,votesmart_id,ballotpedia_id,washington_post_id,icpsr_id,wikipedia_id
0,Brown,Sherrod,,,,Sherrod Brown,1952-11-09,M,sen,OH,...,N00003535,S307,"H2OH13033,S6OH00163",5051.0,400050,27018.0,Sherrod Brown,,29389.0,Sherrod Brown
1,Cantwell,Maria,,,,Maria Cantwell,1958-10-13,F,sen,WA,...,N00007836,S275,"S8WA00194,H2WA01054",26137.0,300018,27122.0,Maria Cantwell,,39310.0,Maria Cantwell
2,Cardin,Benjamin,L.,,,Benjamin L. Cardin,1943-10-05,M,sen,MD,...,N00001955,S308,"H6MD03177,S6MD03177",4004.0,400064,26888.0,Ben Cardin,,15408.0,Ben Cardin
3,Carper,Thomas,Richard,,,Thomas R. Carper,1947-01-23,M,sen,DE,...,N00012508,S277,S8DE00079,663.0,300019,22421.0,Tom Carper,,15015.0,Tom Carper
4,Casey,Robert,P.,Jr.,Bob,"Robert P. Casey, Jr.",1960-04-13,M,sen,PA,...,N00027503,S309,S6PA00217,47036.0,412246,2541.0,"Bob Casey, Jr.",,40703.0,Bob Casey Jr.


In [136]:
current_leg_cols = list(current_legislators.columns)
current_leg_cols

['last_name',
 'first_name',
 'middle_name',
 'suffix',
 'nickname',
 'full_name',
 'birthday',
 'gender',
 'type',
 'state',
 'district',
 'senate_class',
 'party',
 'url',
 'address',
 'phone',
 'contact_form',
 'rss_url',
 'twitter',
 'facebook',
 'youtube',
 'youtube_id',
 'bioguide_id',
 'thomas_id',
 'opensecrets_id',
 'lis_id',
 'fec_ids',
 'cspan_id',
 'govtrack_id',
 'votesmart_id',
 'ballotpedia_id',
 'washington_post_id',
 'icpsr_id',
 'wikipedia_id']

In [137]:
historical_legislators = pd.read_csv('./loren_data/legislators-historical.csv')
historical_legislators.shape

(12056, 34)

In [138]:
historical_leg_cols = list(historical_legislators.columns)
historical_leg_cols

['last_name',
 'first_name',
 'middle_name',
 'suffix',
 'nickname',
 'full_name',
 'birthday',
 'gender',
 'type',
 'state',
 'district',
 'senate_class',
 'party',
 'url',
 'address',
 'phone',
 'contact_form',
 'rss_url',
 'twitter',
 'facebook',
 'youtube',
 'youtube_id',
 'bioguide_id',
 'thomas_id',
 'opensecrets_id',
 'lis_id',
 'fec_ids',
 'cspan_id',
 'govtrack_id',
 'votesmart_id',
 'ballotpedia_id',
 'washington_post_id',
 'icpsr_id',
 'wikipedia_id']

In [139]:
current_diff_cols = list(set(current_leg_cols).difference(historical_leg_cols))
current_diff_cols

[]

The above determines the columns are the same. We shrank the historical ones to eliminate irrelevant entries. Making birthdays datetime. Referenced [this site](https://www.w3schools.com/python/python_datetime.asp).

In [188]:
historical_legislators['birthday'] = pd.to_datetime(historical_legislators['birthday'], format = '%Y-%m-%d')

In [189]:
current_legislators['birthday'] = pd.to_datetime(current_legislators['birthday'], format = '%Y-%m-%d')

In [190]:
historical_legislators = historical_legislators[historical_legislators['birthday'] > datetime(1922,1, 1)]

In [191]:
relevant_legislators = pd.concat([current_legislators, historical_legislators])

In [192]:
relevant_legislators = relevant_legislators[['last_name', 'first_name', 'middle_name', 'suffix',
                                             'full_name', 'birthday', 'gender', 'type', 'state', 'party', 'district']]

In [193]:
relevant_legislators.dtypes

last_name              object
first_name             object
middle_name            object
suffix                 object
full_name              object
birthday       datetime64[ns]
gender                 object
type                   object
state                  object
party                  object
district              float64
dtype: object

In [194]:
relevant_legislators.head()

Unnamed: 0,last_name,first_name,middle_name,suffix,full_name,birthday,gender,type,state,party,district
0,Brown,Sherrod,,,Sherrod Brown,1952-11-09,M,sen,OH,Democrat,
1,Cantwell,Maria,,,Maria Cantwell,1958-10-13,F,sen,WA,Democrat,
2,Cardin,Benjamin,L.,,Benjamin L. Cardin,1943-10-05,M,sen,MD,Democrat,
3,Carper,Thomas,Richard,,Thomas R. Carper,1947-01-23,M,sen,DE,Democrat,
4,Casey,Robert,P.,Jr.,"Robert P. Casey, Jr.",1960-04-13,M,sen,PA,Democrat,


In [146]:
house.head()

Unnamed: 0,disclosure_date,transaction_date,ticker,asset_description,type,amount,name,represents,chamber
0,2021-10-04,2021-09-27,BP,BP plc,purchase,15000,Hon. Virginia Foxx,NC05,house
1,2021-10-04,2021-09-13,XOM,Exxon Mobil Corporation,purchase,15000,Hon. Virginia Foxx,NC05,house
2,2021-10-04,2021-09-10,ILPT,Industrial Logistics Properties Trust - Common...,purchase,50000,Hon. Virginia Foxx,NC05,house
3,2021-10-04,2021-09-28,PM,Phillip Morris International Inc,purchase,50000,Hon. Virginia Foxx,NC05,house
4,2021-10-04,2021-09-17,BLK,BlackRock Inc,sale_partial,15000,Hon. Alan S. Lowenthal,CA47,house


In [147]:
relevant_legislators.tail()

Unnamed: 0,last_name,first_name,middle_name,suffix,full_name,birthday,gender,type,state,party,district
12051,Young,Don,E.,,Don Young,1933-06-09,M,rep,AK,Republican,0.0
12052,Fortenberry,Jeff,Lane,,Jeff Fortenberry,1960-12-27,M,rep,NE,Republican,1.0
12053,Vela,Filemon,,,Filemon Vela,1963-02-13,M,rep,TX,Democrat,34.0
12054,Reed,Tom,W.,,Tom Reed,1971-11-18,M,rep,NY,Republican,23.0
12055,Delgado,Antonio,,,Antonio Delgado,1977-01-19,M,rep,NY,Democrat,19.0


In [148]:
house['name'] = house['name'].map(lambda x: x.replace("Hon. ", ""))

In [149]:
house['first_name'] = house['name'].map(lambda x: x.split()[0])

In [150]:
house['last_name'] = house['name'].map(lambda x: x.split()[-1])

[This](https://stackoverflow.com/a/20772805) is one of several places I read about a bug in .loc()

for i in house.index[:5]:
    first_name = house.loc[i, 'first_name']
    last_name = house.loc[i, 'last_name']
    first_df = relevant_legislators.loc[relevant_legislators['first_name'] == first_name]
    rep_df = first_df.loc[first_df['last_name'] == last_name]
    party = rep_df['party'].values
    #test_df = pd.DataFrame()
    #test_df['party'] = party
    print(type(party))

for i in range(0, len(house)):
    first_name = house.loc[i, 'first_name']
    last_name = house.loc[i, 'last_name']
    first_df = relevant_legislators[relevant_legislators['first_name'] == first_name]
    rep_df = first_df[first_df['last_name'] == last_name]
    party = rep_df['party']
    house.loc[i]['party'] = party
    #test_df = pd.DataFrame()
    #test_df['party'] = party
   # print(house.head())

In [151]:
relevant_legislators[relevant_legislators['last_name'] == 'McEachin']

Unnamed: 0,last_name,first_name,middle_name,suffix,full_name,birthday,gender,type,state,party,district
360,McEachin,A.,Donald,,A. Donald McEachin,1961-10-10,M,rep,VA,Democrat,4.0


In [152]:
house.iloc[5]

disclosure_date           2021-12-01 00:00:00
transaction_date          2021-12-01 00:00:00
ticker                                    AXP
asset_description    American Express Company
type                                 purchase
amount                                  15000
name                    Aston Donald McEachin
represents                               VA04
chamber                                 house
first_name                              Aston
last_name                            McEachin
Name: 5, dtype: object

In [153]:
house.shape

(14855, 11)

In [165]:
house[house['name'] == "Tom O'Halleran"]

Unnamed: 0,disclosure_date,transaction_date,ticker,asset_description,type,amount,name,represents,chamber,first_name,last_name,party,birthday,gender
6773,2020-04-17,2020-03-31,CAT,"Caterpillar, Inc.",sale_full,15000,Tom O'Halleran,AZ01,house,Tom,O'Halleran,,,
6774,2020-04-17,2020-03-31,NEM,Newmont Corporation,sale_full,15000,Tom O'Halleran,AZ01,house,Tom,O'Halleran,,,


In [160]:
house.loc[house['last_name'] == 'FACS', 'last_name'] = 'Dunn'
house.loc[house['last_name'] == 'Arenholz', 'last_name'] = 'Hinson'

In [197]:
relevant_legislators.loc[relevant_legislators['last_name'].str.contains('Halleran'), 'last_name'] = "O'Halleran"
relevant_legislators.loc[relevant_legislators['last_name'] == 'Sánchez', 'last_name'] = 'Sanchez'

In [198]:
house['party'] = ''
house['birthday'] = ''
house['gender'] = ''

name = set()
for i in house.index:
    #try:
    first_name = house.loc[i, 'first_name']
    last_name = house.loc[i, 'last_name']
    party = relevant_legislators[(relevant_legislators['first_name'] == first_name) & (relevant_legislators['last_name'].map(lambda x: x.split()[-1]) == last_name)]['party'].values
    birthday = relevant_legislators[(relevant_legislators['first_name'] == first_name) & (relevant_legislators['last_name'].map(lambda x: x.split()[-1]) == last_name)]['birthday'].values
    gender = relevant_legislators[(relevant_legislators['first_name'] == first_name) & (relevant_legislators['last_name'].map(lambda x: x.split()[-1]) == last_name)]['gender'].values
    if len(party) == 0:
        party = relevant_legislators[(relevant_legislators['district'] == float(house.loc[i, 'represents'][-2:])) & (relevant_legislators['last_name'].map(lambda x: x.split()[-1]) == last_name)]['party'].values
        birthday = relevant_legislators[(relevant_legislators['district'] == float(house.loc[i, 'represents'][-2:])) & (relevant_legislators['last_name'].map(lambda x: x.split()[-1]) == last_name)]['birthday'].values
        gender = relevant_legislators[(relevant_legislators['district'] == float(house.loc[i, 'represents'][-2:])) & (relevant_legislators['last_name'].map(lambda x: x.split()[-1]) == last_name)]['gender'].values
    if len(party) == 0:
        name.add(str(first_name) + ' ' + str(last_name))
    house['party'][i] = party[0]
    house['birthday'][i] = birthday[0]
    house['gender'][i] = gender[0]
    #except:
        #print(i)
        #break
print(len(name))    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  house['party'][i] = party[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  house['birthday'][i] = birthday[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  house['gender'][i] = gender[0]


0


In [196]:
name

{"Tom O'Halleran"}

In [182]:
relevant_legislators[relevant_legislators['last_name'] == 'Sanchez']

Unnamed: 0,last_name,first_name,middle_name,suffix,full_name,birthday,gender,type,state,party,district
11842,Sanchez,Loretta,B.,,Loretta Sanchez,1960-01-07,F,rep,CA,Democrat,46.0


In [184]:
#linda sanchez had an accento
relevant_legislators[relevant_legislators['first_name'] == 'Linda']

Unnamed: 0,last_name,first_name,middle_name,suffix,full_name,birthday,gender,type,state,party,district
189,Sánchez,Linda,T.,,Linda T. Sánchez,1969-01-28,F,rep,CA,Democrat,38.0
11208,Smith,Linda,,,,1950-07-16,F,rep,WA,Republican,3.0


In [183]:
house[house['last_name'] == 'Sanchez']

Unnamed: 0,disclosure_date,transaction_date,ticker,asset_description,type,amount,name,represents,chamber,first_name,last_name,party,birthday,gender
9730,2020-09-13,2019-02-22,BAC,Bank of America Corporation,sale_full,15000,Linda T. Sanchez,CA38,house,Linda,Sanchez,,,
9731,2020-09-13,2019-02-22,ENB,Enbridge Inc.,sale_full,15000,Linda T. Sanchez,CA38,house,Linda,Sanchez,,,
9732,2020-09-13,2019-02-22,F,Ford Motor Co.,sale_full,15000,Linda T. Sanchez,CA38,house,Linda,Sanchez,,,
9733,2020-09-13,2019-02-22,RTNB,Root 9 B Technologies Inc,sale_full,15000,Linda T. Sanchez,CA38,house,Linda,Sanchez,,,
9734,2020-09-13,2019-02-22,UBS,UBS Group AG,sale_full,15000,Linda T. Sanchez,CA38,house,Linda,Sanchez,,,


In [181]:
#finding O'Halleran character
str(relevant_legislators[relevant_legislators['last_name'].str.contains("Halleran")]['last_name'].str[1]).encode()

b"321    '\nName: last_name, dtype: object"

In [170]:
house[house['party'] == '']

Unnamed: 0,disclosure_date,transaction_date,ticker,asset_description,type,amount,name,represents,chamber,first_name,last_name,party,birthday,gender
6773,2020-04-17,2020-03-31,CAT,"Caterpillar, Inc.",sale_full,15000,Tom O'Halleran,AZ01,house,Tom,O'Halleran,,,
6774,2020-04-17,2020-03-31,NEM,Newmont Corporation,sale_full,15000,Tom O'Halleran,AZ01,house,Tom,O'Halleran,,,
6775,2020-07-14,2020-06-12,JNJ,Johnson & Johnson GLB 02.450% MAR,purchase,15000,David P. Roe,TN01,house,David,Roe,,,
6776,2020-07-14,2020-06-10,MCD,McDonalds Corp Ser MTN 03.700% JA,purchase,15000,David P. Roe,TN01,house,David,Roe,,,
6777,2020-07-14,2020-06-01,T,AT&T INC CLD GLB 02.45,sale_full,15000,David P. Roe,TN01,house,David,Roe,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14850,2020-06-10,2020-04-09,SWK,"Stanley Black & Decker, Inc.",sale_partial,15000,Ed Perlmutter,CO07,house,Ed,Perlmutter,,,
14851,2020-06-10,2020-04-09,USB,U.S. Bancorp,sale_partial,15000,Ed Perlmutter,CO07,house,Ed,Perlmutter,,,
14852,2020-06-10,2020-03-13,BMY,Bristol-Myers Squibb Company,sale_full,250000,Nicholas Van Taylor,TX03,house,Nicholas,Taylor,,,
14853,2020-06-10,2020-03-13,LLY,Eli Lilly and Company,sale_full,1000000,Nicholas Van Taylor,TX03,house,Nicholas,Taylor,,,


In [106]:
house[house['last_name'].isnull()]

Unnamed: 0,disclosure_date,transaction_date,ticker,asset_description,type,amount,name,represents,chamber,first_name,last_name,party,birthday,gender
0,2021-10-04,2021-09-27,BP,BP plc,purchase,15000,Virginia Foxx,NC05,house,Virginia,,,,
1,2021-10-04,2021-09-13,XOM,Exxon Mobil Corporation,purchase,15000,Virginia Foxx,NC05,house,Virginia,,,,
2,2021-10-04,2021-09-10,ILPT,Industrial Logistics Properties Trust - Common...,purchase,50000,Virginia Foxx,NC05,house,Virginia,,,,
3,2021-10-04,2021-09-28,PM,Phillip Morris International Inc,purchase,50000,Virginia Foxx,NC05,house,Virginia,,,,
4,2021-10-04,2021-09-17,BLK,BlackRock Inc,sale_partial,15000,Alan S. Lowenthal,CA47,house,Alan,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14850,2020-06-10,2020-04-09,SWK,"Stanley Black & Decker, Inc.",sale_partial,15000,Ed Perlmutter,CO07,house,Ed,,,,
14851,2020-06-10,2020-04-09,USB,U.S. Bancorp,sale_partial,15000,Ed Perlmutter,CO07,house,Ed,,,,
14852,2020-06-10,2020-03-13,BMY,Bristol-Myers Squibb Company,sale_full,250000,Nicholas Van Taylor,TX03,house,Nicholas,,,,
14853,2020-06-10,2020-03-13,LLY,Eli Lilly and Company,sale_full,1000000,Nicholas Van Taylor,TX03,house,Nicholas,,,,


In [94]:
relevant_legislators[relevant_legislators['district'] == 23]

Unnamed: 0,last_name,first_name,middle_name,suffix,full_name,birthday,gender,type,state,party,district
130,McCarthy,Kevin,,,Kevin McCarthy,1965-01-26,M,rep,CA,Republican,23.0
201,Wasserman Schultz,Debbie,,,Debbie Wasserman Schultz,1966-09-27,F,rep,FL,Democrat,23.0
516,Gonzales,Ernest,Anthony,,Tony Gonzales,1980-10-10,M,rep,TX,Republican,23.0
10364,Rees,Thomas,Mankell,,,1925-03-26,M,rep,CA,Democrat,23.0
10388,Ammerman,Joseph,Scofield,,,1924-07-14,M,rep,PA,Democrat,23.0
10401,Caputo,Bruce,Faulkner,,,1943-08-07,M,rep,NY,Republican,23.0
10628,Mottl,Ronald,Milton,,,1934-02-06,M,rep,OH,Democrat,23.0
10878,Bustamante,Albert,Garza,,,1935-04-08,M,rep,TX,Democrat,23.0
11394,Bonilla,Henry,,,,1954-01-02,M,rep,TX,Republican,23.0
11575,McHugh,John,M.,,,1948-09-29,M,rep,NY,Republican,23.0


take last names plus districts

In [72]:
name

{'Aston McEachin',
 'Christopher Jacobs',
 'Cindy Axne',
 'Daniel Crenshaw',
 'David Cawthorn',
 'Debbie Schultz',
 'Felix Moore',
 'Greg Murphy',
 'Greg Steube',
 'James Banks',
 'James Costa',
 'James Hagedorn',
 'James Hill',
 'Kenneth Buck',
 'Linda Sanchez',
 'Michael Gallagher',
 'Michael Garcia',
 'Mr. Cox',
 'Mr. Franklin',
 'Mr. Meijer',
 'Mrs. Greene',
 'Neal FACS',
 'Nicholas Taylor',
 'None Arenholz',
 'None Jacobs',
 'None Manning',
 'None Newman',
 'None Ross',
 'None Spartz',
 'Richard Allen',
 'Rohit Khanna',
 'S. Krishnamoorthi',
 'Scott Franklin',
 "Tom O'Halleran"}

In [57]:
sherrod = relevant_legislators[(relevant_legislators['first_name'] == 'Sherrod') & (relevant_legislators['last_name'] == 'Brown')]['party'].values


In [58]:
sherrod

array(['Democrat'], dtype=object)

In [59]:
sherrod[0]

'Democrat'

In [187]:
house.head()

Unnamed: 0,disclosure_date,transaction_date,ticker,asset_description,type,amount,name,represents,chamber,first_name,last_name,party,birthday,gender
0,2021-10-04,2021-09-27,BP,BP plc,purchase,15000,Virginia Foxx,NC05,house,Virginia,Foxx,Republican,1943-06-29T00:00:00.000000000,F
1,2021-10-04,2021-09-13,XOM,Exxon Mobil Corporation,purchase,15000,Virginia Foxx,NC05,house,Virginia,Foxx,Republican,1943-06-29T00:00:00.000000000,F
2,2021-10-04,2021-09-10,ILPT,Industrial Logistics Properties Trust - Common...,purchase,50000,Virginia Foxx,NC05,house,Virginia,Foxx,Republican,1943-06-29T00:00:00.000000000,F
3,2021-10-04,2021-09-28,PM,Phillip Morris International Inc,purchase,50000,Virginia Foxx,NC05,house,Virginia,Foxx,Republican,1943-06-29T00:00:00.000000000,F
4,2021-10-04,2021-09-17,BLK,BlackRock Inc,sale_partial,15000,Alan S. Lowenthal,CA47,house,Alan,Lowenthal,Democrat,1941-03-08T00:00:00.000000000,M


In [45]:
house.dtypes

disclosure_date      datetime64[ns]
transaction_date     datetime64[ns]
ticker                       object
asset_description            object
type                         object
amount                        int64
name                         object
represents                   object
chamber                      object
first_name                   object
last_name                    object
party                        object
birthday                     object
gender                       object
dtype: object

In [119]:
#house['party'] = str(house['party'])

When I do the following based on it being regular string [r'[\W_]+'...], it tells me I can't use a string on a byte type, but when I do it with b for byte (as found on [this site](https://stackoverflow.com/a/61720425)) it tells me it's a string and can't use byte. ????????

house['party'] = house['party'].map(lambda x: x[0])
house['birthday'] = house['birthday'].map(lambda x: re.sub(br'[\W_]+', '', x))
house['gender'] = house['gender'].map(lambda x: re.sub(br'[\W_]+', '', x))

In [50]:
house['party'] = house['party'].map(lambda x: x.replace(['[', '']).replace(']', ''))
house['birthday'] = house['birthday'].map(lambda x: re.sub(br'[\W_]+', '', x))
house['gender'] = house['gender'].map(lambda x: re.sub(br'[\W_]+', '', x))

AttributeError: 'numpy.ndarray' object has no attribute 'replace'

In [201]:
house.dtypes

disclosure_date      datetime64[ns]
transaction_date     datetime64[ns]
ticker                       object
asset_description            object
type                         object
amount                        int64
name                         object
represents                   object
chamber                      object
first_name                   object
last_name                    object
party                        object
birthday                     object
gender                       object
dtype: object

In [207]:
house['birthday'] = house['birthday'].map(datetime())

TypeError: function missing required argument 'year' (pos 1)

In [199]:
house.head()

Unnamed: 0,disclosure_date,transaction_date,ticker,asset_description,type,amount,name,represents,chamber,first_name,last_name,party,birthday,gender
0,2021-10-04,2021-09-27,BP,BP plc,purchase,15000,Virginia Foxx,NC05,house,Virginia,Foxx,Republican,1943-06-29T00:00:00.000000000,F
1,2021-10-04,2021-09-13,XOM,Exxon Mobil Corporation,purchase,15000,Virginia Foxx,NC05,house,Virginia,Foxx,Republican,1943-06-29T00:00:00.000000000,F
2,2021-10-04,2021-09-10,ILPT,Industrial Logistics Properties Trust - Common...,purchase,50000,Virginia Foxx,NC05,house,Virginia,Foxx,Republican,1943-06-29T00:00:00.000000000,F
3,2021-10-04,2021-09-28,PM,Phillip Morris International Inc,purchase,50000,Virginia Foxx,NC05,house,Virginia,Foxx,Republican,1943-06-29T00:00:00.000000000,F
4,2021-10-04,2021-09-17,BLK,BlackRock Inc,sale_partial,15000,Alan S. Lowenthal,CA47,house,Alan,Lowenthal,Democrat,1941-03-08T00:00:00.000000000,M


In [105]:
house.dtypes

disclosure_date      datetime64[ns]
transaction_date     datetime64[ns]
ticker                       object
asset_description            object
type                         object
amount                        int64
name                         object
represents                   object
chamber                      object
first_name                   object
last_name                    object
party                        object
dtype: object

In [366]:
house.to_csv('./loren_data/clean_house_2022-07-15.csv')