In [352]:
import pandas as pd
from datetime import datetime

In [441]:
pd.options.display.max_rows = None

In [479]:
house = pd.read_csv("./loren_data/house_2022-07-15.csv")
house.head()

Unnamed: 0,disclosure_year,disclosure_date,transaction_date,owner,ticker,asset_description,type,amount,representative,district,ptr_link,cap_gains_over_200_usd
0,2021,10/04/2021,2021-09-27,joint,BP,BP plc,purchase,"$1,001 - $15,000",Hon. Virginia Foxx,NC05,https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2021/20019557.pdf,False
1,2021,10/04/2021,2021-09-13,joint,XOM,Exxon Mobil Corporation,purchase,"$1,001 - $15,000",Hon. Virginia Foxx,NC05,https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2021/20019557.pdf,False
2,2021,10/04/2021,2021-09-10,joint,ILPT,Industrial Logistics Properties Trust - Common Shares of Beneficial Interest,purchase,"$15,001 - $50,000",Hon. Virginia Foxx,NC05,https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2021/20019557.pdf,False
3,2021,10/04/2021,2021-09-28,joint,PM,Phillip Morris International Inc,purchase,"$15,001 - $50,000",Hon. Virginia Foxx,NC05,https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2021/20019557.pdf,False
4,2021,10/04/2021,2021-09-17,self,BLK,BlackRock Inc,sale_partial,"$1,001 - $15,000",Hon. Alan S. Lowenthal,CA47,https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2021/20019570.pdf,False


In [480]:
house.dtypes

disclosure_year            int64
disclosure_date           object
transaction_date          object
owner                     object
ticker                    object
asset_description         object
type                      object
amount                    object
representative            object
district                  object
ptr_link                  object
cap_gains_over_200_usd      bool
dtype: object

# Nulls

In [481]:
house.isnull().sum()

disclosure_year              0
disclosure_date              0
transaction_date             0
owner                     5614
ticker                       0
asset_description            4
type                         0
amount                       0
representative               0
district                     0
ptr_link                     0
cap_gains_over_200_usd       0
dtype: int64

## Null Owners

In [482]:
house[house['owner'].isnull()].head()

Unnamed: 0,disclosure_year,disclosure_date,transaction_date,owner,ticker,asset_description,type,amount,representative,district,ptr_link,cap_gains_over_200_usd
6,2021,12/01/2021,2021-11-30,,KPLTW,Katapult Holdings Inc - Warrant,purchase,"$1,001 - $15,000",Hon. Austin Scott,GA08,https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2021/20019915.pdf,False
7,2021,12/01/2021,2021-11-18,,AMD,Advanced Micro Devices Inc,sale_full,"$50,001 - $100,000",Hon. Thomas Suozzi,NY03,https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2021/20019917.pdf,False
8,2021,12/01/2021,2021-11-18,,AAPL,Apple Inc,sale_full,"$50,001 - $100,000",Hon. Thomas Suozzi,NY03,https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2021/20019917.pdf,False
9,2021,12/01/2021,2021-11-24,,MSFT,Microsoft Corporation,purchase,"$50,001 - $100,000",Hon. Thomas Suozzi,NY03,https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2021/20019917.pdf,False
10,2021,12/01/2021,2021-11-24,,MS,Morgan Stanley,purchase,"$100,001 - $250,000",Hon. Thomas Suozzi,NY03,https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2021/20019917.pdf,False


In [483]:
percent_null = house.owner.isnull().sum()/len(house)
percent_null

0.3779198922921575

In [484]:
house['owner'].value_counts(dropna = False)

NaN          5614
joint        4635
self         2897
--           1315
dependent     394
Name: owner, dtype: int64

We decided to drop the `'owner'` column because there were so many nulls and '--'. For our purposes, to see if congresspeople are taking advantage of insider trading, the distinction of who in the congressperson's family technically owns the equity is unimportant. It would be interesting to explore ownership, but there's just too many nulls.

In [485]:
house.drop(columns = ['owner'], inplace = True)

## Null Asset Descriptions

In [486]:
pd.set_option('max_colwidth', 1000)

In [487]:
house[house['asset_description'].isnull()]

Unnamed: 0,disclosure_year,disclosure_date,transaction_date,ticker,asset_description,type,amount,representative,district,ptr_link,cap_gains_over_200_usd
801,2021,02/10/2021,2021-01-12,BLL,,sale_partial,"$1,001 - $15,000",Hon. Susie Lee,NV03,https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2021/20018168.pdf,True
3933,2021,03/28/2021,2021-03-16,CELO,,purchase,"$1,001 - $15,000",Hon. Mark Dr Green,TN07,https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2021/20018471.pdf,False
12597,2021,03/18/2021,2021-02-16,URGO,,purchase,"$15,001 - $50,000",Hon. Brian Mast,FL18,https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2021/20018421.pdf,False
12598,2021,03/18/2021,2021-02-18,URGO,,purchase,"$1,001 - $15,000",Hon. Brian Mast,FL18,https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2021/20018421.pdf,False


It's unclear what these are. We'll drop them, as a result.

BLL appears to be the Ball Corporation, which trades in Vienna as BLL but on the NYSE as BALL. It's not clear which it is.

CELO also appears to be a blockchain currency.

URGO could be UGRO or it could be an (apparently small) blockchain currency. If that's the case, it's possible Brian Mast owns a very signicant chunk of it ([see here](https://thebittimes.com/token-URGO-BSC-0x06b1927e40F04b5b6EBc353842B678011b766B8A.html)).

In [488]:
house.isnull().sum()

disclosure_year           0
disclosure_date           0
transaction_date          0
ticker                    0
asset_description         4
type                      0
amount                    0
representative            0
district                  0
ptr_link                  0
cap_gains_over_200_usd    0
dtype: int64

In [489]:
house.dropna(inplace = True)

In [490]:
house.isnull().sum()

disclosure_year           0
disclosure_date           0
transaction_date          0
ticker                    0
asset_description         0
type                      0
amount                    0
representative            0
district                  0
ptr_link                  0
cap_gains_over_200_usd    0
dtype: int64

# Datetimes

In [491]:
house['disclosure_date'] = pd.to_datetime(house['disclosure_date'], yearfirst=True)

We discovered some wonky years in the transaction dates. Fortunately, it was highly likely the disclosure year was the year of the transaction, so we substituted that for the year in the transaction date, which enabled us to convert to datetimes.

This [stackoverflow answer](https://stackoverflow.com/a/56968849) helped to understand how to pull the first item from within the list.

In [492]:
house['transaction_date'] = house['transaction_date'].str.split('-')

weird_years = house[(house['transaction_date'].str[0] != '2017') &
      (house['transaction_date'].str[0] != '2018') &
      (house['transaction_date'].str[0] != '2019') &
      (house['transaction_date'].str[0] != '2020') &
      (house['transaction_date'].str[0] != '2021') &
      (house['transaction_date'].str[0] != '2022')]

for i in weird_years.index:
    house.loc[i, 'transaction_date'][0] = str(house.loc[i, 'disclosure_year'])

In [493]:
#verifying the years have been fixed
house[(house['transaction_date'].str[0] != '2017') &
      (house['transaction_date'].str[0] != '2018') &
      (house['transaction_date'].str[0] != '2019') &
      (house['transaction_date'].str[0] != '2020') &
      (house['transaction_date'].str[0] != '2021') &
      (house['transaction_date'].str[0] != '2022')]

Unnamed: 0,disclosure_year,disclosure_date,transaction_date,ticker,asset_description,type,amount,representative,district,ptr_link,cap_gains_over_200_usd


In [494]:
house['transaction_date'] = house['transaction_date'].str.join('-')
house['transaction_date'] = pd.to_datetime(house['transaction_date'])
house.dtypes

disclosure_year                    int64
disclosure_date           datetime64[ns]
transaction_date          datetime64[ns]
ticker                            object
asset_description                 object
type                              object
amount                            object
representative                    object
district                          object
ptr_link                          object
cap_gains_over_200_usd              bool
dtype: object

In [495]:
#verifying the range of dates is what we expect
house['transaction_date'].describe(datetime_is_numeric=True)

count                            14851
mean     2020-11-29 16:54:54.808430336
min                2017-09-05 00:00:00
25%                2020-04-21 12:00:00
50%                2020-11-19 00:00:00
75%                2021-06-08 00:00:00
max                2022-12-31 00:00:00
Name: transaction_date, dtype: object

# Amounts

In [496]:
house['amount'].value_counts()

$1,001 - $15,000            10444
$15,001 - $50,000            2361
$50,001 - $100,000            750
$100,001 - $250,000           572
$250,001 - $500,000           243
$1,001 -                      242
$500,001 - $1,000,000         150
$1,000,001 - $5,000,000        41
$1,000,000 +                   30
$5,000,001 - $25,000,000        9
$1,000 - $15,000                4
$15,000 - $50,000               3
$50,000,000 +                   1
$1,000,000 - $5,000,000         1
Name: amount, dtype: int64

Review of .replace() from [here](https://www.symbiosisacademy.org/tutorial-index/pandas-search-replace-values-columns/)

In [497]:
house = house.replace(to_replace={'amount':['$1,001 -', '$1,000 - $15,000']}, value= '$1,001 - $15,000')
house = house.replace(to_replace={'amount':['$1,000,000 +', '$1,000,000 - $5,000,000']}, value= '$1,000,001 - $5,000,000')
house = house.replace(to_replace={'amount':['$15,000 - $50,000']}, value= '$15,001 - $50,000')
house['amount'].value_counts()

$1,001 - $15,000            10690
$15,001 - $50,000            2364
$50,001 - $100,000            750
$100,001 - $250,000           572
$250,001 - $500,000           243
$500,001 - $1,000,000         150
$1,000,001 - $5,000,000        72
$5,000,001 - $25,000,000        9
$50,000,000 +                   1
Name: amount, dtype: int64

In [498]:
house['amount'] = house['amount'].str.split(' ') 

for i in house.index:
    if len(house.loc[i, 'amount']) == 3:
        house.loc[i, 'amount'] = house.loc[i, 'amount'][2]
    else:
        house.loc[i, 'amount'] = house.loc[i, 'amount'][0]
        
house['amount'] = house['amount'].map(lambda x: x.replace('$', '').replace(',', ''))
house['amount'] = house['amount'].astype(int)
house['amount'].value_counts()

15000       10690
50000        2364
100000        750
250000        572
500000        243
1000000       150
5000000        72
25000000        9
50000000        1
Name: amount, dtype: int64

## Double Checking Data Types

In [499]:
house.dtypes

disclosure_year                    int64
disclosure_date           datetime64[ns]
transaction_date          datetime64[ns]
ticker                            object
asset_description                 object
type                              object
amount                             int64
representative                    object
district                          object
ptr_link                          object
cap_gains_over_200_usd              bool
dtype: object

# Ticker Symbols

In [500]:
house['ticker'].value_counts(dropna=False)

--               1208
MSFT              244
AAPL              181
NTAP              130
TDDXX             122
FB                115
AMZN              102
BRK.B             100
RUN                94
TSLA               89
NVDA               86
V                  75
JPM                72
DIS                72
BABA               71
T                  71
HD                 70
JNJ                68
XOM                62
PYPL               61
SBUX               60
GNRC               58
ACN                54
NFLX               54
ET                 54
VZ                 53
GOOG               52
COST               52
NEE                51
MMP                51
GOOGL              49
CVX                49
C                  48
USAC               48
SQ                 47
TJX                46
CMCSA              44
AM                 44
WFC                44
PFE                42
UNH                42
MA                 41
KO                 40
EL                 39
ENLC               39
NGL       

In [501]:
house[house['ticker'].isna()]

Unnamed: 0,disclosure_year,disclosure_date,transaction_date,ticker,asset_description,type,amount,representative,district,ptr_link,cap_gains_over_200_usd


## '--' Tickers
'ticker' == '--' is about 8% of our data

In [502]:
house['ticker'].value_counts().values[0]/len(house)

0.081341323816578

In [503]:
house[house['ticker'] == '--'].head()

Unnamed: 0,disclosure_year,disclosure_date,transaction_date,ticker,asset_description,type,amount,representative,district,ptr_link,cap_gains_over_200_usd
56,2020,2020-09-22,2020-08-17,--,Metallic Minerals Corp.,sale_partial,250000,Mr. TJ John (Tj) Cox,CA21,https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2020/20017403.pdf,True
57,2021,2021-03-23,2021-01-27,--,Zimmer Biomet Holdings,sale_partial,15000,Hon. Alan S. Lowenthal,CA47,https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2021/20018447.pdf,False
58,2021,2021-03-23,2021-02-08,--,Zimmer Biomet Holdings,sale_full,15000,Hon. Alan S. Lowenthal,CA47,https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2021/20018447.pdf,False
59,2021,2021-03-23,2021-02-08,--,Zimmer Biomet Holdings,purchase,15000,Hon. Alan S. Lowenthal,CA47,https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2021/20018448.pdf,False
61,2021,2021-03-23,2021-02-19,--,Celegene Corp,sale_full,50000,Hon. Alan S. Lowenthal,CA47,https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2021/20018450.pdf,False


## This is where we fill in as many of the '--' as we can

In [504]:
tickers = pd.read_excel('./loren_data/US-Stock-Symbols.xlsx')
tickers = tickers[['Symbol', 'Name']]
tickers.columns = tickers.columns.map(lambda x: x.lower())

tickers.head()

Unnamed: 0,symbol,name
0,AAPL,Apple Inc.
1,XOM,Exxon Mobil Corporation
2,MSFT,Microsoft Corporation
3,BAC^I,Bank of America Corporation
4,IBM,International Business Machines Corporation


## Adding Missing Names to the DataFrame
We found a number of asset descriptions that don't match exactly with items in the tickers dataframe, so we added those based on research. We have near 100% certainty on almost all of them. The one slight exception is "West Fargo N D", which appears 3 times. This appears to be Wells Fargo, the bank. That was a judgment call.

In [505]:
#not in .py file
df2 = pd.DataFrame({
    'symbol': ['FLX', 'TDDXX', 'NFG', 'PM', 'LIN', 'GPN', 'EQIX', 'PYPL', 'TDDXX', 'FSEN', 'TDDXX', 'ZBH', 'ORCL', 'UNH', 'EXPE',
              'ADYEY', 'WFC'],
    'name': ['Netflix Inc', 'BLF FedFund TDDXX', 'National Fuel Gas Company', 'Philip Morris International Inc', 
             'Linde plc Ordinary Share', 'Global Payments Inc', 'Equinix Inc', 'Paypal Holdings Inc', 'BLF FedFund TDD XX', 
             'FS Energy & Power Fund Common', 'BLF FedFun TDDXX', 'Zimmer Biomet Holdings', 'Oracle Corp', 'United Health',
             'Expedia Group Inc', 'ADYEN N V ADR', 'West Fargo N D']
})

tickers = pd.concat([tickers, df2])

In [506]:
for i in house.loc[house['ticker'] == '--']['asset_description']:
    symbol = tickers.loc[tickers['name'] == i, 'symbol'].values
    if len(symbol) != 0:
        house.loc[house['asset_description'] == i, 'ticker'] = symbol[0]

In [507]:
house.isnull().sum()

disclosure_year           0
disclosure_date           0
transaction_date          0
ticker                    0
asset_description         0
type                      0
amount                    0
representative            0
district                  0
ptr_link                  0
cap_gains_over_200_usd    0
dtype: int64

In [508]:
house['ticker'].value_counts(dropna = False)

--               1138
MSFT              244
AAPL              181
TDDXX             161
NTAP              130
FB                115
AMZN              102
BRK.B             100
RUN                94
TSLA               89
NVDA               86
V                  75
JPM                72
DIS                72
T                  71
BABA               71
HD                 70
JNJ                68
PYPL               62
XOM                62
SBUX               60
GNRC               58
ET                 54
ACN                54
VZ                 53
GOOG               52
COST               52
MMP                51
NEE                51
GOOGL              49
CVX                49
USAC               48
C                  48
SQ                 47
WFC                47
TJX                46
UNH                45
CMCSA              44
AM                 44
PFE                42
MA                 41
KO                 40
EL                 39
ENLC               39
NGL                39
PG        

In [509]:
house[house['ticker'] == '--'].head()

Unnamed: 0,disclosure_year,disclosure_date,transaction_date,ticker,asset_description,type,amount,representative,district,ptr_link,cap_gains_over_200_usd
56,2020,2020-09-22,2020-08-17,--,Metallic Minerals Corp.,sale_partial,250000,Mr. TJ John (Tj) Cox,CA21,https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2020/20017403.pdf,True
61,2021,2021-03-23,2021-02-19,--,Celegene Corp,sale_full,50000,Hon. Alan S. Lowenthal,CA47,https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2021/20018450.pdf,False
62,2021,2021-03-23,2021-02-16,--,Outfront Media,sale_full,50000,Hon. Alan S. Lowenthal,CA47,https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2021/20018450.pdf,False
73,2021,2021-01-26,2021-01-21,--,Spectra Energy Capital LLC,sale_full,15000,Hon. Mo Brooks,AL05,https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2021/20018077.pdf,True
94,2021,2021-02-25,2021-01-14,--,Oregon State Health and Science University,purchase,50000,Hon. Vern Buchanan,FL16,https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2021/20018280.pdf,False


In [510]:
house[house['ticker'] == '--']['asset_description'].value_counts(dropna=False)

US Treasury Notes                                                                                                                                  8
United States Treasury Notes                                                                                                                       8
U.S. Treasury Bills                                                                                                                                6
Indiana Bd Bk Rev                                                                                                                                  6
JPMORGAN FEDERAL MONEY MARKET FUND                                                                                                                 5
Round Rock Tex Indpt SCH Dist 5.00% Due 08/01/25                                                                                                   4
Triborough Brdg & Tunl Auth N                                                                             

While there are certainly equities in the remaining '--' tickers, the vast majority appear to be bonds or other non-equity holdings. As there's no way to scale the search for additional equities at this point, we decided to drop the '--' rows.

In [511]:
len(house)

14851

In [512]:
len(house[house['ticker'] == '--'])

1138

In [513]:
house = house[house['ticker'] != '--']
len(house)

13713

In [514]:
14851- 1138

13713

# 'type'
We examined the 'exchanges' in both this data set and in our Senate dataset and determined we should eliminate the 'exchanges' because it was impossible to know what the exchange was for these values and incorporating the Senate information (which included both items in the exchange) would be too difficult. Given that it's only ~100 items in each data set, we decided we could afford to remove these data points.

We decided to convert the one 'sale' to a 'sale_full' because the majority of sales are full.

In [515]:
house['type'].value_counts()

purchase        7109
sale_full       4355
sale_partial    2146
exchange         102
sale               1
Name: type, dtype: int64

In [516]:
house[house['type'] == 'exchange']

Unnamed: 0,disclosure_year,disclosure_date,transaction_date,ticker,asset_description,type,amount,representative,district,ptr_link,cap_gains_over_200_usd
250,2021,2021-04-23,2021-03-16,SON,Sonoco Products Company,exchange,15000,Hon. Lois Frankel,FL21,https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2021/20018606.pdf,False
675,2021,2021-05-10,2021-04-21,MRVL,Marvell Technology Inc,exchange,15000,Hon. Susie Lee,NV03,https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2021/20018701.pdf,False
892,2020,2020-08-11,2019-07-01,LHX,"L3Harris Technologies, Inc.",exchange,15000,Hon. Grace Meng,NY06,https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2020/20017020.pdf,False
1353,2020,2020-04-10,2020-03-02,IR,Ingersoll Rand Inc.,exchange,15000,Hon. Dean Phillips,MN03,https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2020/20016369.pdf,False
1723,2020,2020-03-10,2020-02-04,PLD,"ProLogis, Inc.",exchange,15000,Hon. Zoe Lofgren,CA19,https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2020/20016162.pdf,False
1781,2021,2021-08-03,2021-01-19,STLA,Stellantis NV Common Shares,exchange,15000,Hon. August Lee Pfluger,TX11,https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2021/20019227.pdf,False
1782,2021,2021-08-03,2021-01-19,STLA,Stellantis NV Common Shares,exchange,15000,Hon. August Lee Pfluger,TX11,https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2021/20019227.pdf,False
2126,2022,2022-03-18,2022-02-15,AMD,Advanced Micro Devices Inc,exchange,15000,Hon. Christopher L. Jacobs,NY27,https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2022/20020654.pdf,False
2262,2021,2021-03-10,2021-03-01,MS,Morgan Stanley,exchange,50000,Hon. Debbie Dingell,MI12,https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2021/20018361.pdf,False
2348,2022,2022-06-09,2022-02-09,IX,Net Zero PLC - IX investments LLC merged with Net Zero PLC,exchange,500000,Hon. Jamie Raskin,MD08,https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2022/20020973.pdf,False


In [517]:
len(house)

13713

In [518]:
house = house[house['type'] != 'exchange']

In [519]:
len(house)

13611

In [520]:
house[house['type'] == 'sale']

Unnamed: 0,disclosure_year,disclosure_date,transaction_date,ticker,asset_description,type,amount,representative,district,ptr_link,cap_gains_over_200_usd
9565,2021,2021-10-14,2021-10-12,T,AT&T Inc,sale,15000,Hon. Mo Brooks,AL05,https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2021/20019626.pdf,False


In [521]:
house.loc[house['type'] == 'sale', 'type'] = 'sale_full'

In [522]:
house['type'].value_counts()

purchase        7109
sale_full       4356
sale_partial    2146
Name: type, dtype: int64

# Looking at .describe() as a sanity check

In [523]:
house.describe()

Unnamed: 0,disclosure_year,amount
count,13611.0,13611.0
mean,2020.630593,70733.23
std,0.703754,666269.0
min,2020.0,15000.0
25%,2020.0,15000.0
50%,2020.0,15000.0
75%,2021.0,15000.0
max,2022.0,25000000.0


It appears our $50,000,000 transaction disappeared -- it presumably didn't have a ticker.

We need our columns to agree with the Senate dataframe columns, so we made the following changes. While we erred on the side of keeping information for now, we dropped the 'ptr_link' because we don't anticipate doing any analysis on it. If we need it, we'll come back and reference it from the csv or from prior to this point.

This [stackoverflow answer](https://stackoverflow.com/a/54195568) helped us to find the differences between the lists of columns 

In [524]:
house_cols = list(house.columns)

senate_cols = ['transaction_date', 'owner', 'ticker', 'asset_description',
       'asset_type', 'type', 'amount', 'comment', 'senator', 'ptr_link',
       'disclosure_date', 'year'] 

house_different_cols = list(set(house_cols).difference(senate_cols))
print(f'In House, not in Senate: {house_different_cols}')

senate_different_cols = list(set(senate_cols).difference(house_cols))
print(f'In Senate, not in House: {senate_different_cols}')

In House, not in Senate: ['representative', 'district', 'disclosure_year', 'cap_gains_over_200_usd']
In Senate, not in House: ['senator', 'owner', 'asset_type', 'year', 'comment']


In [525]:
house['chamber'] = 'house'
house.drop(columns = ['ptr_link', 'cap_gains_over_200_usd', 'disclosure_year'], inplace = True)
house.rename(columns = {'representative': 'name', 'district': 'represents'}, inplace = True)

In [526]:
house_cols = list(house.columns)

senate_cols = ['transaction_date', 'owner', 'ticker', 'asset_description',
       'asset_type', 'type', 'amount', 'comment', 'senator', 'ptr_link',
       'disclosure_date', 'year'] 

house_different_cols = list(set(house_cols).difference(senate_cols))
print(f'In House, not in Senate: {house_different_cols}')

senate_different_cols = list(set(senate_cols).difference(house_cols))
print(f'In Senate, not in House: {senate_different_cols}')

In House, not in Senate: ['name', 'chamber', 'represents']
In Senate, not in House: ['senator', 'owner', 'ptr_link', 'asset_type', 'year', 'comment']


The following two data sets came from [this github](https://github.com/unitedstates/congress-legislators/).

In [527]:
current_legislators = pd.read_csv('./loren_data/legislators-current.csv')
current_legislators.shape

(537, 34)

In [528]:
current_leg_cols = list(current_legislators.columns)
current_leg_cols

['last_name',
 'first_name',
 'middle_name',
 'suffix',
 'nickname',
 'full_name',
 'birthday',
 'gender',
 'type',
 'state',
 'district',
 'senate_class',
 'party',
 'url',
 'address',
 'phone',
 'contact_form',
 'rss_url',
 'twitter',
 'facebook',
 'youtube',
 'youtube_id',
 'bioguide_id',
 'thomas_id',
 'opensecrets_id',
 'lis_id',
 'fec_ids',
 'cspan_id',
 'govtrack_id',
 'votesmart_id',
 'ballotpedia_id',
 'washington_post_id',
 'icpsr_id',
 'wikipedia_id']

In [529]:
historical_legislators = pd.read_csv('./loren_data/legislators-historical.csv')
historical_legislators.shape

(12056, 34)

In [530]:
historical_leg_cols = list(historical_legislators.columns)
historical_leg_cols

['last_name',
 'first_name',
 'middle_name',
 'suffix',
 'nickname',
 'full_name',
 'birthday',
 'gender',
 'type',
 'state',
 'district',
 'senate_class',
 'party',
 'url',
 'address',
 'phone',
 'contact_form',
 'rss_url',
 'twitter',
 'facebook',
 'youtube',
 'youtube_id',
 'bioguide_id',
 'thomas_id',
 'opensecrets_id',
 'lis_id',
 'fec_ids',
 'cspan_id',
 'govtrack_id',
 'votesmart_id',
 'ballotpedia_id',
 'washington_post_id',
 'icpsr_id',
 'wikipedia_id']

In [531]:
current_diff_cols = list(set(current_leg_cols).difference(historical_leg_cols))
current_diff_cols

[]

The above determines the columns are the same. We shrank the historical ones to eliminate irrelevant entries. Making birthdays datetime. Referenced [this site](https://www.w3schools.com/python/python_datetime.asp).

In [532]:
historical_legislators['birthday'] = pd.to_datetime(historical_legislators['birthday'], format = '%Y-%m-%d')
current_legislators['birthday'] = pd.to_datetime(current_legislators['birthday'], format = '%Y-%m-%d')
historical_legislators = historical_legislators[historical_legislators['birthday'] > datetime(1922,1, 1)]

relevant_legislators = pd.concat([current_legislators, historical_legislators])
relevant_legislators = relevant_legislators[['last_name', 'first_name', 'middle_name', 'suffix',
                                             'full_name', 'birthday', 'gender', 'type', 'state', 'party', 'district']]

In [533]:
relevant_legislators.dtypes

last_name              object
first_name             object
middle_name            object
suffix                 object
full_name              object
birthday       datetime64[ns]
gender                 object
type                   object
state                  object
party                  object
district              float64
dtype: object

The following is to build a first_name and last_name column in the house dataframe to make it easier to search for the representative in the relevant_legislators dataframe. This worked for the fast majority of representatives/instances, but there were a few representatives for whom it identified the incorrect first and/or last name, as well as some dirty data (i.e. 'None' at the start of the name, which became the first name). That's cleaned below.

In [534]:
house['name'] = house['name'].map(lambda x: x.replace("Hon. ", ""))
house['first_name'] = house['name'].map(lambda x: x.split()[0])
house['last_name'] = house['name'].map(lambda x: x.split()[-1])

There were quite a number of representatives that needed special treatment to get the data from the relevant_representatives dataframe into the house dataframe. 

They were: {'Aston McEachin', 'Christopher Jacobs', 'Cindy Axne', 'Daniel Crenshaw', 'David Cawthorn', 'Debbie Schultz', 'Felix Moore', 'Greg Murphy','Greg Steube','James Banks','James Costa','James Hagedorn', 'James Hill', 'Kenneth Buck', 'Linda Sanchez', 'Michael Gallagher', 'Michael Garcia', 'Mr. Cox', 'Mr. Franklin', 'Mr. Meijer', 'Mrs. Greene', 'Neal FACS', 'Nicholas Taylor', 'None Arenholz', 'None Jacobs', 'None Manning', 'None Newman', 'None Ross', 'None Spartz', 'Richard Allen', 'Rohit Khanna', 'S. Krishnamoorthi', 'Scott Franklin', "Tom O'Halleran"}

We added the second 'if' statement within the for-loop to use last name and district number if first and last name didn't work. For several of the reps this still didn't work. The rewrites to the two dataframes that appear before the for-loop were necessary to address those errors.

Ben Peck was instrumental to solving the error that the for-loop threw when those names were incorrect, and helped with finding and fixing the specific errors at office hours 7/19/22.

In [535]:
house.loc[house['last_name'] == 'FACS', 'last_name'] = 'Dunn'
house.loc[house['last_name'] == 'Arenholz', 'last_name'] = 'Hinson'

relevant_legislators.loc[relevant_legislators['last_name'].str.contains('Halleran'), 'last_name'] = "O'Halleran"
relevant_legislators.loc[relevant_legislators['last_name'] == 'Sánchez', 'last_name'] = 'Sanchez'

In [536]:
house['party'] = ''
house['birthday'] = ''
house['gender'] = ''

name = set()
for i in house.index:
    #try:
    first_name = house.loc[i, 'first_name']
    last_name = house.loc[i, 'last_name']
    party = relevant_legislators[(relevant_legislators['first_name'] == first_name) & (relevant_legislators['last_name'].map(lambda x: x.split()[-1]) == last_name)]['party'].values
    birthday = relevant_legislators[(relevant_legislators['first_name'] == first_name) & (relevant_legislators['last_name'].map(lambda x: x.split()[-1]) == last_name)]['birthday'].values
    gender = relevant_legislators[(relevant_legislators['first_name'] == first_name) & (relevant_legislators['last_name'].map(lambda x: x.split()[-1]) == last_name)]['gender'].values
    if len(party) == 0:
        party = relevant_legislators[(relevant_legislators['district'] == float(house.loc[i, 'represents'][-2:])) & (relevant_legislators['last_name'].map(lambda x: x.split()[-1]) == last_name)]['party'].values
        birthday = relevant_legislators[(relevant_legislators['district'] == float(house.loc[i, 'represents'][-2:])) & (relevant_legislators['last_name'].map(lambda x: x.split()[-1]) == last_name)]['birthday'].values
        gender = relevant_legislators[(relevant_legislators['district'] == float(house.loc[i, 'represents'][-2:])) & (relevant_legislators['last_name'].map(lambda x: x.split()[-1]) == last_name)]['gender'].values
    if len(party) == 0:
        name.add(str(first_name) + ' ' + str(last_name))
    house.loc[i, 'party'] = party[0]
    house.loc[i, 'birthday'] = birthday[0]
    house.loc[i, 'gender'] = gender[0]
    #except:
        #print(i)
        #break
print(len(name))    

0


In [537]:
house['birthday'] = pd.to_datetime(house['birthday'], format = '%Y-%m-%d')
house.drop(columns = ['first_name', 'last_name'], inplace = True)

In [542]:
relevant_legislators.to_csv('./loren_data/relevant_legislators-v1.csv', index = False)

In [540]:
tickers.to_csv('./loren_data/updated_ticker_symbols-v1.csv', index = False)

In [541]:
house.to_csv('./loren_data/clean_house_2022-07-15.csv', index = False)