In [231]:
import pandas as pd
from datetime import datetime
import re

In [232]:
#a team member needs this on their computer to get this to run, so we included it here
proxies = {'https': 'http://127.0.0.1:7769'}

In [233]:
pd.options.display.max_rows = None

In [234]:
house = pd.read_csv("data/house_2022-07-15.csv")
house.head()

Unnamed: 0,disclosure_year,disclosure_date,transaction_date,owner,ticker,asset_description,type,amount,representative,district,ptr_link,cap_gains_over_200_usd
0,2021,10/04/2021,2021-09-27,joint,BP,BP plc,purchase,"$1,001 - $15,000",Hon. Virginia Foxx,NC05,https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2021/20019557.pdf,False
1,2021,10/04/2021,2021-09-13,joint,XOM,Exxon Mobil Corporation,purchase,"$1,001 - $15,000",Hon. Virginia Foxx,NC05,https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2021/20019557.pdf,False
2,2021,10/04/2021,2021-09-10,joint,ILPT,Industrial Logistics Properties Trust - Common Shares of Beneficial Interest,purchase,"$15,001 - $50,000",Hon. Virginia Foxx,NC05,https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2021/20019557.pdf,False
3,2021,10/04/2021,2021-09-28,joint,PM,Phillip Morris International Inc,purchase,"$15,001 - $50,000",Hon. Virginia Foxx,NC05,https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2021/20019557.pdf,False
4,2021,10/04/2021,2021-09-17,self,BLK,BlackRock Inc,sale_partial,"$1,001 - $15,000",Hon. Alan S. Lowenthal,CA47,https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2021/20019570.pdf,False


In [235]:
house.dtypes

disclosure_year            int64
disclosure_date           object
transaction_date          object
owner                     object
ticker                    object
asset_description         object
type                      object
amount                    object
representative            object
district                  object
ptr_link                  object
cap_gains_over_200_usd      bool
dtype: object

# Nulls

In [236]:
house.isnull().sum()

disclosure_year              0
disclosure_date              0
transaction_date             0
owner                     5614
ticker                       0
asset_description            4
type                         0
amount                       0
representative               0
district                     0
ptr_link                     0
cap_gains_over_200_usd       0
dtype: int64

## Null Owners

In [237]:
house[house['owner'].isnull()].head()

Unnamed: 0,disclosure_year,disclosure_date,transaction_date,owner,ticker,asset_description,type,amount,representative,district,ptr_link,cap_gains_over_200_usd
6,2021,12/01/2021,2021-11-30,,KPLTW,Katapult Holdings Inc - Warrant,purchase,"$1,001 - $15,000",Hon. Austin Scott,GA08,https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2021/20019915.pdf,False
7,2021,12/01/2021,2021-11-18,,AMD,Advanced Micro Devices Inc,sale_full,"$50,001 - $100,000",Hon. Thomas Suozzi,NY03,https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2021/20019917.pdf,False
8,2021,12/01/2021,2021-11-18,,AAPL,Apple Inc,sale_full,"$50,001 - $100,000",Hon. Thomas Suozzi,NY03,https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2021/20019917.pdf,False
9,2021,12/01/2021,2021-11-24,,MSFT,Microsoft Corporation,purchase,"$50,001 - $100,000",Hon. Thomas Suozzi,NY03,https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2021/20019917.pdf,False
10,2021,12/01/2021,2021-11-24,,MS,Morgan Stanley,purchase,"$100,001 - $250,000",Hon. Thomas Suozzi,NY03,https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2021/20019917.pdf,False


In [238]:
percent_null = house.owner.isnull().sum()/len(house)
percent_null

0.3779198922921575

In [239]:
house['owner'].value_counts(dropna = False)

NaN          5614
joint        4635
self         2897
--           1315
dependent     394
Name: owner, dtype: int64

We decided to drop the `'owner'` column because there were so many nulls and '--'. For our purposes, to see if congresspeople are taking advantage of insider trading, the distinction of who in the congressperson's family technically owns the equity is unimportant. It would be interesting to explore ownership, but there's just too many nulls.

In [240]:
house.drop(columns = ['owner'], inplace = True)

## Null Asset Descriptions

In [241]:
pd.set_option('max_colwidth', 1000)

In [242]:
house[house['asset_description'].isnull()]

Unnamed: 0,disclosure_year,disclosure_date,transaction_date,ticker,asset_description,type,amount,representative,district,ptr_link,cap_gains_over_200_usd
801,2021,02/10/2021,2021-01-12,BLL,,sale_partial,"$1,001 - $15,000",Hon. Susie Lee,NV03,https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2021/20018168.pdf,True
3933,2021,03/28/2021,2021-03-16,CELO,,purchase,"$1,001 - $15,000",Hon. Mark Dr Green,TN07,https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2021/20018471.pdf,False
12597,2021,03/18/2021,2021-02-16,URGO,,purchase,"$15,001 - $50,000",Hon. Brian Mast,FL18,https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2021/20018421.pdf,False
12598,2021,03/18/2021,2021-02-18,URGO,,purchase,"$1,001 - $15,000",Hon. Brian Mast,FL18,https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2021/20018421.pdf,False


It's unclear what these are. We'll drop them, as a result.

BLL appears to be the Ball Corporation, which trades in Vienna as BLL but on the NYSE as BALL. It's not clear which it is.

CELO also appears to be a blockchain currency.

URGO could be UGRO or it could be an (apparently small) blockchain currency. If that's the case, it's possible Brian Mast owns a very signicant chunk of it ([see here](https://thebittimes.com/token-URGO-BSC-0x06b1927e40F04b5b6EBc353842B678011b766B8A.html)).

In [243]:
house.isnull().sum()

disclosure_year           0
disclosure_date           0
transaction_date          0
ticker                    0
asset_description         4
type                      0
amount                    0
representative            0
district                  0
ptr_link                  0
cap_gains_over_200_usd    0
dtype: int64

In [244]:
house.dropna(inplace = True)

In [245]:
house.isnull().sum()

disclosure_year           0
disclosure_date           0
transaction_date          0
ticker                    0
asset_description         0
type                      0
amount                    0
representative            0
district                  0
ptr_link                  0
cap_gains_over_200_usd    0
dtype: int64

# Datetimes

In [246]:
house['disclosure_date'] = pd.to_datetime(house['disclosure_date'], yearfirst=True)

We discovered some wonky years in the transaction dates. Fortunately, it was highly likely the disclosure year was the year of the transaction, so we substituted that for the year in the transaction date, which enabled us to convert to datetimes.

This [stackoverflow answer](https://stackoverflow.com/a/56968849) helped to understand how to pull the first item from within the list.

In [247]:
house['transaction_date'] = house['transaction_date'].str.split('-')

weird_years = house[(house['transaction_date'].str[0] != '2017') &
      (house['transaction_date'].str[0] != '2018') &
      (house['transaction_date'].str[0] != '2019') &
      (house['transaction_date'].str[0] != '2020') &
      (house['transaction_date'].str[0] != '2021') &
      (house['transaction_date'].str[0] != '2022')]

for i in weird_years.index:
    house.loc[i, 'transaction_date'][0] = str(house.loc[i, 'disclosure_year'])

In [248]:
#verifying the years have been fixed
house[(house['transaction_date'].str[0] != '2017') &
      (house['transaction_date'].str[0] != '2018') &
      (house['transaction_date'].str[0] != '2019') &
      (house['transaction_date'].str[0] != '2020') &
      (house['transaction_date'].str[0] != '2021') &
      (house['transaction_date'].str[0] != '2022')]

Unnamed: 0,disclosure_year,disclosure_date,transaction_date,ticker,asset_description,type,amount,representative,district,ptr_link,cap_gains_over_200_usd


In [249]:
house['transaction_date'] = house['transaction_date'].str.join('-')
house['transaction_date'] = pd.to_datetime(house['transaction_date'])
house.dtypes

disclosure_year                    int64
disclosure_date           datetime64[ns]
transaction_date          datetime64[ns]
ticker                            object
asset_description                 object
type                              object
amount                            object
representative                    object
district                          object
ptr_link                          object
cap_gains_over_200_usd              bool
dtype: object

In [250]:
#verifying the range of dates is what we expect
house['transaction_date'].describe(datetime_is_numeric=True)

count                            14851
mean     2020-11-29 16:54:54.808430336
min                2017-09-05 00:00:00
25%                2020-04-21 12:00:00
50%                2020-11-19 00:00:00
75%                2021-06-08 00:00:00
max                2022-12-31 00:00:00
Name: transaction_date, dtype: object

# Amounts

In [251]:
house['amount'].value_counts()

$1,001 - $15,000            10444
$15,001 - $50,000            2361
$50,001 - $100,000            750
$100,001 - $250,000           572
$250,001 - $500,000           243
$1,001 -                      242
$500,001 - $1,000,000         150
$1,000,001 - $5,000,000        41
$1,000,000 +                   30
$5,000,001 - $25,000,000        9
$1,000 - $15,000                4
$15,000 - $50,000               3
$50,000,000 +                   1
$1,000,000 - $5,000,000         1
Name: amount, dtype: int64

Review of .replace() from [here](https://www.symbiosisacademy.org/tutorial-index/pandas-search-replace-values-columns/)

In [252]:
house = house.replace(to_replace={'amount':['$1,001 -', '$1,000 - $15,000']}, value= '$1,001 - $15,000')
house = house.replace(to_replace={'amount':['$1,000,000 +', '$1,000,000 - $5,000,000']}, value= '$1,000,001 - $5,000,000')
house = house.replace(to_replace={'amount':['$15,000 - $50,000']}, value= '$15,001 - $50,000')
house['amount'].value_counts()

$1,001 - $15,000            10690
$15,001 - $50,000            2364
$50,001 - $100,000            750
$100,001 - $250,000           572
$250,001 - $500,000           243
$500,001 - $1,000,000         150
$1,000,001 - $5,000,000        72
$5,000,001 - $25,000,000        9
$50,000,000 +                   1
Name: amount, dtype: int64

In [253]:
house['amount'] = house['amount'].str.split(' ') 

for i in house.index:
    if len(house.loc[i, 'amount']) == 3:
        house.loc[i, 'amount'] = house.loc[i, 'amount'][2]
    else:
        house.loc[i, 'amount'] = house.loc[i, 'amount'][0]
        
house['amount'] = house['amount'].map(lambda x: x.replace('$', '').replace(',', ''))
house['amount'] = house['amount'].astype(int)
house['amount'].value_counts()

15000       10690
50000        2364
100000        750
250000        572
500000        243
1000000       150
5000000        72
25000000        9
50000000        1
Name: amount, dtype: int64

## Double Checking Data Types

In [254]:
house.dtypes

disclosure_year                    int64
disclosure_date           datetime64[ns]
transaction_date          datetime64[ns]
ticker                            object
asset_description                 object
type                              object
amount                             int64
representative                    object
district                          object
ptr_link                          object
cap_gains_over_200_usd              bool
dtype: object

# Ticker Symbols

In [255]:
house['ticker'].value_counts(dropna=False)

--               1208
MSFT              244
AAPL              181
NTAP              130
TDDXX             122
FB                115
AMZN              102
BRK.B             100
RUN                94
TSLA               89
NVDA               86
V                  75
JPM                72
DIS                72
BABA               71
T                  71
HD                 70
JNJ                68
XOM                62
PYPL               61
SBUX               60
GNRC               58
ACN                54
NFLX               54
ET                 54
VZ                 53
GOOG               52
COST               52
NEE                51
MMP                51
GOOGL              49
CVX                49
C                  48
USAC               48
SQ                 47
TJX                46
CMCSA              44
AM                 44
WFC                44
PFE                42
UNH                42
MA                 41
KO                 40
EL                 39
ENLC               39
NGL       

In [256]:
house[house['ticker'].isna()]

Unnamed: 0,disclosure_year,disclosure_date,transaction_date,ticker,asset_description,type,amount,representative,district,ptr_link,cap_gains_over_200_usd


## '--' Tickers
'ticker' == '--' is about 8% of our data

In [257]:
house['ticker'].value_counts().values[0]/len(house)

0.081341323816578

In [258]:
house[house['ticker'] == '--'].head()

Unnamed: 0,disclosure_year,disclosure_date,transaction_date,ticker,asset_description,type,amount,representative,district,ptr_link,cap_gains_over_200_usd
56,2020,2020-09-22,2020-08-17,--,Metallic Minerals Corp.,sale_partial,250000,Mr. TJ John (Tj) Cox,CA21,https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2020/20017403.pdf,True
57,2021,2021-03-23,2021-01-27,--,Zimmer Biomet Holdings,sale_partial,15000,Hon. Alan S. Lowenthal,CA47,https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2021/20018447.pdf,False
58,2021,2021-03-23,2021-02-08,--,Zimmer Biomet Holdings,sale_full,15000,Hon. Alan S. Lowenthal,CA47,https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2021/20018447.pdf,False
59,2021,2021-03-23,2021-02-08,--,Zimmer Biomet Holdings,purchase,15000,Hon. Alan S. Lowenthal,CA47,https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2021/20018448.pdf,False
61,2021,2021-03-23,2021-02-19,--,Celegene Corp,sale_full,50000,Hon. Alan S. Lowenthal,CA47,https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2021/20018450.pdf,False


## This is where we fill in as many of the '--' as we can

In [259]:
tickers = pd.read_excel('data/US-Stock-Symbols.xlsx')
tickers = tickers[['Symbol', 'Name']]
tickers.columns = tickers.columns.map(lambda x: x.lower())

tickers.head()

Unnamed: 0,symbol,name
0,AAPL,Apple Inc.
1,XOM,Exxon Mobil Corporation
2,MSFT,Microsoft Corporation
3,BAC^I,Bank of America Corporation
4,IBM,International Business Machines Corporation


## this code in Senate helped with cleanup, so trying it out in the house.

In [260]:
house['asset_description'] = house['asset_description'].str.replace(' &amp;', '')
house['asset_description'] = house['asset_description'].str.replace(' Common Stock', '')
house['asset_description'] = house['asset_description'].str.replace(' CMN', '')
house['asset_description'] = house['asset_description'].str.replace(' ETF', '')

## Adding Missing Names to the DataFrame
We found a number of asset descriptions that don't match exactly with items in the tickers dataframe, so we added those based on research. We have near 100% certainty on almost all of them. The one slight exception is "West Fargo N D", which appears 3 times. This appears to be Wells Fargo, the bank. That was a judgment call.

The second set of names is from cleaning the Senate dataframe.

In [261]:
df2 = pd.DataFrame({
    'symbol': ['FLX', 'TDDXX', 'NFG', 'PM', 'LIN', 'GPN', 'EQIX', 'PYPL', 'TDDXX', 'FSEN', 'TDDXX', 'ZBH', 'ORCL', 'UNH', 'EXPE',
              'ADYEY', 'WFC'],
    'name': ['Netflix Inc', 'BLF FedFund TDDXX', 'National Fuel Gas Company', 'Philip Morris International Inc', 
             'Linde plc Ordinary Share', 'Global Payments Inc', 'Equinix Inc', 'Paypal Holdings Inc', 'BLF FedFund TDD XX', 
             'FS Energy & Power Fund Common', 'BLF FedFun TDDXX', 'Zimmer Biomet Holdings', 'Oracle Corp', 'United Health',
             'Expedia Group Inc', 'ADYEN N V ADR', 'West Fargo N D']
})

df3 = pd.DataFrame({
    'symbol': ['CAB', 'PFE', 'AAPL', 'AMZN', 'NKE', 'PCP', 'UAA', 'LIT', 'NFLX', 'FEZ', 'CIT', 'LNT', 'WW', 'UAA', 'PHLD'],
    'name': ["Cabela's Inc", 'Pfizer Inc', 'aapl', 'Amazon', 'Nike Inc B', 'Precision Castparts Corp', 
             'Under Armour Inc', 'Global X Lithium Battery', 'nflx', 'SPDR Euro Stoxx 50', 'CIT Group Inc (CIT)',
            'Alliant Energy Corp', 'Weight Watchers Intl Inc', 'Under Armour Inc Cl A', 'PHLD - Phillips Edison Grocery Center REIT I']
})

tickers = pd.concat([tickers, df2, df3])

In [262]:
for i in house.loc[house['ticker'] == '--']['asset_description']:
    symbol = tickers.loc[tickers['name'] == i, 'symbol'].values
    if len(symbol) != 0:
        house.loc[house['asset_description'] == i, 'ticker'] = symbol[0]

In [263]:
house.isnull().sum()

disclosure_year           0
disclosure_date           0
transaction_date          0
ticker                    0
asset_description         0
type                      0
amount                    0
representative            0
district                  0
ptr_link                  0
cap_gains_over_200_usd    0
dtype: int64

In [264]:
house['ticker'].value_counts(dropna = False)

--               1138
MSFT              244
AAPL              181
TDDXX             161
NTAP              130
FB                115
AMZN              102
BRK.B             100
RUN                94
TSLA               89
NVDA               86
V                  75
JPM                72
DIS                72
T                  71
BABA               71
HD                 70
JNJ                68
PYPL               62
XOM                62
SBUX               60
GNRC               58
ET                 54
ACN                54
VZ                 53
GOOG               52
COST               52
MMP                51
NEE                51
GOOGL              49
CVX                49
USAC               48
C                  48
SQ                 47
WFC                47
TJX                46
UNH                45
CMCSA              44
AM                 44
PFE                42
MA                 41
KO                 40
EL                 39
ENLC               39
NGL                39
PG        

In [265]:
house[house['ticker'] == '--'].head()

Unnamed: 0,disclosure_year,disclosure_date,transaction_date,ticker,asset_description,type,amount,representative,district,ptr_link,cap_gains_over_200_usd
56,2020,2020-09-22,2020-08-17,--,Metallic Minerals Corp.,sale_partial,250000,Mr. TJ John (Tj) Cox,CA21,https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2020/20017403.pdf,True
61,2021,2021-03-23,2021-02-19,--,Celegene Corp,sale_full,50000,Hon. Alan S. Lowenthal,CA47,https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2021/20018450.pdf,False
62,2021,2021-03-23,2021-02-16,--,Outfront Media,sale_full,50000,Hon. Alan S. Lowenthal,CA47,https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2021/20018450.pdf,False
73,2021,2021-01-26,2021-01-21,--,Spectra Energy Capital LLC,sale_full,15000,Hon. Mo Brooks,AL05,https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2021/20018077.pdf,True
94,2021,2021-02-25,2021-01-14,--,Oregon State Health and Science University,purchase,50000,Hon. Vern Buchanan,FL16,https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2021/20018280.pdf,False


In [266]:
house[house['ticker'] == '--']['asset_description'].value_counts(dropna=False)

US Treasury Notes                                                                                                                                  8
United States Treasury Notes                                                                                                                       8
U.S. Treasury Bills                                                                                                                                6
Indiana Bd Bk Rev                                                                                                                                  6
JPMORGAN FEDERAL MONEY MARKET FUND                                                                                                                 5
Round Rock Tex Indpt SCH Dist 5.00% Due 08/01/25                                                                                                   4
Triborough Brdg & Tunl Auth N                                                                             

While there are certainly equities in the remaining '--' tickers, the vast majority appear to be bonds or other non-equity holdings. As there's no way to scale the search for additional equities at this point, we decided to drop the '--' rows.

In [267]:
len(house)

14851

In [268]:
len(house[house['ticker'] == '--'])

1138

In [269]:
house = house[house['ticker'] != '--']
len(house)

13713

In [270]:
14851- 1138

13713

# 'type'
We examined the 'exchanges' in both this data set and in our Senate dataset and determined we should eliminate the 'exchanges' because it was impossible to know what the exchange was for these values and incorporating the Senate information (which included both items in the exchange) would be too difficult. Given that it's only ~100 items in each data set, we decided we could afford to remove these data points.

We decided to convert the one 'sale' to a 'sale_full' because the majority of sales are full.

In [271]:
house['type'].value_counts()

purchase        7109
sale_full       4355
sale_partial    2146
exchange         102
sale               1
Name: type, dtype: int64

In [272]:
house[house['type'] == 'exchange']

Unnamed: 0,disclosure_year,disclosure_date,transaction_date,ticker,asset_description,type,amount,representative,district,ptr_link,cap_gains_over_200_usd
250,2021,2021-04-23,2021-03-16,SON,Sonoco Products Company,exchange,15000,Hon. Lois Frankel,FL21,https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2021/20018606.pdf,False
675,2021,2021-05-10,2021-04-21,MRVL,Marvell Technology Inc,exchange,15000,Hon. Susie Lee,NV03,https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2021/20018701.pdf,False
892,2020,2020-08-11,2019-07-01,LHX,"L3Harris Technologies, Inc.",exchange,15000,Hon. Grace Meng,NY06,https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2020/20017020.pdf,False
1353,2020,2020-04-10,2020-03-02,IR,Ingersoll Rand Inc.,exchange,15000,Hon. Dean Phillips,MN03,https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2020/20016369.pdf,False
1723,2020,2020-03-10,2020-02-04,PLD,"ProLogis, Inc.",exchange,15000,Hon. Zoe Lofgren,CA19,https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2020/20016162.pdf,False
1781,2021,2021-08-03,2021-01-19,STLA,Stellantis NV Common Shares,exchange,15000,Hon. August Lee Pfluger,TX11,https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2021/20019227.pdf,False
1782,2021,2021-08-03,2021-01-19,STLA,Stellantis NV Common Shares,exchange,15000,Hon. August Lee Pfluger,TX11,https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2021/20019227.pdf,False
2126,2022,2022-03-18,2022-02-15,AMD,Advanced Micro Devices Inc,exchange,15000,Hon. Christopher L. Jacobs,NY27,https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2022/20020654.pdf,False
2262,2021,2021-03-10,2021-03-01,MS,Morgan Stanley,exchange,50000,Hon. Debbie Dingell,MI12,https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2021/20018361.pdf,False
2348,2022,2022-06-09,2022-02-09,IX,Net Zero PLC - IX investments LLC merged with Net Zero PLC,exchange,500000,Hon. Jamie Raskin,MD08,https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2022/20020973.pdf,False


In [273]:
len(house)

13713

In [274]:
house = house[house['type'] != 'exchange']

In [275]:
len(house)

13611

In [276]:
house[house['type'] == 'sale']

Unnamed: 0,disclosure_year,disclosure_date,transaction_date,ticker,asset_description,type,amount,representative,district,ptr_link,cap_gains_over_200_usd
9565,2021,2021-10-14,2021-10-12,T,AT&T Inc,sale,15000,Hon. Mo Brooks,AL05,https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2021/20019626.pdf,False


In [277]:
house.loc[house['type'] == 'sale', 'type'] = 'sale_full'

In [278]:
house['type'].value_counts()

purchase        7109
sale_full       4356
sale_partial    2146
Name: type, dtype: int64

# Looking at .describe() as a sanity check

In [279]:
house.describe()

Unnamed: 0,disclosure_year,amount
count,13611.0,13611.0
mean,2020.630593,70733.23
std,0.703754,666269.0
min,2020.0,15000.0
25%,2020.0,15000.0
50%,2020.0,15000.0
75%,2021.0,15000.0
max,2022.0,25000000.0


It appears our $50,000,000 transaction disappeared -- it presumably didn't have a ticker.

We need our columns to agree with the Senate dataframe columns, so we made the following changes. While we erred on the side of keeping information for now, we dropped the 'ptr_link' because we don't anticipate doing any analysis on it. If we need it, we'll come back and reference it from the csv or from prior to this point.

This [stackoverflow answer](https://stackoverflow.com/a/54195568) helped us to find the differences between the lists of columns 

In [280]:
house_cols = list(house.columns)

senate_cols = ['transaction_date', 'owner', 'ticker', 'asset_description',
       'asset_type', 'type', 'amount', 'comment', 'senator', 'ptr_link',
       'disclosure_date', 'year'] 

house_different_cols = list(set(house_cols).difference(senate_cols))
print(f'In House, not in Senate: {house_different_cols}')

senate_different_cols = list(set(senate_cols).difference(house_cols))
print(f'In Senate, not in House: {senate_different_cols}')

In House, not in Senate: ['disclosure_year', 'district', 'representative', 'cap_gains_over_200_usd']
In Senate, not in House: ['owner', 'senator', 'year', 'comment', 'asset_type']


In [281]:
house['chamber'] = 'house'
house.drop(columns = ['ptr_link', 'cap_gains_over_200_usd', 'disclosure_year'], inplace = True)
house.rename(columns = {'representative': 'name', 'district': 'represents'}, inplace = True)

In [282]:
house_cols = list(house.columns)

senate_cols = ['transaction_date', 'owner', 'ticker', 'asset_description',
       'asset_type', 'type', 'amount', 'comment', 'senator', 'ptr_link',
       'disclosure_date', 'year'] 

house_different_cols = list(set(house_cols).difference(senate_cols))
print(f'In House, not in Senate: {house_different_cols}')

senate_different_cols = list(set(senate_cols).difference(house_cols))
print(f'In Senate, not in House: {senate_different_cols}')

In House, not in Senate: ['chamber', 'represents', 'name']
In Senate, not in House: ['owner', 'senator', 'year', 'comment', 'asset_type', 'ptr_link']


The following two data sets came from [this github](https://github.com/unitedstates/congress-legislators/).

In [283]:
current_legislators = pd.read_csv('./data/legislators-current.csv')
current_legislators.shape

(537, 34)

In [284]:
current_leg_cols = list(current_legislators.columns)
current_leg_cols

['last_name',
 'first_name',
 'middle_name',
 'suffix',
 'nickname',
 'full_name',
 'birthday',
 'gender',
 'type',
 'state',
 'district',
 'senate_class',
 'party',
 'url',
 'address',
 'phone',
 'contact_form',
 'rss_url',
 'twitter',
 'facebook',
 'youtube',
 'youtube_id',
 'bioguide_id',
 'thomas_id',
 'opensecrets_id',
 'lis_id',
 'fec_ids',
 'cspan_id',
 'govtrack_id',
 'votesmart_id',
 'ballotpedia_id',
 'washington_post_id',
 'icpsr_id',
 'wikipedia_id']

In [285]:
historical_legislators = pd.read_csv('./data/legislators-historical.csv')
historical_legislators.shape

(12056, 34)

In [286]:
historical_leg_cols = list(historical_legislators.columns)
historical_leg_cols

['last_name',
 'first_name',
 'middle_name',
 'suffix',
 'nickname',
 'full_name',
 'birthday',
 'gender',
 'type',
 'state',
 'district',
 'senate_class',
 'party',
 'url',
 'address',
 'phone',
 'contact_form',
 'rss_url',
 'twitter',
 'facebook',
 'youtube',
 'youtube_id',
 'bioguide_id',
 'thomas_id',
 'opensecrets_id',
 'lis_id',
 'fec_ids',
 'cspan_id',
 'govtrack_id',
 'votesmart_id',
 'ballotpedia_id',
 'washington_post_id',
 'icpsr_id',
 'wikipedia_id']

In [287]:
current_diff_cols = list(set(current_leg_cols).difference(historical_leg_cols))
current_diff_cols

[]

The above determines the columns are the same. We shrank the historical ones to eliminate irrelevant entries. Making birthdays datetime. Referenced [this site](https://www.w3schools.com/python/python_datetime.asp).

In [288]:
historical_legislators['birthday'] = pd.to_datetime(historical_legislators['birthday'], format = '%Y-%m-%d')
current_legislators['birthday'] = pd.to_datetime(current_legislators['birthday'], format = '%Y-%m-%d')
historical_legislators = historical_legislators[historical_legislators['birthday'] > datetime(1922,1, 1)]

legislators = pd.concat([current_legislators, historical_legislators])
legislators = legislators[['last_name', 'first_name', 'middle_name', 'suffix',
                                             'full_name', 'birthday', 'gender', 'type', 'state', 'party', 'district']]

In [289]:
legislators.dtypes

last_name              object
first_name             object
middle_name            object
suffix                 object
full_name              object
birthday       datetime64[ns]
gender                 object
type                   object
state                  object
party                  object
district              float64
dtype: object

The following is to build a first_name and last_name column in the house dataframe to make it easier to search for the representative in the legislators dataframe. This worked for the fast majority of representatives/instances, but there were a few representatives for whom it identified the incorrect first and/or last name, as well as some dirty data (i.e. 'None' at the start of the name, which became the first name). That's cleaned below.

In [372]:
house.columns

Index(['disclosure_date', 'transaction_date', 'ticker', 'asset_description',
       'type', 'amount', 'name', 'represents', 'chamber', 'party', 'birthday',
       'gender'],
      dtype='object')

In [290]:
house['name'] = house['name'].map(lambda x: x.replace("Hon. ", ""))
house['first_name'] = house['name'].map(lambda x: x.split()[0])
house['last_name'] = house['name'].map(lambda x: x.split()[-1])

There were quite a number of representatives that needed special treatment to get the data from the relevant_representatives dataframe into the house dataframe. 

They were: {'Aston McEachin', 'Christopher Jacobs', 'Cindy Axne', 'Daniel Crenshaw', 'David Cawthorn', 'Debbie Schultz', 'Felix Moore', 'Greg Murphy','Greg Steube','James Banks','James Costa','James Hagedorn', 'James Hill', 'Kenneth Buck', 'Linda Sanchez', 'Michael Gallagher', 'Michael Garcia', 'Mr. Cox', 'Mr. Franklin', 'Mr. Meijer', 'Mrs. Greene', 'Neal FACS', 'Nicholas Taylor', 'None Arenholz', 'None Jacobs', 'None Manning', 'None Newman', 'None Ross', 'None Spartz', 'Richard Allen', 'Rohit Khanna', 'S. Krishnamoorthi', 'Scott Franklin', "Tom O'Halleran"}

We added the second 'if' statement within the for-loop to use last name and district number if first and last name didn't work. For several of the reps this still didn't work. The rewrites to the two dataframes that appear before the for-loop were necessary to address those errors.

Ben Peck was instrumental to solving the error that the for-loop threw when those names were incorrect, and helped with finding and fixing the specific errors at office hours 7/19/22.

In [291]:
house.loc[house['last_name'] == 'FACS', 'last_name'] = 'Dunn'
house.loc[house['last_name'] == 'Arenholz', 'last_name'] = 'Hinson'

legislators.loc[legislators['last_name'].str.contains('Halleran'), 'last_name'] = "O'Halleran"
legislators.loc[legislators['last_name'] == 'Sánchez', 'last_name'] = 'Sanchez'

In [292]:
house['party'] = ''
house['birthday'] = ''
house['gender'] = ''

name = set()
for i in house.index:
    first_name = house.loc[i, 'first_name']
    last_name = house.loc[i, 'last_name']
    party = legislators[(legislators['first_name'] == first_name) & (legislators['last_name'].map(lambda x: x.split()[-1]) == last_name)]['party'].values
    birthday = legislators[(legislators['first_name'] == first_name) & (legislators['last_name'].map(lambda x: x.split()[-1]) == last_name)]['birthday'].values
    gender = legislators[(legislators['first_name'] == first_name) & (legislators['last_name'].map(lambda x: x.split()[-1]) == last_name)]['gender'].values
    if len(party) == 0:
        party = legislators[(legislators['district'] == float(house.loc[i, 'represents'][-2:])) & (legislators['last_name'].map(lambda x: x.split()[-1]) == last_name)]['party'].values
        birthday = legislators[(legislators['district'] == float(house.loc[i, 'represents'][-2:])) & (legislators['last_name'].map(lambda x: x.split()[-1]) == last_name)]['birthday'].values
        gender = legislators[(legislators['district'] == float(house.loc[i, 'represents'][-2:])) & (legislators['last_name'].map(lambda x: x.split()[-1]) == last_name)]['gender'].values
    if len(party) == 0:
        name.add(str(first_name) + ' ' + str(last_name))
    house.loc[i, 'party'] = party[0]
    house.loc[i, 'birthday'] = birthday[0]
    house.loc[i, 'gender'] = gender[0]
print(len(name))    

0


In [293]:
house['birthday'] = pd.to_datetime(house['birthday'], format = '%Y-%m-%d')
house.drop(columns = ['first_name', 'last_name'], inplace = True)

# Senate

In [294]:
senate = pd.read_csv('data/all_transactions_senate.csv')

# Datetimes

In [295]:
senate['transaction_date'] = pd.to_datetime(senate['transaction_date'])
senate['disclosure_date'] = pd.to_datetime(senate['disclosure_date'])
senate['transaction_date'] = pd.to_datetime(senate['transaction_date'], format = '%Y-%m-%d')
senate['disclosure_date'] = pd.to_datetime(senate['disclosure_date'], format = '%Y-%m-%d')
senate.dtypes

transaction_date     datetime64[ns]
owner                        object
ticker                       object
asset_description            object
asset_type                   object
type                         object
amount                       object
comment                      object
senator                      object
ptr_link                     object
disclosure_date      datetime64[ns]
dtype: object

# Checking Years
Now let's create a year column

In [296]:
senate['year'] = senate['transaction_date'].map(lambda x: x.year)
senate.head()

Unnamed: 0,transaction_date,owner,ticker,asset_description,asset_type,type,amount,comment,senator,ptr_link,disclosure_date,year
0,2022-06-21,Spouse,--,"Broadcom Corp <div class=""text-muted""><em>Rate/Coupon:</em> 3.625%<br> <em>Matures:</em> 01/15/2024</div>",Corporate Bond,Purchase,"$15,001 - $50,000",--,Thomas R Carper,https://efdsearch.senate.gov/search/view/ptr/a6ce1670-29e5-4b70-bc24-f77d4ce7dff5/,2022-07-05,2022
1,2022-06-15,Spouse,--,"Office Properties Income Trust <div class=""text-muted""><em>Rate/Coupon:</em> 4.0%<br> <em>Matures:</em> 07/15/2022</div>",Corporate Bond,Sale (Full),"$1,001 - $15,000",--,Thomas R Carper,https://efdsearch.senate.gov/search/view/ptr/a6ce1670-29e5-4b70-bc24-f77d4ce7dff5/,2022-07-05,2022
2,2022-05-17,Spouse,--,"Lee County Florida Health Care Facilities Revenue Bond <div class=""text-muted""><em>Rate/Coupon:</em> 5.75%<br> <em>Matures:</em> 10/01/2042</div>",Municipal Security,Purchase,"$500,001 - $1,000,000",--,Rick Scott,https://efdsearch.senate.gov/search/view/ptr/8ddf0001-ba8d-4c25-b970-556e8d44149c/,2022-06-16,2022
3,2022-06-03,Spouse,WFC,Wells Fargo &amp; Company Common Stock,Stock,Purchase,"$1,001 - $15,000",Dividend Reinvestment,"A. Mitchell Mcconnell, Jr.",https://efdsearch.senate.gov/search/view/ptr/5ed49813-1cfe-42fc-8235-9da7d7a21051/,2022-06-13,2022
4,2022-05-31,Joint,X,"United States Steel Corporation Common Stock <div class=""text-muted"">Option Type: Put <br><em>Strike price:</em> $23.00 <br> <em>Expires:</em> 01/20/2023 </div>",Stock Option,Sale (Partial),"$15,001 - $50,000",--,Thomas H Tuberville,https://efdsearch.senate.gov/search/view/ptr/40602988-914b-49a7-b7d7-e82d7b0364b2/,2022-06-13,2022


In [297]:
senate['year'].value_counts()

2020    1535
2018    1395
2017    1372
2015    1152
2019    1037
2016     977
2014     727
2021     612
2022     347
2013     184
2012      60
Name: year, dtype: int64

In [298]:
senate = senate.drop(columns=['year'])
senate.head()

Unnamed: 0,transaction_date,owner,ticker,asset_description,asset_type,type,amount,comment,senator,ptr_link,disclosure_date
0,2022-06-21,Spouse,--,"Broadcom Corp <div class=""text-muted""><em>Rate/Coupon:</em> 3.625%<br> <em>Matures:</em> 01/15/2024</div>",Corporate Bond,Purchase,"$15,001 - $50,000",--,Thomas R Carper,https://efdsearch.senate.gov/search/view/ptr/a6ce1670-29e5-4b70-bc24-f77d4ce7dff5/,2022-07-05
1,2022-06-15,Spouse,--,"Office Properties Income Trust <div class=""text-muted""><em>Rate/Coupon:</em> 4.0%<br> <em>Matures:</em> 07/15/2022</div>",Corporate Bond,Sale (Full),"$1,001 - $15,000",--,Thomas R Carper,https://efdsearch.senate.gov/search/view/ptr/a6ce1670-29e5-4b70-bc24-f77d4ce7dff5/,2022-07-05
2,2022-05-17,Spouse,--,"Lee County Florida Health Care Facilities Revenue Bond <div class=""text-muted""><em>Rate/Coupon:</em> 5.75%<br> <em>Matures:</em> 10/01/2042</div>",Municipal Security,Purchase,"$500,001 - $1,000,000",--,Rick Scott,https://efdsearch.senate.gov/search/view/ptr/8ddf0001-ba8d-4c25-b970-556e8d44149c/,2022-06-16
3,2022-06-03,Spouse,WFC,Wells Fargo &amp; Company Common Stock,Stock,Purchase,"$1,001 - $15,000",Dividend Reinvestment,"A. Mitchell Mcconnell, Jr.",https://efdsearch.senate.gov/search/view/ptr/5ed49813-1cfe-42fc-8235-9da7d7a21051/,2022-06-13
4,2022-05-31,Joint,X,"United States Steel Corporation Common Stock <div class=""text-muted"">Option Type: Put <br><em>Strike price:</em> $23.00 <br> <em>Expires:</em> 01/20/2023 </div>",Stock Option,Sale (Partial),"$15,001 - $50,000",--,Thomas H Tuberville,https://efdsearch.senate.gov/search/view/ptr/40602988-914b-49a7-b7d7-e82d7b0364b2/,2022-06-13


# Nulls
Let's look at null values

In [299]:
senate.isnull().sum()

transaction_date       0
owner                465
ticker               465
asset_description      0
asset_type           666
type                 465
amount                 0
comment              465
senator                0
ptr_link               0
disclosure_date        0
dtype: int64

# Owner

We determined we don't need owner because for our purposes whether the congressperson, spouse, etc., own the asset is irrelevant to whether or not insider invformation was used

In [300]:
senate.drop(columns = ['owner'], inplace = True)
senate.isnull().sum()

transaction_date       0
ticker               465
asset_description      0
asset_type           666
type                 465
amount                 0
comment              465
senator                0
ptr_link               0
disclosure_date        0
dtype: int64

## Asset Type Nulls
A number of the nulls actually have the information we need in other columns. That said, some asset types aren't helpful to our analysis and eliminating them here removes some other NaNs.

In [301]:
senate[senate['asset_type'].isnull()].head(10)

Unnamed: 0,transaction_date,ticker,asset_description,asset_type,type,amount,comment,senator,ptr_link,disclosure_date
7370,2014-12-30,--,MDLZ (stock option),,Sale (Full),"$50,001 - $100,000",--,Roy Blunt,https://efdsearch.senate.gov/search/view/ptr/f24dd495-dcee-45a7-9138-a29d7c72f4f8/,2015-12-24
8479,2014-12-18,TJX,"The TJX Companies, Inc. (NYSE)",,Purchase,"$1,001 - $15,000",--,Sheldon Whitehouse,https://efdsearch.senate.gov/search/view/ptr/332020b5-647b-49c1-afa2-7be502f72c34/,2015-01-05
8480,2014-12-18,ORCL,Oracle Corporation (NYSE),,Purchase,"$1,001 - $15,000",--,Sheldon Whitehouse,https://efdsearch.senate.gov/search/view/ptr/332020b5-647b-49c1-afa2-7be502f72c34/,2015-01-05
8481,2014-12-18,T,"AT&amp;T, Inc. (NYSE)",,Purchase,"$1,001 - $15,000",--,Sheldon Whitehouse,https://efdsearch.senate.gov/search/view/ptr/332020b5-647b-49c1-afa2-7be502f72c34/,2015-01-05
8482,2014-12-18,KORS,Michael Kors Holdings Limited (NYSE),,Sale (Partial),"$1,001 - $15,000",--,Sheldon Whitehouse,https://efdsearch.senate.gov/search/view/ptr/332020b5-647b-49c1-afa2-7be502f72c34/,2015-01-05
8483,2014-11-25,PCAR,PACCAR Inc. (NASDAQ),,Purchase,"$1,001 - $15,000",--,Sheldon Whitehouse,https://efdsearch.senate.gov/search/view/ptr/332020b5-647b-49c1-afa2-7be502f72c34/,2015-01-05
8484,2014-11-25,IBM,International Business Machines Corporation (NYSE),,Sale (Partial),"$1,001 - $15,000",--,Sheldon Whitehouse,https://efdsearch.senate.gov/search/view/ptr/332020b5-647b-49c1-afa2-7be502f72c34/,2015-01-05
8485,2014-12-18,CMRE,Costamare Inc. (NYSE),,Purchase,"$1,001 - $15,000",--,Sheldon Whitehouse,https://efdsearch.senate.gov/search/view/ptr/332020b5-647b-49c1-afa2-7be502f72c34/,2015-01-05
8486,2014-11-25,PCAR,PACCAR Inc. (NASDAQ),,Purchase,"$1,001 - $15,000",--,Sheldon Whitehouse,https://efdsearch.senate.gov/search/view/ptr/332020b5-647b-49c1-afa2-7be502f72c34/,2015-01-05
8487,2014-11-25,PRU,"Prudential Financial, Inc. (NYSE)",,Purchase,"$1,001 - $15,000",--,Sheldon Whitehouse,https://efdsearch.senate.gov/search/view/ptr/332020b5-647b-49c1-afa2-7be502f72c34/,2015-01-05


In [302]:
senate['asset_type'].value_counts(dropna = False)

Stock                           6756
NaN                              666
PDF Disclosed Filing             465
Other Securities                 454
Municipal Security               407
Stock Option                     253
Corporate Bond                   246
Non-Public Stock                 100
Commodities/Futures Contract      48
Cryptocurrency                     3
Name: asset_type, dtype: int64

In [303]:
senate[senate['asset_type'] == 'Municipal Security'].head()

Unnamed: 0,transaction_date,ticker,asset_description,asset_type,type,amount,comment,senator,ptr_link,disclosure_date
2,2022-05-17,--,"Lee County Florida Health Care Facilities Revenue Bond <div class=""text-muted""><em>Rate/Coupon:</em> 5.75%<br> <em>Matures:</em> 10/01/2042</div>",Municipal Security,Purchase,"$500,001 - $1,000,000",--,Rick Scott,https://efdsearch.senate.gov/search/view/ptr/8ddf0001-ba8d-4c25-b970-556e8d44149c/,2022-06-16
30,2022-04-21,--,"Portland ME 736560TV5 <div class=""text-muted""><em>Rate/Coupon:</em> 5.000%<br> <em>Matures:</em> 05/01/2025</div>",Municipal Security,Purchase,"$50,001 - $100,000",--,Susan M Collins,https://efdsearch.senate.gov/search/view/ptr/6c81e400-b184-4eb3-80e7-eb91f8b5a609/,2022-05-16
54,2022-04-12,--,"Jacksonville Florida Transportation Revenue Bond <div class=""text-muted""><em>Rate/Coupon:</em> 5.0%<br> <em>Matures:</em> 10/01/2027</div>",Municipal Security,Purchase,"$500,001 - $1,000,000",--,Rick Scott,https://efdsearch.senate.gov/search/view/ptr/5409750e-79c5-48c6-9015-b3d7523c2f18/,2022-05-05
55,2022-04-06,--,"Broward County Florida Airport System Revenue Bond <div class=""text-muted""><em>Rate/Coupon:</em> 5.0%<br> <em>Matures:</em> 10/01/2026</div>",Municipal Security,Purchase,"$250,001 - $500,000",--,Rick Scott,https://efdsearch.senate.gov/search/view/ptr/5409750e-79c5-48c6-9015-b3d7523c2f18/,2022-05-05
56,2022-04-06,--,"Broward County Florida Airport System Revenue Bond <div class=""text-muted""><em>Rate/Coupon:</em> 5.0%<br> <em>Matures:</em> 10/01/2026</div>",Municipal Security,Purchase,"$500,001 - $1,000,000",--,Rick Scott,https://efdsearch.senate.gov/search/view/ptr/5409750e-79c5-48c6-9015-b3d7523c2f18/,2022-05-05


In [304]:
len(senate[ senate['asset_description'] == 'This filing was disclosed via scanned PDF. Use link in ptr_link column to view the PDF.'])

465

In [305]:
senate[senate['asset_type'] == 'Other Securities'].head()

Unnamed: 0,transaction_date,ticker,asset_description,asset_type,type,amount,comment,senator,ptr_link,disclosure_date
13,2022-05-16,ABYIX,Abbey Capital Futures Strategy Fund- Class I Sha,Other Securities,Purchase,"$1,001 - $15,000",--,Thomas R Carper,https://efdsearch.senate.gov/search/view/ptr/1faaea57-05b9-4b83-872b-7e6d806c5c56/,2022-06-03
14,2022-05-16,GSMYX,Goldman Sachs Small/Mid-Cap Growth Fund Inst Cl,Other Securities,Sale (Partial),"$1,001 - $15,000",--,Thomas R Carper,https://efdsearch.senate.gov/search/view/ptr/1faaea57-05b9-4b83-872b-7e6d806c5c56/,2022-06-03
15,2022-05-16,--,FIMKX - Fidelity Advisor Focused Emerging Markets I,Other Securities,Sale (Partial),"$1,001 - $15,000",--,Thomas R Carper,https://efdsearch.senate.gov/search/view/ptr/1faaea57-05b9-4b83-872b-7e6d806c5c56/,2022-06-03
16,2022-05-16,FCPIX,Fidelity Advisor International Capital Appreciat,Other Securities,Sale (Partial),"$1,001 - $15,000",--,Thomas R Carper,https://efdsearch.senate.gov/search/view/ptr/1faaea57-05b9-4b83-872b-7e6d806c5c56/,2022-06-03
18,2022-05-10,LUBYX,Lord Abbett Ultra Short Bond Fund - Class I,Other Securities,Sale (Partial),"$1,001 - $15,000",--,Thomas R Carper,https://efdsearch.senate.gov/search/view/ptr/1faaea57-05b9-4b83-872b-7e6d806c5c56/,2022-06-03


In [306]:
senate[senate['asset_type'] == 'PDF Disclosed Filing'].tail()

Unnamed: 0,transaction_date,ticker,asset_description,asset_type,type,amount,comment,senator,ptr_link,disclosure_date
9393,2012-08-17,,This filing was disclosed via scanned PDF. Use link in ptr_link column to view the PDF.,PDF Disclosed Filing,,Unknown,,Sheldon Whitehouse,https://efdsearch.senate.gov/search/view/paper/5221D7D8-15D0-40FE-A64C-DD242311F0AE/,2012-08-17
9394,2012-08-16,,This filing was disclosed via scanned PDF. Use link in ptr_link column to view the PDF.,PDF Disclosed Filing,,Unknown,,Pat Roberts,https://efdsearch.senate.gov/search/view/paper/5C666F29-7055-461D-B4F7-5EA73AFCD860/,2012-08-16
9395,2012-08-15,,This filing was disclosed via scanned PDF. Use link in ptr_link column to view the PDF.,PDF Disclosed Filing,,Unknown,,Robert J Portman,https://efdsearch.senate.gov/search/view/paper/0D78FC31-D28A-440A-8E2D-65C4D9861EAB/,2012-08-15
9396,2012-08-02,,This filing was disclosed via scanned PDF. Use link in ptr_link column to view the PDF.,PDF Disclosed Filing,,Unknown,,Thomas R Carper,https://efdsearch.senate.gov/search/view/paper/CFDE3B80-E8BD-4F2D-9D64-E892C5EFB32A/,2012-08-02
9397,2012-07-25,,This filing was disclosed via scanned PDF. Use link in ptr_link column to view the PDF.,PDF Disclosed Filing,,Unknown,,Benjamin L Cardin,https://efdsearch.senate.gov/search/view/paper/CDFDAF62-18EA-4298-B0C5-62085A6EC3CD/,2012-07-25


We know right now that Municipal Security, Corporate Bond, Non-Public Stock, Commodities/Futures Contract, and Cryptocurrency represent transactions that don't provide information that's helpful to our model/planned analysis. PDF Disclosed Filing are basically null, and while it's possible to look at the PDFs and enter the data, we don't have the time for that on this project, unfortunately, and the fraction of instances that represents is small.

In [307]:
len(senate)

9398

We found this repeat approach to be best because of the difficulties of continuing to include the NaNs.

In [308]:
senate = senate[senate['asset_type'] != 'Municipal Security']
senate = senate[senate['asset_type'] != 'Corporate Bond']
senate = senate[senate['asset_type'] != 'Non-Public Stock']
senate = senate[senate['asset_type'] != 'Commodities/Futures Contract']
senate = senate[senate['asset_type'] != 'Cryptocurrency']
senate = senate[senate['asset_type'] != 'PDF Disclosed Filing']
len(senate)

8129

In [309]:
senate['asset_type'].value_counts(dropna = False)

Stock               6756
NaN                  666
Other Securities     454
Stock Option         253
Name: asset_type, dtype: int64

In [310]:
senate.isnull().sum()

transaction_date       0
ticker                 0
asset_description      0
asset_type           666
type                   0
amount                 0
comment                0
senator                0
ptr_link               0
disclosure_date        0
dtype: int64

This method results in all other NaNs being removed -- this is largely if not entirely because of the 'PDF' asset type.

# Type
We're dropping Exchanges because we can't work with them for our analysis. We also made the values here match the values from the House database in formate.

In [311]:
senate['type'].value_counts()

Purchase          4033
Sale (Full)       2180
Sale (Partial)    1824
Exchange            92
Name: type, dtype: int64

In [312]:
senate = senate[senate['type'] != 'Exchange']
senate['type'].value_counts()

Purchase          4033
Sale (Full)       2180
Sale (Partial)    1824
Name: type, dtype: int64

In [313]:
senate['type'] = senate['type'].map({
    'Purchase': 'purchase',
    'Sale (Full)': 'sale_full',
    'Sale (Partial)': 'sale_partial'
                                    })
senate['type'].value_counts()

purchase        4033
sale_full       2180
sale_partial    1824
Name: type, dtype: int64

# Ticker

In [314]:
senate['ticker'].value_counts()

--                    1047
AAPL                   177
BAC                     86
MSFT                    85
NFLX                    79
PFE                     76
DISCA                   74
DIS                     73
T                       70
FEYE                    67
FDC                     66
URBN                    65
CZR                     63
FB                      58
AMZN                    55
NVDA                    54
WFC                     51
GE                      49
MRK                     45
PG                      45
XOM                     44
WMT                     44
WPX                     44
CVS                     43
DD                      43
GM                      43
BA                      42
HBI                     42
CSCO                    42
FDX                     41
GPK                     40
INTC                    40
CVX                     39
MOS                     39
GLW                     39
BWXT                    39
GILD                    38
S

The following removes a few items that cause errors in finding the symbol from the ticker dataframe.

In [315]:
senate['asset_description'] = senate['asset_description'].str.replace(' &amp;', '')
senate['asset_description'] = senate['asset_description'].str.replace(' Common Stock', '')
senate['asset_description'] = senate['asset_description'].str.replace(' CMN', '')
senate['asset_description'] = senate['asset_description'].str.replace(' ETF', '')

In [316]:
senate['ticker'].value_counts()[0]

1047

In [317]:
count = 0
name_set = set()
symbol_set = set()
for i in senate.loc[senate['ticker'] == '--']['asset_description']:
    if i in tickers['name'].unique():
        count +=1
        name_set.add(i)
        symbol = tickers.loc[tickers['name'] == i, 'symbol'].values
        #print(symbol)
        symbol_set.add(symbol[0])
        senate.loc[((senate['ticker'] == '--') & (senate['asset_description'] == i)), 'ticker'] = symbol[0]
print(count)
print(name_set)
print(symbol_set)

101
{'Fifth Third Bancorp', 'Precision Castparts Corp', 'Zimmer Holdings, Inc.', 'Bristol-Myers Squibb Company', "Cabela's Inc", 'nflx', 'Amazon', 'Target Corporation', 'Tesoro Logistics LP', 'CIT Group Inc (CIT)', 'Costco Wholesale Corporation', 'General Electric Company', 'SPDR Euro Stoxx 50', 'Under Armour Inc Cl A', 'Phillips 66', 'Under Armour Inc', 'Apple Inc.', 'Wal-Mart Stores, Inc.', 'Nike Inc B', 'Amazon.com, Inc.', 'Lincoln National Corporation', 'Alliant Energy Corp', 'Pfizer Inc', 'Altria Group', 'Verizon Communications Inc.', 'Global X Lithium Battery', 'ConocoPhillips', '3M Company', 'Quest Diagnostics Incorporated', 'Weight Watchers Intl Inc', 'PHLD - Phillips Edison Grocery Center REIT I', 'Vector Group Ltd.', 'aapl'}
{'UAA', 'GE', 'COST', 'LIT', 'DGX', 'COP', 'MO', 'NFLX', 'CAB', 'TGT', 'FEZ', 'ZMH', 'PHLD', 'WW', 'PSX', 'VGR', 'BMY', 'CIT', 'PCP', 'WMT', 'TLLP', 'VZ', 'FITB', 'AMZN', 'LNC', 'LNT', 'AAPL', 'NKE', 'MMM', 'PFE'}


In [318]:
senate['ticker'].value_counts()

--                    946
AAPL                  185
BAC                    86
MSFT                   85
PFE                    83
NFLX                   83
DISCA                  74
DIS                    73
T                      70
FEYE                   67
FDC                    66
URBN                   65
CZR                    63
AMZN                   61
FB                     58
NVDA                   54
WFC                    51
GE                     50
MRK                    45
PG                     45
WMT                    45
WPX                    44
XOM                    44
GM                     43
CVS                    43
DD                     43
HBI                    42
BA                     42
CSCO                   42
FDX                    41
GPK                    40
INTC                   40
CVX                    39
BWXT                   39
MOS                    39
GLW                    39
SBUX                   38
PYPL                   38
QCOM        

In [319]:
senate['possible_ticker'] = ''

for i in senate[senate['ticker'] == '--']['asset_description'].index:
    asset_descr = senate.loc[i, 'asset_description']
    #print(asset_descr)
    ticker = re.findall('[A-Z]{2,6}', asset_descr)
    #print(ticker)
    if len(ticker) == 1 and ticker != 'LLC' and ticker != 'ETF':
        senate.loc[i, 'possible_ticker'] = ticker[0]

senate['possible_ticker'].value_counts(dropna = False)

          7570
JPM         18
SPDR        17
WMT         12
WFM         11
ETE         10
UBS         10
GE           9
CVC          9
MA           8
HSBC         8
MSCI         7
MS           7
SXL          7
BAX          6
NFLX         6
PFS          6
AAPL         6
US           5
LNT          5
WLK          5
NY           5
NGLS         5
LYV          4
CSCO         4
DVN          4
MOS          4
PLC          4
ADR          4
MLP          4
PG           4
AQR          4
CA           4
GILD         4
MSFT         3
WPX          3
KN           3
KMI          3
GLW          3
SE           3
DISCA        3
QCOM         3
XOM          3
CVS          3
AVP          3
BK           3
EI           3
BIIB         3
CRMT         2
SPY          2
LP           2
SYY          2
AB           2
VIAB         2
FOXA         2
TJX          2
FEYE         2
ESV          2
ORCL         2
VFC          2
LQDT         2
PAGP         2
CVX          2
USG          2
ADT          2
CBS          2
AXLL      

In [320]:
#ticker['possible_in_tickers_df'] = 0

for i in senate.loc[senate['ticker'] == '--']['possible_ticker']:
    if i in tickers['symbol'].unique():
        #count +=1
        #name_set.add(i)
        symbol = tickers.loc[tickers['symbol'] == i, 'symbol'].values
        #print(symbol)
        #symbol_set.add(symbol[0])
        senate.loc[((senate['ticker'] == '--') & (senate['possible_ticker'] == i)), 'ticker'] = symbol[0]

In [321]:
senate['ticker'].value_counts(dropna = False)[:3]

--      601
AAPL    191
NFLX     89
Name: ticker, dtype: int64

In [322]:
senate[senate['ticker'] == '--']['asset_description'].value_counts(dropna = False)

Israel Bond                                                                                                                                                                                                                                                                                                                        7
Revival LOC, LLC <div class="text-muted"> <em>Company:</em> Revival LOC, LLC &nbsp;(Springfield, NJ) </div> <div class="text-muted"><em>Description:</em>&nbsp;Line of Credit - Manufacturer</div>                                                                                                                                 5
Ishares China Large Cap                                                                                                                                                                                                                                                                                                            4
Liberty Partners, LLC <di

In [323]:
#dropping 'possible_ticker'

senate.drop(columns = 'possible_ticker', inplace = True)
senate.head()

Unnamed: 0,transaction_date,ticker,asset_description,asset_type,type,amount,comment,senator,ptr_link,disclosure_date
3,2022-06-03,WFC,Wells Fargo Company,Stock,purchase,"$1,001 - $15,000",Dividend Reinvestment,"A. Mitchell Mcconnell, Jr.",https://efdsearch.senate.gov/search/view/ptr/5ed49813-1cfe-42fc-8235-9da7d7a21051/,2022-06-13
4,2022-05-31,X,"United States Steel Corporation <div class=""text-muted"">Option Type: Put <br><em>Strike price:</em> $23.00 <br> <em>Expires:</em> 01/20/2023 </div>",Stock Option,sale_partial,"$15,001 - $50,000",--,Thomas H Tuberville,https://efdsearch.senate.gov/search/view/ptr/40602988-914b-49a7-b7d7-e82d7b0364b2/,2022-06-13
5,2022-05-31,X,"United States Steel Corporation <div class=""text-muted"">Option Type: Call <br><em>Strike price:</em> $30.00 <br> <em>Expires:</em> 01/20/2023 </div>",Stock Option,sale_full,"$15,001 - $50,000",--,Thomas H Tuberville,https://efdsearch.senate.gov/search/view/ptr/40602988-914b-49a7-b7d7-e82d7b0364b2/,2022-06-13
6,2022-05-31,X,United States Steel Corporation,Stock,purchase,"$100,001 - $250,000",--,Thomas H Tuberville,https://efdsearch.senate.gov/search/view/ptr/40602988-914b-49a7-b7d7-e82d7b0364b2/,2022-06-13
7,2022-05-20,PYPL,"PayPal Holdings, Inc. -",Stock,purchase,"$50,001 - $100,000",--,Thomas H Tuberville,https://efdsearch.senate.gov/search/view/ptr/40602988-914b-49a7-b7d7-e82d7b0364b2/,2022-06-13


In [324]:
senate['ticker'].value_counts().sum()

8037

We have 8037 rows

In [325]:
(601/8933) * 100

6.727862979961939

At this point, about 7% of data has no ticker. We accept the loss of that data because there's nothing left we can do without getting even more time intensive.

In [326]:
senate = senate[senate['ticker'] != '--']

senate['ticker'].value_counts(dropna = False)[:10]

AAPL     191
NFLX      89
MSFT      88
BAC       87
PFE       84
DISCA     77
DIS       75
T         70
FEYE      67
FDC       66
Name: ticker, dtype: int64

# Null Asset Type

In [327]:
senate['asset_type'].value_counts(dropna = False)

Stock               6354
NaN                  577
Other Securities     254
Stock Option         251
Name: asset_type, dtype: int64

In [328]:
senate[senate['asset_type'].isnull()].head()

Unnamed: 0,transaction_date,ticker,asset_description,asset_type,type,amount,comment,senator,ptr_link,disclosure_date
8479,2014-12-18,TJX,"The TJX Companies, Inc. (NYSE)",,purchase,"$1,001 - $15,000",--,Sheldon Whitehouse,https://efdsearch.senate.gov/search/view/ptr/332020b5-647b-49c1-afa2-7be502f72c34/,2015-01-05
8480,2014-12-18,ORCL,Oracle Corporation (NYSE),,purchase,"$1,001 - $15,000",--,Sheldon Whitehouse,https://efdsearch.senate.gov/search/view/ptr/332020b5-647b-49c1-afa2-7be502f72c34/,2015-01-05
8481,2014-12-18,T,"AT&amp;T, Inc. (NYSE)",,purchase,"$1,001 - $15,000",--,Sheldon Whitehouse,https://efdsearch.senate.gov/search/view/ptr/332020b5-647b-49c1-afa2-7be502f72c34/,2015-01-05
8482,2014-12-18,KORS,Michael Kors Holdings Limited (NYSE),,sale_partial,"$1,001 - $15,000",--,Sheldon Whitehouse,https://efdsearch.senate.gov/search/view/ptr/332020b5-647b-49c1-afa2-7be502f72c34/,2015-01-05
8483,2014-11-25,PCAR,PACCAR Inc. (NASDAQ),,purchase,"$1,001 - $15,000",--,Sheldon Whitehouse,https://efdsearch.senate.gov/search/view/ptr/332020b5-647b-49c1-afa2-7be502f72c34/,2015-01-05


## Eliminating Remaining Columns
At this point it appears the remaining nulls in asset_type correspond to otherwise good columns, so we eliminated that and other columns we don't need.

In [329]:
senate = senate.drop(columns=['asset_type', 'comment', 'ptr_link'])
senate.isnull().sum()

transaction_date     0
ticker               0
asset_description    0
type                 0
amount               0
senator              0
disclosure_date      0
dtype: int64

Let's drop any null rows

In [330]:
senate.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7436 entries, 3 to 9170
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   transaction_date   7436 non-null   datetime64[ns]
 1   ticker             7436 non-null   object        
 2   asset_description  7436 non-null   object        
 3   type               7436 non-null   object        
 4   amount             7436 non-null   object        
 5   senator            7436 non-null   object        
 6   disclosure_date    7436 non-null   datetime64[ns]
dtypes: datetime64[ns](2), object(5)
memory usage: 464.8+ KB


# `'amount'`

In [331]:
senate['amount'].value_counts()

$1,001 - $15,000             5347
$15,001 - $50,000            1314
$50,001 - $100,000            456
$100,001 - $250,000           249
$250,001 - $500,000            40
$500,001 - $1,000,000          11
$1,000,001 - $5,000,000        11
$5,000,001 - $25,000,000        6
Over $50,000,000                1
$25,000,001 - $50,000,000       1
Name: amount, dtype: int64

We see that one of the Senators invested over 50 million dollars in 1 trade! Let's see who it was

In [332]:
senate[senate['amount'] == "Over $50,000,000"]['senator']

7606    James M Inhofe
Name: senator, dtype: object

## Making the Amounts into the High End and Integers

In [333]:
senate['amount'] = senate['amount'].str.split(' ') 

for i in senate.index:
    if len(senate.loc[i, 'amount']) == 3:
        senate.loc[i, 'amount'] = senate.loc[i, 'amount'][2]
    else:
        senate.loc[i, 'amount'] = senate.loc[i, 'amount'][1]
        
senate['amount'] = senate['amount'].map(lambda x: x.replace('$', '').replace(',', ''))
senate['amount'] = senate['amount'].astype(int)
senate['amount'].value_counts()

15000       5347
50000       1314
100000       456
250000       249
500000        40
1000000       11
5000000       11
25000000       6
50000000       2
Name: amount, dtype: int64

For some reason that didn't change all the values. We will change the rest manually

In [334]:
senate.head()

Unnamed: 0,transaction_date,ticker,asset_description,type,amount,senator,disclosure_date
3,2022-06-03,WFC,Wells Fargo Company,purchase,15000,"A. Mitchell Mcconnell, Jr.",2022-06-13
4,2022-05-31,X,"United States Steel Corporation <div class=""text-muted"">Option Type: Put <br><em>Strike price:</em> $23.00 <br> <em>Expires:</em> 01/20/2023 </div>",sale_partial,50000,Thomas H Tuberville,2022-06-13
5,2022-05-31,X,"United States Steel Corporation <div class=""text-muted"">Option Type: Call <br><em>Strike price:</em> $30.00 <br> <em>Expires:</em> 01/20/2023 </div>",sale_full,50000,Thomas H Tuberville,2022-06-13
6,2022-05-31,X,United States Steel Corporation,purchase,250000,Thomas H Tuberville,2022-06-13
7,2022-05-20,PYPL,"PayPal Holdings, Inc. -",purchase,100000,Thomas H Tuberville,2022-06-13


In [335]:
senate.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7436 entries, 3 to 9170
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   transaction_date   7436 non-null   datetime64[ns]
 1   ticker             7436 non-null   object        
 2   asset_description  7436 non-null   object        
 3   type               7436 non-null   object        
 4   amount             7436 non-null   int64         
 5   senator            7436 non-null   object        
 6   disclosure_date    7436 non-null   datetime64[ns]
dtypes: datetime64[ns](2), int64(1), object(4)
memory usage: 722.8+ KB


# Column Names and Chamber Column
Now I will rename the senator column to be 'name' and make a column named 'chamber' where the value is all senator

In [336]:
senate.rename(columns = {'senator':'name'}, inplace = True)
senate['chamber'] = 'senate'

In [337]:
senate.head()

Unnamed: 0,transaction_date,ticker,asset_description,type,amount,name,disclosure_date,chamber
3,2022-06-03,WFC,Wells Fargo Company,purchase,15000,"A. Mitchell Mcconnell, Jr.",2022-06-13,senate
4,2022-05-31,X,"United States Steel Corporation <div class=""text-muted"">Option Type: Put <br><em>Strike price:</em> $23.00 <br> <em>Expires:</em> 01/20/2023 </div>",sale_partial,50000,Thomas H Tuberville,2022-06-13,senate
5,2022-05-31,X,"United States Steel Corporation <div class=""text-muted"">Option Type: Call <br><em>Strike price:</em> $30.00 <br> <em>Expires:</em> 01/20/2023 </div>",sale_full,50000,Thomas H Tuberville,2022-06-13,senate
6,2022-05-31,X,United States Steel Corporation,purchase,250000,Thomas H Tuberville,2022-06-13,senate
7,2022-05-20,PYPL,"PayPal Holdings, Inc. -",purchase,100000,Thomas H Tuberville,2022-06-13,senate


# State, Birthday, Gender
Because of the changes in the dataframe, the dictionary is missing some people. I'm going to see if the House workflow works. 

In [339]:
legislators.head()

Unnamed: 0,last_name,first_name,middle_name,suffix,full_name,birthday,gender,type,state,party,district
0,Brown,Sherrod,,,Sherrod Brown,1952-11-09,M,sen,OH,Democrat,
1,Cantwell,Maria,,,Maria Cantwell,1958-10-13,F,sen,WA,Democrat,
2,Cardin,Benjamin,L.,,Benjamin L. Cardin,1943-10-05,M,sen,MD,Democrat,
3,Carper,Thomas,Richard,,Thomas R. Carper,1947-01-23,M,sen,DE,Democrat,
4,Casey,Robert,P.,Jr.,"Robert P. Casey, Jr.",1960-04-13,M,sen,PA,Democrat,


In [340]:
legislators[legislators['full_name'] == 'Thomas H Tuberville']

Unnamed: 0,last_name,first_name,middle_name,suffix,full_name,birthday,gender,type,state,party,district


In [341]:
legislators[legislators['full_name'] == 'A. Mitchell Mcconnell, Jr.']

Unnamed: 0,last_name,first_name,middle_name,suffix,full_name,birthday,gender,type,state,party,district


In [342]:
legislators[legislators['last_name'] == 'Tuberville']

Unnamed: 0,last_name,first_name,middle_name,suffix,full_name,birthday,gender,type,state,party,district
463,Tuberville,Tommy,Hawley,,Tommy Tuberville,1954-09-18,M,sen,AL,Republican,


In [343]:
legislators[legislators['last_name'] == 'McConnell']

Unnamed: 0,last_name,first_name,middle_name,suffix,full_name,birthday,gender,type,state,party,district
19,McConnell,Mitch,,,Mitch McConnell,1942-02-20,M,sen,KY,Republican,


In [344]:
senate['name'] = senate['name'].map(lambda x: x.replace(', Jr.', ''))
senate['name'] = senate['name'].map(lambda x: x.replace(', Jr', ''))
senate['name'] = senate['name'].map(lambda x: x.replace(', Iv', ''))
senate['name'] = senate['name'].map(lambda x: x.replace(', Iii', ''))
senate['name'] = senate['name'].map(lambda x: x.replace('Jerry Moran,', 'Jerry Moran'))

In [345]:
senate['first_name'] = senate['name'].map(lambda x: x.split()[0])
senate['last_name'] = senate['name'].map(lambda x: x.split()[-1])
senate.loc[senate['first_name']== 'A.', 'first_name'] = 'Mitchell'
senate.loc[senate['last_name']== 'Mcconnell', 'last_name'] = 'McConnell'
senate.loc[senate['last_name']== 'Hollen', 'last_name'] = 'Van Hollen'

In [346]:
senate.head()

Unnamed: 0,transaction_date,ticker,asset_description,type,amount,name,disclosure_date,chamber,first_name,last_name
3,2022-06-03,WFC,Wells Fargo Company,purchase,15000,A. Mitchell Mcconnell,2022-06-13,senate,Mitchell,McConnell
4,2022-05-31,X,"United States Steel Corporation <div class=""text-muted"">Option Type: Put <br><em>Strike price:</em> $23.00 <br> <em>Expires:</em> 01/20/2023 </div>",sale_partial,50000,Thomas H Tuberville,2022-06-13,senate,Thomas,Tuberville
5,2022-05-31,X,"United States Steel Corporation <div class=""text-muted"">Option Type: Call <br><em>Strike price:</em> $30.00 <br> <em>Expires:</em> 01/20/2023 </div>",sale_full,50000,Thomas H Tuberville,2022-06-13,senate,Thomas,Tuberville
6,2022-05-31,X,United States Steel Corporation,purchase,250000,Thomas H Tuberville,2022-06-13,senate,Thomas,Tuberville
7,2022-05-20,PYPL,"PayPal Holdings, Inc. -",purchase,100000,Thomas H Tuberville,2022-06-13,senate,Thomas,Tuberville


In [347]:
senate[senate['first_name'] == 'A.']

Unnamed: 0,transaction_date,ticker,asset_description,type,amount,name,disclosure_date,chamber,first_name,last_name


In [348]:
senate[senate['last_name'] == 'Jr.']

Unnamed: 0,transaction_date,ticker,asset_description,type,amount,name,disclosure_date,chamber,first_name,last_name


In [349]:
senate[senate['last_name'] == 'Manchin,']

Unnamed: 0,transaction_date,ticker,asset_description,type,amount,name,disclosure_date,chamber,first_name,last_name


In [350]:
len(senate['last_name'].value_counts().keys())

48

In [351]:
len(senate['first_name'].value_counts().keys())

38

In [352]:
senate['party'] = ''
senate['birthday'] = ''
senate['gender'] = ''
senate['represents'] = ''

count = 0
name = set()
for i in senate.index:

    last_name = senate.loc[i, 'last_name']
    party = legislators[(legislators['last_name'] == last_name)]['party'].values #(legislators['first_name'] == first_name) & 
    birthday = legislators[(legislators['last_name'] == last_name)]['birthday'].values #(legislators['first_name'] == first_name) & 
    gender = legislators[(legislators['last_name'] == last_name)]['gender'].values #(legislators['first_name'] == first_name) &
    state = legislators[(legislators['last_name'] == last_name)]['state'].values #(legislators['first_name'] == first_name) &
    if len(party) == 0:
        name.add(str(first_name) + ' ' + str(last_name))
    count += 1
    senate.loc[i, 'party'] = party[0]
    senate.loc[i, 'birthday'] = birthday[0]
    senate.loc[i, 'gender'] = gender[0]
    senate.loc[i, 'represents'] = state[0]

print(len(name))  

0


In [353]:
count

7436

In [354]:
name

set()

In [355]:
party

array(['Democrat', 'Republican'], dtype=object)

In [356]:
last_name

'Reed'

In [357]:
legislators[legislators['last_name'] == 'Van Hollen']

Unnamed: 0,last_name,first_name,middle_name,suffix,full_name,birthday,gender,type,state,party,district
198,Van Hollen,Chris,,,Chris Van Hollen,1959-01-10,M,sen,MD,Democrat,


In [358]:
senate[senate['last_name'] == 'Reed']

Unnamed: 0,transaction_date,ticker,asset_description,type,amount,name,disclosure_date,chamber,first_name,last_name,party,birthday,gender,represents
3418,2019-02-28,XLNX,"Xilinx, Inc.",sale_full,50000,John F Reed,2019-03-18,senate,John,Reed,Democrat,1949-11-12T00:00:00.000000000,M,RI
3419,2019-02-28,TMO,Thermo Fisher Scientific Inc.,sale_full,15000,John F Reed,2019-03-18,senate,John,Reed,Democrat,1949-11-12T00:00:00.000000000,M,RI
3420,2019-02-28,SYK,Stryker Corporation,sale_full,50000,John F Reed,2019-03-18,senate,John,Reed,Democrat,1949-11-12T00:00:00.000000000,M,RI
3421,2019-02-28,SLB,Schlumberger Limited,sale_full,15000,John F Reed,2019-03-18,senate,John,Reed,Democrat,1949-11-12T00:00:00.000000000,M,RI
3422,2019-02-28,QCOM,QUALCOMM Incorporated,sale_full,50000,John F Reed,2019-03-18,senate,John,Reed,Democrat,1949-11-12T00:00:00.000000000,M,RI
3423,2019-02-28,PEP,"PepsiCo, Inc.",sale_full,15000,John F Reed,2019-03-18,senate,John,Reed,Democrat,1949-11-12T00:00:00.000000000,M,RI
3424,2019-02-28,MSFT,Microsoft Corporation,sale_full,15000,John F Reed,2019-03-18,senate,John,Reed,Democrat,1949-11-12T00:00:00.000000000,M,RI
3425,2019-02-28,ISRG,"Intuitive Surgical, Inc.",sale_full,15000,John F Reed,2019-03-18,senate,John,Reed,Democrat,1949-11-12T00:00:00.000000000,M,RI
3426,2019-02-28,INTC,Intel Corporation,sale_full,15000,John F Reed,2019-03-18,senate,John,Reed,Democrat,1949-11-12T00:00:00.000000000,M,RI
3427,2019-02-28,IQV,IQVIA Holdings Inc.,sale_full,15000,John F Reed,2019-03-18,senate,John,Reed,Democrat,1949-11-12T00:00:00.000000000,M,RI


In [359]:
senate['birthday'] = pd.to_datetime(senate['birthday'], format = '%Y-%m-%d')
senate.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7436 entries, 3 to 9170
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   transaction_date   7436 non-null   datetime64[ns]
 1   ticker             7436 non-null   object        
 2   asset_description  7436 non-null   object        
 3   type               7436 non-null   object        
 4   amount             7436 non-null   int64         
 5   name               7436 non-null   object        
 6   disclosure_date    7436 non-null   datetime64[ns]
 7   chamber            7436 non-null   object        
 8   first_name         7436 non-null   object        
 9   last_name          7436 non-null   object        
 10  party              7436 non-null   object        
 11  birthday           7436 non-null   datetime64[ns]
 12  gender             7436 non-null   object        
 13  represents         7436 non-null   object        
dtypes: datet

In [360]:
senate.columns

Index(['transaction_date', 'ticker', 'asset_description', 'type', 'amount',
       'name', 'disclosure_date', 'chamber', 'first_name', 'last_name',
       'party', 'birthday', 'gender', 'represents'],
      dtype='object')

In [361]:
senate.head()

Unnamed: 0,transaction_date,ticker,asset_description,type,amount,name,disclosure_date,chamber,first_name,last_name,party,birthday,gender,represents
3,2022-06-03,WFC,Wells Fargo Company,purchase,15000,A. Mitchell Mcconnell,2022-06-13,senate,Mitchell,McConnell,Republican,1942-02-20,M,KY
4,2022-05-31,X,"United States Steel Corporation <div class=""text-muted"">Option Type: Put <br><em>Strike price:</em> $23.00 <br> <em>Expires:</em> 01/20/2023 </div>",sale_partial,50000,Thomas H Tuberville,2022-06-13,senate,Thomas,Tuberville,Republican,1954-09-18,M,AL
5,2022-05-31,X,"United States Steel Corporation <div class=""text-muted"">Option Type: Call <br><em>Strike price:</em> $30.00 <br> <em>Expires:</em> 01/20/2023 </div>",sale_full,50000,Thomas H Tuberville,2022-06-13,senate,Thomas,Tuberville,Republican,1954-09-18,M,AL
6,2022-05-31,X,United States Steel Corporation,purchase,250000,Thomas H Tuberville,2022-06-13,senate,Thomas,Tuberville,Republican,1954-09-18,M,AL
7,2022-05-20,PYPL,"PayPal Holdings, Inc. -",purchase,100000,Thomas H Tuberville,2022-06-13,senate,Thomas,Tuberville,Republican,1954-09-18,M,AL


Finally, we will export the cleaned csv file.

In [362]:
all_reps = pd.concat([senate, house], axis = 0)

In [363]:
all_reps['transaction_date'] = pd.to_datetime(all_reps['transaction_date'], format = '%Y-%m-%d')
all_reps['disclosure_date'] = pd.to_datetime(all_reps['disclosure_date'], format = '%Y-%m-%d')
all_reps['birthday'] = pd.to_datetime(all_reps['birthday'], format = '%Y-%m-%d')

In [364]:
all_reps.head()

Unnamed: 0,transaction_date,ticker,asset_description,type,amount,name,disclosure_date,chamber,first_name,last_name,party,birthday,gender,represents
3,2022-06-03,WFC,Wells Fargo Company,purchase,15000,A. Mitchell Mcconnell,2022-06-13,senate,Mitchell,McConnell,Republican,1942-02-20,M,KY
4,2022-05-31,X,"United States Steel Corporation <div class=""text-muted"">Option Type: Put <br><em>Strike price:</em> $23.00 <br> <em>Expires:</em> 01/20/2023 </div>",sale_partial,50000,Thomas H Tuberville,2022-06-13,senate,Thomas,Tuberville,Republican,1954-09-18,M,AL
5,2022-05-31,X,"United States Steel Corporation <div class=""text-muted"">Option Type: Call <br><em>Strike price:</em> $30.00 <br> <em>Expires:</em> 01/20/2023 </div>",sale_full,50000,Thomas H Tuberville,2022-06-13,senate,Thomas,Tuberville,Republican,1954-09-18,M,AL
6,2022-05-31,X,United States Steel Corporation,purchase,250000,Thomas H Tuberville,2022-06-13,senate,Thomas,Tuberville,Republican,1954-09-18,M,AL
7,2022-05-20,PYPL,"PayPal Holdings, Inc. -",purchase,100000,Thomas H Tuberville,2022-06-13,senate,Thomas,Tuberville,Republican,1954-09-18,M,AL


In [365]:
all_reps.tail()

Unnamed: 0,transaction_date,ticker,asset_description,type,amount,name,disclosure_date,chamber,first_name,last_name,party,birthday,gender,represents
14850,2020-04-09,SWK,"Stanley Black & Decker, Inc.",sale_partial,15000,Ed Perlmutter,2020-06-10,house,,,Democrat,1953-05-01,M,CO07
14851,2020-04-09,USB,U.S. Bancorp,sale_partial,15000,Ed Perlmutter,2020-06-10,house,,,Democrat,1953-05-01,M,CO07
14852,2020-03-13,BMY,Bristol-Myers Squibb Company,sale_full,250000,Nicholas Van Taylor,2020-06-10,house,,,Republican,1972-08-01,M,TX03
14853,2020-03-13,LLY,Eli Lilly and Company,sale_full,1000000,Nicholas Van Taylor,2020-06-10,house,,,Republican,1972-08-01,M,TX03
14854,2020-03-13,DIS,Walt Disney Company,sale_full,500000,Nicholas Van Taylor,2020-06-10,house,,,Republican,1972-08-01,M,TX03


In [366]:
all_reps.reset_index(drop = True, inplace = True)
all_reps.head()

Unnamed: 0,transaction_date,ticker,asset_description,type,amount,name,disclosure_date,chamber,first_name,last_name,party,birthday,gender,represents
0,2022-06-03,WFC,Wells Fargo Company,purchase,15000,A. Mitchell Mcconnell,2022-06-13,senate,Mitchell,McConnell,Republican,1942-02-20,M,KY
1,2022-05-31,X,"United States Steel Corporation <div class=""text-muted"">Option Type: Put <br><em>Strike price:</em> $23.00 <br> <em>Expires:</em> 01/20/2023 </div>",sale_partial,50000,Thomas H Tuberville,2022-06-13,senate,Thomas,Tuberville,Republican,1954-09-18,M,AL
2,2022-05-31,X,"United States Steel Corporation <div class=""text-muted"">Option Type: Call <br><em>Strike price:</em> $30.00 <br> <em>Expires:</em> 01/20/2023 </div>",sale_full,50000,Thomas H Tuberville,2022-06-13,senate,Thomas,Tuberville,Republican,1954-09-18,M,AL
3,2022-05-31,X,United States Steel Corporation,purchase,250000,Thomas H Tuberville,2022-06-13,senate,Thomas,Tuberville,Republican,1954-09-18,M,AL
4,2022-05-20,PYPL,"PayPal Holdings, Inc. -",purchase,100000,Thomas H Tuberville,2022-06-13,senate,Thomas,Tuberville,Republican,1954-09-18,M,AL


In [367]:
all_reps.shape

(21047, 14)

In [368]:
all_reps.dtypes

transaction_date     datetime64[ns]
ticker                       object
asset_description            object
type                         object
amount                        int64
name                         object
disclosure_date      datetime64[ns]
chamber                      object
first_name                   object
last_name                    object
party                        object
birthday             datetime64[ns]
gender                       object
represents                   object
dtype: object

In [369]:
all_reps.to_csv('data/cleaned_complete_congress_ data.csv', index = False)

In [370]:
tickers.to_csv('data/ticker_symbols.csv', index = False)

In [371]:
legislators.to_csv('data/relevant_legislators.csv')