In [35]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as plt
import requests
from bs4 import BeautifulSoup
import re

In [2]:
pd.options.display.max_rows = None

In [3]:
proxies = {'https': 'http://127.0.0.1:7769'} #I need this on my computer

In [4]:
#taken from https://senatestockwatcher.com/api
senate = pd.read_csv('datasets/all_transactions_senate.csv')

In [5]:
senate.head()

Unnamed: 0,transaction_date,owner,ticker,asset_description,asset_type,type,amount,comment,senator,ptr_link,disclosure_date
0,6/21/2022,Spouse,--,"Broadcom Corp <div class=""text-muted""><em>Rate...",Corporate Bond,Purchase,"$15,001 - $50,000",--,Thomas R Carper,https://efdsearch.senate.gov/search/view/ptr/a...,7/5/2022
1,6/15/2022,Spouse,--,"Office Properties Income Trust <div class=""tex...",Corporate Bond,Sale (Full),"$1,001 - $15,000",--,Thomas R Carper,https://efdsearch.senate.gov/search/view/ptr/a...,7/5/2022
2,5/17/2022,Spouse,--,Lee County Florida Health Care Facilities Reve...,Municipal Security,Purchase,"$500,001 - $1,000,000",--,Rick Scott,https://efdsearch.senate.gov/search/view/ptr/8...,6/16/2022
3,6/3/2022,Spouse,WFC,Wells Fargo &amp; Company Common Stock,Stock,Purchase,"$1,001 - $15,000",Dividend Reinvestment,"A. Mitchell Mcconnell, Jr.",https://efdsearch.senate.gov/search/view/ptr/5...,6/13/2022
4,5/31/2022,Joint,X,United States Steel Corporation Common Stock <...,Stock Option,Sale (Partial),"$15,001 - $50,000",--,Thomas H Tuberville,https://efdsearch.senate.gov/search/view/ptr/4...,6/13/2022


# Datetimes
Let's turn the dates into datetimes

In [6]:
senate['transaction_date'] = pd.to_datetime(senate['transaction_date'])
senate['disclosure_date'] = pd.to_datetime(senate['disclosure_date'])
senate.dtypes

transaction_date     datetime64[ns]
owner                        object
ticker                       object
asset_description            object
asset_type                   object
type                         object
amount                       object
comment                      object
senator                      object
ptr_link                     object
disclosure_date      datetime64[ns]
dtype: object

In [7]:
senate['transaction_date'] = pd.to_datetime(senate['transaction_date'], format = '%Y-%m-%d')
senate['disclosure_date'] = pd.to_datetime(senate['disclosure_date'], format = '%Y-%m-%d')
senate.dtypes

transaction_date     datetime64[ns]
owner                        object
ticker                       object
asset_description            object
asset_type                   object
type                         object
amount                       object
comment                      object
senator                      object
ptr_link                     object
disclosure_date      datetime64[ns]
dtype: object

# Checking Years
Now let's create a year column

In [8]:
senate['year'] = senate['transaction_date'].map(lambda x: x.year)
senate.head()

Unnamed: 0,transaction_date,owner,ticker,asset_description,asset_type,type,amount,comment,senator,ptr_link,disclosure_date,year
0,2022-06-21,Spouse,--,"Broadcom Corp <div class=""text-muted""><em>Rate...",Corporate Bond,Purchase,"$15,001 - $50,000",--,Thomas R Carper,https://efdsearch.senate.gov/search/view/ptr/a...,2022-07-05,2022
1,2022-06-15,Spouse,--,"Office Properties Income Trust <div class=""tex...",Corporate Bond,Sale (Full),"$1,001 - $15,000",--,Thomas R Carper,https://efdsearch.senate.gov/search/view/ptr/a...,2022-07-05,2022
2,2022-05-17,Spouse,--,Lee County Florida Health Care Facilities Reve...,Municipal Security,Purchase,"$500,001 - $1,000,000",--,Rick Scott,https://efdsearch.senate.gov/search/view/ptr/8...,2022-06-16,2022
3,2022-06-03,Spouse,WFC,Wells Fargo &amp; Company Common Stock,Stock,Purchase,"$1,001 - $15,000",Dividend Reinvestment,"A. Mitchell Mcconnell, Jr.",https://efdsearch.senate.gov/search/view/ptr/5...,2022-06-13,2022
4,2022-05-31,Joint,X,United States Steel Corporation Common Stock <...,Stock Option,Sale (Partial),"$15,001 - $50,000",--,Thomas H Tuberville,https://efdsearch.senate.gov/search/view/ptr/4...,2022-06-13,2022


In [9]:
senate['year'].value_counts()

2020    1535
2018    1395
2017    1372
2015    1152
2019    1037
2016     977
2014     727
2021     612
2022     347
2013     184
2012      60
Name: year, dtype: int64

Now we see how far back in terms of year, we can drop the column

In [10]:
senate = senate.drop(columns=['year'])
senate.head()

Unnamed: 0,transaction_date,owner,ticker,asset_description,asset_type,type,amount,comment,senator,ptr_link,disclosure_date
0,2022-06-21,Spouse,--,"Broadcom Corp <div class=""text-muted""><em>Rate...",Corporate Bond,Purchase,"$15,001 - $50,000",--,Thomas R Carper,https://efdsearch.senate.gov/search/view/ptr/a...,2022-07-05
1,2022-06-15,Spouse,--,"Office Properties Income Trust <div class=""tex...",Corporate Bond,Sale (Full),"$1,001 - $15,000",--,Thomas R Carper,https://efdsearch.senate.gov/search/view/ptr/a...,2022-07-05
2,2022-05-17,Spouse,--,Lee County Florida Health Care Facilities Reve...,Municipal Security,Purchase,"$500,001 - $1,000,000",--,Rick Scott,https://efdsearch.senate.gov/search/view/ptr/8...,2022-06-16
3,2022-06-03,Spouse,WFC,Wells Fargo &amp; Company Common Stock,Stock,Purchase,"$1,001 - $15,000",Dividend Reinvestment,"A. Mitchell Mcconnell, Jr.",https://efdsearch.senate.gov/search/view/ptr/5...,2022-06-13
4,2022-05-31,Joint,X,United States Steel Corporation Common Stock <...,Stock Option,Sale (Partial),"$15,001 - $50,000",--,Thomas H Tuberville,https://efdsearch.senate.gov/search/view/ptr/4...,2022-06-13


# Nulls
Let's look at null values

In [11]:
senate.isnull().sum()

transaction_date       0
owner                465
ticker               465
asset_description      0
asset_type           666
type                 465
amount                 0
comment              465
senator                0
ptr_link               0
disclosure_date        0
dtype: int64

# Owner

We determined we don't need owner because for our purposes whether the congressperson, spouse, etc., own the asset is irrelevant to whether or not insider invformation was used

In [12]:
senate.drop(columns = ['owner'], inplace = True)
senate.isnull().sum()

transaction_date       0
ticker               465
asset_description      0
asset_type           666
type                 465
amount                 0
comment              465
senator                0
ptr_link               0
disclosure_date        0
dtype: int64

## Asset Type Nulls
A number of the nulls actually have the information we need in other columns. That said, some asset types aren't helpful to our analysis and eliminating them here removes some other NaNs.

In [13]:
senate[senate['asset_type'].isnull()].head(10)

Unnamed: 0,transaction_date,ticker,asset_description,asset_type,type,amount,comment,senator,ptr_link,disclosure_date
7370,2014-12-30,--,MDLZ (stock option),,Sale (Full),"$50,001 - $100,000",--,Roy Blunt,https://efdsearch.senate.gov/search/view/ptr/f...,2015-12-24
8479,2014-12-18,TJX,"The TJX Companies, Inc. (NYSE)",,Purchase,"$1,001 - $15,000",--,Sheldon Whitehouse,https://efdsearch.senate.gov/search/view/ptr/3...,2015-01-05
8480,2014-12-18,ORCL,Oracle Corporation (NYSE),,Purchase,"$1,001 - $15,000",--,Sheldon Whitehouse,https://efdsearch.senate.gov/search/view/ptr/3...,2015-01-05
8481,2014-12-18,T,"AT&amp;T, Inc. (NYSE)",,Purchase,"$1,001 - $15,000",--,Sheldon Whitehouse,https://efdsearch.senate.gov/search/view/ptr/3...,2015-01-05
8482,2014-12-18,KORS,Michael Kors Holdings Limited (NYSE),,Sale (Partial),"$1,001 - $15,000",--,Sheldon Whitehouse,https://efdsearch.senate.gov/search/view/ptr/3...,2015-01-05
8483,2014-11-25,PCAR,PACCAR Inc. (NASDAQ),,Purchase,"$1,001 - $15,000",--,Sheldon Whitehouse,https://efdsearch.senate.gov/search/view/ptr/3...,2015-01-05
8484,2014-11-25,IBM,International Business Machines Corporation (N...,,Sale (Partial),"$1,001 - $15,000",--,Sheldon Whitehouse,https://efdsearch.senate.gov/search/view/ptr/3...,2015-01-05
8485,2014-12-18,CMRE,Costamare Inc. (NYSE),,Purchase,"$1,001 - $15,000",--,Sheldon Whitehouse,https://efdsearch.senate.gov/search/view/ptr/3...,2015-01-05
8486,2014-11-25,PCAR,PACCAR Inc. (NASDAQ),,Purchase,"$1,001 - $15,000",--,Sheldon Whitehouse,https://efdsearch.senate.gov/search/view/ptr/3...,2015-01-05
8487,2014-11-25,PRU,"Prudential Financial, Inc. (NYSE)",,Purchase,"$1,001 - $15,000",--,Sheldon Whitehouse,https://efdsearch.senate.gov/search/view/ptr/3...,2015-01-05


In [14]:
senate['asset_type'].value_counts(dropna = False)

Stock                           6756
NaN                              666
PDF Disclosed Filing             465
Other Securities                 454
Municipal Security               407
Stock Option                     253
Corporate Bond                   246
Non-Public Stock                 100
Commodities/Futures Contract      48
Cryptocurrency                     3
Name: asset_type, dtype: int64

In [15]:
senate[senate['asset_type'] == 'Municipal Security'].head()

Unnamed: 0,transaction_date,ticker,asset_description,asset_type,type,amount,comment,senator,ptr_link,disclosure_date
2,2022-05-17,--,Lee County Florida Health Care Facilities Reve...,Municipal Security,Purchase,"$500,001 - $1,000,000",--,Rick Scott,https://efdsearch.senate.gov/search/view/ptr/8...,2022-06-16
30,2022-04-21,--,"Portland ME 736560TV5 <div class=""text-muted"">...",Municipal Security,Purchase,"$50,001 - $100,000",--,Susan M Collins,https://efdsearch.senate.gov/search/view/ptr/6...,2022-05-16
54,2022-04-12,--,Jacksonville Florida Transportation Revenue Bo...,Municipal Security,Purchase,"$500,001 - $1,000,000",--,Rick Scott,https://efdsearch.senate.gov/search/view/ptr/5...,2022-05-05
55,2022-04-06,--,Broward County Florida Airport System Revenue ...,Municipal Security,Purchase,"$250,001 - $500,000",--,Rick Scott,https://efdsearch.senate.gov/search/view/ptr/5...,2022-05-05
56,2022-04-06,--,Broward County Florida Airport System Revenue ...,Municipal Security,Purchase,"$500,001 - $1,000,000",--,Rick Scott,https://efdsearch.senate.gov/search/view/ptr/5...,2022-05-05


In [16]:
len(senate[ senate['asset_description'] == 'This filing was disclosed via scanned PDF. Use link in ptr_link column to view the PDF.'])

465

In [17]:
senate[senate['asset_type'] == 'Other Securities'].head()

Unnamed: 0,transaction_date,ticker,asset_description,asset_type,type,amount,comment,senator,ptr_link,disclosure_date
13,2022-05-16,ABYIX,Abbey Capital Futures Strategy Fund- Class I Sha,Other Securities,Purchase,"$1,001 - $15,000",--,Thomas R Carper,https://efdsearch.senate.gov/search/view/ptr/1...,2022-06-03
14,2022-05-16,GSMYX,Goldman Sachs Small/Mid-Cap Growth Fund Inst Cl,Other Securities,Sale (Partial),"$1,001 - $15,000",--,Thomas R Carper,https://efdsearch.senate.gov/search/view/ptr/1...,2022-06-03
15,2022-05-16,--,FIMKX - Fidelity Advisor Focused Emerging Mark...,Other Securities,Sale (Partial),"$1,001 - $15,000",--,Thomas R Carper,https://efdsearch.senate.gov/search/view/ptr/1...,2022-06-03
16,2022-05-16,FCPIX,Fidelity Advisor International Capital Appreciat,Other Securities,Sale (Partial),"$1,001 - $15,000",--,Thomas R Carper,https://efdsearch.senate.gov/search/view/ptr/1...,2022-06-03
18,2022-05-10,LUBYX,Lord Abbett Ultra Short Bond Fund - Class I,Other Securities,Sale (Partial),"$1,001 - $15,000",--,Thomas R Carper,https://efdsearch.senate.gov/search/view/ptr/1...,2022-06-03


In [18]:
senate[senate['asset_type'] == 'PDF Disclosed Filing'].tail()

Unnamed: 0,transaction_date,ticker,asset_description,asset_type,type,amount,comment,senator,ptr_link,disclosure_date
9393,2012-08-17,,This filing was disclosed via scanned PDF. Use...,PDF Disclosed Filing,,Unknown,,Sheldon Whitehouse,https://efdsearch.senate.gov/search/view/paper...,2012-08-17
9394,2012-08-16,,This filing was disclosed via scanned PDF. Use...,PDF Disclosed Filing,,Unknown,,Pat Roberts,https://efdsearch.senate.gov/search/view/paper...,2012-08-16
9395,2012-08-15,,This filing was disclosed via scanned PDF. Use...,PDF Disclosed Filing,,Unknown,,Robert J Portman,https://efdsearch.senate.gov/search/view/paper...,2012-08-15
9396,2012-08-02,,This filing was disclosed via scanned PDF. Use...,PDF Disclosed Filing,,Unknown,,Thomas R Carper,https://efdsearch.senate.gov/search/view/paper...,2012-08-02
9397,2012-07-25,,This filing was disclosed via scanned PDF. Use...,PDF Disclosed Filing,,Unknown,,Benjamin L Cardin,https://efdsearch.senate.gov/search/view/paper...,2012-07-25


We know right now that Municipal Security, Corporate Bond, Non-Public Stock, Commodities/Futures Contract, and Cryptocurrency represent transactions that don't provide information that's helpful to our model/planned analysis. PDF Disclosed Filing are basically null, and while it's possible to look at the PDFs and enter the data, we don't have the time for that on this project, unfortunately, and the fraction of instances that represents is small.

In [19]:
len(senate)

9398

We found this repeat approach to be best because of the difficulties of continuing to include the NaNs.

In [20]:
senate = senate[senate['asset_type'] != 'Municipal Security']
senate = senate[senate['asset_type'] != 'Corporate Bond']
senate = senate[senate['asset_type'] != 'Non-Public Stock']
senate = senate[senate['asset_type'] != 'Commodities/Futures Contract']
senate = senate[senate['asset_type'] != 'Cryptocurrency']
senate = senate[senate['asset_type'] != 'PDF Disclosed Filing']
len(senate)

8129

In [21]:
senate['asset_type'].value_counts(dropna = False)

Stock               6756
NaN                  666
Other Securities     454
Stock Option         253
Name: asset_type, dtype: int64

In [22]:
senate.isnull().sum()

transaction_date       0
ticker                 0
asset_description      0
asset_type           666
type                   0
amount                 0
comment                0
senator                0
ptr_link               0
disclosure_date        0
dtype: int64

This method results in all other NaNs being removed -- this is largely if not entirely because of the 'PDF' asset type.

# Type
We're dropping Exchanges because we can't work with them for our analysis. We also made the values here match the values from the House database in formate.

In [23]:
senate['type'].value_counts()

Purchase          4033
Sale (Full)       2180
Sale (Partial)    1824
Exchange            92
Name: type, dtype: int64

In [24]:
senate = senate[senate['type'] != 'Exchange']
senate['type'].value_counts()

Purchase          4033
Sale (Full)       2180
Sale (Partial)    1824
Name: type, dtype: int64

In [25]:
senate['type'] = senate['type'].map({
    'Purchase': 'purchase',
    'Sale (Full)': 'sale_full',
    'Sale (Partial)': 'sale_partial'
                                    })
senate['type'].value_counts()

purchase        4033
sale_full       2180
sale_partial    1824
Name: type, dtype: int64

# Ticker

In [26]:
senate['ticker'].value_counts()

--                    1047
AAPL                   177
BAC                     86
MSFT                    85
NFLX                    79
PFE                     76
DISCA                   74
DIS                     73
T                       70
FEYE                    67
FDC                     66
URBN                    65
CZR                     63
FB                      58
AMZN                    55
NVDA                    54
WFC                     51
GE                      49
MRK                     45
PG                      45
XOM                     44
WMT                     44
WPX                     44
CVS                     43
DD                      43
GM                      43
BA                      42
HBI                     42
CSCO                    42
FDX                     41
GPK                     40
INTC                    40
CVX                     39
MOS                     39
GLW                     39
BWXT                    39
GILD                    38
S

In [27]:
tickers = pd.read_csv('./loren_data/updated_ticker_symbols-v1.csv')
#tickers = tickers[['symbol', 'Name']]
#tickers.columns = tickers.columns.map(lambda x: x.lower())

tickers.head()

Unnamed: 0,symbol,name
0,AAPL,Apple Inc.
1,XOM,Exxon Mobil Corporation
2,MSFT,Microsoft Corporation
3,BAC^I,Bank of America Corporation
4,IBM,International Business Machines Corporation


In [28]:
#adding in items from the Senate dataframe that took some research.
df2 = pd.DataFrame({
    'symbol': ['CAB', 'PFE', 'AAPL', 'AMZN', 'NKE', 'PCP', 'UAA', 'LIT', 'NFLX', 'FEZ', 'CIT', 'LNT', 'WW', 'UAA', 'PHLD'],
    'name': ["Cabela's Inc", 'Pfizer Inc', 'aapl', 'Amazon', 'Nike Inc B', 'Precision Castparts Corp', 
             'Under Armour Inc', 'Global X Lithium Battery', 'nflx', 'SPDR Euro Stoxx 50', 'CIT Group Inc (CIT)',
            'Alliant Energy Corp', 'Weight Watchers Intl Inc', 'Under Armour Inc Cl A', 'PHLD - Phillips Edison Grocery Center REIT I']
})

tickers = pd.concat([tickers, df2])

The following removes a few items that cause errors in finding the symbol from the ticker dataframe.

In [29]:
senate['asset_description'] = senate['asset_description'].str.replace(' &amp;', '')
senate['asset_description'] = senate['asset_description'].str.replace(' Common Stock', '')
senate['asset_description'] = senate['asset_description'].str.replace(' CMN', '')

In [30]:
senate['ticker'].value_counts()[0]

1047

The following finds and fills in 30 '--' tickers.

In [31]:
count = 0
name_set = set()
symbol_set = set()
for i in senate.loc[senate['ticker'] == '--']['asset_description']:
    if i in tickers['name'].unique():
        count +=1
        name_set.add(i)
        symbol = tickers.loc[tickers['name'] == i, 'symbol'].values
        #print(symbol)
        symbol_set.add(symbol[0])
        senate.loc[((senate['ticker'] == '--') & (senate['asset_description'] == i)), 'ticker'] = symbol[0]
print(count)
print(name_set)
print(symbol_set)

97
{'Tesoro Logistics LP', 'General Electric Company', 'Altria Group', 'CIT Group Inc (CIT)', 'Quest Diagnostics Incorporated', 'Bristol-Myers Squibb Company', 'ConocoPhillips', 'Wal-Mart Stores, Inc.', 'Zimmer Holdings, Inc.', 'Weight Watchers Intl Inc', 'Vector Group Ltd.', 'Costco Wholesale Corporation', 'Lincoln National Corporation', 'Under Armour Inc', 'Apple Inc.', 'Global X Lithium Battery', 'Amazon.com, Inc.', 'Under Armour Inc Cl A', 'aapl', '3M Company', 'Pfizer Inc', 'nflx', 'PHLD - Phillips Edison Grocery Center REIT I', 'Fifth Third Bancorp', "Cabela's Inc", 'Phillips 66', 'Alliant Energy Corp', 'Verizon Communications Inc.', 'Precision Castparts Corp', 'Target Corporation', 'Nike Inc B', 'Amazon'}
{'COP', 'CAB', 'TLLP', 'LIT', 'PHLD', 'DGX', 'MMM', 'FITB', 'AAPL', 'WMT', 'PFE', 'TGT', 'NFLX', 'BMY', 'PCP', 'UAA', 'GE', 'NKE', 'CIT', 'VZ', 'LNC', 'PSX', 'LNT', 'ZMH', 'VGR', 'COST', 'AMZN', 'MO', 'WW'}


In [32]:
senate['ticker'].value_counts()

--                    950
AAPL                  185
BAC                    86
MSFT                   85
PFE                    83
NFLX                   83
DISCA                  74
DIS                    73
T                      70
FEYE                   67
FDC                    66
URBN                   65
CZR                    63
AMZN                   61
FB                     58
NVDA                   54
WFC                    51
GE                     50
MRK                    45
PG                     45
WMT                    45
WPX                    44
XOM                    44
GM                     43
DD                     43
CVS                    43
CSCO                   42
HBI                    42
BA                     42
FDX                    41
GPK                    40
INTC                   40
BWXT                   39
GLW                    39
CVX                    39
MOS                    39
ENTG                   38
QCOM                   38
GILD        

In [33]:
senate['asset_description'] = senate['asset_description'].str.replace(' ETF', '')

In [36]:
senate['possible_ticker'] = ''

for i in senate[senate['ticker'] == '--']['asset_description'].index:
    asset_descr = senate.loc[i, 'asset_description']
    #print(asset_descr)
    ticker = re.findall('[A-Z]{2,6}', asset_descr)
    #print(ticker)
    if len(ticker) == 1 and ticker != 'LLC' and ticker != 'ETF':
        senate.loc[i, 'possible_ticker'] = ticker[0]

senate['possible_ticker'].value_counts(dropna = False)

          7566
SPDR        21
JPM         18
WMT         12
WFM         11
ETE         10
UBS         10
GE           9
CVC          9
MA           8
HSBC         8
MSCI         7
MS           7
SXL          7
BAX          6
NFLX         6
PFS          6
AAPL         6
US           5
LNT          5
WLK          5
NY           5
NGLS         5
LYV          4
CSCO         4
DVN          4
MOS          4
PLC          4
ADR          4
MLP          4
PG           4
AQR          4
CA           4
GILD         4
MSFT         3
WPX          3
KN           3
KMI          3
GLW          3
SE           3
DISCA        3
QCOM         3
XOM          3
CVS          3
AVP          3
BK           3
EI           3
BIIB         3
CRMT         2
SPY          2
LP           2
SYY          2
AB           2
VIAB         2
FOXA         2
TJX          2
FEYE         2
ESV          2
ORCL         2
VFC          2
LQDT         2
PAGP         2
CVX          2
USG          2
ADT          2
CBS          2
AXLL      

In [37]:
#ticker['possible_in_tickers_df'] = 0

for i in senate.loc[senate['ticker'] == '--']['possible_ticker']:
    if i in tickers['symbol'].unique():
        #count +=1
        #name_set.add(i)
        symbol = tickers.loc[tickers['symbol'] == i, 'symbol'].values
        #print(symbol)
        #symbol_set.add(symbol[0])
        senate.loc[((senate['ticker'] == '--') & (senate['possible_ticker'] == i)), 'ticker'] = symbol[0]

In [38]:
senate['ticker'].value_counts(dropna = False)[:3]

--      605
AAPL    191
NFLX     89
Name: ticker, dtype: int64

In [39]:
senate[senate['ticker'] == '--']['asset_description'].value_counts(dropna = False)

Israel Bond                                                                                                                                                                                                                                                                                                                        7
Revival LOC, LLC <div class="text-muted"> <em>Company:</em> Revival LOC, LLC &nbsp;(Springfield, NJ) </div> <div class="text-muted"><em>Description:</em>&nbsp;Line of Credit - Manufacturer</div>                                                                                                                                 5
Aaron Rents Inc                                                                                                                                                                                                                                                                                                                    4
Liberty Partners, LLC <di

In [40]:
#dropping 'possible_ticker'

senate.drop(columns = 'possible_ticker', inplace = True)
senate.head()

Unnamed: 0,transaction_date,ticker,asset_description,asset_type,type,amount,comment,senator,ptr_link,disclosure_date
3,2022-06-03,WFC,Wells Fargo Company,Stock,purchase,"$1,001 - $15,000",Dividend Reinvestment,"A. Mitchell Mcconnell, Jr.",https://efdsearch.senate.gov/search/view/ptr/5...,2022-06-13
4,2022-05-31,X,"United States Steel Corporation <div class=""te...",Stock Option,sale_partial,"$15,001 - $50,000",--,Thomas H Tuberville,https://efdsearch.senate.gov/search/view/ptr/4...,2022-06-13
5,2022-05-31,X,"United States Steel Corporation <div class=""te...",Stock Option,sale_full,"$15,001 - $50,000",--,Thomas H Tuberville,https://efdsearch.senate.gov/search/view/ptr/4...,2022-06-13
6,2022-05-31,X,United States Steel Corporation,Stock,purchase,"$100,001 - $250,000",--,Thomas H Tuberville,https://efdsearch.senate.gov/search/view/ptr/4...,2022-06-13
7,2022-05-20,PYPL,"PayPal Holdings, Inc. -",Stock,purchase,"$50,001 - $100,000",--,Thomas H Tuberville,https://efdsearch.senate.gov/search/view/ptr/4...,2022-06-13


In [41]:
senate['ticker'].value_counts().sum()

8037

We have 8037 rows

In [42]:
(601/8933) * 100

6.727862979961939

At this point, about 7% of data has no ticker. We accept the loss of that data because there's nothing left we can do without getting even more time intensive.

In [43]:
senate = senate[senate['ticker'] != '--']

senate['ticker'].value_counts(dropna = False)[:10]

AAPL     191
NFLX      89
MSFT      88
BAC       87
PFE       84
DISCA     77
DIS       75
T         70
FEYE      67
FDC       66
Name: ticker, dtype: int64

# Null Asset Type

In [44]:
senate['asset_type'].value_counts(dropna = False)

Stock               6352
NaN                  577
Other Securities     252
Stock Option         251
Name: asset_type, dtype: int64

In [45]:
senate[senate['asset_type'].isnull()].head()

Unnamed: 0,transaction_date,ticker,asset_description,asset_type,type,amount,comment,senator,ptr_link,disclosure_date
8479,2014-12-18,TJX,"The TJX Companies, Inc. (NYSE)",,purchase,"$1,001 - $15,000",--,Sheldon Whitehouse,https://efdsearch.senate.gov/search/view/ptr/3...,2015-01-05
8480,2014-12-18,ORCL,Oracle Corporation (NYSE),,purchase,"$1,001 - $15,000",--,Sheldon Whitehouse,https://efdsearch.senate.gov/search/view/ptr/3...,2015-01-05
8481,2014-12-18,T,"AT&amp;T, Inc. (NYSE)",,purchase,"$1,001 - $15,000",--,Sheldon Whitehouse,https://efdsearch.senate.gov/search/view/ptr/3...,2015-01-05
8482,2014-12-18,KORS,Michael Kors Holdings Limited (NYSE),,sale_partial,"$1,001 - $15,000",--,Sheldon Whitehouse,https://efdsearch.senate.gov/search/view/ptr/3...,2015-01-05
8483,2014-11-25,PCAR,PACCAR Inc. (NASDAQ),,purchase,"$1,001 - $15,000",--,Sheldon Whitehouse,https://efdsearch.senate.gov/search/view/ptr/3...,2015-01-05


## Eliminating Remaining Columns
At this point it appears the remaining nulls in asset_type correspond to otherwise good columns, so we eliminated that and other columns we don't need.

In [46]:
senate = senate.drop(columns=['asset_type', 'comment', 'ptr_link'])
senate.isnull().sum()

transaction_date     0
ticker               0
asset_description    0
type                 0
amount               0
senator              0
disclosure_date      0
dtype: int64

Let's drop any null rows

In [47]:
senate.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7432 entries, 3 to 9170
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   transaction_date   7432 non-null   datetime64[ns]
 1   ticker             7432 non-null   object        
 2   asset_description  7432 non-null   object        
 3   type               7432 non-null   object        
 4   amount             7432 non-null   object        
 5   senator            7432 non-null   object        
 6   disclosure_date    7432 non-null   datetime64[ns]
dtypes: datetime64[ns](2), object(5)
memory usage: 464.5+ KB


# `'amount'`

In [48]:
senate['amount'].value_counts()

$1,001 - $15,000             5343
$15,001 - $50,000            1314
$50,001 - $100,000            456
$100,001 - $250,000           249
$250,001 - $500,000            40
$500,001 - $1,000,000          11
$1,000,001 - $5,000,000        11
$5,000,001 - $25,000,000        6
Over $50,000,000                1
$25,000,001 - $50,000,000       1
Name: amount, dtype: int64

We see that one of the Senators invested over 50 million dollars in 1 trade! Let's see who it was

In [49]:
senate[senate['amount'] == "Over $50,000,000"]['senator']

7606    James M Inhofe
Name: senator, dtype: object

## Making the Amounts into the High End and Integers

In [50]:
senate['amount'] = senate['amount'].str.split(' ') 

for i in senate.index:
    if len(senate.loc[i, 'amount']) == 3:
        senate.loc[i, 'amount'] = senate.loc[i, 'amount'][2]
    else:
        senate.loc[i, 'amount'] = senate.loc[i, 'amount'][1]
        
senate['amount'] = senate['amount'].map(lambda x: x.replace('$', '').replace(',', ''))
senate['amount'] = senate['amount'].astype(int)
senate['amount'].value_counts()

15000       5343
50000       1314
100000       456
250000       249
500000        40
1000000       11
5000000       11
25000000       6
50000000       2
Name: amount, dtype: int64

For some reason that didn't change all the values. We will change the rest manually

In [51]:
senate.head()

Unnamed: 0,transaction_date,ticker,asset_description,type,amount,senator,disclosure_date
3,2022-06-03,WFC,Wells Fargo Company,purchase,15000,"A. Mitchell Mcconnell, Jr.",2022-06-13
4,2022-05-31,X,"United States Steel Corporation <div class=""te...",sale_partial,50000,Thomas H Tuberville,2022-06-13
5,2022-05-31,X,"United States Steel Corporation <div class=""te...",sale_full,50000,Thomas H Tuberville,2022-06-13
6,2022-05-31,X,United States Steel Corporation,purchase,250000,Thomas H Tuberville,2022-06-13
7,2022-05-20,PYPL,"PayPal Holdings, Inc. -",purchase,100000,Thomas H Tuberville,2022-06-13


In [52]:
senate.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7432 entries, 3 to 9170
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   transaction_date   7432 non-null   datetime64[ns]
 1   ticker             7432 non-null   object        
 2   asset_description  7432 non-null   object        
 3   type               7432 non-null   object        
 4   amount             7432 non-null   int32         
 5   senator            7432 non-null   object        
 6   disclosure_date    7432 non-null   datetime64[ns]
dtypes: datetime64[ns](2), int32(1), object(4)
memory usage: 693.5+ KB


# Column Names and Chamber Column
Now I will rename the senator column to be 'name' and make a column named 'chamber' where the value is all senator

In [53]:
senate.rename(columns = {'senator':'name'}, inplace = True)
senate['chamber'] = 'senate'

In [54]:
senate.head()

Unnamed: 0,transaction_date,ticker,asset_description,type,amount,name,disclosure_date,chamber
3,2022-06-03,WFC,Wells Fargo Company,purchase,15000,"A. Mitchell Mcconnell, Jr.",2022-06-13,senate
4,2022-05-31,X,"United States Steel Corporation <div class=""te...",sale_partial,50000,Thomas H Tuberville,2022-06-13,senate
5,2022-05-31,X,"United States Steel Corporation <div class=""te...",sale_full,50000,Thomas H Tuberville,2022-06-13,senate
6,2022-05-31,X,United States Steel Corporation,purchase,250000,Thomas H Tuberville,2022-06-13,senate
7,2022-05-20,PYPL,"PayPal Holdings, Inc. -",purchase,100000,Thomas H Tuberville,2022-06-13,senate


# State, Birthday, Gender
Because of the changes in the dataframe, the dictionary is missing some people. I'm going to see if the House workflow works. 

In [55]:
legislators = pd.read_csv('./loren_data/relevant_legislators-v1.csv')

In [56]:
legislators.head()

Unnamed: 0,last_name,first_name,middle_name,suffix,full_name,birthday,gender,type,state,party,district
0,Brown,Sherrod,,,Sherrod Brown,1952-11-09,M,sen,OH,Democrat,
1,Cantwell,Maria,,,Maria Cantwell,1958-10-13,F,sen,WA,Democrat,
2,Cardin,Benjamin,L.,,Benjamin L. Cardin,1943-10-05,M,sen,MD,Democrat,
3,Carper,Thomas,Richard,,Thomas R. Carper,1947-01-23,M,sen,DE,Democrat,
4,Casey,Robert,P.,Jr.,"Robert P. Casey, Jr.",1960-04-13,M,sen,PA,Democrat,


In [57]:
legislators[legislators['full_name'] == 'Thomas H Tuberville']

Unnamed: 0,last_name,first_name,middle_name,suffix,full_name,birthday,gender,type,state,party,district


In [58]:
legislators[legislators['full_name'] == 'A. Mitchell Mcconnell, Jr.']

Unnamed: 0,last_name,first_name,middle_name,suffix,full_name,birthday,gender,type,state,party,district


In [59]:
legislators[legislators['last_name'] == 'Tuberville']

Unnamed: 0,last_name,first_name,middle_name,suffix,full_name,birthday,gender,type,state,party,district
463,Tuberville,Tommy,Hawley,,Tommy Tuberville,1954-09-18,M,sen,AL,Republican,


In [60]:
legislators[legislators['last_name'] == 'McConnell']

Unnamed: 0,last_name,first_name,middle_name,suffix,full_name,birthday,gender,type,state,party,district
19,McConnell,Mitch,,,Mitch McConnell,1942-02-20,M,sen,KY,Republican,


In [61]:
senate['name'] = senate['name'].map(lambda x: x.replace(', Jr.', ''))
senate['name'] = senate['name'].map(lambda x: x.replace(', Jr', ''))
senate['name'] = senate['name'].map(lambda x: x.replace(', Iv', ''))
senate['name'] = senate['name'].map(lambda x: x.replace(', Iii', ''))
senate['name'] = senate['name'].map(lambda x: x.replace('Jerry Moran,', 'Jerry Moran'))

In [62]:
senate['first_name'] = senate['name'].map(lambda x: x.split()[0])
senate['last_name'] = senate['name'].map(lambda x: x.split()[-1])
senate.loc[senate['first_name']== 'A.', 'first_name'] = 'Mitchell'
senate.loc[senate['last_name']== 'Mcconnell', 'last_name'] = 'McConnell'
senate.loc[senate['last_name']== 'Hollen', 'last_name'] = 'Van Hollen'

In [63]:
senate.head()

Unnamed: 0,transaction_date,ticker,asset_description,type,amount,name,disclosure_date,chamber,first_name,last_name
3,2022-06-03,WFC,Wells Fargo Company,purchase,15000,A. Mitchell Mcconnell,2022-06-13,senate,Mitchell,McConnell
4,2022-05-31,X,"United States Steel Corporation <div class=""te...",sale_partial,50000,Thomas H Tuberville,2022-06-13,senate,Thomas,Tuberville
5,2022-05-31,X,"United States Steel Corporation <div class=""te...",sale_full,50000,Thomas H Tuberville,2022-06-13,senate,Thomas,Tuberville
6,2022-05-31,X,United States Steel Corporation,purchase,250000,Thomas H Tuberville,2022-06-13,senate,Thomas,Tuberville
7,2022-05-20,PYPL,"PayPal Holdings, Inc. -",purchase,100000,Thomas H Tuberville,2022-06-13,senate,Thomas,Tuberville


In [64]:
senate[senate['first_name'] == 'A.']

Unnamed: 0,transaction_date,ticker,asset_description,type,amount,name,disclosure_date,chamber,first_name,last_name


In [65]:
senate[senate['last_name'] == 'Jr.']

Unnamed: 0,transaction_date,ticker,asset_description,type,amount,name,disclosure_date,chamber,first_name,last_name


In [66]:
senate[senate['last_name'] == 'Manchin,']

Unnamed: 0,transaction_date,ticker,asset_description,type,amount,name,disclosure_date,chamber,first_name,last_name


In [67]:
len(senate['last_name'].value_counts().keys())

48

In [68]:
len(senate['first_name'].value_counts().keys())

38

In [69]:
senate['party'] = ''
senate['birthday'] = ''
senate['gender'] = ''
senate['represents'] = ''

count = 0
name = set()
for i in senate.index:

    #first_name = senate.loc[i, 'first_name']
    last_name = senate.loc[i, 'last_name']
    party = legislators[(legislators['last_name'] == last_name)]['party'].values #(legislators['first_name'] == first_name) & 
    birthday = legislators[(legislators['last_name'] == last_name)]['birthday'].values #(legislators['first_name'] == first_name) & 
    gender = legislators[(legislators['last_name'] == last_name)]['gender'].values #(legislators['first_name'] == first_name) &
    state = legislators[(legislators['last_name'] == last_name)]['state'].values #(legislators['first_name'] == first_name) &
    #if len(party) == 0:
    #    party = relevant_legislators[(relevant_legislators['district'] == float(house.loc[i, 'represents'][-2:])) & (relevant_legislators['last_name'].map(lambda x: x.split()[-1]) == last_name)]['party'].values
    #    birthday = relevant_legislators[(relevant_legislators['district'] == float(house.loc[i, 'represents'][-2:])) & (relevant_legislators['last_name'].map(lambda x: x.split()[-1]) == last_name)]['birthday'].values
    #    gender = relevant_legislators[(relevant_legislators['district'] == float(house.loc[i, 'represents'][-2:])) & (relevant_legislators['last_name'].map(lambda x: x.split()[-1]) == last_name)]['gender'].values
    if len(party) == 0:
        name.add(str(first_name) + ' ' + str(last_name))
    count += 1
    senate.loc[i, 'party'] = party[0]
    senate.loc[i, 'birthday'] = birthday[0]
    senate.loc[i, 'gender'] = gender[0]
    senate.loc[i, 'represents'] = state[0]
    #except:
        #print(i)
        #break
print(len(name))  

0


In [70]:
count

7432

In [71]:
name

set()

In [72]:
party

array(['Democrat', 'Republican'], dtype=object)

In [73]:
last_name

'Reed'

In [74]:
legislators[legislators['last_name'] == 'Van Hollen']

Unnamed: 0,last_name,first_name,middle_name,suffix,full_name,birthday,gender,type,state,party,district
198,Van Hollen,Chris,,,Chris Van Hollen,1959-01-10,M,sen,MD,Democrat,


In [75]:
senate[senate['last_name'] == 'Reed']

Unnamed: 0,transaction_date,ticker,asset_description,type,amount,name,disclosure_date,chamber,first_name,last_name,party,birthday,gender,represents
3418,2019-02-28,XLNX,"Xilinx, Inc.",sale_full,50000,John F Reed,2019-03-18,senate,John,Reed,Democrat,1949-11-12,M,RI
3419,2019-02-28,TMO,Thermo Fisher Scientific Inc.,sale_full,15000,John F Reed,2019-03-18,senate,John,Reed,Democrat,1949-11-12,M,RI
3420,2019-02-28,SYK,Stryker Corporation,sale_full,50000,John F Reed,2019-03-18,senate,John,Reed,Democrat,1949-11-12,M,RI
3421,2019-02-28,SLB,Schlumberger Limited,sale_full,15000,John F Reed,2019-03-18,senate,John,Reed,Democrat,1949-11-12,M,RI
3422,2019-02-28,QCOM,QUALCOMM Incorporated,sale_full,50000,John F Reed,2019-03-18,senate,John,Reed,Democrat,1949-11-12,M,RI
3423,2019-02-28,PEP,"PepsiCo, Inc.",sale_full,15000,John F Reed,2019-03-18,senate,John,Reed,Democrat,1949-11-12,M,RI
3424,2019-02-28,MSFT,Microsoft Corporation,sale_full,15000,John F Reed,2019-03-18,senate,John,Reed,Democrat,1949-11-12,M,RI
3425,2019-02-28,ISRG,"Intuitive Surgical, Inc.",sale_full,15000,John F Reed,2019-03-18,senate,John,Reed,Democrat,1949-11-12,M,RI
3426,2019-02-28,INTC,Intel Corporation,sale_full,15000,John F Reed,2019-03-18,senate,John,Reed,Democrat,1949-11-12,M,RI
3427,2019-02-28,IQV,IQVIA Holdings Inc.,sale_full,15000,John F Reed,2019-03-18,senate,John,Reed,Democrat,1949-11-12,M,RI


In [76]:
senate['birthday'] = pd.to_datetime(senate['birthday'], format = '%Y-%m-%d')
senate.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7432 entries, 3 to 9170
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   transaction_date   7432 non-null   datetime64[ns]
 1   ticker             7432 non-null   object        
 2   asset_description  7432 non-null   object        
 3   type               7432 non-null   object        
 4   amount             7432 non-null   int32         
 5   name               7432 non-null   object        
 6   disclosure_date    7432 non-null   datetime64[ns]
 7   chamber            7432 non-null   object        
 8   first_name         7432 non-null   object        
 9   last_name          7432 non-null   object        
 10  party              7432 non-null   object        
 11  birthday           7432 non-null   datetime64[ns]
 12  gender             7432 non-null   object        
 13  represents         7432 non-null   object        
dtypes: datet

In [77]:
senate.columns

Index(['transaction_date', 'ticker', 'asset_description', 'type', 'amount',
       'name', 'disclosure_date', 'chamber', 'first_name', 'last_name',
       'party', 'birthday', 'gender', 'represents'],
      dtype='object')

In [78]:
senate.head()

Unnamed: 0,transaction_date,ticker,asset_description,type,amount,name,disclosure_date,chamber,first_name,last_name,party,birthday,gender,represents
3,2022-06-03,WFC,Wells Fargo Company,purchase,15000,A. Mitchell Mcconnell,2022-06-13,senate,Mitchell,McConnell,Republican,1942-02-20,M,KY
4,2022-05-31,X,"United States Steel Corporation <div class=""te...",sale_partial,50000,Thomas H Tuberville,2022-06-13,senate,Thomas,Tuberville,Republican,1954-09-18,M,AL
5,2022-05-31,X,"United States Steel Corporation <div class=""te...",sale_full,50000,Thomas H Tuberville,2022-06-13,senate,Thomas,Tuberville,Republican,1954-09-18,M,AL
6,2022-05-31,X,United States Steel Corporation,purchase,250000,Thomas H Tuberville,2022-06-13,senate,Thomas,Tuberville,Republican,1954-09-18,M,AL
7,2022-05-20,PYPL,"PayPal Holdings, Inc. -",purchase,100000,Thomas H Tuberville,2022-06-13,senate,Thomas,Tuberville,Republican,1954-09-18,M,AL


Finally, we will export the cleaned csv file.

In [79]:
senate.to_csv('loren_data/loren_cleaned_senate.csv')

In [80]:
house = pd.read_csv('loren_data/clean_house_2022-07-15.csv')

In [81]:
all_reps = pd.concat([senate, house], axis = 0)

In [82]:
all_reps['transaction_date'] = pd.to_datetime(all_reps['transaction_date'], format = '%Y-%m-%d')
all_reps['disclosure_date'] = pd.to_datetime(all_reps['disclosure_date'], format = '%Y-%m-%d')
all_reps['birthday'] = pd.to_datetime(all_reps['birthday'], format = '%Y-%m-%d')

In [83]:
all_reps.head()

Unnamed: 0,transaction_date,ticker,asset_description,type,amount,name,disclosure_date,chamber,first_name,last_name,party,birthday,gender,represents
3,2022-06-03,WFC,Wells Fargo Company,purchase,15000,A. Mitchell Mcconnell,2022-06-13,senate,Mitchell,McConnell,Republican,1942-02-20,M,KY
4,2022-05-31,X,"United States Steel Corporation <div class=""te...",sale_partial,50000,Thomas H Tuberville,2022-06-13,senate,Thomas,Tuberville,Republican,1954-09-18,M,AL
5,2022-05-31,X,"United States Steel Corporation <div class=""te...",sale_full,50000,Thomas H Tuberville,2022-06-13,senate,Thomas,Tuberville,Republican,1954-09-18,M,AL
6,2022-05-31,X,United States Steel Corporation,purchase,250000,Thomas H Tuberville,2022-06-13,senate,Thomas,Tuberville,Republican,1954-09-18,M,AL
7,2022-05-20,PYPL,"PayPal Holdings, Inc. -",purchase,100000,Thomas H Tuberville,2022-06-13,senate,Thomas,Tuberville,Republican,1954-09-18,M,AL


In [84]:
all_reps.tail()

Unnamed: 0,transaction_date,ticker,asset_description,type,amount,name,disclosure_date,chamber,first_name,last_name,party,birthday,gender,represents
13606,2020-04-09,SWK,"Stanley Black & Decker, Inc.",sale_partial,15000,Ed Perlmutter,2020-06-10,house,,,Democrat,1953-05-01,M,CO07
13607,2020-04-09,USB,U.S. Bancorp,sale_partial,15000,Ed Perlmutter,2020-06-10,house,,,Democrat,1953-05-01,M,CO07
13608,2020-03-13,BMY,Bristol-Myers Squibb Company,sale_full,250000,Nicholas Van Taylor,2020-06-10,house,,,Republican,1972-08-01,M,TX03
13609,2020-03-13,LLY,Eli Lilly and Company,sale_full,1000000,Nicholas Van Taylor,2020-06-10,house,,,Republican,1972-08-01,M,TX03
13610,2020-03-13,DIS,Walt Disney Company,sale_full,500000,Nicholas Van Taylor,2020-06-10,house,,,Republican,1972-08-01,M,TX03


In [85]:
all_reps.reset_index(drop = True, inplace = True)
all_reps.head()

Unnamed: 0,transaction_date,ticker,asset_description,type,amount,name,disclosure_date,chamber,first_name,last_name,party,birthday,gender,represents
0,2022-06-03,WFC,Wells Fargo Company,purchase,15000,A. Mitchell Mcconnell,2022-06-13,senate,Mitchell,McConnell,Republican,1942-02-20,M,KY
1,2022-05-31,X,"United States Steel Corporation <div class=""te...",sale_partial,50000,Thomas H Tuberville,2022-06-13,senate,Thomas,Tuberville,Republican,1954-09-18,M,AL
2,2022-05-31,X,"United States Steel Corporation <div class=""te...",sale_full,50000,Thomas H Tuberville,2022-06-13,senate,Thomas,Tuberville,Republican,1954-09-18,M,AL
3,2022-05-31,X,United States Steel Corporation,purchase,250000,Thomas H Tuberville,2022-06-13,senate,Thomas,Tuberville,Republican,1954-09-18,M,AL
4,2022-05-20,PYPL,"PayPal Holdings, Inc. -",purchase,100000,Thomas H Tuberville,2022-06-13,senate,Thomas,Tuberville,Republican,1954-09-18,M,AL


In [86]:
all_reps.shape

(21043, 14)

In [87]:
all_reps.to_csv('./loren_data/loren_complete_data.csv', index = False)

In [88]:
tickers.to_csv('loren_data/updated_ticker_symbols-v2.csv', index = False)

___