In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as plt
import requests
from bs4 import BeautifulSoup

In [2]:
proxies = {'https': 'http://127.0.0.1:7769'} #I need this on my computer

In [3]:
#taken from https://senatestockwatcher.com/api
senate = pd.read_csv('datasets/all_transactions_senate.csv')

In [4]:
senate.head()

Unnamed: 0,transaction_date,owner,ticker,asset_description,asset_type,type,amount,comment,senator,ptr_link,disclosure_date
0,6/21/2022,Spouse,--,"Broadcom Corp <div class=""text-muted""><em>Rate...",Corporate Bond,Purchase,"$15,001 - $50,000",--,Thomas R Carper,https://efdsearch.senate.gov/search/view/ptr/a...,7/5/2022
1,6/15/2022,Spouse,--,"Office Properties Income Trust <div class=""tex...",Corporate Bond,Sale (Full),"$1,001 - $15,000",--,Thomas R Carper,https://efdsearch.senate.gov/search/view/ptr/a...,7/5/2022
2,5/17/2022,Spouse,--,Lee County Florida Health Care Facilities Reve...,Municipal Security,Purchase,"$500,001 - $1,000,000",--,Rick Scott,https://efdsearch.senate.gov/search/view/ptr/8...,6/16/2022
3,6/3/2022,Spouse,WFC,Wells Fargo &amp; Company Common Stock,Stock,Purchase,"$1,001 - $15,000",Dividend Reinvestment,"A. Mitchell Mcconnell, Jr.",https://efdsearch.senate.gov/search/view/ptr/5...,6/13/2022
4,5/31/2022,Joint,X,United States Steel Corporation Common Stock <...,Stock Option,Sale (Partial),"$15,001 - $50,000",--,Thomas H Tuberville,https://efdsearch.senate.gov/search/view/ptr/4...,6/13/2022


Let's turn the dates into datetimes

In [5]:
senate['transaction_date'] = pd.to_datetime(senate['transaction_date'])
type(senate['transaction_date'][0])

pandas._libs.tslibs.timestamps.Timestamp

Now let's create a year column

In [6]:
senate['year'] = senate['transaction_date'].map(lambda x: x.year)
senate.head()

Unnamed: 0,transaction_date,owner,ticker,asset_description,asset_type,type,amount,comment,senator,ptr_link,disclosure_date,year
0,2022-06-21,Spouse,--,"Broadcom Corp <div class=""text-muted""><em>Rate...",Corporate Bond,Purchase,"$15,001 - $50,000",--,Thomas R Carper,https://efdsearch.senate.gov/search/view/ptr/a...,7/5/2022,2022
1,2022-06-15,Spouse,--,"Office Properties Income Trust <div class=""tex...",Corporate Bond,Sale (Full),"$1,001 - $15,000",--,Thomas R Carper,https://efdsearch.senate.gov/search/view/ptr/a...,7/5/2022,2022
2,2022-05-17,Spouse,--,Lee County Florida Health Care Facilities Reve...,Municipal Security,Purchase,"$500,001 - $1,000,000",--,Rick Scott,https://efdsearch.senate.gov/search/view/ptr/8...,6/16/2022,2022
3,2022-06-03,Spouse,WFC,Wells Fargo &amp; Company Common Stock,Stock,Purchase,"$1,001 - $15,000",Dividend Reinvestment,"A. Mitchell Mcconnell, Jr.",https://efdsearch.senate.gov/search/view/ptr/5...,6/13/2022,2022
4,2022-05-31,Joint,X,United States Steel Corporation Common Stock <...,Stock Option,Sale (Partial),"$15,001 - $50,000",--,Thomas H Tuberville,https://efdsearch.senate.gov/search/view/ptr/4...,6/13/2022,2022


In [7]:
senate['year'].value_counts()

2020    1535
2018    1395
2017    1372
2015    1152
2019    1037
2016     977
2014     727
2021     612
2022     347
2013     184
2012      60
Name: year, dtype: int64

Now we see how far back in terms of year, we can drop the column

In [8]:
senate = senate.drop(columns=['year'])
senate.head()

Unnamed: 0,transaction_date,owner,ticker,asset_description,asset_type,type,amount,comment,senator,ptr_link,disclosure_date
0,2022-06-21,Spouse,--,"Broadcom Corp <div class=""text-muted""><em>Rate...",Corporate Bond,Purchase,"$15,001 - $50,000",--,Thomas R Carper,https://efdsearch.senate.gov/search/view/ptr/a...,7/5/2022
1,2022-06-15,Spouse,--,"Office Properties Income Trust <div class=""tex...",Corporate Bond,Sale (Full),"$1,001 - $15,000",--,Thomas R Carper,https://efdsearch.senate.gov/search/view/ptr/a...,7/5/2022
2,2022-05-17,Spouse,--,Lee County Florida Health Care Facilities Reve...,Municipal Security,Purchase,"$500,001 - $1,000,000",--,Rick Scott,https://efdsearch.senate.gov/search/view/ptr/8...,6/16/2022
3,2022-06-03,Spouse,WFC,Wells Fargo &amp; Company Common Stock,Stock,Purchase,"$1,001 - $15,000",Dividend Reinvestment,"A. Mitchell Mcconnell, Jr.",https://efdsearch.senate.gov/search/view/ptr/5...,6/13/2022
4,2022-05-31,Joint,X,United States Steel Corporation Common Stock <...,Stock Option,Sale (Partial),"$15,001 - $50,000",--,Thomas H Tuberville,https://efdsearch.senate.gov/search/view/ptr/4...,6/13/2022


Let's look at null values

In [9]:
senate.isnull().sum()

transaction_date       0
owner                465
ticker               465
asset_description      0
asset_type           666
type                 465
amount                 0
comment              465
senator                0
ptr_link               0
disclosure_date        0
dtype: int64

We can see that there is a lot of null values. However, we won't need owner, asset_type, comment, or ptr_link

In [10]:
senate = senate.drop(columns=['owner', 'asset_type', 'comment', 'ptr_link'])
senate.isnull().sum()

transaction_date       0
ticker               465
asset_description      0
type                 465
amount                 0
senator                0
disclosure_date        0
dtype: int64

We now just have to deal with ticker and type. Before we do that though, let's look at how much these Senators are trading

In [11]:
senate['amount'].value_counts()

$1,001 - $15,000             6084
$15,001 - $50,000            1664
$50,001 - $100,000            551
Unknown                       465
$100,001 - $250,000           335
$250,001 - $500,000           138
$500,001 - $1,000,000         111
$1,000,001 - $5,000,000        35
$5,000,001 - $25,000,000       12
$25,000,001 - $50,000,000       2
Over $50,000,000                1
Name: amount, dtype: int64

We see that one of the Senators invested over 50 million dollars in 1 trade! Let's see who it was

In [12]:
senate[senate['amount'] == "Over $50,000,000"]['senator']

7606    James M Inhofe
Name: senator, dtype: object

In [13]:
pd.options.display.max_rows = None


Let's look at the different tickers used in the data.

In [14]:
senate['ticker'].value_counts()

--                    1858
AAPL                   177
BAC                     86
MSFT                    85
NFLX                    79
PFE                     77
DIS                     75
DISCA                   74
T                       70
FEYE                    67
FDC                     66
URBN                    65
CZR                     63
FB                      58
AMZN                    55
NVDA                    54
WFC                     51
GE                      49
MRK                     46
PG                      45
WPX                     44
XOM                     44
DD                      44
WMT                     44
GM                      43
CVS                     43
HBI                     42
CSCO                    42
BA                      42
FDX                     41
INTC                    40
GPK                     40
MOS                     39
GLW                     39
CVX                     39
BWXT                    39
LYV                     38
Q

We see that 1858 entries have no ticker on them. Let's find out what percentage of the day that is. But first let's find out how many rows we have.

In [15]:
senate['ticker'].value_counts().sum()

8933

We have 8933 rows

In [16]:
(1858/8933) * 100

20.799283555356542

So almost 20.8% of the data has no ticker price. Sadily we will have to drop it, as we don't have anything to compare these trades to. Luckily we will still have over 6000 rows which is plenty to work with.

In [17]:
senate = senate[senate['ticker'] != '--']

In [18]:
senate.head()

Unnamed: 0,transaction_date,ticker,asset_description,type,amount,senator,disclosure_date
3,2022-06-03,WFC,Wells Fargo &amp; Company Common Stock,Purchase,"$1,001 - $15,000","A. Mitchell Mcconnell, Jr.",6/13/2022
4,2022-05-31,X,United States Steel Corporation Common Stock <...,Sale (Partial),"$15,001 - $50,000",Thomas H Tuberville,6/13/2022
5,2022-05-31,X,United States Steel Corporation Common Stock <...,Sale (Full),"$15,001 - $50,000",Thomas H Tuberville,6/13/2022
6,2022-05-31,X,United States Steel Corporation Common Stock,Purchase,"$100,001 - $250,000",Thomas H Tuberville,6/13/2022
7,2022-05-20,PYPL,"PayPal Holdings, Inc. - Common Stock",Purchase,"$50,001 - $100,000",Thomas H Tuberville,6/13/2022


In [19]:
senate['ticker'].value_counts()

AAPL                  177
BAC                    86
MSFT                   85
NFLX                   79
PFE                    77
DIS                    75
DISCA                  74
T                      70
FEYE                   67
FDC                    66
URBN                   65
CZR                    63
FB                     58
AMZN                   55
NVDA                   54
WFC                    51
GE                     49
MRK                    46
PG                     45
XOM                    44
WPX                    44
DD                     44
WMT                    44
GM                     43
CVS                    43
HBI                    42
BA                     42
CSCO                   42
FDX                    41
GPK                    40
INTC                   40
MOS                    39
GLW                    39
BWXT                   39
CVX                    39
SBUX                   38
QCOM                   38
GILD                   38
LYV         

Now let's check our Null values again

In [20]:
senate.isnull().sum()

transaction_date       0
ticker               465
asset_description      0
type                 465
amount                 0
senator                0
disclosure_date        0
dtype: int64

Let's drop any null rows

In [21]:
senate.dropna(inplace=True)

In [22]:
senate.isnull().sum()

transaction_date     0
ticker               0
asset_description    0
type                 0
amount               0
senator              0
disclosure_date      0
dtype: int64

Let's run head again and see what we need to edit.

In [23]:
senate.head()

Unnamed: 0,transaction_date,ticker,asset_description,type,amount,senator,disclosure_date
3,2022-06-03,WFC,Wells Fargo &amp; Company Common Stock,Purchase,"$1,001 - $15,000","A. Mitchell Mcconnell, Jr.",6/13/2022
4,2022-05-31,X,United States Steel Corporation Common Stock <...,Sale (Partial),"$15,001 - $50,000",Thomas H Tuberville,6/13/2022
5,2022-05-31,X,United States Steel Corporation Common Stock <...,Sale (Full),"$15,001 - $50,000",Thomas H Tuberville,6/13/2022
6,2022-05-31,X,United States Steel Corporation Common Stock,Purchase,"$100,001 - $250,000",Thomas H Tuberville,6/13/2022
7,2022-05-20,PYPL,"PayPal Holdings, Inc. - Common Stock",Purchase,"$50,001 - $100,000",Thomas H Tuberville,6/13/2022


In [24]:
senate.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7075 entries, 3 to 9165
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   transaction_date   7075 non-null   datetime64[ns]
 1   ticker             7075 non-null   object        
 2   asset_description  7075 non-null   object        
 3   type               7075 non-null   object        
 4   amount             7075 non-null   object        
 5   senator            7075 non-null   object        
 6   disclosure_date    7075 non-null   object        
dtypes: datetime64[ns](1), object(6)
memory usage: 442.2+ KB


We need to change amount and the discloser date. Let's round down for amount.

In [25]:
senate['amount'].value_counts()

$1,001 - $15,000             5066
$15,001 - $50,000            1248
$50,001 - $100,000            442
$100,001 - $250,000           249
$250,001 - $500,000            43
$500,001 - $1,000,000          11
$1,000,001 - $5,000,000        10
$5,000,001 - $25,000,000        5
$25,000,001 - $50,000,000       1
Name: amount, dtype: int64

In [26]:
list_of_amounts = []
nums_list = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '0']
for i in senate['amount']:
    c = []
    for j in i:
        if j in nums_list:
            c.append(j)
        elif j == " ":
            break
    list_of_amounts.append(int("".join(c)))
list_of_amounts

[1001,
 15001,
 15001,
 100001,
 50001,
 250001,
 1001,
 1001,
 1001,
 1001,
 1001,
 1001,
 1001,
 1001,
 1001,
 1001,
 1001,
 1001,
 1001,
 1001,
 1001,
 1001,
 1001,
 100001,
 100001,
 250001,
 100001,
 1001,
 100001,
 1001,
 15001,
 50001,
 1001,
 50001,
 100001,
 15001,
 1001,
 1001,
 15001,
 1001,
 1001,
 1001,
 1001,
 1001,
 1001,
 1001,
 1001,
 1001,
 1001,
 1001,
 1001,
 1001,
 1001,
 1001,
 1001,
 1001,
 1001,
 1001,
 1001,
 1001,
 1001,
 1001,
 1001,
 1001,
 1001,
 1001,
 1001,
 1001,
 1001,
 1001,
 1001,
 1001,
 15001,
 1001,
 1001,
 1001,
 1001,
 1001,
 1001,
 1001,
 1001,
 1001,
 1001,
 15001,
 1001,
 1001,
 15001,
 15001,
 50001,
 1001,
 1001,
 1001,
 1001,
 1001,
 1001,
 1001,
 15001,
 15001,
 1001,
 1001,
 1001,
 1001,
 1001,
 1001,
 1001,
 1001,
 1001,
 1001,
 1001,
 1001,
 1001,
 50001,
 50001,
 1001,
 1001,
 1001,
 1001,
 1001,
 1001,
 50001,
 1001,
 1001,
 1001,
 1001,
 1001,
 100001,
 1001,
 1001,
 1001,
 15001,
 1001,
 1001,
 1001,
 1001,
 50001,
 15001,
 1001,
 1

In [27]:
for i in range(len(list_of_amounts)):
    senate['amount'][i] = list_of_amounts[i]
senate['amount'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  senate['amount'][i] = list_of_amounts[i]


1001                         4058
$1,001 - $15,000             1072
15001                        1028
50001                         346
100001                        224
$15,001 - $50,000             206
$50,001 - $100,000             46
250001                         42
$100,001 - $250,000            24
500001                         10
1000001                         9
5000001                         4
$250,001 - $500,000             4
25000001                        1
$25,000,001 - $50,000,000       1
Name: amount, dtype: int64

For some reason that didn't change all the values. We will change the rest manually

In [28]:
senate['amount'] = senate['amount'].map(lambda x: 1001 if x == '$1,001 - $15,000' else x)

In [29]:
senate['amount'] = senate['amount'].map(lambda x: 15001 if x == '$15,001 - $50,000' else x)

In [30]:
senate['amount'] = senate['amount'].map(lambda x: 50001 if x == '$50,001 - $100,000' else x)

In [31]:
senate['amount'] = senate['amount'].map(lambda x: 100001 if x == '$100,001 - $250,000' else x)

In [32]:
senate['amount'] = senate['amount'].map(lambda x: 250001 if x == '$250,001 - $500,000' else x)

In [33]:
senate['amount'] = senate['amount'].map(lambda x: 2500001 if x == '$25,000,001 - $50,000,000' else x)

In [34]:
senate['amount'].value_counts()

1001        5130
15001       1234
50001        392
100001       248
250001        46
500001        10
1000001        9
5000001        4
25000001       1
2500001        1
Name: amount, dtype: int64

In [35]:
senate.head()

Unnamed: 0,transaction_date,ticker,asset_description,type,amount,senator,disclosure_date
3,2022-06-03,WFC,Wells Fargo &amp; Company Common Stock,Purchase,100001,"A. Mitchell Mcconnell, Jr.",6/13/2022
4,2022-05-31,X,United States Steel Corporation Common Stock <...,Sale (Partial),50001,Thomas H Tuberville,6/13/2022
5,2022-05-31,X,United States Steel Corporation Common Stock <...,Sale (Full),250001,Thomas H Tuberville,6/13/2022
6,2022-05-31,X,United States Steel Corporation Common Stock,Purchase,1001,Thomas H Tuberville,6/13/2022
7,2022-05-20,PYPL,"PayPal Holdings, Inc. - Common Stock",Purchase,1001,Thomas H Tuberville,6/13/2022


In [36]:
senate.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7075 entries, 3 to 9165
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   transaction_date   7075 non-null   datetime64[ns]
 1   ticker             7075 non-null   object        
 2   asset_description  7075 non-null   object        
 3   type               7075 non-null   object        
 4   amount             7075 non-null   int64         
 5   senator            7075 non-null   object        
 6   disclosure_date    7075 non-null   object        
dtypes: datetime64[ns](1), int64(1), object(5)
memory usage: 700.2+ KB


Now it is time for disclosure_date

In [37]:
senate['disclosure_date'] = pd.to_datetime(senate['disclosure_date'])
senate.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7075 entries, 3 to 9165
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   transaction_date   7075 non-null   datetime64[ns]
 1   ticker             7075 non-null   object        
 2   asset_description  7075 non-null   object        
 3   type               7075 non-null   object        
 4   amount             7075 non-null   int64         
 5   senator            7075 non-null   object        
 6   disclosure_date    7075 non-null   datetime64[ns]
dtypes: datetime64[ns](2), int64(1), object(4)
memory usage: 700.2+ KB


Now I will rename the senator column to be 'name' and make a column named 'chamber' where the value is all senator

In [38]:
senate.rename(columns = {'senator':'name'}, inplace = True)
senate.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7075 entries, 3 to 9165
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   transaction_date   7075 non-null   datetime64[ns]
 1   ticker             7075 non-null   object        
 2   asset_description  7075 non-null   object        
 3   type               7075 non-null   object        
 4   amount             7075 non-null   int64         
 5   name               7075 non-null   object        
 6   disclosure_date    7075 non-null   datetime64[ns]
dtypes: datetime64[ns](2), int64(1), object(4)
memory usage: 700.2+ KB


In [39]:
senate['chamber'] = 'senate'
senate.head()

Unnamed: 0,transaction_date,ticker,asset_description,type,amount,name,disclosure_date,chamber
3,2022-06-03,WFC,Wells Fargo &amp; Company Common Stock,Purchase,100001,"A. Mitchell Mcconnell, Jr.",2022-06-13,senate
4,2022-05-31,X,United States Steel Corporation Common Stock <...,Sale (Partial),50001,Thomas H Tuberville,2022-06-13,senate
5,2022-05-31,X,United States Steel Corporation Common Stock <...,Sale (Full),250001,Thomas H Tuberville,2022-06-13,senate
6,2022-05-31,X,United States Steel Corporation Common Stock,Purchase,1001,Thomas H Tuberville,2022-06-13,senate
7,2022-05-20,PYPL,"PayPal Holdings, Inc. - Common Stock",Purchase,1001,Thomas H Tuberville,2022-06-13,senate


Now, we have add the state to each Senator. Let's see which Senator is on the list.

In [42]:
c = 0
for i in senate['name'].unique():
    c += 1
c

47

In [41]:
senate.name.unique()

array(['A. Mitchell Mcconnell, Jr.', 'Thomas H Tuberville',
       'Thomas R Carper', 'William F Hagerty, Iv', 'John W Hickenlooper',
       'Gary C Peters', 'Tina Smith', 'John R Thune', 'Ron L Wyden',
       'Shelley M Capito', 'Mark R Warner', 'Susan M Collins',
       'Jerry Moran,', 'Jacklyn S Rosen', 'Patrick J Toomey',
       'Cynthia M Lummis', 'Roy Blunt', 'Sheldon Whitehouse',
       'Angus S King, Jr.', 'John Boozman', 'Ladda Tammy Duckworth',
       'Daniel S Sullivan', 'James M Inhofe', 'Pat Roberts',
       'William Cassidy', 'Kelly Loeffler', 'Timothy M Kaine',
       'David A Perdue , Jr', 'Roger F Wicker', 'John Hoeven',
       'John N Kennedy', 'Rafael E Cruz', 'Christopher A Coons',
       'Thomas Udall', 'John F Reed', 'Thomas R Tillis',
       'Robert P Casey, Jr.', 'Tammy Duckworth', 'Michael F Bennet',
       'Patty Murray', 'Joseph Manchin, Iii', 'Chris Van Hollen',
       'John Cornyn', 'Maria Cantwell', 'Michael  B Enzi',
       'Benjamin L Cardin', 'Cory A Bo

In [43]:
senate_state_dict = {'A. Mitchell Mcconnell, Jr.':'KY', 'Thomas H Tuberville':'AL',
       'Thomas R Carper':'DE', 'William F Hagerty, Iv':'TN', 'John W Hickenlooper':'CO',
       'Gary C Peters':'MI', 'Tina Smith':'MN', 'John R Thune':'SD', 'Ron L Wyden':'OR',
       'Shelley M Capito':'WV', 'Mark R Warner':'VA', 'Susan M Collins':'ME',
       'Jerry Moran,':'KS', 'Jacklyn S Rosen':'NV', 'Patrick J Toomey':'PA',
       'Cynthia M Lummis':'WY', 'Roy Blunt':'MI', 'Sheldon Whitehouse':'NY',
       'Angus S King, Jr.':'VA', 'John Boozman':'AR', 'Ladda Tammy Duckworth':'IL',
       'Daniel S Sullivan':'AK', 'James M Inhofe':'OK', 'Pat Roberts':'KS',
       'William Cassidy':'LA', 'Kelly Loeffler':'GA', 'Timothy M Kaine':'VA',
       'David A Perdue , Jr':'GA', 'Roger F Wicker':'MS', 'John Hoeven':'ND',
       'John N Kennedy':'LA', 'Rafael E Cruz':'TX', 'Christopher A Coons':'DE',
       'Thomas Udall':'NM', 'John F Reed':'RI', 'Thomas R Tillis':'NC',
       'Robert P Casey, Jr.':'PA', 'Tammy Duckworth':'IL', 'Michael F Bennet':'CO',
       'Patty Murray':'WA', 'Joseph Manchin, Iii':'WV', 'Chris Van Hollen':'MD',
       'John Cornyn':'TX', 'Maria Cantwell':'WA', 'Michael  B Enzi':'WY',
       'Benjamin L Cardin':'MD', 'Cory A Booker':'NJ'}

In [44]:
senate_party_dict = {'A. Mitchell Mcconnell, Jr.':'Rep', 'Thomas H Tuberville':'Rep',
       'Thomas R Carper':'Dem', 'William F Hagerty, Iv':'Rep', 'John W Hickenlooper':'Dem',
       'Gary C Peters':'Dem', 'Tina Smith':'Dem', 'John R Thune':'Rep', 'Ron L Wyden':'Dem',
       'Shelley M Capito':'Rep', 'Mark R Warner':'Dem', 'Susan M Collins':'Rep',
       'Jerry Moran,':'Rep', 'Jacklyn S Rosen':'Dem', 'Patrick J Toomey':'Rep',
       'Cynthia M Lummis':'Rep', 'Roy Blunt':'Rep', 'Sheldon Whitehouse':'Dem',
       'Angus S King, Jr.':'Dem', 'John Boozman':'Rep', 'Ladda Tammy Duckworth':'Dem',
       'Daniel S Sullivan':'Rep', 'James M Inhofe':'Rep', 'Pat Roberts':'Rep',
       'William Cassidy':'Rep', 'Kelly Loeffler':'Rep', 'Timothy M Kaine':'Dem',
       'David A Perdue , Jr':'Rep', 'Roger F Wicker':'Rep', 'John Hoeven':'Rep',
       'John N Kennedy':'Rep', 'Rafael E Cruz':'Rep', 'Christopher A Coons':'Dem',
       'Thomas Udall':'Dem', 'John F Reed':'Dem', 'Thomas R Tillis':'Rep',
       'Robert P Casey, Jr.':'Dem', 'Tammy Duckworth':'Dem', 'Michael F Bennet':'Dem',
       'Patty Murray':'Dem', 'Joseph Manchin, Iii':'Dem', 'Chris Van Hollen':'Dem',
       'John Cornyn':'Rep', 'Maria Cantwell':'Dem', 'Michael  B Enzi':'Rep',
       'Benjamin L Cardin':'Dem', 'Cory A Booker':'Dem'}

In [45]:
senate_birthday_dict = {'A. Mitchell Mcconnell, Jr.':'1942-02-20', 'Thomas H Tuberville':'1954-09-18',
       'Thomas R Carper':'1947-01-23', 'William F Hagerty, Iv':'1959-08-14', 'John W Hickenlooper':'1957-02-07',
       'Gary C Peters':'1958-12-01', 'Tina Smith':'1958-03-04', 'John R Thune':'1961-01-07', 'Ron L Wyden':'1949-05-03',
       'Shelley M Capito':'1953-11-26', 'Mark R Warner':'1954-12-15', 'Susan M Collins':'1952-12-07',
       'Jerry Moran,':'1954-05-29', 'Jacklyn S Rosen':'1957-08-02', 'Patrick J Toomey':'1961-11-17',
       'Cynthia M Lummis':'1954-09-10', 'Roy Blunt':'1950-01-10', 'Sheldon Whitehouse':'1955-10-20',
       'Angus S King, Jr.':'1944-03-31', 'John Boozman':'1950-12-20', 'Ladda Tammy Duckworth':'1968-03-12',
       'Daniel S Sullivan':'1964-11-13', 'James M Inhofe':'1934-11-17', 'Pat Roberts':'1936-04-20',
       'William Cassidy':'1957-09-28', 'Kelly Loeffler':'1970-11-27', 'Timothy M Kaine':'1958-02-26',
       'David A Perdue , Jr':'1949-12-10', 'Roger F Wicker':'1951-07-05', 'John Hoeven':'1957-03-13',
       'John N Kennedy':'1951-11-21', 'Rafael E Cruz':'1970-12-22', 'Christopher A Coons':'1963-09-09',
       'Thomas Udall':'1948-05-18', 'John F Reed':'1949-11-12', 'Thomas R Tillis':'1960-08-30',
       'Robert P Casey, Jr.':'1960-04-13', 'Tammy Duckworth':'1968-03-12', 'Michael F Bennet':'1964-11-28',
       'Patty Murray':'1950-10-11', 'Joseph Manchin, Iii':'1947-08-24', 'Chris Van Hollen':'1959-01-10',
       'John Cornyn':'1952-02-02', 'Maria Cantwell':'1958-10-13', 'Michael  B Enzi':'1944-02-01',
       'Benjamin L Cardin':'1943-10-05', 'Cory A Booker':'1969-04-27'}

In [46]:
senate_gender_dict = {'A. Mitchell Mcconnell, Jr.':'M', 'Thomas H Tuberville':'M',
       'Thomas R Carper':'M', 'William F Hagerty, Iv':'M', 'John W Hickenlooper':'M',
       'Gary C Peters':'M', 'Tina Smith':'F', 'John R Thune':'M', 'Ron L Wyden':'M',
       'Shelley M Capito':'F', 'Mark R Warner':'M', 'Susan M Collins':'F',
       'Jerry Moran,':'M', 'Jacklyn S Rosen':'F', 'Patrick J Toomey':'M',
       'Cynthia M Lummis':'F', 'Roy Blunt':'M', 'Sheldon Whitehouse':'M',
       'Angus S King, Jr.':'M', 'John Boozman':'M', 'Ladda Tammy Duckworth':'F',
       'Daniel S Sullivan':'M', 'James M Inhofe':'M', 'Pat Roberts':'M',
       'William Cassidy':'M', 'Kelly Loeffler':'F', 'Timothy M Kaine':'M',
       'David A Perdue , Jr':'M', 'Roger F Wicker':'M', 'John Hoeven':'M',
       'John N Kennedy':'M', 'Rafael E Cruz':'M', 'Christopher A Coons':'M',
       'Thomas Udall':'M', 'John F Reed':'M', 'Thomas R Tillis':'M',
       'Robert P Casey, Jr.':'M', 'Tammy Duckworth':'F', 'Michael F Bennet':'M',
       'Patty Murray':'F', 'Joseph Manchin, Iii':'M', 'Chris Van Hollen':'M',
       'John Cornyn':'M', 'Maria Cantwell':'F', 'Michael  B Enzi':'M',
       'Benjamin L Cardin':'M', 'Cory A Booker':'M'}

In [47]:
senate['state'] = senate['name'].map(lambda x: senate_state_dict[x])

In [48]:
senate['party'] = senate['name'].map(lambda x: senate_party_dict[x])

In [49]:
senate['birthday'] = senate['name'].map(lambda x: senate_birthday_dict[x])

In [50]:
senate['birthday'] = pd.to_datetime(senate['birthday'])
senate.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7075 entries, 3 to 9165
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   transaction_date   7075 non-null   datetime64[ns]
 1   ticker             7075 non-null   object        
 2   asset_description  7075 non-null   object        
 3   type               7075 non-null   object        
 4   amount             7075 non-null   int64         
 5   name               7075 non-null   object        
 6   disclosure_date    7075 non-null   datetime64[ns]
 7   chamber            7075 non-null   object        
 8   state              7075 non-null   object        
 9   party              7075 non-null   object        
 10  birthday           7075 non-null   datetime64[ns]
dtypes: datetime64[ns](3), int64(1), object(7)
memory usage: 921.3+ KB


In [52]:
senate['gender'] = senate['name'].map(lambda x: senate_gender_dict[x])

Let's look at what columns we have now

In [53]:
senate.columns

Index(['transaction_date', 'ticker', 'asset_description', 'type', 'amount',
       'name', 'disclosure_date', 'chamber', 'state', 'party', 'birthday',
       'gender'],
      dtype='object')

In [54]:
senate.head()

Unnamed: 0,transaction_date,ticker,asset_description,type,amount,name,disclosure_date,chamber,state,party,birthday,gender
3,2022-06-03,WFC,Wells Fargo &amp; Company Common Stock,Purchase,100001,"A. Mitchell Mcconnell, Jr.",2022-06-13,senate,KY,Rep,1942-02-20,M
4,2022-05-31,X,United States Steel Corporation Common Stock <...,Sale (Partial),50001,Thomas H Tuberville,2022-06-13,senate,AL,Rep,1954-09-18,M
5,2022-05-31,X,United States Steel Corporation Common Stock <...,Sale (Full),250001,Thomas H Tuberville,2022-06-13,senate,AL,Rep,1954-09-18,M
6,2022-05-31,X,United States Steel Corporation Common Stock,Purchase,1001,Thomas H Tuberville,2022-06-13,senate,AL,Rep,1954-09-18,M
7,2022-05-20,PYPL,"PayPal Holdings, Inc. - Common Stock",Purchase,1001,Thomas H Tuberville,2022-06-13,senate,AL,Rep,1954-09-18,M


Finally, we will export the cleaned csv file.

In [55]:
senate.to_csv('datasets/cleaned_senate.csv')

___