# EDA - Chris (superseded by the Nathan/Chris data-merge notebook)

In [113]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Seed NumPy's global random generator (project requirement)
np.random.seed(seed=73)

In [114]:
# Load both chambers' disclosures, indexed by member name.
# Initial data exploration revealed '--' was sometimes used for null values,
# and 'transaction_date' is a date column, so it is parsed on read.
shared_read_options = {'na_values': '--', 'parse_dates': ['transaction_date']}

senators = pd.read_csv('./data/senate_stocks.csv', index_col='senator', **shared_read_options)
reps = pd.read_csv('./data/house_stocks.csv', index_col='representative', **shared_read_options)

In [115]:
# Column names available in the senate file
senators.columns.tolist()

['transaction_date',
 'owner',
 'ticker',
 'asset_description',
 'asset_type',
 'type',
 'amount',
 'comment',
 'ptr_link']

In [116]:
# Column names available in the house file
reps.columns.tolist()

['disclosure_year',
 'disclosure_date',
 'transaction_date',
 'owner',
 'ticker',
 'asset_description',
 'type',
 'amount',
 'district',
 'ptr_link',
 'cap_gains_over_200_usd']

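**The columns shared by both files are `transaction_date`, `owner`, `ticker`, `asset_description`, `type`, `amount` and `ptr_link`. `asset_type` appears only in the senate file, and I am not yet sure whether `type` means the same thing in both files.**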

In [117]:
# Check the date conversion for senators:
# transaction_date should show as datetime64[ns] rather than object
senators.dtypes

transaction_date     datetime64[ns]
owner                        object
ticker                       object
asset_description            object
asset_type                   object
type                         object
amount                       object
comment                      object
ptr_link                     object
dtype: object

In [118]:
# Check the date conversion for reps:
# transaction_date should show as datetime64[ns] rather than object
reps.dtypes

disclosure_year                    int64
disclosure_date                   object
transaction_date          datetime64[ns]
owner                             object
ticker                            object
asset_description                 object
type                              object
amount                            object
district                          object
ptr_link                          object
cap_gains_over_200_usd              bool
dtype: object

In [119]:
# Preview the first five senate rows; focus on transaction_date, type and asset_type
senators.iloc[:5]

Unnamed: 0_level_0,transaction_date,owner,ticker,asset_description,asset_type,type,amount,comment,ptr_link
senator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Thomas R Carper,2021-05-26,Spouse,,"HSBC Contingent Autocall BMY <div class=""text-...",Corporate Bond,Sale (Full),"$15,001 - $50,000",,https://efdsearch.senate.gov/search/view/ptr/2...
Thomas R Carper,2021-05-26,Spouse,,"HSBC Contingent Autocall BMY <div class=""text-...",Corporate Bond,Sale (Full),"$15,001 - $50,000",,https://efdsearch.senate.gov/search/view/ptr/2...
Thomas R Carper,2021-05-07,Spouse,,"CS Contingent Autocall CSCO <div class=""text-m...",Corporate Bond,Purchase,"$15,001 - $50,000",,https://efdsearch.senate.gov/search/view/ptr/2...
Mike Rounds,2021-05-01,Joint,,"Bird Dog Hospitality Fund 1, LLC <div class=""t...",Non-Public Stock,Purchase,"$15,001 - $50,000",,https://efdsearch.senate.gov/search/view/ptr/3...
Shelley M Capito,2021-04-26,Spouse,WFC,Wells Fargo &amp; Company,Stock,Sale (Partial),"$1,001 - $15,000",,https://efdsearch.senate.gov/search/view/ptr/8...


In [120]:
# Preview the first five house rows; focus on transaction_date and type
reps.iloc[:5]

Unnamed: 0_level_0,disclosure_year,disclosure_date,transaction_date,owner,ticker,asset_description,type,amount,district,ptr_link,cap_gains_over_200_usd
representative,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Hon. Susie Lee,2020,04/28/2020,2020-02-24,joint,BHC,Bausch Health Companies Inc.,purchase,"$1,001 - $15,000",NV03,https://disclosures-clerk.house.gov/public_dis...,False
Hon. Susie Lee,2020,04/28/2020,2020-03-06,joint,BAH,Booz Allen Hamilton Holding Corporation,sale_full,"$1,001 - $15,000",NV03,https://disclosures-clerk.house.gov/public_dis...,True
Hon. Susie Lee,2020,04/28/2020,2020-03-23,joint,BYD,Boyd Gaming Corporation,sale_full,"$1,001 - $15,000",NV03,https://disclosures-clerk.house.gov/public_dis...,False
Hon. Susie Lee,2020,04/28/2020,2020-02-04,joint,CTLT,"Catalent, Inc.",purchase,"$1,001 - $15,000",NV03,https://disclosures-clerk.house.gov/public_dis...,False
Hon. Susie Lee,2020,04/28/2020,2020-03-18,joint,CBRE,CBRE Group Inc Common Stock Class A,sale_partial,"$1,001 - $15,000",NV03,https://disclosures-clerk.house.gov/public_dis...,False


**Conclusions**: The transaction date conversion worked. `asset_type` is not in the representative data. `type` carries the same information in both dataframes but uses different labels, so it needs to be standardized.

### Reduce to columns shared between dataframes

In [121]:
# Keep only the columns that exist in both the senate and house files.
# .copy() makes the result an independent frame: the 'type' column is edited
# in place later on, and doing that on a bare column subset is the
# "setting on a slice" pattern that triggers SettingWithCopyWarning.
senators = senators[['transaction_date', 'owner', 'ticker', 'asset_description', 'amount', 'ptr_link', 'type']].copy()

In [122]:
# Confirm the senate frame now holds only the shared columns
senators.iloc[:5]

Unnamed: 0_level_0,transaction_date,owner,ticker,asset_description,amount,ptr_link,type
senator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Thomas R Carper,2021-05-26,Spouse,,"HSBC Contingent Autocall BMY <div class=""text-...","$15,001 - $50,000",https://efdsearch.senate.gov/search/view/ptr/2...,Sale (Full)
Thomas R Carper,2021-05-26,Spouse,,"HSBC Contingent Autocall BMY <div class=""text-...","$15,001 - $50,000",https://efdsearch.senate.gov/search/view/ptr/2...,Sale (Full)
Thomas R Carper,2021-05-07,Spouse,,"CS Contingent Autocall CSCO <div class=""text-m...","$15,001 - $50,000",https://efdsearch.senate.gov/search/view/ptr/2...,Purchase
Mike Rounds,2021-05-01,Joint,,"Bird Dog Hospitality Fund 1, LLC <div class=""t...","$15,001 - $50,000",https://efdsearch.senate.gov/search/view/ptr/3...,Purchase
Shelley M Capito,2021-04-26,Spouse,WFC,Wells Fargo &amp; Company,"$1,001 - $15,000",https://efdsearch.senate.gov/search/view/ptr/8...,Sale (Partial)


In [123]:
# Reduce the house data to the shared columns, in the same order used for the senate data
shared_columns = ['transaction_date', 'owner', 'ticker', 'asset_description', 'amount', 'ptr_link', 'type']
reps = reps.loc[:, shared_columns]

In [124]:
# Confirm the house frame now holds only the shared columns
reps.iloc[:5]

Unnamed: 0_level_0,transaction_date,owner,ticker,asset_description,amount,ptr_link,type
representative,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Hon. Susie Lee,2020-02-24,joint,BHC,Bausch Health Companies Inc.,"$1,001 - $15,000",https://disclosures-clerk.house.gov/public_dis...,purchase
Hon. Susie Lee,2020-03-06,joint,BAH,Booz Allen Hamilton Holding Corporation,"$1,001 - $15,000",https://disclosures-clerk.house.gov/public_dis...,sale_full
Hon. Susie Lee,2020-03-23,joint,BYD,Boyd Gaming Corporation,"$1,001 - $15,000",https://disclosures-clerk.house.gov/public_dis...,sale_full
Hon. Susie Lee,2020-02-04,joint,CTLT,"Catalent, Inc.","$1,001 - $15,000",https://disclosures-clerk.house.gov/public_dis...,purchase
Hon. Susie Lee,2020-03-18,joint,CBRE,CBRE Group Inc Common Stock Class A,"$1,001 - $15,000",https://disclosures-clerk.house.gov/public_dis...,sale_partial


### Standardize type column

In [125]:
# Labels used for transaction type in the senate data, with their frequencies
senators.loc[:, 'type'].value_counts()

Purchase          4200
Sale (Full)       2064
Sale (Partial)    1735
Exchange            93
Name: type, dtype: int64

In [126]:
# Labels used for transaction type in the house data, with their frequencies
reps.loc[:, 'type'].value_counts()

purchase        5114
sale_full       3283
sale_partial    1659
exchange          99
Name: type, dtype: int64

In [127]:
# Translate the senate spellings of `type` into the labels the house file
# uses, so both frames share one vocabulary before they are combined.
# A single mapping replaces four copy-pasted .loc assignments; values not
# listed here (including NaN) pass through unchanged.
senate_to_house_type = {
    'Purchase': 'purchase',
    'Sale (Full)': 'sale_full',
    'Sale (Partial)': 'sale_partial',
    'Exchange': 'exchange',
}
senators['type'] = senators['type'].replace(senate_to_house_type)

In [131]:
# Re-check the senate labels after standardization; counts per label should be unchanged
senators.loc[:, 'type'].value_counts()

purchase        4200
sale_full       2064
sale_partial    1735
exchange          93
Name: type, dtype: int64

In [132]:
# House labels again, for side-by-side comparison with the standardized senate labels
reps.loc[:, 'type'].value_counts()

purchase        5114
sale_full       3283
sale_partial    1659
exchange          99
Name: type, dtype: int64

In [133]:
# Sort senate transaction dates oldest-first to read off the earliest and latest dates
senators['transaction_date'].sort_values(ascending=True)

senator
Lamar Alexander     2012-06-14
Benjamin L Cardin   2012-07-25
Thomas R Carper     2012-08-02
Robert J Portman    2012-08-15
Pat Roberts         2012-08-16
                       ...    
Mike Rounds         2021-05-01
Thomas R Carper     2021-05-07
Richard M Burr      2021-05-17
Thomas R Carper     2021-05-26
Thomas R Carper     2021-05-26
Name: transaction_date, Length: 8556, dtype: datetime64[ns]

**Conclusion**: The senate dataset runs from June 2012 to May 2021.

In [134]:
# Sort house transaction dates oldest-first to read off the earliest and latest dates
reps['transaction_date'].sort_values(ascending=True)

representative
Hon. Carol Devine Miller   2018-12-27
Hon. Donna Shalala         2019-01-09
Hon. Donna Shalala         2019-01-09
Hon. Donna Shalala         2019-01-09
Hon. Donna Shalala         2019-01-09
                              ...    
Hon. Mo Brooks             2021-05-14
Hon. Hakeem S. Jeffries    2021-05-17
Hon. Bob Gibbs             2021-05-25
Hon. Bob Gibbs             2021-05-25
Hon. Bob Gibbs             2021-05-25
Name: transaction_date, Length: 10155, dtype: datetime64[ns]

**Conclusion**: The house dataset runs from December 2018 to May 2021.

In [135]:
# Stack the senate and house frames into one dataframe (senate rows first, then house rows)

df = pd.concat(objs=[senators, reps], axis=0)

In [136]:
# Preview the first 15 rows of the combined frame
df.iloc[:15]

Unnamed: 0,transaction_date,owner,ticker,asset_description,amount,ptr_link,type
Thomas R Carper,2021-05-26,Spouse,,"HSBC Contingent Autocall BMY <div class=""text-...","$15,001 - $50,000",https://efdsearch.senate.gov/search/view/ptr/2...,sale_full
Thomas R Carper,2021-05-26,Spouse,,"HSBC Contingent Autocall BMY <div class=""text-...","$15,001 - $50,000",https://efdsearch.senate.gov/search/view/ptr/2...,sale_full
Thomas R Carper,2021-05-07,Spouse,,"CS Contingent Autocall CSCO <div class=""text-m...","$15,001 - $50,000",https://efdsearch.senate.gov/search/view/ptr/2...,purchase
Mike Rounds,2021-05-01,Joint,,"Bird Dog Hospitality Fund 1, LLC <div class=""t...","$15,001 - $50,000",https://efdsearch.senate.gov/search/view/ptr/3...,purchase
Shelley M Capito,2021-04-26,Spouse,WFC,Wells Fargo &amp; Company,"$1,001 - $15,000",https://efdsearch.senate.gov/search/view/ptr/8...,sale_partial
"Jerry Moran,",2020-04-01,Joint,VZ,Verizon Communications Inc. (Exchanged) <br> T...,"$1,001 - $15,000",https://efdsearch.senate.gov/search/view/ptr/1...,exchange
Richard M Burr,2021-05-17,,,This filing was disclosed via scanned PDF. Use...,Unknown,https://efdsearch.senate.gov/search/view/paper...,
Susan M Collins,2021-04-29,Spouse,,"ME St Muni Bond Bank 56045RWK1 <div class=""tex...","$15,001 - $50,000",https://efdsearch.senate.gov/search/view/ptr/b...,purchase
Susan M Collins,2021-04-29,Spouse,,ME St Health &amp; Higher Ed 560427VR4 <div cl...,"$50,001 - $100,000",https://efdsearch.senate.gov/search/view/ptr/b...,purchase
Tina Smith,2021-04-11,Spouse,,"Surgical Information Sciences <div class=""text...","$1,001 - $15,000",https://efdsearch.senate.gov/search/view/ptr/1...,purchase


In [137]:
# Sanity check on the combined frame: the row count should equal the senate rows plus the house rows
df.shape

(18711, 7)

### Deal with Null and blank values

In [138]:
# Number of missing values in each column
df.isna().sum()

transaction_date        0
owner                5401
ticker               2995
asset_description       4
amount                  0
ptr_link                0
type                  464
dtype: int64

In [139]:
# Drop every row that is missing a ticker or a transaction type, since these
# cannot be filled in with anything. This removes 2,995 of the 18,711 rows.
required_columns = ['ticker', 'type']
df = df.dropna(axis=0, how='any', subset=required_columns)
df.shape

(15716, 7)

#### Fill null values in categorical/name columns with appropriate values 

In [140]:
# Label missing owners explicitly instead of leaving NaN.
# The result is assigned back rather than calling fillna(inplace=True) on
# df['owner']: that chained in-place form operates on a temporary Series,
# is deprecated, and does not update df under pandas' Copy-on-Write behaviour.
df['owner'] = df['owner'].fillna("Unknown")

In [141]:
# Give rows without an asset description a placeholder string.
# The result is assigned back rather than calling fillna(inplace=True) on
# df['asset_description']: that chained in-place form operates on a temporary
# Series, is deprecated, and does not update df under pandas' Copy-on-Write behaviour.
# NOTE(review): recent pandas versions list the literal "None" among read_csv's
# default NA strings, so this placeholder may be read back as NaN when the saved
# CSV is reloaded -- verify for the pandas version in use.
df['asset_description'] = df['asset_description'].fillna("None")

In [142]:
# Confirm that no missing values remain after the drop and the fills
df.isna().sum()

transaction_date     0
owner                0
ticker               0
asset_description    0
amount               0
ptr_link             0
type                 0
dtype: int64

In [52]:
# TODO: remodel data based on problem statements

In [None]:
# TO DO LAST. MAKE SURE I HAVE DIFFERENT NAMES. 

# saving to a file for the modeling
# The index holds each member's name (it was set with index_col when the files
# were read), so it has to be written out: index=False would silently drop who
# made each trade. The combined frame's index has no name (the two source
# frames used different index names), so the column is labelled explicitly.
df.to_csv('./data/cleaned_data.csv', index=True, index_label='member')