### Import libraries


In [2]:
# Analysis modules
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

### Load data


In [23]:
# Load the raw data set. The path is relative, so it is resolved against the
# kernel's current working directory.
path_to_data  = '../../data/OriginalData.csv'
data = pd.read_csv(path_to_data)

# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
data.info()

# List every column name on its own line.
print('\nKEYS\n----')
for column_name in data.keys():
    print(column_name)
print()

# Rich (HTML) preview of the first rows.
display(data.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15748 entries, 0 to 15747
Data columns (total 49 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Dates                             15748 non-null  object 
 1   symbol                            15748 non-null  object 
 2   PX_LAST                           15748 non-null  float64
 3   Dividend_Per_Share                15748 non-null  float64
 4   TOTAL_EQUITY                      15748 non-null  float64
 5   CUR_MKT_CAP                       15748 non-null  float64
 6   gics_sector_name                  15748 non-null  object 
 7   gics_sub_industry_name            15748 non-null  object 
 8   gics_industry_name                15748 non-null  object 
 9   gics_industry_group_name          15748 non-null  object 
 10  gics_industry                     15748 non-null  int64  
 11  gics_sector                       15748 non-null  int64  
 12  Year

Unnamed: 0,Dates,symbol,PX_LAST,Dividend_Per_Share,TOTAL_EQUITY,CUR_MKT_CAP,gics_sector_name,gics_sub_industry_name,gics_industry_name,gics_industry_group_name,...,NUM_CEOS_EQUIV_INC_COMP_PD,CEO_TENURE_AS_OF_FY_END,TOT_COMP_AW_TO_CFO_EQUIV,TOT_COMP_AW_TO_COO_EQUIV,TOTAL_BOD_COMPENSATION_AWARDED,TOTAL_BOD_FEES_PAID_IN_CASH,TOTAL_BOD_STOCK_AWARDS_GIVEN,NUMBER_OF_BOD_CHANGES_DURING_FY,DATE_OF_LAST_BOD_CHANGE,NUM_DIR_INC_IN_CMPNSTN_PD
0,2014-04-11,AAPL,18.558,0.1089,120179.0,463486.4114,Information Technology,"Technology Hardware, Storage & Peripherals","Technology Hardware, Storage & Peripherals",Technology Hardware & Equipment,...,1,3.75,18520521.0,0,2638340.0,697500.0,1906645.0,2,2014-07-17,8
1,2014-04-11,AMT,80.55,0.32,3701.605,31870.1921,Real Estate,Telecom Tower REITs,Specialized REITs,Equity Real Estate Investment Trusts (REITs),...,1,11.25,4904866.0,0,1975088.0,775000.0,600071.0,1,2014-12-18,9
2,2014-04-11,AMZN,15.587,0.0,10328.0,143440.4076,Consumer Discretionary,Broadline Retail,Broadline Retail,Consumer Discretionary Distribution & Retail,...,1,18.667,5962307.0,0,797000.0,0.0,797000.0,1,2014-07-31,9
3,2014-04-11,BHP,58.8727,0.0,82279.0,180544.3056,Materials,Diversified Metals & Mining,Metals & Mining,Materials,...,1,1.1667,4639000.0,0,4695000.0,3617000.0,0.0,1,2014-04-15,13
4,2014-04-11,CAT,101.45,0.6,20369.0,64707.075,Industrials,Construction Machinery & Heavy Transportation ...,Machinery,Capital Goods,...,1,4.5,5286968.0,0,3206983.0,1714174.0,1375110.0,1,2014-12-31,11


### Produce subset dataframe

In [24]:
# Columns to carry over from the original data set; everything else is dropped
# so that later cells work with a smaller frame.
relevant_data_keys = [
    'Dates',
    'Year',
    'symbol',
    'PX_LAST',
    'gics_sector_name',
    'DATE_OF_LAST_EXECUTIVE_CHANGE',
    'TOT_STK_AWD_GIVEN_TO_CEO_EQUIV',
    'TOT_OPT_AWD_GIVEN_TO_CEO_EQUIV',
    'TOT_BONUSES_PAID_TO_CEO_EQUIV',
    'TOT_N_EQT_INCENT_GVN_TO_CEO_EQ',
]

# Independent copy of just those columns, so the casts below do not touch `data`.
subset_data = data.loc[:, relevant_data_keys].copy()

# Both date columns are parsed into pandas datetimes.
for date_key in ('Dates', 'DATE_OF_LAST_EXECUTIVE_CHANGE'):
    subset_data[date_key] = pd.to_datetime(subset_data[date_key])

# Print the dtype summary, then show the frame as the cell's output.
subset_data.info()
subset_data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15748 entries, 0 to 15747
Data columns (total 10 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   Dates                           15748 non-null  datetime64[ns]
 1   Year                            15748 non-null  int64         
 2   symbol                          15748 non-null  object        
 3   PX_LAST                         15748 non-null  float64       
 4   gics_sector_name                15748 non-null  object        
 5   DATE_OF_LAST_EXECUTIVE_CHANGE   15748 non-null  datetime64[ns]
 6   TOT_STK_AWD_GIVEN_TO_CEO_EQUIV  15748 non-null  float64       
 7   TOT_OPT_AWD_GIVEN_TO_CEO_EQUIV  15748 non-null  int64         
 8   TOT_BONUSES_PAID_TO_CEO_EQUIV   15748 non-null  int64         
 9   TOT_N_EQT_INCENT_GVN_TO_CEO_EQ  15748 non-null  int64         
dtypes: datetime64[ns](2), float64(2), int64(4), object(2)
memory usage: 1.

Unnamed: 0,Dates,Year,symbol,PX_LAST,gics_sector_name,DATE_OF_LAST_EXECUTIVE_CHANGE,TOT_STK_AWD_GIVEN_TO_CEO_EQUIV,TOT_OPT_AWD_GIVEN_TO_CEO_EQUIV,TOT_BONUSES_PAID_TO_CEO_EQUIV,TOT_N_EQT_INCENT_GVN_TO_CEO_EQ
0,2014-04-11,2014,AAPL,18.5580,Information Technology,2014-05-29,0.0,0,0,6700000
1,2014-04-11,2014,AMT,80.5500,Real Estate,2013-12-31,4500051.0,4500007,0,2574000
2,2014-04-11,2014,AMZN,15.5870,Consumer Discretionary,2006-04-30,0.0,0,0,0
3,2014-04-11,2014,BHP,58.8727,Materials,2014-03-03,3338000.0,0,1568000,0
4,2014-04-11,2014,CAT,101.4500,Industrials,2013-01-01,0.0,8377481,0,4913288
...,...,...,...,...,...,...,...,...,...,...
15743,2023-12-29,2023,UNH,526.4700,Health Care,2022-03-01,15000970.0,5000114,0,1800000
15744,2023-12-29,2023,V,260.3500,Financials,2023-08-01,28004734.0,10707531,0,10312500
15745,2023-12-29,2023,WFC,49.2200,Financials,2020-10-01,16634168.0,0,0,6625000
15746,2023-12-29,2023,WMT,52.5500,Consumer Staples,2022-06-06,30103720.0,0,0,4627582


### Isolate data for processing

In [27]:
# Keep only the rows whose ticker symbol is AAPL, giving a single-company frame.
is_aapl = subset_data['symbol'].eq('AAPL')
aapl_df = subset_data.loc[is_aapl]

display(aapl_df)

Unnamed: 0,Dates,Year,symbol,PX_LAST,gics_sector_name,DATE_OF_LAST_EXECUTIVE_CHANGE,TOT_STK_AWD_GIVEN_TO_CEO_EQUIV,TOT_OPT_AWD_GIVEN_TO_CEO_EQUIV,TOT_BONUSES_PAID_TO_CEO_EQUIV,TOT_N_EQT_INCENT_GVN_TO_CEO_EQ
0,2014-04-11,2014,AAPL,18.558,Information Technology,2014-05-29,0.0,0,0,6700000
31,2014-04-18,2014,AAPL,18.748,Information Technology,2014-05-29,0.0,0,0,6700000
62,2014-04-25,2014,AAPL,20.426,Information Technology,2014-05-29,0.0,0,0,6700000
93,2014-05-02,2014,AAPL,21.164,Information Technology,2014-05-29,0.0,0,0,6700000
124,2014-05-09,2014,AAPL,20.912,Information Technology,2014-05-29,0.0,0,0,6700000
...,...,...,...,...,...,...,...,...,...,...
15593,2023-12-01,2023,AAPL,191.240,Information Technology,2019-02-05,46970283.0,0,0,10713450
15624,2023-12-08,2023,AAPL,195.710,Information Technology,2019-02-05,46970283.0,0,0,10713450
15655,2023-12-15,2023,AAPL,197.570,Information Technology,2019-02-05,46970283.0,0,0,10713450
15686,2023-12-22,2023,AAPL,193.600,Information Technology,2019-02-05,46970283.0,0,0,10713450
