### Notebook that takes our preliminary dataset, and using the trailing 1 month returns creates several options to use in a Supervised Learning project.

In [1]:
#Import Libraries
import pandas as pd
import numpy as np

In [2]:
#Load preliminary data
#csv_data = pd.read_csv("assets\SP500_Index_Data_partial_clean.csv")
csv_data = pd.read_csv('SP500_Index_Data_partial_clean.csv')

csv_data

Unnamed: 0,Date,Ticker,Name,Sector,USD_FF_mktcap,Price,Local_Returns_12m,Local_Returns_1m,Trail_DivYld,PB,Trail_EV_EBITDA,Trail_PE,Trail_EPS,Trail3yrAvg_EPSgro,Trail3yrAvg_DPSgro,Volatility,Debt_to_MktCap,NetDebt_EBITDA
0,2021-06-30,LYB,LYONDELLBASELL-A,Materials,27043.337699,102.8700,0.639834,-0.077692,4.1606,3.9080,13.3578,13.4336,7.6577,16.9882,4.0447,40.193,0.4833,3.9745
1,2021-06-30,AXP,AMERICAN EXPRESS,Financials,99744.800390,165.2300,0.762142,0.031849,1.0410,5.4268,11.4500,27.1314,8.6000,165.5851,7.8788,34.862,0.3285,0.3249
2,2021-06-30,VZ,VERIZON COMMUNIC,Communication Services,223036.294032,56.0300,0.061385,-0.008143,4.4574,3.1481,8.6097,10.4378,5.3680,10.8359,2.0861,15.239,0.7723,3.6833
3,2021-06-30,AVGO,BROADCOM INC,Information Technology,188117.287931,476.8400,0.561110,0.017371,2.9465,9.6650,17.2061,40.4239,11.7960,23.7943,30.8588,30.250,0.2067,2.3997
4,2021-06-30,BA,BOEING CO/THE,Industrials,139920.017166,239.5600,0.306928,-0.030200,0.0000,,,,-8.1120,-51.6859,0.0000,46.282,0.4538,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123863,2000-12-31,MDR,MCDERMOTT INTL,Energy,649.884800,,0.198454,0.194444,2.7033,0.4328,12.0836,20.5509,,,-9.1440,68.622,0.6455,3.8149
123864,2000-12-31,EHC,ENCOMPASS HEALTH,Health Care,6301.807300,81.5625,2.034884,0.170404,0.0000,,,,,,0.0000,68.515,,
123865,2000-12-31,AAMRQ,AMR CORP,Industrials,5905.137000,39.1875,0.307889,0.171963,0.0000,0.8304,,8.4821,4.6200,-46.7452,0.0000,51.599,1.0618,1.5629
123866,2000-12-31,CMCSK,COMCAST CORP-SPL,Communication Services,37681.138500,27.8333,-0.174289,0.086179,0.0000,2.7026,19.0587,2065.6250,-0.0067,,0.0000,59.400,0.2869,2.8742


In [3]:
### Create the FORWARD/FUTURE labels we are trying to predict
### Normalized (z-scored) return, deciled return, quintiled return

df = csv_data.copy()
columns = ['Date','Ticker','Name','Sector','Local_Returns_1m']
df = df[columns]
#display(df.head())
df = df.rename(mapper={'Local_Returns_1m':'PRIOR_Returns_1m'} ,axis=1)
df = df.sort_values(by='Date', ascending=True)
df['FUTURE_Returns_1m'] = df.groupby(['Ticker'])['PRIOR_Returns_1m'].shift(1)

### Remove NaN's from FUTURE_RETURNS, as this is our target labels dataset
print('Rows prior to dropping NaNs from FUTURE_RETURNS:', len(df))
before = len(df)
df = df.dropna().reset_index(drop=True)
print('Rows after to dropping NaNs from FUTURE_RETURNS:', len(df))
after = len(df)
print('Lost Rows after to dropping NaNs from FUTURE_RETURNS:', after-before)
#display(df)

### Normalized (z-score Scaled) Future Returns
df['FUTURE_Returns_zscore_overall'] = df.groupby(['Date'])['FUTURE_Returns_1m'].transform( 
    lambda x: (x - np.mean(x)) / np.std(x) )
df['FUTURE_Returns_zscore_bySector'] = df.groupby(['Date','Sector'])['FUTURE_Returns_1m'].transform( 
    lambda x: (x - np.mean(x)) / np.std(x) )

### Normalized (Min Max Scaled) Future Returns
df['FUTURE_Returns_MinMax_overall'] = df.groupby(['Date'])['FUTURE_Returns_1m'].transform( 
    lambda x: (x - np.min(x)) / (np.max(x) -np.min(x)) )
df['FUTURE_Returns_MinMax_bySector'] = df.groupby(['Date','Sector'])['FUTURE_Returns_1m'].transform( 
    lambda x: (x - np.min(x)) / (np.max(x) -np.min(x)) )

### Quintiles of Future Returns  (Higher Rank means Higher Return)
q = 5
df['FUTURE_Returns_quintile_overall'] = df.groupby(['Date'])['FUTURE_Returns_1m'].transform( 
    lambda x: pd.qcut(x, q, labels=False, duplicates='drop'))      ### OLD NOTES #.transform( lambda x: pd.qcut(x, 5, labels=['Quant5','Quant4','Quant3','Quant2','Quant1'])) #labels=[5.0,4.0,3.0,2.0,1.0]))  #labels=(range(1,6)) #duplicates='drop' #better handles instances where there are a lot of 0's like div yld and div growth
df['FUTURE_Returns_quintile_bySector'] = df.groupby(['Date','Sector'])['FUTURE_Returns_1m'].transform( 
    lambda x: pd.qcut(x, q, labels=False, duplicates='drop'))

d={0:0, 1:2, 2:2, 3:2, 4:4}
df['FUTURE_Returns_top_bot_100_overall']=df.FUTURE_Returns_quintile_overall.map(d)
df['FUTURE_Returns_top_bot_100_bySector']=df.FUTURE_Returns_quintile_bySector.map(d)

### Deciles of Future Returns  (Higher Rank means Higher Return)
q = 10
df['FUTURE_Returns_decile_overall'] = df.groupby(['Date'])['FUTURE_Returns_1m'].transform( 
    lambda x: pd.qcut(x, q, labels=False, duplicates='drop'))      ### OLD NOTES #.transform( lambda x: pd.qcut(x, 5, labels=['Quant5','Quant4','Quant3','Quant2','Quant1'])) #labels=[5.0,4.0,3.0,2.0,1.0]))  #labels=(range(1,6)) #duplicates='drop' #better handles instances where there are a lot of 0's like div yld and div growth
df['FUTURE_Returns_decile_bySector'] = df.groupby(['Date','Sector'])['FUTURE_Returns_1m'].transform( 
    lambda x: pd.qcut(x, q, labels=False, duplicates='drop'))

d1={0:0, 1:5, 2:5, 3:5, 4:5, 5:5, 6:5, 7:5, 8:5, 9:10}
df['FUTURE_Returns_top_bot_50_overall']=df.FUTURE_Returns_decile_overall.map(d1)
df['FUTURE_Returns_top_bot_50_bySector']=df.FUTURE_Returns_decile_bySector.map(d1)

#df = df[(df.Ticker=='IBM') | (df.Ticker=='AMZN')]    
df

Rows prior to dropping NaNs from FUTURE_RETURNS: 123868
Rows after to dropping NaNs from FUTURE_RETURNS: 122467
Lost Rows after to dropping NaNs from FUTURE_RETURNS: -1401


Unnamed: 0,Date,Ticker,Name,Sector,PRIOR_Returns_1m,FUTURE_Returns_1m,FUTURE_Returns_zscore_overall,FUTURE_Returns_zscore_bySector,FUTURE_Returns_MinMax_overall,FUTURE_Returns_MinMax_bySector,FUTURE_Returns_quintile_overall,FUTURE_Returns_quintile_bySector,FUTURE_Returns_top_bot_100_overall,FUTURE_Returns_top_bot_100_bySector,FUTURE_Returns_decile_overall,FUTURE_Returns_decile_bySector,FUTURE_Returns_top_bot_50_overall,FUTURE_Returns_top_bot_50_bySector
0,2001-01-31,ROH,ROHM AND HAAS CO,Materials,-0.011360,0.220588,1.106160,0.948310,0.532108,0.664444,4.0,4.0,4,4.0,8.0,8.0,5,5.0
1,2001-01-31,0226226D,CENTEX LLC,Consumer Discretionary,0.087255,0.062952,-0.031521,-0.117140,0.412020,0.474688,2.0,1.0,2,2.0,4.0,3.0,5,5.0
2,2001-01-31,MSI,MOTOROLA SOLUTIO,Information Technology,0.126420,0.011352,-0.403920,0.245118,0.372712,0.566630,1.0,3.0,2,2.0,2.0,6.0,5,5.0
3,2001-01-31,WYE,WYETH LLC,Health Care,-0.070024,0.056965,-0.074728,-0.054685,0.407459,0.319632,2.0,2.0,2,2.0,4.0,5.0,5,5.0
4,2001-01-31,MMC,MARSH & MCLENNAN,Financials,-0.071524,0.016287,-0.368306,-0.912410,0.376471,0.076315,1.0,0.0,2,0.0,3.0,1.0,5,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122462,2021-06-30,ROK,ROCKWELL AUTOMAT,Industrials,0.084559,0.001949,-0.301005,-0.633877,0.383970,0.367202,1.0,1.0,2,2.0,3.0,2.0,5,5.0
122463,2021-06-30,KHC,KRAFT HEINZ CO/T,Consumer Staples,-0.064464,0.065427,0.791089,1.553389,0.524626,1.000000,4.0,4.0,4,4.0,8.0,9.0,5,10.0
122464,2021-06-30,AMT,AMERICAN TOWER C,Real Estate,0.062448,0.002708,-0.287941,-0.408930,0.385653,0.409444,1.0,1.0,2,2.0,3.0,3.0,5,5.0
122465,2021-06-30,DVA,DAVITA INC,Health Care,0.002998,0.030378,0.188105,0.266416,0.446965,0.469444,2.0,3.0,2,2.0,5.0,6.0,5,5.0


In [4]:
df.isna().sum()

Date                                   0
Ticker                                 0
Name                                   0
Sector                                 0
PRIOR_Returns_1m                       0
FUTURE_Returns_1m                      0
FUTURE_Returns_zscore_overall          0
FUTURE_Returns_zscore_bySector         4
FUTURE_Returns_MinMax_overall          0
FUTURE_Returns_MinMax_bySector         4
FUTURE_Returns_quintile_overall        0
FUTURE_Returns_quintile_bySector       4
FUTURE_Returns_top_bot_100_overall     0
FUTURE_Returns_top_bot_100_bySector    4
FUTURE_Returns_decile_overall          0
FUTURE_Returns_decile_bySector         4
FUTURE_Returns_top_bot_50_overall      0
FUTURE_Returns_top_bot_50_bySector     4
dtype: int64

Sanity Checks, Checks for NA's

In [5]:
## Check for any NA's 
df[df.isna().any(axis=1)]
### Looks like only 4 entries, each is the same company in the Real Estate Sector
### We suspect this is because there was only 1 stock in the Real Estate Sector at that time

Unnamed: 0,Date,Ticker,Name,Sector,PRIOR_Returns_1m,FUTURE_Returns_1m,FUTURE_Returns_zscore_overall,FUTURE_Returns_zscore_bySector,FUTURE_Returns_MinMax_overall,FUTURE_Returns_MinMax_bySector,FUTURE_Returns_quintile_overall,FUTURE_Returns_quintile_bySector,FUTURE_Returns_top_bot_100_overall,FUTURE_Returns_top_bot_100_bySector,FUTURE_Returns_decile_overall,FUTURE_Returns_decile_bySector,FUTURE_Returns_top_bot_50_overall,FUTURE_Returns_top_bot_50_bySector
4396,2001-09-30,WY,WEYERHAEUSER CO,Real Estate,-0.141674,-0.043441,-0.012677,,0.449166,,2.0,,2,,4.0,,5,
4891,2001-10-31,WY,WEYERHAEUSER CO,Real Estate,0.032847,-0.141674,-0.122327,,0.530812,,1.0,,2,,3.0,,5,
5389,2001-11-30,WY,WEYERHAEUSER CO,Real Estate,0.058906,0.032847,-0.058652,,0.447175,,2.0,,2,,5.0,,5,
5834,2001-12-31,WY,WEYERHAEUSER CO,Real Estate,0.023273,0.058906,-0.376671,,0.4162,,1.0,,2,,3.0,,5,


In [6]:
### We suspect this is because there was only 1 stock in the Real Estate Sector at that time
df[(df.Date=='2001-09-30') & (df.Sector=='Real Estate')]
### This is confirmed

Unnamed: 0,Date,Ticker,Name,Sector,PRIOR_Returns_1m,FUTURE_Returns_1m,FUTURE_Returns_zscore_overall,FUTURE_Returns_zscore_bySector,FUTURE_Returns_MinMax_overall,FUTURE_Returns_MinMax_bySector,FUTURE_Returns_quintile_overall,FUTURE_Returns_quintile_bySector,FUTURE_Returns_top_bot_100_overall,FUTURE_Returns_top_bot_100_bySector,FUTURE_Returns_decile_overall,FUTURE_Returns_decile_bySector,FUTURE_Returns_top_bot_50_overall,FUTURE_Returns_top_bot_50_bySector
4396,2001-09-30,WY,WEYERHAEUSER CO,Real Estate,-0.141674,-0.043441,-0.012677,,0.449166,,2.0,,2,,4.0,,5,


In [7]:
df.isna().sum()

Date                                   0
Ticker                                 0
Name                                   0
Sector                                 0
PRIOR_Returns_1m                       0
FUTURE_Returns_1m                      0
FUTURE_Returns_zscore_overall          0
FUTURE_Returns_zscore_bySector         4
FUTURE_Returns_MinMax_overall          0
FUTURE_Returns_MinMax_bySector         4
FUTURE_Returns_quintile_overall        0
FUTURE_Returns_quintile_bySector       4
FUTURE_Returns_top_bot_100_overall     0
FUTURE_Returns_top_bot_100_bySector    4
FUTURE_Returns_decile_overall          0
FUTURE_Returns_decile_bySector         4
FUTURE_Returns_top_bot_50_overall      0
FUTURE_Returns_top_bot_50_bySector     4
dtype: int64

In [8]:
### Before exporting, drop the remaining na's
df = df.dropna().reset_index(drop=True)
#df

In [9]:
df.isna().sum()

Date                                   0
Ticker                                 0
Name                                   0
Sector                                 0
PRIOR_Returns_1m                       0
FUTURE_Returns_1m                      0
FUTURE_Returns_zscore_overall          0
FUTURE_Returns_zscore_bySector         0
FUTURE_Returns_MinMax_overall          0
FUTURE_Returns_MinMax_bySector         0
FUTURE_Returns_quintile_overall        0
FUTURE_Returns_quintile_bySector       0
FUTURE_Returns_top_bot_100_overall     0
FUTURE_Returns_top_bot_100_bySector    0
FUTURE_Returns_decile_overall          0
FUTURE_Returns_decile_bySector         0
FUTURE_Returns_top_bot_50_overall      0
FUTURE_Returns_top_bot_50_bySector     0
dtype: int64

In [10]:
### Three are probably better stats to summarize this set of 'sanity checks', but this just gives a reasonable check

print('Number of y-variables', len(df))
print('Number of different methods to quantify our y-variable', len(df.columns)-len(['Date','Ticker','Name','Sector']))
print('**************')
print('If we Normalize (by z-score) across the universe, each quintile label would have *roughly* {} stocks for each label'.format( 
    len(df.FUTURE_Returns_zscore_overall) / len(df.Date.unique())  ) )
print('If we Normalize (by z-score) by Sector, each quintile label would have *roughly* {} stocks for each label'.format( 
    len(df.FUTURE_Returns_zscore_bySector) / len(df.Date.unique())  ) )
print('**************')
print('If we Normalize (by Min Max) across the universe, each quintile label would have *roughly* {} stocks for each label'.format( 
    len(df.FUTURE_Returns_MinMax_overall) / len(df.Date.unique())  ) )
print('If we Normalize (by Min Max) by Sector, each quintile label would have *roughly* {} stocks for each label'.format( 
    len(df.FUTURE_Returns_MinMax_bySector) / len(df.Date.unique())  ) )
print('**************')
print('If we Quintile across the universe, each quintile label would have *roughly* {} stocks for each label'.format( 
    len(df[df.FUTURE_Returns_quintile_overall==1]) / len(df.Date.unique()) ) )
print('If we Quintile by Sector, each quintile label would have *roughly* {} stocks for each label'.format( 
    len(df[df.FUTURE_Returns_quintile_bySector==1]) / len(df.Date.unique()) ) )
print('***********')
print('If we Decile across the universe, each quintile label would have *roughly* {} stocks for each label'.format( 
    len(df[df.FUTURE_Returns_decile_overall==1]) / len(df.Date.unique()) ) )
print('If we Decile by Sector, each quintile label would have *roughly* {} stocks for each label'.format( 
    len(df[df.FUTURE_Returns_decile_bySector==1]) / len(df.Date.unique()) ) )

Number of y-variables 122463
Number of different methods to quantify our y-variable 14
**************
If we Normalize (by z-score) across the universe, each quintile label would have *roughly* 497.8170731707317 stocks for each label
If we Normalize (by z-score) by Sector, each quintile label would have *roughly* 497.8170731707317 stocks for each label
**************
If we Normalize (by Min Max) across the universe, each quintile label would have *roughly* 497.8170731707317 stocks for each label
If we Normalize (by Min Max) by Sector, each quintile label would have *roughly* 497.8170731707317 stocks for each label
**************
If we Quintile across the universe, each quintile label would have *roughly* 99.369918699187 stocks for each label
If we Quintile by Sector, each quintile label would have *roughly* 97.3780487804878 stocks for each label
***********
If we Decile across the universe, each quintile label would have *roughly* 49.77235772357724 stocks for each label
If we Decile by 

Export to Google Drive

In [11]:
# export to Google Drive
# df.to_csv("assets\SP500_Supervised_Y_Variable_Options.csv", index=False)
# df.to_csv("/content/gdrive/Shareddrives/Milestone2/SP500_Supervised_Y_Variable_Options.csv", index=False)

To do further views of the y-variable options, please put below this cell


In [12]:
df.isna().sum()

Date                                   0
Ticker                                 0
Name                                   0
Sector                                 0
PRIOR_Returns_1m                       0
FUTURE_Returns_1m                      0
FUTURE_Returns_zscore_overall          0
FUTURE_Returns_zscore_bySector         0
FUTURE_Returns_MinMax_overall          0
FUTURE_Returns_MinMax_bySector         0
FUTURE_Returns_quintile_overall        0
FUTURE_Returns_quintile_bySector       0
FUTURE_Returns_top_bot_100_overall     0
FUTURE_Returns_top_bot_100_bySector    0
FUTURE_Returns_decile_overall          0
FUTURE_Returns_decile_bySector         0
FUTURE_Returns_top_bot_50_overall      0
FUTURE_Returns_top_bot_50_bySector     0
dtype: int64

In [13]:
len(df)

122463