In [1]:
# 📚 Libraries 
import kagglehub
import pandas as pd
import numpy as np
import os

# 📊 Visualizations
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as g

# 🤖 Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error 

In [2]:
# Download latest version
path = kagglehub.dataset_download("andrewmvd/sp-500-stocks")


In [3]:
# Print all files in the dataset path
print(os.listdir(path))

['sp500_stocks.csv', 'sp500_companies.csv', 'sp500_index.csv']


In [4]:
csv_file_path = os.path.join(path, 'sp500_stocks.csv')
csv_file_path2 = os.path.join(path, 'sp500_companies.csv')
csv_file_path3 = os.path.join(path, 'sp500_index.csv')
data = pd.read_csv(csv_file_path)
df = pd.read_csv(csv_file_path2)
sp = pd.read_csv(csv_file_path3)

In [5]:
data.columns = [col.lower().replace(" ", "_")for col in data.columns] #snake_case
df.columns = [col.lower().replace(" ", "_")for col in df.columns] #snake_case
sp.columns = [col.lower().replace(" ", "_")for col in sp.columns] #snake_case

In [6]:
data2 = data.copy()

In [7]:
data2.dtypes

date          object
symbol        object
adj_close    float64
close        float64
high         float64
low          float64
open         float64
volume       float64
dtype: object

In [8]:
# Delete Columns 
data2.drop(columns=['high', 'low', 'open','close'], inplace=True)

In [9]:
data2.isna().sum()

date               0
symbol             0
adj_close    1272825
volume       1272825
dtype: int64

In [10]:
data2.dropna(how='any', inplace=True)

In [11]:
data2

Unnamed: 0,date,symbol,adj_close,volume
3759,2010-01-04,AOS,5.937269,1104600.0
3760,2010-01-05,AOS,5.861406,1207200.0
3761,2010-01-06,AOS,5.864067,663000.0
3762,2010-01-07,AOS,5.881371,564000.0
3763,2010-01-08,AOS,5.967880,504600.0
...,...,...,...,...
1868218,2024-12-03,WYNN,93.760002,3045300.0
1868219,2024-12-04,WYNN,94.790001,1958200.0
1868220,2024-12-05,WYNN,94.400002,1357800.0
1868221,2024-12-06,WYNN,94.370003,1026500.0


In [12]:
data2['date'] = pd.to_datetime(data2['date'])

In [13]:
data2.dtypes

date         datetime64[ns]
symbol               object
adj_close           float64
volume              float64
dtype: object

In [14]:
data2['year'] = data2['date'].dt.year
data2['month'] = data2['date'].dt.month
data2['day'] = data2['date'].dt.day

In [15]:
data2

Unnamed: 0,date,symbol,adj_close,volume,year,month,day
3759,2010-01-04,AOS,5.937269,1104600.0,2010,1,4
3760,2010-01-05,AOS,5.861406,1207200.0,2010,1,5
3761,2010-01-06,AOS,5.864067,663000.0,2010,1,6
3762,2010-01-07,AOS,5.881371,564000.0,2010,1,7
3763,2010-01-08,AOS,5.967880,504600.0,2010,1,8
...,...,...,...,...,...,...,...
1868218,2024-12-03,WYNN,93.760002,3045300.0,2024,12,3
1868219,2024-12-04,WYNN,94.790001,1958200.0,2024,12,4
1868220,2024-12-05,WYNN,94.400002,1357800.0,2024,12,5
1868221,2024-12-06,WYNN,94.370003,1026500.0,2024,12,6


In [16]:
data2.columns

Index(['date', 'symbol', 'adj_close', 'volume', 'year', 'month', 'day'], dtype='object')

In [17]:
cols = ['year', 'month', 'day', 'symbol', 'adj_close', 'volume']

In [18]:
data2 = data2[cols]

In [19]:
data2

Unnamed: 0,year,month,day,symbol,adj_close,volume
3759,2010,1,4,AOS,5.937269,1104600.0
3760,2010,1,5,AOS,5.861406,1207200.0
3761,2010,1,6,AOS,5.864067,663000.0
3762,2010,1,7,AOS,5.881371,564000.0
3763,2010,1,8,AOS,5.967880,504600.0
...,...,...,...,...,...,...
1868218,2024,12,3,WYNN,93.760002,3045300.0
1868219,2024,12,4,WYNN,94.790001,1958200.0
1868220,2024,12,5,WYNN,94.400002,1357800.0
1868221,2024,12,6,WYNN,94.370003,1026500.0


In [20]:
data2.dtypes

year           int32
month          int32
day            int32
symbol        object
adj_close    float64
volume       float64
dtype: object

In [38]:
df1 = data2.copy()

In [39]:
df1

Unnamed: 0,year,month,day,symbol,adj_close,volume
0,2010,1,4,MMM,43.783867,3640265.0
1,2010,1,5,MMM,43.509628,3405012.0
2,2010,1,6,MMM,44.126682,6301126.0
3,2010,1,7,MMM,44.158325,5346240.0
4,2010,1,8,MMM,44.469463,4073337.0
...,...,...,...,...,...,...
1890269,2024,12,2,ZTS,176.809998,2391500.0
1890270,2024,12,3,ZTS,176.940002,2679000.0
1890271,2024,12,4,ZTS,175.320007,2687000.0
1890272,2024,12,5,ZTS,174.770004,2442000.0


In [21]:
# Drop rows where year is between 2010 and 2013
data2.drop(data2[(data2['year'] >= 2010) & (data2['year'] <= 2013)].index, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data2.drop(data2[(data2['year'] >= 2010) & (data2['year'] <= 2013)].index, inplace=True)


In [22]:
data2

Unnamed: 0,year,month,day,symbol,adj_close,volume
4765,2014,1,2,AOS,22.524332,1297000.0
4766,2014,1,3,AOS,22.520111,828200.0
4767,2014,1,6,AOS,22.503214,1045600.0
4768,2014,1,7,AOS,22.283543,1572200.0
4769,2014,1,8,AOS,22.030081,2465800.0
...,...,...,...,...,...,...
1868218,2024,12,3,WYNN,93.760002,3045300.0
1868219,2024,12,4,WYNN,94.790001,1958200.0
1868220,2024,12,5,WYNN,94.400002,1357800.0
1868221,2024,12,6,WYNN,94.370003,1026500.0


In [23]:
# Chat helped. 
annual_returns = data2.groupby(['symbol', 'year']).apply(lambda group: (group['adj_close'].iloc[-1] / group['adj_close'].iloc[0]) - 1).reset_index(name='annual_return')

  annual_returns = data2.groupby(['symbol', 'year']).apply(lambda group: (group['adj_close'].iloc[-1] / group['adj_close'].iloc[0]) - 1).reset_index(name='annual_return')


In [24]:
annual_returns[annual_returns['symbol'] == 'AAPL']

Unnamed: 0,symbol,year,annual_return


In [23]:
annual_returns

Unnamed: 0,symbol,year,annual_return
0,A,2010,0.323642
1,A,2011,-0.165950
2,A,2012,0.133482
3,A,2013,0.379639
4,A,2014,0.025399
...,...,...,...
7170,ZTS,2020,0.240854
7171,ZTS,2021,0.499937
7172,ZTS,2022,-0.369204
7173,ZTS,2023,0.356284


In [25]:
annual_returns[annual_returns['symbol'] == 'AAPL']

Unnamed: 0,symbol,year,annual_return


In [26]:
data2

Unnamed: 0,year,month,day,symbol,adj_close,volume
4765,2014,1,2,AOS,22.524332,1297000.0
4766,2014,1,3,AOS,22.520111,828200.0
4767,2014,1,6,AOS,22.503214,1045600.0
4768,2014,1,7,AOS,22.283543,1572200.0
4769,2014,1,8,AOS,22.030081,2465800.0
...,...,...,...,...,...,...
1868218,2024,12,3,WYNN,93.760002,3045300.0
1868219,2024,12,4,WYNN,94.790001,1958200.0
1868220,2024,12,5,WYNN,94.400002,1357800.0
1868221,2024,12,6,WYNN,94.370003,1026500.0


In [27]:
pivoted_df = annual_returns.pivot(index='symbol', columns='year', values='annual_return')

In [28]:
pivoted_df

year,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ABT,0.203977,0.021440,-0.082767,0.495366,0.252859,0.270512,0.279082,0.309829,-0.197121,0.024566,0.066845
ACGL,0.030155,0.192511,0.253304,0.056817,-0.092186,0.637648,-0.168894,0.273639,0.409203,0.189081,0.334904
ACN,0.129180,0.201626,0.173500,0.341030,-0.066858,0.516653,0.262923,0.636411,-0.335772,0.319045,0.052920
ADSK,0.219492,0.023518,0.227160,0.376083,0.200616,0.422612,0.625619,-0.052722,-0.341358,0.315042,0.317914
AEE,0.351037,-0.020330,0.262635,0.161050,0.151645,0.240482,0.052848,0.200617,0.030401,-0.156572,0.265086
...,...,...,...,...,...,...,...,...,...,...,...
WEC,0.340271,-0.000642,0.181550,0.174286,0.089595,0.404543,0.042507,0.106423,0.005630,-0.073921,0.169161
WTW,0.043010,0.120953,-0.005930,0.240716,0.050027,0.367731,0.046175,0.180712,0.053805,-0.001602,0.313689
WY,0.184719,-0.136496,0.051049,0.205196,-0.355361,0.478040,0.161548,0.297072,-0.192456,0.185777,-0.060273
WYNN,-0.223783,-0.516602,0.286850,0.959390,-0.386182,0.373702,-0.207683,-0.204490,-0.060385,0.072599,0.025293


In [29]:
definitive = pd.merge (df, pivoted_df, on='symbol')
definitive

Unnamed: 0,exchange,symbol,shortname,longname,sector,industry,currentprice,marketcap,ebitda,revenuegrowth,...,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
0,NMS,NVDA,NVIDIA Corporation,NVIDIA Corporation,Technology,Semiconductors,138.81,3399456915456,6.118400e+10,1.224,...,0.664514,2.329197,0.904264,-0.328455,0.734099,1.180241,1.244762,-0.514370,2.460984,1.882606
1,NMS,GOOGL,Alphabet Inc.,Alphabet Inc.,Communication Services,Internet Content & Information,175.37,2156594987008,1.234700e+11,0.151,...,0.469191,0.043466,0.303697,-0.026323,0.269949,0.280533,0.678344,-0.391482,0.567437,0.272376
2,NMS,META,"Meta Platforms, Inc.","Meta Platforms, Inc.",Communication Services,Internet Content & Information,613.57,1548951355392,7.920900e+10,0.189,...,0.334098,0.125514,0.510012,-0.277423,0.512751,0.302126,0.250651,-0.644532,1.837582,0.777173
3,NMS,AVGO,Broadcom Inc.,Broadcom Inc.,Technology,Semiconductors,178.94,835753607168,2.295800e+10,0.164,...,0.467666,0.261397,0.468815,-0.003016,0.294436,0.420174,0.611328,-0.129964,1.062686,0.666380
4,NYQ,ORCL,Oracle Corporation,Oracle Corporation,Technology,Software - Infrastructure,190.45,527748366336,2.180300e+10,0.069,...,-0.164418,0.087702,0.241266,-0.016170,0.191585,0.220169,0.389079,-0.053944,0.278448,0.852576
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
167,NYQ,GL,Globe Life Inc.,Globe Life Inc.,Financial Services,Insurance - Life,104.26,8752116736,1.495668e+09,0.052,...,0.062986,0.319046,0.236625,-0.167719,0.389128,-0.093395,0.022728,0.280197,0.025888,-0.137914
168,NYQ,TFX,Teleflex Incorporated,Teleflex Incorporated,Healthcare,Medical Instruments & Supplies,187.04,8686829568,6.449540e+08,0.024,...,0.158813,0.270063,0.576529,0.020244,0.497926,0.091603,-0.182178,-0.250389,0.003950,-0.248358
169,NMS,PARA,Paramount Global,Paramount Global,Communication Services,Entertainment,11.17,7971370496,3.125000e+09,-0.056,...,-0.129907,0.380478,-0.072949,-0.250807,-0.070158,-0.078440,-0.156751,-0.455216,-0.114265,-0.213401
170,NMS,CZR,"Caesars Entertainment, Inc.","Caesars Entertainment, Inc.",Consumer Cyclical,Resorts & Casinos,36.66,7789516800,3.668000e+09,-0.040,...,1.722772,0.600567,0.944282,0.144076,0.571541,0.248026,0.306285,-0.554937,0.109323,-0.232733


In [30]:
definitive

Unnamed: 0,exchange,symbol,shortname,longname,sector,industry,currentprice,marketcap,ebitda,revenuegrowth,...,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
0,NMS,NVDA,NVIDIA Corporation,NVIDIA Corporation,Technology,Semiconductors,138.81,3399456915456,6.118400e+10,1.224,...,0.664514,2.329197,0.904264,-0.328455,0.734099,1.180241,1.244762,-0.514370,2.460984,1.882606
1,NMS,GOOGL,Alphabet Inc.,Alphabet Inc.,Communication Services,Internet Content & Information,175.37,2156594987008,1.234700e+11,0.151,...,0.469191,0.043466,0.303697,-0.026323,0.269949,0.280533,0.678344,-0.391482,0.567437,0.272376
2,NMS,META,"Meta Platforms, Inc.","Meta Platforms, Inc.",Communication Services,Internet Content & Information,613.57,1548951355392,7.920900e+10,0.189,...,0.334098,0.125514,0.510012,-0.277423,0.512751,0.302126,0.250651,-0.644532,1.837582,0.777173
3,NMS,AVGO,Broadcom Inc.,Broadcom Inc.,Technology,Semiconductors,178.94,835753607168,2.295800e+10,0.164,...,0.467666,0.261397,0.468815,-0.003016,0.294436,0.420174,0.611328,-0.129964,1.062686,0.666380
4,NYQ,ORCL,Oracle Corporation,Oracle Corporation,Technology,Software - Infrastructure,190.45,527748366336,2.180300e+10,0.069,...,-0.164418,0.087702,0.241266,-0.016170,0.191585,0.220169,0.389079,-0.053944,0.278448,0.852576
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
167,NYQ,GL,Globe Life Inc.,Globe Life Inc.,Financial Services,Insurance - Life,104.26,8752116736,1.495668e+09,0.052,...,0.062986,0.319046,0.236625,-0.167719,0.389128,-0.093395,0.022728,0.280197,0.025888,-0.137914
168,NYQ,TFX,Teleflex Incorporated,Teleflex Incorporated,Healthcare,Medical Instruments & Supplies,187.04,8686829568,6.449540e+08,0.024,...,0.158813,0.270063,0.576529,0.020244,0.497926,0.091603,-0.182178,-0.250389,0.003950,-0.248358
169,NMS,PARA,Paramount Global,Paramount Global,Communication Services,Entertainment,11.17,7971370496,3.125000e+09,-0.056,...,-0.129907,0.380478,-0.072949,-0.250807,-0.070158,-0.078440,-0.156751,-0.455216,-0.114265,-0.213401
170,NMS,CZR,"Caesars Entertainment, Inc.","Caesars Entertainment, Inc.",Consumer Cyclical,Resorts & Casinos,36.66,7789516800,3.668000e+09,-0.040,...,1.722772,0.600567,0.944282,0.144076,0.571541,0.248026,0.306285,-0.554937,0.109323,-0.232733


In [43]:
df.head(20)

Unnamed: 0,exchange,symbol,shortname,longname,sector,industry,currentprice,marketcap,ebitda,revenuegrowth,city,state,country,fulltimeemployees,longbusinesssummary,weight
0,NMS,AAPL,Apple Inc.,Apple Inc.,Technology,Consumer Electronics,242.84,3670720643072,134661000000.0,0.061,Cupertino,CA,United States,164000.0,"Apple Inc. designs, manufactures, and markets ...",0.064589
1,NMS,NVDA,NVIDIA Corporation,NVIDIA Corporation,Technology,Semiconductors,142.44,3488355713024,61184000000.0,1.224,Santa Clara,CA,United States,29600.0,NVIDIA Corporation provides graphics and compu...,0.061381
2,NMS,MSFT,Microsoft Corporation,Microsoft Corporation,Technology,Software - Infrastructure,443.57,3297889746944,136552000000.0,0.16,Redmond,WA,United States,228000.0,Microsoft Corporation develops and supports so...,0.058029
3,NMS,AMZN,"Amazon.com, Inc.","Amazon.com, Inc.",Consumer Cyclical,Internet Retail,227.03,2387220627456,111583000000.0,0.11,Seattle,WA,United States,1551000.0,"Amazon.com, Inc. engages in the retail sale of...",0.042005
4,NMS,GOOG,Alphabet Inc.,Alphabet Inc.,Communication Services,Internet Content & Information,176.49,2149065949184,123470000000.0,0.151,Mountain View,CA,United States,181269.0,Alphabet Inc. offers various products and plat...,0.037815
5,NMS,GOOGL,Alphabet Inc.,Alphabet Inc.,Communication Services,Internet Content & Information,174.71,2147972284416,123470000000.0,0.151,Mountain View,CA,United States,181269.0,Alphabet Inc. offers various products and plat...,0.037795
6,NMS,META,"Meta Platforms, Inc.","Meta Platforms, Inc.",Communication Services,Internet Content & Information,623.77,1574701105152,79209000000.0,0.189,Menlo Park,CA,United States,72404.0,"Meta Platforms, Inc. engages in the developmen...",0.027708
7,NMS,TSLA,"Tesla, Inc.","Tesla, Inc.",Consumer Cyclical,Auto Manufacturers,389.22,1249419591680,13244000000.0,0.078,Austin,TX,United States,140473.0,"Tesla, Inc. designs, develops, manufactures, l...",0.021985
8,NYQ,BRK-B,Berkshire Hathaway Inc. New,Berkshire Hathaway Inc.,Financial Services,Insurance - Diversified,470.5,1013791064064,149547000000.0,-0.002,Omaha,NE,United States,396500.0,"Berkshire Hathaway Inc., through its subsidiar...",0.017839
9,NMS,AVGO,Broadcom Inc.,Broadcom Inc.,Technology,Semiconductors,179.53,838509264896,22958000000.0,0.164,Palo Alto,CA,United States,20000.0,"Broadcom Inc. designs, develops, and supplies ...",0.014754


In [31]:
annual_returns.describe()

Unnamed: 0,year,annual_return
count,1848.0,1848.0
mean,2019.061688,0.170192
std,3.156019,0.327395
min,2014.0,-0.703792
25%,2016.0,-0.022033
50%,2019.0,0.149572
75%,2022.0,0.325165
max,2024.0,3.370326
