In [1]:
# 📚 Libraries 
import kagglehub
import pandas as pd
import numpy as np
import os

# New liabraries. 
import scipy.stats as st
import statsmodels.api as sm
import statsmodels.formula.api as smf

# 📊 Visualizations
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as g

# 🤖 Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error 

In [2]:
# Download latest version
#path = kagglehub.dataset_download("andrewmvd/sp-500-stocks")


In [3]:
# Print all files in the dataset path
#print(os.listdir(path))

In [2]:
#csv_file_path = os.path.join(path, 'sp500_stocks.csv')
#csv_file_path2 = os.path.join(path, 'sp500_companies.csv')
#csv_file_path3 = os.path.join(path, 'sp500_index.csv')
#data = pd.read_csv(csv_file_path)
#df = pd.read_csv(csv_file_path2)
#sp = pd.read_csv(csv_file_path3)

In [2]:
data = pd.read_csv('sp500_stocks.csv')
df = pd.read_csv('sp500_companies.csv')
sp = pd.read_csv('sp500_index.csv')

In [3]:
data.columns = [col.lower().replace(" ", "_")for col in data.columns] #snake_case
df.columns = [col.lower().replace(" ", "_")for col in df.columns] #snake_case
sp.columns = [col.lower().replace(" ", "_")for col in sp.columns] #snake_case

In [4]:
# Copies 
data2 = data.copy()
df2 = df.copy()
sp2 = sp.copy()

In [5]:
data2.dtypes

date          object
symbol        object
adj_close    float64
close        float64
high         float64
low          float64
open         float64
volume       float64
dtype: object

In [6]:
data.isna().sum()

date              0
symbol            0
adj_close    101626
close        101626
high         101626
low          101626
open         101626
volume       101626
dtype: int64

In [7]:
# Delete Columns 
data2.drop(columns=['high', 'low', 'open','close'], inplace=True)

In [8]:
data2.isna().sum()

date              0
symbol            0
adj_close    101626
volume       101626
dtype: int64

In [9]:
# Delete NaN. TELL WHY I'M dropping 
data2.dropna(how='any', inplace=True)

In [10]:
data2.isna().sum()

date         0
symbol       0
adj_close    0
volume       0
dtype: int64

In [11]:
# Change to datetime. 
data2['date'] = pd.to_datetime(data2['date'])
# Change to datetime. SP 500. 
sp['date'] = pd.to_datetime(sp['date'])

In [12]:
data2['year'] = data2['date'].dt.year
data2['month'] = data2['date'].dt.month
data2['day'] = data2['date'].dt.day
sp['year'] = sp['date'].dt.year
sp['month'] = sp['date'].dt.month
sp['day'] = sp['date'].dt.day

In [13]:
cols = ['year', 'month', 'day', 'symbol', 'adj_close', 'volume']
data2 = data2[cols]
data2.head(3)

Unnamed: 0,year,month,day,symbol,adj_close,volume
0,2010,1,4,MMM,43.783867,3640265.0
1,2010,1,5,MMM,43.509628,3405012.0
2,2010,1,6,MMM,44.126682,6301126.0


In [14]:
# Drop rows where year is between 2010 and 2013 because SP500 for comparison we do have 2014. 
data2.drop(data2[(data2['year'] >= 2010) & (data2['year'] <= 2014)].index, inplace=True)
# Drop row for sp 500 as we do not have many. 
sp.drop(sp[sp['year'] == 2014].index, inplace=True)

In [15]:
# Chat helped. 
annual_returns = data2.groupby(['symbol', 'year']).apply(lambda group: (group['adj_close'].iloc[-1] / group['adj_close'].iloc[0]) - 1).reset_index(name='annual_return').round(4)

  annual_returns = data2.groupby(['symbol', 'year']).apply(lambda group: (group['adj_close'].iloc[-1] / group['adj_close'].iloc[0]) - 1).reset_index(name='annual_return').round(4)


In [16]:
pivoted_df = annual_returns.pivot(index='symbol', columns='year', values='annual_return')
pivoted_df.sample(3)

year,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
DIS,0.1346,0.0272,0.0292,-0.0037,0.3435,0.2225,-0.1283,-0.4458,0.0181,0.2928
K,0.1374,0.0572,-0.0448,-0.1331,0.269,-0.0469,0.088,0.1389,-0.1318,0.4455
CHTR,0.1002,0.6218,0.1756,-0.1834,0.6844,0.3489,0.0076,-0.4764,0.1379,0.0295


In [17]:
cols = ['year', 'month', 'day', 's&p500']
sp = sp[cols]
sp.head(3)

Unnamed: 0,year,month,day,s&p500
17,2015,1,2,2058.2
18,2015,1,5,2020.58
19,2015,1,6,2002.61


In [18]:
annual_returns_sp = sp.groupby('year').apply(lambda group: (group['s&p500'].iloc[-1] / group['s&p500'].iloc[0]) - 1).reset_index(name='annual_return').round(4)

  annual_returns_sp = sp.groupby('year').apply(lambda group: (group['s&p500'].iloc[-1] / group['s&p500'].iloc[0]) - 1).reset_index(name='annual_return').round(4)


In [19]:
annual_returns_sp

Unnamed: 0,year,annual_return
0,2015,-0.0069
1,2016,0.1124
2,2017,0.1842
3,2018,-0.0701
4,2019,0.2871
5,2020,0.1529
6,2021,0.2879
7,2022,-0.1995
8,2023,0.2473
9,2024,0.2809


In [20]:
annual_returns_sp['symbol'] = 's&p500'

In [21]:
annual_returns_sp

Unnamed: 0,year,annual_return,symbol
0,2015,-0.0069,s&p500
1,2016,0.1124,s&p500
2,2017,0.1842,s&p500
3,2018,-0.0701,s&p500
4,2019,0.2871,s&p500
5,2020,0.1529,s&p500
6,2021,0.2879,s&p500
7,2022,-0.1995,s&p500
8,2023,0.2473,s&p500
9,2024,0.2809,s&p500


In [22]:
cols = ['symbol', 'year', 'annual_return']
annual_returns_sp = annual_returns_sp[cols]

In [23]:
pivoted_sp = annual_returns_sp.pivot(index='symbol',columns='year', values='annual_return')
pivoted_sp

year,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
s&p500,-0.0069,0.1124,0.1842,-0.0701,0.2871,0.1529,0.2879,-0.1995,0.2473,0.2809


In [24]:
defi = pd.concat([pivoted_df,pivoted_sp], axis=0) 
defi

year,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
A,0.0418,0.1319,0.4535,0.0072,0.3098,0.3875,0.3512,-0.0374,-0.0666,0.0183
AAPL,-0.0208,0.1238,0.4804,-0.0705,0.8874,0.7824,0.3806,-0.2820,0.5480,0.3145
ABBV,-0.0714,0.1290,0.6061,-0.0267,0.0483,0.2628,0.3461,0.2399,-0.0071,0.1427
ABNB,,,,,,0.0144,0.1965,-0.5049,0.6035,0.0181
ABT,0.0214,-0.0828,0.4954,0.2529,0.2705,0.2791,0.3098,-0.1971,0.0246,0.0733
...,...,...,...,...,...,...,...,...,...,...
YUM,0.0321,0.2488,0.3134,0.1462,0.1200,0.0830,0.3342,-0.0439,0.0485,0.0879
ZBH,-0.0810,0.0227,0.1774,-0.1565,0.4745,0.0413,-0.1650,0.0170,-0.0438,-0.1101
ZBRA,-0.1005,0.2898,0.2035,0.5353,0.6349,0.4831,0.5741,-0.5609,0.0505,0.5274
ZTS,0.1145,0.1419,0.3540,0.1990,0.5766,0.2409,0.4999,-0.3692,0.3563,-0.0934


In [25]:
definitive = pd.merge (df, defi, on='symbol')
definitive.tail(10)

Unnamed: 0,exchange,symbol,shortname,longname,sector,industry,currentprice,marketcap,ebitda,revenuegrowth,...,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
493,NMS,CZR,"Caesars Entertainment, Inc.","Caesars Entertainment, Inc.",Consumer Cyclical,Resorts & Casinos,37.6,7989247488,3668000000.0,-0.04,...,1.7228,0.6006,0.9443,0.1441,0.5715,0.248,0.3063,-0.5549,0.1093,-0.2131
494,NMS,APA,APA Corporation,APA Corporation,Energy,Oil & Gas E&P,21.42,7924264960,5047000000.0,0.104,...,-0.2906,0.4579,-0.3246,-0.3937,-0.0209,-0.432,0.8327,0.67,-0.1774,-0.4068
495,NMS,PARA,Paramount Global,Paramount Global,Communication Services,Entertainment,10.99,7868004352,3125000000.0,-0.056,...,-0.1299,0.3805,-0.0729,-0.2508,-0.0702,-0.0784,-0.1568,-0.4552,-0.1143,-0.2261
496,NYQ,CE,Celanese Corporation,Celanese Corporation,Basic Materials,Chemicals,70.54,7710868480,1851000000.0,-0.028,...,0.1425,0.2159,0.3718,-0.1405,0.3795,0.0976,0.3594,-0.3868,0.5192,-0.5441
497,NMS,WBA,"Walgreens Boots Alliance, Inc.","Walgreens Boots Alliance, Inc.",Healthcare,Pharmaceutical Retailers,8.56,7401122304,2884000000.0,0.06,...,0.1392,0.015,-0.1073,-0.0665,-0.1061,-0.2952,0.308,-0.2633,-0.2474,-0.6525
498,NYQ,BWA,BorgWarner Inc.,BorgWarner Inc.,Consumer Cyclical,Auto Parts,33.8,7392059904,1882000000.0,-0.048,...,-0.2022,-0.0517,0.2918,-0.3189,0.2572,-0.1037,0.2019,-0.094,0.0245,-0.0471
499,NYQ,HII,"Huntington Ingalls Industries,","Huntington Ingalls Industries, Inc.",Industrials,Aerospace & Defense,187.95,7354351616,1071000000.0,-0.024,...,0.1491,0.5029,0.2615,-0.1545,0.3244,-0.3181,0.1651,0.268,0.1633,-0.262
500,NYQ,FMC,FMC Corporation,FMC Corporation,Basic Materials,Agricultural Inputs,56.58,7063221248,703300000.0,0.085,...,-0.3028,0.5106,0.6741,-0.2186,0.5627,0.1725,-0.0103,0.1542,-0.4807,-0.1023
501,NMS,QRVO,"Qorvo, Inc.","Qorvo, Inc.",Technology,Semiconductors,68.33,6459010048,673130000.0,-0.052,...,-0.277,0.0396,0.259,-0.1183,0.8998,0.4339,-0.0498,-0.4315,0.2633,-0.3726
502,NYQ,AMTM,"Amentum Holdings, Inc.","Amentum Holdings, Inc.",Industrials,Specialty Business Services,23.17,5637307392,433000000.0,-0.031,...,,,,,,,,,,-0.2148


In [26]:
definitive.shape

(503, 26)

In [27]:
definitive = definitive.rename(columns={2015: 'ar_2015',2016:'ar_2016',2017: 'ar_2017', 
                                          2018:'ar_2018', 2019: 'ar_2019',2020: 'ar_2020', 2021: 'ar_2021', 2022:'ar_2022', 2023:'ar_2023',2024: 'ar_2024'})

In [28]:
#Dropping Columns: Exchange, longname, longbusinesssumary. 
cols = ['symbol', 'shortname','sector','industry',
        'marketcap','ebitda', 'revenuegrowth', 'city', 'state', 'country',   
        'fulltimeemployees', 'weight', 'ar_2015', 'ar_2016', 'ar_2017', 'ar_2018', 'ar_2019',
        'ar_2020', 'ar_2021', 'ar_2022', 'ar_2023', 'ar_2024', 'currentprice']

In [29]:
definitive = definitive[cols]

In [30]:
definitive

Unnamed: 0,symbol,shortname,sector,industry,marketcap,ebitda,revenuegrowth,city,state,country,...,ar_2016,ar_2017,ar_2018,ar_2019,ar_2020,ar_2021,ar_2022,ar_2023,ar_2024,currentprice
0,AAPL,Apple Inc.,Technology,Consumer Electronics,3670720643072,1.346610e+11,0.061,Cupertino,CA,United States,...,0.1238,0.4804,-0.0705,0.8874,0.7824,0.3806,-0.2820,0.5480,0.3145,242.84
1,NVDA,NVIDIA Corporation,Technology,Semiconductors,3488355713024,6.118400e+10,1.224,Santa Clara,CA,United States,...,2.3292,0.9043,-0.3285,0.7341,1.1802,1.2448,-0.5144,2.4610,1.9580,142.44
2,MSFT,Microsoft Corporation,Technology,Software - Infrastructure,3297889746944,1.365520e+11,0.160,Redmond,WA,United States,...,0.1651,0.3974,0.2022,0.5826,0.3994,0.5579,-0.2769,0.5835,0.2050,443.57
3,AMZN,"Amazon.com, Inc.",Consumer Cyclical,Internet Retail,2387220627456,1.115830e+11,0.110,Seattle,WA,United States,...,0.1772,0.5517,0.2632,0.2006,0.7160,0.0464,-0.5071,0.7704,0.5142,227.03
4,GOOG,Alphabet Inc.,Communication Services,Internet Content & Information,2149065949184,1.234700e+11,0.151,Mountain View,CA,United States,...,0.0404,0.3311,-0.0276,0.2784,0.2812,0.6743,-0.3884,0.5711,0.2646,176.49
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
498,BWA,BorgWarner Inc.,Consumer Cyclical,Auto Parts,7392059904,1.882000e+09,-0.048,Auburn Hills,MI,United States,...,-0.0517,0.2918,-0.3189,0.2572,-0.1037,0.2019,-0.0940,0.0245,-0.0471,33.80
499,HII,"Huntington Ingalls Industries,",Industrials,Aerospace & Defense,7354351616,1.071000e+09,-0.024,Newport News,VA,United States,...,0.5029,0.2615,-0.1545,0.3244,-0.3181,0.1651,0.2680,0.1633,-0.2620,187.95
500,FMC,FMC Corporation,Basic Materials,Agricultural Inputs,7063221248,7.033000e+08,0.085,Philadelphia,PA,United States,...,0.5106,0.6741,-0.2186,0.5627,0.1725,-0.0103,0.1542,-0.4807,-0.1023,56.58
501,QRVO,"Qorvo, Inc.",Technology,Semiconductors,6459010048,6.731300e+08,-0.052,Greensboro,NC,United States,...,0.0396,0.2590,-0.1183,0.8998,0.4339,-0.0498,-0.4315,0.2633,-0.3726,68.33


In [31]:
cat = definitive.select_dtypes(exclude='number')
cat.head(5)

Unnamed: 0,symbol,shortname,sector,industry,city,state,country
0,AAPL,Apple Inc.,Technology,Consumer Electronics,Cupertino,CA,United States
1,NVDA,NVIDIA Corporation,Technology,Semiconductors,Santa Clara,CA,United States
2,MSFT,Microsoft Corporation,Technology,Software - Infrastructure,Redmond,WA,United States
3,AMZN,"Amazon.com, Inc.",Consumer Cyclical,Internet Retail,Seattle,WA,United States
4,GOOG,Alphabet Inc.,Communication Services,Internet Content & Information,Mountain View,CA,United States


In [32]:
num = definitive.select_dtypes(include='number')
num.head(5)

Unnamed: 0,marketcap,ebitda,revenuegrowth,fulltimeemployees,weight,ar_2015,ar_2016,ar_2017,ar_2018,ar_2019,ar_2020,ar_2021,ar_2022,ar_2023,ar_2024,currentprice
0,3670720643072,134661000000.0,0.061,164000.0,0.064589,-0.0208,0.1238,0.4804,-0.0705,0.8874,0.7824,0.3806,-0.282,0.548,0.3145,242.84
1,3488355713024,61184000000.0,1.224,29600.0,0.061381,0.6645,2.3292,0.9043,-0.3285,0.7341,1.1802,1.2448,-0.5144,2.461,1.958,142.44
2,3297889746944,136552000000.0,0.16,228000.0,0.058029,0.2188,0.1651,0.3974,0.2022,0.5826,0.3994,0.5579,-0.2769,0.5835,0.205,443.57
3,2387220627456,111583000000.0,0.11,1551000.0,0.042005,1.1907,0.1772,0.5517,0.2632,0.2006,0.716,0.0464,-0.5071,0.7704,0.5142,227.03
4,2149065949184,123470000000.0,0.151,181269.0,0.037815,0.45,0.0404,0.3311,-0.0276,0.2784,0.2812,0.6743,-0.3884,0.5711,0.2646,176.49


In [33]:
defi

year,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
A,0.0418,0.1319,0.4535,0.0072,0.3098,0.3875,0.3512,-0.0374,-0.0666,0.0183
AAPL,-0.0208,0.1238,0.4804,-0.0705,0.8874,0.7824,0.3806,-0.2820,0.5480,0.3145
ABBV,-0.0714,0.1290,0.6061,-0.0267,0.0483,0.2628,0.3461,0.2399,-0.0071,0.1427
ABNB,,,,,,0.0144,0.1965,-0.5049,0.6035,0.0181
ABT,0.0214,-0.0828,0.4954,0.2529,0.2705,0.2791,0.3098,-0.1971,0.0246,0.0733
...,...,...,...,...,...,...,...,...,...,...
YUM,0.0321,0.2488,0.3134,0.1462,0.1200,0.0830,0.3342,-0.0439,0.0485,0.0879
ZBH,-0.0810,0.0227,0.1774,-0.1565,0.4745,0.0413,-0.1650,0.0170,-0.0438,-0.1101
ZBRA,-0.1005,0.2898,0.2035,0.5353,0.6349,0.4831,0.5741,-0.5609,0.0505,0.5274
ZTS,0.1145,0.1419,0.3540,0.1990,0.5766,0.2409,0.4999,-0.3692,0.3563,-0.0934


In [45]:
frequency_table = cat.sector.value_counts()
proportion_table = cat.sector.value_counts(normalize=True)

In [None]:
frequency_proportion = pd.concat([frequency_table,proportion_table], axis = 1)
frequency_proportion.columns = ['absolute_frequency', 'relative_frequency']
frequency_proportion

In [47]:
total_absolute = frequency_table.sum()
total_relative = proportion_table.sum()

In [None]:
total_row = pd.DataFrame({
    'absolute_frequency': [total_absolute],
    'relative_frequency': [total_relative]
}, index=['Total'])
total_row

In [49]:
df3 = pd.concat([frequency_proportion, total_row])

In [None]:
df3

In [None]:
sns.barplot(
    x='sector',
    y='absolute_frequency',
    data=frequency_proportion,
    palette='viridis',
)
plt.ylabel('Count')
plt.xticks(rotation=45, ha='right')
plt.show()

The frequency table gives the count of each sector, while the proportion table provides the percentage representation of each category in the dataset. This helps to quickly identify dominant and minority categories. Such as:
- Technology: 16% 
- Industrials: 14% 
- Financial Services: 13% 
- Healthcare: 12% 
- Consumer Cyclical: 10% 

In [42]:
tech = definitive[definitive['sector'] == 'Technology']
indus = definitive[definitive['sector'] == 'Industrials']
fin = definitive[definitive['sector'] == 'Financial Services']
health = definitive[definitive['sector'] == 'Healthcare']
consumer = definitive[definitive['sector'] == 'Consumer Cyclical']

In [45]:
tech.head()

Unnamed: 0,symbol,shortname,sector,industry,marketcap,ebitda,revenuegrowth,city,state,country,...,ar_2016,ar_2017,ar_2018,ar_2019,ar_2020,ar_2021,ar_2022,ar_2023,ar_2024,currentprice
0,AAPL,Apple Inc.,Technology,Consumer Electronics,3670720643072,134661000000.0,0.061,Cupertino,CA,United States,...,0.1238,0.4804,-0.0705,0.8874,0.7824,0.3806,-0.282,0.548,0.3145,242.84
1,NVDA,NVIDIA Corporation,Technology,Semiconductors,3488355713024,61184000000.0,1.224,Santa Clara,CA,United States,...,2.3292,0.9043,-0.3285,0.7341,1.1802,1.2448,-0.5144,2.461,1.958,142.44
2,MSFT,Microsoft Corporation,Technology,Software - Infrastructure,3297889746944,136552000000.0,0.16,Redmond,WA,United States,...,0.1651,0.3974,0.2022,0.5826,0.3994,0.5579,-0.2769,0.5835,0.205,443.57
9,AVGO,Broadcom Inc.,Technology,Semiconductors,838509264896,22958000000.0,0.164,Palo Alto,CA,United States,...,0.4501,0.7581,0.5475,0.8481,1.1804,0.8393,-0.1571,1.0168,0.6541,179.53
14,ORCL,Oracle Corporation,Technology,Software - Infrastructure,531184484352,21803000000.0,0.069,Austin,TX,United States,...,0.0877,0.2413,-0.0162,0.1916,0.2202,0.3891,-0.0666,0.2593,0.8421,191.69


In [44]:
indus.head()

Unnamed: 0,symbol,shortname,sector,industry,marketcap,ebitda,revenuegrowth,city,state,country,...,ar_2016,ar_2017,ar_2018,ar_2019,ar_2020,ar_2021,ar_2022,ar_2023,ar_2024,currentprice
48,CAT,"Caterpillar, Inc.",Industrials,Farm & Heavy Construction Machinery,190721261568,15819000000.0,-0.042,Irving,TX,United States,...,0.4389,0.7271,-0.1727,0.2017,0.2456,0.1587,0.1629,0.2377,0.3496,395.03
49,GE,GE Aerospace,Industrials,Aerospace & Defense,190028480512,8935000000.0,0.058,Evendale,OH,United States,...,0.0612,-0.4308,-0.567,0.4476,-0.0901,0.1314,-0.1256,0.9297,0.7554,175.58
62,RTX,RTX Corporation,Industrials,Aerospace & Defense,157047046144,12571000000.0,0.492,Arlington,VA,United States,...,0.1775,0.1783,-0.1505,0.414,-0.2368,0.2881,0.1876,-0.1434,0.417,117.99
67,HON,Honeywell International Inc.,Industrials,Conglomerates,147202916352,9164000000.0,0.056,Charlotte,NC,United States,...,0.1606,0.3472,-0.0848,0.37,0.2039,0.0198,0.0577,0.0002,0.1063,226.38
69,ETN,"Eaton Corporation, PLC",Industrials,Specialty Industrial Machinery,146706137088,5450000000.0,0.079,Dublin,,Ireland,...,0.3336,0.1899,-0.1142,0.4287,0.282,0.4842,-0.0489,0.5543,0.5731,371.22


In [46]:
fin.head()

Unnamed: 0,symbol,shortname,sector,industry,marketcap,ebitda,revenuegrowth,city,state,country,...,ar_2016,ar_2017,ar_2018,ar_2019,ar_2020,ar_2021,ar_2022,ar_2023,ar_2024,currentprice
8,BRK-B,Berkshire Hathaway Inc. New,Financial Services,Insurance - Diversified,1013791064064,149547000000.0,-0.002,Omaha,NE,United States,...,0.2465,0.2099,0.0353,0.1169,0.0152,0.3088,0.027,0.1509,0.2981,470.5
12,JPM,JP Morgan Chase & Co.,Financial Services,Banks - Diversified,696402509824,,0.03,New York,NY,United States,...,0.387,0.2539,-0.075,0.4475,-0.0667,0.2896,-0.1445,0.2964,0.4718,247.36
13,V,Visa Inc.,Financial Services,Credit Services,601857261568,24973000000.0,0.117,San Francisco,CA,United States,...,0.0406,0.4444,0.1599,0.4227,0.1515,0.0013,-0.0602,0.2554,0.2014,311.01
17,MA,Mastercard Incorporated,Financial Services,Credit Services,485137940480,16784000000.0,0.128,Purchase,NY,United States,...,0.0978,0.4469,0.2486,0.5824,0.1828,0.0273,-0.0614,0.2298,0.2529,528.57
23,BAC,Bank of America Corporation,Financial Services,Banks - Diversified,358707134464,,-0.005,Charlotte,NC,United States,...,0.3662,0.331,-0.1608,0.4432,-0.1267,0.51,-0.2793,0.0048,0.3791,46.75


In [47]:
health.head()

Unnamed: 0,symbol,shortname,sector,industry,marketcap,ebitda,revenuegrowth,city,state,country,...,ar_2016,ar_2017,ar_2018,ar_2019,ar_2020,ar_2021,ar_2022,ar_2023,ar_2024,currentprice
11,LLY,Eli Lilly and Company,Healthcare,Drug Manufacturers - General,743473545216,16566500000.0,0.204,Indianapolis,IN,United States,...,-0.0887,0.1618,0.4009,0.1699,0.3027,0.6943,0.3647,0.6128,0.4051,826.71
15,UNH,UnitedHealth Group Incorporated,Healthcare,Healthcare Plans,505806520320,35035000000.0,0.092,Minnetonka,MN,United States,...,0.3981,0.3861,0.1416,0.2278,0.2186,0.4569,0.0691,0.0304,0.0311,549.62
22,JNJ,Johnson & Johnson,Healthcare,Drug Manufacturers - General,359481737216,30052000000.0,0.052,New Brunswick,NJ,United States,...,0.179,0.2375,-0.048,0.174,0.1075,0.1206,0.0364,-0.1204,-0.0666,149.31
25,ABBV,AbbVie Inc.,Healthcare,Drug Manufacturers - General,311352393728,25630000000.0,0.038,North Chicago,IL,United States,...,0.129,0.6061,-0.0267,0.0483,0.2628,0.3461,0.2399,-0.0071,0.1427,176.19
29,MRK,"Merck & Company, Inc.",Healthcare,Drug Manufacturers - General,260780572672,22928000000.0,0.044,Rahway,NJ,United States,...,0.158,-0.0358,0.4008,0.236,-0.083,0.0281,0.4897,0.0084,-0.0722,103.09


In [48]:
consumer.head()

Unnamed: 0,symbol,shortname,sector,industry,marketcap,ebitda,revenuegrowth,city,state,country,...,ar_2016,ar_2017,ar_2018,ar_2019,ar_2020,ar_2021,ar_2022,ar_2023,ar_2024,currentprice
3,AMZN,"Amazon.com, Inc.",Consumer Cyclical,Internet Retail,2387220627456,111583000000.0,0.11,Seattle,WA,United States,...,0.1772,0.5517,0.2632,0.2006,0.716,0.0464,-0.5071,0.7704,0.5142,227.03
7,TSLA,"Tesla, Inc.",Consumer Cyclical,Auto Manufacturers,1249419591680,13244000000.0,0.078,Austin,TX,United States,...,-0.0435,0.4349,0.0383,0.3489,7.2005,0.4481,-0.692,1.2986,0.5668,389.22
19,HD,"Home Depot, Inc. (The)",Consumer Cyclical,Home Improvement Retail,428507004928,24758000000.0,0.066,Atlanta,GA,United States,...,0.0447,0.4437,-0.0656,0.3011,0.2378,0.6053,-0.2076,0.1275,0.2807,431.37
39,MCD,McDonald's Corporation,Consumer Cyclical,Restaurants,214255058944,13845500000.0,0.027,Chicago,IL,United States,...,0.0671,0.476,0.0511,0.1495,0.0953,0.3044,0.0032,0.1471,0.0309,298.98
55,BKNG,Booking Holdings Inc. Common St,Consumer Cyclical,Travel Services,175423750144,7183000000.0,0.089,Norwalk,CT,United States,...,0.1772,0.1761,-0.0349,0.1929,0.0736,0.1087,-0.1813,0.7455,0.5338,5300.34


In [50]:
definitive[definitive['symbol'] == 'COST']

Unnamed: 0,symbol,shortname,sector,industry,marketcap,ebitda,revenuegrowth,city,state,country,...,ar_2016,ar_2017,ar_2018,ar_2019,ar_2020,ar_2021,ar_2022,ar_2023,ar_2024,currentprice
18,COST,Costco Wholesale Corporation,Consumer Defensive,Discount Stores,439799676928,11522000000.0,0.01,Issaquah,WA,United States,...,0.0151,0.2266,0.0931,0.4495,0.3377,0.5048,-0.1891,0.5006,0.5345,992.61


In [None]:
df3.columns

In [None]:
tech['revenuegrowth'].mean()

In [None]:
consumer['revenuegrowth'].mean()

In [None]:
# Found out in Kaggle related notebooks. 
f = {'revenuegrowth':['mean'], 'marketcap':['sum'], 'shortname':['count']}

sector_breakdown = definitive.groupby('sector').agg(f)
sector_breakdown.columns = sector_breakdown.columns.get_level_values(0)
sector_breakdown = sector_breakdown.reset_index()
sector_breakdown = sector_breakdown.sort_values('shortname', ascending=False)

fig = plt.figure(num=None, figsize=(14, 6), dpi=100, facecolor='w', edgecolor='k')

plt.subplot(1, 3, 1)
ax1 = sns.barplot(x="shortname", y="sector", data=sector_breakdown, palette=('coolwarm'))
ax1.set_xlabel('Number of companies', weight='bold')
ax1.set_ylabel('sector', weight = 'bold')
ax1.set_title('SECTOR BREAKDOWN\n', weight='bold').set_fontsize('18')

plt.subplot(1, 3, 2)
ax2 = sns.barplot(x="marketcap", y="sector", data=sector_breakdown, palette=('Wistia'))
ax2.set_xlabel('Total Market Cap', weight='bold')
ax2.set_ylabel('')
ax2.set_yticks([])

plt.subplot(1, 3, 3)
ax2 = sns.barplot(x="revenuegrowth", y="sector", data=sector_breakdown, palette=('OrRd_r'))
ax2.set_xlabel('Revenue Growth', weight='bold')
ax2.set_ylabel('')
ax2.set_yticks([])

sns.despine()
plt.tight_layout();

In [None]:
round(definitive.groupby('sector')['revenuegrowth'].agg(['count','mean', 'median', 'max']), 2)

In [None]:
results = definitive.groupby('sector')[['ar_2015', 'ar_2016', 'ar_2017','ar_2018', 'ar_2019', 'ar_2020','ar_2021','ar_2022','ar_2023','ar_2024' ]].agg(['mean']).round(4)
results


In [None]:
plt.figure(figsize=(10, 8)) 
sns.heatmap(results, annot=True, cmap="BuPu", fmt=".3f")
plt.show()

In [34]:
sorted_df = defi.sort_values(by=2024, ascending=False)
sorted_df

year,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
PLTR,,,,,,1.4789,-0.2208,-0.6535,1.6870,3.6043
VST,,0.1662,0.1927,0.2427,0.0480,-0.1129,0.2257,0.0521,0.7787,3.2378
NVDA,0.6645,2.3292,0.9043,-0.3285,0.7341,1.1802,1.2448,-0.5144,2.4610,1.9580
AXON,-0.3478,0.4403,0.0781,0.6478,0.6245,0.6004,0.3436,0.0854,0.5355,1.7329
TPL,0.1117,1.3837,0.4975,0.1851,0.3910,-0.0243,0.6648,0.8750,-0.2463,1.5535
...,...,...,...,...,...,...,...,...,...,...
DLTR,0.0977,-0.0207,0.3855,-0.1690,0.0313,0.1521,0.3140,0.0019,0.0128,-0.4958
CE,0.1425,0.2159,0.3718,-0.1405,0.3795,0.0976,0.3594,-0.3868,0.5192,-0.5441
INTC,-0.0240,0.1023,0.2969,0.0269,0.3030,-0.1608,0.0637,-0.4836,0.9238,-0.5565
MRNA,,,,-0.1790,0.2759,4.4327,1.2732,-0.2358,-0.4444,-0.6050


In [36]:
sorted_df = sorted_df.reset_index()
sorted_df

year,symbol,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
0,PLTR,,,,,,1.4789,-0.2208,-0.6535,1.6870,3.6043
1,VST,,0.1662,0.1927,0.2427,0.0480,-0.1129,0.2257,0.0521,0.7787,3.2378
2,NVDA,0.6645,2.3292,0.9043,-0.3285,0.7341,1.1802,1.2448,-0.5144,2.4610,1.9580
3,AXON,-0.3478,0.4403,0.0781,0.6478,0.6245,0.6004,0.3436,0.0854,0.5355,1.7329
4,TPL,0.1117,1.3837,0.4975,0.1851,0.3910,-0.0243,0.6648,0.8750,-0.2463,1.5535
...,...,...,...,...,...,...,...,...,...,...,...
499,DLTR,0.0977,-0.0207,0.3855,-0.1690,0.0313,0.1521,0.3140,0.0019,0.0128,-0.4958
500,CE,0.1425,0.2159,0.3718,-0.1405,0.3795,0.0976,0.3594,-0.3868,0.5192,-0.5441
501,INTC,-0.0240,0.1023,0.2969,0.0269,0.3030,-0.1608,0.0637,-0.4836,0.9238,-0.5565
502,MRNA,,,,-0.1790,0.2759,4.4327,1.2732,-0.2358,-0.4444,-0.6050


In [37]:
sorted_df.columns.name = None

In [41]:
sorted_df

Unnamed: 0,symbol,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
0,PLTR,,,,,,1.4789,-0.2208,-0.6535,1.6870,3.6043
1,VST,,0.1662,0.1927,0.2427,0.0480,-0.1129,0.2257,0.0521,0.7787,3.2378
2,NVDA,0.6645,2.3292,0.9043,-0.3285,0.7341,1.1802,1.2448,-0.5144,2.4610,1.9580
3,AXON,-0.3478,0.4403,0.0781,0.6478,0.6245,0.6004,0.3436,0.0854,0.5355,1.7329
4,TPL,0.1117,1.3837,0.4975,0.1851,0.3910,-0.0243,0.6648,0.8750,-0.2463,1.5535
...,...,...,...,...,...,...,...,...,...,...,...
499,DLTR,0.0977,-0.0207,0.3855,-0.1690,0.0313,0.1521,0.3140,0.0019,0.0128,-0.4958
500,CE,0.1425,0.2159,0.3718,-0.1405,0.3795,0.0976,0.3594,-0.3868,0.5192,-0.5441
501,INTC,-0.0240,0.1023,0.2969,0.0269,0.3030,-0.1608,0.0637,-0.4836,0.9238,-0.5565
502,MRNA,,,,-0.1790,0.2759,4.4327,1.2732,-0.2358,-0.4444,-0.6050


In [None]:
color = '#9370DB'

nrows, ncols = 5, 4 

fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(20, 16))

axes = axes.flatten()

for i, ax in enumerate(axes):
    if i >= len(num.columns):
        ax.set_visible(False)  # hide unesed plots
        continue
    ax.hist(num.iloc[:, i], bins=30, color=color, edgecolor='black')
    ax.set_title(num.columns[i])

plt.tight_layout()
plt.show()

In [None]:
color = '#9370DB'

# grid size
nrows, ncols = 5, 4 

fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(20, 16))

axes = axes.flatten()

for i, ax in enumerate(axes):
    if i >= len(num.columns):
        ax.set_visible(False)
        continue
    ax.boxplot(num.iloc[:, i].dropna(), vert=False, patch_artist=True, 
               boxprops=dict(facecolor=color, color='black'), 
               medianprops=dict(color='yellow'), whiskerprops=dict(color='black'), 
               capprops=dict(color='black'), flierprops=dict(marker='o', color='red', markersize=5))
    ax.set_title(num.columns[i], fontsize=10)
    ax.tick_params(axis='x', labelsize=8)

plt.tight_layout()
plt.show()

In [None]:
# Without the filter and having the outliers. 
sns.histplot(definitive["currentprice"], color=color, kde=True);

In [None]:
d2 = sns.displot(data=definitive, x='currentprice', kde=True, height=8, aspect=1.6, bins=100, binrange=(0, 2100), color='mediumpurple')
d2.set(xlabel='Current Price')
plt.xlim(0, 2100)

In [None]:
# Check for this plot. 
sns.set(style='darkgrid')
plt.figure(figsize=(15,12))
sns.pairplot(df, corner=True, hue='sector')
plt.tight_layout()

In [None]:
#Pearson 
num.corrwith(defi['currentprice'])

In [None]:
#Spearman
num.corrwith(defi['currentprice'], method='spearman').sort_values(ascending=False)[:5]

In [None]:
num_corr = num.corr()
num_corr

In [None]:
# Correlation Matrix-Heatmap Plot
mask = np.zeros_like(num_corr)
mask[np.triu_indices_from(mask)] = True 
f, ax = plt.subplots(figsize=(20, 10))
sns.set(font_scale=1.5)

ax = sns.heatmap(num_corr, mask=mask, annot=True, annot_kws={"size": 12}, linewidths=.5, cmap="BuPu", fmt=".2f", ax=ax) # round to 2 decimal places
ax.set_title("Correlation Heatmap", fontsize=20) 

In [None]:
# Plotting scatter plots for each numerical column against 'currentprice' to visualize their relationships
for col in num.columns:
    plt.figure(figsize=(5, 5))
    plt.title('Scatter plot of price vs ' + col)
    sns.scatterplot(data=defi, x=col, y='currentprice')
    plt.show()

In [46]:
def outlier_slayer(data): 
    """
    Automatically removes outliers based on Q1, Q3
    """
    for column in data.select_dtypes(include=[np.number]):
        Q1 = data[column].quantile(0.25)
        Q3 = data[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        data = data[(data[column] >= lower_bound) & (data[column] <= upper_bound)]
    return data

In [47]:
definitive = outlier_slayer(defi)

In [None]:
cat = definitive.select_dtypes(exclude='number')
cat.head(5)

In [None]:
num = definitive.select_dtypes(include='number')
num.head(5)

In [None]:
color = '#9370DB'

nrows, ncols = 5, 4 

fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(20, 16))

axes = axes.flatten()

for i, ax in enumerate(axes):
    if i >= len(num.columns):
        ax.set_visible(False)  # hide unesed plots
        continue
    ax.hist(num.iloc[:, i], bins=30, color=color, edgecolor='black')
    ax.set_title(num.columns[i])

plt.tight_layout()
plt.show()

In [None]:
# Dealing with outliers. 
sns.histplot(definitive["currentprice"], color=color, kde=True);

In [None]:
# Without dealing with outliers. 
sns.histplot(defi["currentprice"], color=color, kde=True);

In [None]:
# Without dealing with outliers. REVENUE
sns.histplot(defi["revenuegrowth"], color=color, kde=True);

In [None]:
# Dealing with outliers. REVENUE
sns.histplot(definitive["revenuegrowth"], color=color, kde=True);

In [None]:
#pearson 
num.corrwith(definitive['currentprice']).sort_values(ascending=False)

In [None]:
#Spearman
num.corrwith(df['currentprice'], method='spearman').sort_values(ascending=False)[:5]

In [None]:
#pearson 
num.corrwith(definitive['revenuegrowth']).sort_values(ascending=False)

In [None]:
#Spearman
num.corrwith(df['revenuegrowth'], method='spearman').sort_values(ascending=False)[:5]

In [None]:
color = '#9370DB'

# grid size
nrows, ncols = 5, 4 

fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(20, 16))

axes = axes.flatten()

for i, ax in enumerate(axes):
    if i >= len(num.columns):
        ax.set_visible(False)
        continue
    ax.boxplot(num.iloc[:, i].dropna(), vert=False, patch_artist=True, 
               boxprops=dict(facecolor=color, color='black'), 
               medianprops=dict(color='yellow'), whiskerprops=dict(color='black'), 
               capprops=dict(color='black'), flierprops=dict(marker='o', color='red', markersize=5))
    ax.set_title(num.columns[i], fontsize=10)
    ax.tick_params(axis='x', labelsize=8)

plt.tight_layout()
plt.show()

In [None]:
num_corr = num.corr()
num_corr

In [None]:
# Correlation Matrix-Heatmap Plot
mask = np.zeros_like(num_corr)
mask[np.triu_indices_from(mask)] = True 
f, ax = plt.subplots(figsize=(20, 10))
sns.set(font_scale=1.5)

ax = sns.heatmap(num_corr, mask=mask, annot=True, annot_kws={"size": 12}, linewidths=.5, cmap="BuPu", fmt=".2f", ax=ax) # round to 2 decimal places
ax.set_title("Correlation Heatmap", fontsize=20) 

In [None]:
# Plotting scatter plots for each numerical column against 'currentprice' to visualize their relationships
for col in num.columns:
    plt.figure(figsize=(5, 5))
    plt.title('Scatter plot of price vs ' + col)
    sns.scatterplot(data=definitive, x=col, y='currentprice')
    plt.show()

In [None]:
defi

In [None]:
# crosstab MSZoning and SaleCondition
crosstab_result = pd.crosstab(df['MSZoning'], df['SaleCondition'])
crosstab_result 

### T-Test: Two Tails (We could do it with the price of the stock or the revenue growth or ebitda)
We will conduct hypothesis testing to check if the **average price of 1st class tickets** on the Titanic was $65. We will use a **t-test** since we don't know the population standard deviation.

#### Key Questions:
- Are first-class ticket prices consistent with historical assumptions of $65?
- What are the hypotheses, and how can we test them statistically?

In [None]:
# Step 2: Choose Significance Level
alpha = 0.05  # 5% significance level
# Step 3: Collect Data
first_class = df[df['Pclass'] == 1]['Fare'].dropna()
n = len(first_class)
n
# Display data information
print(f"Sample Size (n): {n}")
first_class.describe() # I do not know how to perfom just fare values. 
# Step 4: Calculate Test Statistic
mean = first_class.mean()
s = first_class.std()
mu = 65

In [None]:
t_stat, p_value = st.ttest_1samp(first_class, mu)
print(f"Test Statistic (t): {t_stat:.2f}")
print(f"P-Value: {p_value:.4f}")
print()
if p_value > alpha:
    print("Fail to Reject the Null Hypothesis: Not enough evidence to say the average price is different from $65.")
else:
    print("Reject the Null Hypothesis: There is evidence to say the average price is different from $65.")

In [None]:
# Nice to display information. 
# Display data information
print(f"Sample Size (n): {n}")
first_class.describe() # I do not know how to perfom just fare values. 

In [None]:
# ADD ANOVA 

We will use **one-way ANOVA** to determine if there is a statistically significant difference in **stock price** based on **sector**.

#### Define Hypotheses
- **Null Hypothesis (H₀)**: There is no difference in mean stock prices between sectors such as at **Technolgies**, **Industrials**, and **Finance** companies.
- **Alternative Hypothesis (H₁)**: At least one group mean is different.

In [None]:
# Extract salaries for Data Scientists by company size
df_small = df[(df["job_title"] == "Data Scientist") & (df["company_size"] == "Small")]["salary_in_usd"]
df_medium = df[(df["job_title"] == "Data Scientist") & (df["company_size"] == "Medium")]["salary_in_usd"]
df_large = df[(df["job_title"] == "Data Scientist") & (df["company_size"] == "Large")]["salary_in_usd"]

In [None]:
# Perform One-Way ANOVA
f_stat, p_value = st.f_oneway(df_small, df_medium, df_large)
print(f"F-Statistic: {f_stat:.2f}")
print(f"P-Value: {p_value:.4f}")
print()

# Significance level
alpha = 0.05

# Decision-Making
if p_value > alpha:
    print("Fail to Reject the Null Hypothesis: Company size has no significant impact on data scientist salaries.")
else:
    print("Reject the Null Hypothesis: There is a significant difference in salaries based on company size.")