# [Nicholas Yim, Aseef Durrani]
# Dataset \#1 - S&P 500 Regression
---

In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats as sps
import seaborn as sns
import plotly.express as px
from mpl_toolkits.mplot3d import Axes3D
from IPython.display import display, Latex

In [9]:
# Configure pandas display: never truncate columns, render floats with 3 decimals
def _format_float(value):
    """Return ``value`` formatted with exactly three digits after the decimal point."""
    return '%.3f' % value


for _option, _setting in (
    ('display.max_columns', None),
    ('display.float_format', _format_float),
):
    pd.set_option(_option, _setting)

In [10]:
# Load the three S&P 500 CSV files; the unpacking order matches the path order below
companies_df, index_df, stocks_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../datasets/sp500/sp500_companies.csv',
        '../datasets/sp500/sp500_index.csv',
        '../datasets/sp500/sp500_stocks.csv',
    )
)

# Preview the first rows of every frame, in load order
for frame in (companies_df, index_df, stocks_df):
    display(frame.head())


Unnamed: 0,Exchange,Symbol,Shortname,Longname,Sector,Industry,Currentprice,Marketcap,Ebitda,Revenuegrowth,City,State,Country,Fulltimeemployees,Longbusinesssummary,Weight
0,NMS,AAPL,Apple Inc.,Apple Inc.,Technology,Consumer Electronics,247.77,3745241628672,134660997120.0,0.061,Cupertino,CA,United States,164000.0,"Apple Inc. designs, manufactures, and markets ...",0.066
1,NMS,NVDA,NVIDIA Corporation,NVIDIA Corporation,Technology,Semiconductors,135.07,3307864588288,61184000000.0,1.224,Santa Clara,CA,United States,29600.0,NVIDIA Corporation provides graphics and compu...,0.059
2,NMS,MSFT,Microsoft Corporation,Microsoft Corporation,Technology,Software - Infrastructure,443.33,3296105332736,136551997440.0,0.16,Redmond,WA,United States,228000.0,Microsoft Corporation develops and supports so...,0.058
3,NMS,AMZN,"Amazon.com, Inc.","Amazon.com, Inc.",Consumer Cyclical,Internet Retail,225.04,2366295506944,111583002624.0,0.11,Seattle,WA,United States,1551000.0,"Amazon.com, Inc. engages in the retail sale of...",0.042
4,NMS,GOOGL,Alphabet Inc.,Alphabet Inc.,Communication Services,Internet Content & Information,185.17,2276776214528,123469996032.0,0.151,Mountain View,CA,United States,181269.0,Alphabet Inc. offers various products and plat...,0.04


Unnamed: 0,Date,S&P500
0,2014-12-10,2026.14
1,2014-12-11,2035.33
2,2014-12-12,2002.33
3,2014-12-15,1989.63
4,2014-12-16,1972.74


Unnamed: 0,Date,Symbol,Adj Close,Close,High,Low,Open,Volume
0,2010-01-04,MMM,43.784,69.415,69.774,69.122,69.473,3640265.0
1,2010-01-05,MMM,43.51,68.98,69.59,68.311,69.231,3405012.0
2,2010-01-06,MMM,44.127,69.958,70.736,69.824,70.134,6301126.0
3,2010-01-07,MMM,44.158,70.008,70.033,68.662,69.666,5346240.0
4,2010-01-08,MMM,44.469,70.502,70.502,69.649,69.975,4073337.0


## a) Brief exploration of each dataset

In [11]:
# Overview of the companies dataset: dimensions and per-column dtypes
print("Dataset Shape:", companies_df.shape)
print("\nData Types:")
display(companies_df.dtypes)

# Tabulate gaps per column, both as absolute counts and as a share of all rows
print("Missing Values Analysis:")
missing_values = companies_df.isna().sum()
missing_pct = (missing_values / len(companies_df)) * 100
missing_df = pd.concat(
    [missing_values.rename('Missing Values'), missing_pct.rename('Percentage')],
    axis=1,
)
# Only columns that actually contain missing entries are shown
display(missing_df.loc[missing_df['Missing Values'].gt(0)])



Dataset Shape: (503, 16)

Data Types:


Exchange                object
Symbol                  object
Shortname               object
Longname                object
Sector                  object
Industry                object
Currentprice           float64
Marketcap                int64
Ebitda                 float64
Revenuegrowth          float64
City                    object
State                   object
Country                 object
Fulltimeemployees      float64
Longbusinesssummary     object
Weight                 float64
dtype: object

Missing Values Analysis:


Unnamed: 0,Missing Values,Percentage
Ebitda,29,5.765
Revenuegrowth,3,0.596
State,20,3.976
Fulltimeemployees,9,1.789


In [12]:
# Descriptive statistics for the numeric columns
display(companies_df.describe())

# Frequency table for each categorical column of interest
for heading, column in (
    ("\nSector Distribution:", 'Sector'),
    ("\nExchange Distribution:", 'Exchange'),
):
    print(heading)
    display(companies_df[column].value_counts())


Unnamed: 0,Currentprice,Marketcap,Ebitda,Revenuegrowth,Fulltimeemployees,Weight
count,503.0,503.0,474.0,500.0,494.0,503.0
mean,227.397,112231944591.014,7031396955.021,0.07,57744.96,0.002
std,514.905,340420130964.858,16227767148.325,0.18,139469.257,0.006
min,10.13,5844113920.0,-3991000064.0,-0.602,28.0,0.0
25%,71.47,20135499776.0,1623193984.0,0.002,10200.0,0.0
50%,126.61,38197682176.0,2941704960.0,0.05,21595.0,0.001
75%,237.05,82199441408.0,6017249792.0,0.109,54762.25,0.001
max,8857.62,3745241628672.0,149547008000.0,1.632,2100000.0,0.066



Sector Distribution:


Sector
Technology                82
Industrials               70
Financial Services        67
Healthcare                63
Consumer Cyclical         55
Consumer Defensive        37
Utilities                 32
Real Estate               31
Communication Services    22
Energy                    22
Basic Materials           22
Name: count, dtype: int64


Exchange Distribution:


Exchange
NYQ    349
NMS    152
BTS      1
NGM      1
Name: count, dtype: int64

In [15]:
import plotly.express as px
import plotly.graph_objects as go

# Layout tweak shared by every bar chart in this cell: slanted x tick labels
slanted_ticks = dict(xaxis_tickangle=-45)

# 1. Market cap by sector (rows are passed unaggregated, sorted by market cap descending)
companies_df_sorted = companies_df.sort_values("Marketcap", ascending=False)
fig = px.bar(
    companies_df_sorted,
    x="Sector",
    y="Marketcap",
    title="Market Capitalization by Sector",
    labels={'Marketcap': 'Market Cap (USD)', 'Sector': 'Sector'},
    height=500,
)
fig.update_layout(showlegend=False, **slanted_ticks)
fig.show()

# 2. Number of companies in each sector
sector_counts = companies_df['Sector'].value_counts()
fig = px.bar(
    x=sector_counts.index,
    y=sector_counts.values,
    title="Distribution of Companies by Sector",
    labels={'x': 'Sector', 'y': 'Number of Companies'},
    height=500,
)
fig.update_layout(showlegend=False, **slanted_ticks)
fig.show()

# 3. Histogram of log10(market cap)
log_marketcap = np.log10(companies_df['Marketcap'])
fig = px.histogram(
    companies_df,
    x=log_marketcap,
    nbins=30,
    title="Distribution of Company Market Caps (Log Scale)",
    labels={'x': 'Log10(Market Cap)', 'count': 'Number of Companies'},
    height=400,
)
fig.show()

# 4. The 20 largest companies (first 20 rows of the descending sort above)
top_20 = companies_df_sorted.iloc[:20]
fig = px.bar(
    top_20,
    x='Symbol',
    y='Marketcap',
    title="Top 20 Companies by Market Capitalization",
    labels={'Marketcap': 'Market Cap (USD)', 'Symbol': 'Company Symbol'},
    height=500,
)
# Unlike charts 1 and 2, the legend setting is left at its default here
fig.update_layout(**slanted_ticks)
fig.show()

## Companies Dataset Exploration

1. **Dataset Structure and Size**:
   - Total companies: 503 companies in the S&P 500
   - Features: 16 columns including both numerical and categorical variables
   
2. **Sector Distribution**:
   - Technology is the largest sector with 82 companies (16.3%)
   - Followed by Industrials (70 companies) and Financial Services (67 companies)
   - Healthcare and Consumer Cyclical round out the top 5 sectors
   - Smallest sectors: Communication Services, Energy, and Basic Materials (22 companies each)

3. **Data Quality and Missing Values**:
   - Most columns (12 out of 16) have complete data
   - Missing values are concentrated in four columns:
     * EBITDA: 29 companies (5.77%) missing this financial metric
     * State: 20 companies (3.98%) missing state information
     * Fulltimeemployees: 9 companies (1.79%) missing employee count
     * Revenuegrowth: 3 companies (0.60%) missing growth data
   - Critical identification and classification fields (Symbol, Name, Sector, Industry) are complete
   - Missing EBITDA values represent the largest gap, which might affect financial analysis
   - Missing state information likely indicates non-US companies or complex corporate structures
   - Overall, missing data is relatively minimal (<6% in any column) and follows an expected pattern for financial data

4. **Exchange Distribution**:
   - Majority listed on NYSE (NYQ): 349 companies
   - NASDAQ (NMS): 152 companies
   - Minor presence on BTS and NGM (1 company each)

5. **Market Capitalization**:
   - Highly right-skewed distribution on the raw scale (mean ≈ 112B USD vs. median ≈ 38B USD), which is why the histogram uses a log10 axis
   - Large concentration of companies in middle market cap range
   - Few extremely large companies (e.g., Apple, NVIDIA, Microsoft) creating right-tail skew
   - Log transformation reveals more normal-like distribution

6. **Geographic Distribution**:
   - Predominantly U.S.-based companies
   - Diverse state representation with concentration in major business hubs

7. **Other Notable Features**:
   - Revenue growth varies significantly across companies
   - Employee count ranges from small to very large corporations
   - Complete business summaries available for analysis
   - Weight column indicates relative importance in index

This initial exploration reveals a dataset rich in both categorical and numerical features, with good representation across sectors but notable concentrations in Technology and Industrials. The market cap distribution suggests the need for log transformation in any subsequent analysis involving this variable.

In [16]:
# Parse the Date strings so the column supports datetime operations
index_df['Date'] = pd.to_datetime(index_df['Date'])

# Structure of the index dataset: dimensions, dtypes and per-column null counts
print("Dataset Shape:", index_df.shape)
for heading, summary in (
    ("\nData Types:", index_df.dtypes),
    ("\nMissing Values:", index_df.isna().sum()),
):
    print(heading)
    display(summary)

# Descriptive statistics
display(index_df.describe())

Dataset Shape: (2516, 2)

Data Types:


Date      datetime64[ns]
S&P500           float64
dtype: object


Missing Values:


Date      0
S&P500    0
dtype: int64

Unnamed: 0,Date,S&P500
count,2516,2516.0
mean,2019-12-10 00:25:45.310015744,3332.64
min,2014-12-10 00:00:00,1829.08
25%,2017-06-11 06:00:00,2416.42
50%,2019-12-09 12:00:00,2991.925
75%,2022-06-08 06:00:00,4189.318
max,2024-12-09 00:00:00,6090.27
std,,1069.172


In [19]:
# Look-back windows, all expressed in trading days
TRADING_DAYS_PER_YEAR = 252
TRADING_DAYS_PER_MONTH = 21  # approximate
VOLATILITY_WINDOW = 30

sp500_level = index_df['S&P500']

# 1. Index level over time
fig = px.line(
    index_df,
    x='Date',
    y='S&P500',
    title='S&P 500 Index Over Time',
    labels={'S&P500': 'Index Value', 'Date': 'Date'},
    height=500,
)
fig.show()

# 2. Percent change versus the level one trading year earlier
index_df['YoY_Return'] = sp500_level.pct_change(periods=TRADING_DAYS_PER_YEAR) * 100
fig = px.line(
    index_df,
    x='Date',
    y='YoY_Return',
    title='S&P 500 Year-over-Year Returns (%)',
    labels={'YoY_Return': 'Return (%)', 'Date': 'Date'},
    height=500,
)
fig.show()

# 3. Histogram of percent change versus the level ~one trading month earlier
index_df['Monthly_Return'] = sp500_level.pct_change(periods=TRADING_DAYS_PER_MONTH) * 100
fig = px.histogram(
    index_df,
    x='Monthly_Return',
    nbins=50,
    title='Distribution of Monthly Returns (%)',
    labels={'Monthly_Return': 'Monthly Return (%)', 'count': 'Frequency'},
    height=400,
)
fig.show()

# 4. Rolling std of daily returns, scaled by sqrt(252) and expressed in percent
daily_return = sp500_level.pct_change()
rolling_std = daily_return.rolling(window=VOLATILITY_WINDOW).std()
index_df['Volatility'] = rolling_std * np.sqrt(TRADING_DAYS_PER_YEAR) * 100
fig = px.line(
    index_df,
    x='Date',
    y='Volatility',
    title='S&P 500 Annualized Volatility (30-day Rolling)',
    labels={'Volatility': 'Volatility (%)', 'Date': 'Date'},
    height=500,
)
fig.show()

In [22]:
# Headline performance statistics for the index over the full sample.
# NOTE(review): first/last rows are taken as the start/end of the period, so this
# assumes index_df is sorted chronologically — confirm if the CSV source changes.
first_level = index_df['S&P500'].iloc[0]
last_level = index_df['S&P500'].iloc[-1]
total_return = ((last_level / first_level) - 1) * 100  # in percent

# Derive the holding period from the data instead of hard-coding 10 years, so the
# annualized figure stays correct if the dataset is refreshed or truncated.
# pd.to_datetime is a no-op on an already-parsed column and makes this cell
# independent of whether the Date conversion cell has been run.
sample_dates = pd.to_datetime(index_df['Date'])
years_elapsed = (sample_dates.iloc[-1] - sample_dates.iloc[0]).days / 365.25

# Geometric annualization; stored as a fraction (multiplied by 100 only when printed).
# A zero-length span (e.g. a single-row frame) has no meaningful annual rate.
if years_elapsed > 0:
    annual_return = (1 + total_return/100) ** (1/years_elapsed) - 1
else:
    annual_return = np.nan

# Means skip the leading NaNs produced by pct_change / rolling
avg_monthly_return = index_df['Monthly_Return'].mean()
avg_volatility = index_df['Volatility'].mean()

print(f"Total Return: {total_return:.2f}%")
print(f"Annualized Return: {annual_return*100:.2f}%")
print(f"Average Monthly Return: {avg_monthly_return:.2f}%")
print(f"Average Annualized Volatility: {avg_volatility:.2f}%")

Total Return: 198.74%
Annualized Return: 11.57%
Average Monthly Return: 1.01%
Average Annualized Volatility: 15.09%


## S&P 500 Index Dataset Exploration

1. **Dataset Structure**:
   - Time series data spanning 2,516 trading days
   - Daily frequency (excluding weekends and holidays)
   - Perfect data completeness (no missing values)

2. **Index Performance**:
   - Starting value: 2,026.14 (December 2014)
   - Ending value: 6,052.85 (December 2024)
   - Total Return: 198.74%
   - Annualized Return: 11.57%
   - Key Market Events Visible in Data:
     * March 2020: COVID-19 crash (sharp decline and recovery)
     * 2022: Inflation-driven market correction
     * 2023-2024: AI-driven tech rally

3. **Volatility Analysis**:
   - Average Annualized Volatility: 15.09%
   - Volatility Spikes During:
     * COVID-19 crisis (March 2020): Highest volatility period
     * Late 2022: Inflation/rate hike concerns
     * Early 2024: Market uncertainty
   - Periods of Low Volatility:
     * 2017: Notably calm market conditions
     * Post-COVID recovery period

4. **Return Distribution**:
   - Average Monthly Return: 1.01% (mean of rolling 21-trading-day returns)
   - Return distribution shows slight negative skew
   - Presence of "fat tails" indicating more extreme events than normal distribution
   - Largest single-day movements coincide with major market events

5. **Visualization Choices and Purpose**:

   a) **S&P 500 Index Time Series Plot**
      - Purpose: Provides the fundamental view of market performance over time
      - Shows the overall trend, major market events, and growth trajectory
      - Interactive features allow detailed examination of specific periods

   b) **Year-over-Year Returns Plot**
      - Purpose: Illustrates the rolling annual performance
      - Helps identify long-term trends and cycles
      - More stable than daily or monthly returns for trend analysis

   c) **Monthly Returns Distribution Histogram**
      - Purpose: Shows the statistical properties of market returns
      - Reveals the frequency of different return magnitudes
      - Helps assess the normality of returns and presence of extreme events

   d) **Annualized Volatility Plot**
      - Purpose: Tracks market risk over time
      - 30-day rolling window provides a balance between responsiveness and stability
      - Annualized to provide standardized risk measurement
      - Crucial for identifying periods of market stress and calm

This comprehensive analysis provides insights into both the long-term behavior of the S&P 500 and its short-term dynamics, capturing various market regimes and major economic events over the past decade.