# Module 1: Introduction and Data Sources

In [1]:
!pip install yfinance



In [2]:
import pandas as pd
from datetime import datetime

import yfinance as yf

### Question 1: [Index] S&P 500 Stocks Added to the Index

**Using the list of S&P 500 companies from Wikipedia's [S&P 500 companies page](https://en.wikipedia.org/wiki/List_of_S%26P_500_companies), download the data including the year each company was added to the index.**

In [3]:
url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"

# The first table on the page contains the S&P 500 companies
sp500_table = pd.read_html(url)[0]

# Display the first few rows
sp500_table.head()

Unnamed: 0,Symbol,Security,GICS Sector,GICS Sub-Industry,Headquarters Location,Date added,CIK,Founded
0,MMM,3M,Industrials,Industrial Conglomerates,"Saint Paul, Minnesota",1957-03-04,66740,1902
1,AOS,A. O. Smith,Industrials,Building Products,"Milwaukee, Wisconsin",2017-07-26,91142,1916
2,ABT,Abbott Laboratories,Health Care,Health Care Equipment,"North Chicago, Illinois",1957-03-04,1800,1888
3,ABBV,AbbVie,Health Care,Biotechnology,"North Chicago, Illinois",2012-12-31,1551152,2013 (1888)
4,ACN,Accenture,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,1467373,1989


In [4]:
sp500 = sp500_table.rename(columns={
    'Symbol': 'comp_ticker',
    'Security': 'comp_name',
    'Date added': 'date_added'
})

sp500 = sp500[['comp_ticker', 'comp_name', 'date_added']]
sp500.head()

Unnamed: 0,comp_ticker,comp_name,date_added
0,MMM,3M,1957-03-04
1,AOS,A. O. Smith,2017-07-26
2,ABT,Abbott Laboratories,1957-03-04
3,ABBV,AbbVie,2012-12-31
4,ACN,Accenture,2011-07-06


In [5]:
sp500['year_added'] = pd.to_datetime(sp500['date_added']).dt.year
yearly_counts = sp500.groupby('year_added')['comp_ticker'].count().reset_index()
yearly_counts.rename(columns={'comp_ticker': 'count_of_stocks'}, inplace=True)
yearly_counts.head()

Unnamed: 0,year_added,count_of_stocks
0,1957,53
1,1964,1
2,1965,2
3,1969,2
4,1970,2


In [6]:
max_value_stocks = yearly_counts[yearly_counts['year_added'] != 1957]['count_of_stocks'].max()
max_value_year = yearly_counts[yearly_counts['count_of_stocks'] == max_value_stocks]['year_added'].values[1]

print("The most recent year with the highest number of stock additions: ", max_value_year)

The most recent year with the highest number of stock additions:  2017


*Additional*: How many current S&P 500 stocks have been in the index for more than 20 years? When stocks are added to the S&P 500, they usually experience a price bump as investors and index funds buy shares following the announcement.

In [7]:
current_year = datetime.now().year
stocks_over_20_years = yearly_counts[current_year - yearly_counts['year_added'] > 20]
total_stocks_over_20_years = stocks_over_20_years['count_of_stocks'].sum()

print("Total current S&P 500 stocks in the index for more than 20 years:", total_stocks_over_20_years)

Total current S&P 500 stocks in the index for more than 20 years: 219


### Question 2. [Macro] Indexes YTD (as of 1 May 2025)

**How many indexes (out of 10) have better year-to-date returns than the US (S&P 500) as of May 1, 2025?**

Using Yahoo Finance World Indices data, compare the year-to-date (YTD) performance (1 January-1 May 2025) of major stock market indexes for the following countries:
* United States - S&P 500 (^GSPC)
* China - Shanghai Composite (000001.SS)
* Hong Kong - HANG SENG INDEX (^HSI)
* Australia - S&P/ASX 200 (^AXJO)
* India - Nifty 50 (^NSEI)
* Canada - S&P/TSX Composite (^GSPTSE)
* Germany - DAX (^GDAXI)
* United Kingdom - FTSE 100 (^FTSE)
* Japan - Nikkei 225 (^N225)
* Mexico - IPC Mexico (^MXX)
* Brazil - Ibovespa (^BVSP)


In [8]:
indices = {
    "S&P 500 (US)": "^GSPC",
    "Shanghai Composite (China)": "000001.SS",
    "Hang Seng Index (Hong Kong)": "^HSI",
    "S&P/ASX 200 (Australia)": "^AXJO",
    "Nifty 50 (India)": "^NSEI",
    "S&P/TSX Composite (Canada)": "^GSPTSE",
    "DAX (Germany)": "^GDAXI",
    "FTSE 100 (UK)": "^FTSE",
    "Nikkei 225 (Japan)": "^N225",
    "IPC Mexico": "^MXX",
    "Ibovespa (Brazil)": "^BVSP"
}

data = yf.download(list(indices.values()), start="2025-01-01", end="2025-05-01", group_by='ticker', progress=False)

returns = {}
for name, symbol in indices.items():
    close_prices = data[symbol]['Close'].dropna()
    first = close_prices.iloc[0]
    last = close_prices.iloc[-1]
    ytd_return = ((last - first) / first) * 100
    returns[name] = round(ytd_return, 2)


returns_df = pd.DataFrame.from_dict(returns, orient='index', columns=['YTD % Return'])
returns_df

YF.download() has changed argument auto_adjust default to True


Unnamed: 0,YTD % Return
S&P 500 (US),-5.1
Shanghai Composite (China),0.5
Hang Seng Index (Hong Kong),12.72
S&P/ASX 200 (Australia),-0.91
Nifty 50 (India),2.49
S&P/TSX Composite (Canada),-0.23
DAX (Germany),12.35
FTSE 100 (UK),2.84
Nikkei 225 (Japan),-8.3
IPC Mexico,13.05


In [9]:
better_than_sp500 = returns_df[returns_df['YTD % Return'] > -5.10]

print("Number of indices that outperformed the S&P 500 YTD as of May 1, 2025:", len(better_than_sp500))
print("\nOutperformers:\n", better_than_sp500)

Number of indices that outperformed the S&P 500 YTD as of May 1, 2025: 9

Outperformers:
                              YTD % Return
Shanghai Composite (China)           0.50
Hang Seng Index (Hong Kong)         12.72
S&P/ASX 200 (Australia)             -0.91
Nifty 50 (India)                     2.49
S&P/TSX Composite (Canada)          -0.23
DAX (Germany)                       12.35
FTSE 100 (UK)                        2.84
IPC Mexico                          13.05
Ibovespa (Brazil)                   12.44


Reference: Yahoo Finance World Indices - https://finance.yahoo.com/world-indices/

*Additional*: How many of these indexes have better returns than the S&P 500 over 3, 5, and 10 year periods? Do you see the same trend?
Note: For simplicity, ignore currency conversion effects.

In [10]:
end_date = "2025-05-01"
periods = {
    "3Y": "2022-05-01",
    "5Y": "2020-05-01",
    "10Y": "2015-05-01"
}

data = yf.download(
    tickers=list(indices.values()),
    start="2015-05-01", #earliest start
    end=end_date,
    group_by="ticker",
    progress=False
)

results = {}

for name, symbol in indices.items():
    try:
        df = data[symbol]['Close'].dropna()
        df = df.sort_index()
        row = {}
        for label, start_date in periods.items():
            start_price = df[df.index >= start_date].iloc[0]
            end_price = df[df.index <= end_date].iloc[-1]
            change = ((end_price - start_price) / start_price) * 100
            row[label] = round(change, 2)
        results[name] = row
    except Exception as e:
        results[name] = {label: None for label in periods}

returns_df = pd.DataFrame(results).T
returns_df.columns = ["3-Year %", "5-Year %", "10-Year %"]
returns_df = returns_df.sort_values(by="10-Year %", ascending=False)

returns_df

Unnamed: 0,3-Year %,5-Year %,10-Year %
Nifty 50 (India),42.56,161.84,192.06
S&P 500 (US),34.02,96.74,164.15
Ibovespa (Brazil),26.66,71.24,135.5
DAX (Germany),61.4,114.94,93.61
Nikkei 225 (Japan),34.4,83.72,84.55
S&P/TSX Composite (Canada),20.05,69.91,61.94
S&P/ASX 200 (Australia),10.61,54.91,39.76
IPC Mexico,8.43,54.68,24.36
FTSE 100 (UK),12.35,47.4,21.6
Hang Seng Index (Hong Kong),4.82,-6.33,-21.35


In [11]:
sp500_3y = returns_df.loc["S&P 500 (US)", "3-Year %"]
sp500_5y = returns_df.loc["S&P 500 (US)", "5-Year %"]
sp500_10y = returns_df.loc["S&P 500 (US)", "10-Year %"]

outperformers = {
    "3Y": returns_df[returns_df["3-Year %"] > sp500_3y],
    "5Y": returns_df[returns_df["5-Year %"] > sp500_5y],
    "10Y": returns_df[returns_df["10-Year %"] > sp500_10y],
}

print("Outperformers over 3 years:\n", outperformers["3Y"])
print("\nOutperformers over 5 years:\n", outperformers["5Y"])
print("\nOutperformers over 10 years:\n", outperformers["10Y"])

Outperformers over 3 years:
                     3-Year %  5-Year %  10-Year %
Nifty 50 (India)       42.56    161.84     192.06
DAX (Germany)          61.40    114.94      93.61
Nikkei 225 (Japan)     34.40     83.72      84.55

Outperformers over 5 years:
                   3-Year %  5-Year %  10-Year %
Nifty 50 (India)     42.56    161.84     192.06
DAX (Germany)        61.40    114.94      93.61

Outperformers over 10 years:
                   3-Year %  5-Year %  10-Year %
Nifty 50 (India)     42.56    161.84     192.06


India and Germany indices seem to be doing better than US's S&P 500 for the last 10 years.

### Question 3. [Index] S&P 500 Market Corrections Analysis


**Calculate the median duration (in days) of significant market corrections in the S&P 500 index.**

For this task, define a correction as an event when a stock index goes down by **more than 5%** from the closest all-time high maximum.

In [12]:
sp500_data = yf.Ticker("^GSPC")

sp500_df = sp500_data.history(
    start="1950-01-01",
    end=None,
    interval="1d"
)


sp500_df.index = sp500_df.index.date

sp500_df.head()

Unnamed: 0,Open,High,Low,Close,Volume,Dividends,Stock Splits
1950-01-03,16.66,16.66,16.66,16.66,1260000,0.0,0.0
1950-01-04,16.85,16.85,16.85,16.85,1890000,0.0,0.0
1950-01-05,16.93,16.93,16.93,16.93,2550000,0.0,0.0
1950-01-06,16.98,16.98,16.98,16.98,2010000,0.0,0.0
1950-01-09,17.08,17.08,17.08,17.08,2520000,0.0,0.0


In [13]:
sp500_df["AllTimeHigh"] = sp500_df["Close"].cummax()
all_time_highs = sp500_df[sp500_df["Close"] == sp500_df["AllTimeHigh"]].reset_index(names="Date")
all_time_highs

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,AllTimeHigh
0,1950-01-03,16.660000,16.660000,16.660000,16.660000,1260000,0.0,0.0,16.660000
1,1950-01-04,16.850000,16.850000,16.850000,16.850000,1890000,0.0,0.0,16.850000
2,1950-01-05,16.930000,16.930000,16.930000,16.930000,2550000,0.0,0.0,16.930000
3,1950-01-06,16.980000,16.980000,16.980000,16.980000,2010000,0.0,0.0,16.980000
4,1950-01-09,17.080000,17.080000,17.080000,17.080000,2520000,0.0,0.0,17.080000
...,...,...,...,...,...,...,...,...,...
1469,2024-12-04,6069.390137,6089.839844,6061.060059,6086.490234,4003390000,0.0,0.0,6086.490234
1470,2024-12-06,6081.379883,6099.970215,6079.979980,6090.270020,3924830000,0.0,0.0,6090.270020
1471,2025-01-23,6076.319824,6118.729980,6074.669922,6118.709961,4432250000,0.0,0.0,6118.709961
1472,2025-02-18,6121.600098,6129.629883,6099.509766,6129.580078,4684980000,0.0,0.0,6129.580078


In [14]:
results = []

for i in range(len(all_time_highs) - 1):
    high1_date = all_time_highs.loc[i, "Date"]
    high1_value = all_time_highs.loc[i, "Close"]
    high2_date = all_time_highs.loc[i + 1, "Date"]

    subset = sp500_df.loc[(sp500_df.index > high1_date) & (sp500_df.index < high2_date)]

    if not subset.empty:
        min_close = subset["Close"].min()
        min_date = subset["Close"].idxmin()
        drawdown = ((high1_value - min_close) / high1_value) * 100
        total_duration = (min_date - high1_date).days


        results.append({
            "High 1 Date": high1_date,
            "High 1 Value": high1_value,
            "Min Date Between": min_date,
            "Min Close Between": min_close,
            "Drawdown (%)": round(drawdown, 1),
            "Total Duration (days)": total_duration

        })

drawdowns_df = pd.DataFrame(results)

corrections = drawdowns_df[drawdowns_df['Drawdown (%)'] >= 5].sort_values(by='Drawdown (%)', ascending=False)
corrections.head(10)

Unnamed: 0,High 1 Date,High 1 Value,Min Date Between,Min Close Between,Drawdown (%),Total Duration (days)
448,2007-10-09,1565.150024,2009-03-09,676.530029,56.8,517
443,2000-03-24,1527.459961,2002-10-09,776.76001,49.1,929
206,1973-01-11,120.239998,1974-10-03,62.279999,48.2,630
193,1968-11-29,108.370003,1970-05-26,69.290001,36.1,543
574,2020-02-19,3386.149902,2020-03-23,2237.399902,33.9,33
292,1987-08-25,336.769989,1987-12-04,223.919998,33.5,101
133,1961-12-12,72.639999,1962-06-26,52.32,28.0,196
219,1980-11-28,140.520004,1982-08-12,102.419998,27.1,622
620,2022-01-03,4796.560059,2022-10-12,3577.030029,25.4,282
176,1966-02-09,94.059998,1966-10-07,73.199997,22.2,240


In [15]:
durations = corrections["Total Duration (days)"]

print("Correction Duration Summary (in days):")
print(f"25th Percentile: {durations.quantile(0.25)}")
print(f"Median:          {durations.median()}")
print(f"75th Percentile: {durations.quantile(0.75)}")

Correction Duration Summary (in days):
25th Percentile: 21.0
Median:          39.0
75th Percentile: 87.0


### Question 4.  [Stocks] Earnings Surprise Analysis for Amazon (AMZN)


**Calculate the median 2-day percentage change in stock prices following positive earnings surprises days.**

**Earnings Surprise**

In [120]:
url = "https://raw.githubusercontent.com/DataTalksClub/stock-markets-analytics-zoomcamp/refs/heads/main/cohorts/2025/ha1_Amazon.csv"
earnings_df = pd.read_csv(url, sep=';')
earnings_df.head()

Unnamed: 0,Symbol,Company,Earnings Date,EPS Estimate,Reported EPS,Surprise (%)
0,AMZN,Amazon.com Inc,"April 29, 2026 at 6 AM EDT",-,-,-
1,AMZN,Amazon.com Inc,"February 4, 2026 at 4 PM EST",-,-,-
2,AMZN,Amazon.com Inc,"October 29, 2025 at 6 AM EDT",-,-,-
3,AMZN,Amazon.com Inc,"July 30, 2025 at 4 PM EDT",-,-,-
4,AMZN,"Amazon.com, Inc.","May 1, 2025 at 4 PM EDT",???.36,???.59,+16.74


In [121]:
earnings_df = earnings_df.dropna()

In [122]:
earnings_df = earnings_df.rename(columns={
    'Earnings Date': 'Earnings_Date',
    'Surprise (%)': 'Surprise_Change',
})

In [123]:
earnings_df['Earnings_Date'] = earnings_df['Earnings_Date'].str.split(' at').str[0]
earnings_df['Earnings_Date'] = pd.to_datetime(earnings_df['Earnings_Date'])

In [124]:
earnings_df['Type_Surprise'] = earnings_df['Surprise_Change'].apply(
    lambda x: 'positive' if '+' in x else ('negative' if '-' in x else 'NaN')
)

In [125]:
earnings_df['Surprise_Change'] = earnings_df['Surprise_Change'].str.replace('+', '', regex=False)
earnings_df['Surprise_Change'] = earnings_df['Surprise_Change'].str.replace('-', '', regex=False)

# Optional: Convert back to float
earnings_df['Surprise_Change'] = pd.to_numeric(earnings_df['Surprise_Change'], errors='coerce')

In [126]:
earnings_df = earnings_df.dropna(subset=["Surprise_Change"]).reset_index(drop=True)
earnings_df

Unnamed: 0,Symbol,Company,Earnings_Date,EPS Estimate,Reported EPS,Surprise_Change,Type_Surprise
0,AMZN,"Amazon.com, Inc.",2025-05-01,???.36,???.59,16.74,positive
1,AMZN,"Amazon.com, Inc.",2025-02-06,???.49,???.86,24.47,positive
2,AMZN,"Amazon.com, Inc.",2024-10-31,???.14,???.43,25.17,positive
3,AMZN,"Amazon.com, Inc.",2024-08-01,01.???,???.26,22.58,positive
4,AMZN,"Amazon.com, Inc.",2024-04-30,0.83,0.98,17.91,positive
...,...,...,...,...,...,...,...
107,AMZN,"Amazon.com, Inc.",1998-07-22,-,-,1.34,positive
108,AMZN,"Amazon.com, Inc.",1998-04-27,-,-,13.92,positive
109,AMZN,"Amazon.com, Inc.",1998-01-22,-,-,11.41,positive
110,AMZN,"Amazon.com, Inc.",1997-10-27,-,-,13.29,positive


In [127]:
earnings_df = earnings_df[['Earnings_Date', 'EPS Estimate', 'Reported EPS', 'Surprise_Change', 'Type_Surprise']]

**Historical Price Data**

In [128]:
amzn = yf.Ticker("AMZN")
price_df = amzn.history(period="max")
price_df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1997-05-15 00:00:00-04:00,0.121875,0.125000,0.096354,0.097917,1443120000,0.0,0.0
1997-05-16 00:00:00-04:00,0.098438,0.098958,0.085417,0.086458,294000000,0.0,0.0
1997-05-19 00:00:00-04:00,0.088021,0.088542,0.081250,0.085417,122136000,0.0,0.0
1997-05-20 00:00:00-04:00,0.086458,0.087500,0.081771,0.081771,109344000,0.0,0.0
1997-05-21 00:00:00-04:00,0.081771,0.082292,0.068750,0.071354,377064000,0.0,0.0
...,...,...,...,...,...,...,...
2025-05-19 00:00:00-04:00,201.649994,206.619995,201.259995,206.160004,34314800,0.0,0.0
2025-05-20 00:00:00-04:00,204.630005,205.589996,202.649994,204.070007,29470400,0.0,0.0
2025-05-21 00:00:00-04:00,201.610001,203.460007,200.059998,201.119995,42460900,0.0,0.0
2025-05-22 00:00:00-04:00,201.380005,205.759995,200.160004,203.100006,38938900,0.0,0.0


In [129]:
price_df = price_df[['Close']]
price_df.reset_index(inplace=True)

price_df = price_df.rename(columns={
    'Date': 'Price_Date',
})
price_df['Price_Date'] = pd.to_datetime(price_df['Price_Date']).dt.date

price_df

Unnamed: 0,Price_Date,Close
0,1997-05-15,0.097917
1,1997-05-16,0.086458
2,1997-05-19,0.085417
3,1997-05-20,0.081771
4,1997-05-21,0.071354
...,...,...
7046,2025-05-19,206.160004
7047,2025-05-20,204.070007
7048,2025-05-21,201.119995
7049,2025-05-22,203.100006


In [130]:
price_df["Close_Day3"] = price_df["Close"].shift(-2) # Two days ahead
price_df["2D_Return"] = ((price_df["Close_Day3"] / price_df["Close"]) - 1)*100

price_df

Unnamed: 0,Price_Date,Close,Close_Day3,2D_Return
0,1997-05-15,0.097917,0.085417,-12.765910
1,1997-05-16,0.086458,0.081771,-5.421125
2,1997-05-19,0.085417,0.071354,-16.463936
3,1997-05-20,0.081771,0.069792,-14.649446
4,1997-05-21,0.071354,0.075000,5.109736
...,...,...,...,...
7046,2025-05-19,206.160004,201.119995,-2.444707
7047,2025-05-20,204.070007,203.100006,-0.475328
7048,2025-05-21,201.119995,200.990005,-0.064633
7049,2025-05-22,203.100006,,


In [131]:
price_df = price_df[["Price_Date", "2D_Return"]]
price_df = price_df.dropna(subset=["2D_Return"])

**Merge Dataframes**

In [132]:
earnings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112 entries, 0 to 111
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   Earnings_Date    112 non-null    datetime64[ns]
 1   EPS Estimate     112 non-null    object        
 2   Reported EPS     112 non-null    object        
 3   Surprise_Change  112 non-null    float64       
 4   Type_Surprise    112 non-null    object        
dtypes: datetime64[ns](1), float64(1), object(3)
memory usage: 4.5+ KB


In [133]:
price_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7049 entries, 0 to 7048
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Price_Date  7049 non-null   object 
 1   2D_Return   7049 non-null   float64
dtypes: float64(1), object(1)
memory usage: 165.2+ KB


In [134]:
price_df['Price_Date'] = pd.to_datetime(price_df['Price_Date'])
price_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7049 entries, 0 to 7048
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   Price_Date  7049 non-null   datetime64[ns]
 1   2D_Return   7049 non-null   float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 165.2 KB


In [135]:
earnings_df["Day1_Date"] = earnings_df["Earnings_Date"] - pd.Timedelta(days=1) # Effect/Return from Day 1
earnings_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  earnings_df["Day1_Date"] = earnings_df["Earnings_Date"] - pd.Timedelta(days=1) # Effect/Return from Day 1


Unnamed: 0,Earnings_Date,EPS Estimate,Reported EPS,Surprise_Change,Type_Surprise,Day1_Date
0,2025-05-01,???.36,???.59,16.74,positive,2025-04-30
1,2025-02-06,???.49,???.86,24.47,positive,2025-02-05
2,2024-10-31,???.14,???.43,25.17,positive,2024-10-30
3,2024-08-01,01.???,???.26,22.58,positive,2024-07-31
4,2024-04-30,0.83,0.98,17.91,positive,2024-04-29
...,...,...,...,...,...,...
107,1998-07-22,-,-,1.34,positive,1998-07-21
108,1998-04-27,-,-,13.92,positive,1998-04-26
109,1998-01-22,-,-,11.41,positive,1998-01-21
110,1997-10-27,-,-,13.29,positive,1997-10-26


In [136]:
merged_df = earnings_df.merge(
    price_df,
    left_on="Day1_Date",
    right_on="Price_Date",
    how="left"
)
merged_df

Unnamed: 0,Earnings_Date,EPS Estimate,Reported EPS,Surprise_Change,Type_Surprise,Day1_Date,Price_Date,2D_Return
0,2025-05-01,???.36,???.59,16.74,positive,2025-04-30,2025-04-30,3.014856
1,2025-02-06,???.49,???.86,24.47,positive,2025-02-05,2025-02-05,-2.972437
2,2024-10-31,???.14,???.43,25.17,positive,2024-10-30,2024-10-30,2.698074
3,2024-08-01,01.???,???.26,22.58,positive,2024-07-31,2024-07-31,-10.204301
4,2024-04-30,0.83,0.98,17.91,positive,2024-04-29,2024-04-29,-1.083116
...,...,...,...,...,...,...,...,...
107,1998-07-22,-,-,1.34,positive,1998-07-21,1998-07-21,-3.360132
108,1998-04-27,-,-,13.92,positive,1998-04-26,NaT,
109,1998-01-22,-,-,11.41,positive,1998-01-21,1998-01-21,-2.669462
110,1997-10-27,-,-,13.29,positive,1997-10-26,NaT,


In [147]:
positive_surprise_returns = merged_df[merged_df["Type_Surprise"] == 'positive']
negative_returns = merged_df[merged_df["Type_Surprise"] == "negative"]
positive_surprise_returns[["Day1_Date", "EPS Estimate", "Reported EPS", "Surprise_Change", "2D_Return"]].head()

Unnamed: 0,Day1_Date,EPS Estimate,Reported EPS,Surprise_Change,2D_Return
0,2025-04-30,???.36,???.59,16.74,3.014856
1,2025-02-05,???.49,???.86,24.47,-2.972437
2,2024-10-30,???.14,???.43,25.17,2.698074
3,2024-07-31,01.???,???.26,22.58,-10.204301
4,2024-04-29,0.83,0.98,17.91,-1.083116


In [148]:
positive_median = positive_surprise_returns['2D_Return'].median()
negative_median = negative_returns['2D_Return'].median()

In [149]:
overall_median = merged_df["2D_Return"].median()

In [150]:
print("Median 2-Day Return Comparison")
print(f"All historical dates:        {overall_median:.2}")
print(f"After positive surprises:    {positive_median:.2}")
print(f"After negative surprises:    {negative_median:.2}")

Median 2-Day Return Comparison
All historical dates:        -0.32
After positive surprises:    1.5
After negative surprises:    -4.5
