In [None]:
# install the yfinance library
!pip install yfinance



In [None]:
import pandas as pd
import numpy as np

from datetime import datetime as dt

#fin data sources
import yfinance as yf

## Question 1

In [None]:
sp_500_df = pd.read_html(io="https://en.wikipedia.org/wiki/List_of_S%26P_500_companies")[0] # Return first table containing date added feature

In [None]:
sp_500_df['Date added'] = pd.to_datetime(sp_500_df['Date added'])

In [None]:
sp_500_df['Year added'] = sp_500_df['Date added'].apply(lambda x: x.year)

In [None]:
yearly_additions_sp500 = sp_500_df.groupby('Year added').agg(NUM_OF_COMPANIES_ADDED= ('Symbol', 'count')).reset_index()

In [None]:
# 2017 was the most recent year with the highest number of additions (23) to the S&P 500
yearly_additions_sp500.query('`Year added` != 1957').sort_values(by=['NUM_OF_COMPANIES_ADDED', 'Year added'], ascending=False).head(n=1)

Unnamed: 0,Year added,NUM_OF_COMPANIES_ADDED
48,2017,23


In [None]:

current_year = dt.now().year

sp_500_stocks_gr_20_years =  sp_500_df.query(' @current_year -  `Year added` >= 20').shape[0]

print(f"There are {sp_500_stocks_gr_20_years} stocks that have been in the index for more than 20 years")

There are 226 stocks that have been in the index for more than 20 years


## Question 2

In [None]:
def get_ytd_return(ticker_symbol: yf.ticker.Ticker, start_dt: str, end_dt: str) -> float:

  ticker = yf.Ticker(ticker=ticker_symbol)

  ticker_df = ticker.history(start=start_dt, end=end_dt).reset_index().sort_values(by='Date')


  return (ticker_df.tail(n=1)['Close'].values[0] - ticker_df.head(n=1)['Close'].values[0]) / ticker_df.head(n=1)['Close'].values[0]


In [None]:
sp500_ytd_return = get_ytd_return(ticker_symbol="^GSPC", start_dt="2025-01-01", end_dt="2025-05-01")


In [None]:
world_indice_ytd_return_gr_sp500 = {}

for ticker_symbol in ["000001.SS", "^HSI", "^AXJO", "^NSEI", "^GSPTSE", "^GDAXI", "^FTSE", "^N225", "^MXX", "^BVSP"]:

  ticker_ytd_return = get_ytd_return(ticker_symbol=ticker_symbol, start_dt="2025-01-01", end_dt="2025-05-01")

  if ticker_ytd_return > sp500_ytd_return:
    world_indice_ytd_return_gr_sp500[ticker_symbol] = ticker_ytd_return


In [None]:
print(f"The following world indices have a better YTD Return % than the S&P 500 YTD Return % ({sp500_ytd_return:.2%})\n{world_indice_ytd_return_gr_sp500}")

The following world indices have a better YTD Return % than the S&P 500 YTD Return % (-5.10%)
{'000001.SS': np.float64(0.00504817244050002), '^HSI': np.float64(0.12720017835921466), '^AXJO': np.float64(-0.009145002952478493), '^NSEI': np.float64(0.024904237409784914), '^GSPTSE': np.float64(-0.002261257179291509), '^GDAXI': np.float64(0.1234637838149953), '^FTSE': np.float64(0.028425901908435483), '^MXX': np.float64(0.1304944445757032), '^BVSP': np.float64(0.12438709677419355)}


In [None]:
# 3 years

sp500_3_yr_return = get_ytd_return(ticker_symbol="^GSPC", start_dt="2022-05-01", end_dt="2025-05-01")

world_indice_3yr_return_gr_sp500 = {}

for ticker_symbol in ["000001.SS", "^HSI", "^AXJO", "^NSEI", "^GSPTSE", "^GDAXI", "^FTSE", "^N225", "^MXX", "^BVSP"]:

  ticker_3yr_return = get_ytd_return(ticker_symbol=ticker_symbol, start_dt="2022-05-01", end_dt="2025-05-01")

  if ticker_3yr_return > sp500_3_yr_return:
    world_indice_3yr_return_gr_sp500[ticker_symbol] = ticker_3yr_return


In [None]:
print(f"The following world indices have a better 3yr Return % than the S&P 500 YTD Return % ({sp500_3_yr_return:.2%})\n{world_indice_3yr_return_gr_sp500}")

The following world indices have a better 3yr Return % than the S&P 500 YTD Return % (34.02%)
{'^NSEI': np.float64(0.4256287546289044), '^GDAXI': np.float64(0.6139512868785524), '^N225': np.float64(0.3440475615659561)}


In [None]:
# 5 years

sp500_5_yr_return = get_ytd_return(ticker_symbol="^GSPC", start_dt="2020-05-01", end_dt="2025-05-01")

world_indice_5yr_return_gr_sp500 = {}

for ticker_symbol in ["000001.SS", "^HSI", "^AXJO", "^NSEI", "^GSPTSE", "^GDAXI", "^FTSE", "^N225", "^MXX", "^BVSP"]:

  ticker_5yr_return = get_ytd_return(ticker_symbol=ticker_symbol, start_dt="2020-05-01", end_dt="2025-05-01")

  if ticker_5yr_return > sp500_5_yr_return:
    world_indice_5yr_return_gr_sp500[ticker_symbol] = ticker_5yr_return


In [None]:
print(f"The following world indices have a better 5yr Return % than the S&P 500 YTD Return % ({sp500_5_yr_return:.2%})\n{world_indice_5yr_return_gr_sp500}")

The following world indices have a better 5yr Return % than the S&P 500 YTD Return % (96.74%)
{'^NSEI': np.float64(1.6184106331037822), '^GDAXI': np.float64(1.1493656980689406)}


In [None]:
# 10 years

sp500_10_yr_return = get_ytd_return(ticker_symbol="^GSPC", start_dt="2015-05-01", end_dt="2025-05-01")

world_indice_10yr_return_gr_sp500 = {}

for ticker_symbol in ["000001.SS", "^HSI", "^AXJO", "^NSEI", "^GSPTSE", "^GDAXI", "^FTSE", "^N225", "^MXX", "^BVSP"]:

  ticker_10yr_return = get_ytd_return(ticker_symbol=ticker_symbol, start_dt="2015-05-01", end_dt="2025-05-01")

  if ticker_10yr_return > sp500_10_yr_return:
    world_indice_10yr_return_gr_sp500[ticker_symbol] = ticker_10yr_return


In [None]:
print(f"The following world indices have a better 10yr Return % than the S&P 500 YTD Return % ({sp500_10_yr_return:.2%})\n{world_indice_10yr_return_gr_sp500}")

The following world indices have a better 10yr Return % than the S&P 500 YTD Return % (164.15%)
{'^NSEI': np.float64(1.920588655515519)}


## Question 3

In [None]:
# correction = stock index goes down by more than 5% from the closest all-time high max

ticker = yf.Ticker(ticker="^GSPC")

ticker_df = ticker.history(start="1950-01-01", end=dt.now().strftime("%Y-%m-%d")).reset_index().sort_values(by='Date')

In [None]:
ticker_df['Close_High_Points'] = ticker_df['Close'].cummax()

In [None]:
sp500_high_points = ticker_df.groupby('Close_High_Points').agg(FIRST_DT = ('Date', 'min')).reset_index()

In [None]:
sp500_high_points.shape

(1447, 2)

In [89]:
drawdown_lst = []

for idx, row in sp500_high_points.iterrows():
  if idx == 0:
    continue
  else:
    prev_high_point = sp500_high_points.loc[idx-1, 'FIRST_DT']
    current_high_point = sp500_high_points.loc[idx, 'FIRST_DT']
    drawdown_dt_range =  ticker_df.query('Date >= @prev_high_point and Date <= @current_high_point')

    if drawdown_dt_range.shape[0] > 2: # Filter out dates in the early stages where the price is increasing each consecutive day
      min_price = drawdown_dt_range['Close'].min()



      prev_high_price = sp500_high_points.loc[idx-1, 'Close_High_Points']

      drawdown = (prev_high_price - min_price) / prev_high_price * 100.0

      if drawdown >= 5.00:
        min_price_dt = drawdown_dt_range.query('Close == @min_price').sort_values(by='Date', ascending=False).reset_index().loc[0, 'Date']  #Get the last date in the drawdown range with the lowest price, in case the lowest price appears more than once

        drawdown_duration = min_price_dt - prev_high_point

        drawdown_lst.append([prev_high_point, min_price_dt, drawdown_duration, drawdown])



In [90]:
drawdown_df = pd.DataFrame(drawdown_lst, columns=["Max_Close", "Min_Close", "Drawdown_Duration", "Drawdown_Pct"])

In [93]:
drawdown_df.sort_values(by='Drawdown_Pct', ascending=False).head(n=10)

Unnamed: 0,Max_Close,Min_Close,Drawdown_Duration,Drawdown_Pct
56,2007-10-09 00:00:00-04:00,2009-03-09 00:00:00-04:00,517 days 00:00:00,56.775388
54,2000-03-24 00:00:00-05:00,2002-10-09 00:00:00-04:00,928 days 23:00:00,49.146948
24,1973-01-11 00:00:00-05:00,1974-10-03 00:00:00-04:00,629 days 23:00:00,48.203593
22,1968-11-29 00:00:00-05:00,1970-05-26 00:00:00-04:00,542 days 23:00:00,36.061641
65,2020-02-19 00:00:00-05:00,2020-03-23 00:00:00-04:00,32 days 23:00:00,33.92496
35,1987-08-25 00:00:00-04:00,1987-12-04 00:00:00-05:00,101 days 01:00:00,33.509515
15,1961-12-12 00:00:00-05:00,1962-06-26 00:00:00-04:00,195 days 23:00:00,27.973568
27,1980-11-28 00:00:00-05:00,1982-08-12 00:00:00-04:00,621 days 23:00:00,27.113582
68,2022-01-03 00:00:00-05:00,2022-10-12 00:00:00-04:00,281 days 23:00:00,25.425097
18,1966-02-09 00:00:00-05:00,1966-10-07 00:00:00-04:00,239 days 23:00:00,22.177335


In [95]:
drawdown_df['Drawdown_Duration'].describe()

Unnamed: 0,Drawdown_Duration
count,71
mean,113 days 02:38:01.690140846
std,179 days 02:11:26.722984390
min,7 days 00:00:00
25%,21 days 12:00:00
50%,39 days 00:00:00
75%,89 days 00:00:00
max,928 days 23:00:00


## Question 4

In [100]:
aws_earnings = pd.read_csv(r'https://raw.githubusercontent.com/DataTalksClub/stock-markets-analytics-zoomcamp/refs/heads/main/cohorts/2025/ha1_Amazon.csv', delimiter=";", na_values="-")

In [135]:
aws_ticker_obj = yf.Ticker(ticker="AMZN")

aws_price_history = aws_ticker_obj.history(start="1997-05-15").reset_index()

In [136]:
aws_price_history['Close_2_day_lag'] = aws_price_history['Close'].shift(2)

In [137]:
aws_price_history['2_day_price_pct'] = aws_price_history['Close'] / aws_price_history['Close_2_day_lag'] - 1

In [138]:
aws_price_history['Date'] = pd.to_datetime(aws_price_history['Date']).dt.date

In [139]:
aws_positve_earnings_news = aws_earnings.query('`Surprise (%)` > 0').copy()

In [140]:
aws_positve_earnings_news['Earnings Date'] = pd.to_datetime(aws_positve_earnings_news['Earnings Date']).dt.date

  aws_positve_earnings_news['Earnings Date'] = pd.to_datetime(aws_positve_earnings_news['Earnings Date']).dt.date
  aws_positve_earnings_news['Earnings Date'] = pd.to_datetime(aws_positve_earnings_news['Earnings Date']).dt.date
  aws_positve_earnings_news['Earnings Date'] = pd.to_datetime(aws_positve_earnings_news['Earnings Date']).dt.date
  aws_positve_earnings_news['Earnings Date'] = pd.to_datetime(aws_positve_earnings_news['Earnings Date']).dt.date
  aws_positve_earnings_news['Earnings Date'] = pd.to_datetime(aws_positve_earnings_news['Earnings Date']).dt.date
  aws_positve_earnings_news['Earnings Date'] = pd.to_datetime(aws_positve_earnings_news['Earnings Date']).dt.date
  aws_positve_earnings_news['Earnings Date'] = pd.to_datetime(aws_positve_earnings_news['Earnings Date']).dt.date
  aws_positve_earnings_news['Earnings Date'] = pd.to_datetime(aws_positve_earnings_news['Earnings Date']).dt.date
  aws_positve_earnings_news['Earnings Date'] = pd.to_datetime(aws_positve_earnings_news[

In [146]:
price_2_day_change_after_eps_news = []

for earnings_date in aws_positve_earnings_news['Earnings Date'].values:
  price_after_positive_eps_news = aws_price_history[aws_price_history['Date'] > earnings_date].reset_index().loc[0, '2_day_price_pct'] * 100.0
  price_2_day_change_after_eps_news.append(price_after_positive_eps_news)

In [150]:
np.median(price_2_day_change_after_eps_news)

np.float64(1.0405227919253135)

## Question 5

ML-driven approach to derive investment strategy that over 1-year horizon can generate gains in the Europe/America stock markets.

## Question 6

Financial statements, social media sentiment, macro/sector factors