# S&P 500 Prices Scraping

## Import Libraries

In [9]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

## Make The Request From Yahoo Finance and Get The Table HTML

In [10]:
url = "https://finance.yahoo.com/quote/%5EGSPC/history/?period1=-1325583000&period2=1745262138"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
r = requests.get(url, headers=headers)

c = r.text
soup=BeautifulSoup(c, "lxml")

table = soup.find("table" , {"class":"table yf-1jecxey noDl hideOnPrint"})

## Get The Headers of The Table

In [11]:
headers = table.find_all("th" , {"class":"yf-1jecxey"})

titles = []

for i in headers:
    titles.append(i.text.split("  ")[0])
    
df = pd.DataFrame(columns = titles)
df

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume


## Get The Content of The Table

In [12]:
rows_con = table.find_all("tr" , {"class":"yf-1jecxey"})
for i in rows_con[1:]:
    data = i.find_all("td" , {"class":"yf-1jecxey"})
    row = [tr.text for tr in data]
    new_row_df = pd.DataFrame([row], columns=df.columns)  # Ensure columns match
    df = pd.concat([df, new_row_df], ignore_index=True)
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,"Apr 21, 2025",5232.94,5232.94,5101.63,5116.57,5116.57,1789812000
1,"Apr 17, 2025",5305.45,5328.31,5255.58,5282.7,5282.7,4714880000
2,"Apr 16, 2025",5335.75,5367.24,5220.79,5275.7,5275.7,4607750000
3,"Apr 15, 2025",5411.99,5450.41,5386.44,5396.63,5396.63,4317110000
4,"Apr 14, 2025",5441.96,5459.46,5358.02,5405.97,5405.97,5031440000


## Data Preprocessing

In [13]:
df.head(10)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,"Apr 21, 2025",5232.94,5232.94,5101.63,5116.57,5116.57,1789812000
1,"Apr 17, 2025",5305.45,5328.31,5255.58,5282.7,5282.7,4714880000
2,"Apr 16, 2025",5335.75,5367.24,5220.79,5275.7,5275.7,4607750000
3,"Apr 15, 2025",5411.99,5450.41,5386.44,5396.63,5396.63,4317110000
4,"Apr 14, 2025",5441.96,5459.46,5358.02,5405.97,5405.97,5031440000
5,"Apr 11, 2025",5255.56,5381.46,5220.77,5363.36,5363.36,5602550000
6,"Apr 10, 2025",5353.15,5353.15,5115.27,5268.05,5268.05,6677140000
7,"Apr 9, 2025",4965.28,5481.34,4948.43,5456.9,5456.9,9489600000
8,"Apr 8, 2025",5193.57,5267.47,4910.42,4982.77,4982.77,7408140000
9,"Apr 7, 2025",4953.79,5246.57,4835.04,5062.25,5062.25,8691980000


In [14]:
df.shape

(24441, 7)

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24441 entries, 0 to 24440
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Date       24441 non-null  object
 1   Open       24441 non-null  object
 2   High       24441 non-null  object
 3   Low        24441 non-null  object
 4   Close      24441 non-null  object
 5   Adj Close  24441 non-null  object
 6   Volume     24441 non-null  object
dtypes: object(7)
memory usage: 1.3+ MB


## Date Format

In [16]:
df['Date'] = pd.to_datetime(df['Date'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24441 entries, 0 to 24440
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Date       24441 non-null  datetime64[ns]
 1   Open       24441 non-null  object        
 2   High       24441 non-null  object        
 3   Low        24441 non-null  object        
 4   Close      24441 non-null  object        
 5   Adj Close  24441 non-null  object        
 6   Volume     24441 non-null  object        
dtypes: datetime64[ns](1), object(6)
memory usage: 1.3+ MB


## Numeric Data Format

In [17]:
numbers = df.select_dtypes(include = ["object"]).columns
numbers

Index(['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'], dtype='object')

In [18]:
for col in numbers:
    df[col] = df[col].str.replace("," , "")
    df[col] = pd.to_numeric(df[col], errors='coerce')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24441 entries, 0 to 24440
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Date       24441 non-null  datetime64[ns]
 1   Open       24441 non-null  float64       
 2   High       24441 non-null  float64       
 3   Low        24441 non-null  float64       
 4   Close      24441 non-null  float64       
 5   Adj Close  24441 non-null  float64       
 6   Volume     18945 non-null  float64       
dtypes: datetime64[ns](1), float64(6)
memory usage: 1.3 MB


In [20]:
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2025-04-21,5232.94,5232.94,5101.63,5116.57,5116.57,1789812000.0
1,2025-04-17,5305.45,5328.31,5255.58,5282.7,5282.7,4714880000.0
2,2025-04-16,5335.75,5367.24,5220.79,5275.7,5275.7,4607750000.0
3,2025-04-15,5411.99,5450.41,5386.44,5396.63,5396.63,4317110000.0
4,2025-04-14,5441.96,5459.46,5358.02,5405.97,5405.97,5031440000.0


In [21]:
df = df.drop(columns = ['Open', 'High', 'Low', 'Close', 'Volume'])
df = df.set_index('Date')
df = df.asfreq('b', method = 'ffill')

In [22]:
df.head()

Unnamed: 0_level_0,Adj Close
Date,Unnamed: 1_level_1
1927-12-30,17.66
1928-01-02,17.76
1928-01-03,17.76
1928-01-04,17.72
1928-01-05,17.55


In [23]:
df.tail()

Unnamed: 0_level_0,Adj Close
Date,Unnamed: 1_level_1
2025-04-15,5396.63
2025-04-16,5275.7
2025-04-17,5282.7
2025-04-18,5116.57
2025-04-21,5116.57


# Save The Data In CSV File

In [24]:
df.to_csv("S&P500_close.csv")