# Oil Prices Scraping

## Import Libraries

In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

  from pandas.core import (


## Make The Request From Yahoo Finance and Get The Table HTML

In [2]:
url = "https://finance.yahoo.com/quote/CL%3DF/history/?period1=967003200&period2=1745260880"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
r = requests.get(url, headers=headers)

c = r.text
soup=BeautifulSoup(c, "lxml")

table = soup.find("table" , {"class":"table yf-1jecxey noDl hideOnPrint"})

## Get The Headers of The Table

In [3]:
headers = table.find_all("th" , {"class":"yf-1jecxey"})

titles = []

for i in headers:
    titles.append(i.text.split("  ")[0])
    
df = pd.DataFrame(columns = titles)
df

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume


## Get The Content of The Table

In [4]:
rows_con = table.find_all("tr" , {"class":"yf-1jecxey"})
for i in rows_con[1:]:
    data = i.find_all("td" , {"class":"yf-1jecxey"})
    row = [tr.text for tr in data]
    new_row_df = pd.DataFrame([row], columns=df.columns)  # Ensure columns match
    df = pd.concat([df, new_row_df], ignore_index=True)
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,"Apr 21, 2025",64.3,64.42,62.45,63.11,63.11,35777
1,"Apr 17, 2025",62.63,64.86,62.61,64.68,64.68,213194
2,"Apr 16, 2025",61.54,62.98,60.44,62.47,62.47,213194
3,"Apr 15, 2025",61.58,62.06,60.88,61.33,61.33,197404
4,"Apr 14, 2025",61.7,62.68,60.59,61.53,61.53,238068


## Data Preprocessing

In [5]:
df.head(10)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,"Apr 21, 2025",64.3,64.42,62.45,63.11,63.11,35777
1,"Apr 17, 2025",62.63,64.86,62.61,64.68,64.68,213194
2,"Apr 16, 2025",61.54,62.98,60.44,62.47,62.47,213194
3,"Apr 15, 2025",61.58,62.06,60.88,61.33,61.33,197404
4,"Apr 14, 2025",61.7,62.68,60.59,61.53,61.53,238068
5,"Apr 11, 2025",60.2,61.87,59.43,61.5,61.5,306231
6,"Apr 10, 2025",62.71,63.34,58.76,60.07,60.07,391826
7,"Apr 9, 2025",58.32,62.93,55.12,62.35,62.35,592250
8,"Apr 8, 2025",61.03,61.75,57.88,59.58,59.58,557655
9,"Apr 7, 2025",61.12,63.9,58.95,60.7,60.7,597617


In [6]:
df.shape

(6190, 7)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6190 entries, 0 to 6189
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Date       6190 non-null   object
 1   Open       6190 non-null   object
 2   High       6190 non-null   object
 3   Low        6190 non-null   object
 4   Close      6190 non-null   object
 5   Adj Close  6190 non-null   object
 6   Volume     6190 non-null   object
dtypes: object(7)
memory usage: 338.6+ KB


## Date Format

In [8]:
df['Date'] = pd.to_datetime(df['Date'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6190 entries, 0 to 6189
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Date       6190 non-null   datetime64[ns]
 1   Open       6190 non-null   object        
 2   High       6190 non-null   object        
 3   Low        6190 non-null   object        
 4   Close      6190 non-null   object        
 5   Adj Close  6190 non-null   object        
 6   Volume     6190 non-null   object        
dtypes: datetime64[ns](1), object(6)
memory usage: 338.6+ KB


## Numeric Data Format

In [9]:
numbers = df.select_dtypes(include = ["object"]).columns
numbers

Index(['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'], dtype='object')

In [10]:
for col in numbers:
    df[col] = df[col].str.replace("," , "")
    df[col] = pd.to_numeric(df[col], errors='coerce')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6190 entries, 0 to 6189
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Date       6190 non-null   datetime64[ns]
 1   Open       6190 non-null   float64       
 2   High       6190 non-null   float64       
 3   Low        6190 non-null   float64       
 4   Close      6190 non-null   float64       
 5   Adj Close  6190 non-null   float64       
 6   Volume     6183 non-null   float64       
dtypes: datetime64[ns](1), float64(6)
memory usage: 338.6 KB


In [11]:
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2025-04-21,64.3,64.42,62.45,63.11,63.11,35777.0
1,2025-04-17,62.63,64.86,62.61,64.68,64.68,213194.0
2,2025-04-16,61.54,62.98,60.44,62.47,62.47,213194.0
3,2025-04-15,61.58,62.06,60.88,61.33,61.33,197404.0
4,2025-04-14,61.7,62.68,60.59,61.53,61.53,238068.0


In [12]:
df = df.drop(columns = ['Open', 'High', 'Low', 'Close', 'Volume'])
df = df.set_index('Date')
df = df.asfreq('b', method = 'ffill')

In [13]:
df.head()

Unnamed: 0_level_0,Adj Close
Date,Unnamed: 1_level_1
2000-08-23,32.05
2000-08-24,31.63
2000-08-25,32.05
2000-08-28,32.87
2000-08-29,32.72


In [14]:
df.tail()

Unnamed: 0_level_0,Adj Close
Date,Unnamed: 1_level_1
2025-04-15,61.33
2025-04-16,62.47
2025-04-17,64.68
2025-04-18,63.11
2025-04-21,63.11


# Save The Data In CSV File

In [15]:
df.to_csv("oil_close.csv")