# Gold Prices Scraping

## Import Libraries

In [20]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

## Make The Request From Yahoo Finance and Get The Table HTML

In [21]:
url = "https://finance.yahoo.com/quote/GC%3DF/history/?period1=967608000&period2=1746645893"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
r = requests.get(url, headers=headers)

c = r.text
soup=BeautifulSoup(c, "lxml")

table = soup.find("table" , {"class":"table yf-1jecxey noDl hideOnPrint"})

## Get The Headers of The Table

In [22]:
headers = table.find_all("th" , {"class":"yf-1jecxey"})

titles = []

for i in headers:
    titles.append(i.text.split("  ")[0])
    
df = pd.DataFrame(columns = titles)
df

AttributeError: 'NoneType' object has no attribute 'find_all'

## Get The Content of The Table

In [22]:
rows_con = table.find_all("tr" , {"class":"yf-1jecxey"})
for i in rows_con[1:]:
    data = i.find_all("td" , {"class":"yf-1jecxey"})
    row = [tr.text for tr in data]
    new_row_df = pd.DataFrame([row], columns=df.columns)  # Ensure columns match
    df = pd.concat([df, new_row_df], ignore_index=True)
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,"Apr 21, 2025",3347.0,3442.3,3344.0,3430.8,3430.8,217966
1,"Apr 17, 2025",3345.0,3345.0,3287.8,3308.7,3308.7,1874
2,"Apr 16, 2025",3238.3,3334.9,3238.3,3326.6,3326.6,1874
3,"Apr 15, 2025",3216.0,3218.7,3214.0,3218.7,3218.7,390
4,"Apr 14, 2025",3215.5,3228.8,3194.5,3204.8,3204.8,263


## Data Preprocessing

In [23]:
df.head(10)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,"Apr 21, 2025",3347.0,3442.3,3344.0,3430.8,3430.8,217966
1,"Apr 17, 2025",3345.0,3345.0,3287.8,3308.7,3308.7,1874
2,"Apr 16, 2025",3238.3,3334.9,3238.3,3326.6,3326.6,1874
3,"Apr 15, 2025",3216.0,3218.7,3214.0,3218.7,3218.7,390
4,"Apr 14, 2025",3215.5,3228.8,3194.5,3204.8,3204.8,263
5,"Apr 11, 2025",3182.1,3235.0,3182.1,3222.2,3222.2,862
6,"Apr 10, 2025",3073.9,3167.0,3072.1,3155.2,3155.2,3456
7,"Apr 9, 2025",2965.8,3090.4,2965.8,3056.5,3056.5,2175
8,"Apr 8, 2025",2994.0,3014.5,2968.4,2968.4,2968.4,3213
9,"Apr 7, 2025",3016.4,3050.8,2949.7,2951.3,2951.3,4424


In [24]:
df.shape

(6181, 7)

In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6181 entries, 0 to 6180
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Date       6181 non-null   object
 1   Open       6181 non-null   object
 2   High       6181 non-null   object
 3   Low        6181 non-null   object
 4   Close      6181 non-null   object
 5   Adj Close  6181 non-null   object
 6   Volume     6181 non-null   object
dtypes: object(7)
memory usage: 338.2+ KB


## Date Format

In [26]:
df['Date'] = pd.to_datetime(df['Date'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6181 entries, 0 to 6180
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Date       6181 non-null   datetime64[ns]
 1   Open       6181 non-null   object        
 2   High       6181 non-null   object        
 3   Low        6181 non-null   object        
 4   Close      6181 non-null   object        
 5   Adj Close  6181 non-null   object        
 6   Volume     6181 non-null   object        
dtypes: datetime64[ns](1), object(6)
memory usage: 338.2+ KB


## Numeric Data Format

In [27]:
numbers = df.select_dtypes(include = ["object"]).columns
numbers

Index(['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'], dtype='object')

In [28]:
for col in numbers:
    df[col] = df[col].str.replace("," , "")
    df[col] = pd.to_numeric(df[col], errors='coerce')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6181 entries, 0 to 6180
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Date       6181 non-null   datetime64[ns]
 1   Open       6181 non-null   float64       
 2   High       6181 non-null   float64       
 3   Low        6181 non-null   float64       
 4   Close      6181 non-null   float64       
 5   Adj Close  6181 non-null   float64       
 6   Volume     5768 non-null   float64       
dtypes: datetime64[ns](1), float64(6)
memory usage: 338.2 KB


In [29]:
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2025-04-21,3347.0,3442.3,3344.0,3430.8,3430.8,217966.0
1,2025-04-17,3345.0,3345.0,3287.8,3308.7,3308.7,1874.0
2,2025-04-16,3238.3,3334.9,3238.3,3326.6,3326.6,1874.0
3,2025-04-15,3216.0,3218.7,3214.0,3218.7,3218.7,390.0
4,2025-04-14,3215.5,3228.8,3194.5,3204.8,3204.8,263.0


In [30]:
df.tail()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
6176,2000-09-06,274.2,274.2,274.2,274.2,274.2,
6177,2000-09-05,275.8,275.8,275.8,275.8,275.8,2.0
6178,2000-09-01,277.0,277.0,277.0,277.0,277.0,
6179,2000-08-31,274.8,278.3,274.8,278.3,278.3,
6180,2000-08-30,273.9,273.9,273.9,273.9,273.9,


In [31]:
df = df.drop(columns = ['Open', 'High', 'Low', 'Close', 'Volume'])

In [32]:
df.head()

Unnamed: 0,Date,Adj Close
0,2025-04-21,3430.8
1,2025-04-17,3308.7
2,2025-04-16,3326.6
3,2025-04-15,3218.7
4,2025-04-14,3204.8


In [33]:
df = df.set_index('Date')

In [34]:
df.head()

Unnamed: 0_level_0,Adj Close
Date,Unnamed: 1_level_1
2025-04-21,3430.8
2025-04-17,3308.7
2025-04-16,3326.6
2025-04-15,3218.7
2025-04-14,3204.8


In [35]:
df = df.asfreq('b', method = 'ffill')

In [36]:
df.head()

Unnamed: 0_level_0,Adj Close
Date,Unnamed: 1_level_1
2000-08-30,273.9
2000-08-31,278.3
2000-09-01,277.0
2000-09-04,275.8
2000-09-05,275.8


In [37]:
df.tail()

Unnamed: 0_level_0,Adj Close
Date,Unnamed: 1_level_1
2025-04-15,3218.7
2025-04-16,3326.6
2025-04-17,3308.7
2025-04-18,3430.8
2025-04-21,3430.8


In [38]:
df.to_csv('gold_close2.csv')

# Save The Data In CSV File

In [17]:
df.to_csv("gold_test.csv" , index=False)