# Gold Prices Scraping

## Import Libraries

In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

## Request the Yahoo Finance Page and Extract the Table HTML

In [2]:
# Yahoo Finance price-history page for the GC=F symbol (URL-encoded as GC%3DF).
# NOTE(review): period1/period2 look like Unix-epoch bounds of the date range —
# confirm against Yahoo's URL scheme before changing them.
url = "https://finance.yahoo.com/quote/GC%3DF/history/?period1=967608000&period2=1729276113"
# A browser-like User-Agent is sent along with the request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
# Bound the wait so a stalled connection cannot hang the notebook, and fail
# loudly on an HTTP error instead of parsing an error page as if it were data.
r = requests.get(url, headers=headers, timeout=30)
r.raise_for_status()

c = r.text
soup = BeautifulSoup(c, "lxml")

# The class string is tied to the page's current markup; if it changes, find()
# returns None, so stop here with a clear message rather than failing later
# with an AttributeError on None.
table = soup.find("table", {"class": "table yf-ewueuo noDl"})
if table is None:
    raise RuntimeError(
        "Price history table not found; the page markup may have changed."
    )

## Get The Headers of The Table

In [3]:
# All header cells (<th>) of the scraped table.
headers = table.find_all("th", {"class": "yf-ewueuo"})

# Keep only the text before the first double space in each header cell.
# NOTE(review): presumably this drops trailing helper text inside the cell —
# verify against the live page.
titles = [cell.text.split("  ")[0] for cell in headers]

# Empty frame whose columns are the table headings; rows are added later.
df = pd.DataFrame(columns=titles)
df

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume


## Get The Content of The Table

In [4]:
# Every <tr> of the table; the first one is skipped below ([1:]).
rows_con = table.find_all("tr", {"class": "yf-ewueuo"})

# Collect the cell text of every row first and build the frame once.
# Concatenating one single-row frame per iteration copies the whole frame each
# time (quadratic) and also appends duplicates when the cell is re-run.
records = [
    [cell.text for cell in tr.find_all("td", {"class": "yf-ewueuo"})]
    for tr in rows_con[1:]
]

# Rows whose cell count differs from the header cannot be aligned with the
# columns; skip them instead of aborting the whole scrape.
expected_width = len(df.columns)
records = [record for record in records if len(record) == expected_width]

# Reuse the column labels already held by df, so re-running yields the same frame.
df = pd.DataFrame(records, columns=df.columns)
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,"Oct 18, 2024",2707.8,2737.8,2707.3,2736.4,2736.4,171111
1,"Oct 17, 2024",2677.4,2691.7,2677.2,2691.0,2691.0,6
2,"Oct 16, 2024",2674.0,2674.0,2674.0,2674.0,2674.0,6
3,"Oct 15, 2024",2661.4,2661.4,2661.4,2661.4,2661.4,39
4,"Oct 14, 2024",2655.0,2655.3,2647.8,2647.8,2647.8,40


## Data Preprocessing

In [5]:
# Preview the first ten rows of the scraped frame.
df.head(10)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,"Oct 18, 2024",2707.8,2737.8,2707.3,2736.4,2736.4,171111
1,"Oct 17, 2024",2677.4,2691.7,2677.2,2691.0,2691.0,6
2,"Oct 16, 2024",2674.0,2674.0,2674.0,2674.0,2674.0,6
3,"Oct 15, 2024",2661.4,2661.4,2661.4,2661.4,2661.4,39
4,"Oct 14, 2024",2655.0,2655.3,2647.8,2647.8,2647.8,40
5,"Oct 11, 2024",2638.3,2658.1,2638.2,2657.6,2657.6,12
6,"Oct 10, 2024",2602.5,2628.3,2602.5,2620.6,2620.6,320
7,"Oct 9, 2024",2603.0,2607.7,2603.0,2606.0,2606.0,152
8,"Oct 8, 2024",2639.0,2639.0,2609.3,2615.0,2615.0,687
9,"Oct 7, 2024",2648.7,2657.4,2639.0,2644.8,2644.8,284


In [6]:
# (rows, columns) of the scraped frame.
df.shape

(6056, 7)

In [7]:
# Show each column's dtype and non-null count to see what needs converting.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6056 entries, 0 to 6055
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Date       6056 non-null   object
 1   Open       6056 non-null   object
 2   High       6056 non-null   object
 3   Low        6056 non-null   object
 4   Close      6056 non-null   object
 5   Adj Close  6056 non-null   object
 6   Volume     6056 non-null   object
dtypes: object(7)
memory usage: 331.3+ KB


## Date Format

In [8]:
# Parse the scraped date strings (e.g. "Oct 18, 2024") into datetime64 values.
# The format is given explicitly so parsing does not depend on per-value
# inference and a differently formatted value is reported instead of guessed.
df['Date'] = pd.to_datetime(df['Date'], format="%b %d, %Y")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6056 entries, 0 to 6055
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Date       6056 non-null   datetime64[ns]
 1   Open       6056 non-null   object        
 2   High       6056 non-null   object        
 3   Low        6056 non-null   object        
 4   Close      6056 non-null   object        
 5   Adj Close  6056 non-null   object        
 6   Volume     6056 non-null   object        
dtypes: datetime64[ns](1), object(6)
memory usage: 331.3+ KB


## Numeric Data Format

In [9]:
# Every column except Date holds numeric values stored as text.
# Selecting by name (rather than by "object" dtype) gives the same columns no
# matter which conversion cells have already run, so this cell can be re-run.
numbers = df.columns.drop("Date")
numbers

Index(['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'], dtype='object')

In [10]:
# Convert the text columns listed in `numbers` to numeric dtype.
for col in numbers:
    # astype(str) keeps the cell re-runnable: on a second run the column is
    # already float and would otherwise have no .str accessor.
    # regex=False removes the thousands separator as a literal comma.
    cleaned = df[col].astype(str).str.replace(",", "", regex=False)
    # errors='coerce' turns values that are not numbers into NaN (the recorded
    # output shows Volume ending up with fewer non-null entries than rows).
    df[col] = pd.to_numeric(cleaned, errors='coerce')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6056 entries, 0 to 6055
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Date       6056 non-null   datetime64[ns]
 1   Open       6056 non-null   float64       
 2   High       6056 non-null   float64       
 3   Low        6056 non-null   float64       
 4   Close      6056 non-null   float64       
 5   Adj Close  6056 non-null   float64       
 6   Volume     5643 non-null   float64       
dtypes: datetime64[ns](1), float64(6)
memory usage: 331.3 KB


In [11]:
# Preview the frame after the date and numeric conversions.
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2024-10-18,2707.8,2737.8,2707.3,2736.4,2736.4,171111.0
1,2024-10-17,2677.4,2691.7,2677.2,2691.0,2691.0,6.0
2,2024-10-16,2674.0,2674.0,2674.0,2674.0,2674.0,6.0
3,2024-10-15,2661.4,2661.4,2661.4,2661.4,2661.4,39.0
4,2024-10-14,2655.0,2655.3,2647.8,2647.8,2647.8,40.0


## Save the Data to a CSV File

In [12]:
# Write the frame to gold.csv (relative path); index=False omits the row index column.
df.to_csv("gold.csv" , index=False)