# Webscraping B3

# Web Scraping B3

This Jupyter notebook demonstrates a web scraping project aimed at extracting financial data from the B3 (Brasil, Bolsa, Balcão) website using Selenium and BeautifulSoup. The notebook includes steps for data extraction, processing, and conversion of scraped dates into a suitable format for analysis.

## Features

- **Web Scraping**: Utilizes Selenium for navigating the B3 website and extracting relevant financial data.
- **Data Processing**: Employs Pandas for data manipulation and BeautifulSoup for parsing HTML.
- **Date Conversion**: Converts the scraped date format into `pd.Timestamp` for better handling and analysis.
- **Data Analysis**: Includes basic data analysis and visualization using Matplotlib and other relevant libraries.

## Prerequisites

Before you begin, ensure you have met the following requirements:

- Python 3.7 or higher
- Jupyter Notebook
- Selenium
- BeautifulSoup4
- Pandas
- Matplotlib
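- ChromeDriver (compatible with your Chrome version)

## Installation

Clone the repository and move into the project folder:

   git clone <repository-url>/webscraping-b3.git
   cd webscraping-b3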


In [1]:
!pip install selenium 
!pip install webdriver-manager



In [2]:
from datetime import datetime
from io import StringIO

import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service    # helps manage the driver
from webdriver_manager.chrome import ChromeDriverManager

In [3]:
# webdriver-manager fetches a chromedriver binary and returns its local path, which avoids
# installing and version-matching the driver by hand.
chromedriver_path = ChromeDriverManager().install()
driver = webdriver.Chrome(service=Service(chromedriver_path))

## Scraping Settings

In [4]:
# Trading session to query, written day/month/year; it is inserted verbatim into the
# `Data=` query parameter of the request URL.
date = '27/05/2024'


In [5]:
# Product code sent in the `Mercadoria=` query parameter of the request URL.
product = "DI1"

In [6]:
# Build the report URL for the chosen session date and product.
# Adjacent string literals are concatenated, so the URL stays on one logical line: the
# previous triple-quoted form embedded line breaks at the start, at the end and in the
# middle of the query string, and presumably only worked because the browser strips them.
url = (
    "https://www2.bmf.com.br/pages/portal/bmfbovespa/boletim1/SistemaPregao1.asp"
    "?pagetype=pop&caminho=Resumo%20Estat%EDstico%20-%20Sistema%20Preg%E3o"
    f"&Data={date}&Mercadoria={product}"
)

In [7]:
from selenium.webdriver.chrome.options import Options

In [8]:
# Configure Chrome to run without a visible window ("in the background").
# The `Options.headless` attribute was deprecated and later removed in Selenium 4; the
# supported way to request headless mode is the `--headless=new` command-line switch.
# NOTE(review): options only take effect when passed as `options=options` to
# `webdriver.Chrome(...)`. The driver used below was created in an earlier cell without
# them, so the browser still opens a window — confirm whether headless is wanted.
options = Options()
options.add_argument("--headless=new")

In [9]:
# Implicit wait: element lookups keep polling for up to 3 seconds before failing, so the
# notebook waits only as long as the page actually needs instead of sleeping a fixed time.
driver.implicitly_wait(3)

# Load the report page and enlarge the browser window.
driver.get(url)
driver.maximize_window()

In [10]:
# Absolute XPaths of the two nested tables on the report page. They differ only in the
# `td[...]` column: 3 is the price table, 1 is the table that is later used as its index.
# The surrounding line breaks are kept so the strings match the original values exactly.
_NESTED_TABLE_XPATH = "\n/html/body/div/div[2]/form[1]/table[3]/tbody/tr[3]/td[{col}]/table\n"

table_site = _NESTED_TABLE_XPATH.format(col=3)
index_site = _NESTED_TABLE_XPATH.format(col=1)

In [11]:
# Locate the price table and the index table on the loaded page.
# The locator strategy could also be an ID, a class name or a name; only one strategy is
# used per lookup, and XPath is the one chosen here.
element = driver.find_element("xpath", table_site)
element_index = driver.find_element("xpath", index_site)

# Copy each table's markup out of the browser as a plain string.
table_html = element.get_attribute('outerHTML')
index_html = element_index.get_attribute('outerHTML')

# `pd.read_html` returns a list of frames; each snippet holds a single table, hence `[0]`.
# The markup is wrapped in StringIO because passing literal HTML strings to `read_html`
# is deprecated and emits a FutureWarning.
table = pd.read_html(StringIO(table_html))[0]
indices = pd.read_html(StringIO(index_html))[0]


  table = pd.read_html(table_html)[0]
  indices = pd.read_html(index_html)[0]


In [12]:
# Close the browser: the table markup was already copied into strings and parsed, so the
# live page is no longer needed.
driver.quit()

In [13]:
# Inspect the raw scraped frame: at this point row 0 still holds the column headers.
table

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,AJUSTE ANTER. (3),AJUSTE CORRIG. (4),PREÇO ABERT.,PREÇO MÍN.,PREÇO MÁX.,PREÇO MÉD.,ÚLT. PREÇO,AJUSTE,VAR. PTOS.,ÚLT. OF. COMPRA,ÚLT. OF. VENDA
1,"99.843,15","99.843,15",10398,10394,10404,10396,10400,"99.843,13","0,02-",10394,10400
2,"99.063,11","99.063,11",10386,10384,10392,10386,10386,"99.063,34","0,23+",10384,10386
3,"98.175,67","98.175,67",10372,10370,10372,10370,10370,"98.176,59","0,92+",10368,10372
4,"97.334,88","97.334,88",10375,10350,10375,10367,10360,"97.338,16","3,28+",10350,10360
5,"96.537,60","96.537,60",10375,10345,10380,10360,10355,"96.543,44","5,84+",10350,10355
6,"95.667,82","95.667,82",10380,10350,10380,10359,10355,"95.676,32","8,50+",10355,10370
7,"94.950,79","94.950,79",10400,10400,10400,10400,10400,"94.962,50","11,71+",10365,10380
8,"94.169,79","94.169,79",10405,10360,10420,10384,10375,"94.184,82","15,03+",10370,10375
9,"93.333,32","93.333,32",10450,10425,10450,10434,10425,"93.355,79","22,47+",10400,0000


In [14]:
# Row 0 of each scraped frame carries the real column labels, so promote it to the header.
table.columns = table.loc[0]
indices.columns = indices.loc[0]

# Maturity codes as scraped (taken before the header row is dropped from `indices`).
indice_di = indices['VENCTO']
indices = indices.drop(index=0)

# Last traded price ("ÚLT. PREÇO") per contract, keyed by maturity code.
last_price = table['ÚLT. PREÇO'].drop(index=0)
last_price.index = indices['VENCTO']

# The scraped values are digit strings such as "10400"; they are cast to int, zero entries
# are discarded, and the rest is divided by 1000.
# NOTE(review): presumably the decimal comma was stripped while parsing the HTML, which is
# why dividing by 1000 restores the quoted rate — confirm.
last_price = last_price.astype(int)
table = last_price[last_price != 0] / 1000

print(table)

VENCTO
M24    10.400
N24    10.386
Q24    10.370
U24    10.360
V24    10.355
X24    10.355
Z24    10.400
F25    10.375
G25    10.425
H25    10.445
J25    10.450
K25    10.485
N25    10.535
V25    10.645
F26    10.720
J26    10.810
N26    10.895
V26    10.970
F27    11.035
J27    11.115
N27    11.185
V27    11.260
F28    11.320
J28    11.370
N28    11.430
V28    11.470
F29    11.495
J29    11.580
N29    11.610
F30    11.630
F31    11.690
F32    11.710
F33    11.760
F35    11.780
Name: ÚLT. PREÇO, dtype: float64


In [15]:
# NOTE(review): the driver was already shut down by the earlier `driver.quit()` cell and no
# new driver is created in between, so this second call looks redundant — confirm and remove.
driver.quit()

# Data preprocessing

In [16]:
# Month letter used in the maturity codes (F, G, H, ... Z) mapped to the English month
# abbreviation that the `%b` directive of `strptime` parses.
_month_by_code = {
    'F': 'Jan', 'G': 'Feb', 'H': 'Mar', 'J': 'Apr',
    'K': 'May', 'M': 'Jun', 'N': 'Jul', 'Q': 'Aug',
    'U': 'Sep', 'V': 'Oct', 'X': 'Nov', 'Z': 'Dec',
}
legenda = pd.Series(_month_by_code)

# Accumulator for the converted maturity dates.
lista_datas = list()

In [17]:
# Convert every maturity code (month letter followed by a two-digit year, e.g. "F25") into
# a datetime for the first day of that month, then use those dates as the index.
# The list is rebuilt from scratch in this cell so that running the cell again cannot
# append to dates left over from a previous run.
lista_datas = []

for indice in table.index:
    mes = legenda[indice[0]]   # month abbreviation looked up from the code's letter
    ano = indice[1:3]          # two-digit year

    lista_datas.append(datetime.strptime(f"{mes}-{ano}", "%b-%y"))

table.index = lista_datas

table

2024-06-01    10.400
2024-07-01    10.386
2024-08-01    10.370
2024-09-01    10.360
2024-10-01    10.355
2024-11-01    10.355
2024-12-01    10.400
2025-01-01    10.375
2025-02-01    10.425
2025-03-01    10.445
2025-04-01    10.450
2025-05-01    10.485
2025-07-01    10.535
2025-10-01    10.645
2026-01-01    10.720
2026-04-01    10.810
2026-07-01    10.895
2026-10-01    10.970
2027-01-01    11.035
2027-04-01    11.115
2027-07-01    11.185
2027-10-01    11.260
2028-01-01    11.320
2028-04-01    11.370
2028-07-01    11.430
2028-10-01    11.470
2029-01-01    11.495
2029-04-01    11.580
2029-07-01    11.610
2030-01-01    11.630
2031-01-01    11.690
2032-01-01    11.710
2033-01-01    11.760
2035-01-01    11.780
Name: ÚLT. PREÇO, dtype: float64

## Mini-Indice - WIN

In [28]:
# Switch the product code used in the request URL to WIN.
product = "WIN"

In [29]:
# Rebuild the report URL for the current `date` and `product`.
# Adjacent string literals are concatenated, so the URL stays on one logical line: the
# previous triple-quoted form embedded line breaks at the start, at the end and in the
# middle of the query string, and presumably only worked because the browser strips them.
url = (
    "https://www2.bmf.com.br/pages/portal/bmfbovespa/boletim1/SistemaPregao1.asp"
    "?pagetype=pop&caminho=Resumo%20Estat%EDstico%20-%20Sistema%20Preg%E3o"
    f"&Data={date}&Mercadoria={product}"
)

In [30]:
# Open a fresh browser session for this product (the previous driver was already quit).
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

try:
    # Element lookups poll for up to 3 seconds before failing.
    driver.implicitly_wait(3)
    driver.get(url)
    driver.maximize_window()

    # Absolute XPaths of the price table (td[3]) and of the index table (td[1]).
    table_site = '''
/html/body/div/div[2]/form[1]/table[3]/tbody/tr[3]/td[3]/table
'''

    index_site = '''
/html/body/div/div[2]/form[1]/table[3]/tbody/tr[3]/td[1]/table
'''

    # The locator strategy could also be an ID, a class name or a name; only one strategy
    # is used per lookup, and XPath is the one chosen here.
    element = driver.find_element("xpath", table_site)
    element_index = driver.find_element("xpath", index_site)

    # Copy each table's markup out of the browser as a plain string.
    table_html = element.get_attribute('outerHTML')
    index_html = element_index.get_attribute('outerHTML')
finally:
    # Always release the browser, even if the page did not load or a lookup raised;
    # otherwise a Chrome process is left running after the failed cell.
    driver.quit()

# Parsing only needs the copied strings, so it happens after the browser is closed.
# StringIO is used because passing literal HTML strings to `read_html` is deprecated and
# emits a FutureWarning.
table = pd.read_html(StringIO(table_html))[0]
indices = pd.read_html(StringIO(index_html))[0]

  table = pd.read_html(table_html)[0]
  indices = pd.read_html(index_html)[0]


In [31]:
# Inspect the raw scraped frame: at this point row 0 still holds the column headers.
table

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,PREÇO ABERT.,PREÇO MÍN.,PREÇO MÁX.,PREÇO MÉD.,ÚLT. PREÇO,AJUSTE,OSCIL.,VAR. PTOS.,ÚLT. OF. COMPRA,ÚLT. OF. VENDA
1,124.810,124.550,125.085,124.845,125.000,124.959,"0,1+",105+,124.990,125.000
2,126.975,126.570,127.070,126.799,127.030,126.963,"0,1+",104+,126.905,127.035
3,128.765,128.765,129.000,128.788,129.000,128.869,"0,1+",99+,128.500,130.530
4,129.780,129.700,129.780,129.740,129.700,130.792,"0,7-",91+,0,131.520
5,0,0,0,0,0,132.548,00,75+,0,0
6,0,0,0,0,0,134.581,00,40+,0,0
7,0,0,0,0,0,136.633,00,2+,0,0
8,0,0,0,0,0,138.598,00,42-,0,0
9,0,0,0,0,0,140.912,00,95-,0,0


In [32]:
# Row 0 of each scraped frame carries the real column labels, so promote it to the header.
table.columns = table.loc[0]
indices.columns = indices.loc[0]

# Maturity codes as scraped (taken before the header row is dropped from `indices`).
indice_di = indices['VENCTO']
indices = indices.drop(index=0)

# "AJUSTE" column per contract, keyed by maturity code.
ajuste = table['AJUSTE'].drop(index=0)
ajuste.index = indices['VENCTO']

# Entries equal to the string "0" are discarded.
# NOTE(review): the remaining values are left as text (dtype object), not converted to
# numbers — confirm that downstream use expects strings.
table = ajuste[ajuste != "0"]

print(table)

VENCTO
M24    124.959
Q24    126.963
V24    128.869
Z24    130.792
G25    132.548
J25    134.581
M25    136.633
Q25    138.598
V25    140.912
Z25    143.228
G26    145.473
J26    147.658
M26    150.062
Name: AJUSTE, dtype: object


In [None]:
# NOTE(review): this cell has never been executed (`In [None]`) and repeats the WIN
# post-processing cell above with Portuguese variable names. `tabela` is not assigned
# anywhere in the visible notebook, and `indice` is only bound as the loop variable of the
# date-conversion cell, so on a fresh kernel this cell presumably fails at its first line.
# TODO confirm it is a leftover and can be removed.
tabela.columns = tabela.loc[0]

tabela = tabela['AJUSTE']

tabela = tabela.drop(0, axis = 0)

indice.columns = indice.loc[0]

indice_di = indice['VENCTO']

indice = indice.drop(0, axis = 0)

tabela.index = indice['VENCTO']

tabela = tabela[tabela != "0"]

print(tabela)

## Mini-Dólar - WDO

In [38]:
# Switch the product code used in the request URL to WDO.
product = "WDO"

In [39]:
# Rebuild the report URL for the current `date` and `product`.
# Adjacent string literals are concatenated, so the URL stays on one logical line: the
# previous triple-quoted form embedded line breaks at the start, at the end and in the
# middle of the query string, and presumably only worked because the browser strips them.
url = (
    "https://www2.bmf.com.br/pages/portal/bmfbovespa/boletim1/SistemaPregao1.asp"
    "?pagetype=pop&caminho=Resumo%20Estat%EDstico%20-%20Sistema%20Preg%E3o"
    f"&Data={date}&Mercadoria={product}"
)

In [40]:
# Open a fresh browser session for this product (the previous driver was already quit).
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

try:
    # Element lookups poll for up to 3 seconds before failing.
    driver.implicitly_wait(3)
    driver.get(url)
    driver.maximize_window()

    # Absolute XPaths of the price table (td[3]) and of the index table (td[1]).
    table_site = '''
/html/body/div/div[2]/form[1]/table[3]/tbody/tr[3]/td[3]/table
'''

    index_site = '''
/html/body/div/div[2]/form[1]/table[3]/tbody/tr[3]/td[1]/table
'''

    # The locator strategy could also be an ID, a class name or a name; only one strategy
    # is used per lookup, and XPath is the one chosen here.
    element = driver.find_element("xpath", table_site)
    element_index = driver.find_element("xpath", index_site)

    # Copy each table's markup out of the browser as a plain string.
    table_html = element.get_attribute('outerHTML')
    index_html = element_index.get_attribute('outerHTML')
finally:
    # Always release the browser, even if the page did not load or a lookup raised;
    # otherwise a Chrome process is left running after the failed cell.
    driver.quit()

# Parsing only needs the copied strings, so it happens after the browser is closed.
# StringIO is used because passing literal HTML strings to `read_html` is deprecated and
# emits a FutureWarning.
table = pd.read_html(StringIO(table_html))[0]
indices = pd.read_html(StringIO(index_html))[0]

  table = pd.read_html(table_html)[0]
  indices = pd.read_html(index_html)[0]


In [41]:
# Inspect the raw scraped frame: at this point row 0 still holds the column headers.
table

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,PREÇO ABERT.,PREÇO MÍN.,PREÇO MÁX.,PREÇO MÉD.,ÚLT. PREÇO,AJUSTE,VAR. PTOS.,ÚLT. OF. COMPRA,ÚLT. OF. VENDA
1,"5.160,000","5.156,000","5.185,000","5.169,676","5.173,500","5.173,7820","2,4110+","5.172,500","5.173,500"
2,"5.179,000","5.171,500","5.199,000","5.183,385","5.187,500","5.188,6810","2,2850+","5.186,500","5.188,500"
3,"5.199,500","5.190,000","5.209,000","5.202,073","5.203,500","5.206,3190","2,2910+","5.203,500","5.207,000"
4,"5.198,000","5.195,500","5.232,500","5.210,535","5.209,500","5.222,0560","2,0030+","5.206,000","5.223,000"
5,0000,0000,0000,0000,0000,"5.236,6210","1,7730+","5.217,500","5.286,500"
6,0000,0000,0000,0000,0000,"5.253,4590","1,7700+",0000,0000
7,0000,0000,0000,0000,0000,"5.265,8470","1,5440+",0000,0000
8,0000,0000,0000,0000,0000,"5.279,0940","1,3240+",0000,0000
9,0000,0000,0000,0000,0000,"5.295,8150","0,5000+",0000,0000


In [42]:
# Row 0 of each scraped frame carries the real column labels, so promote it to the header.
table.columns = table.loc[0]
indices.columns = indices.loc[0]

# Maturity codes as scraped (taken before the header row is dropped from `indices`).
indice_di = indices['VENCTO']
indices = indices.drop(index=0)

# "AJUSTE" column per contract, keyed by maturity code.
ajuste = table['AJUSTE'].drop(index=0)
ajuste.index = indices['VENCTO']

# Entries equal to the string "0" are discarded.
# NOTE(review): the remaining values are left as text in the site's own number format
# (e.g. "5.173,7820"), not converted to numbers — confirm that this is intended.
table = ajuste[ajuste != "0"]

print(table)

VENCTO
M24    5.173,7820
N24    5.188,6810
Q24    5.206,3190
U24    5.222,0560
V24    5.236,6210
X24    5.253,4590
Z24    5.265,8470
F25    5.279,0940
G25    5.295,8150
H25    5.311,8480
J25    5.327,5370
K25    5.340,7940
N25    5.380,6660
V25    5.451,2840
F26    5.520,0670
J26    5.595,6270
Name: AJUSTE, dtype: object


## Euro - EUR

In [45]:
# Product code sent in the `Mercadoria=` query parameter.
product = 'EUR'

# Adjacent string literals are concatenated, so the URL stays on one logical line: the
# previous triple-quoted form embedded line breaks at the start, at the end and in the
# middle of the query string, and presumably only worked because the browser strips them.
url = (
    "https://www2.bmf.com.br/pages/portal/bmfbovespa/boletim1/SistemaPregao1.asp"
    "?pagetype=pop&caminho=Resumo%20Estat%EDstico%20-%20Sistema%20Preg%E3o"
    f"&Data={date}&Mercadoria={product}"
)

# Open a fresh browser session for this product (the previous driver was already quit).
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

try:
    # Element lookups poll for up to 3 seconds before failing.
    driver.implicitly_wait(3)
    driver.get(url)
    driver.maximize_window()

    # Absolute XPaths of the price table (td[3]) and of the index table (td[1]).
    table_site = '''
/html/body/div/div[2]/form[1]/table[3]/tbody/tr[3]/td[3]/table
'''

    index_site = '''
/html/body/div/div[2]/form[1]/table[3]/tbody/tr[3]/td[1]/table
'''

    # The locator strategy could also be an ID, a class name or a name; only one strategy
    # is used per lookup, and XPath is the one chosen here.
    element = driver.find_element("xpath", table_site)
    element_index = driver.find_element("xpath", index_site)

    # Copy each table's markup out of the browser as a plain string.
    table_html = element.get_attribute('outerHTML')
    index_html = element_index.get_attribute('outerHTML')
finally:
    # Always release the browser, even if the page did not load or a lookup raised;
    # otherwise a Chrome process is left running after the failed cell.
    driver.quit()

# Parsing only needs the copied strings, so it happens after the browser is closed.
# StringIO is used because passing literal HTML strings to `read_html` is deprecated and
# emits a FutureWarning.
table = pd.read_html(StringIO(table_html))[0]
indices = pd.read_html(StringIO(index_html))[0]

  table = pd.read_html(table_html)[0]
  indices = pd.read_html(index_html)[0]


In [46]:
# Inspect the raw scraped frame: at this point row 0 still holds the column headers.
table

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,PREÇO ABERT.,PREÇO MÍN.,PREÇO MÁX.,PREÇO MÉD.,ÚLT. PREÇO,AJUSTE,VAR. PTOS.,ÚLT. OF. COMPRA,ÚLT. OF. VENDA
1,0000,0000,0000,0000,0000,"5.617,7960","6,3880+",0000,0000
2,0000,0000,0000,0000,0000,"5.640,2570","5,1390+",0000,0000
3,0000,0000,0000,0000,0000,"5.667,9270","5,0080+",0000,0000
4,0000,0000,0000,0000,0000,"5.693,5710","5,0970+",0000,0000
5,0000,0000,0000,0000,0000,"5.717,8930","5,0510+",0000,0000


In [47]:
# Row 0 of each scraped frame carries the real column labels, so promote it to the header.
table.columns = table.loc[0]
indices.columns = indices.loc[0]

# Maturity codes as scraped (taken before the header row is dropped from `indices`).
indice_di = indices['VENCTO']
indices = indices.drop(index=0)

# "AJUSTE" column per contract, keyed by maturity code.
ajuste = table['AJUSTE'].drop(index=0)
ajuste.index = indices['VENCTO']

# Entries equal to the string "0" are discarded.
# NOTE(review): the remaining values are left as text in the site's own number format
# (e.g. "5.617,7960"), not converted to numbers — confirm that this is intended.
table = ajuste[ajuste != "0"]

print(table)

VENCTO
M24    5.617,7960
N24    5.640,2570
Q24    5.667,9270
U24    5.693,5710
V24    5.717,8930
Name: AJUSTE, dtype: object
