<a href="https://colab.research.google.com/github/mabittar/funds_pricer/blob/master/SCRAPER_cvmweb_cvm_gov_br.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
!pip install selenium
!pip install asyncio
!apt-get update # to update ubuntu to correctly run apt install
!apt install chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin

In [39]:
import sys
import asyncio
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from typing import Union, List, Optional
from datetime import date, datetime
from dataclasses import dataclass, field
from time import sleep
from decimal import Decimal
from selenium import webdriver
from itertools import chain
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')

In [16]:
CVM_URL = "https://cvmweb.cvm.gov.br/SWB/Sistemas/SCW/CPublica/CConsolFdo/ResultBuscaParticFdo.aspx?CNPJNome="
FUND_DETAIL_URL = "https://cvmweb.cvm.gov.br/SWB/Sistemas/SCW/CPublica/InfDiario/CPublicaInfDiario.aspx?PK_PARTIC=132922&SemFrame=" # Atenção ao pk_partic

In [17]:
@dataclass(order=True)
class TimeSeries:
  sort_index: datetime = field(init=False, repr=False)
  timestamp: datetime
  value: Decimal
  net_worth_str: str = field(repr=False)
  net_worth: float = field(init=False)
  owners: int

  def __post_init__(self):
        s = self.net_worth_str.replace('.','').replace(',','.')
        parsed_num = float(s)
        self.net_worth = parsed_num
        self.sort_index = self.timestamp

@dataclass
class FundTS:
  doc_number: str
  fund_pk: Optional[str] = None
  name: Optional[str] = None
  released_on: Optional[date] = None
  ts: Optional[List[TimeSeries]] = None

In [18]:
def filter_limit_date(available_date_list: list, from_date: Union[date, None] = None, end_date: Union[date, None] = None) -> List[list]:
  available_date_datetime = [(index, date_str) for index, date_str in enumerate(available_date_list)]
  if from_date is not None:
    available_date_datetime = [(index, date_str) for index, date_str in available_date_datetime if datetime.strptime(f'01/{date_str}', "%d/%m/%Y").date() >= from_date ]
  if end_date is not None:
    available_date_datetime = [(index, date_str) for index, date_str in available_date_datetime if datetime.strptime(f'01/{date_str}', "%d/%m/%Y").date() <= end_date ]
  return available_date_datetime


date_list = ['10/2022', '09/2022', '08/2022', '07/2022', '06/2022']
start_date = datetime(year=2022, month=7,day=1).date()
end_date = datetime(year=2022, month=9,day=1).date()

print(filter_limit_date(date_list,start_date, end_date ))

[(1, '09/2022'), (2, '08/2022'), (3, '07/2022')]


In [19]:
async def get_fund_pk(url_str: str) -> str:
  find_str = "?PK_PARTIC="
  pk_postion = url_str.find(find_str)
  url_str = url_str[pk_postion + len(find_str):].split("&")[0]
  return url_str

In [42]:
async def parse_data(wd, i, month_year):
  print("Getting results for %s" % month_year)
  i += 1
  wd.find_element(By.XPATH, f"//*[@id='ddComptc']/option[{i}]").click()
  #this will click the option which index is defined by positionn
  await asyncio.sleep(3)
  WebDriverWait(wd, 20).until(EC.presence_of_element_located((By.XPATH, '//*[@id="dgDocDiario"]')))
  rows = wd.find_elements(By.XPATH, '//*[@id="dgDocDiario"]/tbody/tr')
  for index, row in enumerate(rows[1:]):
    daily_data = [field.replace(" ", "") for field in row.text.split(" ")]
    value = daily_data[1]
    if value != "":
      try:
        date_field = daily_data[0]
        net_worth = daily_data[4]
        owner_number = daily_data[6]
        datetime_stamp =  datetime.strptime(f'{date_field}/{month_year}', "%d/%m/%Y")
        ts = TimeSeries(datetime_stamp, Decimal(value.replace(",",".")), net_worth, int(owner_number))
        return ts
      except ValueError as e:
        print(e)
        print(date_field, month_year)

In [23]:
def str_2_date(str_date):
  return datetime.strptime(f'01/{str_date}', "%d/%m/%Y").date()

In [59]:
async def parse_table(wd: webdriver, fund_daily_link:str, from_date: Union[date, str] = None, end_date: Union[date, str] = None ) -> List[list]:
    wd.get(fund_daily_link)
    table = wd.find_element(By.ID, 'TABLE1')
    selectors = wd.find_element(By.XPATH,'//*[@id="ddComptc"]')
    selectors_list = selectors.text.split("\n")
    selectors_list = [i.replace(' ', "") for i in selectors_list[:-1]] # lembrar excluiur -1 pois é vazio ou validar o se o campo pode ser lido como mm/YYYY
    selectors_filtered = filter_limit_date(selectors_list, from_date, end_date)
    select = wd.find_element(By.XPATH, '//*[@id="ddComptc"]')
    wd.execute_script("showDropdown = function (element) {var event; event = document.createEvent('MouseEvents'); event.initMouseEvent('mousedown', true, true, window); element.dispatchEvent(event); }; showDropdown(arguments[0]);",select)
    # open dropdown options
    ts_list = []
    loop = asyncio.get_event_loop()
    async with asyncio.Semaphore(10):
      for i, month_year in selectors_filtered:
        ts_list.append(
          asyncio.create_task(parse_data(wd, i, month_year))
      )
    await asyncio.gather(*ts_list)
    return ts_list

In [61]:
async def get_fund_data(document_number: str, from_date: Union[date, None] = None, end_date: Union[date, None] = None) -> FundTS:
  if from_date is not None:
    from_date = from_date if isinstance(from_date, date) else str_2_date(from_date)
  if end_date is not None:
    end_date = end_date if isinstance(end_date, date) else str_2_date(end_date)
  # Detalhes do fundo
  # TODO: validar CNPJ
  fund_ts = FundTS(doc_number=document_number)
  with webdriver.Chrome('chromedriver',options=chrome_options) as wd:
    wd.get(CVM_URL + document_number)  # url = CVM + CNPJ
    WebDriverWait(wd, 20).until(EC.element_to_be_clickable((By.CLASS_NAME, 'HRefPreto')))
    wd.execute_script("__doPostBack('ddlFundos$_ctl0$lnkbtn1','')")
    pk_url = wd.current_url
    fund_ts.fund_pk = await get_fund_pk(pk_url)
    fund_name_elem = wd.find_element(By.XPATH, '//*[@id="lbNmDenomSocial"]')
    fund_ts.name = fund_name_elem.text
    start_at = wd.find_element(By.XPATH, '//*[@id="lbInfAdc1"]')
    fund_ts.released_on = datetime.strptime(start_at.text, '%d/%m/%Y').date()
    # Lâmina de cotas diárias
    fund_daily = wd.find_element(By.XPATH, '//*[@id="Hyperlink2"]')
    fund_daily_link = fund_daily.get_attribute('href')
    # parser da tabela
    result: list = await parse_table(wd, fund_daily_link, from_date, end_date)
    time_series = [ts.result() for ts in result]
    fund_ts.ts = sorted(time_series)
  return fund_ts

In [62]:
from time import perf_counter
async def main():
    return await asyncio.gather(get_fund_data("18993924000100", "01/2019"))

s = perf_counter()
result = await main()
elapsed = perf_counter() - s
print(f"Script executed in {elapsed:0.2f} seconds.")

Getting results for 10/2022
Getting results for 09/2022
Getting results for 08/2022
Getting results for 07/2022
Getting results for 06/2022
Getting results for 05/2022
Getting results for 04/2022
Getting results for 03/2022
Getting results for 02/2022
Getting results for 01/2022
Getting results for 12/2021
Getting results for 11/2021
Getting results for 10/2021
Getting results for 09/2021
Getting results for 08/2021
Getting results for 07/2021
Getting results for 06/2021
Getting results for 05/2021
Getting results for 04/2021
Getting results for 03/2021
Getting results for 02/2021
Getting results for 01/2021
Getting results for 12/2020
Getting results for 11/2020
Getting results for 10/2020
Getting results for 09/2020
Getting results for 08/2020
Getting results for 07/2020
Getting results for 06/2020
Getting results for 05/2020
Getting results for 04/2020
Getting results for 03/2020
Getting results for 02/2020
Getting results for 01/2020
Getting results for 12/2019
Getting results for 

In [63]:
result[0]

FundTS(doc_number='18993924000100', fund_pk='132922', name='XP INVESTOR LONG BIASED FUNDO DE INVESTIMENTO EM COTAS DE FUNDOS DE INVESTIMENTO MULTIMERCADO', released_on=datetime.date(2013, 10, 14), ts=[TimeSeries(timestamp=datetime.datetime(2019, 1, 2, 0, 0), value=Decimal('3.38676374'), net_worth=147896232.09, owners=1982), TimeSeries(timestamp=datetime.datetime(2019, 2, 2, 0, 0), value=Decimal('3.38676374'), net_worth=147896232.09, owners=1982), TimeSeries(timestamp=datetime.datetime(2019, 3, 2, 0, 0), value=Decimal('3.38676374'), net_worth=147896232.09, owners=1982), TimeSeries(timestamp=datetime.datetime(2019, 4, 2, 0, 0), value=Decimal('3.38676374'), net_worth=147896232.09, owners=1982), TimeSeries(timestamp=datetime.datetime(2019, 5, 2, 0, 0), value=Decimal('3.38676374'), net_worth=147896232.09, owners=1982), TimeSeries(timestamp=datetime.datetime(2019, 6, 2, 0, 0), value=Decimal('3.38676374'), net_worth=147896232.09, owners=1982), TimeSeries(timestamp=datetime.datetime(2019, 7, 2