<a href="https://colab.research.google.com/github/maberf/colabs/blob/main/Fundamentus.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
"""
Scraper FundamentuS
- Rodar para lista de tickers (ex.: ["PETR4"])
- Retorna/mostra um DataFrame pandas (uma linha por ticker)
- grava em Google Sheets
"""

import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from time import sleep
from IPython.display import display
from google.colab import drive
from google.colab import auth
from google.auth import default
import gspread

HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}

In [18]:
# B3 STOCKS FUNDAMENTUS ROUTINE
#
tickers = ['^BVSP','ABEV3.SA','BBAS3.SA','BBDC4.SA','BBSE3.SA','CMIN3.SA','CPFE3.SA','CPLE6.SA','ELET6.SA','EMBR3.SA','ITUB4.SA','IVVB11.SA','JHSF3.SA','KLBN11.SA','MBRF3.SA','LEVE3.SA','LREN3.SA','PETR4.SA','POMO4.SA','RECV3.SA','SBSP3.SA','TGMA3.SA','VALE3.SA','VIVA3.SA','VIVT3.SA','WEGE3.SA']
#

In [19]:
# Remove ".SA" from stock dataframe tickers
tickersadjusted = [nome.replace('.SA', '') for nome in tickers]
# Remove "^BVSP" ticker
if "^BVSP" in tickersadjusted:
    tickersadjusted.remove("^BVSP")

In [20]:
# tickers = ["PETR4","VALE3"]
tickers = tickersadjusted
print (tickers)

['ABEV3', 'BBAS3', 'BBDC4', 'BBSE3', 'CMIN3', 'CPFE3', 'CPLE6', 'ELET6', 'EMBR3', 'ITUB4', 'IVVB11', 'JHSF3', 'KLBN11', 'MBRF3', 'LEVE3', 'LREN3', 'PETR4', 'POMO4', 'RECV3', 'SBSP3', 'TGMA3', 'VALE3', 'VIVA3', 'VIVT3', 'WEGE3']


In [21]:
# ---------------- utilitários ----------------

def parse_brazil_number(s):
    """Converte textos numéricos BR para float ou retorna string.
       Também lida com percentuais (retorna float representando o número, ex: '41,7%' -> 41.7)."""
    if s is None:
        return None
    if isinstance(s, (int, float)):
        return float(s)
    st = str(s).strip()
    if st == "" or st == "-" or st.lower() in ("n/a","na"):
        return None
    st = st.replace("\xa0", " ").strip()
    # detectar percentual
    is_pct = False
    if "%" in st:
        is_pct = True
        st = st.replace("%", "")
    # limpar "R$" e espaços
    st = st.replace("R$", "").strip()
    # se contém letras longas (ex: "PN") -> não é número
    if re.search(r'[A-Za-z]{2,}', st) and not re.search(r'\d', st):
        return st
    # formatos:
    try:
        if "." in st and "," in st:
            num = float(st.replace(".", "").replace(",", "."))
        elif "," in st and "." not in st:
            num = float(st.replace(",", "."))
        elif "." in st and "," not in st:
            # likely thousand sep -> remove dots
            num = float(st.replace(".", ""))
        else:
            num = float(st)
        return float(num) if not is_pct else float(num)
    except:
        return st

def extract_label_value_pairs_from_tr(tr):
    """Emparelha células adjacentes num <tr> retornando lista de (label, value)."""
    cells = [c.get_text(" ", strip=True) for c in tr.find_all(['td','th'])]
    pairs = []
    i = 0
    n = len(cells)
    while i < n:
        label = cells[i].strip()
        if label == "":
            i += 1
            continue
        j = i + 1
        while j < n and cells[j].strip() == "":
            j += 1
        if j >= n:
            break
        value = cells[j].strip()
        label_norm = re.sub(r'[:\s]+$', '', label)
        pairs.append((label_norm, value))
        i = j + 1
    return pairs

# ---------------- mapping de labels ----------------
# ordem importante: padrões específicos (margem ebit) antes de padrões genéricos como 'ebit'
LABEL_MAP = [
    (r'^(papel|ticker)$', 'ticker'),
    (r'cotac', 'cotacao'),
    (r'data.*ult', 'data_ultima_cotacao'),
    (r'min 52', 'min_52_sem'),
    (r'max 52', 'max_52_sem'),
    (r'vol .*2m|vol .*med', 'vol_med_2m'),
    (r'^setor$', 'setor'),
    (r'^subsetor$', 'subsetor'),
    (r'valor de mercado|valor mercado', 'valor_mercado'),
    (r'valor da firma|valor firma', 'valor_firma'),
    (r'n(ro|º)|n(ro|º).*a[cç]oes|número.*a[cç]oes|nro.*a[cç]oes', 'nro_acoes'),
    (r'p\/l|p\.?\/l', 'pl'),
    (r'lpa', 'lpa'),
    (r'p\/vp|p\.?\/vp', 'p_vp'),
    (r'vpa', 'vpa'),
    (r'marg.*ebit', 'marg_ebit'),            # específico: Marg. EBIT (percentual)
    (r'p\/ebit|p\.?\/ebit', 'p_ebit'),
    (r'marg.*brut|margem.*bruta', 'marg_bruta'),
    (r'psr', 'psr'),
    (r'^ebit$', 'ebit'),                     # específico: EBIT valor monetário
    (r'p_?ativos|p\/ativos', 'p_ativos'),
    (r'marg.*liquida|margem.*liquida', 'marg_liquida'),
    (r'p[_\s]?cap[_\s]?giro|p cap giro', 'p_cap_giro'),
    (r'p[_\s]?ativ[_\s]?circ[_\s]?liq', 'p_ativ_circ_liq'),
    (r'roic', 'roic'),
    (r'div.*yield|dividend.*yield', 'dividend_yield'),
    (r'roe', 'roe'),
    (r'ev.*ebitda|ev / ebitda', 'ev_ebitda'),
    (r'liquidez corr|liquidez_corr|liquidez', 'liquidez_corr'),
    (r'ev.*ebit|ev / ebit', 'ev_ebit'),
    (r'div br.*patrim|div br patrim', 'div_br_patrim'),
    (r'cres.*rec|cres_rec_5a', 'cres_rec_5a'),
    (r'giro.*ativo|giro_ativos', 'giro_ativos'),
    (r'^ativo$', 'ativo'),
    (r'disponibilidades', 'disponibilidades'),
    (r'ativo circulante|ativo_circulante', 'ativo_circulante'),
    (r'div.*bruta', 'div_bruta'),
    (r'div.*l[ií]quida', 'div_liquida'),
    (r'patrim(o|ô)nio', 'patrimonio_liquido'),
    (r'receita liquida', 'receita_liquida_12m'),
    (r'lucro l[ií]quido', 'lucro_liquido_12m'),
    (r'oscila', 'oscilacoes'),
    (r'empresa', 'empresa'),
    (r'tipo', 'tipo'),
    (r'subsetor', 'subsetor'),
    (r'ultimos 12 meses|ultimos_12_meses', 'ultimos_12_meses'),
]

def normalize_label(label):
    lab = label.lower().strip()
    trans = str.maketrans("áàãâéêíóôõúüç","aaaaeeiooouuc")
    lab_no = lab.translate(trans)
    for pat, std in LABEL_MAP:
        if re.search(pat, lab_no):
            return std
    s = re.sub(r'[:\.\-\/\(\)]', ' ', lab_no)
    s = re.sub(r'[^0-9a-z\s]', '', s)
    s = re.sub(r'\s+', '_', s).strip('_')
    return s if s else label.lower()

# ---------------- parse da página ----------------

def parse_fundamentus_page(html_text):
    soup = BeautifulSoup(html_text, "html.parser")
    raw = {}
    # coletar pares originais
    for table in soup.find_all('table'):
        for tr in table.find_all('tr'):
            pairs = extract_label_value_pairs_from_tr(tr)
            for label, value in pairs:
                if not label:
                    continue
                # priorizar primeiro não vazio
                if label in raw:
                    if raw[label] in (None, "", "-") and value:
                        raw[label] = value
                else:
                    raw[label] = value

    # garantir captura explícita de 'Setor' se existir (usa label exato)
    # fazer busca em raw por chaves que são exatamente 'Setor' (caso-insens)
    sector_val = None
    for k in list(raw.keys()):
        if k.strip().lower() == 'setor':
            sector_val = raw[k]
            break
    # normalização
    normalized = {}
    for k, v in raw.items():
        std = normalize_label(k)
        parsed = parse_brazil_number(v) if std not in ('page_title','headline') else v
        normalized[std] = parsed

    # se 'setor' foi detectado explicitamente em raw, sobrescrever a normalização com seu valor
    if sector_val is not None:
        normalized['setor'] = parse_brazil_number(sector_val)

    # tratar Marg. EBIT e EBIT especificamente se apareceram como textos diferentes
    # procurar em raw label que contenha 'marg' e 'ebit' (caso exista)
    for k in raw.keys():
        kl = k.lower()
        if 'marg' in kl and 'ebit' in kl:
            normalized['marg_ebit'] = parse_brazil_number(raw[k])
        if kl.strip() == 'ebit' or re.fullmatch(r'ebit', kl, flags=re.I):
            normalized['ebit'] = parse_brazil_number(raw[k])

    return normalized

# ---------------- montar DataFrame e pós-processar ----------------

def get_many_tickers_fundamentus_df(tickers, pause=0.6):
    session = requests.Session()
    session.headers.update(HEADERS)
    rows = []
    cols_union = set()

    for tic in tickers:
        url = f"https://www.fundamentus.com.br/detalhes.php?papel={tic.upper()}"
        r = session.get(url, timeout=15)
        r.raise_for_status()
        info = parse_fundamentus_page(r.text)
        info['ticker'] = tic.upper()
        rows.append(info)
        cols_union.update(info.keys())
        sleep(pause)

    cols = ['ticker'] + sorted([c for c in cols_union if c != 'ticker'])
    df = pd.DataFrame(rows, columns=cols)

    # ----- excluir colunas solicitadas (inclui page_title) -----
    to_drop_tokens = {'empresa','oscilacoes','page_title','pagetitle','papel','dia','mes','30_dias','30dias','tipo','subsetor','ultimos_12_meses'}
    drop_cols = []
    for c in df.columns:
        cl = c.lower()
        for t in to_drop_tokens:
            if t in cl:
                drop_cols.append(c)
                break
    # remover colunas de anos >= 2020
    for year in range(2020, 2100):
        y = str(year)
        for c in df.columns:
            if y in c:
                drop_cols.append(c)
    # remover colunas relacionadas a "últimos 3 meses" se existirem (tokens comuns)
    for c in df.columns:
        if re.search(r'3\s*mes|ult.*3', c, flags=re.I):
            drop_cols.append(c)
    drop_cols = sorted(set(drop_cols))
    df = df.drop(columns=[c for c in drop_cols if c in df.columns], errors='ignore')

        # ----- Conversão para milhões (mantendo decimais em indicadores) -----
    indicators = {
        'pl','p_vp','p_ebit','psr','dividend_yield','roe','roic','lpa','vpa',
        'p_ativos','p_cap_giro','p_ativ_circ_liq','ev_ebitda','ev_ebit',
        'liquidez_corr','div_br_patrim','cres_rec_5a','giro_ativos'
    }

    for col in df.columns:
        # tratar nro_acoes: inteiro completo
        if col == 'nro_acoes':
            df[col] = pd.to_numeric(df[col], errors='coerce').apply(lambda x: int(x) if pd.notna(x) else x)
            df[col] = df[col].astype(object)
            continue

        ser_num = pd.to_numeric(df[col], errors='coerce')
        if ser_num.notna().any():
            if col in indicators:
                # manter decimais; especial: p_ativ_circ_liq com 2 casas decimais
                if col == 'p_ativ_circ_liq':
                    df[col] = ser_num.apply(lambda x: round(float(x), 2) if pd.notna(x) else x)
                else:
                    df[col] = ser_num.apply(lambda x: float(x) if pd.notna(x) else x)
            elif col == 'ebit':
                # EBIT em milhões, inteiro, sem decimais e sem notação científica
                df[col] = ser_num.apply(
                    lambda x: int(round(x / 1_000_000)) if pd.notna(x) and abs(x) >= 1_000 else (
                        int(x) if pd.notna(x) else x
                    )
                )
                df[col] = df[col].astype(object)
            else:
                # converter grandes valores para milhões (arredondar)
                def conv(v):
                    if pd.isna(v):
                        return v
                    try:
                        if abs(v) >= 1_000_000:
                            return int(round(v / 1_000_000))  # convertido para milhões, sem decimais
                        if float(v).is_integer():
                            return int(v)
                        return float(v)
                    except:
                        return v
                df[col] = ser_num.apply(conv)
                df[col] = df[col].astype(object)
        else:
            df[col] = df[col].astype(object)

    # ----- Garantir setor correto: se 'setor' vazio e 'subsetor' existir, usar subsetor as fallback -----
    if 'setor' not in df.columns and 'subsetor' in df.columns:
        df['setor'] = df['subsetor']
    # prefer explicit 'setor' value if present; we already mapped explicit earlier

    # ----- Reordenar colunas (best-effort) -----
    desired_order = [
        "ticker","cotacao","data_ultima_cotacao","min_52_sem","max_52_sem","vol_med_2m",
        "setor","valor_mercado","valor_firma","nro_acoes","pl","lpa","p_vp","vpa","p_ebit",
        "marg_bruta","psr","marg_ebit","p_ativos","marg_liquida","p_cap_giro","p_ativ_circ_liq",
        "roic","dividend_yield","roe","ev_ebitda","liquidez_corr","ev_ebit","div_br_patrim",
        "cres_rec_5a","giro_ativos","ativo","disponibilidades","ativo_circulante","div_bruta",
        "div_liquida","patrimonio_liquido","receita_liquida_12m","ebit","lucro_liquido_12m"
    ]

    final_cols = []
    existing = list(df.columns)
    used = set()
    for want in desired_order:
        match = None
        if want in existing:
            match = want
        else:
            # substring match
            for c in existing:
                if c in used:
                    continue
                cl = c.lower()
                if want.replace('_',' ') in cl or all(tok in cl for tok in want.split('_') if tok):
                    match = c
                    break
        if match:
            final_cols.append(match)
            used.add(match)
    # append remaining columns
    for c in existing:
        if c not in used:
            final_cols.append(c)
    df = df.reindex(columns=final_cols)

    # ----- Exibição: formatar colunas de indicadores com decimais adequados -----
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', 400)
    # formatting: keep floats showing meaningful decimals
    def fmt(x):
        if pd.isna(x):
            return ""
        if isinstance(x, int):
            return f"{x:,}"
        if isinstance(x, float):
            # itens que are indicators keep up to 2 decimals except when integer-like
            return f"{x:,.2f}".rstrip('0').rstrip('.')
        return str(x)
    # We will not convert df values to strings globally; display will show numeric types.
    # But ensure p_ativ_circ_liq is float with 2 decimals (already rounded above)

    return df

# ---------------- Execução exemplo ----------------

if __name__ == "__main__":
    # tickers = ["PETR4","VALE3"]
    df = get_many_tickers_fundamentus_df(tickers, pause=0.6)
    display(df)
    # print("\nDataFrame (uma linha por ticker):")
    # print(df.to_string(index=False))


Unnamed: 0,ticker,cotacao,data_ultima_cotacao,min_52_sem,max_52_sem,vol_med_2m,setor,valor_mercado,valor_firma,nro_acoes,pl,lpa,p_vp,vpa,p_ebit,marg_bruta,psr,marg_ebit,p_ativos,marg_liquida,p_cap_giro,p_ativ_circ_liq,roic,dividend_yield,roe,ev_ebitda,liquidez_corr,ev_ebit,div_br_patrim,cres_rec_5a,giro_ativos,ativo,disponibilidades,ativo_circulante,div_bruta,div_liquida,receita_liquida_12m,ebit,lucro_liquido_12m,cart_de_credito,depositos,ebit_ativo,patrim_liq,rec_servicos,result_int_financ,ult_balanco_processado
0,ABEV3,12.71,31/10/2025,10.52,14.29,359.0,Bebidas,200330.0,183384.0,15761600000.0,12.49,1.02,2.18,5.84,9.68,51.8,2.21,22.9,1.42,18.2,37.35,-32.91,20.2,8.3,17.4,6.69,1.15,8.86,0.03,5.9,0.64,140755.0,19841.0,41940.0,2895.0,-16946.0,90470.0,20700.0,16037.0,,,14.7,92027.0,,,30/09/2025
1,BBAS3,21.9,31/10/2025,18.35,29.3,654.0,Intermediários Financeiros,125505.0,,5730830000.0,4.57,4.79,0.72,30.49,,,,,,0.0,,,,7.9,15.7,,,,,7.1,,2400620.0,,,,,,,27475.0,0.0,0.0,0.0,174734.0,20465.0,46534.0,30/06/2025
2,BBDC4,18.16,31/10/2025,10.58,18.83,508.0,Intermediários Financeiros,192351.0,,10592000000.0,8.36,2.17,1.13,16.01,,,,,,0.0,,,,7.4,13.6,,,,,-6.1,,1753560.0,,,,,,,23007.0,0.0,0.0,0.0,169590.0,22874.0,34451.0,30/09/2025
3,BBSE3,32.85,31/10/2025,29.75,40.42,167.0,Previdência e Seguros,65700.0,,2000000000.0,7.37,4.46,6.26,5.25,6.63,,,,3.07,0.0,,,,12.9,85.0,,,,,,,21377.0,0.0,0.0,0.0,0.0,0.0,9903.0,8917.0,,,46.3,10494.0,,,30/06/2025
4,CMIN3,6.07,31/10/2025,4.46,6.16,25.0,Mineração,33296.0,28196.0,5485340000.0,14.99,0.4,3.6,1.68,7.16,47.4,1.99,27.8,0.93,13.3,4.46,-3.41,23.7,4.6,24.0,4.82,1.78,6.06,1.0,-2.0,0.47,35991.0,14370.0,16982.0,9270.0,-5100.0,16746.0,4653.0,2221.0,,,12.9,9237.0,,,30/06/2025
5,CPFE3,41.6,31/10/2025,28.79,41.68,51.0,Energia Elétrica,47934.0,72655.0,1152250000.0,8.87,4.69,2.23,18.64,4.25,31.5,1.09,25.6,0.6,13.0,-21.15,-1.18,15.8,6.7,25.2,5.46,0.88,6.45,1.35,5.1,0.56,79238.0,4211.0,16106.0,28932.0,24721.0,44005.0,11271.0,5407.0,,,14.2,21482.0,,,30/06/2025
6,CPLE6,13.92,31/10/2025,8.43,13.92,134.0,Energia Elétrica,41521.0,58503.0,2982810000.0,13.64,1.02,1.62,8.58,9.75,21.8,1.74,17.8,0.68,12.7,28.94,-1.8,7.7,4.6,11.9,10.23,1.13,13.74,0.78,2.1,0.39,60742.0,2906.0,12094.0,19888.0,16982.0,23872.0,4257.0,3044.0,,,7.0,25596.0,,,30/06/2025
7,ELET6,59.39,31/10/2025,35.41,59.39,73.0,Energia Elétrica,137110.0,177562.0,2308630000.0,20.68,2.87,1.16,51.42,9.16,45.2,3.14,34.3,0.5,15.2,5.92,-1.29,6.2,6.5,5.6,9.23,1.86,11.86,0.59,8.9,0.16,274960.0,29837.0,49985.0,70290.0,40453.0,43681.0,14970.0,6632.0,,,5.4,118707.0,,,30/06/2025
8,EMBR3,86.99,31/10/2025,48.42,89.37,398.0,Material de Transporte,64413.0,69585.0,740465000.0,30.13,2.89,3.54,24.57,14.6,18.6,1.62,11.1,0.98,5.3,7.34,-6.06,8.4,0.1,11.8,11.88,1.33,15.77,0.66,13.0,0.6,65887.0,6863.0,35567.0,12034.0,5172.0,39804.0,4412.0,2138.0,,,6.7,18191.0,,,30/06/2025
9,ITUB4,39.44,31/10/2025,25.5,39.44,663.0,Intermediários Financeiros,425343.0,,10784500000.0,10.25,3.85,2.12,18.62,,,,,,0.0,,,,6.6,20.7,,,,,92.4,,479673.0,,,,,,,41495.0,164704.0,97489.0,0.0,200814.0,12573.0,8537.0,30/06/2025


In [22]:
# Autentication in Google Docs (only once)
auth.authenticate_user()
creds, _ = default()
gc = gspread.authorize(creds)

In [23]:
# Open workbook and worksheets
wb = gc.open('Quotes')
wssfundamentus = wb.worksheet('Fundamentus')

In [24]:
# Write data in the worksheet
wssfundamentus.update([df.columns.values.tolist()] + df.fillna('').values.tolist())

{'spreadsheetId': '1qgTSxri55kYWVahW6sH3Fbn3ofWzhq93umUJhcwO7Uk',
 'updatedRange': 'Fundamentus!A1:AT26',
 'updatedRows': 26,
 'updatedColumns': 46,
 'updatedCells': 1196}