<a href="https://colab.research.google.com/github/maberf/colabs/blob/main/fundamentus.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from time import sleep
from IPython.display import display

# HTTP headers applied to the requests session used by scraperFundamentus.
# Sends a browser-like User-Agent (presumably because the site rejects the
# default python-requests agent -- TODO confirm).
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}

In [None]:
def scraperFundamentus(tickers):
    """Scrape Fundamentus detail pages and return one DataFrame row per ticker.

    Each ticker is fetched from
    "https://www.fundamentus.com.br/detalhes.php?papel={query_tic}".

    Args:
        tickers: Iterable of ticker strings (e.g., ["PETR4"] or ["PETR4.SA"]).
            Every ".SA" occurrence is removed and "^BVSP" entries are skipped.

    Returns:
        pandas.core.frame.DataFrame: one row per requested ticker, with a
        "ticker" column plus the scraped fields. All columns have object
        dtype; numeric fields hold Python ints/floats (NaN when missing).

    Raises:
        requests.HTTPError: when a page answers with an error status
            (raised by ``raise_for_status``). Other exceptions raised by
            ``session.get`` propagate unchanged.

    Side effects:
        Prints the adjusted ticker list and changes the global pandas display
        options (max_columns, width, float_format).
    """

    # Remove ".SA" from stock dataframe tickers and drop every "^BVSP" entry
    # (the original only removed the first occurrence).
    stripped = (nome.replace('.SA', '') for nome in tickers)
    tickersadjusted = [tic for tic in stripped if tic != "^BVSP"]
    print(tickersadjusted)

    # ---------- utilities ----------
    def parse_brazil_number(s):
        """Convert a Brazilian-formatted number string to float.

        Returns None for None/empty/placeholder input, the cleaned string when
        the text is not numeric, and a float otherwise.
        """
        if s is None:
            return None
        if isinstance(s, (int, float)):
            return float(s)
        st = str(s).strip()
        if st in ("", "-", "n/a", "na"):
            return None
        st = st.replace("\xa0", " ").strip()
        st = st.replace("%", "")
        st = st.replace("R$", "").strip()
        # if it's text like 'PN' or 'ON', return as-is
        if re.fullmatch(r'[A-Za-z\.\s]+', st):
            return st
        try:
            if "," in st:
                # comma is the decimal mark; any dots are thousands separators
                return float(st.replace(".", "").replace(",", "."))
            if "." in st:
                # dot without comma is treated as a thousands separator
                return float(st.replace(".", ""))
            return float(st)
        except ValueError:
            # not a number after all (e.g. a date): keep the cleaned text
            return st

    def extract_label_value_pairs_from_tr(tr):
        """Return (label, value) tuples from the non-empty cells of a row.

        Non-empty td/th texts are paired consecutively (1st with 2nd, 3rd with
        4th, ...); a trailing unpaired cell is ignored. Trailing colons and
        whitespace are removed from each label.
        """
        texts = [c.get_text(" ", strip=True).strip() for c in tr.find_all(['td', 'th'])]
        filled = [t for t in texts if t]
        return [
            (re.sub(r'[:\s]+$', '', label), value)
            for label, value in zip(filled[0::2], filled[1::2])
        ]

    # ---------- label mapping (kept from working code) ----------
    # Ordered (regex, standard name) pairs; the first pattern found in the
    # accent-stripped, lower-cased label wins, so order matters.
    LABEL_MAP = [
        (r'^(papel|ticker)$', 'ticker'),
        (r'cotac', 'cotacao'),
        (r'data.*ult', 'data_ultima_cotacao'),
        (r'min 52', 'min_52_sem'),
        (r'max 52', 'max_52_sem'),
        (r'vol .*2m|vol .*med', 'vol_med_2m'),
        (r'^setor$', 'setor'),
        (r'^subsetor$', 'subsetor'),
        (r'valor de mercado|valor mercado', 'valor_mercado'),
        (r'valor da firma|valor firma', 'valor_firma'),
        (r'n(ro|º)|n(ro|º).*a[cç]oes|número.*a[cç]oes|nro.*a[cç]oes', 'nro_acoes'),
        (r'p\/l|p\.?\/l', 'pl'),
        (r'lpa', 'lpa'),
        (r'p\/vp|p\.?\/vp', 'p_vp'),
        (r'vpa', 'vpa'),
        (r'marg.*ebit', 'marg_ebit'),
        (r'p\/ebit|p\.?\/ebit', 'p_ebit'),
        (r'marg.*brut|margem.*bruta', 'marg_bruta'),
        (r'psr', 'psr'),
        (r'^ebit$', 'ebit'),
        (r'p_?ativos|p\/ativos', 'p_ativos'),
        (r'marg.*liquida|margem.*liquida', 'marg_liquida'),
        (r'p[_\s]?cap[_\s]?giro|p cap giro', 'p_cap_giro'),
        (r'p[_\s]?ativ[_\s]?circ[_\s]?liq', 'p_ativ_circ_liq'),
        (r'roic', 'roic'),
        (r'div.*yield|dividend.*yield', 'dividend_yield'),
        (r'roe', 'roe'),
        (r'ev.*ebitda|ev / ebitda', 'ev_ebitda'),
        (r'liquidez corr|liquidez_corr|liquidez', 'liquidez_corr'),
        (r'ev.*ebit|ev / ebit', 'ev_ebit'),
        (r'div br.*patrim|div br patrim', 'div_br_patrim'),
        (r'cres.*rec|cres_rec_5a', 'cres_rec_5a'),
        (r'giro.*ativo|giro_ativos', 'giro_ativos'),
        (r'^ativo$', 'ativo'),
        (r'disponibilidades', 'disponibilidades'),
        (r'ativo circulante|ativo_circulante', 'ativo_circulante'),
        (r'div.*bruta', 'div_bruta'),
        (r'div.*l[ií]quida', 'div_liquida'),
        (r'patrim(o|ô)nio', 'patrimonio_liquido'),
        (r'receita liquida', 'receita_liquida_12m'),
        (r'lucro l[ií]quido', 'lucro_liquido_12m'),
        (r'empresa', 'empresa'),
        (r'oscila', 'oscilacoes'),
        (r'tipo', 'tipo')
    ]

    def normalize_label(label):
        """Map a page label to a standard column name.

        Uses the first matching LABEL_MAP pattern; otherwise builds a
        snake_case name from the accent-stripped label.
        """
        lab = label.lower().strip()
        trans = str.maketrans("áàãâéêíóôõúüç", "aaaaeeiooouuc")
        lab_no = lab.translate(trans)
        for pat, std in LABEL_MAP:
            if re.search(pat, lab_no):
                return std
        s = re.sub(r'[:\.\-\/\(\)]', ' ', lab_no)
        s = re.sub(r'[^0-9a-z\s]', '', s)
        s = re.sub(r'\s+', '_', s).strip('_')
        return s if s else label.lower()

    # ---------- page parsing ----------
    def parse_fundamentus_page(html_text):
        """Parse one detail page into a {standard_name: parsed_value} dict."""
        soup = BeautifulSoup(html_text, "html.parser")
        raw = {}
        for table in soup.find_all('table'):
            for tr in table.find_all('tr'):
                for label, value in extract_label_value_pairs_from_tr(tr):
                    if not label:
                        continue
                    # Prioritize the first non-empty value: a label that shows
                    # up again later must not overwrite a real value already
                    # stored (the original assigned in both branches, so the
                    # last occurrence always won). Presumably this matters for
                    # labels repeated per reporting period -- confirm on page.
                    if raw.get(label) in (None, "", "-"):
                        raw[label] = value

        # If several raw labels normalize to the same name, the last one wins.
        return {normalize_label(k): parse_brazil_number(v) for k, v in raw.items()}

    # ---------- DataFrame post-processing helpers ----------
    def format_empresa(nome):
        """Capitalize each word, keeping ON, PN, N1, N2, NM uppercase."""
        if not isinstance(nome, str) or nome.strip() == "":
            return nome
        keep_upper = ('ON', 'PN', 'N1', 'N2', 'NM')
        return " ".join(
            p.upper() if p.upper() in keep_upper else p.capitalize()
            for p in nome.split()
        )

    def conv(x):
        """Return int for |x| >= 1000 or integral x, float otherwise; NaN as-is."""
        if pd.isna(x):
            return x
        try:
            if abs(x) >= 1000:
                return int(round(x))
            if float(x).is_integer():
                return int(x)
            return float(x)
        except (TypeError, ValueError, OverflowError):
            # e.g. infinities cannot be converted to int: leave untouched
            return x

    # ---------- main function ----------
    def get_many_tickers_fundamentus_df(tickers, pause=0.6):
        """Fetch every ticker page and assemble the cleaned DataFrame.

        ``pause`` is the number of seconds slept after each request.
        """
        rows = []
        cols_union = set()

        # Context manager guarantees the session's connections are released.
        with requests.Session() as session:
            session.headers.update(HEADERS)
            for orig_tic in tickers:
                # normalize ticker for query: remove '.SA' and leading '^'
                query_tic = orig_tic.split('.')[0].lstrip('^').upper()
                url = f"https://www.fundamentus.com.br/detalhes.php?papel={query_tic}"
                r = session.get(url, timeout=15)
                r.raise_for_status()
                info = parse_fundamentus_page(r.text)
                # keep original requested ticker (for traceability)
                info['ticker'] = orig_tic
                rows.append(info)
                cols_union.update(info.keys())
                sleep(pause)

        # build DataFrame (columns sorted by union)
        cols = ['ticker'] + sorted(c for c in cols_union if c != 'ticker')
        df = pd.DataFrame(rows, columns=cols)

        # ----- Remove requested columns (oscillations, dates, etc.) -----
        # Any 20xx year is dropped instead of a hard-coded 2020-2025 list, so
        # the filter keeps working in later years.
        drop_patterns = re.compile(r'(oscil|dia\b|mes\b|30\b|20\d{2})', re.I)
        # .copy() so the column assignments below act on an independent frame
        # and do not trigger SettingWithCopyWarning.
        df = df[[c for c in df.columns if not drop_patterns.search(c)]].copy()

        # ----- Field Empresa: capitalize while keeping ON, PN, N1, N2, NM uppercase -----
        if 'empresa' in df.columns:
            df['empresa'] = df['empresa'].apply(format_empresa)

        # ----- Number conversion/format: values > 1000 -> integers, avoid scientific notation -----
        for col in df.columns:
            ser = pd.to_numeric(df[col], errors='coerce')
            if ser.notna().any():
                # Build the object Series directly: Series.apply would re-infer
                # a float64 dtype for mixed int/float results and silently undo
                # the int conversion before astype(object) ran.
                df[col] = pd.Series([conv(v) for v in ser], index=ser.index, dtype=object)
            else:
                df[col] = df[col].astype(object)

        # ----- Reorder columns as requested (best-effort) -----
        desired_order = [
            "ticker","cotacao","data_ultima_cotacao","min_52_sem","max_52_sem","vol_med_2m",
            "setor","valor_mercado","valor_firma","nro_acoes","pl","lpa","p_vp","vpa","p_ebit",
            "marg_bruta","psr","marg_ebit","p_ativos","marg_liquida","p_cap_giro","p_ativ_circ_liq",
            "roic","dividend_yield","roe","ev_ebitda","liquidez_corr","ev_ebit","div_br_patrim",
            "cres_rec_5a","giro_ativos","ativo","disponibilidades","ativo_circulante","div_bruta",
            "div_liquida","patrimonio_liquido","receita_liquida_12m","ebit","lucro_liquido_12m"
        ]
        final_cols = []
        existing = list(df.columns)
        used = set()
        for want in desired_order:
            match = None
            if want in existing:
                match = want
            else:
                # fuzzy fallback: first unused column containing the wanted
                # name (with spaces) or all of its underscore-separated tokens
                for c in existing:
                    if c in used:
                        continue
                    cl = c.lower()
                    if want.replace('_', ' ') in cl or all(tok in cl for tok in want.split('_') if tok):
                        match = c
                        break
            # Skip columns already placed: an exact match may have been taken
            # by an earlier fuzzy match, which used to duplicate the column.
            if match and match not in used:
                final_cols.append(match)
                used.add(match)
        # columns not mentioned in desired_order keep their relative order
        final_cols.extend(c for c in existing if c not in used)
        fundamentus = df.reindex(columns=final_cols)

        # format display options to avoid scientific notation
        pd.set_option('display.max_columns', None)
        pd.set_option('display.width', 400)
        pd.set_option('display.float_format', lambda x: '%.0f' % x if abs(x) >= 1000 else ('%.2f' % x))

        return fundamentus

    return get_many_tickers_fundamentus_df(tickersadjusted)