In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re

# ---------- CONFIG ----------
REQ_HEADERS = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                             "(KHTML, like Gecko) Chrome/124.0 Safari/537.36"}
TIMEOUT = 25
DELAY = 1.5
VERBOSE = True  # set False to reduce prints

# ---------- SCHEMA ----------
FIELDS_SINGLE = [
    "Company_Name", "IPO Date", "Listing Date", "Face Value", "Issue Price",
    "Lot Size", "Sale Type", "Total Issue Size", "Fresh Issue", "Offer for Sale",
    "Employee Discount", "Issue Type", "Listing At",
    "Share Holding Pre Issue", "Share Holding Post Issue",
    "Promoter Holding Pre Issue", "Promoter Holding Post Issue",
    "ROE", "ROCE", "Debt/Equity", "RoNW", "PAT Margin",
    "Price to Book Value", "BSE Script Code", "NSE Symbol", "ISIN",
    "Final Issue Price", "NSE Listing Group", "BSE Listing Group"
]

FIELDS_MULTI = [
    "EPS_PreIPO", "EPS_PostIPO", "PE_PreIPO", "PE_PostIPO"
]

FIELDS_BIDDING = [
    "QIB_Subs", "QIB_SharesOffered",
    "NII_Subs", "NII_SharesOffered",
    "Retail_Subs", "Retail_SharesOffered",
    "Employee_Subs", "Employee_SharesOffered",
    "Others_Subs", "Others_SharesOffered"
]

FIELDS_LISTING = [
    "BSE_Open", "BSE_High", "BSE_Low", "BSE_Close",
    "NSE_Open", "NSE_High", "NSE_Low", "NSE_Close"
]

ALL_FIELDS = FIELDS_SINGLE + FIELDS_MULTI + FIELDS_BIDDING + FIELDS_LISTING

# ---------- KEY MAPPINGS / NORMALIZATION ----------
# Map many possible label texts (from Chittorgarh) to canonical field names
KEY_MAP = {
    # common 2-col keys (lowercased)
    "ipo open date": "IPO Date",  # sometimes combined
    "ipo date": "IPO Date",
    "listing date": "Listing Date",
    "face value": "Face Value",
    "face value (rs)": "Face Value",
    "price band / issue price": "Issue Price",
    "issue price": "Issue Price",
    "lot size": "Lot Size",
    "sale type": "Sale Type",
    "total issue size": "Total Issue Size",
    "fresh issue": "Fresh Issue",
    "offer for sale": "Offer for Sale",
    "employee discount": "Employee Discount",
    "issue type": "Issue Type",
    "listing at": "Listing At",
    "shareholding pre-issue": "Share Holding Pre Issue",
    "shareholding post-issue": "Share Holding Post Issue",
    "promoter holding (pre issue)": "Promoter Holding Pre Issue",
    "promoter holding (post issue)": "Promoter Holding Post Issue",
    "roe (%)": "ROE",
    "roce (%)": "ROCE",
    "debt/equity": "Debt/Equity",
    "ronw (%)": "RoNW",
    "pat margin (%)": "PAT Margin",
    "price to book value": "Price to Book Value",
    "bse script code": "BSE Script Code",
    "nse symbol": "NSE Symbol",
    "isin": "ISIN",
    "final issue price": "Final Issue Price",
    "nse listing group": "NSE Listing Group",
    "bse listing group": "BSE Listing Group",
    # alternate phrasings
    "ipo open date": "IPO Date",
    "ipo opens on": "IPO Date",
    "ipo closes on": "IPO Date",
    "issue opens on": "IPO Date",
    "issue closes on": "IPO Date",
}

# Helper to normalize text for matching
def norm(s):
    if not isinstance(s, str):
        return ""
    return re.sub(r"\s+", " ", s.replace("\xa0", " ")).strip()

def lc(s):
    return norm(s).lower()

# ---------- HTML helpers ----------
def get_soup(url):
    s = requests.Session()
    s.headers.update(REQ_HEADERS)
    r = s.get(url, timeout=TIMEOUT)
    r.raise_for_status()
    return BeautifulSoup(r.text, "html.parser")

def debug(*args, **kwargs):
    if VERBOSE:
        print(*args, **kwargs)

# ---------- Table parsing utilities ----------
def table_to_matrix(table):
    """Return list of rows, each is list of cell texts (basic, ignores colspan/rowspan complexity)."""
    rows = []
    for tr in table.find_all("tr"):
        cells = []
        for cell in tr.find_all(["th", "td"]):
            txt = norm(cell.get_text(separator=" ", strip=True))
            cells.append(txt)
        if cells:
            rows.append(cells)
    return rows

def find_header_indices(header_row, tokens):
    """
    header_row: list of header texts
    tokens: list of substrings to find (lowercased)
    returns dict token -> index (first matching index), or -1 if not found
    """
    hmap = {}
    for t in tokens:
        found = -1
        for i, h in enumerate(header_row):
            if t in lc(h):
                found = i
                break
        hmap[t] = found
    return hmap

# ---------- Core scraper ----------
def scrape_chittorgarh_single(url):
    out = {f: None for f in ALL_FIELDS}
    try:
        soup = get_soup(url)
        debug("==== URL:", url)

        # Company Name from <h1>
        h1 = soup.find("h1")
        if h1:
            out["Company_Name"] = norm(h1.get_text())

        # Find all tables on the page
        tables = soup.find_all("table", class_="table")
        debug(f"Found {len(tables)} tables")

        for idx, table in enumerate(tables):
            mat = table_to_matrix(table)
            if not mat:
                continue
            debug(f"Table #{idx+1} preview (first 3 rows):", mat[:3])

            # 1) Try 2-column key: value tables (common)
            is_two_col = all(len(row) == 2 for row in mat)
            if is_two_col:
                for row in mat:
                    key_raw, val = row[0], row[1]
                    kl = lc(key_raw)
                    # direct key map
                    if kl in KEY_MAP:
                        out[KEY_MAP[kl]] = val
                        continue
                    # try partial matching
                    matched = False
                    for k_alias, canon in KEY_MAP.items():
                        if k_alias in kl:
                            out[canon] = val
                            matched = True
                            break
                    if matched:
                        continue
                    # direct match with FIELDS_SINGLE by exact label
                    if key_raw in FIELDS_SINGLE:
                        out[key_raw] = val
                        continue
                    # relaxed checks (examples: "Fresh Issue (shares)" etc)
                    if "fresh issue" in kl and out.get("Fresh Issue") is None:
                        out["Fresh Issue"] = val
                    if "offer for sale" in kl and out.get("Offer for Sale") is None:
                        out["Offer for Sale"] = val
                    if "lot size" in kl and out.get("Lot Size") is None:
                        out["Lot Size"] = val
                    if "final issue price" in kl and out.get("Final Issue Price") is None:
                        out["Final Issue Price"] = val
                # done with this table
                continue

            # 2) Multi-column tables: analyze header / first row for hints
            header = mat[0]
            header_lc = [lc(h) for h in header]

            # --- EPS / P/E table detection ---
            # If any cell contains 'eps' or 'p/e' or 'p/e (x)' or header includes Pre/Post
            header_join = " ".join(header_lc)
            if any("eps" in h for h in header_lc) or any("p/e" in h for h in header_lc) or "eps" in header_join:
                # look for rows that have label + two numeric columns
                for row in mat[1:]:
                    if len(row) >= 3:
                        label = lc(row[0])
                        pre = row[1]
                        post = row[2]
                        if "eps" in label or "earnings per share" in label:
                            out["EPS_PreIPO"] = pre
                            out["EPS_PostIPO"] = post
                        elif "p/e" in label or "pe" in label or "p/e (x)" in label:
                            out["PE_PreIPO"] = pre
                            out["PE_PostIPO"] = post
                continue

            # --- Shareholding pattern detection (Promoter row) ---
            # If header has 'pre' & 'post' or 'pre issue' 'post issue'
            if any("pre" in h or "post" in h for h in header_lc):
                # try find row where first col contains 'promoter' or 'promoters'
                for row in mat[1:]:
                    if len(row) >= len(header):
                        first = lc(row[0])
                        if "promoter" in first:
                            # find index of header with 'pre' and 'post'
                            pre_idx = -1
                            post_idx = -1
                            for i, h in enumerate(header_lc):
                                if "pre" in h and pre_idx == -1:
                                    pre_idx = i
                                if "post" in h and post_idx == -1:
                                    post_idx = i
                            if pre_idx != -1 and post_idx != -1:
                                out["Promoter Holding Pre Issue"] = row[pre_idx]
                                out["Promoter Holding Post Issue"] = row[post_idx]
                                # also set generic shareholding if not set
                                if not out["Share Holding Pre Issue"]:
                                    out["Share Holding Pre Issue"] = row[pre_idx]
                                if not out["Share Holding Post Issue"]:
                                    out["Share Holding Post Issue"] = row[post_idx]
                                break
                # continue to next table after trying shareholding
                continue

            # --- Bidding table detection ---
            # Look for keywords in header: 'category' and 'subscription' and 'shares'
            if any("subscription" in h for h in header_lc) and any("category" in h for h in header_lc):
                # find indices
                idx_cat = next((i for i, h in enumerate(header_lc) if "category" in h), None)
                idx_subs = next((i for i, h in enumerate(header_lc) if "subscription" in h), None)
                idx_off = next((i for i, h in enumerate(header_lc) if "share" in h and "offer" in h or ("shares" in h and "offered" in h) or "shares offered" in h), None)
                # fallback: match any 'shares' for offered if exact not found
                if idx_off is None:
                    idx_off = next((i for i, h in enumerate(header_lc) if "shares" in h), None)
                if idx_cat is None or idx_subs is None:
                    # header may be in second row or header row absent; try heuristic: find columns by name search across first two rows
                    pass

                # parse rows
                for row in mat[1:]:
                    if idx_cat is None:
                        continue
                    cat = lc(row[idx_cat]) if idx_cat < len(row) else ""
                    subs = row[idx_subs] if (idx_subs is not None and idx_subs < len(row)) else None
                    off = row[idx_off] if (idx_off is not None and idx_off < len(row)) else None

                    if "qib" in cat:
                        out["QIB_Subs"] = subs
                        out["QIB_SharesOffered"] = off
                    elif "nii" in cat or "hni" in cat or "non institutional" in cat:
                        out["NII_Subs"] = subs
                        out["NII_SharesOffered"] = off
                    elif "retail" in cat:
                        out["Retail_Subs"] = subs
                        out["Retail_SharesOffered"] = off
                    elif "employee" in cat:
                        out["Employee_Subs"] = subs
                        out["Employee_SharesOffered"] = off
                    else:
                        # catch-all: put in Others if not matched and category non-empty
                        if cat and "total" not in cat:
                            out["Others_Subs"] = subs
                            out["Others_SharesOffered"] = off
                continue

            # --- Listing day Price Details detection ---
            # Table with columns like Price Details | BSE | NSE
            if any("price details" in h for h in header_lc) or (("bse" in header_lc or "nse" in header_lc) and len(header_lc) >= 2):
                # Find idx for bse and nse
                idx_bse = next((i for i, h in enumerate(header_lc) if "bse" in h), None)
                idx_nse = next((i for i, h in enumerate(header_lc) if "nse" in h), None)
                # rows are usually: Open, High, Low, Close
                for row in mat[1:]:
                    if len(row) >= 3:
                        label = lc(row[0])
                        bse_val = row[idx_bse] if (idx_bse is not None and idx_bse < len(row)) else None
                        nse_val = row[idx_nse] if (idx_nse is not None and idx_nse < len(row)) else None
                        if "open" in label:
                            out["BSE_Open"], out["NSE_Open"] = bse_val, nse_val
                        elif "high" in label:
                            out["BSE_High"], out["NSE_High"] = bse_val, nse_val
                        elif "low" in label:
                            out["BSE_Low"], out["NSE_Low"] = bse_val, nse_val
                        elif "close" in label or "closing" in label:
                            out["BSE_Close"], out["NSE_Close"] = bse_val, nse_val
                continue

            # If not matched, skip this table (we intentionally do not explode into many columns)
            debug("Skipped table (no matching pattern)")

    except Exception as e:
        debug("ERROR scraping", url, ":", e)
    return out

# ---------- BATCH RUN ----------
def scrape_multiple(urls, out_csv="ipo_chittorgarh_multi.csv"):
    rows = []
    for u in urls:
        debug("\n--- scraping:", u)
        row = scrape_chittorgarh_single(u)
        rows.append(row)
        time.sleep(DELAY)
    df = pd.DataFrame(rows, columns=ALL_FIELDS)
    df.to_csv(out_csv, index=False)
    print(f"Saved {len(df)} rows to {out_csv}")
    return df

# ---------- Example (Afcons) ----------
if __name__ == "__main__":
    urls = [
        "https://www.chittorgarh.com/ipo/afcons-infrastructure-ipo/1881/"
    ]
    # set VERBOSE True to see table previews / debug
    VERBOSE = True
    df = scrape_multiple(urls)
    print(df.T)



--- scraping: https://www.chittorgarh.com/ipo/afcons-infrastructure-ipo/1881/
==== URL: https://www.chittorgarh.com/ipo/afcons-infrastructure-ipo/1881/
Found 15 tables
Table #1 preview (first 3 rows): [['']]
Skipped table (no matching pattern)
Table #2 preview (first 3 rows): [['']]
Skipped table (no matching pattern)
Table #3 preview (first 3 rows): [['IPO Date', 'October 25, 2024 to October 29, 2024'], ['Listing Date', 'November 4, 2024'], ['Face Value', '₹ 10 per share']]
Table #4 preview (first 3 rows): [['Investor Category', 'Shares Offered', 'Maximum Allottees'], ['QIB Shares Offered', '5,83,69,329 (49.75%)', 'NA'], ['− Anchor Investor Shares Offered', '3,50,21,597 (29.85%)', 'NA']]
Skipped table (no matching pattern)
Table #5 preview (first 3 rows): [['Application Category', 'Maximum Bidding Limits', 'Bidding at Cut-off Price Allowed'], ['Only RII', 'Up to Rs 2 Lakhs', 'Yes'], ['Only sNII', 'Rs 2 Lakhs to Rs 10 Lakhs', 'No']]
Skipped table (no matching pattern)
Table #6 preview

In [3]:
df.T

Unnamed: 0,0
Company_Name,Afcons Infrastructure Ltd. IPO (Afcons Infrast...
IPO Date,"Fri, Oct 25, 2024"
Listing Date,"November 4, 2024"
Face Value,₹ 10 per share
Issue Price,₹463 per share
Lot Size,32 Shares
Sale Type,Fresh Capital-cum-Offer for Sale
Total Issue Size,"11,73,27,139 shares (aggregating up to ₹ 5,430..."
Fresh Issue,"2,70,46,362 shares (aggregating up to ₹ 1,251...."
Offer for Sale,"9,02,80,777 shares of ₹ 10 (aggregating up to ..."
