#### Part 1
#### Inspect the values for each financial metric

#### Part 2 - Query against Financial Metrics

In [8]:
# Notebook setup: standard-library helpers, polars, and the project's query helpers.
import time
import polars as pl
from datetime import datetime, date
from typing import Union
# NOTE(review): the output captured further down in this notebook is
# "ImportError: cannot import name 'run_query_debug' from 'utils.utils'", so this
# import line fails as written. Of these helpers, only
# `run_query_to_polars_simple` is called by the code visible in this notebook.
from utils.utils import run_query, run_query_to_polars_simple, run_query_debug, run_query_to_polars_simple1, run_query_to_polars_simple2
# Display configuration for printed polars tables; -1 presumably removes the
# row / column display limits -- confirm against the pl.Config documentation.
pl.Config.set_tbl_rows(-1)
pl.Config.set_tbl_cols(-1) 

# ---------------------------------------------------------------------------
# EXAMPLE VALUES
# Sample inputs for make_query() / make_index() used while experimenting.
# ---------------------------------------------------------------------------

# Universe size and liquidity floor.
max_constituents = 100
min_volume_eur = 100_000

# Universe filters.
selected_countries = ["US", "CN"]
selected_sectors = ["Technology"]
selected_industries = [
    "Software - Application",
    "Media & Entertainment",
    "Semiconductors",
    "Information Technology Services",
]
selected_stocks = [""]

# KPI column name -> accepted values; an empty list means the KPI is not used.
kpis = dict(
    price_to_earnings_ratio_perc=[60, 70, 80, 90, 99, 100],
    gross_profit_margin_perc=[],
    net_profit_margin_perc=[],
)

# Index window and reporting currency.
index_start_date = "2015-03-15"
index_end_date = "2025-08-31"
index_currency = "EUR"
# ---------------------------------------------------------------------------




def make_query(max_constituents,
               selected_countries,
               selected_sectors,
               selected_industries,
               selected_stocks,
               kpis,
               min_volume_eur=100000):
    """Build the constituent-selection SQL, run it, and return the result.

    The query is a chain of CTEs:
      * prep1 - symbols from ``raw.stock_info`` matching country AND industry
        AND sector, plus any explicitly selected symbols.
      * prep2 - rows of ``clean.financial_metrics_perc`` passing the KPI
        filters (or belonging to explicitly selected symbols).
      * prep3 - prep2 restricted to the prep1 symbols.
      * prep4 - quarter-end rows of ``raw.historical_market_cap`` for those
        symbols/periods, tagged with the following quarter (``next_quarter`` /
        ``next_year``).
      * prep5/prep6 - market-cap rank per (year, quarter), keeping the top
        ``max_constituents`` plus any explicitly selected symbols.
      * prep8 - prices from ``raw.historical_price_volume`` for the quarter
        *after* the ranking quarter, above the volume threshold.

    Args:
        max_constituents: Maximum market-cap rank kept per (year, quarter).
        selected_countries: Values for the ``country IN (...)`` filter.
        selected_sectors: Values for the ``sector IN (...)`` filter.
        selected_industries: Values for the ``industry IN (...)`` filter.
        selected_stocks: Symbols that are always included; an empty list
            disables this clause.
        kpis: Mapping of KPI column name -> list of accepted values. Entries
            with an empty list are ignored. When no entry has values, a
            fallback filter on ``asset_turnover_perc`` is used.
        min_volume_eur: Rows with ``volume_eur`` at or below this value are
            dropped. Defaults to 100000, the threshold that was previously
            hard-coded in the query.

    Returns:
        Whatever ``run_query_to_polars_simple`` returns for the query
        (presumably a polars DataFrame -- confirm against utils.utils).

    Raises:
        ValueError: If a KPI name is not a plain identifier.
    """

    def _sql_in_list(values):
        # Render values as a quoted SQL list. Embedded single quotes are
        # doubled so a value cannot terminate its literal. An empty list
        # becomes "(NULL)": "IN ()" is a syntax error, whereas "IN (NULL)" is
        # valid and never true, i.e. "matches nothing".
        rendered = ", ".join("'" + str(v).replace("'", "''") + "'" for v in values)
        return f"({rendered})" if rendered else "(NULL)"

    industries = _sql_in_list(selected_industries)
    sectors = _sql_in_list(selected_sectors)
    countries = _sql_in_list(selected_countries)

    # Explicitly selected symbols are OR-ed into several CTEs; with no
    # selection the clause is omitted entirely.
    if selected_stocks:
        stocks_condition = f"OR symbol IN {_sql_in_list(selected_stocks)}"
    else:
        stocks_condition = ""

    # Keep only KPIs that actually carry values. If none do (no dict, empty
    # dict, or every list empty) use the fallback filter: with no active KPI
    # the column lists below would be empty and leave dangling commas in the
    # generated SQL.
    active_kpis = {kpi: values for kpi, values in (kpis or {}).items() if values}
    if not active_kpis:
        active_kpis = {
            'asset_turnover_perc': ['1', '20', '30', '40', '50', '60', '70', '80', '90', '99', '100']
        }

    # KPI names are spliced in as column identifiers and cannot be quoted like
    # values, so reject anything that is not a plain identifier.
    for kpi in active_kpis:
        if not (isinstance(kpi, str) and kpi.isidentifier()):
            raise ValueError(f"Invalid KPI column name: {kpi!r}")

    kpi_sql = "\n".join(
        f"AND {kpi} IN {_sql_in_list(values)}" for kpi, values in active_kpis.items()
    )
    kpi_cols = ", ".join(active_kpis)
    prep3_kpi_cols = ", ".join(f"p3.{kpi}" for kpi in active_kpis)
    prep6_kpi_cols = ", ".join(f"CAST(prep6.{kpi} AS FLOAT8) AS {kpi}" for kpi in active_kpis)

    # Coerce numeric parameters so only numbers can reach the SQL text.
    max_constituents_sql = int(max_constituents)
    min_volume_sql = float(min_volume_eur)

    query = f"""
    SET enable_mergejoin = off;
    WITH prep1 AS (
        SELECT symbol
        FROM raw.stock_info 
        WHERE (country IN {countries}
        AND industry IN {industries}
        AND sector IN {sectors})
        {stocks_condition}
    ),
    prep2 AS (
        SELECT *
        FROM clean.financial_metrics_perc
        WHERE 1=1
        {kpi_sql}
        {stocks_condition}
    ),
    prep3 AS (
        SELECT 
            p2.symbol, p2.date, p2.fiscal_year, p2.period, p2.reported_currency,
            {kpi_cols}
        FROM prep2 p2
        INNER JOIN prep1 p1 ON p2.symbol = p1.symbol
    ),
    prep4 AS (
        SELECT 
    hmc.*,
    'Q' || (
        CASE 
            WHEN EXTRACT(YEAR FROM hmc.date)::INT = 2013 THEN 4
            WHEN EXTRACT(QUARTER FROM hmc.date)::INT = 4 THEN 1
            ELSE EXTRACT(QUARTER FROM hmc.date)::INT + 1
        END
    ) AS next_quarter,
    CASE 
        WHEN EXTRACT(YEAR FROM hmc.date)::INT = 2013 THEN 2013
        WHEN EXTRACT(QUARTER FROM hmc.date)::INT = 4 THEN EXTRACT(YEAR FROM hmc.date)::INT + 1
        ELSE EXTRACT(YEAR FROM hmc.date)::INT
    END AS next_year,
            {prep3_kpi_cols}
        FROM raw.historical_market_cap hmc
        INNER JOIN prep3 p3
        ON hmc.symbol = p3.symbol
        AND hmc.year = p3.fiscal_year
        AND hmc.quarter = p3.period
        WHERE hmc.last_quarter_date = TRUE
    ),
    prep5 AS (
        SELECT 
            p4.*, 
            RANK() OVER (
                PARTITION BY p4.year, p4.quarter 
                ORDER BY p4.market_cap_eur DESC
            ) AS mcap_rank
        FROM prep4 p4
    ),
    prep6 AS (
        SELECT *
        FROM prep5
        WHERE mcap_rank <= {max_constituents_sql}
        {stocks_condition}
    ),
    prep8 AS (
        SELECT 
            prep7.date,
            prep7.symbol,
            prep7.currency,
            prep7.year,
            prep7.quarter,
            cast(prep7.last_quarter_date as BOOLEAN) as last_quarter_date,
            CAST(prep7.close as FLOAT8) as close ,
            CAST(prep7.close_eur as FLOAT8) as close_eur,
            CAST(prep7.close_usd as FLOAT8) as close_usd,
            CAST(prep6.market_cap as FLOAT8) as market_cap,
            CAST(prep6.market_cap_eur as FLOAT8) as market_cap_eur, 
            CAST(prep6.market_cap_usd as FLOAT8) as market_cap_usd,
            {prep6_kpi_cols},
            CAST(prep6.mcap_rank as INTEGER) as mcap_rank
        FROM raw.historical_price_volume prep7
        INNER JOIN prep6
        ON prep7.symbol = prep6.symbol
        AND prep7.year = prep6.next_year
        AND prep7.quarter = prep6.next_quarter
        WHERE volume_eur > {min_volume_sql}
    )
    SELECT *
    FROM prep8
    --WHERE (EXTRACT(DOW FROM date) = 1 OR last_quarter_date = TRUE)
    """
    df = run_query_to_polars_simple(query)
    return df

########################################################
########################################################
########################################################


def build_constituent_weights_dict(rebalance_snapshots: pl.DataFrame, mcap_col: str) -> dict:
    """Turn per-quarter rebalance snapshots into nested weight lists.

    Args:
        rebalance_snapshots: One row per (year, quarter) with the columns
            ``year``, ``quarter``, ``symbol`` (list of symbols) and
            ``mcap_col`` (list of market caps aligned with ``symbol``).
        mcap_col: Name of the market-cap list column to weight by.

    Returns:
        ``{year: {quarter: [{"symbol": s, "weight": w}, ...]}}`` where each
        list is sorted by descending weight and ``w`` is the symbol's share of
        the snapshot's total market cap. Snapshots with no symbols or a
        non-positive total market cap are omitted.
    """
    weights_by_year_quarter: dict = {}
    for row in rebalance_snapshots.sort(["year", "quarter"]).iter_rows(named=True):
        symbols = row["symbol"]
        mcaps = row[mcap_col]
        if not symbols or mcaps is None:
            continue

        # A null market cap cannot be weighted; drop that constituent instead
        # of letting sum()/float() raise TypeError for the whole snapshot.
        valued = [(s, float(m)) for s, m in zip(symbols, mcaps) if m is not None]
        total_mcap = sum(m for _, m in valued)
        if total_mcap <= 0.0:
            continue

        pairs = sorted(
            ({"symbol": s, "weight": m / total_mcap} for s, m in valued),
            key=lambda pair: pair["weight"],
            reverse=True,
        )
        weights_by_year_quarter.setdefault(row["year"], {})[row["quarter"]] = pairs
    return weights_by_year_quarter


def make_index(df: pl.DataFrame,
               index_start_date: Union[date, str, None] = "2014-01-01",
               index_end_date: Union[date, str, None] = None,
               index_currency: str = "EUR",
               index_start_amount: float = 1000.0):
    """Compute a market-cap-weighted index level series from price rows.

    Args:
        df: Long-format rows with at least the columns ``date``, ``symbol``,
            ``year``, ``quarter``, ``last_quarter_date``, the price column
            (``close_eur`` / ``close_usd``) and the market-cap column
            (``market_cap_eur`` / ``market_cap_usd``).
        index_start_date: First date of the index, as a ``date`` or a
            ``"YYYY-MM-DD"`` string. A rebalance snapshot on or before this
            date must exist.
        index_end_date: Optional last date (inclusive), same formats.
        index_currency: ``"EUR"`` selects the ``*_eur`` columns; any other
            value selects the ``*_usd`` columns.
        index_start_amount: Index level on the first date.

    Returns:
        ``(result_df, constituent_weights_by_year_quarter)`` where
        ``result_df`` has the columns ``date`` and ``index_value`` and the
        second element is the output of ``build_constituent_weights_dict``.

    Raises:
        ValueError: If no rebalance snapshot exists on or before
            ``index_start_date``, or if that snapshot has no positive market
            cap to derive weights from.

    Notes:
        Weights are held constant between rebalances and applied to each day's
        price ratios, i.e. the basket is implicitly re-weighted to the
        snapshot weights every day.

        NOTE(review): when ``df`` comes from ``make_query``, the market caps
        on a quarter's rows are joined from the preceding quarter-end (the
        ``next_year`` / ``next_quarter`` join), while the snapshot here is
        dated on that quarter's ``last_quarter_date`` row -- TODO confirm this
        one-quarter offset is intended.
    """

    def _normalized_weights(symbols, mcaps):
        # {symbol: share of total market cap}; null caps are skipped and an
        # empty dict is returned when there is no positive total to divide by.
        valued = [(s, float(m)) for s, m in zip(symbols or [], mcaps or []) if m is not None]
        total = sum(m for _, m in valued)
        if total <= 0.0:
            return {}
        return {s: m / total for s, m in valued}

    # Step 0: Cast decimals to floats
    decimal_cols = [name for name, dtype in zip(df.columns, df.dtypes) if dtype.base_type() == pl.Decimal]
    df = df.with_columns([pl.col(c).cast(pl.Float64) for c in decimal_cols])

    # Step 1: Parse date column
    if df.schema["date"] == pl.Utf8:
        df = df.with_columns(pl.col("date").str.to_date())
    elif df.schema["date"] != pl.Date:
        df = df.with_columns(pl.col("date").cast(pl.Date))

    # Step 2: Forward-fill prices based on currency
    price_col = "close_eur" if index_currency == "EUR" else "close_usd"
    mcap_col = "market_cap_eur" if index_currency == "EUR" else "market_cap_usd"

    df = (
        df.sort(["symbol", "date"])
          .with_columns([
              pl.col(price_col).forward_fill().over("symbol")
          ])
    )

    # Step 3: Rebalance snapshots (one per year/quarter, taken from the rows
    # flagged as the last date of the quarter)
    rebalance_df = df.filter(pl.col("last_quarter_date") == True)
    print(f"Rebalance df: {rebalance_df}")

    rebalance_snapshots = (
        rebalance_df
        .group_by(["year", "quarter"])
        .agg([
            pl.first("date").alias("rebalance_date"),
            pl.col("symbol"),
            pl.col(mcap_col)
        ])
    )
    constituent_weights_by_year_quarter = build_constituent_weights_dict(rebalance_snapshots, mcap_col)
    # Optional: brief debug output
    if constituent_weights_by_year_quarter:
        some_year = next(iter(constituent_weights_by_year_quarter))
        some_quarter = next(iter(constituent_weights_by_year_quarter[some_year]))
        print(f"Sample weights for {some_year} {some_quarter} (top 5):",
              constituent_weights_by_year_quarter[some_year][some_quarter][:5])

    # Step 4: Parse index_start_date and index_end_date
    if isinstance(index_start_date, str):
        index_start_date = datetime.strptime(index_start_date, "%Y-%m-%d").date()

    if isinstance(index_end_date, str):
        index_end_date = datetime.strptime(index_end_date, "%Y-%m-%d").date()

    # Step 5: Find the latest rebalance on or before index_start_date
    initial_snapshot = rebalance_snapshots.filter(pl.col("rebalance_date") <= index_start_date)
    if initial_snapshot.is_empty():
        raise ValueError(f"No rebalance snapshot found on or before {index_start_date}")

    initial_row = initial_snapshot.sort("rebalance_date", descending=True).row(0, named=True)
    active_weights = _normalized_weights(initial_row["symbol"], initial_row[mcap_col])
    if not active_weights:
        raise ValueError(
            f"Rebalance snapshot on {initial_row['rebalance_date']} has no positive "
            f"{mcap_col} to derive weights from"
        )

    # Step 6: Store future quarterly rebalance dates > index_start_date.
    # Snapshots without a positive total market cap are skipped, so the
    # previous weights simply stay active across that date.
    future_rebalances = {}
    for snapshot in rebalance_snapshots.iter_rows(named=True):
        if snapshot["rebalance_date"] > index_start_date:
            snapshot_weights = _normalized_weights(snapshot["symbol"], snapshot[mcap_col])
            if snapshot_weights:
                future_rebalances[snapshot["rebalance_date"]] = snapshot_weights

    # Step 7: Pivot prices using appropriate currency column
    pivot = (
        df.with_columns([
            pl.col("date").cast(pl.Utf8)
        ])
        .select(["date", "symbol", price_col])
        .pivot(index="date", values=price_col, on="symbol", aggregate_function="first")
        .with_columns([
            pl.col("date").str.to_date()
        ])
        .sort("date")
        .with_columns(
            pl.all().exclude("date").fill_null(strategy="forward")
        )
    )

    pivot = pivot.filter(pl.col("date") >= pl.lit(index_start_date))

    print(pivot)

    # Step 8: Main loop
    index_value = float(index_start_amount)
    index_values = []
    previous_prices = None  # last known price per tracked symbol; None until the first row
    symbol_columns = [c for c in pivot.columns if c != "date"]

    for row in pivot.iter_rows(named=True):
        current_date = row["date"]
        price_row = {s: row[s] for s in symbol_columns}

        # Apply today's return under the weights that were active coming into
        # today. This runs before any rebalance below so that the price move
        # on a rebalance date is not dropped from the index.
        if previous_prices is not None:
            daily_return = 0.0
            for symbol, weight in active_weights.items():
                prev_price = previous_prices.get(symbol)
                current_price = price_row.get(symbol)
                if prev_price is not None and current_price is not None and prev_price > 0:
                    daily_return += weight * (current_price / prev_price)
                else:
                    # No usable price pair: treat the symbol as flat today.
                    daily_return += weight * 1.0
            index_value *= daily_return

            # Roll the reference prices forward where a price is available.
            for symbol in previous_prices:
                if price_row.get(symbol) is not None:
                    previous_prices[symbol] = price_row[symbol]

        # Switch to the new weights at the close of a rebalance date; on the
        # very first row just start tracking the initial basket.
        rebalancing_today = current_date in future_rebalances
        if rebalancing_today:
            active_weights = future_rebalances[current_date]
        if rebalancing_today or previous_prices is None:
            previous_prices = {
                s: price_row[s] for s in active_weights if price_row.get(s) is not None
            }

        index_values.append((current_date, index_value))

    result_df = pl.DataFrame(index_values, schema=["date", "index_value"], orient="row").sort("date")
    if index_end_date is not None:
        # Cut off the final dataframe at index_end_date (inclusive)
        result_df = result_df.filter(pl.col("date") <= pl.lit(index_end_date))
    return result_df, constituent_weights_by_year_quarter


########################################################
########################################################
########################################################




ImportError: cannot import name 'run_query_debug' from 'utils.utils' (/Users/aleksamihajlovic/Documents/naro-index-advisor/stock-service/src/utils/utils.py)

In [5]:
import pandas as pd

# Load the benchmark reference table from the project's CSV file.
df = pd.read_csv(filepath_or_buffer='utils/fields/benchmarks.csv')

In [6]:
# Display the benchmarks table that was read from utils/fields/benchmarks.csv.
df

Unnamed: 0,name,symbol,type,date,return_eur,return_usd,risk_eur,risk_usd
0,SPDR S&P 500 ETF Trust,SPY,etf,2014-01-02,0.1367,0.1176,0.0354,0.0549
1,VanEck Semiconductor ETF,SMH,etf,2014-01-02,0.2921,0.2766,0.0713,0.0942
2,1/100 Dow Jones Industrial Average,^DJX,index,2014-01-02,0.1168,0.0979,0.0345,0.0446
3,13 Week Treasury Bill Index,^IRX,index,2014-01-02,11.0800,10.1279,0.3399,0.3386
4,AEX Index,^AEX,index,2014-01-02,0.0839,0.0701,0.0538,0.0751
...,...,...,...,...,...,...,...,...
192,TSEC Weighted Index,^TWII,index,2014-01-02,0.1216,0.1075,0.0697,0.0816
193,US Dollar Index,DX-Y.NYB,index,2014-01-02,0.0502,0.0227,0.0618,0.0300
194,WIG20 Index,WIG20.WA,index,2014-01-02,0.0305,0.0237,0.1072,0.1165
195,WIG Index,WIG.WA,index,2014-01-02,0.0776,0.0696,0.0900,0.1019


In [14]:
# Largest value in the return_eur column of the benchmarks table.
df["return_eur"].max()

np.float64(11.08)