#### PART 1
#### Inspect the values for each financial metric

In [None]:
import psycopg2
import pandas as pd
from utils.utils import get_postgres_connection


def read_financial_metrics():
    """Simple function to read financial_metrics table and return as pandas DataFrame"""
    conn = get_postgres_connection()
    query = "SELECT * FROM raw.financial_metrics"
    df = pd.read_sql_query(query, conn)
    conn.close()
    return df


df = read_financial_metrics()
print(f"Read {len(df)} records from financial_metrics table")
print(df.head())


In [None]:
import pandas as pd
import numpy as np
from IPython.display import display_html

def _safe_numeric_series(s: pd.Series) -> pd.Series:
    """Keep only finite numeric values."""
    s = pd.to_numeric(s, errors="coerce")
    s = s.replace([np.inf, -np.inf], np.nan).dropna()
    return s

def _summary_with_custom_percentiles(s: pd.Series, percentiles=None) -> pd.DataFrame:
    if percentiles is None:
        # include 5th, 95th and deciles 10..90
        percentiles = [0.001, 0.01,0.05] + [i/100 for i in range(10, 100, 10)] + [0.95, 0.99, 0.999]
    desc = s.describe(percentiles=percentiles)
    out = desc.reset_index()
    out.columns = ["Statistic", "Value"]
    return out

def _bucket_counts_qcut(s: pd.Series, q=10) -> pd.DataFrame:
    """Counts per percentile bucket, safe against duplicates and constants."""
    n = len(s)
    if n == 0:
        return pd.DataFrame({"Percentile Bucket": [], "Count": []})
    if s.nunique(dropna=True) < 2:
        return pd.DataFrame({"Percentile Bucket": ["All values equal"], "Count": [n]})

    try:
        buckets = pd.qcut(s, q=q, labels=[f"{i*100//q}-{(i+1)*100//q}%" for i in range(q)], duplicates="drop")
        counts = buckets.value_counts().sort_index().reset_index()
        counts.columns = ["Percentile Bucket", "Count"]
        return counts
    except Exception:
        qs = np.linspace(0, 1, q + 1)
        edges = np.unique(s.quantile(qs).values)
        if len(edges) < 2:
            return pd.DataFrame({"Percentile Bucket": ["All values equal"], "Count": [n]})
        labels = [f"{int(qs[i]*100)}-{int(qs[i+1]*100)}%" for i in range(len(edges)-1)]
        buckets = pd.cut(s, bins=edges, include_lowest=True, labels=labels, duplicates="drop")
        counts = buckets.value_counts().sort_index().reset_index()
        counts.columns = ["Percentile Bucket", "Count"]
        return counts

def _display_side_by_side(dfs: list, titles: list):
    html = ""
    for df, title in zip(dfs, titles):
        html += (
            "<div style='display:inline-block; padding-right:30px; vertical-align:top;'>"
            f"<h3 style='margin:4px 0 8px 0;'>{title}</h3>"
            f"{df.to_html(index=False)}"
            "</div>"
        )
    display_html(html, raw=True)

def describe_all_numeric_with_buckets(df: pd.DataFrame, q=10, percentiles=None, max_cols=None):
    """
    For each numeric column:
      - show summary with custom percentiles (5th, 10..90, 95th by default)
      - show counts per percentile bucket
    """
    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    if max_cols is not None:
        num_cols = num_cols[:max_cols]

    if not num_cols:
        print("No numeric columns found.")
        return

    for col in num_cols:
        s = _safe_numeric_series(df[col])
        if s.empty:
            print(f"ðŸ“­ Column: {col} â€” skipped (no finite numeric values).")
            continue

        stats = _summary_with_custom_percentiles(s, percentiles=percentiles)
        buckets = _bucket_counts_qcut(s, q=q)

        print(f"ðŸ“Š Column: {col} (n={len(s)})")
        _display_side_by_side(
            [stats, buckets],
            ["Summary Statistics", f"Counts per Percentile Bucket ({q} buckets)"]
        )
        print()  # spacing


# ---- Run it ----
# This will include 5th, 10..90, 95th percentiles + decile bucket counts
describe_all_numeric_with_buckets(df, q=10)


#### Part 2 - Query against Financial Metrics

In [43]:


HOST = "91.107.196.130"
DATABASE = "naro_index_db"
USERNAME = "naro_user"
PASSWORD = "naro_password"
PORT = 5432

def get_remote_postgres_connection():
    """Get a direct PostgreSQL connection to the remote database"""
    return psycopg2.connect(
        dbname=DATABASE,
        user=USERNAME,
        password=PASSWORD,
        host=HOST,
        port=PORT
    )

In [None]:
import psycopg2
import pandas as pd
pd.set_option("display.max_rows", None)
from utils.utils import get_postgres_connection

def run_query(query):
    conn = get_remote_postgres_connection()
    query = query
    df = pd.read_sql_query(query, conn)
    conn.close()
    return df

max_constituents = 100
user_country_list = ['US']
user_sector_list = ['Technology']
user_industry_list = ['Software - Applicatio']


perc_list_all = [1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 99, 100]
kpis = {
    'price_to_earnings_ratio_perc': [60, 70, 80, 90, 100],
    'gross_profit_margin_perc': [],
    'net_profit_margin_perc': [],
}

kpi_filters = [
    f"AND {kpi} IN ({', '.join(map(str, values))})"
    for kpi, values in kpis.items() if values
]

kpi_sql = "\n".join(kpi_filters)
active_kpis = [kpi for kpi, values in kpis.items() if values]
kpi_cols        = ", ".join(active_kpis)
prep3_kpi_cols  = ", ".join(f"prep3.{kpi}" for kpi in active_kpis)
prep6_kpi_cols  = ", ".join(f"prep6.{kpi}" for kpi in active_kpis)


#######

country_query = "SELECT DISTINCT country FROM raw.stock_info WHERE COUNTRY IS NOT NULL"
country_df = run_query(country_query)
country_list_all = list(country_df.country)
country_list_filtered = [country for country in country_list_all if country in user_country_list]
selected_countries = country_list_filtered if country_list_filtered else country_list_all
countries = "(" + ", ".join(f"'{c}'" for c in selected_countries) + ")"
#######

sector_query = "SELECT DISTINCT sector FROM raw.stock_info WHERE SECTOR IS NOT NULL"
sector_df = run_query(sector_query)
sector_list_all = list(sector_df.sector)
sector_list_filtered = [sector for sector in sector_list_all if sector in user_sector_list]
selected_sectors = sector_list_filtered if sector_list_filtered else sector_list_all
sectors = "(" + ", ".join(f"'{s}'" for s in selected_sectors) + ")"
#######

industry_query = "SELECT DISTINCT industry FROM raw.stock_info WHERE INDUSTRY IS NOT NULL"
industry_df = run_query(industry_query)
industry_list_all = list(industry_df.industry)
industry_list_filtered = [industry for industry in industry_list_all if industry in user_industry_list]
selected_industries = industry_list_filtered if industry_list_filtered else industry_list_all
industries = "(" + ", ".join(f"'{i}'" for i in selected_industries) + ")"
#######


# perc_list_filtered = [perc for perc in perc_list_all if perc in user_price_to_earnings_ratio_perc_list]
# selected_percs = perc_list_filtered if perc_list_filtered else perc_list_all
# price_to_earnings_ratio_perc_list = "(" + ", ".join(f"'{p}'" for p in selected_percs) + ")"



query = f"""
WITH prep1 AS (
    SELECT symbol
    FROM raw.stock_info 
    WHERE 1=1
    AND country in {countries}
    AND industry in {industries}
    AND sector in {sectors}
),
prep2 AS (
    SELECT *
    FROM clean.financial_metrics_perc
    WHERE 1=1
    {kpi_sql}
),
prep3 as(
    SELECT 
    symbol, date, fiscal_year, period, reported_currency, 
    {kpi_cols}
    FROM prep2
    WHERE symbol IN (SELECT symbol FROM prep1)
),
prep4 as (
    SELECT hmc.*, 
    {prep3_kpi_cols}
    FROM raw.historical_market_cap hmc
    INNER JOIN prep3
    ON hmc.symbol = prep3.symbol
    AND hmc.year = prep3.fiscal_year
    AND hmc.quarter = prep3.period
    WHERE hmc.last_quarter_date= true
),
prep5 as (
    SELECT 
    p.*,
    RANK() OVER (
    PARTITION BY p.year, p.quarter 
    ORDER BY p.market_cap_eur DESC
    ) AS mcap_rank
    FROM prep4 p
    ORDER BY p.date DESC, p.market_cap_eur desc
),
prep6 as (
    SELECT * from prep5
    WHERE mcap_rank <= 100
    ORDER BY year DESC, quarter desc, mcap_rank asc
),
prep7 as (
    SELECT hpv.*, 
    prep6.market_cap, 
    prep6.market_cap_eur,
    {prep6_kpi_cols},
    prep6.mcap_rank
    FROM raw.historical_price_volume hpv
    INNER JOIN prep6
    ON hpv.symbol = prep6.symbol
    AND hpv.year = prep6.year
    AND hpv.quarter = prep6.quarter
    WHERE hpv.
)
select * FROM prep7
--WHERE last_quarter_date = true
ORDER BY date DESC, mcap_rank ASC"""

df = run_query(query)

constituents_per_day = (
    df.loc[df["last_quarter_date"] == True]
      .groupby(["year", "quarter"])["symbol"]
      .nunique()
      .reset_index(name="unique_symbol_count")
      .sort_values(["year", "quarter"], ascending=[False, False])  # both descending
)


print(constituents_per_day)


print(f"Read {len(df)} records from financial_metrics table")
df[["date", "symbol", "currency", "close", "volume", "year", "quarter", "last_quarter_date", "close_eur", "market_cap_eur"]].head(101)
#df.head()
#print(constituents_per_day)


  df = pd.read_sql_query(query, conn)


    year quarter  unique_symbol_count
46  2025      Q2                  100
45  2025      Q1                  100
44  2024      Q4                  100
43  2024      Q3                  100
42  2024      Q2                  100
41  2024      Q1                  100
40  2023      Q4                  100
39  2023      Q3                  100
38  2023      Q2                  100
37  2023      Q1                  100
36  2022      Q4                  100
35  2022      Q3                  100
34  2022      Q2                  100
33  2022      Q1                  100
32  2021      Q4                  100
31  2021      Q3                  100
30  2021      Q2                  100
29  2021      Q1                  100
28  2020      Q4                  100
27  2020      Q3                  100
26  2020      Q2                  100
25  2020      Q1                  100
24  2019      Q4                  100
23  2019      Q3                  100
22  2019      Q2                  100
21  2019    

Unnamed: 0,date,symbol,currency,close,volume,year,quarter,last_quarter_date,close_eur,market_cap_eur
0,2025-06-30,NVDA,USD,157.99,194580316.0,2025,Q2,True,134.0398,3276066000000.0
1,2025-06-30,MSFT,USD,497.41,28369000.0,2025,Q2,True,422.006,3136770000000.0
2,2025-06-30,AAPL,USD,205.17,91912816.0,2025,Q2,True,174.0676,2609984000000.0
3,2025-06-30,ORCL,USD,218.63,31844231.0,2025,Q2,True,185.4872,520291500000.0
4,2025-06-30,CSCO,USD,69.38,32561232.0,2025,Q2,True,58.8625,233095300000.0
5,2025-06-30,IBM,USD,294.78,3495400.0,2025,Q2,True,250.0933,232086600000.0
6,2025-06-30,CRM,USD,272.69,8541909.0,2025,Q2,True,231.352,224222200000.0
7,2025-06-30,UBER,USD,93.3,16155300.0,2025,Q2,True,79.1563,165631800000.0
8,2025-06-30,TXN,USD,207.62,5319900.0,2025,Q2,True,176.1462,160293000000.0
9,2025-06-30,QCOM,USD,159.26,7988244.0,2025,Q2,True,135.1172,149169400000.0


In [53]:
#df.loc[df.close_eur == df.close_eur.max()]
df.loc[df.symbol == 'FRGT']

Unnamed: 0,date,symbol,currency,close,volume,year,quarter,last_quarter_date,close_eur,volume_eur,close_usd,volume_usd,created_at,market_cap,market_cap_eur,price_to_earnings_ratio_perc,mcap_rank
175543,2018-06-29,FRGT,USD,9029051.04,1.0,2018,Q2,True,7729359.0,1.0,9029051.04,1.0,2025-08-24 12:14:14.056826,3675726000.0,3146622000.0,100,77
175643,2018-06-28,FRGT,USD,8800880.0,1.0,2018,Q2,False,7608479.0,1.0,8800880.0,1.0,2025-08-24 12:14:14.056826,3675726000.0,3146622000.0,100,77
175743,2018-06-27,FRGT,USD,8197856.8,1.0,2018,Q2,False,7095930.0,1.0,8197856.8,1.0,2025-08-24 12:14:14.056826,3675726000.0,3146622000.0,100,77
175843,2018-06-26,FRGT,USD,8931263.36,1.0,2018,Q2,False,7668822.0,1.0,8931263.36,1.0,2025-08-24 12:14:14.056826,3675726000.0,3146622000.0,100,77
175943,2018-06-25,FRGT,USD,8866071.72,1.0,2018,Q2,False,7575897.0,1.0,8866071.72,1.0,2025-08-24 12:14:14.056826,3675726000.0,3146622000.0,100,77
176043,2018-06-22,FRGT,USD,9311548.44,2.0,2018,Q2,False,7989248.0,2.0,9311548.44,2.0,2025-08-24 12:14:14.056826,3675726000.0,3146622000.0,100,77
176143,2018-06-21,FRGT,USD,9094242.76,1.0,2018,Q2,False,7838175.0,1.0,9094242.76,1.0,2025-08-24 12:14:14.056826,3675726000.0,3146622000.0,100,77
176243,2018-06-20,FRGT,USD,8876937.08,1.0,2018,Q2,False,7672441.0,1.0,8876937.08,1.0,2025-08-24 12:14:14.056826,3675726000.0,3146622000.0,100,77
176343,2018-06-19,FRGT,USD,8860639.04,1.0,2018,Q2,False,7646656.0,1.0,8860639.04,1.0,2025-08-24 12:14:14.056826,3675726000.0,3146622000.0,100,77
176443,2018-06-18,FRGT,USD,9197462.92,0.0,2018,Q2,False,0.0,0.0,0.0,0.0,2025-08-24 12:14:14.056826,3675726000.0,3146622000.0,100,77


In [None]:
import pandas as pd

# Step 0: Parse the date column
df["date"] = pd.to_datetime(df["date"])

# Step 1: Get rebalance points (end of previous quarter)
rebalance_df = df[df["last_quarter_date"] == True].copy()

# Ensure the data is sorted
df = df.sort_values(["symbol", "date"]).reset_index(drop=True)

# Step 2: Forward fill close prices per symbol
df["close"] = df.groupby("symbol")["close"].ffill()

# Step 3: Create a dictionary of rebalancing dates with weights
rebalance_weights = {}

for (year, quarter), group in rebalance_df.groupby(["year", "quarter"]):
    date = group["date"].iloc[0]  # rebalance date
    total_mcap = group["market_cap_eur"].sum()
    weights = group.set_index("symbol")["market_cap_eur"] / total_mcap
    rebalance_weights[(year, quarter)] = {
        "date": date,
        "weights": weights.to_dict()
    }

# Step 4: Create daily index values
index_values = []
initial_value = 1000  # Start index value
current_value = initial_value
previous_prices = {}

# Generate list of all dates
all_dates = df["date"].sort_values().unique()

# Create a symbol-price pivot table
price_pivot = df.pivot(index="date", columns="symbol", values="close")

for current_date in all_dates:
    # Find current year, quarter
    y = current_date.year
    m = current_date.month
    q = {1: "Q1", 2: "Q1", 3: "Q1",
         4: "Q2", 5: "Q2", 6: "Q2",
         7: "Q3", 8: "Q3", 9: "Q3",
         10: "Q4", 11: "Q4", 12: "Q4"}[m]

    # Check if we rebalance today
    if (y, q) in rebalance_weights and rebalance_weights[(y, q)]["date"] == current_date:
        # Reset weights based on current prices
        active_weights = rebalance_weights[(y, q)]["weights"]
        previous_prices = price_pivot.loc[current_date, active_weights.keys()].to_dict()
        index_value_today = current_value
    else:
        # Use previous weights
        if not previous_prices:
            index_values.append((current_date, current_value))
            continue  # wait for first rebalance
        returns = {}
        for symbol, prev_price in previous_prices.items():
            try:
                current_price = price_pivot.loc[current_date, symbol]
                if pd.notna(current_price) and prev_price > 0:
                    returns[symbol] = current_price / prev_price
                else:
                    returns[symbol] = 1.0
            except KeyError:
                returns[symbol] = 1.0
        index_return = sum(active_weights[s] * returns.get(s, 1.0) for s in active_weights)
        current_value *= index_return
        # update previous prices
        for s in previous_prices:
            if not pd.isna(price_pivot.loc[current_date, s]):
                previous_prices[s] = price_pivot.loc[current_date, s]
    index_values.append((current_date, current_value))

# Step 5: Convert to DataFrame
index_df = pd.DataFrame(index_values, columns=["date", "index_value"])
index_df = index_df.sort_values("date", ascending=False).reset_index(drop=True)
index_df

Unnamed: 0,date,index_value
0,2025-06-30,6.273446e+16
1,2025-06-27,6.273446e+16
2,2025-06-26,6.258689e+16
3,2025-06-25,6.224908e+16
4,2025-06-24,6.156758e+16
5,2025-06-23,6.076353e+16
6,2025-06-20,6.021855e+16
7,2025-06-18,6.019977e+16
8,2025-06-17,5.999183e+16
9,2025-06-16,6.04788e+16
