In [None]:
import psycopg2
import pandas as pd
from utils.utils import get_postgres_connection


def read_financial_metrics():
    """Load the whole ``raw.financial_metrics`` table into a pandas DataFrame.

    A connection is obtained from ``get_postgres_connection()`` and is always
    closed before this function exits, including when the query raises.

    Returns:
        pd.DataFrame: the result of ``SELECT * FROM raw.financial_metrics``.

    Raises:
        Whatever ``pd.read_sql_query`` raises for a failed query; the error is
        propagated unchanged after the connection has been closed.
    """
    query = "SELECT * FROM raw.financial_metrics"
    conn = get_postgres_connection()
    try:
        return pd.read_sql_query(query, conn)
    finally:
        # Release the connection even if the read fails part-way through;
        # previously an exception here left the connection open.
        conn.close()


# Load the table once into the notebook-level `df`; the later cells
# (`df.columns`, `describe_all_numeric_with_buckets(df, ...)`) read this frame.
df = read_financial_metrics()
print(f"Read {len(df)} records from financial_metrics table")
# Plain-text preview of the first rows.
print(df.head())


Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x1079a5490>>
Traceback (most recent call last):
  File "/Users/aleksamihajlovic/Documents/naro-index-advisor/etl-service/.venv/lib/python3.11/site-packages/ipykernel/ipkernel.py", line 781, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(

KeyboardInterrupt: 


In [32]:
# Inspect the column names of the loaded frame.
df.columns

Index(['symbol', 'date', 'fiscal_year', 'period', 'reported_currency',
       'gross_profit_margin', 'ebit_margin', 'ebitda_margin',
       'operating_profit_margin', 'pretax_profit_margin',
       'continuous_operations_profit_margin', 'net_profit_margin',
       'bottom_line_profit_margin', 'current_ratio', 'quick_ratio',
       'solvency_ratio', 'cash_ratio', 'receivables_turnover',
       'payables_turnover', 'inventory_turnover', 'fixed_asset_turnover',
       'asset_turnover', 'working_capital_turnover_ratio',
       'price_to_earnings_ratio', 'price_to_earnings_growth_ratio',
       'forward_price_to_earnings_growth_ratio', 'price_to_book_ratio',
       'price_to_sales_ratio', 'price_to_free_cash_flow_ratio',
       'price_to_operating_cash_flow_ratio', 'price_to_fair_value',
       'debt_to_assets_ratio', 'debt_to_equity_ratio', 'debt_to_capital_ratio',
       'long_term_debt_to_capital_ratio', 'financial_leverage_ratio',
       'debt_to_market_cap', 'operating_cash_flow_ratio'

In [31]:
import pandas as pd
import numpy as np
from IPython.display import display_html

def _safe_numeric_series(s: pd.Series) -> pd.Series:
    """Keep only finite numeric values."""
    s = pd.to_numeric(s, errors="coerce")
    s = s.replace([np.inf, -np.inf], np.nan).dropna()
    return s

def _summary_with_custom_percentiles(s: pd.Series, percentiles=None) -> pd.DataFrame:
    if percentiles is None:
        # include 5th, 95th and deciles 10..90
        percentiles = [0.001, 0.01,0.05] + [i/100 for i in range(10, 100, 10)] + [0.95, 0.99, 0.999]
    desc = s.describe(percentiles=percentiles)
    out = desc.reset_index()
    out.columns = ["Statistic", "Value"]
    return out

def _bucket_counts_qcut(s: pd.Series, q=10) -> pd.DataFrame:
    """Counts per percentile bucket, safe against duplicates and constants."""
    n = len(s)
    if n == 0:
        return pd.DataFrame({"Percentile Bucket": [], "Count": []})
    if s.nunique(dropna=True) < 2:
        return pd.DataFrame({"Percentile Bucket": ["All values equal"], "Count": [n]})

    try:
        buckets = pd.qcut(s, q=q, labels=[f"{i*100//q}-{(i+1)*100//q}%" for i in range(q)], duplicates="drop")
        counts = buckets.value_counts().sort_index().reset_index()
        counts.columns = ["Percentile Bucket", "Count"]
        return counts
    except Exception:
        qs = np.linspace(0, 1, q + 1)
        edges = np.unique(s.quantile(qs).values)
        if len(edges) < 2:
            return pd.DataFrame({"Percentile Bucket": ["All values equal"], "Count": [n]})
        labels = [f"{int(qs[i]*100)}-{int(qs[i+1]*100)}%" for i in range(len(edges)-1)]
        buckets = pd.cut(s, bins=edges, include_lowest=True, labels=labels, duplicates="drop")
        counts = buckets.value_counts().sort_index().reset_index()
        counts.columns = ["Percentile Bucket", "Count"]
        return counts

def _display_side_by_side(dfs: list, titles: list):
    html = ""
    for df, title in zip(dfs, titles):
        html += (
            "<div style='display:inline-block; padding-right:30px; vertical-align:top;'>"
            f"<h3 style='margin:4px 0 8px 0;'>{title}</h3>"
            f"{df.to_html(index=False)}"
            "</div>"
        )
    display_html(html, raw=True)

def describe_all_numeric_with_buckets(df: pd.DataFrame, q=10, percentiles=None, max_cols=None):
    """
    Profile every numeric column of ``df``, one column at a time.

    For each numeric column (optionally only the first ``max_cols`` of them):
      - the values are reduced to finite numbers via ``_safe_numeric_series``;
        a column with nothing left is reported as skipped
      - a summary table is built by ``_summary_with_custom_percentiles``
        (``percentiles`` is passed through; ``None`` selects that helper's
        defaults)
      - per-bucket counts are built by ``_bucket_counts_qcut`` with ``q``
        buckets
      - both tables are shown side by side under a header line that carries
        the column name and the number of values used

    Prints a notice and returns early when ``df`` has no numeric columns.
    Returns ``None``.
    """
    numeric_columns = df.select_dtypes(include=[np.number]).columns.tolist()
    if max_cols is not None:
        numeric_columns = numeric_columns[:max_cols]

    if len(numeric_columns) == 0:
        print("No numeric columns found.")
        return

    table_titles = ["Summary Statistics", f"Counts per Percentile Bucket ({q} buckets)"]

    for name in numeric_columns:
        values = _safe_numeric_series(df[name])
        if values.empty:
            print(f"ðŸ“­ Column: {name} â€” skipped (no finite numeric values).")
            continue

        summary_table = _summary_with_custom_percentiles(values, percentiles=percentiles)
        bucket_table = _bucket_counts_qcut(values, q=q)

        print(f"ðŸ“Š Column: {name} (n={len(values)})")
        _display_side_by_side([summary_table, bucket_table], list(table_titles))
        print()  # blank line between columns


# ---- Run it ----
# Profile every numeric column of `df`. With no `percentiles` argument the
# summary shows 0.1%, 1%, 5%, the deciles 10..90%, 95%, 99% and 99.9%;
# q=10 requests decile bucket counts.
describe_all_numeric_with_buckets(df, q=10)


ðŸ“Š Column: gross_profit_margin (n=1557417)


Statistic,Value
count,1557417.0
mean,-2436.884422
std,2346429.840695
min,-2882786194.286195
0.1%,-138.667394
1%,-1.857993
5%,0.0
10%,0.0
20%,0.077175
30%,0.157231

Percentile Bucket,Count
0-10%,237794
10-20%,73692
20-30%,155742
30-40%,155740
40-50%,155743
50-60%,155740
60-70%,155741
70-80%,155741
80-90%,155743
90-100%,155741



ðŸ“Š Column: ebit_margin (n=1557413)


Statistic,Value
count,1557413.0
mean,-5928.403458
std,4323168.063856
min,-3882786195.286195
0.1%,-4999.881298
1%,-56.557902
5%,-1.692778
10%,-0.313041
20%,-0.018701
30%,0.0

Percentile Bucket,Count
0-10%,155742
10-20%,155741
20-30%,234193
30-40%,77291
40-50%,155745
50-60%,155740
60-70%,155741
70-80%,155737
80-90%,155741
90-100%,155742



ðŸ“Š Column: ebitda_margin (n=1557391)


Statistic,Value
count,1557391.0
mean,-10296.144125
std,5284908.535616
min,-3831439393.939394
0.1%,-4030.853036
1%,-52.807521
5%,-1.531197
10%,-0.236546
20%,0.0
30%,0.0

Percentile Bucket,Count
0-10%,155740
10-20%,321318
20-30%,145900
30-40%,155738
40-50%,155741
50-60%,155739
60-70%,155737
70-80%,155739
80-90%,155739



ðŸ“Š Column: operating_profit_margin (n=1557411)


Statistic,Value
count,1557411.0
mean,-8668.794156
std,5157820.355513
min,-3882786195.286195
0.1%,-3294.59811
1%,-57.290839
5%,-1.817782
10%,-0.342389
20%,-0.025445
30%,0.0

Percentile Bucket,Count
0-10%,155742
10-20%,155741
20-30%,218658
30-40%,92824
40-50%,155742
50-60%,155741
60-70%,155740
70-80%,155742
80-90%,155740
90-100%,155741



ðŸ“Š Column: pretax_profit_margin (n=1557219)


Statistic,Value
count,1557219.0
mean,-6904.814862
std,4552043.429496
min,-3855218855.218855
0.1%,-3887.460469
1%,-63.73663
5%,-2.06639
10%,-0.412338
20%,-0.04214
30%,0.0

Percentile Bucket,Count
0-10%,155722
10-20%,155722
20-30%,246033
30-40%,65418
40-50%,155716
50-60%,155722
60-70%,155721
70-80%,155723
80-90%,155720
90-100%,155722



ðŸ“Š Column: continuous_operations_profit_margin (n=1557409)


Statistic,Value
count,1557409.0
mean,-7271.985768
std,4571794.229289
min,-3855218855.218855
0.1%,-5285.960918
1%,-64.222625
5%,-1.978887
10%,-0.392443
20%,-0.040382
30%,0.0

Percentile Bucket,Count
0-10%,155741
10-20%,155741
20-30%,260880
30-40%,50606
40-50%,155738
50-60%,155742
60-70%,155741
70-80%,155740
80-90%,155741
90-100%,155739



ðŸ“Š Column: net_profit_margin (n=1557412)


Statistic,Value
count,1557412.0
mean,-9281.732537
std,5297490.20071
min,-3855218855.218855
0.1%,-3782.70065
1%,-62.981909
5%,-2.005582
10%,-0.39534
20%,-0.041467
30%,0.0

Percentile Bucket,Count
0-10%,155742
10-20%,155741
20-30%,254645
30-40%,56842
40-50%,155739
50-60%,155743
60-70%,155737
70-80%,155740
80-90%,155741
90-100%,155742



ðŸ“Š Column: bottom_line_profit_margin (n=1557412)


Statistic,Value
count,1557412.0
mean,-7390.243702
std,4571690.829176
min,-3855218855.218855
0.1%,-5558.852052
1%,-65.867284
5%,-2.010701
10%,-0.397354
20%,-0.042078
30%,0.0

Percentile Bucket,Count
0-10%,155742
10-20%,155742
20-30%,256904
30-40%,54579
40-50%,155745
50-60%,155735
60-70%,155743
70-80%,155741
80-90%,155742
90-100%,155739



ðŸ“Š Column: current_ratio (n=1557409)


Statistic,Value
count,1557409.0
mean,407.246766
std,211227.126547
min,-19393.524
0.1%,0.0
1%,0.0
5%,0.0
10%,0.0
20%,0.489735
30%,0.955807

Percentile Bucket,Count
0-10%,199738
10-20%,111744
20-30%,155741
30-40%,155741
40-50%,155741
50-60%,155740
60-70%,155741
70-80%,155741
80-90%,155741
90-100%,155741



ðŸ“Š Column: quick_ratio (n=1557396)


Statistic,Value
count,1557396.0
mean,491.074381
std,222277.771716
min,-191420.868588
0.1%,-1.596835
1%,0.0
5%,0.0
10%,0.0
20%,0.321628
30%,0.6403

Percentile Bucket,Count
0-10%,202743
10-20%,108737
20-30%,155739
30-40%,155740
40-50%,155740
50-60%,155739
60-70%,155739
70-80%,155741
80-90%,155738
90-100%,155740



ðŸ“Š Column: solvency_ratio (n=1536694)


Statistic,Value
count,1536694.0
mean,-2421.908928
std,2444150.828611
min,-2611722988.239018
0.1%,-82.79398
1%,-3.98901
5%,-0.668445
10%,-0.167825
20%,-0.014294
30%,0.0

Percentile Bucket,Count
0-10%,153670
10-20%,153669
20-30%,251449
30-40%,55905
40-50%,153660
50-60%,153665
60-70%,153673
70-80%,153665
80-90%,153668
90-100%,153670



ðŸ“Š Column: cash_ratio (n=1557410)


Statistic,Value
count,1557410.0
mean,-3738.275906
std,4716212.050464
min,-5885647325.69374
0.1%,-0.156728
1%,0.0
5%,0.0
10%,0.0
20%,0.029929
30%,0.104227

Percentile Bucket,Count
0-10%,207256
10-20%,104230
20-30%,155739
30-40%,155740
40-50%,155740
50-60%,155741
60-70%,155741
70-80%,155741
80-90%,155741
90-100%,155741



ðŸ“Š Column: receivables_turnover (n=1545739)


Statistic,Value
count,1545739.0
mean,428000.986269
std,44414139.325541
min,-9241483286.0
0.1%,-78.537707
1%,0.0
5%,0.0
10%,0.0
20%,0.0
30%,0.0

Percentile Bucket,Count
0-10%,516992
10-20%,101304
20-30%,154574
30-40%,154573
40-50%,154574
50-60%,154574
60-70%,154574
70-80%,154574



ðŸ“Š Column: payables_turnover (n=1539388)


Statistic,Value
count,1539388.0
mean,7920.778435
std,7755296.301785
min,-2818284.790878
0.1%,-2.986233
1%,0.0
5%,0.0
10%,0.0
20%,0.0
30%,0.052212

Percentile Bucket,Count
0-10%,426880
10-20%,34937
20-30%,153938
30-40%,153939
40-50%,153939
50-60%,153939
60-70%,153939
70-80%,153938
80-90%,153939



ðŸ“Š Column: inventory_turnover (n=1543884)


Statistic,Value
count,1543884.0
mean,22629.497711
std,16207145.804561
min,-9150685977.2296
0.1%,-245.389194
1%,-0.902299
5%,0.0
10%,0.0
20%,0.0
30%,0.0

Percentile Bucket,Count
0-10%,575648
10-20%,41907
20-30%,154387
30-40%,154388
40-50%,154389
50-60%,154388
60-70%,154388
70-80%,154389



ðŸ“Š Column: fixed_asset_turnover (n=1539764)


Statistic,Value
count,1539764.0
mean,411.336613
std,449570.348079
min,-4059429.319372
0.1%,-11.268483
1%,0.0
5%,0.0
10%,0.0
20%,0.0
30%,0.152822

Percentile Bucket,Count
0-10%,334642
10-20%,127287
20-30%,153978
30-40%,153975
40-50%,153977
50-60%,153976
60-70%,153976
70-80%,153976
80-90%,153977



ðŸ“Š Column: asset_turnover (n=1536580)


Statistic,Value
count,1536580.0
mean,1938.468667
std,2323557.986223
min,-372313.756249
0.1%,-0.160125
1%,0.0
5%,0.0
10%,0.0
20%,0.000794
30%,0.026157

Percentile Bucket,Count
0-10%,294984
10-20%,12337
20-30%,153654
30-40%,153657
40-50%,153664
50-60%,153653
60-70%,153657
70-80%,153659
80-90%,153657
90-100%,153658



ðŸ“Š Column: working_capital_turnover_ratio (n=1552265)


Statistic,Value
count,1552265.0
mean,-6725.560013
std,6068565.527972
min,-5344170720.0
0.1%,-258.832961
1%,-22.050811
5%,-3.042164
10%,-0.665053
20%,0.0
30%,0.0

Percentile Bucket,Count
0-10%,155227
10-20%,395372
20-30%,70307
30-40%,155227
40-50%,155226
50-60%,155227
60-70%,155226
70-80%,155226
80-90%,155227



ðŸ“Š Column: price_to_earnings_ratio (n=1552894)


Statistic,Value
count,1552894.0
mean,-26376.799584
std,15384662.483807
min,-6464285694.486983
0.1%,-54974.615838
1%,-930.077367
5%,-73.301127
10%,-24.445054
20%,-5.715671
30%,-0.697282

Percentile Bucket,Count
0-10%,155290
10-20%,155289
20-30%,155289
30-40%,155290
40-50%,155289
50-60%,155289
60-70%,155290
70-80%,155289
80-90%,155289
90-100%,155290



ðŸ“Š Column: price_to_earnings_growth_ratio (n=1555539)


Statistic,Value
count,1555539.0
mean,-2565.291191
std,16390992.955534
min,-7351238930.523371
0.1%,-400.035871
1%,-17.238243
5%,-3.106151
10%,-1.398849
20%,-0.496535
30%,-0.133593

Percentile Bucket,Count
0-10%,155555
10-20%,155553
20-30%,155554
30-40%,155554
40-50%,218169
50-60%,92938
60-70%,155556
70-80%,155552
80-90%,155554
90-100%,155554



ðŸ“Š Column: forward_price_to_earnings_growth_ratio (n=1555539)


Statistic,Value
count,1555539.0
mean,-2565.291191
std,16390992.955534
min,-7351238930.523371
0.1%,-400.035871
1%,-17.238243
5%,-3.106151
10%,-1.398849
20%,-0.496535
30%,-0.133593

Percentile Bucket,Count
0-10%,155555
10-20%,155553
20-30%,155554
30-40%,155554
40-50%,218169
50-60%,92938
60-70%,155556
70-80%,155552
80-90%,155554
90-100%,155554



ðŸ“Š Column: price_to_book_ratio (n=1555619)


Statistic,Value
count,1555619.0
mean,10361.601746
std,20984488.813528
min,-9839763654.019672
0.1%,-3503.140808
1%,-21.805382
5%,0.0
10%,0.0
20%,0.41819
30%,0.734172

Percentile Bucket,Count
0-10%,201218
10-20%,109906
20-30%,155562
30-40%,155562
40-50%,155562
50-60%,155561
60-70%,155562
70-80%,155562
80-90%,155562
90-100%,155562



ðŸ“Š Column: price_to_sales_ratio (n=1554527)


Statistic,Value
count,1554527.0
mean,124295.215564
std,26924389.640828
min,-7579577800.000001
0.1%,-1372.333932
1%,0.0
5%,0.0
10%,0.0
20%,0.87125
30%,1.779552

Percentile Bucket,Count
0-10%,178994
10-20%,131912
20-30%,155452
30-40%,155453
40-50%,155453
50-60%,155452
60-70%,155453
70-80%,155452
80-90%,155453
90-100%,155453



ðŸ“Š Column: price_to_free_cash_flow_ratio (n=1553382)


Statistic,Value
count,1553382.0
mean,-168229.868824
std,29469952.849808
min,-9907488694.278948
0.1%,-385939.04365
1%,-5714.587558
5%,-368.450987
10%,-127.226823
20%,-37.415513
30%,-12.39752

Percentile Bucket,Count
0-10%,155339
10-20%,155338
20-30%,155340
30-40%,441383
40-50%,24629
50-60%,155338
60-70%,155338
70-80%,155338
80-90%,155339



ðŸ“Š Column: price_to_operating_cash_flow_ratio (n=1555617)


Statistic,Value
count,1555617.0
mean,-1768474.218541
std,105939362.630315
min,-9973636422.396872
0.1%,-785919.868483
1%,-4488.319858
5%,-265.243369
10%,-85.678961
20%,-18.463469
30%,0.0

Percentile Bucket,Count
0-10%,155562
10-20%,155562
20-30%,463324
30-40%,3361
40-50%,155561
50-60%,155562
60-70%,155561
70-80%,155562
80-90%,155562



ðŸ“Š Column: price_to_fair_value (n=1555619)


Statistic,Value
count,1555619.0
mean,10361.601746
std,20984488.813528
min,-9839763654.019672
0.1%,-3503.140808
1%,-21.805382
5%,0.0
10%,0.0
20%,0.41819
30%,0.734172

Percentile Bucket,Count
0-10%,201218
10-20%,109906
20-30%,155562
30-40%,155562
40-50%,155562
50-60%,155561
60-70%,155562
70-80%,155562
80-90%,155562
90-100%,155562



ðŸ“Š Column: debt_to_assets_ratio (n=1557376)


Statistic,Value
count,1557376.0
mean,111.889435
std,77243.558352
min,-52757.0
0.1%,0.0
1%,0.0
5%,0.0
10%,0.0
20%,0.0
30%,0.009672

Percentile Bucket,Count
0-10%,374648
10-20%,92566
20-30%,155742
30-40%,155733
40-50%,155737
50-60%,155737
60-70%,155738
70-80%,155739
80-90%,155736



ðŸ“Š Column: debt_to_equity_ratio (n=1557374)


Statistic,Value
count,1557374.0
mean,792.920316
std,364080.934283
min,-31736599.382319
0.1%,-56.469075
1%,-3.53577
5%,0.0
10%,0.0
20%,0.0
30%,0.003028

Percentile Bucket,Count
0-10%,430493
10-20%,36720
20-30%,155739
30-40%,155736
40-50%,155736
50-60%,155738
60-70%,155737
70-80%,155737
80-90%,155738



ðŸ“Š Column: debt_to_capital_ratio (n=1557426)


Statistic,Value
count,1557426.0
mean,-1.103048
std,1787.800887
min,-2230404.412026
0.1%,-12.058776
1%,-0.199465
5%,0.0
10%,0.0
20%,0.0
30%,0.00921

Percentile Bucket,Count
0-10%,395708
10-20%,71522
20-30%,155741
30-40%,155742
40-50%,155743
50-60%,155742
60-70%,155743
70-80%,155743
80-90%,155742



ðŸ“Š Column: long_term_debt_to_capital_ratio (n=1556919)


Statistic,Value
count,1556919.0
mean,-0.512221
std,851.174981
min,-1061901.785608
0.1%,-5.826938
1%,-0.028738
5%,0.0
10%,0.0
20%,0.0
30%,0.0

Percentile Bucket,Count
0-10%,675653
10-20%,102807
20-30%,155692
30-40%,155691
40-50%,155693
50-60%,155692
60-70%,155691



ðŸ“Š Column: financial_leverage_ratio (n=1557423)


Statistic,Value
count,1557423.0
mean,609.672402
std,383693.003135
min,-95830342.01243
0.1%,-109.438023
1%,-7.402523
5%,0.0
10%,0.0
20%,1.063889
30%,1.245732

Percentile Bucket,Count
0-10%,240458
10-20%,71028
20-30%,155741
30-40%,155745
40-50%,155740
50-60%,155742
60-70%,155742
70-80%,155742
80-90%,155742
90-100%,155743



ðŸ“Š Column: debt_to_market_cap (n=1472825)


Statistic,Value
count,1472825.0
mean,34278.480661
std,12426544.738351
min,-214.950613
0.1%,-0.065216
1%,0.0
5%,0.0
10%,0.0
20%,0.0
30%,0.002219

Percentile Bucket,Count
0-10%,393302
10-20%,48547
20-30%,147282
30-40%,147283
40-50%,147281
50-60%,147282
60-70%,147283
70-80%,147282
80-90%,147283



ðŸ“Š Column: operating_cash_flow_ratio (n=1509451)


Statistic,Value
count,1509451.0
mean,-18376.547128
std,12594263.323198
min,-9897015430.037952
0.1%,-87.087298
1%,-4.811638
5%,-0.99761
10%,-0.302899
20%,-0.047438
30%,0.0

Percentile Bucket,Count
0-10%,150946
10-20%,150946
20-30%,485444
30-40%,118336
40-50%,150945
50-60%,150944
60-70%,150945
70-80%,150945



ðŸ“Š Column: operating_cash_flow_sales_ratio (n=1512320)


Statistic,Value
count,1512320.0
mean,-8508.444915
std,4763115.626
min,-3788720538.720539
0.1%,-1894.212471
1%,-34.641156
5%,-1.455534
10%,-0.319318
20%,-0.034258
30%,0.0

Percentile Bucket,Count
0-10%,151232
10-20%,151232
20-30%,434726
30-40%,18971
40-50%,151237
50-60%,151226
60-70%,151232
70-80%,151232
80-90%,151232



ðŸ“Š Column: free_cash_flow_operating_cash_flow_ratio (n=1557414)


Statistic,Value
count,1557414.0
mean,174321.624611
std,21468352.221288
min,-58156962.966828
0.1%,-97.685401
1%,-7.995666
5%,-0.825416
10%,0.0
20%,0.0
30%,0.114537

Percentile Bucket,Count
0-10%,449245
10-20%,17979
20-30%,155742
30-40%,155741
40-50%,155742
50-60%,252098
60-70%,59385
70-80%,155740
80-90%,155742



ðŸ“Š Column: debt_service_coverage_ratio (n=1557376)


Statistic,Value
count,1557376.0
mean,16274.146288
std,6430266.096172
min,-2051242704.0
0.1%,-11623.852273
1%,-370.66315
5%,-6.964301
10%,-0.633213
20%,0.0
30%,0.0

Percentile Bucket,Count
0-10%,155738
10-20%,473607
20-30%,149343
30-40%,155740
40-50%,155735
50-60%,155738
60-70%,155737
70-80%,155738



ðŸ“Š Column: interest_coverage_ratio (n=1557378)


Statistic,Value
count,1557378.0
mean,24094.248723
std,8876739.123467
min,-2051242704.0
0.1%,-21944.864648
1%,-904.625332
5%,-44.6
10%,-7.973922
20%,0.0
30%,0.0

Percentile Bucket,Count
0-10%,155738
10-20%,613910
20-30%,9041
30-40%,155738
40-50%,155737
50-60%,155738
60-70%,155738
70-80%,155738



ðŸ“Š Column: short_term_operating_cash_flow_coverage_ratio (n=1525311)


Statistic,Value
count,1525311.0
mean,782.658336
std,530210.651625
min,-64944160.069996
0.1%,-479.300153
1%,-21.762231
5%,-1.099873
10%,-0.206332
20%,0.0
30%,0.0

Percentile Bucket,Count
0-10%,152532
10-20%,826943
20-30%,88244
30-40%,152530
40-50%,152531
50-60%,152531



ðŸ“Š Column: operating_cash_flow_coverage_ratio (n=1518142)


Statistic,Value
count,1518142.0
mean,-232.056736
std,438605.338569
min,-414207772.066667
0.1%,-365.642285
1%,-15.104814
5%,-0.848356
10%,-0.192299
20%,-0.009807
30%,0.0

Percentile Bucket,Count
0-10%,151815
10-20%,151816
20-30%,549186
30-40%,58069
40-50%,151814
50-60%,151814
60-70%,151813
70-80%,151815



ðŸ“Š Column: capital_expenditure_coverage_ratio (n=1557398)


Statistic,Value
count,1557398.0
mean,44152.719467
std,19361682.514577
min,-5192028736.0
0.1%,-52746.034
1%,-668.977684
5%,-44.242362
10%,-8.661785
20%,-0.587286
30%,0.0

Percentile Bucket,Count
0-10%,155740
10-20%,155740
20-30%,534140
30-40%,88822
40-50%,155736
50-60%,155740
60-70%,155740
70-80%,155740



ðŸ“Š Column: dividend_paid_and_capex_coverage_ratio (n=1557391)


Statistic,Value
count,1557391.0
mean,-6265.683065
std,10561337.79061
min,-5192028736.0
0.1%,-44959.382333
1%,-520.185027
5%,-30.266233
10%,-6.034483
20%,-0.504074
30%,0.0

Percentile Bucket,Count
0-10%,155740
10-20%,155739
20-30%,501199
30-40%,121757
40-50%,155739
50-60%,155739
60-70%,155739
70-80%,155739



ðŸ“Š Column: dividend_payout_ratio (n=1515394)


Statistic,Value
count,1515394.0
mean,1822.266619
std,411442.529527
min,-125293354.134897
0.1%,-53.148624
1%,-3.024987
5%,-0.054491
10%,0.0
20%,0.0
30%,0.0

Percentile Bucket,Count
0-10%,998794
10-20%,61983
20-30%,151540
30-40%,151537
40-50%,151540



ðŸ“Š Column: dividend_yield (n=1474920)


Statistic,Value
count,1474920.0
mean,941.297465
std,600937.23488
min,0.0
0.1%,0.0
1%,0.0
5%,0.0
10%,0.0
20%,0.0
30%,0.0

Percentile Bucket,Count
0-10%,885229
10-20%,147240
20-30%,147482
30-40%,147482
40-50%,147487



ðŸ“Š Column: dividend_yield_percentage (n=1475311)


Statistic,Value
count,1475311.0
mean,16089.802785
std,6791050.292702
min,0.0
0.1%,0.0
1%,0.0
5%,0.0
10%,0.0
20%,0.0
30%,0.0

Percentile Bucket,Count
0-10%,885188
10-20%,147529
20-30%,147532
30-40%,147531
40-50%,147531



ðŸ“Š Column: dividend_per_share (n=1514106)


Statistic,Value
count,1514106.0
mean,42747.903567
std,7276882.377902
min,0.0
0.1%,0.0
1%,0.0
5%,0.0
10%,0.0
20%,0.0
30%,0.0

Percentile Bucket,Count
0-10%,908640
10-20%,151234
20-30%,151412
30-40%,151409
40-50%,151411



ðŸ“Š Column: revenue_per_share (n=1557283)


Statistic,Value
count,1557283.0
mean,392003.013085
std,49988398.648845
min,-578822.993718
0.1%,-3.098752
1%,0.0
5%,0.0
10%,0.0
20%,0.073267
30%,0.370392

Percentile Bucket,Count
0-10%,179733
10-20%,131725
20-30%,155727
30-40%,155728
40-50%,155729
50-60%,155728
60-70%,155728
70-80%,155728
80-90%,155728
90-100%,155729



ðŸ“Š Column: net_income_per_share (n=1557382)


Statistic,Value
count,1557382.0
mean,24098.101021
std,16077436.051355
min,-9044715447.15447
0.1%,-6870.038409
1%,-114.972915
5%,-3.120002
10%,-0.542372
20%,-0.058502
30%,-0.002843

Percentile Bucket,Count
0-10%,155739
10-20%,155738
20-30%,155744
30-40%,155732
40-50%,155738
50-60%,155738
60-70%,155738
70-80%,155738
80-90%,155738
90-100%,155739



ðŸ“Š Column: interest_debt_per_share (n=1469882)


Statistic,Value
count,1469882.0
mean,209817.225745
std,22778493.560739
min,-7134899.5
0.1%,-0.061223
1%,0.0
5%,0.0
10%,0.0
20%,0.008378
30%,0.13413

Percentile Bucket,Count
0-10%,193926
10-20%,100052
20-30%,146987
30-40%,146988
40-50%,146988
50-60%,146988
60-70%,146988
70-80%,146988
80-90%,146988
90-100%,146989



ðŸ“Š Column: cash_per_share (n=1469986)


Statistic,Value
count,1469986.0
mean,588458.878105
std,55433128.520655
min,-4133412.890874
0.1%,-196.845413
1%,0.0
5%,0.0
10%,0.007005
20%,0.088488
30%,0.318751

Percentile Bucket,Count
0-10%,146999
10-20%,146999
20-30%,146998
30-40%,146999
40-50%,146998
50-60%,146999
60-70%,146998
70-80%,146999
80-90%,146998
90-100%,146999



ðŸ“Š Column: book_value_per_share (n=1469918)


Statistic,Value
count,1469918.0
mean,645961.420894
std,66311502.011721
min,-9930065359.477123
0.1%,-995.046339
1%,-13.453957
5%,0.0
10%,0.0
20%,0.355538
30%,1.58794

Percentile Bucket,Count
0-10%,161021
10-20%,132963
20-30%,146992
30-40%,146991
40-50%,146992
50-60%,146992
60-70%,146991
70-80%,146992
80-90%,146992
90-100%,146992



ðŸ“Š Column: tangible_book_value_per_share (n=1469924)


Statistic,Value
count,1469924.0
mean,614760.546375
std,61067638.374555
min,-9991503267.973856
0.1%,-2942.212428
1%,-50.452098
5%,-2.62925
10%,-0.004611
20%,0.043925
30%,0.698322

Percentile Bucket,Count
0-10%,146993
10-20%,146993
20-30%,146991
30-40%,146993
40-50%,146992
50-60%,146992
60-70%,146993
70-80%,146992
80-90%,146992
90-100%,146993



ðŸ“Š Column: shareholders_equity_per_share (n=1469919)


Statistic,Value
count,1469919.0
mean,638139.065962
std,66317584.586469
min,-9950326797.38562
0.1%,-1058.068219
1%,-13.092972
5%,0.0
10%,0.028145
20%,0.625983
30%,1.93729

Percentile Bucket,Count
0-10%,146992
10-20%,146992
20-30%,146992
30-40%,146992
40-50%,146992
50-60%,146991
60-70%,146992
70-80%,146992
80-90%,146992
90-100%,146992



ðŸ“Š Column: operating_cash_flow_per_share (n=1505052)


Statistic,Value
count,1505052.0
mean,-23885.135448
std,13256446.338708
min,-9309350916.687704
0.1%,-8448.278177
1%,-150.87821
5%,-1.886406
10%,-0.391108
20%,-0.034227
30%,0.0

Percentile Bucket,Count
0-10%,150506
10-20%,150505
20-30%,422448
30-40%,29071
40-50%,150502
50-60%,150504
60-70%,150506
70-80%,150504
80-90%,150506



ðŸ“Š Column: capex_per_share (n=1505111)


Statistic,Value
count,1505111.0
mean,8220.232517
std,2412844.897794
min,-0.027105
0.1%,0.0
1%,0.0
5%,0.0
10%,0.0
20%,0.0
30%,0.000143

Percentile Bucket,Count
0-10%,415969
10-20%,35689
20-30%,150389
30-40%,150510
40-50%,150511
50-60%,150510
60-70%,150511
70-80%,150511
80-90%,150511



ðŸ“Š Column: free_cash_flow_per_share (n=1505139)


Statistic,Value
count,1505139.0
mean,-31305.760743
std,14677727.368329
min,-9711885152.998062
0.1%,-15220.616783
1%,-399.624834
5%,-4.510655
10%,-0.934656
20%,-0.169993
30%,-0.027006

Percentile Bucket,Count
0-10%,150514
10-20%,150514
20-30%,150515
30-40%,150514
40-50%,256818
50-60%,44209
60-70%,150514
70-80%,150513
80-90%,150514
90-100%,150514



ðŸ“Š Column: net_income_per_ebt (n=1556843)


Statistic,Value
count,1556843.0
mean,791979.517648
std,102314544.324972
min,-9882282380.95238
0.1%,-45.786556
1%,-1.688214
5%,0.0
10%,0.458777
20%,0.64954
30%,0.715553

Percentile Bucket,Count
0-10%,155685
10-20%,155685
20-30%,155686
30-40%,155685
40-50%,155683
50-60%,155682
60-70%,155684
70-80%,238045
80-90%,73324
90-100%,155684



ðŸ“Š Column: ebt_per_ebit (n=1557404)


Statistic,Value
count,1557404.0
mean,2518.682675
std,5791877.164853
min,-3448599150.0
0.1%,-128.162378
1%,-9.52264
5%,-0.679332
10%,0.0
20%,0.589197
30%,0.828136

Percentile Bucket,Count
0-10%,169893
10-20%,141588
20-30%,155740
30-40%,155743
40-50%,155742
50-60%,172684
60-70%,138795
70-80%,155738
80-90%,155740
90-100%,155741



ðŸ“Š Column: effective_tax_rate (n=1556820)


Statistic,Value
count,1556820.0
mean,-1584.142644
std,1815550.187148
min,-2263290476.190476
0.1%,-26.008292
1%,-2.076302
5%,-0.266929
10%,-0.049229
20%,0.0
30%,0.0

Percentile Bucket,Count
0-10%,155684
10-20%,336816
20-30%,130228
30-40%,155682
40-50%,155687
50-60%,155679
60-70%,155681
70-80%,155681
80-90%,155682



ðŸ“Š Column: enterprise_value_multiple (n=1471242)


Statistic,Value
count,1471242.0
mean,-37826.837472
std,27179300.354871
min,-9700800029.078014
0.1%,-225953.778409
1%,-3003.765372
5%,-235.870537
10%,-70.02177
20%,-10.006357
30%,0.068124

Percentile Bucket,Count
0-10%,147125
10-20%,147124
20-30%,147124
30-40%,147124
40-50%,147124
50-60%,147124
60-70%,147124
70-80%,147124
80-90%,147124
90-100%,147125





#### PIPELINE
#### PIPELINE
#### PIPELINE
#### PIPELINE

âœ… Table clean.financial_metrics_perc recreated and populated.


In [51]:
import polars as pl
import numpy as np
import io
import pandas as pd
import psycopg2
from sqlalchemy import create_engine, text
from sqlalchemy.engine import Engine
from utils.utils import get_database_url, get_postgres_connection

# --------------------------
# Config & Setup
# --------------------------
# Connection URL supplied by the project helper; used below for both the
# SQLAlchemy engine and the raw psycopg2 connection.
DATABASE_URL = get_database_url()
# Fully qualified name of the output table this pipeline builds.
target_table = "clean.financial_metrics_perc"
# Schema name for staging tables — presumably used by a later step of the
# pipeline that is not visible in this section; TODO confirm.
staging_schema = "stage"

# Metric columns of raw.financial_metrics to process, in processing order.
# Each one gets a companion "<metric>_perc" column in the loop further down.
# NOTE(review): in the profiling output earlier in this notebook,
# price_to_earnings_growth_ratio / forward_price_to_earnings_growth_ratio and
# price_to_book_ratio / price_to_fair_value show identical summary statistics
# and bucket counts — they may be duplicate columns; confirm before treating
# them as independent metrics.
metrics = [
    "gross_profit_margin",
    "ebit_margin",
    "ebitda_margin",
    "operating_profit_margin",
    "pretax_profit_margin",
    "continuous_operations_profit_margin",
    "net_profit_margin",
    "bottom_line_profit_margin",
    "current_ratio",
    "quick_ratio",
    "solvency_ratio",
    "cash_ratio",
    "receivables_turnover",
    "payables_turnover",
    "inventory_turnover",
    "fixed_asset_turnover",
    "asset_turnover",
    "working_capital_turnover_ratio",
    "price_to_earnings_ratio",
    "price_to_earnings_growth_ratio",
    "forward_price_to_earnings_growth_ratio",
    "price_to_book_ratio",
    "price_to_sales_ratio",
    "price_to_free_cash_flow_ratio",
    "price_to_operating_cash_flow_ratio",
    "price_to_fair_value",
    "debt_to_assets_ratio",
    "debt_to_equity_ratio",
    "debt_to_capital_ratio",
    "long_term_debt_to_capital_ratio",
    "financial_leverage_ratio",
    "debt_to_market_cap",
    "operating_cash_flow_ratio",
    "operating_cash_flow_sales_ratio",
    "free_cash_flow_operating_cash_flow_ratio",
    "debt_service_coverage_ratio",
    "interest_coverage_ratio",
    "short_term_operating_cash_flow_coverage_ratio",
    "operating_cash_flow_coverage_ratio",
    "capital_expenditure_coverage_ratio",
    "dividend_paid_and_capex_coverage_ratio",
    "dividend_payout_ratio",
    "dividend_yield",
    "dividend_yield_percentage",
    "dividend_per_share",
    "revenue_per_share",
    "net_income_per_share",
    "interest_debt_per_share",
    "cash_per_share",
    "book_value_per_share",
    "tangible_book_value_per_share",
    "shareholders_equity_per_share",
    "operating_cash_flow_per_share",
    "capex_per_share",
    "free_cash_flow_per_share",
    "net_income_per_ebt",
    "ebt_per_ebit",
    "effective_tax_rate",
    "enterprise_value_multiple"

]

# Identity columns: copied as-is from raw.financial_metrics into the target
# table and selected alongside each metric during processing.
identity_columns = [
    'symbol',
    'date',
    'fiscal_year',
    'period',
    'reported_currency'
]

# --------------------------
# 1. Create/Populate Target Table
# --------------------------
# Rebuild the target table from scratch inside one transaction
# (`engine.begin()` commits on success and rolls back on error):
#   1. drop any previous version,
#   2. create an empty table whose columns are the identity columns of
#      raw.financial_metrics (CREATE TABLE AS ... WHERE FALSE copies the
#      column definitions without copying rows),
#   3. insert the identity columns of every raw row.
# The table name comes from `target_table` so the SQL and the log messages
# cannot drift away from the configured name.
print(f"ðŸ“¦ Recreating {target_table}...")
engine: Engine = create_engine(DATABASE_URL)
identity_column_sql = ', '.join(identity_columns)
with engine.begin() as conn:
    conn.execute(text(f"DROP TABLE IF EXISTS {target_table}"))
    conn.execute(text(f"""
        CREATE TABLE {target_table} AS
        SELECT {identity_column_sql}
        FROM raw.financial_metrics
        WHERE FALSE
    """))
    conn.execute(text(f"""
        INSERT INTO {target_table} ({identity_column_sql})
        SELECT {identity_column_sql}
        FROM raw.financial_metrics
    """))
print(f"âœ… {target_table} initialized.")

# --------------------------
# 2. Load Full Raw Dataset
# --------------------------
def read_financial_metrics():
    """Load the whole raw.financial_metrics table into a polars DataFrame.

    The table is streamed out of Postgres with ``COPY ... TO STDOUT`` as CSV
    (header row included) into an in-memory buffer, which polars then parses.

    Returns:
        pl.DataFrame: one column per CSV header field, with dtypes inferred
        by polars from the CSV text.
    """
    conn = get_postgres_connection()
    try:
        cur = conn.cursor()
        try:
            buf = io.BytesIO()
            cur.copy_expert("COPY raw.financial_metrics TO STDOUT WITH CSV HEADER", buf)
        finally:
            # Release the cursor even when COPY fails.
            cur.close()
    finally:
        # The data is already buffered in memory, so the connection can be
        # closed before parsing; this also guarantees it is closed on error.
        conn.close()

    buf.seek(0)
    # Scan more rows than the polars default before fixing column dtypes, so a
    # metric whose first rows are empty is not mis-inferred as a string column.
    return pl.read_csv(buf, infer_schema_length=10000)

df_raw = read_financial_metrics()

# --------------------------
# 3. Percentile Processing + Insertion Loop
# --------------------------
# Eleven cut points delimit twelve buckets, so there is one more label than level.
percentile_levels = [0.01, 0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90, 0.99]
labels = ["<1%", "10%", "20%", "30%", "40%", "50%", "60%", "70%", "80%", "90%", "99%", ">99%"]

# Raw psycopg2 connection (used for COPY); the SQLAlchemy driver prefix is
# stripped so the URL is accepted by psycopg2.connect.
pg_conn = psycopg2.connect(DATABASE_URL.replace("postgresql+psycopg2", "postgresql"))
cursor = pg_conn.cursor()

try:
    for metric in metrics:
        perc_column = f"{metric}_perc"
        staging_table = f"tmp_{perc_column}"

        print(f"\nðŸ“Š Processing: {metric} â†’ {perc_column}")

        # --------------------------
        # Create Percentile Buckets
        # --------------------------
        # Rows where the metric is null are excluded from both the quantile
        # computation and the bucket assignment.
        df_metric = df_raw.select(identity_columns + [metric]).filter(pl.col(metric).is_not_null())

        cutoffs = df_metric.select([
            pl.col(metric).quantile(q, "nearest").alias(f"p{int(q*100):02d}")
            for q in percentile_levels
        ]).row(0)

        # Bucket k covers [bins[k], bins[k + 1]); the outer edges are infinite.
        bins = [-np.inf] + list(cutoffs) + [np.inf]

        mapping_df = pl.DataFrame({
            "label": labels,
            "low": bins[:-1],
            "high": bins[1:],
        }).with_columns([
            pl.when(pl.col("low") == float("-inf"))
            .then(pl.lit("-âˆž"))
            .otherwise(pl.col("low").round(2).cast(pl.Utf8))
            .alias("low_fmt"),

            pl.when(pl.col("high") == float("inf"))
            .then(pl.lit("+âˆž"))
            .otherwise(pl.col("high").round(2).cast(pl.Utf8))
            .alias("high_fmt")
        ]).with_columns([
            (pl.col("label") + " (" + pl.col("low_fmt") + " â€“ " + pl.col("high_fmt") + ")")
            .alias("bracket_display")
        ])

        # Half-open membership test, plus an explicit clause so a value equal
        # to the top bucket's upper edge (+inf) still lands in ">99%".
        in_bucket = (pl.col(metric) >= pl.col("low")) & (pl.col(metric) < pl.col("high"))
        at_top_edge = (pl.col("label") == ">99%") & (pl.col(metric) == pl.col("high"))

        df_with_perc = (
            df_metric.lazy()
            .join(mapping_df.lazy(), how="cross")
            .filter(in_bucket | at_top_edge)
            .select([
                "symbol", "date",
                pl.col("bracket_display").alias(perc_column)
            ])
            .collect()
        )

        # --------------------------
        # Write to Staging Table
        # --------------------------
        df_pd = df_with_perc.to_pandas()
        df_pd["date"] = pd.to_datetime(df_pd["date"])

        # copy_from below receives the bare table name, so the staging schema
        # is selected through search_path.
        cursor.execute(f"SET search_path TO {staging_schema}")
        cursor.execute(f"DROP TABLE IF EXISTS {staging_table}")
        cursor.execute(f"""
            CREATE TABLE {staging_table} (
                symbol TEXT,
                date DATE,
                {perc_column} TEXT
            )
        """)
        pg_conn.commit()

        output = io.StringIO()
        df_pd.to_csv(output, sep="\t", index=False, header=False, na_rep='\\N')
        output.seek(0)

        cursor.copy_from(output, staging_table, sep="\t", null='\\N')
        pg_conn.commit()
        print("ðŸ“¥ Uploaded to staging table.")

        # --------------------------
        # Ensure Target Column Exists
        # --------------------------
        # TEXT rather than VARCHAR(30): a label embeds two formatted bounds,
        # so it can exceed 30 characters for large-magnitude metrics, which
        # would make the UPDATE below fail. The staging column is TEXT too.
        with engine.begin() as conn:
            conn.execute(text(f"""
                ALTER TABLE {target_table}
                ADD COLUMN IF NOT EXISTS {perc_column} TEXT;
            """))

        # --------------------------
        # Merge into Final Table
        # --------------------------
        with engine.begin() as conn:
            conn.execute(text(f"""
                UPDATE {target_table} t
                SET {perc_column} = s.{perc_column}
                FROM {staging_schema}.{staging_table} s
                WHERE t.symbol = s.symbol
                  AND t.date = s.date::date
            """))

        print(f"âœ… Column {perc_column} updated in target table")

        # --------------------------
        # Cleanup staging table
        # --------------------------

        cursor.execute(f"DROP TABLE IF EXISTS {staging_schema}.{staging_table}")
        pg_conn.commit()
        print(f"ðŸ§¹ Dropped staging table: {staging_schema}.{staging_table}")
finally:
    # Always release the cursor and connection, including when a metric fails
    # part-way through the loop.
    cursor.close()
    pg_conn.close()


ðŸ“¦ Recreating clean.financial_metrics_perc...
âœ… clean.financial_metrics_perc initialized.

ðŸ“Š Processing: gross_profit_margin â†’ gross_profit_margin_perc
ðŸ“¥ Uploaded to staging table.
âœ… Column gross_profit_margin_perc updated in target table
ðŸ§¹ Dropped staging table: stage.tmp_gross_profit_margin_perc

ðŸ“Š Processing: ebit_margin â†’ ebit_margin_perc
ðŸ“¥ Uploaded to staging table.
âœ… Column ebit_margin_perc updated in target table
ðŸ§¹ Dropped staging table: stage.tmp_ebit_margin_perc

ðŸ“Š Processing: ebitda_margin â†’ ebitda_margin_perc
ðŸ“¥ Uploaded to staging table.
âœ… Column ebitda_margin_perc updated in target table
ðŸ§¹ Dropped staging table: stage.tmp_ebitda_margin_perc

ðŸ“Š Processing: operating_profit_margin â†’ operating_profit_margin_perc
ðŸ“¥ Uploaded to staging table.
âœ… Column operating_profit_margin_perc updated in target table
ðŸ§¹ Dropped staging table: stage.tmp_operating_profit_margin_perc

ðŸ“Š Processing: pretax_profit_margin â†’ pretax_profit_m

In [1]:
import io
import numpy as np
import pandas as pd
import polars as pl
import psycopg2
from sqlalchemy import create_engine, text
from sqlalchemy.engine import Engine
from utils.utils import get_database_url, get_postgres_connection

# --------------------------
# Config
# --------------------------
DATABASE_URL = get_database_url()
engine: Engine = create_engine(DATABASE_URL)

# Destination table, and the schema that holds the per-batch staging tables.
target_table = "clean.financial_metrics_perc"
staging_schema = "stage"

# Columns copied from the raw table into the target table unchanged.
identity_columns = [
    "symbol",
    "date",
    "fiscal_year",
    "period",
    "reported_currency",
]

# Metric columns to bucketize. The list is sliced in order into groups of
# BATCH_SIZE, so the order here decides which metrics share a batch.
metrics = [
    "gross_profit_margin",
    "ebit_margin",
    "ebitda_margin",
    "operating_profit_margin",
    "pretax_profit_margin",
    "continuous_operations_profit_margin",
    "net_profit_margin",
    "bottom_line_profit_margin",
    "current_ratio",
    "quick_ratio",
    "solvency_ratio",
    "cash_ratio",
    "receivables_turnover",
    "payables_turnover",
    "inventory_turnover",
    "fixed_asset_turnover",
    "asset_turnover",
    "working_capital_turnover_ratio",
    "price_to_earnings_ratio",
    "price_to_earnings_growth_ratio",
    "forward_price_to_earnings_growth_ratio",
    "price_to_book_ratio",
    "price_to_sales_ratio",
    "price_to_free_cash_flow_ratio",
    "price_to_operating_cash_flow_ratio",
    "price_to_fair_value",
    "debt_to_assets_ratio",
    "debt_to_equity_ratio",
    "debt_to_capital_ratio",
    "long_term_debt_to_capital_ratio",
    "financial_leverage_ratio",
    "debt_to_market_cap",
    "operating_cash_flow_ratio",
    "operating_cash_flow_sales_ratio",
    "free_cash_flow_operating_cash_flow_ratio",
    "debt_service_coverage_ratio",
    "interest_coverage_ratio",
    "short_term_operating_cash_flow_coverage_ratio",
    "operating_cash_flow_coverage_ratio",
    "capital_expenditure_coverage_ratio",
    "dividend_paid_and_capex_coverage_ratio",
    "dividend_payout_ratio",
    "dividend_yield",
    "dividend_yield_percentage",
    "dividend_per_share",
    "revenue_per_share",
    "net_income_per_share",
    "interest_debt_per_share",
    "cash_per_share",
    "book_value_per_share",
    "tangible_book_value_per_share",
    "shareholders_equity_per_share",
    "operating_cash_flow_per_share",
    "capex_per_share",
    "free_cash_flow_per_share",
    "net_income_per_ebt",
    "ebt_per_ebit",
    "effective_tax_rate",
    "enterprise_value_multiple",
]

# Quantile cut points; eleven cut points delimit twelve buckets, one label each.
percentile_levels = [
    0.01, 0.10, 0.20, 0.30, 0.40, 0.50,
    0.60, 0.70, 0.80, 0.90, 0.99,
]
labels = [
    "<1%", "10%", "20%", "30%", "40%", "50%",
    "60%", "70%", "80%", "90%", "99%", ">99%",
]

# Number of metrics bucketized and merged per staging-table round trip.
BATCH_SIZE = 10

# --------------------------
# Helpers
# --------------------------
def read_financial_metrics() -> pl.DataFrame:
    """Load raw.financial_metrics into polars with typed identity columns.

    The table is streamed out of Postgres with ``COPY ... TO STDOUT`` as CSV
    (header row included) into an in-memory buffer and parsed by polars. The
    five identity columns are then cast to fixed dtypes; all other columns
    keep whatever dtype polars inferred.

    Returns:
        pl.DataFrame: the full table, with ``symbol``/``period``/
        ``reported_currency`` as Utf8, ``date`` as Date and ``fiscal_year``
        as Int64.
    """
    conn = get_postgres_connection()
    try:
        cur = conn.cursor()
        try:
            buf = io.BytesIO()
            cur.copy_expert("COPY raw.financial_metrics TO STDOUT WITH CSV HEADER", buf)
        finally:
            # Release the cursor even when COPY fails.
            cur.close()
    finally:
        # The data is already buffered in memory, so the connection can be
        # closed before parsing; this also guarantees it is closed on error.
        conn.close()

    buf.seek(0)
    df = pl.read_csv(buf, infer_schema_length=10000)
    return df.with_columns([
        pl.col("symbol").cast(pl.Utf8),
        # strict=False: values that fail to parse become null instead of raising.
        pl.col("date").str.strptime(pl.Date, strict=False),
        pl.col("fiscal_year").cast(pl.Int64),
        pl.col("period").cast(pl.Utf8),
        pl.col("reported_currency").cast(pl.Utf8),
    ])

def bucketize_metric(df: pl.DataFrame, metric: str) -> pl.DataFrame:
    """Assign every non-null value of ``metric`` to a percentile bucket label.

    Cut points are the ``percentile_levels`` quantiles of the non-null values
    (nearest-rank). Together with -inf/+inf they delimit half-open buckets
    ``[low, high)``; each bucket's label is the matching entry of ``labels``
    followed by the formatted bounds.

    Args:
        df: frame containing the identity columns and the ``metric`` column.
        metric: name of the numeric column to bucketize.

    Returns:
        A frame with columns ``symbol``, ``date`` and ``{metric}_perc`` holding
        one row per non-null input row that falls in a bucket. When the metric
        has no non-null values the frame is empty but keeps the same column
        names, with ``symbol``/``date`` typed as in ``df``.
    """
    perc_column = f"{metric}_perc"
    non_null = df.select(identity_columns + [metric]).filter(pl.col(metric).is_not_null())
    if non_null.height == 0:
        # Typed empty result: untyped (Null dtype) columns cannot be joined
        # against the typed symbol/date keys of the caller's frame.
        return pl.DataFrame(schema={
            "symbol": df.schema["symbol"],
            "date": df.schema["date"],
            perc_column: pl.Utf8,
        })

    qs = non_null.select([
        pl.col(metric).quantile(q, "nearest").alias(f"p{int(q*100):02d}")
        for q in percentile_levels
    ]).row(0)

    # Bucket k covers [lows[k], highs[k]); the outer edges are infinite.
    bins = [-np.inf] + list(qs) + [np.inf]
    lows, highs = bins[:-1], bins[1:]

    def fmt(x):
        """Render a bucket bound: infinity symbols or two decimal places."""
        if x == float("-inf"):
            return "-âˆž"
        if x == float("inf"):
            return "+âˆž"
        return f"{x:.2f}"

    bracket_display = [f"{lab} ({fmt(lo)} â€“ {fmt(hi)})" for lab, lo, hi in zip(labels, lows, highs)]
    last = len(lows) - 1
    mapping_df = pl.DataFrame({
        "_low": lows,
        "_high": highs,
        "_label": bracket_display,
        # Marks the final bucket, whose upper edge must be inclusive.
        "_is_top": [k == last for k in range(len(lows))],
    })

    in_bucket = (pl.col(metric) >= pl.col("_low")) & (pl.col(metric) < pl.col("_high"))
    # `value < +inf` is false for value == +inf, so without this clause +inf
    # values would match no bucket and silently disappear from the result.
    at_top_edge = pl.col("_is_top") & (pl.col(metric) == pl.col("_high"))

    return (
        non_null.lazy()
        .join(mapping_df.lazy(), how="cross")
        .filter(in_bucket | at_top_edge)
        .select(["symbol", "date", pl.col("_label").alias(perc_column)])
        .collect()
    )

# --------------------------
# 1) Read raw + identities
# --------------------------
print("ðŸ“¥ Reading raw.financial_metrics â€¦")
df_raw = read_financial_metrics()
# One row per distinct identity tuple; these become the rows of the target table.
df_base = df_raw.select(identity_columns).unique()

# --------------------------
# 2) Drop + recreate target
# --------------------------
print("ðŸ§¨ Dropping + recreating target table â€¦")
with engine.begin() as conn:
    conn.execute(text(f"DROP TABLE IF EXISTS {target_table}"))
    ddl = f"""
        CREATE TABLE {target_table} (
            symbol TEXT,
            date DATE,
            fiscal_year INTEGER,
            period TEXT,
            reported_currency TEXT
        ) WITH (fillfactor=100);
    """
    conn.execute(text(ddl))

# Insert identities once
# Raw psycopg2 connection (used for COPY); the SQLAlchemy driver prefix is
# stripped so the URL is accepted by psycopg2.connect. The connection and
# cursor stay open because the batch loop that follows reuses them.
pg_conn = psycopg2.connect(DATABASE_URL.replace("postgresql+psycopg2","postgresql"))
cur = pg_conn.cursor()

identities_pd = df_base.to_pandas()
# A fiscal_year column containing nulls arrives in pandas as float64 and would
# be written as "2020.0", which COPY rejects for an INTEGER column. The
# nullable Int64 dtype writes "2020" and leaves missing values to na_rep.
identities_pd["fiscal_year"] = identities_pd["fiscal_year"].astype("Int64")

buf = io.StringIO()
identities_pd.to_csv(buf, sep="\t", index=False, header=False, na_rep="\\N")
buf.seek(0)
cols_sql = ", ".join([f'"{c}"' for c in identity_columns])
cur.copy_expert(f"COPY {target_table} ({cols_sql}) FROM STDIN WITH (FORMAT text, DELIMITER E'\\t', NULL '\\N')", buf)
pg_conn.commit()

# --------------------------
# 3) Process metrics in batches
# --------------------------
# Each batch: bucketize its metrics, left-join the labels onto the identity
# rows, COPY the result into a staging table, then UPDATE the target from it.
try:
    for i in range(0, len(metrics), BATCH_SIZE):
        batch = metrics[i:i+BATCH_SIZE]
        perc_cols = [f"{m}_perc" for m in batch]
        print(f"ðŸ“Š Processing batch {i//BATCH_SIZE+1}: {batch}")

        batch_df = df_base
        for m in batch:
            df_m = bucketize_metric(df_raw, m)
            if df_m.height > 0:
                # NOTE(review): the join key is (symbol, date) only; if that pair
                # is not unique in the raw data this join multiplies rows — confirm.
                batch_df = batch_df.join(df_m, on=["symbol","date"], how="left")
            else:
                # No bucketed rows for this metric: still add its column (all
                # null) so the column selection and staging DDL below, which
                # expect one column per metric in the batch, do not fail.
                batch_df = batch_df.with_columns(
                    pl.lit(None).cast(pl.Utf8).alias(f"{m}_perc")
                )

        pdf = batch_df.to_pandas()
        pdf["date"] = pd.to_datetime(pdf["date"]).dt.date

        # Create staging
        staging = f"{staging_schema}.tmp_batch_{i}"
        cur.execute(f"DROP TABLE IF EXISTS {staging}")
        cur.execute(f'CREATE TABLE {staging} (symbol TEXT, date DATE, ' +
                    ", ".join([f'"{c}" TEXT' for c in perc_cols]) + ")")
        pg_conn.commit()

        # COPY batch into staging
        cols = ["symbol","date"] + perc_cols
        buf = io.StringIO()
        pdf[cols].to_csv(
            buf, sep="\t", index=False, header=False, na_rep="\\N"
        )
        buf.seek(0)
        cols_sql = ", ".join([f'"{c}"' for c in cols])
        cur.copy_expert(f"COPY {staging} ({cols_sql}) FROM STDIN WITH (FORMAT text, DELIMITER E'\\t', NULL '\\N')", buf)
        pg_conn.commit()

        # Ensure target has the columns, then merge — both in one transaction.
        with engine.begin() as conn:
            for m in batch:
                conn.execute(text(f'ALTER TABLE {target_table} ADD COLUMN IF NOT EXISTS "{m}_perc" TEXT'))

            set_clause = ", ".join([f'"{m}_perc" = s."{m}_perc"' for m in batch])
            conn.execute(text(f"""
                UPDATE {target_table} t
                SET {set_clause}
                FROM {staging} s
                WHERE t.symbol = s.symbol
                  AND t.date = s.date
            """))

        cur.execute(f"DROP TABLE IF EXISTS {staging}")
        pg_conn.commit()
        print(f"âœ… Batch {i//BATCH_SIZE+1} merged.")
finally:
    # Release the cursor and connection even when a batch fails part-way.
    cur.close()
    pg_conn.close()

print("ðŸŽ‰ Done â€” all batches merged into final table without overwriting.")


ðŸ“¥ Reading raw.financial_metrics â€¦
ðŸ§¨ Dropping + recreating target table â€¦
ðŸ“Š Processing batch 1: ['gross_profit_margin', 'ebit_margin', 'ebitda_margin', 'operating_profit_margin', 'pretax_profit_margin', 'continuous_operations_profit_margin', 'net_profit_margin', 'bottom_line_profit_margin', 'current_ratio', 'quick_ratio']
âœ… Batch 1 merged.
ðŸ“Š Processing batch 2: ['solvency_ratio', 'cash_ratio', 'receivables_turnover', 'payables_turnover', 'inventory_turnover', 'fixed_asset_turnover', 'asset_turnover', 'working_capital_turnover_ratio', 'price_to_earnings_ratio', 'price_to_earnings_growth_ratio']
âœ… Batch 2 merged.
ðŸ“Š Processing batch 3: ['forward_price_to_earnings_growth_ratio', 'price_to_book_ratio', 'price_to_sales_ratio', 'price_to_free_cash_flow_ratio', 'price_to_operating_cash_flow_ratio', 'price_to_fair_value', 'debt_to_assets_ratio', 'debt_to_equity_ratio', 'debt_to_capital_ratio', 'long_term_debt_to_capital_ratio']
âœ… Batch 3 merged.
ðŸ“Š Processing batch 4: 