### Mount Google Drive, Import Libraries and Define Paths

In [3]:
# =============================================================================
# ENVIRONMENT SETUP + PATH CONFIGURATION (SERVER / COLAB COMPATIBLE)
# =============================================================================

import os
import sys
import importlib
from pathlib import Path
import string
import re
import gc
from datetime import timedelta
from scipy.stats.mstats import winsorize
import shutil

# -----------------------------------------------------------------------------
# 0) HARD SAFETY: cap native thread usage (prevents pthread_create EAGAIN)
#    MUST be set before importing numpy / scipy / pandas
# -----------------------------------------------------------------------------
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["OPENBLAS_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["NUMEXPR_MAX_THREADS"] = "1"
os.environ["VECLIB_MAXIMUM_THREADS"] = "1"
os.environ["BLIS_NUM_THREADS"] = "1"

# -----------------------------------------------------------------------------
# 1) Detect environment
# -----------------------------------------------------------------------------
IN_COLAB = "google.colab" in sys.modules

# -----------------------------------------------------------------------------
# 2) (Colab only) Mount Google Drive
# -----------------------------------------------------------------------------
if IN_COLAB:
    from google.colab import drive
    drive.mount("/content/drive")
    BASE_PATH = "/content/drive/MyDrive/Colab Notebooks"
else:
    # Server base path (your target)
    BASE_PATH = "/home/jovyan/work/hpool1/pseidel/test"

print("IN_COLAB:", IN_COLAB)
print("BASE_PATH:", BASE_PATH)

# -----------------------------------------------------------------------------
# 3) Sanity checks: path exists + write permission
# -----------------------------------------------------------------------------
BASE = Path(BASE_PATH)
if not BASE.exists():
    raise FileNotFoundError(f"BASE_PATH does not exist: {BASE}")

# quick write test (fails fast if you don't have permissions)
test_file = BASE / ".write_test_tmp"
try:
    test_file.write_text("ok", encoding="utf-8")
    test_file.unlink()
except Exception as e:
    raise PermissionError(f"No write permission in {BASE}. Error: {e}")

# -----------------------------------------------------------------------------
# 4) Environment check: ensure required packages import cleanly
# -----------------------------------------------------------------------------
required_packages = ["numpy", "scipy", "pandas", "linearmodels", "xlsxwriter"]

for pkg in required_packages:
    print(f"Importing {pkg} ...")
    importlib.import_module(pkg)
    print(f"{pkg} OK")

import numpy as np
import pandas as pd

# -----------------------------------------------------------------------------
# 5) Base paths and input/output locations
# -----------------------------------------------------------------------------
Input_file_path   = str(BASE / "Input")
Temp_file_path    = str(BASE / "Temp")
Output_file_path  = str(BASE / "Output")

Fundamentals_file_path = f"{Input_file_path}/WSFV_f_20250131.txt"
Current_file_path      = f"{Input_file_path}/WSCurrent_f_20250131.txt"
Calendar_file_path     = f"{Input_file_path}/WSCalendarPrd_f_20250131.txt"
Meta_file_path         = f"{Input_file_path}/WSMetaData_f_20250131.txt"
Excel_file_path        = f"{Input_file_path}/WS PIT Table Definitions V5 with start dates.xls"

MarketValues_file_path          = f"{Input_file_path}/Daily MV USD"
MarketValues_file_path_LC       = f"{Input_file_path}/Daily MV LC"
DailyTotalReturns_file_path     = f"{Input_file_path}/Daily Returns USD"
DailyIndexReturns_file_path     = f"{Input_file_path}/Daily Index Returns USD"
Constituents_file_path          = f"{Input_file_path}/Constituents.01.csv"
UniversalMatching_file_path     = f"{Input_file_path}/Universal Matching File"

Temp_file_path_GO  = f"{Temp_file_path}/TempGeneralOverview"
Temp_file_path_EoC = f"{Temp_file_path}/TempExtractionofCharacteristics"
Temp_file_path_DP  = f"{Temp_file_path}/TempDataPreparation"
Temp_file_path_A   = f"{Temp_file_path}/TempAnomalies"
Temp_file_path_R   = f"{Temp_file_path}/TempRegressionModel"

Relevant_items_path   = f"{Input_file_path}/RelevantItems.txt"
Relevant_items_path_A = f"{Input_file_path}/RelevantItems.txt"
Relevant_items_path_B = f"{Input_file_path}/RelevantItemsB.txt"
Relevant_items_path_C = f"{Input_file_path}/RelevantItemsC.txt"
Relevant_items_path_D = f"{Input_file_path}/RelevantItemsD.txt"

Subset_file_path = f"{Temp_file_path_GO}/Subsets"
Fundamentals_clean_file_path = f"{Temp_file_path_GO}/Fundamentals_clean.txt"
Current_clean_file_path      = f"{Temp_file_path_GO}/Current_clean.txt"
Calendar_clean_file_path     = f"{Temp_file_path_GO}/Input/Calendar_clean.txt"
Meta_clean_file_path         = f"{Temp_file_path_GO}/Input/Meta_clean.txt"

# -----------------------------------------------------------------------------
# 6) Ensure required directories exist
# -----------------------------------------------------------------------------
Path(Output_file_path).mkdir(parents=True, exist_ok=True)
Path(Temp_file_path_GO).mkdir(parents=True, exist_ok=True)
Path(Temp_file_path_EoC).mkdir(parents=True, exist_ok=True)
Path(Temp_file_path_DP).mkdir(parents=True, exist_ok=True)
Path(Temp_file_path_A).mkdir(parents=True, exist_ok=True)
Path(Temp_file_path_R).mkdir(parents=True, exist_ok=True)
Path(Subset_file_path).mkdir(parents=True, exist_ok=True)
Path(Path(Calendar_clean_file_path).parent).mkdir(parents=True, exist_ok=True)

# -----------------------------------------------------------------------------
# 7) Streaming / deduplication settings
# -----------------------------------------------------------------------------
CHUNK_SIZE = 2_000_000
DATE_COL = "PIT Date"
DEDUP_KEYS = ["ID", "ItemCode", DATE_COL]

print("Paths configured. Temp outputs ->", Temp_file_path_GO)
print("Example input path ->", Fundamentals_file_path)


IN_COLAB: False
BASE_PATH: /home/jovyan/work/hpool1/pseidel/test
Importing numpy ...
numpy OK
Importing scipy ...
scipy OK
Importing pandas ...
pandas OK
Importing linearmodels ...
linearmodels OK
Importing xlsxwriter ...
xlsxwriter OK
Paths configured. Temp outputs -> /home/jovyan/work/hpool1/pseidel/test/Temp/TempGeneralOverview
Example input path -> /home/jovyan/work/hpool1/pseidel/test/Input/WSFV_f_20250131.txt


In [4]:
!free -h

               total        used        free      shared  buff/cache   available
Mem:           754Gi       253Gi       294Gi        55Mi       215Gi       501Gi
Swap:             0B          0B          0B


# 1.0. Data

### Import and Rename Data

In [5]:
# ================================================================
# Summary:
# This cell scans INPUT_DIR for *.txt files whose names end with
# "_complete_subset.txt" and start with either "Annualized_" or
# "Mixed_". Matching files are copied to OUTPUT_DIR and renamed to
# "<core>.txt", where <core> is the middle part of the filename.
# Existing target files are not overwritten.
# ================================================================


# === CONFIGURATION ===
INPUT_DIR = Path(Temp_file_path_DP)    # Directory containing source .txt files
OUTPUT_DIR = Path(Temp_file_path_A)    # Directory where renamed files are stored
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)  # Ensure output directory exists
# =====================

new_files = []  # List to track newly created output filenames

# Iterate through all .txt files in the input directory
for file in INPUT_DIR.glob("*.txt"):
    name = file.name          # Extract the filename
    core = None               # Will hold the extracted core name

    # Case: filename starts with "Annualized_" and ends with "_complete_subset.txt"
    # Example: Annualized_ABC_complete_subset.txt -> ABC.txt
    if name.startswith("Annualized_") and name.endswith("_complete_subset.txt"):
        core = name[len("Annualized_"):-len("_complete_subset.txt")]

    # Case: filename starts with "Mixed_" and ends with "_complete_subset.txt"
    # Example: Mixed_ABC_complete_subset.txt -> ABC.txt
    elif name.startswith("Mixed_") and name.endswith("_complete_subset.txt"):
        core = name[len("Mixed_"):-len("_complete_subset.txt")]

    # Case: filename starts with "Special_" and ends with "_complete_subset.txt"
    # Example: Special_ABC_complete_subset.txt -> ABC.txt
    elif name.startswith("Special_") and name.endswith("_complete_subset.txt"):
        core = name[len("Special_"):-len("_complete_subset.txt")]

    # Skip files that do not match the required pattern
    if not core:
        continue

    # Construct the new filename: <core>.txt
    new_name = f"{core}.txt"
    dest = OUTPUT_DIR / new_name  # Full destination path

    # If the destination file already exists, skip copying
    if dest.exists():
        print(f"Skipping {name}: target {new_name} already exists.")
        continue

    # Attempt to copy the file, preserving metadata
    try:
        shutil.copy2(file, dest)
        new_files.append(new_name)  # Track successful copies
    except OSError as e:
        print(f"[ERROR] Failed to copy {file} -> {dest}: {e}")

# Output the list of copied and renamed files
print("\nCopied & renamed files:")
for f in sorted(new_files):
    print(f)

# Also show all .txt files currently in the output directory
print("\nAll .txt files currently in OUTPUT_DIR:")
for f in sorted(p.name for p in OUTPUT_DIR.glob("*.txt")):
    print(f)



Copied & renamed files:
Accounts_Payable.txt
Capital_Expenditures_Addtns_to_Fixed_Assets.txt
Cash_Dividends_Paid___Total.txt
Cash__Short_Term_Investments.txt
Com_Pfd_Redeemed_Retired_Converted_Etc..txt
Common_Equity.txt
Cost_of_Goods_Sold_Excl_Depreciation.txt
Current_Assets___Total.txt
Current_Liabilities___Total.txt
Deferred_Taxes.txt
Depreciation_Depletion__Amortization.txt
Disposal_of_Fixed_Assets.txt
Earnings_Per_Share_Fiscal_Year_End.txt
Extraordinary_Items.txt
Funds_From_For_Other_Operating_Activities.txt
Funds_From_Operations.txt
Income_Taxes.txt
Income_Taxes_Payable.txt
Interest_Expense___Total.txt
Inventories___Total.txt
Investments_in_Associated_Companies.txt
Investments_in_Sales__Direct_Financing_Leases.txt
Long_Term_Borrowings.txt
Long_Term_Debt.txt
Long_Term_Receivables.txt
Minority_Interest.txt
Net_Cash_Flow___Financing.txt
Net_Cash_Flow___Investing.txt
Net_Cash_Flow___Operating_Activities.txt
Net_Income_Before_Extra_Items_Preferred_Divs.txt
Net_Income_Used_to_Calculate

### Check for Empty or Error Rows

In [6]:
from pathlib import Path
import os
import pandas as pd

# =======================================================================
# Summary:
# This cell processes all *.txt files in OUTPUT_DIR (no chunking).
#
# AnnPITValue filtering:
#   - Rows with empty AnnPITValue are REMOVED
#   - Non-numeric AnnPITValue values are TRACKED (but not removed)
#   - Suspiciously small non-zero numeric values are DETECTED:
#       * Default:            0 < |value| < 0.01
#       * Strict files:       0 < |value| < 1.0   (all 0.x values)
#       * Round-to-zero files 0 < |value| < 0.05  -> rounded to 0 (not removed)
#     -> Normally, these rows are REMOVED
#     -> EXCEPTION: for files in FILES_WITHOUT_SMALL_FILTER, they are
#        only REPORTED but NOT REMOVED.
#
# Optional Q_lag0..Q_lag7 filtering (only for selected files):
#   - Empty values are explicitly ALLOWED
#   - Non-numeric values are TRACKED (but not removed)
#   - Suspiciously small non-zero numeric values are DETECTED:
#       0 < |value| < 0.01
#     -> Normally, these rows are REMOVED
#     -> EXCEPTION: for files in FILES_WITHOUT_SMALL_FILTER, they are
#        only REPORTED but NOT REMOVED.
#
# All files are rewritten in-place using a temporary file.
# Detailed statistics and example values are printed at the end.
# =======================================================================

# === CONFIGURATION ===
OUTPUT_DIR = Path(Temp_file_path_A)   # Directory containing the files to validate
SEP = "|"                             # Input files are pipe-separated

# ONLY these files will receive the Q_lag0..Q_lag7 filtering
FILES_WITH_Q_LAG_FILTER = { #### Just for the case that I add quarters back

}

# Files where the "near-zero" filter (0 < |value| < 0.01) should NOT
# actually remove rows. For these files, we still DETECT and REPORT
# suspiciously small values, but we DO NOT DROP them.
FILES_WITHOUT_SMALL_FILTER = {
    "Earnings_Per_Share_Fiscal_Year_End.txt",
    "Sales_Per_Share.txt",
}

# Files where we want a STRICT small-value filter for AnnPITValue:
# here, ANY non-zero 0.x value (0 < |value| < 1.0) is considered too small.
FILES_WITH_STRICT_SMALL_FILTER = {
    "Total_Assets.txt",
    "Total_Liabilities.txt",
    "Net_Sales_or_Revenues.txt",
}

# Files where small AnnPITValue values should be rounded to 0 instead of
# being dropped. Threshold: 0 < |value| < 0.05.
FILES_WITH_ROUND_SMALL_TO_ZERO = {
    "Net_Income_Before_Extra_Items_Preferred_Divs.txt",
    "Net_Income_Used_to_Calculate_Basic_EPS.txt",
}

# Define the Q_lag columns explicitly
Q_LAG_COLUMNS = [f"Q_lag{i}" for i in range(8)]
# =====================

# ---- GLOBAL TRACKING DICTIONARIES ----

rows_dropped_annpit = {}          # Rows dropped because AnnPITValue was empty
non_numeric_in_value = {}         # Non-numeric AnnPITValue tracking
small_magnitude_in_value = {}     # Small-magnitude AnnPITValue tracking

# Q_lag statistics only for selected files
non_numeric_in_qleg = {}          # Non-numeric Q_lag tracking
small_magnitude_in_qleg = {}      # Small-magnitude Q_lag tracking


# =======================================================================
#                               MAIN LOOP
# =======================================================================

for path in sorted(OUTPUT_DIR.glob("*.txt")):
    fname = path.name
    tmp_path = path.with_suffix(".tmp")

    # Remove any leftover temp file from previous runs
    if tmp_path.exists():
        tmp_path.unlink()

    # --- Load entire file (NO CHUNKING) ---
    try:
        df = pd.read_csv(path, sep=SEP, dtype=str)
    except Exception as e:
        print(f"[WARN] Skipping {fname}: cannot read ({e})")
        continue

    # --- Ensure AnnPITValue exists ---
    if "AnnPITValue" not in df.columns:
        print(f"[WARN] {fname}: no 'AnnPITValue' column found.")
        continue

    # ===================================================================
    #                     AnnPITValue: EMPTY REMOVAL
    # ===================================================================

    # Use the original column for NaN detection, and a string version for whitespace checks
    col_ann = df["AnnPITValue"]
    s_ann = col_ann.astype(str)

    # Drop rows where AnnPITValue is NaN or only whitespace / empty
    mask_non_empty = (~col_ann.isna()) & (s_ann.str.strip() != "")
    total_dropped_empty = int(len(df) - mask_non_empty.sum())

    # Keep only rows with non-empty AnnPITValue
    df = df[mask_non_empty]

    # ===================================================================
    #               AnnPITValue: NUMERIC VALIDATION
    # ===================================================================

    s = df["AnnPITValue"].fillna("").astype(str).str.strip()
    coerced = pd.to_numeric(s, errors="coerce")

    # --- Track NON-NUMERIC values (DO NOT remove) ---
    mask_nn = (s != "") & coerced.isna()
    nn_count = int(mask_nn.sum())
    nn_examples = {}

    if nn_count > 0:
        vc = s[mask_nn].value_counts()
        for val, cnt in vc.items():
            if len(nn_examples) < 10 or val in nn_examples:
                nn_examples[val] = nn_examples.get(val, 0) + int(cnt)

    # --- Filter / detect SUSPICIOUSLY SMALL NON-ZERO numeric values ---
    # Default threshold:   0.01
    # Strict files:        1.0  (so all 0.x values are treated as too small)
    # Round-to-zero files: 0.05 (values below are set to 0, not dropped)
    if fname in FILES_WITH_STRICT_SMALL_FILTER:
        small_threshold = 1.0
    elif fname in FILES_WITH_ROUND_SMALL_TO_ZERO:
        small_threshold = 0.05
    else:
        small_threshold = 0.01

    mask_numeric = ~coerced.isna()
    mask_small = mask_numeric & (coerced != 0) & (coerced.abs() < small_threshold)

    small_match_count = int(mask_small.sum())   # how many values match the criterion
    small_dropped = 0                           # how many rows are actually removed
    small_examples = {}

    if small_match_count > 0:
        vc_small = s[mask_small].value_counts()
        for val, cnt in vc_small.items():
            if len(small_examples) < 10 or val in small_examples:
                small_examples[val] = small_examples.get(val, 0) + int(cnt)

        if fname in FILES_WITH_ROUND_SMALL_TO_ZERO:
            # Round small AnnPITValue entries to 0 instead of dropping rows
            df.loc[mask_small, "AnnPITValue"] = "0"
            small_dropped = 0
        elif fname not in FILES_WITHOUT_SMALL_FILTER:
            # Default behaviour: drop rows with small values
            df = df[~mask_small]
            small_dropped = small_match_count
        # If the file is in FILES_WITHOUT_SMALL_FILTER, we leave df unchanged
        # but still report the suspicious values.

    # ===================================================================
    #           OPTIONAL: Q_lag0..Q_lag7 FILTERING (SELECTED FILES)
    # ===================================================================

    qleg_non_numeric_stats = {}
    qleg_small_stats = {}

    if fname in FILES_WITH_Q_LAG_FILTER:
        for col in Q_LAG_COLUMNS:

            # Skip if column does not exist
            if col not in df.columns:
                continue

            # Convert to clean string
            s_q = df[col].fillna("").astype(str).str.strip()
            coerced_q = pd.to_numeric(s_q, errors="coerce")

            # --- Track NON-NUMERIC values (but DO NOT drop) ---
            mask_nn_q = (s_q != "") & coerced_q.isna()
            c_nn_q = int(mask_nn_q.sum())
            nn_examples_q = {}

            if c_nn_q > 0:
                vc_nn_q = s_q[mask_nn_q].value_counts()
                for val, cnt in vc_nn_q.items():
                    if len(nn_examples_q) < 10 or val in nn_examples_q:
                        nn_examples_q[val] = nn_examples_q.get(val, 0) + int(cnt)

            # --- Filter / detect SUSPICIOUSLY SMALL NON-ZERO numeric values ---
            mask_numeric_q = ~coerced_q.isna()
            mask_small_q = (
                mask_numeric_q
                & (coerced_q != 0)
                & (coerced_q.abs() < 0.01)  # Q_lag still uses the standard 0.01 threshold
            )

            c_small_match_q = int(mask_small_q.sum())   # matches
            c_small_dropped_q = 0                       # actually dropped rows
            small_examples_q = {}

            if c_small_match_q > 0:
                vc_small_q = s_q[mask_small_q].value_counts()
                for val, cnt in vc_small_q.items():
                    if len(small_examples_q) < 10 or val in small_examples_q:
                        small_examples_q[val] = small_examples_q.get(val, 0) + int(cnt)

                # Only DROP rows if this file is NOT in the exclusion list
                if fname not in FILES_WITHOUT_SMALL_FILTER:
                    df = df[~mask_small_q]
                    c_small_dropped_q = c_small_match_q
                # If the file is in FILES_WITHOUT_SMALL_FILTER, we only report.

            # --- Store per-column tracking ---
            qleg_non_numeric_stats[col] = {
                "count_non_numeric": c_nn_q,
                "example_values": dict(
                    sorted(nn_examples_q.items(), key=lambda x: -x[1])[:10]
                ),
            }

            qleg_small_stats[col] = {
                "count_small_magnitude": c_small_match_q,
                "count_dropped": c_small_dropped_q,
                "example_values": dict(
                    sorted(small_examples_q.items(), key=lambda x: -x[1])[:10]
                ),
            }

    # ===================================================================
    #                     WRITE CLEANED FILE
    # ===================================================================

    df.to_csv(tmp_path, index=False, sep=SEP)
    os.replace(tmp_path, path)

    # ===================================================================
    #                     STORE FILE STATISTICS
    # ===================================================================

    rows_dropped_annpit[fname] = total_dropped_empty

    non_numeric_in_value[fname] = {
        "count_non_numeric": nn_count,
        "example_values": dict(
            sorted(nn_examples.items(), key=lambda x: -x[1])[:10]
        ),
    }

    small_magnitude_in_value[fname] = {
        "count_small_magnitude": small_match_count,
        "count_dropped": small_dropped,
        "example_values": dict(
            sorted(small_examples.items(), key=lambda x: -x[1])[:10]
        ),
    }

    if qleg_non_numeric_stats:
        non_numeric_in_qleg[fname] = qleg_non_numeric_stats
        small_magnitude_in_qleg[fname] = qleg_small_stats


# =======================================================================
#                             FINAL REPORT
# =======================================================================

print("\nRows dropped due to empty 'AnnPITValue':")
for name, dropped in sorted(rows_dropped_annpit.items()):
    print(f"{name}: {dropped} rows dropped")

print("\nNon-numeric checks for 'AnnPITValue' column:")
for name, info in sorted(non_numeric_in_value.items()):
    c = info["count_non_numeric"]
    if c == 0:
        print(f"{name}: OK (all numeric)")
    else:
        print(f"{name}: {c} non-numeric entries. Examples: {info['example_values']}")

print("\nSuspiciously small non-zero 'AnnPITValue' values (per-file threshold):")
for name, info in sorted(small_magnitude_in_value.items()):
    m = info["count_small_magnitude"]
    d = info["count_dropped"]
    if m == 0:
        print(f"{name}: OK (no suspiciously small values)")
    elif name in FILES_WITH_ROUND_SMALL_TO_ZERO:
        print(
            f"{name}: {m} suspiciously small values rounded to 0 (no rows dropped). "
            f"Examples: {info['example_values']}"
        )
    elif d == 0:
        print(
            f"{name}: {m} suspiciously small values found, but 0 rows dropped "
            f"(filter disabled for this file). Examples: {info['example_values']}"
        )
    else:
        print(
            f"{name}: {d} rows dropped out of {m} suspiciously small values. "
            f"Examples: {info['example_values']}"
        )

print("\nQ_lag0..Q_lag7 checks (only for files in FILES_WITH_Q_LAG_FILTER):")
for name in sorted(non_numeric_in_qleg.keys()):
    print(f"\n{name}:")
    for col in Q_LAG_COLUMNS:
        if col not in non_numeric_in_qleg[name]:
            continue

        nn_info = non_numeric_in_qleg[name][col]
        sm_info = small_magnitude_in_qleg[name][col]

        c_nn = nn_info["count_non_numeric"]
        m_sm = sm_info["count_small_magnitude"]
        d_sm = sm_info["count_dropped"]

        if c_nn == 0 and m_sm == 0:
            print(f"  {col}: OK")
        else:
            print(f"  {col}: {c_nn} non-numeric, {m_sm} suspiciously small non-zero values")
            if c_nn > 0:
                print(f"    Non-numeric examples: {nn_info['example_values']}")
            if m_sm > 0 and d_sm == 0:
                print(
                    f"    Small-magnitude examples (no rows dropped for this file): "
                    f"{sm_info['example_values']}"
                )
            if d_sm > 0:
                print(
                    f"    Small-magnitude examples ({d_sm} rows dropped): "
                    f"{sm_info['example_values']}"
                )


[WARN] DroppedFromFFMerge.txt: no 'AnnPITValue' column found.
[WARN] FF_Benchmark_Factors_Merged_Clean.txt: no 'AnnPITValue' column found.
[WARN] FF_Benchmark_Factors_Merged_Clean_Replication.txt: no 'AnnPITValue' column found.
[WARN] Factors_Annual_Country.txt: no 'AnnPITValue' column found.
[WARN] Factors_Annual_Global.txt: no 'AnnPITValue' column found.
[WARN] Factors_Annual_Replication.txt: no 'AnnPITValue' column found.
[WARN] Factors_Daily_Country.txt: no 'AnnPITValue' column found.
[WARN] Factors_Daily_Global.txt: no 'AnnPITValue' column found.
[WARN] Factors_Daily_Replication.txt: no 'AnnPITValue' column found.
[WARN] data_Factors.txt: no 'AnnPITValue' column found.
[WARN] processed_data_Factors.txt: no 'AnnPITValue' column found.

Rows dropped due to empty 'AnnPITValue':
Accounts_Payable.txt: 0 rows dropped
Capital_Expenditures_Addtns_to_Fixed_Assets.txt: 589227 rows dropped
Cash_Dividends_Paid___Total.txt: 502590 rows dropped
Cash__Short_Term_Investments.txt: 0 rows dropped
C

### Remove Duplicates

In [7]:
import pandas as pd
import numpy as np
from pathlib import Path
import gc

# ============================================================================
# Summary:
# This cell loads each *.txt file in OUTPUT_DIR and removes duplicate rows
# based on a key consisting of:
#   ID, CompanyName, HistCurrency, FiscalPeriod, AnnPITValue_Period, and
#   AnnPITValue (rounded to two decimal places), while using PIT Date to
#   determine row ordering.
#
# Additionally, if the columns lag0, lag1, ..., lag7 exist in a dataset,
# they are treated as *part of the key* as well. That means:
#   - Rows are only considered duplicates if the base key AND all lag0–lag7
#     values match.
#   - If any Q_lag* value differs, the row is kept.
#
# If both 'Value' and 'AnnPITValue' exist, 'Value' is dropped automatically.
#
# NEW:
#   - Rows with missing/empty AnnPITValue_Period are removed before dedup.
#   - AnnPITValue_Period is included in the dedup key, so rows with the same
#     AnnPITValue but different periods (e.g., 'A' vs 'Q4') are not treated
#     as duplicates.
# ============================================================================

# === CONFIGURATION ===
OUTPUT_DIR = Path(Temp_file_path_A)
SEP = "|"
KEY_COLS_BASE = ["ID", "CompanyName", "HistCurrency", "FiscalPeriod"]
PERIOD_COL = "AnnPITValue_Period"
PIT_COL = "PIT Date"
LAG_COLS = [f"Q_lag{i}" for i in range(8)]
# =====================

dedup_dropped = {}

for path in sorted(OUTPUT_DIR.glob("*.txt")):
    fname = path.name

    try:
        df = pd.read_csv(path, sep=SEP)
    except Exception as e:
        print(f"[WARN] {fname}: could not load for dedup ({e})")
        continue

    # Drop 'Value' if both exist
    if "Value" in df.columns and "AnnPITValue" in df.columns:
        df = df.drop(columns=["Value"])

    # Ensure all required columns are present
    required_cols = KEY_COLS_BASE + ["AnnPITValue", PERIOD_COL, PIT_COL]
    missing = [c for c in required_cols if c not in df.columns]
    if missing:
        print(f"[WARN] {fname}: missing columns {missing} — skipped dedup.")
        del df
        gc.collect()
        continue

    # --- NEW: drop rows with missing/empty AnnPITValue_Period ---
    # Keep only rows where AnnPITValue_Period is not NaN and not empty/whitespace
    period_series = df[PERIOD_COL]
    mask_nonempty_period = period_series.notna() & (period_series.astype(str).str.strip() != "")
    rows_before_period_filter = len(df)
    df = df.loc[mask_nonempty_period].copy()
    rows_dropped_empty_period = rows_before_period_filter - len(df)
    if rows_dropped_empty_period > 0:
        print(f"{fname}: dropped {rows_dropped_empty_period} rows with empty {PERIOD_COL}")

    # If nothing remains after this filter, just overwrite the file with the empty df
    if df.empty:
        df.to_csv(path, index=False, sep=SEP)
        del df
        gc.collect()
        dedup_dropped[fname] = 0
        continue

    # Type conversions
    df[PIT_COL] = pd.to_datetime(df[PIT_COL], errors="coerce")
    df["FiscalPeriod"] = pd.to_numeric(df["FiscalPeriod"], errors="coerce").astype("Int64")

    annpit_num = pd.to_numeric(df["AnnPITValue"], errors="coerce")
    df["_AnnPIT_rounded"] = annpit_num.round(2)

    # Sorting
    sort_cols = ["ID", "HistCurrency", PIT_COL, "FiscalPeriod"]
    if PERIOD_COL in df.columns:
        sort_cols.append(PERIOD_COL)
    df = df.sort_values(by=sort_cols)

    # Deduplication key: include AnnPITValue_Period
    key_cols = KEY_COLS_BASE + [PERIOD_COL, "_AnnPIT_rounded"]

    # lag extension
    if all(col in df.columns for col in LAG_COLS):
        key_cols = key_cols + LAG_COLS

    # Deduplication
    dup_mask = df.duplicated(subset=key_cols, keep="first")
    dropped = int(dup_mask.sum())
    dedup_dropped[fname] = dropped

    df = df[~dup_mask].drop(columns=["_AnnPIT_rounded"])

    df.to_csv(path, index=False, sep=SEP)

    del df
    gc.collect()

# ---- FINAL SUMMARY OUTPUT ----
print(
    "\nRows dropped due to duplicates "
    "(ID+CompanyName+HistCurrency+FiscalPeriod+AnnPITValue_Period+AnnPITValue[2dp]"
    " [+Q_lag0–Q_lag7 if present]):"
)
for name, dropped in sorted(dedup_dropped.items()):
    print(f"{name}: {dropped} rows dropped")


[WARN] Capital_Expenditures_Addtns_to_Fixed_Assets.txt: missing columns ['AnnPITValue_Period'] — skipped dedup.
[WARN] Cash_Dividends_Paid___Total.txt: missing columns ['AnnPITValue_Period'] — skipped dedup.
[WARN] Com_Pfd_Redeemed_Retired_Converted_Etc..txt: missing columns ['AnnPITValue_Period'] — skipped dedup.
[WARN] Cost_of_Goods_Sold_Excl_Depreciation.txt: missing columns ['AnnPITValue_Period'] — skipped dedup.
[WARN] Depreciation_Depletion__Amortization.txt: missing columns ['AnnPITValue_Period'] — skipped dedup.
[WARN] Disposal_of_Fixed_Assets.txt: missing columns ['AnnPITValue_Period'] — skipped dedup.
[WARN] DroppedFromFFMerge.txt: missing columns ['CompanyName', 'AnnPITValue'] — skipped dedup.
[WARN] Earnings_Per_Share_Fiscal_Year_End.txt: missing columns ['AnnPITValue_Period'] — skipped dedup.
[WARN] Extraordinary_Items.txt: missing columns ['AnnPITValue_Period'] — skipped dedup.
[WARN] FF_Benchmark_Factors_Merged_Clean.txt: missing columns ['CompanyName', 'AnnPITValue'] — 

### Clean RAM

In [8]:
# ============================================================================
# Summary:
# This cell removes pipeline-related variables from the global namespace to
# reduce memory usage and avoid unintended side effects in later cells.
# After deleting variables that were created across the pipeline, garbage
# collection is triggered to free memory, and a confirmation message is printed.
# ============================================================================

# List of variable names to remove from the global namespace
to_delete = [
    "INPUT_DIR", "OUTPUT_DIR", "SEP", "CHUNK_SIZE",
    "rows_dropped_annpit", "non_numeric_in_value",
    "dedup_dropped", "KEY_COLS_BASE", "PIT_COL",
    "path", "fname", "tmp_path",
    "total_dropped", "nn_count", "nn_examples",
    "reader", "first_chunk_written", "saw_no_annpit_warn", "saw_no_value_warn",
    "dup_mask", "key_cols", "annpit_num",
]

# Loop through the list and delete each variable if it exists
for var in to_delete:
    if var in globals():
        del globals()[var]

# Trigger garbage collection to clean up unused objects
gc.collect()

# Confirmation output
print("Pipeline-specific variables cleared from memory.")


Pipeline-specific variables cleared from memory.


# 2.0. Anomalies

## Gather Required Inputs per Anomaly in PIT Format

#### Acc

In [9]:
"""
Summary:
This cell
1) reads multiple input files from OUTPUT_DIR,
2) builds a base table with all unique combinations of
   (ID, PIT Date, HistCurrency, FiscalPeriod, AnnPITValue_Period),
3) for each input file, performs an "as-of" merge:
   for every base row it takes the latest AnnPITValue from that dataset
   with the same (ID, HistCurrency, FiscalPeriod) and PIT Date <= base PIT Date.
   If the dataset has AnnPITValue_Period, the merge is also grouped by that
   period label; if it does NOT have AnnPITValue_Period, it is treated as
   period-agnostic ("can fit all" periods).
4) writes the final combined view to OUTPUT_FILE in OUTPUT_DIR.

Additional tracking added:
- For each imported dataset, the code prints how many unique
  firm-year combinations exist (unique ID × FiscalPeriod).
- After all merging is complete, the code counts how many
  firm-year combinations have **no missing values** across ANY
  of the added value columns.
- At the end, the code reports how many missing values exist for each
  value column (ca, cce, cl, etc.).

The final table has the columns:
    ID, PIT Date, HistCurrency, FiscalPeriod, AnnPITValue_Period,
    Value1, Value2, ..., ValueN
where each ValueX is defined by VALUE_COLUMN_NAMES in the config.
"""

from pathlib import Path
import time
import pandas as pd

# === CONFIG ===

OUTPUT_DIR = Path(Temp_file_path_A)
SEP = "|"

INPUT_FILES = [
    "Total_Assets.txt",
    "Current_Assets___Total.txt",
    "Cash__Short_Term_Investments.txt",
    "Current_Liabilities___Total.txt",
    "Short_Term_Debt__Current_Portion_of_LT_Debt.txt",
    "Income_Taxes_Payable.txt",
    "Depreciation_Depletion__Amortization.txt",
    # add more file names here if needed ...
]

VALUE_COLUMN_NAMES = [
    "at",
    "ca",
    "cce",
    "cl",
    "std",
    "itp",
    "da",
    # add more names here, one for each input file ...
]

OUTPUT_FILE = "data_Acc.txt"

ID_COL = "ID"
PIT_DATE_COL = "PIT Date"
HIST_CURR_COL = "HistCurrency"
FISCAL_PER_COL = "FiscalPeriod"
VALUE_COL = "AnnPITValue"
PERIOD_COL = "AnnPITValue_Period"   # NEW: period label column

BASE_COLS = [ID_COL, PIT_DATE_COL, HIST_CURR_COL, FISCAL_PER_COL]

SAVE_INTERMEDIATE = False
# ==============


# --- SANITY CHECK ---
if not OUTPUT_DIR.exists():
    raise FileNotFoundError(
        f"OUTPUT_DIR does not exist:\n{OUTPUT_DIR}\n"
        f"Please make sure Temp_file_path_A is set correctly."
    )
print("Using data folder:", OUTPUT_DIR.resolve())
# ---------------------


def load_dataset(path: Path) -> pd.DataFrame:
    """
    Load a single dataset from path and keep only the relevant columns.

    The function:
    - checks if the file exists,
    - reads it using the configured separator,
    - checks that all required columns are present,
    - keeps AnnPITValue_Period if present (and creates it as NA if not),
    - converts PIT Date to datetime,
    - casts ID to string,
    - converts AnnPITValue to numeric.
    """
    if not path.exists():
        raise FileNotFoundError(f"File not found: {path}")

    df = pd.read_csv(path, sep=SEP)

    needed_cols = BASE_COLS + [VALUE_COL]
    missing = [c for c in needed_cols if c not in df.columns]
    if missing:
        raise ValueError(f"Required columns {missing} are missing in file: {path}")

    # Keep base + AnnPITValue + (optional) AnnPITValue_Period
    extra_cols = [PERIOD_COL] if PERIOD_COL in df.columns else []
    df = df[needed_cols + extra_cols].copy()

    # Ensure AnnPITValue_Period exists in all datasets
    if PERIOD_COL not in df.columns:
        df[PERIOD_COL] = pd.NA

    # Re-order columns for consistency
    df = df[BASE_COLS + [PERIOD_COL, VALUE_COL]]

    # Ensure consistent data types
    df[PIT_DATE_COL] = pd.to_datetime(df[PIT_DATE_COL])
    df[ID_COL] = df[ID_COL].astype(str)
    df[VALUE_COL] = pd.to_numeric(df[VALUE_COL], errors="coerce")

    return df


def build_base_frame(dfs: list[pd.DataFrame]) -> pd.DataFrame:
    """
    Build the base dataset from a list of dataframes.

    Steps:
    - stack the identifier columns (including AnnPITValue_Period) from all dataframes,
    - drop duplicate combinations,
    - enforce correct data types,
    - sort the base dataset by (ID, HistCurrency, FiscalPeriod, AnnPITValue_Period, PIT Date).

    The result is the "skeleton" upon which all value columns will be merged.
    """
    # Concatenate identifier columns from all datasets (base cols + period col)
    base = pd.concat(
        [df[BASE_COLS + [PERIOD_COL]] for df in dfs],
        ignore_index=True
    )

    # Remove duplicate rows of identifiers
    base = base.drop_duplicates().reset_index(drop=True)

    # Ensure consistent types
    base[PIT_DATE_COL] = pd.to_datetime(base[PIT_DATE_COL])
    base[ID_COL] = base[ID_COL].astype(str)

    # Sort by identifier columns and date
    sort_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL, PERIOD_COL, PIT_DATE_COL]
    base = base.sort_values(sort_cols).reset_index(drop=True)

    return base


def asof_merge_one(base: pd.DataFrame, df: pd.DataFrame, new_col_name: str) -> pd.DataFrame:
    """
    Perform an "as-of" merge of one dataset into the base dataframe.

    Concept:
    - For each combination of (ID, HistCurrency, FiscalPeriod)
      and for each PIT Date in the base table,
      we want the latest AnnPITValue from df with
          df.PIT Date <= base.PIT Date
      for the same (ID, HistCurrency, FiscalPeriod).
    - If df has a non-empty AnnPITValue_Period column, the as-of grouping
      is also done by AnnPITValue_Period.
      If AnnPITValue_Period is all missing (e.g. revenue), df is treated
      as period-agnostic and can "fit all" periods.

    Implementation (vectorized, without merge_asof):
      (same as before, but grouping keys dynamically include PERIOD_COL
       only for period-aware datasets)
    """
    # Determine if this dataset is period-aware (has any non-NA period labels)
    has_period = PERIOD_COL in df.columns and df[PERIOD_COL].notna().any()

    # Work on copies to avoid modifying original base/df
    base_tmp = base.copy()
    base_tmp[new_col_name] = pd.NA
    base_tmp["__marker"] = "base"

    # Keep only identifier columns and the value column from df, then rename
    if has_period:
        df_tmp = df[BASE_COLS + [PERIOD_COL, VALUE_COL]].copy()
    else:
        df_tmp = df[BASE_COLS + [VALUE_COL]].copy()
        # Ensure PERIOD_COL exists but remains NA (period-agnostic)
        df_tmp[PERIOD_COL] = pd.NA

    df_tmp = df_tmp.rename(columns={VALUE_COL: new_col_name})
    df_tmp["__marker"] = "df"

    # Ensure consistent types for safety
    base_tmp[ID_COL] = base_tmp[ID_COL].astype(str)
    df_tmp[ID_COL] = df_tmp[ID_COL].astype(str)
    base_tmp[PIT_DATE_COL] = pd.to_datetime(base_tmp[PIT_DATE_COL])
    df_tmp[PIT_DATE_COL] = pd.to_datetime(df_tmp[PIT_DATE_COL])

    base_tmp[new_col_name] = pd.to_numeric(base_tmp[new_col_name], errors="coerce")
    df_tmp[new_col_name] = pd.to_numeric(df_tmp[new_col_name], errors="coerce")

    # Concatenate base rows and df rows
    combined = pd.concat([base_tmp, df_tmp], ignore_index=True)

    # Define an order so that df rows come before base rows on the same PIT Date
    marker_order = {"df": 0, "base": 1}
    combined["__order"] = combined["__marker"].map(marker_order).astype("int8")

    # Build sort and group keys
    group_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL]
    sort_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL]

    if has_period:
        # Period-aware dataset: also group by AnnPITValue_Period
        group_cols.append(PERIOD_COL)
        sort_cols.append(PERIOD_COL)

    sort_cols += [PIT_DATE_COL, "__order"]

    # Sort accordingly
    combined = combined.sort_values(sort_cols)

    # Forward-fill the value within each group to implement "as-of" logic
    combined[new_col_name] = combined.groupby(group_cols)[new_col_name].ffill()

    # Keep only rows that belong to the base dataset
    result = combined[combined["__marker"] == "base"].copy()

    # Drop helper columns and sort final result
    result = result.drop(columns=["__marker", "__order"])
    result = result.sort_values(
        [ID_COL, HIST_CURR_COL, FISCAL_PER_COL, PERIOD_COL, PIT_DATE_COL]
    ).reset_index(drop=True)

    return result


def build_and_save_variable(
    input_files,
    value_column_names,
    output_file,
    save_intermediate: bool = False,
) -> Path:
    """
    Build the final "view" for a variable based on multiple input files and save it.

    Steps:
    1) Validate arguments (non-empty, same length for files and column names).
    2) Load and preprocess each input file.
    3) Build the base dataset with all unique identifier combinations
       (including AnnPITValue_Period).
    4) For each input dataframe, perform an as-of merge of its AnnPITValue into the base,
       using AnnPITValue_Period in the grouping if present in that dataset.
    5) Keep only the base columns and the value columns.
    6) Write the final result to output_file in OUTPUT_DIR.
    """
    if len(input_files) == 0:
        raise ValueError("No INPUT_FILES were provided.")
    if len(input_files) != len(value_column_names):
        raise ValueError("INPUT_FILES and VALUE_COLUMN_NAMES must have the same length.")

    start_total = time.time()

    # Build full paths
    paths = [OUTPUT_DIR / f for f in input_files]

    print("\n--- Loading input files ---")
    t0 = time.time()
    dfs = [load_dataset(p) for p in paths]
    print(f"Loading and preprocessing finished in {time.time() - t0:.1f} seconds.")

    # Unique firm-year counts per input dataset
    print("\n--- Unique firm-year (ID, FiscalPeriod) counts per input file ---")
    for path, df in zip(paths, dfs):
        n_firm_years = df[[ID_COL, FISCAL_PER_COL]].drop_duplicates().shape[0]
        print(f"{path.name}: {n_firm_years:,} unique (ID, FiscalPeriod) combinations")

    print("\n--- Building base dataset ---")
    t0 = time.time()
    base = build_base_frame(dfs)
    print(
        f"Base dataset has {len(base):,} rows and was built in "
        f"{time.time() - t0:.1f} seconds."
    )

    print("\n--- Starting as-of merges ---")
    result = base
    for idx, (df, col_name) in enumerate(zip(dfs, value_column_names), start=1):
        print(f"[{idx}/{len(dfs)}] Merging value column '{col_name}' ...")
        t_merge = time.time()
        result = asof_merge_one(result, df, col_name)
        print(
            f"    Done in {time.time() - t_merge:.1f} seconds. "
            f"Result currently has {len(result):,} rows."
        )

        if save_intermediate:
            stem = output_file.rsplit(".", 1)[0]
            temp_out = OUTPUT_DIR / f"{stem}_partial_{idx}.txt"
            result.to_csv(temp_out, sep=SEP, index=False)
            print(f"    Intermediate file written to: {temp_out}")

    # Keep only the base identifier columns (including period) and the value columns
    final_cols = BASE_COLS + [PERIOD_COL] + value_column_names
    result = result[final_cols]

    # Final stats on firm-years and missing values
    print("\n--- Final dataset statistics ---")
    mask_complete = result[value_column_names].notna().all(axis=1)
    complete_firm_years = (
        result.loc[mask_complete, [ID_COL, FISCAL_PER_COL]]
        .drop_duplicates()
    )
    n_complete_firm_years = complete_firm_years.shape[0]
    print(
        f"Unique (ID, FiscalPeriod) combinations with ALL value columns non-empty: "
        f"{n_complete_firm_years:,}"
    )

    print("\nEmpty (NaN) values per value column:")
    for col in value_column_names:
        n_missing = result[col].isna().sum()
        print(f"  - {col}: {n_missing:,} empty values")

    # Final output
    out_path = OUTPUT_DIR / output_file
    result.to_csv(out_path, sep=SEP, index=False)

    print(f"\nFinal view written to:\n{out_path.resolve()}")
    print(f"Total runtime: {time.time() - start_total:.1f} seconds.")

    return out_path


# --- Execution ---
out_path = build_and_save_variable(
    input_files=INPUT_FILES,
    value_column_names=VALUE_COLUMN_NAMES,
    output_file=OUTPUT_FILE,
    save_intermediate=SAVE_INTERMEDIATE,
)


Using data folder: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies

--- Loading input files ---
Loading and preprocessing finished in 13.4 seconds.

--- Unique firm-year (ID, FiscalPeriod) counts per input file ---
Total_Assets.txt: 871,391 unique (ID, FiscalPeriod) combinations
Current_Assets___Total.txt: 876,857 unique (ID, FiscalPeriod) combinations
Cash__Short_Term_Investments.txt: 875,450 unique (ID, FiscalPeriod) combinations
Current_Liabilities___Total.txt: 878,053 unique (ID, FiscalPeriod) combinations
Short_Term_Debt__Current_Portion_of_LT_Debt.txt: 852,733 unique (ID, FiscalPeriod) combinations
Income_Taxes_Payable.txt: 559,826 unique (ID, FiscalPeriod) combinations
Depreciation_Depletion__Amortization.txt: 806,654 unique (ID, FiscalPeriod) combinations

--- Building base dataset ---
Base dataset has 4,218,013 rows and was built in 8.1 seconds.

--- Starting as-of merges ---
[1/7] Merging value column 'at' ...
    Done in 8.7 seconds. Result currently has 4,218,013 r

#### Ag

In [10]:
"""
Summary:
This cell
1) reads multiple input files from OUTPUT_DIR,
2) builds a base table with all unique combinations of
   (ID, PIT Date, HistCurrency, FiscalPeriod, AnnPITValue_Period),
3) for each input file, performs an "as-of" merge:
   for every base row it takes the latest AnnPITValue from that dataset
   with the same (ID, HistCurrency, FiscalPeriod) and PIT Date <= base PIT Date.
   If the dataset has AnnPITValue_Period, the merge is also grouped by that
   period label; if it does NOT have AnnPITValue_Period, it is treated as
   period-agnostic ("can fit all" periods).
4) writes the final combined view to OUTPUT_FILE in OUTPUT_DIR.

Additional tracking added:
- For each imported dataset, the code prints how many unique
  firm-year combinations exist (unique ID × FiscalPeriod).
- After all merging is complete, the code counts how many
  firm-year combinations have **no missing values** across ANY
  of the added value columns.
- At the end, the code reports how many missing values exist for each
  value column (ca, cce, cl, etc.).

The final table has the columns:
    ID, PIT Date, HistCurrency, FiscalPeriod, AnnPITValue_Period,
    Value1, Value2, ..., ValueN
where each ValueX is defined by VALUE_COLUMN_NAMES in the config.
"""


from pathlib import Path
import time
import pandas as pd

# === CONFIG ===

# Folder where all input files are stored and where the output will be written.
# Temp_file_path_A must already be defined in your notebook, for example:
# Temp_file_path_A = f"{BASE_PATH}/Temp/TempAnomalies"
OUTPUT_DIR = Path(Temp_file_path_A)

# Field separator in all input and output text files
SEP = "|"

# Input files (1..N). They are all expected to be located in OUTPUT_DIR.
INPUT_FILES = [
    "Total_Assets.txt",

    # add more file names here if needed ...
]

# Names for the value columns corresponding to INPUT_FILES (same length as INPUT_FILES)
VALUE_COLUMN_NAMES = [
    "at",
    # add more names here, one for each input file ...
]

# Name of the final output file (will be written to OUTPUT_DIR)
OUTPUT_FILE = "data_Ag.txt"

# Column names in the input files (assumed to be identical in all files)
ID_COL = "ID"
PIT_DATE_COL = "PIT Date"
HIST_CURR_COL = "HistCurrency"
FISCAL_PER_COL = "FiscalPeriod"
VALUE_COL = "AnnPITValue"

ID_COL = "ID"
PIT_DATE_COL = "PIT Date"
HIST_CURR_COL = "HistCurrency"
FISCAL_PER_COL = "FiscalPeriod"
VALUE_COL = "AnnPITValue"
PERIOD_COL = "AnnPITValue_Period"   # NEW: period label column

BASE_COLS = [ID_COL, PIT_DATE_COL, HIST_CURR_COL, FISCAL_PER_COL]

SAVE_INTERMEDIATE = False
# ==============


# --- SANITY CHECK ---
if not OUTPUT_DIR.exists():
    raise FileNotFoundError(
        f"OUTPUT_DIR does not exist:\n{OUTPUT_DIR}\n"
        f"Please make sure Temp_file_path_A is set correctly."
    )
print("Using data folder:", OUTPUT_DIR.resolve())
# ---------------------


def load_dataset(path: Path) -> pd.DataFrame:
    """
    Load a single dataset from path and keep only the relevant columns.

    The function:
    - checks if the file exists,
    - reads it using the configured separator,
    - checks that all required columns are present,
    - keeps AnnPITValue_Period if present (and creates it as NA if not),
    - converts PIT Date to datetime,
    - casts ID to string,
    - converts AnnPITValue to numeric.
    """
    if not path.exists():
        raise FileNotFoundError(f"File not found: {path}")

    df = pd.read_csv(path, sep=SEP)

    needed_cols = BASE_COLS + [VALUE_COL]
    missing = [c for c in needed_cols if c not in df.columns]
    if missing:
        raise ValueError(f"Required columns {missing} are missing in file: {path}")

    # Keep base + AnnPITValue + (optional) AnnPITValue_Period
    extra_cols = [PERIOD_COL] if PERIOD_COL in df.columns else []
    df = df[needed_cols + extra_cols].copy()

    # Ensure AnnPITValue_Period exists in all datasets
    if PERIOD_COL not in df.columns:
        df[PERIOD_COL] = pd.NA

    # Re-order columns for consistency
    df = df[BASE_COLS + [PERIOD_COL, VALUE_COL]]

    # Ensure consistent data types
    df[PIT_DATE_COL] = pd.to_datetime(df[PIT_DATE_COL])
    df[ID_COL] = df[ID_COL].astype(str)
    df[VALUE_COL] = pd.to_numeric(df[VALUE_COL], errors="coerce")

    return df


def build_base_frame(dfs: list[pd.DataFrame]) -> pd.DataFrame:
    """
    Build the base dataset from a list of dataframes.

    Steps:
    - stack the identifier columns (including AnnPITValue_Period) from all dataframes,
    - drop duplicate combinations,
    - enforce correct data types,
    - sort the base dataset by (ID, HistCurrency, FiscalPeriod, AnnPITValue_Period, PIT Date).

    The result is the "skeleton" upon which all value columns will be merged.
    """
    # Concatenate identifier columns from all datasets (base cols + period col)
    base = pd.concat(
        [df[BASE_COLS + [PERIOD_COL]] for df in dfs],
        ignore_index=True
    )

    # Remove duplicate rows of identifiers
    base = base.drop_duplicates().reset_index(drop=True)

    # Ensure consistent types
    base[PIT_DATE_COL] = pd.to_datetime(base[PIT_DATE_COL])
    base[ID_COL] = base[ID_COL].astype(str)

    # Sort by identifier columns and date
    sort_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL, PERIOD_COL, PIT_DATE_COL]
    base = base.sort_values(sort_cols).reset_index(drop=True)

    return base


def asof_merge_one(base: pd.DataFrame, df: pd.DataFrame, new_col_name: str) -> pd.DataFrame:
    """
    Perform an "as-of" merge of one dataset into the base dataframe.

    Concept:
    - For each combination of (ID, HistCurrency, FiscalPeriod)
      and for each PIT Date in the base table,
      we want the latest AnnPITValue from df with
          df.PIT Date <= base.PIT Date
      for the same (ID, HistCurrency, FiscalPeriod).
    - If df has a non-empty AnnPITValue_Period column, the as-of grouping
      is also done by AnnPITValue_Period.
      If AnnPITValue_Period is all missing (e.g. revenue), df is treated
      as period-agnostic and can "fit all" periods.

    Implementation (vectorized, without merge_asof):
      (same as before, but grouping keys dynamically include PERIOD_COL
       only for period-aware datasets)
    """
    # Determine if this dataset is period-aware (has any non-NA period labels)
    has_period = PERIOD_COL in df.columns and df[PERIOD_COL].notna().any()

    # Work on copies to avoid modifying original base/df
    base_tmp = base.copy()
    base_tmp[new_col_name] = pd.NA
    base_tmp["__marker"] = "base"

    # Keep only identifier columns and the value column from df, then rename
    if has_period:
        df_tmp = df[BASE_COLS + [PERIOD_COL, VALUE_COL]].copy()
    else:
        df_tmp = df[BASE_COLS + [VALUE_COL]].copy()
        # Ensure PERIOD_COL exists but remains NA (period-agnostic)
        df_tmp[PERIOD_COL] = pd.NA

    df_tmp = df_tmp.rename(columns={VALUE_COL: new_col_name})
    df_tmp["__marker"] = "df"

    # Ensure consistent types for safety
    base_tmp[ID_COL] = base_tmp[ID_COL].astype(str)
    df_tmp[ID_COL] = df_tmp[ID_COL].astype(str)
    base_tmp[PIT_DATE_COL] = pd.to_datetime(base_tmp[PIT_DATE_COL])
    df_tmp[PIT_DATE_COL] = pd.to_datetime(df_tmp[PIT_DATE_COL])

    base_tmp[new_col_name] = pd.to_numeric(base_tmp[new_col_name], errors="coerce")
    df_tmp[new_col_name] = pd.to_numeric(df_tmp[new_col_name], errors="coerce")

    # Concatenate base rows and df rows
    combined = pd.concat([base_tmp, df_tmp], ignore_index=True)

    # Define an order so that df rows come before base rows on the same PIT Date
    marker_order = {"df": 0, "base": 1}
    combined["__order"] = combined["__marker"].map(marker_order).astype("int8")

    # Build sort and group keys
    group_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL]
    sort_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL]

    if has_period:
        # Period-aware dataset: also group by AnnPITValue_Period
        group_cols.append(PERIOD_COL)
        sort_cols.append(PERIOD_COL)

    sort_cols += [PIT_DATE_COL, "__order"]

    # Sort accordingly
    combined = combined.sort_values(sort_cols)

    # Forward-fill the value within each group to implement "as-of" logic
    combined[new_col_name] = combined.groupby(group_cols)[new_col_name].ffill()

    # Keep only rows that belong to the base dataset
    result = combined[combined["__marker"] == "base"].copy()

    # Drop helper columns and sort final result
    result = result.drop(columns=["__marker", "__order"])
    result = result.sort_values(
        [ID_COL, HIST_CURR_COL, FISCAL_PER_COL, PERIOD_COL, PIT_DATE_COL]
    ).reset_index(drop=True)

    return result


def build_and_save_variable(
    input_files,
    value_column_names,
    output_file,
    save_intermediate: bool = False,
) -> Path:
    """
    Build the final "view" for a variable based on multiple input files and save it.

    Steps:
    1) Validate arguments (non-empty, same length for files and column names).
    2) Load and preprocess each input file.
    3) Build the base dataset with all unique identifier combinations
       (including AnnPITValue_Period).
    4) For each input dataframe, perform an as-of merge of its AnnPITValue into the base,
       using AnnPITValue_Period in the grouping if present in that dataset.
    5) Keep only the base columns and the value columns.
    6) Write the final result to output_file in OUTPUT_DIR.
    """
    if len(input_files) == 0:
        raise ValueError("No INPUT_FILES were provided.")
    if len(input_files) != len(value_column_names):
        raise ValueError("INPUT_FILES and VALUE_COLUMN_NAMES must have the same length.")

    start_total = time.time()

    # Build full paths
    paths = [OUTPUT_DIR / f for f in input_files]

    print("\n--- Loading input files ---")
    t0 = time.time()
    dfs = [load_dataset(p) for p in paths]
    print(f"Loading and preprocessing finished in {time.time() - t0:.1f} seconds.")

    # Unique firm-year counts per input dataset
    print("\n--- Unique firm-year (ID, FiscalPeriod) counts per input file ---")
    for path, df in zip(paths, dfs):
        n_firm_years = df[[ID_COL, FISCAL_PER_COL]].drop_duplicates().shape[0]
        print(f"{path.name}: {n_firm_years:,} unique (ID, FiscalPeriod) combinations")

    print("\n--- Building base dataset ---")
    t0 = time.time()
    base = build_base_frame(dfs)
    print(
        f"Base dataset has {len(base):,} rows and was built in "
        f"{time.time() - t0:.1f} seconds."
    )

    print("\n--- Starting as-of merges ---")
    result = base
    for idx, (df, col_name) in enumerate(zip(dfs, value_column_names), start=1):
        print(f"[{idx}/{len(dfs)}] Merging value column '{col_name}' ...")
        t_merge = time.time()
        result = asof_merge_one(result, df, col_name)
        print(
            f"    Done in {time.time() - t_merge:.1f} seconds. "
            f"Result currently has {len(result):,} rows."
        )

        if save_intermediate:
            stem = output_file.rsplit(".", 1)[0]
            temp_out = OUTPUT_DIR / f"{stem}_partial_{idx}.txt"
            result.to_csv(temp_out, sep=SEP, index=False)
            print(f"    Intermediate file written to: {temp_out}")

    # Keep only the base identifier columns (including period) and the value columns
    final_cols = BASE_COLS + [PERIOD_COL] + value_column_names
    result = result[final_cols]

    # Final stats on firm-years and missing values
    print("\n--- Final dataset statistics ---")
    mask_complete = result[value_column_names].notna().all(axis=1)
    complete_firm_years = (
        result.loc[mask_complete, [ID_COL, FISCAL_PER_COL]]
        .drop_duplicates()
    )
    n_complete_firm_years = complete_firm_years.shape[0]
    print(
        f"Unique (ID, FiscalPeriod) combinations with ALL value columns non-empty: "
        f"{n_complete_firm_years:,}"
    )

    print("\nEmpty (NaN) values per value column:")
    for col in value_column_names:
        n_missing = result[col].isna().sum()
        print(f"  - {col}: {n_missing:,} empty values")

    # Final output
    out_path = OUTPUT_DIR / output_file
    result.to_csv(out_path, sep=SEP, index=False)

    print(f"\nFinal view written to:\n{out_path.resolve()}")
    print(f"Total runtime: {time.time() - start_total:.1f} seconds.")

    return out_path


# --- Execution ---
out_path = build_and_save_variable(
    input_files=INPUT_FILES,
    value_column_names=VALUE_COLUMN_NAMES,
    output_file=OUTPUT_FILE,
    save_intermediate=SAVE_INTERMEDIATE,
)


Using data folder: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies

--- Loading input files ---
Loading and preprocessing finished in 2.4 seconds.

--- Unique firm-year (ID, FiscalPeriod) counts per input file ---
Total_Assets.txt: 871,391 unique (ID, FiscalPeriod) combinations

--- Building base dataset ---
Base dataset has 2,354,025 rows and was built in 1.6 seconds.

--- Starting as-of merges ---
[1/1] Merging value column 'at' ...
    Done in 5.0 seconds. Result currently has 2,354,025 rows.

--- Final dataset statistics ---
Unique (ID, FiscalPeriod) combinations with ALL value columns non-empty: 871,391

Empty (NaN) values per value column:
  - at: 0 empty values

Final view written to:
/home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/data_Ag.txt
Total runtime: 20.1 seconds.


#### At

In [11]:
"""
Summary:
This cell
1) reads multiple input files from OUTPUT_DIR,
2) builds a base table with all unique combinations of
   (ID, PIT Date, HistCurrency, FiscalPeriod, AnnPITValue_Period),
3) for each input file, performs an "as-of" merge:
   for every base row it takes the latest AnnPITValue from that dataset
   with the same (ID, HistCurrency, FiscalPeriod) and PIT Date <= base PIT Date.
   If the dataset has AnnPITValue_Period, the merge is also grouped by that
   period label; if it does NOT have AnnPITValue_Period, it is treated as
   period-agnostic ("can fit all" periods).
4) writes the final combined view to OUTPUT_FILE in OUTPUT_DIR.

Additional tracking added:
- For each imported dataset, the code prints how many unique
  firm-year combinations exist (unique ID × FiscalPeriod).
- After all merging is complete, the code counts how many
  firm-year combinations have **no missing values** across ANY
  of the added value columns.
- At the end, the code reports how many missing values exist for each
  value column (ca, cce, cl, etc.).

The final table has the columns:
    ID, PIT Date, HistCurrency, FiscalPeriod, AnnPITValue_Period,
    Value1, Value2, ..., ValueN
where each ValueX is defined by VALUE_COLUMN_NAMES in the config.
"""


from pathlib import Path
import time
import pandas as pd

# === CONFIG ===

# Folder where all input files are stored and where the output will be written.
# Temp_file_path_A must already be defined in your notebook, for example:
# Temp_file_path_A = f"{BASE_PATH}/Temp/TempAnomalies"
OUTPUT_DIR = Path(Temp_file_path_A)

# Field separator in all input and output text files
SEP = "|"

# Input files (1..N). They are all expected to be located in OUTPUT_DIR.
INPUT_FILES = [
    "Net_Sales_or_Revenues.txt",
    "Total_Assets.txt",
    "Cash__Short_Term_Investments.txt",
    "Long_Term_Debt.txt",
    "Minority_Interest.txt",
    "Preferred_Stock.txt",
    "Common_Equity.txt",
    # add more file names here if needed ...
]

# Names for the value columns corresponding to INPUT_FILES (same length as INPUT_FILES)
VALUE_COLUMN_NAMES = [
    "rev",
    "at",
    "cce",
    "ltd",
    "mi",
    "ps",
    "ce",
    # add more names here, one for each input file ...
]

# Name of the final output file (will be written to OUTPUT_DIR)
OUTPUT_FILE = "data_At.txt"

# Column names in the input files (assumed to be identical in all files)
ID_COL = "ID"
PIT_DATE_COL = "PIT Date"
HIST_CURR_COL = "HistCurrency"
FISCAL_PER_COL = "FiscalPeriod"
VALUE_COL = "AnnPITValue"
PERIOD_COL = "AnnPITValue_Period"   # NEW: period label column

BASE_COLS = [ID_COL, PIT_DATE_COL, HIST_CURR_COL, FISCAL_PER_COL]

SAVE_INTERMEDIATE = False
# ==============


# --- SANITY CHECK ---
if not OUTPUT_DIR.exists():
    raise FileNotFoundError(
        f"OUTPUT_DIR does not exist:\n{OUTPUT_DIR}\n"
        f"Please make sure Temp_file_path_A is set correctly."
    )
print("Using data folder:", OUTPUT_DIR.resolve())
# ---------------------


def load_dataset(path: Path) -> pd.DataFrame:
    """
    Load a single dataset from path and keep only the relevant columns.

    The function:
    - checks if the file exists,
    - reads it using the configured separator,
    - checks that all required columns are present,
    - keeps AnnPITValue_Period if present (and creates it as NA if not),
    - converts PIT Date to datetime,
    - casts ID to string,
    - converts AnnPITValue to numeric.
    """
    if not path.exists():
        raise FileNotFoundError(f"File not found: {path}")

    df = pd.read_csv(path, sep=SEP)

    needed_cols = BASE_COLS + [VALUE_COL]
    missing = [c for c in needed_cols if c not in df.columns]
    if missing:
        raise ValueError(f"Required columns {missing} are missing in file: {path}")

    # Keep base + AnnPITValue + (optional) AnnPITValue_Period
    extra_cols = [PERIOD_COL] if PERIOD_COL in df.columns else []
    df = df[needed_cols + extra_cols].copy()

    # Ensure AnnPITValue_Period exists in all datasets
    if PERIOD_COL not in df.columns:
        df[PERIOD_COL] = pd.NA

    # Re-order columns for consistency
    df = df[BASE_COLS + [PERIOD_COL, VALUE_COL]]

    # Ensure consistent data types
    df[PIT_DATE_COL] = pd.to_datetime(df[PIT_DATE_COL])
    df[ID_COL] = df[ID_COL].astype(str)
    df[VALUE_COL] = pd.to_numeric(df[VALUE_COL], errors="coerce")

    return df


def build_base_frame(dfs: list[pd.DataFrame]) -> pd.DataFrame:
    """
    Build the base dataset from a list of dataframes.

    Steps:
    - stack the identifier columns (including AnnPITValue_Period) from all dataframes,
    - drop duplicate combinations,
    - enforce correct data types,
    - sort the base dataset by (ID, HistCurrency, FiscalPeriod, AnnPITValue_Period, PIT Date).

    The result is the "skeleton" upon which all value columns will be merged.
    """
    # Concatenate identifier columns from all datasets (base cols + period col)
    base = pd.concat(
        [df[BASE_COLS + [PERIOD_COL]] for df in dfs],
        ignore_index=True
    )

    # Remove duplicate rows of identifiers
    base = base.drop_duplicates().reset_index(drop=True)

    # Ensure consistent types
    base[PIT_DATE_COL] = pd.to_datetime(base[PIT_DATE_COL])
    base[ID_COL] = base[ID_COL].astype(str)

    # Sort by identifier columns and date
    sort_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL, PERIOD_COL, PIT_DATE_COL]
    base = base.sort_values(sort_cols).reset_index(drop=True)

    return base


def asof_merge_one(base: pd.DataFrame, df: pd.DataFrame, new_col_name: str) -> pd.DataFrame:
    """
    Perform an "as-of" merge of one dataset into the base dataframe.

    Concept:
    - For each combination of (ID, HistCurrency, FiscalPeriod)
      and for each PIT Date in the base table,
      we want the latest AnnPITValue from df with
          df.PIT Date <= base.PIT Date
      for the same (ID, HistCurrency, FiscalPeriod).
    - If df has a non-empty AnnPITValue_Period column, the as-of grouping
      is also done by AnnPITValue_Period.
      If AnnPITValue_Period is all missing (e.g. revenue), df is treated
      as period-agnostic and can "fit all" periods.

    Implementation (vectorized, without merge_asof):
      (same as before, but grouping keys dynamically include PERIOD_COL
       only for period-aware datasets)
    """
    # Determine if this dataset is period-aware (has any non-NA period labels)
    has_period = PERIOD_COL in df.columns and df[PERIOD_COL].notna().any()

    # Work on copies to avoid modifying original base/df
    base_tmp = base.copy()
    base_tmp[new_col_name] = pd.NA
    base_tmp["__marker"] = "base"

    # Keep only identifier columns and the value column from df, then rename
    if has_period:
        df_tmp = df[BASE_COLS + [PERIOD_COL, VALUE_COL]].copy()
    else:
        df_tmp = df[BASE_COLS + [VALUE_COL]].copy()
        # Ensure PERIOD_COL exists but remains NA (period-agnostic)
        df_tmp[PERIOD_COL] = pd.NA

    df_tmp = df_tmp.rename(columns={VALUE_COL: new_col_name})
    df_tmp["__marker"] = "df"

    # Ensure consistent types for safety
    base_tmp[ID_COL] = base_tmp[ID_COL].astype(str)
    df_tmp[ID_COL] = df_tmp[ID_COL].astype(str)
    base_tmp[PIT_DATE_COL] = pd.to_datetime(base_tmp[PIT_DATE_COL])
    df_tmp[PIT_DATE_COL] = pd.to_datetime(df_tmp[PIT_DATE_COL])

    base_tmp[new_col_name] = pd.to_numeric(base_tmp[new_col_name], errors="coerce")
    df_tmp[new_col_name] = pd.to_numeric(df_tmp[new_col_name], errors="coerce")

    # Concatenate base rows and df rows
    combined = pd.concat([base_tmp, df_tmp], ignore_index=True)

    # Define an order so that df rows come before base rows on the same PIT Date
    marker_order = {"df": 0, "base": 1}
    combined["__order"] = combined["__marker"].map(marker_order).astype("int8")

    # Build sort and group keys
    group_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL]
    sort_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL]

    if has_period:
        # Period-aware dataset: also group by AnnPITValue_Period
        group_cols.append(PERIOD_COL)
        sort_cols.append(PERIOD_COL)

    sort_cols += [PIT_DATE_COL, "__order"]

    # Sort accordingly
    combined = combined.sort_values(sort_cols)

    # Forward-fill the value within each group to implement "as-of" logic
    combined[new_col_name] = combined.groupby(group_cols)[new_col_name].ffill()

    # Keep only rows that belong to the base dataset
    result = combined[combined["__marker"] == "base"].copy()

    # Drop helper columns and sort final result
    result = result.drop(columns=["__marker", "__order"])
    result = result.sort_values(
        [ID_COL, HIST_CURR_COL, FISCAL_PER_COL, PERIOD_COL, PIT_DATE_COL]
    ).reset_index(drop=True)

    return result


def build_and_save_variable(
    input_files,
    value_column_names,
    output_file,
    save_intermediate: bool = False,
) -> Path:
    """
    Build the final "view" for a variable based on multiple input files and save it.

    Steps:
    1) Validate arguments (non-empty, same length for files and column names).
    2) Load and preprocess each input file.
    3) Build the base dataset with all unique identifier combinations
       (including AnnPITValue_Period).
    4) For each input dataframe, perform an as-of merge of its AnnPITValue into the base,
       using AnnPITValue_Period in the grouping if present in that dataset.
    5) Keep only the base columns and the value columns.
    6) Write the final result to output_file in OUTPUT_DIR.
    """
    if len(input_files) == 0:
        raise ValueError("No INPUT_FILES were provided.")
    if len(input_files) != len(value_column_names):
        raise ValueError("INPUT_FILES and VALUE_COLUMN_NAMES must have the same length.")

    start_total = time.time()

    # Build full paths
    paths = [OUTPUT_DIR / f for f in input_files]

    print("\n--- Loading input files ---")
    t0 = time.time()
    dfs = [load_dataset(p) for p in paths]
    print(f"Loading and preprocessing finished in {time.time() - t0:.1f} seconds.")

    # Unique firm-year counts per input dataset
    print("\n--- Unique firm-year (ID, FiscalPeriod) counts per input file ---")
    for path, df in zip(paths, dfs):
        n_firm_years = df[[ID_COL, FISCAL_PER_COL]].drop_duplicates().shape[0]
        print(f"{path.name}: {n_firm_years:,} unique (ID, FiscalPeriod) combinations")

    print("\n--- Building base dataset ---")
    t0 = time.time()
    base = build_base_frame(dfs)
    print(
        f"Base dataset has {len(base):,} rows and was built in "
        f"{time.time() - t0:.1f} seconds."
    )

    print("\n--- Starting as-of merges ---")
    result = base
    for idx, (df, col_name) in enumerate(zip(dfs, value_column_names), start=1):
        print(f"[{idx}/{len(dfs)}] Merging value column '{col_name}' ...")
        t_merge = time.time()
        result = asof_merge_one(result, df, col_name)
        print(
            f"    Done in {time.time() - t_merge:.1f} seconds. "
            f"Result currently has {len(result):,} rows."
        )

        if save_intermediate:
            stem = output_file.rsplit(".", 1)[0]
            temp_out = OUTPUT_DIR / f"{stem}_partial_{idx}.txt"
            result.to_csv(temp_out, sep=SEP, index=False)
            print(f"    Intermediate file written to: {temp_out}")

    # Keep only the base identifier columns (including period) and the value columns
    final_cols = BASE_COLS + [PERIOD_COL] + value_column_names
    result = result[final_cols]

    # Final stats on firm-years and missing values
    print("\n--- Final dataset statistics ---")
    mask_complete = result[value_column_names].notna().all(axis=1)
    complete_firm_years = (
        result.loc[mask_complete, [ID_COL, FISCAL_PER_COL]]
        .drop_duplicates()
    )
    n_complete_firm_years = complete_firm_years.shape[0]
    print(
        f"Unique (ID, FiscalPeriod) combinations with ALL value columns non-empty: "
        f"{n_complete_firm_years:,}"
    )

    print("\nEmpty (NaN) values per value column:")
    for col in value_column_names:
        n_missing = result[col].isna().sum()
        print(f"  - {col}: {n_missing:,} empty values")

    # Final output
    out_path = OUTPUT_DIR / output_file
    result.to_csv(out_path, sep=SEP, index=False)

    print(f"\nFinal view written to:\n{out_path.resolve()}")
    print(f"Total runtime: {time.time() - start_total:.1f} seconds.")

    return out_path


# --- Execution ---
out_path = build_and_save_variable(
    input_files=INPUT_FILES,
    value_column_names=VALUE_COLUMN_NAMES,
    output_file=OUTPUT_FILE,
    save_intermediate=SAVE_INTERMEDIATE,
)


Using data folder: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies

--- Loading input files ---
Loading and preprocessing finished in 15.0 seconds.

--- Unique firm-year (ID, FiscalPeriod) counts per input file ---
Net_Sales_or_Revenues.txt: 817,863 unique (ID, FiscalPeriod) combinations
Total_Assets.txt: 871,391 unique (ID, FiscalPeriod) combinations
Cash__Short_Term_Investments.txt: 875,450 unique (ID, FiscalPeriod) combinations
Long_Term_Debt.txt: 872,391 unique (ID, FiscalPeriod) combinations
Minority_Interest.txt: 850,691 unique (ID, FiscalPeriod) combinations
Preferred_Stock.txt: 861,971 unique (ID, FiscalPeriod) combinations
Common_Equity.txt: 883,328 unique (ID, FiscalPeriod) combinations

--- Building base dataset ---
Base dataset has 4,251,336 rows and was built in 8.8 seconds.

--- Starting as-of merges ---
[1/7] Merging value column 'rev' ...
    Done in 9.0 seconds. Result currently has 4,251,336 rows.
[2/7] Merging value column 'at' ...
    Done in 8.8 seconds. R

#### Cat

In [12]:
"""
Summary:
This cell
1) reads multiple input files from OUTPUT_DIR,
2) builds a base table with all unique combinations of
   (ID, PIT Date, HistCurrency, FiscalPeriod, AnnPITValue_Period),
3) for each input file, performs an "as-of" merge:
   for every base row it takes the latest AnnPITValue from that dataset
   with the same (ID, HistCurrency, FiscalPeriod) and PIT Date <= base PIT Date.
   If the dataset has AnnPITValue_Period, the merge is also grouped by that
   period label; if it does NOT have AnnPITValue_Period, it is treated as
   period-agnostic ("can fit all" periods).
4) writes the final combined view to OUTPUT_FILE in OUTPUT_DIR.

Additional tracking added:
- For each imported dataset, the code prints how many unique
  firm-year combinations exist (unique ID × FiscalPeriod).
- After all merging is complete, the code counts how many
  firm-year combinations have **no missing values** across ANY
  of the added value columns.
- At the end, the code reports how many missing values exist for each
  value column (ca, cce, cl, etc.).

The final table has the columns:
    ID, PIT Date, HistCurrency, FiscalPeriod, AnnPITValue_Period,
    Value1, Value2, ..., ValueN
where each ValueX is defined by VALUE_COLUMN_NAMES in the config.
"""


from pathlib import Path
import time
import pandas as pd

# === CONFIG ===

# Folder where all input files are stored and where the output will be written.
# Temp_file_path_A must already be defined in your notebook, for example:
# Temp_file_path_A = f"{BASE_PATH}/Temp/TempAnomalies"
OUTPUT_DIR = Path(Temp_file_path_A)

# Field separator in all input and output text files
SEP = "|"

# Input files (1..N). They are all expected to be located in OUTPUT_DIR.
INPUT_FILES = [
    "Net_Sales_or_Revenues.txt",
    "Total_Assets.txt",
    "Cash__Short_Term_Investments.txt",
    "Long_Term_Debt.txt",
    "Minority_Interest.txt",
    "Preferred_Stock.txt",
    "Common_Equity.txt",
    # add more file names here if needed ...
]

# Names for the value columns corresponding to INPUT_FILES (same length as INPUT_FILES)
VALUE_COLUMN_NAMES = [
    "rev",
    "at",
    "cce",
    "ltd",
    "mi",
    "ps",
    "ce",
    # add more names here, one for each input file ...
]

# Name of the final output file (will be written to OUTPUT_DIR)
OUTPUT_FILE = "data_Cat.txt"

# Column names in the input files (assumed to be identical in all files)
ID_COL = "ID"
PIT_DATE_COL = "PIT Date"
HIST_CURR_COL = "HistCurrency"
FISCAL_PER_COL = "FiscalPeriod"
VALUE_COL = "AnnPITValue"
PERIOD_COL = "AnnPITValue_Period"   # NEW: period label column

BASE_COLS = [ID_COL, PIT_DATE_COL, HIST_CURR_COL, FISCAL_PER_COL]

SAVE_INTERMEDIATE = False
# ==============


# --- SANITY CHECK ---
if not OUTPUT_DIR.exists():
    raise FileNotFoundError(
        f"OUTPUT_DIR does not exist:\n{OUTPUT_DIR}\n"
        f"Please make sure Temp_file_path_A is set correctly."
    )
print("Using data folder:", OUTPUT_DIR.resolve())
# ---------------------


def load_dataset(path: Path) -> pd.DataFrame:
    """
    Load a single dataset from path and keep only the relevant columns.

    The function:
    - checks if the file exists,
    - reads it using the configured separator,
    - checks that all required columns are present,
    - keeps AnnPITValue_Period if present (and creates it as NA if not),
    - converts PIT Date to datetime,
    - casts ID to string,
    - converts AnnPITValue to numeric.
    """
    if not path.exists():
        raise FileNotFoundError(f"File not found: {path}")

    df = pd.read_csv(path, sep=SEP)

    needed_cols = BASE_COLS + [VALUE_COL]
    missing = [c for c in needed_cols if c not in df.columns]
    if missing:
        raise ValueError(f"Required columns {missing} are missing in file: {path}")

    # Keep base + AnnPITValue + (optional) AnnPITValue_Period
    extra_cols = [PERIOD_COL] if PERIOD_COL in df.columns else []
    df = df[needed_cols + extra_cols].copy()

    # Ensure AnnPITValue_Period exists in all datasets
    if PERIOD_COL not in df.columns:
        df[PERIOD_COL] = pd.NA

    # Re-order columns for consistency
    df = df[BASE_COLS + [PERIOD_COL, VALUE_COL]]

    # Ensure consistent data types
    df[PIT_DATE_COL] = pd.to_datetime(df[PIT_DATE_COL])
    df[ID_COL] = df[ID_COL].astype(str)
    df[VALUE_COL] = pd.to_numeric(df[VALUE_COL], errors="coerce")

    return df


def build_base_frame(dfs: list[pd.DataFrame]) -> pd.DataFrame:
    """
    Build the base dataset from a list of dataframes.

    Steps:
    - stack the identifier columns (including AnnPITValue_Period) from all dataframes,
    - drop duplicate combinations,
    - enforce correct data types,
    - sort the base dataset by (ID, HistCurrency, FiscalPeriod, AnnPITValue_Period, PIT Date).

    The result is the "skeleton" upon which all value columns will be merged.
    """
    # Concatenate identifier columns from all datasets (base cols + period col)
    base = pd.concat(
        [df[BASE_COLS + [PERIOD_COL]] for df in dfs],
        ignore_index=True
    )

    # Remove duplicate rows of identifiers
    base = base.drop_duplicates().reset_index(drop=True)

    # Ensure consistent types
    base[PIT_DATE_COL] = pd.to_datetime(base[PIT_DATE_COL])
    base[ID_COL] = base[ID_COL].astype(str)

    # Sort by identifier columns and date
    sort_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL, PERIOD_COL, PIT_DATE_COL]
    base = base.sort_values(sort_cols).reset_index(drop=True)

    return base


def asof_merge_one(base: pd.DataFrame, df: pd.DataFrame, new_col_name: str) -> pd.DataFrame:
    """
    Perform an "as-of" merge of one dataset into the base dataframe.

    Concept:
    - For each combination of (ID, HistCurrency, FiscalPeriod)
      and for each PIT Date in the base table,
      we want the latest AnnPITValue from df with
          df.PIT Date <= base.PIT Date
      for the same (ID, HistCurrency, FiscalPeriod).
    - If df has a non-empty AnnPITValue_Period column, the as-of grouping
      is also done by AnnPITValue_Period.
      If AnnPITValue_Period is all missing (e.g. revenue), df is treated
      as period-agnostic and can "fit all" periods.

    Implementation (vectorized, without merge_asof):
      (same as before, but grouping keys dynamically include PERIOD_COL
       only for period-aware datasets)
    """
    # Determine if this dataset is period-aware (has any non-NA period labels)
    has_period = PERIOD_COL in df.columns and df[PERIOD_COL].notna().any()

    # Work on copies to avoid modifying original base/df
    base_tmp = base.copy()
    base_tmp[new_col_name] = pd.NA
    base_tmp["__marker"] = "base"

    # Keep only identifier columns and the value column from df, then rename
    if has_period:
        df_tmp = df[BASE_COLS + [PERIOD_COL, VALUE_COL]].copy()
    else:
        df_tmp = df[BASE_COLS + [VALUE_COL]].copy()
        # Ensure PERIOD_COL exists but remains NA (period-agnostic)
        df_tmp[PERIOD_COL] = pd.NA

    df_tmp = df_tmp.rename(columns={VALUE_COL: new_col_name})
    df_tmp["__marker"] = "df"

    # Ensure consistent types for safety
    base_tmp[ID_COL] = base_tmp[ID_COL].astype(str)
    df_tmp[ID_COL] = df_tmp[ID_COL].astype(str)
    base_tmp[PIT_DATE_COL] = pd.to_datetime(base_tmp[PIT_DATE_COL])
    df_tmp[PIT_DATE_COL] = pd.to_datetime(df_tmp[PIT_DATE_COL])

    base_tmp[new_col_name] = pd.to_numeric(base_tmp[new_col_name], errors="coerce")
    df_tmp[new_col_name] = pd.to_numeric(df_tmp[new_col_name], errors="coerce")

    # Concatenate base rows and df rows
    combined = pd.concat([base_tmp, df_tmp], ignore_index=True)

    # Define an order so that df rows come before base rows on the same PIT Date
    marker_order = {"df": 0, "base": 1}
    combined["__order"] = combined["__marker"].map(marker_order).astype("int8")

    # Build sort and group keys
    group_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL]
    sort_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL]

    if has_period:
        # Period-aware dataset: also group by AnnPITValue_Period
        group_cols.append(PERIOD_COL)
        sort_cols.append(PERIOD_COL)

    sort_cols += [PIT_DATE_COL, "__order"]

    # Sort accordingly
    combined = combined.sort_values(sort_cols)

    # Forward-fill the value within each group to implement "as-of" logic
    combined[new_col_name] = combined.groupby(group_cols)[new_col_name].ffill()

    # Keep only rows that belong to the base dataset
    result = combined[combined["__marker"] == "base"].copy()

    # Drop helper columns and sort final result
    result = result.drop(columns=["__marker", "__order"])
    result = result.sort_values(
        [ID_COL, HIST_CURR_COL, FISCAL_PER_COL, PERIOD_COL, PIT_DATE_COL]
    ).reset_index(drop=True)

    return result


def build_and_save_variable(
    input_files,
    value_column_names,
    output_file,
    save_intermediate: bool = False,
) -> Path:
    """
    Build the final "view" for a variable based on multiple input files and save it.

    Steps:
    1) Validate arguments (non-empty, same length for files and column names).
    2) Load and preprocess each input file.
    3) Build the base dataset with all unique identifier combinations
       (including AnnPITValue_Period).
    4) For each input dataframe, perform an as-of merge of its AnnPITValue into the base,
       using AnnPITValue_Period in the grouping if present in that dataset.
    5) Keep only the base columns and the value columns.
    6) Write the final result to output_file in OUTPUT_DIR.
    """
    if len(input_files) == 0:
        raise ValueError("No INPUT_FILES were provided.")
    if len(input_files) != len(value_column_names):
        raise ValueError("INPUT_FILES and VALUE_COLUMN_NAMES must have the same length.")

    start_total = time.time()

    # Build full paths
    paths = [OUTPUT_DIR / f for f in input_files]

    print("\n--- Loading input files ---")
    t0 = time.time()
    dfs = [load_dataset(p) for p in paths]
    print(f"Loading and preprocessing finished in {time.time() - t0:.1f} seconds.")

    # Unique firm-year counts per input dataset
    print("\n--- Unique firm-year (ID, FiscalPeriod) counts per input file ---")
    for path, df in zip(paths, dfs):
        n_firm_years = df[[ID_COL, FISCAL_PER_COL]].drop_duplicates().shape[0]
        print(f"{path.name}: {n_firm_years:,} unique (ID, FiscalPeriod) combinations")

    print("\n--- Building base dataset ---")
    t0 = time.time()
    base = build_base_frame(dfs)
    print(
        f"Base dataset has {len(base):,} rows and was built in "
        f"{time.time() - t0:.1f} seconds."
    )

    print("\n--- Starting as-of merges ---")
    result = base
    for idx, (df, col_name) in enumerate(zip(dfs, value_column_names), start=1):
        print(f"[{idx}/{len(dfs)}] Merging value column '{col_name}' ...")
        t_merge = time.time()
        result = asof_merge_one(result, df, col_name)
        print(
            f"    Done in {time.time() - t_merge:.1f} seconds. "
            f"Result currently has {len(result):,} rows."
        )

        if save_intermediate:
            stem = output_file.rsplit(".", 1)[0]
            temp_out = OUTPUT_DIR / f"{stem}_partial_{idx}.txt"
            result.to_csv(temp_out, sep=SEP, index=False)
            print(f"    Intermediate file written to: {temp_out}")

    # Keep only the base identifier columns (including period) and the value columns
    final_cols = BASE_COLS + [PERIOD_COL] + value_column_names
    result = result[final_cols]

    # Final stats on firm-years and missing values
    print("\n--- Final dataset statistics ---")
    mask_complete = result[value_column_names].notna().all(axis=1)
    complete_firm_years = (
        result.loc[mask_complete, [ID_COL, FISCAL_PER_COL]]
        .drop_duplicates()
    )
    n_complete_firm_years = complete_firm_years.shape[0]
    print(
        f"Unique (ID, FiscalPeriod) combinations with ALL value columns non-empty: "
        f"{n_complete_firm_years:,}"
    )

    print("\nEmpty (NaN) values per value column:")
    for col in value_column_names:
        n_missing = result[col].isna().sum()
        print(f"  - {col}: {n_missing:,} empty values")

    # Final output
    out_path = OUTPUT_DIR / output_file
    result.to_csv(out_path, sep=SEP, index=False)

    print(f"\nFinal view written to:\n{out_path.resolve()}")
    print(f"Total runtime: {time.time() - start_total:.1f} seconds.")

    return out_path


# --- Execution ---
out_path = build_and_save_variable(
    input_files=INPUT_FILES,
    value_column_names=VALUE_COLUMN_NAMES,
    output_file=OUTPUT_FILE,
    save_intermediate=SAVE_INTERMEDIATE,
)


Using data folder: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies

--- Loading input files ---


Loading and preprocessing finished in 14.8 seconds.

--- Unique firm-year (ID, FiscalPeriod) counts per input file ---
Net_Sales_or_Revenues.txt: 817,863 unique (ID, FiscalPeriod) combinations
Total_Assets.txt: 871,391 unique (ID, FiscalPeriod) combinations
Cash__Short_Term_Investments.txt: 875,450 unique (ID, FiscalPeriod) combinations
Long_Term_Debt.txt: 872,391 unique (ID, FiscalPeriod) combinations
Minority_Interest.txt: 850,691 unique (ID, FiscalPeriod) combinations
Preferred_Stock.txt: 861,971 unique (ID, FiscalPeriod) combinations
Common_Equity.txt: 883,328 unique (ID, FiscalPeriod) combinations

--- Building base dataset ---
Base dataset has 4,251,336 rows and was built in 8.7 seconds.

--- Starting as-of merges ---
[1/7] Merging value column 'rev' ...
    Done in 8.5 seconds. Result currently has 4,251,336 rows.
[2/7] Merging value column 'at' ...
    Done in 9.3 seconds. Result currently has 4,251,336 rows.
[3/7] Merging value column 'cce' ...
    Done in 9.0 seconds. Result 

#### Cpm

In [13]:
"""
Summary:
This cell
1) reads multiple input files from OUTPUT_DIR,
2) builds a base table with all unique combinations of
   (ID, PIT Date, HistCurrency, FiscalPeriod, AnnPITValue_Period),
3) for each input file, performs an "as-of" merge:
   for every base row it takes the latest AnnPITValue from that dataset
   with the same (ID, HistCurrency, FiscalPeriod) and PIT Date <= base PIT Date.
   If the dataset has AnnPITValue_Period, the merge is also grouped by that
   period label; if it does NOT have AnnPITValue_Period, it is treated as
   period-agnostic ("can fit all" periods).
4) writes the final combined view to OUTPUT_FILE in OUTPUT_DIR.

Additional tracking added:
- For each imported dataset, the code prints how many unique
  firm-year combinations exist (unique ID × FiscalPeriod).
- After all merging is complete, the code counts how many
  firm-year combinations have **no missing values** across ANY
  of the added value columns.
- At the end, the code reports how many missing values exist for each
  value column (ca, cce, cl, etc.).

The final table has the columns:
    ID, PIT Date, HistCurrency, FiscalPeriod, AnnPITValue_Period,
    Value1, Value2, ..., ValueN
where each ValueX is defined by VALUE_COLUMN_NAMES in the config.
"""


from pathlib import Path
import time
import pandas as pd

# === CONFIG ===

# Folder where all input files are stored and where the output will be written.
# Temp_file_path_A must already be defined in your notebook, for example:
# Temp_file_path_A = f"{BASE_PATH}/Temp/TempAnomalies"
OUTPUT_DIR = Path(Temp_file_path_A)

# Field separator in all input and output text files
SEP = "|"

# Input files (1..N). They are all expected to be located in OUTPUT_DIR.
INPUT_FILES = [
    "Net_Sales_or_Revenues.txt",
    "Cost_of_Goods_Sold_Excl_Depreciation.txt",
    # add more file names here if needed ...
]

# Names for the value columns corresponding to INPUT_FILES (same length as INPUT_FILES)
VALUE_COLUMN_NAMES = [
    "rev",
    "cogs",
    # add more names here, one for each input file ...
]

# Name of the final output file (will be written to OUTPUT_DIR)
OUTPUT_FILE = "data_Cpm.txt"

# Column names in the input files (assumed to be identical in all files)
ID_COL = "ID"
PIT_DATE_COL = "PIT Date"
HIST_CURR_COL = "HistCurrency"
FISCAL_PER_COL = "FiscalPeriod"
VALUE_COL = "AnnPITValue"
PERIOD_COL = "AnnPITValue_Period"   # NEW: period label column

BASE_COLS = [ID_COL, PIT_DATE_COL, HIST_CURR_COL, FISCAL_PER_COL]

SAVE_INTERMEDIATE = False
# ==============


# --- SANITY CHECK ---
if not OUTPUT_DIR.exists():
    raise FileNotFoundError(
        f"OUTPUT_DIR does not exist:\n{OUTPUT_DIR}\n"
        f"Please make sure Temp_file_path_A is set correctly."
    )
print("Using data folder:", OUTPUT_DIR.resolve())
# ---------------------


def load_dataset(path: Path) -> pd.DataFrame:
    """
    Load a single dataset from path and keep only the relevant columns.

    The function:
    - checks if the file exists,
    - reads it using the configured separator,
    - checks that all required columns are present,
    - keeps AnnPITValue_Period if present (and creates it as NA if not),
    - converts PIT Date to datetime,
    - casts ID to string,
    - converts AnnPITValue to numeric.
    """
    if not path.exists():
        raise FileNotFoundError(f"File not found: {path}")

    df = pd.read_csv(path, sep=SEP)

    needed_cols = BASE_COLS + [VALUE_COL]
    missing = [c for c in needed_cols if c not in df.columns]
    if missing:
        raise ValueError(f"Required columns {missing} are missing in file: {path}")

    # Keep base + AnnPITValue + (optional) AnnPITValue_Period
    extra_cols = [PERIOD_COL] if PERIOD_COL in df.columns else []
    df = df[needed_cols + extra_cols].copy()

    # Ensure AnnPITValue_Period exists in all datasets
    if PERIOD_COL not in df.columns:
        df[PERIOD_COL] = pd.NA

    # Re-order columns for consistency
    df = df[BASE_COLS + [PERIOD_COL, VALUE_COL]]

    # Ensure consistent data types
    df[PIT_DATE_COL] = pd.to_datetime(df[PIT_DATE_COL])
    df[ID_COL] = df[ID_COL].astype(str)
    df[VALUE_COL] = pd.to_numeric(df[VALUE_COL], errors="coerce")

    return df


def build_base_frame(dfs: list[pd.DataFrame]) -> pd.DataFrame:
    """
    Build the base dataset from a list of dataframes.

    Steps:
    - stack the identifier columns (including AnnPITValue_Period) from all dataframes,
    - drop duplicate combinations,
    - enforce correct data types,
    - sort the base dataset by (ID, HistCurrency, FiscalPeriod, AnnPITValue_Period, PIT Date).

    The result is the "skeleton" upon which all value columns will be merged.
    """
    # Concatenate identifier columns from all datasets (base cols + period col)
    base = pd.concat(
        [df[BASE_COLS + [PERIOD_COL]] for df in dfs],
        ignore_index=True
    )

    # Remove duplicate rows of identifiers
    base = base.drop_duplicates().reset_index(drop=True)

    # Ensure consistent types
    base[PIT_DATE_COL] = pd.to_datetime(base[PIT_DATE_COL])
    base[ID_COL] = base[ID_COL].astype(str)

    # Sort by identifier columns and date
    sort_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL, PERIOD_COL, PIT_DATE_COL]
    base = base.sort_values(sort_cols).reset_index(drop=True)

    return base


def asof_merge_one(base: pd.DataFrame, df: pd.DataFrame, new_col_name: str) -> pd.DataFrame:
    """
    Perform an "as-of" merge of one dataset into the base dataframe.

    Concept:
    - For each combination of (ID, HistCurrency, FiscalPeriod)
      and for each PIT Date in the base table,
      we want the latest AnnPITValue from df with
          df.PIT Date <= base.PIT Date
      for the same (ID, HistCurrency, FiscalPeriod).
    - If df has a non-empty AnnPITValue_Period column, the as-of grouping
      is also done by AnnPITValue_Period.
      If AnnPITValue_Period is all missing (e.g. revenue), df is treated
      as period-agnostic and can "fit all" periods.

    Implementation (vectorized, without merge_asof):
      (same as before, but grouping keys dynamically include PERIOD_COL
       only for period-aware datasets)
    """
    # Determine if this dataset is period-aware (has any non-NA period labels)
    has_period = PERIOD_COL in df.columns and df[PERIOD_COL].notna().any()

    # Work on copies to avoid modifying original base/df
    base_tmp = base.copy()
    base_tmp[new_col_name] = pd.NA
    base_tmp["__marker"] = "base"

    # Keep only identifier columns and the value column from df, then rename
    if has_period:
        df_tmp = df[BASE_COLS + [PERIOD_COL, VALUE_COL]].copy()
    else:
        df_tmp = df[BASE_COLS + [VALUE_COL]].copy()
        # Ensure PERIOD_COL exists but remains NA (period-agnostic)
        df_tmp[PERIOD_COL] = pd.NA

    df_tmp = df_tmp.rename(columns={VALUE_COL: new_col_name})
    df_tmp["__marker"] = "df"

    # Ensure consistent types for safety
    base_tmp[ID_COL] = base_tmp[ID_COL].astype(str)
    df_tmp[ID_COL] = df_tmp[ID_COL].astype(str)
    base_tmp[PIT_DATE_COL] = pd.to_datetime(base_tmp[PIT_DATE_COL])
    df_tmp[PIT_DATE_COL] = pd.to_datetime(df_tmp[PIT_DATE_COL])

    base_tmp[new_col_name] = pd.to_numeric(base_tmp[new_col_name], errors="coerce")
    df_tmp[new_col_name] = pd.to_numeric(df_tmp[new_col_name], errors="coerce")

    # Concatenate base rows and df rows
    combined = pd.concat([base_tmp, df_tmp], ignore_index=True)

    # Define an order so that df rows come before base rows on the same PIT Date
    marker_order = {"df": 0, "base": 1}
    combined["__order"] = combined["__marker"].map(marker_order).astype("int8")

    # Build sort and group keys
    group_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL]
    sort_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL]

    if has_period:
        # Period-aware dataset: also group by AnnPITValue_Period
        group_cols.append(PERIOD_COL)
        sort_cols.append(PERIOD_COL)

    sort_cols += [PIT_DATE_COL, "__order"]

    # Sort accordingly
    combined = combined.sort_values(sort_cols)

    # Forward-fill the value within each group to implement "as-of" logic
    combined[new_col_name] = combined.groupby(group_cols)[new_col_name].ffill()

    # Keep only rows that belong to the base dataset
    result = combined[combined["__marker"] == "base"].copy()

    # Drop helper columns and sort final result
    result = result.drop(columns=["__marker", "__order"])
    result = result.sort_values(
        [ID_COL, HIST_CURR_COL, FISCAL_PER_COL, PERIOD_COL, PIT_DATE_COL]
    ).reset_index(drop=True)

    return result


def build_and_save_variable(
    input_files,
    value_column_names,
    output_file,
    save_intermediate: bool = False,
) -> Path:
    """
    Build the final "view" for a variable based on multiple input files and save it.

    Steps:
    1) Validate arguments (non-empty, same length for files and column names).
    2) Load and preprocess each input file.
    3) Build the base dataset with all unique identifier combinations
       (including AnnPITValue_Period).
    4) For each input dataframe, perform an as-of merge of its AnnPITValue into the base,
       using AnnPITValue_Period in the grouping if present in that dataset.
    5) Keep only the base columns and the value columns.
    6) Write the final result to output_file in OUTPUT_DIR.
    """
    if len(input_files) == 0:
        raise ValueError("No INPUT_FILES were provided.")
    if len(input_files) != len(value_column_names):
        raise ValueError("INPUT_FILES and VALUE_COLUMN_NAMES must have the same length.")

    start_total = time.time()

    # Build full paths
    paths = [OUTPUT_DIR / f for f in input_files]

    print("\n--- Loading input files ---")
    t0 = time.time()
    dfs = [load_dataset(p) for p in paths]
    print(f"Loading and preprocessing finished in {time.time() - t0:.1f} seconds.")

    # Unique firm-year counts per input dataset
    print("\n--- Unique firm-year (ID, FiscalPeriod) counts per input file ---")
    for path, df in zip(paths, dfs):
        n_firm_years = df[[ID_COL, FISCAL_PER_COL]].drop_duplicates().shape[0]
        print(f"{path.name}: {n_firm_years:,} unique (ID, FiscalPeriod) combinations")

    print("\n--- Building base dataset ---")
    t0 = time.time()
    base = build_base_frame(dfs)
    print(
        f"Base dataset has {len(base):,} rows and was built in "
        f"{time.time() - t0:.1f} seconds."
    )

    print("\n--- Starting as-of merges ---")
    result = base
    for idx, (df, col_name) in enumerate(zip(dfs, value_column_names), start=1):
        print(f"[{idx}/{len(dfs)}] Merging value column '{col_name}' ...")
        t_merge = time.time()
        result = asof_merge_one(result, df, col_name)
        print(
            f"    Done in {time.time() - t_merge:.1f} seconds. "
            f"Result currently has {len(result):,} rows."
        )

        if save_intermediate:
            stem = output_file.rsplit(".", 1)[0]
            temp_out = OUTPUT_DIR / f"{stem}_partial_{idx}.txt"
            result.to_csv(temp_out, sep=SEP, index=False)
            print(f"    Intermediate file written to: {temp_out}")

    # Keep only the base identifier columns (including period) and the value columns
    final_cols = BASE_COLS + [PERIOD_COL] + value_column_names
    result = result[final_cols]

    # Final stats on firm-years and missing values
    print("\n--- Final dataset statistics ---")
    mask_complete = result[value_column_names].notna().all(axis=1)
    complete_firm_years = (
        result.loc[mask_complete, [ID_COL, FISCAL_PER_COL]]
        .drop_duplicates()
    )
    n_complete_firm_years = complete_firm_years.shape[0]
    print(
        f"Unique (ID, FiscalPeriod) combinations with ALL value columns non-empty: "
        f"{n_complete_firm_years:,}"
    )

    print("\nEmpty (NaN) values per value column:")
    for col in value_column_names:
        n_missing = result[col].isna().sum()
        print(f"  - {col}: {n_missing:,} empty values")

    # Final output
    out_path = OUTPUT_DIR / output_file
    result.to_csv(out_path, sep=SEP, index=False)

    print(f"\nFinal view written to:\n{out_path.resolve()}")
    print(f"Total runtime: {time.time() - start_total:.1f} seconds.")

    return out_path


# --- Execution ---
out_path = build_and_save_variable(
    input_files=INPUT_FILES,
    value_column_names=VALUE_COLUMN_NAMES,
    output_file=OUTPUT_FILE,
    save_intermediate=SAVE_INTERMEDIATE,
)


Using data folder: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies

--- Loading input files ---
Loading and preprocessing finished in 4.0 seconds.

--- Unique firm-year (ID, FiscalPeriod) counts per input file ---
Net_Sales_or_Revenues.txt: 817,863 unique (ID, FiscalPeriod) combinations
Cost_of_Goods_Sold_Excl_Depreciation.txt: 818,491 unique (ID, FiscalPeriod) combinations

--- Building base dataset ---
Base dataset has 1,785,864 rows and was built in 5.1 seconds.

--- Starting as-of merges ---
[1/2] Merging value column 'rev' ...
    Done in 6.3 seconds. Result currently has 1,785,864 rows.
[2/2] Merging value column 'cogs' ...
    Done in 6.9 seconds. Result currently has 1,785,864 rows.

--- Final dataset statistics ---
Unique (ID, FiscalPeriod) combinations with ALL value columns non-empty: 795,439

Empty (NaN) values per value column:
  - rev: 36,843 empty values
  - cogs: 135,793 empty values

Final view written to:
/home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomali

#### Ec

In [14]:
"""
Summary:
This cell
1) reads multiple input files from OUTPUT_DIR,
2) builds a base table with all unique combinations of
   (ID, PIT Date, HistCurrency, FiscalPeriod, AnnPITValue_Period),
3) for each input file, performs an "as-of" merge:
   for every base row it takes the latest AnnPITValue from that dataset
   with the same (ID, HistCurrency, FiscalPeriod) and PIT Date <= base PIT Date.
   If the dataset has AnnPITValue_Period, the merge is also grouped by that
   period label; if it does NOT have AnnPITValue_Period, it is treated as
   period-agnostic ("can fit all" periods).
4) writes the final combined view to OUTPUT_FILE in OUTPUT_DIR.

Additional tracking added:
- For each imported dataset, the code prints how many unique
  firm-year combinations exist (unique ID × FiscalPeriod).
- After all merging is complete, the code counts how many
  firm-year combinations have **no missing values** across ANY
  of the added value columns.
- At the end, the code reports how many missing values exist for each
  value column (ca, cce, cl, etc.).

The final table has the columns:
    ID, PIT Date, HistCurrency, FiscalPeriod, AnnPITValue_Period,
    Value1, Value2, ..., ValueN
where each ValueX is defined by VALUE_COLUMN_NAMES in the config.
"""


from pathlib import Path
import time
import pandas as pd

# === CONFIG ===

# Folder where all input files are stored and where the output will be written.
# Temp_file_path_A must already be defined in your notebook, for example:
# Temp_file_path_A = f"{BASE_PATH}/Temp/TempAnomalies"
OUTPUT_DIR = Path(Temp_file_path_A)

# Field separator in all input and output text files
SEP = "|"

# Input files (1..N). They are all expected to be located in OUTPUT_DIR.
INPUT_FILES = [
    "Earnings_Per_Share_Fiscal_Year_End.txt",
    # add more file names here if needed ...
]

# Names for the value columns corresponding to INPUT_FILES (same length as INPUT_FILES)
VALUE_COLUMN_NAMES = [
    "eps",
    # add more names here, one for each input file ...
]

# Name of the final output file (will be written to OUTPUT_DIR)
OUTPUT_FILE = "data_Ec.txt"

# Column names in the input files (assumed to be identical in all files)
ID_COL = "ID"
PIT_DATE_COL = "PIT Date"
HIST_CURR_COL = "HistCurrency"
FISCAL_PER_COL = "FiscalPeriod"
VALUE_COL = "AnnPITValue"
PERIOD_COL = "AnnPITValue_Period"   # NEW: period label column

BASE_COLS = [ID_COL, PIT_DATE_COL, HIST_CURR_COL, FISCAL_PER_COL]

SAVE_INTERMEDIATE = False
# ==============


# --- SANITY CHECK ---
if not OUTPUT_DIR.exists():
    raise FileNotFoundError(
        f"OUTPUT_DIR does not exist:\n{OUTPUT_DIR}\n"
        f"Please make sure Temp_file_path_A is set correctly."
    )
print("Using data folder:", OUTPUT_DIR.resolve())
# ---------------------


def load_dataset(path: Path) -> pd.DataFrame:
    """
    Load a single dataset from path and keep only the relevant columns.

    The function:
    - checks if the file exists,
    - reads it using the configured separator,
    - checks that all required columns are present,
    - keeps AnnPITValue_Period if present (and creates it as NA if not),
    - converts PIT Date to datetime,
    - casts ID to string,
    - converts AnnPITValue to numeric.
    """
    if not path.exists():
        raise FileNotFoundError(f"File not found: {path}")

    df = pd.read_csv(path, sep=SEP)

    needed_cols = BASE_COLS + [VALUE_COL]
    missing = [c for c in needed_cols if c not in df.columns]
    if missing:
        raise ValueError(f"Required columns {missing} are missing in file: {path}")

    # Keep base + AnnPITValue + (optional) AnnPITValue_Period
    extra_cols = [PERIOD_COL] if PERIOD_COL in df.columns else []
    df = df[needed_cols + extra_cols].copy()

    # Ensure AnnPITValue_Period exists in all datasets
    if PERIOD_COL not in df.columns:
        df[PERIOD_COL] = pd.NA

    # Re-order columns for consistency
    df = df[BASE_COLS + [PERIOD_COL, VALUE_COL]]

    # Ensure consistent data types
    df[PIT_DATE_COL] = pd.to_datetime(df[PIT_DATE_COL])
    df[ID_COL] = df[ID_COL].astype(str)
    df[VALUE_COL] = pd.to_numeric(df[VALUE_COL], errors="coerce")

    return df


def build_base_frame(dfs: list[pd.DataFrame]) -> pd.DataFrame:
    """
    Build the base dataset from a list of dataframes.

    Steps:
    - stack the identifier columns (including AnnPITValue_Period) from all dataframes,
    - drop duplicate combinations,
    - enforce correct data types,
    - sort the base dataset by (ID, HistCurrency, FiscalPeriod, AnnPITValue_Period, PIT Date).

    The result is the "skeleton" upon which all value columns will be merged.
    """
    # Concatenate identifier columns from all datasets (base cols + period col)
    base = pd.concat(
        [df[BASE_COLS + [PERIOD_COL]] for df in dfs],
        ignore_index=True
    )

    # Remove duplicate rows of identifiers
    base = base.drop_duplicates().reset_index(drop=True)

    # Ensure consistent types
    base[PIT_DATE_COL] = pd.to_datetime(base[PIT_DATE_COL])
    base[ID_COL] = base[ID_COL].astype(str)

    # Sort by identifier columns and date
    sort_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL, PERIOD_COL, PIT_DATE_COL]
    base = base.sort_values(sort_cols).reset_index(drop=True)

    return base


def asof_merge_one(base: pd.DataFrame, df: pd.DataFrame, new_col_name: str) -> pd.DataFrame:
    """
    Perform an "as-of" merge of one dataset into the base dataframe.

    Concept:
    - For each combination of (ID, HistCurrency, FiscalPeriod)
      and for each PIT Date in the base table,
      we want the latest AnnPITValue from df with
          df.PIT Date <= base.PIT Date
      for the same (ID, HistCurrency, FiscalPeriod).
    - If df has a non-empty AnnPITValue_Period column, the as-of grouping
      is also done by AnnPITValue_Period.
      If AnnPITValue_Period is all missing (e.g. revenue), df is treated
      as period-agnostic and can "fit all" periods.

    Implementation (vectorized, without merge_asof):
      (same as before, but grouping keys dynamically include PERIOD_COL
       only for period-aware datasets)
    """
    # Determine if this dataset is period-aware (has any non-NA period labels)
    has_period = PERIOD_COL in df.columns and df[PERIOD_COL].notna().any()

    # Work on copies to avoid modifying original base/df
    base_tmp = base.copy()
    base_tmp[new_col_name] = pd.NA
    base_tmp["__marker"] = "base"

    # Keep only identifier columns and the value column from df, then rename
    if has_period:
        df_tmp = df[BASE_COLS + [PERIOD_COL, VALUE_COL]].copy()
    else:
        df_tmp = df[BASE_COLS + [VALUE_COL]].copy()
        # Ensure PERIOD_COL exists but remains NA (period-agnostic)
        df_tmp[PERIOD_COL] = pd.NA

    df_tmp = df_tmp.rename(columns={VALUE_COL: new_col_name})
    df_tmp["__marker"] = "df"

    # Ensure consistent types for safety
    base_tmp[ID_COL] = base_tmp[ID_COL].astype(str)
    df_tmp[ID_COL] = df_tmp[ID_COL].astype(str)
    base_tmp[PIT_DATE_COL] = pd.to_datetime(base_tmp[PIT_DATE_COL])
    df_tmp[PIT_DATE_COL] = pd.to_datetime(df_tmp[PIT_DATE_COL])

    base_tmp[new_col_name] = pd.to_numeric(base_tmp[new_col_name], errors="coerce")
    df_tmp[new_col_name] = pd.to_numeric(df_tmp[new_col_name], errors="coerce")

    # Concatenate base rows and df rows
    combined = pd.concat([base_tmp, df_tmp], ignore_index=True)

    # Define an order so that df rows come before base rows on the same PIT Date
    marker_order = {"df": 0, "base": 1}
    combined["__order"] = combined["__marker"].map(marker_order).astype("int8")

    # Build sort and group keys
    group_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL]
    sort_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL]

    if has_period:
        # Period-aware dataset: also group by AnnPITValue_Period
        group_cols.append(PERIOD_COL)
        sort_cols.append(PERIOD_COL)

    sort_cols += [PIT_DATE_COL, "__order"]

    # Sort accordingly
    combined = combined.sort_values(sort_cols)

    # Forward-fill the value within each group to implement "as-of" logic
    combined[new_col_name] = combined.groupby(group_cols)[new_col_name].ffill()

    # Keep only rows that belong to the base dataset
    result = combined[combined["__marker"] == "base"].copy()

    # Drop helper columns and sort final result
    result = result.drop(columns=["__marker", "__order"])
    result = result.sort_values(
        [ID_COL, HIST_CURR_COL, FISCAL_PER_COL, PERIOD_COL, PIT_DATE_COL]
    ).reset_index(drop=True)

    return result


def build_and_save_variable(
    input_files,
    value_column_names,
    output_file,
    save_intermediate: bool = False,
) -> Path:
    """
    Build the final "view" for a variable based on multiple input files and save it.

    Steps:
    1) Validate arguments (non-empty, same length for files and column names).
    2) Load and preprocess each input file.
    3) Build the base dataset with all unique identifier combinations
       (including AnnPITValue_Period).
    4) For each input dataframe, perform an as-of merge of its AnnPITValue into the base,
       using AnnPITValue_Period in the grouping if present in that dataset.
    5) Keep only the base columns and the value columns.
    6) Write the final result to output_file in OUTPUT_DIR.
    """
    if len(input_files) == 0:
        raise ValueError("No INPUT_FILES were provided.")
    if len(input_files) != len(value_column_names):
        raise ValueError("INPUT_FILES and VALUE_COLUMN_NAMES must have the same length.")

    start_total = time.time()

    # Build full paths
    paths = [OUTPUT_DIR / f for f in input_files]

    print("\n--- Loading input files ---")
    t0 = time.time()
    dfs = [load_dataset(p) for p in paths]
    print(f"Loading and preprocessing finished in {time.time() - t0:.1f} seconds.")

    # Unique firm-year counts per input dataset
    print("\n--- Unique firm-year (ID, FiscalPeriod) counts per input file ---")
    for path, df in zip(paths, dfs):
        n_firm_years = df[[ID_COL, FISCAL_PER_COL]].drop_duplicates().shape[0]
        print(f"{path.name}: {n_firm_years:,} unique (ID, FiscalPeriod) combinations")

    print("\n--- Building base dataset ---")
    t0 = time.time()
    base = build_base_frame(dfs)
    print(
        f"Base dataset has {len(base):,} rows and was built in "
        f"{time.time() - t0:.1f} seconds."
    )

    print("\n--- Starting as-of merges ---")
    result = base
    for idx, (df, col_name) in enumerate(zip(dfs, value_column_names), start=1):
        print(f"[{idx}/{len(dfs)}] Merging value column '{col_name}' ...")
        t_merge = time.time()
        result = asof_merge_one(result, df, col_name)
        print(
            f"    Done in {time.time() - t_merge:.1f} seconds. "
            f"Result currently has {len(result):,} rows."
        )

        if save_intermediate:
            stem = output_file.rsplit(".", 1)[0]
            temp_out = OUTPUT_DIR / f"{stem}_partial_{idx}.txt"
            result.to_csv(temp_out, sep=SEP, index=False)
            print(f"    Intermediate file written to: {temp_out}")

    # Keep only the base identifier columns (including period) and the value columns
    final_cols = BASE_COLS + [PERIOD_COL] + value_column_names
    result = result[final_cols]

    # Final stats on firm-years and missing values
    print("\n--- Final dataset statistics ---")
    mask_complete = result[value_column_names].notna().all(axis=1)
    complete_firm_years = (
        result.loc[mask_complete, [ID_COL, FISCAL_PER_COL]]
        .drop_duplicates()
    )
    n_complete_firm_years = complete_firm_years.shape[0]
    print(
        f"Unique (ID, FiscalPeriod) combinations with ALL value columns non-empty: "
        f"{n_complete_firm_years:,}"
    )

    print("\nEmpty (NaN) values per value column:")
    for col in value_column_names:
        n_missing = result[col].isna().sum()
        print(f"  - {col}: {n_missing:,} empty values")

    # Final output
    out_path = OUTPUT_DIR / output_file
    result.to_csv(out_path, sep=SEP, index=False)

    print(f"\nFinal view written to:\n{out_path.resolve()}")
    print(f"Total runtime: {time.time() - start_total:.1f} seconds.")

    return out_path


# --- Execution ---
out_path = build_and_save_variable(
    input_files=INPUT_FILES,
    value_column_names=VALUE_COLUMN_NAMES,
    output_file=OUTPUT_FILE,
    save_intermediate=SAVE_INTERMEDIATE,
)


Using data folder: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies

--- Loading input files ---
Loading and preprocessing finished in 4.3 seconds.

--- Unique firm-year (ID, FiscalPeriod) counts per input file ---
Earnings_Per_Share_Fiscal_Year_End.txt: 840,371 unique (ID, FiscalPeriod) combinations

--- Building base dataset ---
Base dataset has 1,888,303 rows and was built in 2.7 seconds.

--- Starting as-of merges ---
[1/1] Merging value column 'eps' ...
    Done in 10.6 seconds. Result currently has 1,888,303 rows.

--- Final dataset statistics ---
Unique (ID, FiscalPeriod) combinations with ALL value columns non-empty: 840,371

Empty (NaN) values per value column:
  - eps: 0 empty values

Final view written to:
/home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/data_Ec.txt
Total runtime: 26.3 seconds.


#### Es (Yearly Adaption)

In [15]:
"""
Summary:
This cell
1) reads multiple input files from OUTPUT_DIR,
2) builds a base table with all unique combinations of
   (ID, PIT Date, HistCurrency, FiscalPeriod, AnnPITValue_Period),
3) for each input file, performs an "as-of" merge:
   for every base row it takes the latest AnnPITValue from that dataset
   with the same (ID, HistCurrency, FiscalPeriod) and PIT Date <= base PIT Date.
   If the dataset has AnnPITValue_Period, the merge is also grouped by that
   period label; if it does NOT have AnnPITValue_Period, it is treated as
   period-agnostic ("can fit all" periods).
4) writes the final combined view to OUTPUT_FILE in OUTPUT_DIR.

Additional tracking added:
- For each imported dataset, the code prints how many unique
  firm-year combinations exist (unique ID × FiscalPeriod).
- After all merging is complete, the code counts how many
  firm-year combinations have **no missing values** across ANY
  of the added value columns.
- At the end, the code reports how many missing values exist for each
  value column (ca, cce, cl, etc.).

The final table has the columns:
    ID, PIT Date, HistCurrency, FiscalPeriod, AnnPITValue_Period,
    Value1, Value2, ..., ValueN
where each ValueX is defined by VALUE_COLUMN_NAMES in the config.
"""


from pathlib import Path
import time
import pandas as pd

# === CONFIG ===

# Folder where all input files are stored and where the output will be written.
# Temp_file_path_A must already be defined in your notebook, for example:
# Temp_file_path_A = f"{BASE_PATH}/Temp/TempAnomalies"
OUTPUT_DIR = Path(Temp_file_path_A)

# Field separator in all input and output text files
SEP = "|"

# Input files (1..N). They are all expected to be located in OUTPUT_DIR.
INPUT_FILES = [
    "Earnings_Per_Share_Fiscal_Year_End.txt",
    # add more file names here if needed ...
]

# Names for the value columns corresponding to INPUT_FILES (same length as INPUT_FILES)
VALUE_COLUMN_NAMES = [
    "eps",
    # add more names here, one for each input file ...
]

# Name of the final output file (will be written to OUTPUT_DIR)
OUTPUT_FILE = "data_Es.txt"

# Column names in the input files (assumed to be identical in all files)
ID_COL = "ID"
PIT_DATE_COL = "PIT Date"
HIST_CURR_COL = "HistCurrency"
FISCAL_PER_COL = "FiscalPeriod"
VALUE_COL = "AnnPITValue"
PERIOD_COL = "AnnPITValue_Period"   # NEW: period label column

BASE_COLS = [ID_COL, PIT_DATE_COL, HIST_CURR_COL, FISCAL_PER_COL]

SAVE_INTERMEDIATE = False
# ==============


# --- SANITY CHECK ---
if not OUTPUT_DIR.exists():
    raise FileNotFoundError(
        f"OUTPUT_DIR does not exist:\n{OUTPUT_DIR}\n"
        f"Please make sure Temp_file_path_A is set correctly."
    )
print("Using data folder:", OUTPUT_DIR.resolve())
# ---------------------


def load_dataset(path: Path) -> pd.DataFrame:
    """
    Load a single dataset from path and keep only the relevant columns.

    The function:
    - checks if the file exists,
    - reads it using the configured separator,
    - checks that all required columns are present,
    - keeps AnnPITValue_Period if present (and creates it as NA if not),
    - converts PIT Date to datetime,
    - casts ID to string,
    - converts AnnPITValue to numeric.
    """
    if not path.exists():
        raise FileNotFoundError(f"File not found: {path}")

    df = pd.read_csv(path, sep=SEP)

    needed_cols = BASE_COLS + [VALUE_COL]
    missing = [c for c in needed_cols if c not in df.columns]
    if missing:
        raise ValueError(f"Required columns {missing} are missing in file: {path}")

    # Keep base + AnnPITValue + (optional) AnnPITValue_Period
    extra_cols = [PERIOD_COL] if PERIOD_COL in df.columns else []
    df = df[needed_cols + extra_cols].copy()

    # Ensure AnnPITValue_Period exists in all datasets
    if PERIOD_COL not in df.columns:
        df[PERIOD_COL] = pd.NA

    # Re-order columns for consistency
    df = df[BASE_COLS + [PERIOD_COL, VALUE_COL]]

    # Ensure consistent data types
    df[PIT_DATE_COL] = pd.to_datetime(df[PIT_DATE_COL])
    df[ID_COL] = df[ID_COL].astype(str)
    df[VALUE_COL] = pd.to_numeric(df[VALUE_COL], errors="coerce")

    return df


def build_base_frame(dfs: list[pd.DataFrame]) -> pd.DataFrame:
    """
    Build the base dataset from a list of dataframes.

    Steps:
    - stack the identifier columns (including AnnPITValue_Period) from all dataframes,
    - drop duplicate combinations,
    - enforce correct data types,
    - sort the base dataset by (ID, HistCurrency, FiscalPeriod, AnnPITValue_Period, PIT Date).

    The result is the "skeleton" upon which all value columns will be merged.
    """
    # Concatenate identifier columns from all datasets (base cols + period col)
    base = pd.concat(
        [df[BASE_COLS + [PERIOD_COL]] for df in dfs],
        ignore_index=True
    )

    # Remove duplicate rows of identifiers
    base = base.drop_duplicates().reset_index(drop=True)

    # Ensure consistent types
    base[PIT_DATE_COL] = pd.to_datetime(base[PIT_DATE_COL])
    base[ID_COL] = base[ID_COL].astype(str)

    # Sort by identifier columns and date
    sort_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL, PERIOD_COL, PIT_DATE_COL]
    base = base.sort_values(sort_cols).reset_index(drop=True)

    return base


def asof_merge_one(base: pd.DataFrame, df: pd.DataFrame, new_col_name: str) -> pd.DataFrame:
    """
    Perform an "as-of" merge of one dataset into the base dataframe.

    Concept:
    - For each combination of (ID, HistCurrency, FiscalPeriod)
      and for each PIT Date in the base table,
      we want the latest AnnPITValue from df with
          df.PIT Date <= base.PIT Date
      for the same (ID, HistCurrency, FiscalPeriod).
    - If df has a non-empty AnnPITValue_Period column, the as-of grouping
      is also done by AnnPITValue_Period.
      If AnnPITValue_Period is all missing (e.g. revenue), df is treated
      as period-agnostic and can "fit all" periods.

    Implementation (vectorized, without merge_asof):
      (same as before, but grouping keys dynamically include PERIOD_COL
       only for period-aware datasets)
    """
    # Determine if this dataset is period-aware (has any non-NA period labels)
    has_period = PERIOD_COL in df.columns and df[PERIOD_COL].notna().any()

    # Work on copies to avoid modifying original base/df
    base_tmp = base.copy()
    base_tmp[new_col_name] = pd.NA
    base_tmp["__marker"] = "base"

    # Keep only identifier columns and the value column from df, then rename
    if has_period:
        df_tmp = df[BASE_COLS + [PERIOD_COL, VALUE_COL]].copy()
    else:
        df_tmp = df[BASE_COLS + [VALUE_COL]].copy()
        # Ensure PERIOD_COL exists but remains NA (period-agnostic)
        df_tmp[PERIOD_COL] = pd.NA

    df_tmp = df_tmp.rename(columns={VALUE_COL: new_col_name})
    df_tmp["__marker"] = "df"

    # Ensure consistent types for safety
    base_tmp[ID_COL] = base_tmp[ID_COL].astype(str)
    df_tmp[ID_COL] = df_tmp[ID_COL].astype(str)
    base_tmp[PIT_DATE_COL] = pd.to_datetime(base_tmp[PIT_DATE_COL])
    df_tmp[PIT_DATE_COL] = pd.to_datetime(df_tmp[PIT_DATE_COL])

    base_tmp[new_col_name] = pd.to_numeric(base_tmp[new_col_name], errors="coerce")
    df_tmp[new_col_name] = pd.to_numeric(df_tmp[new_col_name], errors="coerce")

    # Concatenate base rows and df rows
    combined = pd.concat([base_tmp, df_tmp], ignore_index=True)

    # Define an order so that df rows come before base rows on the same PIT Date
    marker_order = {"df": 0, "base": 1}
    combined["__order"] = combined["__marker"].map(marker_order).astype("int8")

    # Build sort and group keys
    group_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL]
    sort_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL]

    if has_period:
        # Period-aware dataset: also group by AnnPITValue_Period
        group_cols.append(PERIOD_COL)
        sort_cols.append(PERIOD_COL)

    sort_cols += [PIT_DATE_COL, "__order"]

    # Sort accordingly
    combined = combined.sort_values(sort_cols)

    # Forward-fill the value within each group to implement "as-of" logic
    combined[new_col_name] = combined.groupby(group_cols)[new_col_name].ffill()

    # Keep only rows that belong to the base dataset
    result = combined[combined["__marker"] == "base"].copy()

    # Drop helper columns and sort final result
    result = result.drop(columns=["__marker", "__order"])
    result = result.sort_values(
        [ID_COL, HIST_CURR_COL, FISCAL_PER_COL, PERIOD_COL, PIT_DATE_COL]
    ).reset_index(drop=True)

    return result


def build_and_save_variable(
    input_files,
    value_column_names,
    output_file,
    save_intermediate: bool = False,
) -> Path:
    """
    Build the final "view" for a variable based on multiple input files and save it.

    Steps:
    1) Validate arguments (non-empty, same length for files and column names).
    2) Load and preprocess each input file.
    3) Build the base dataset with all unique identifier combinations
       (including AnnPITValue_Period).
    4) For each input dataframe, perform an as-of merge of its AnnPITValue into the base,
       using AnnPITValue_Period in the grouping if present in that dataset.
    5) Keep only the base columns and the value columns.
    6) Write the final result to output_file in OUTPUT_DIR.
    """
    if len(input_files) == 0:
        raise ValueError("No INPUT_FILES were provided.")
    if len(input_files) != len(value_column_names):
        raise ValueError("INPUT_FILES and VALUE_COLUMN_NAMES must have the same length.")

    start_total = time.time()

    # Build full paths
    paths = [OUTPUT_DIR / f for f in input_files]

    print("\n--- Loading input files ---")
    t0 = time.time()
    dfs = [load_dataset(p) for p in paths]
    print(f"Loading and preprocessing finished in {time.time() - t0:.1f} seconds.")

    # Unique firm-year counts per input dataset
    print("\n--- Unique firm-year (ID, FiscalPeriod) counts per input file ---")
    for path, df in zip(paths, dfs):
        n_firm_years = df[[ID_COL, FISCAL_PER_COL]].drop_duplicates().shape[0]
        print(f"{path.name}: {n_firm_years:,} unique (ID, FiscalPeriod) combinations")

    print("\n--- Building base dataset ---")
    t0 = time.time()
    base = build_base_frame(dfs)
    print(
        f"Base dataset has {len(base):,} rows and was built in "
        f"{time.time() - t0:.1f} seconds."
    )

    print("\n--- Starting as-of merges ---")
    result = base
    for idx, (df, col_name) in enumerate(zip(dfs, value_column_names), start=1):
        print(f"[{idx}/{len(dfs)}] Merging value column '{col_name}' ...")
        t_merge = time.time()
        result = asof_merge_one(result, df, col_name)
        print(
            f"    Done in {time.time() - t_merge:.1f} seconds. "
            f"Result currently has {len(result):,} rows."
        )

        if save_intermediate:
            stem = output_file.rsplit(".", 1)[0]
            temp_out = OUTPUT_DIR / f"{stem}_partial_{idx}.txt"
            result.to_csv(temp_out, sep=SEP, index=False)
            print(f"    Intermediate file written to: {temp_out}")

    # Keep only the base identifier columns (including period) and the value columns
    final_cols = BASE_COLS + [PERIOD_COL] + value_column_names
    result = result[final_cols]

    # Final stats on firm-years and missing values
    print("\n--- Final dataset statistics ---")
    mask_complete = result[value_column_names].notna().all(axis=1)
    complete_firm_years = (
        result.loc[mask_complete, [ID_COL, FISCAL_PER_COL]]
        .drop_duplicates()
    )
    n_complete_firm_years = complete_firm_years.shape[0]
    print(
        f"Unique (ID, FiscalPeriod) combinations with ALL value columns non-empty: "
        f"{n_complete_firm_years:,}"
    )

    print("\nEmpty (NaN) values per value column:")
    for col in value_column_names:
        n_missing = result[col].isna().sum()
        print(f"  - {col}: {n_missing:,} empty values")

    # Final output
    out_path = OUTPUT_DIR / output_file
    result.to_csv(out_path, sep=SEP, index=False)

    print(f"\nFinal view written to:\n{out_path.resolve()}")
    print(f"Total runtime: {time.time() - start_total:.1f} seconds.")

    return out_path


# --- Execution ---
out_path = build_and_save_variable(
    input_files=INPUT_FILES,
    value_column_names=VALUE_COLUMN_NAMES,
    output_file=OUTPUT_FILE,
    save_intermediate=SAVE_INTERMEDIATE,
)


Using data folder: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies

--- Loading input files ---
Loading and preprocessing finished in 4.3 seconds.

--- Unique firm-year (ID, FiscalPeriod) counts per input file ---
Earnings_Per_Share_Fiscal_Year_End.txt: 840,371 unique (ID, FiscalPeriod) combinations

--- Building base dataset ---
Base dataset has 1,888,303 rows and was built in 2.8 seconds.

--- Starting as-of merges ---
[1/1] Merging value column 'eps' ...
    Done in 10.7 seconds. Result currently has 1,888,303 rows.

--- Final dataset statistics ---
Unique (ID, FiscalPeriod) combinations with ALL value columns non-empty: 840,371

Empty (NaN) values per value column:
  - eps: 0 empty values

Final view written to:
/home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/data_Es.txt
Total runtime: 25.2 seconds.


#### Gp

In [16]:
"""
Summary:
This cell
1) reads multiple input files from OUTPUT_DIR,
2) builds a base table with all unique combinations of
   (ID, PIT Date, HistCurrency, FiscalPeriod, AnnPITValue_Period),
3) for each input file, performs an "as-of" merge:
   for every base row it takes the latest AnnPITValue from that dataset
   with the same (ID, HistCurrency, FiscalPeriod) and PIT Date <= base PIT Date.
   If the dataset has AnnPITValue_Period, the merge is also grouped by that
   period label; if it does NOT have AnnPITValue_Period, it is treated as
   period-agnostic ("can fit all" periods).
4) writes the final combined view to OUTPUT_FILE in OUTPUT_DIR.

Additional tracking added:
- For each imported dataset, the code prints how many unique
  firm-year combinations exist (unique ID × FiscalPeriod).
- After all merging is complete, the code counts how many
  firm-year combinations have **no missing values** across ANY
  of the added value columns.
- At the end, the code reports how many missing values exist for each
  value column (ca, cce, cl, etc.).

The final table has the columns:
    ID, PIT Date, HistCurrency, FiscalPeriod, AnnPITValue_Period,
    Value1, Value2, ..., ValueN
where each ValueX is defined by VALUE_COLUMN_NAMES in the config.
"""


from pathlib import Path
import time
import pandas as pd

# === CONFIG ===

# Folder where all input files are stored and where the output will be written.
# Temp_file_path_A must already be defined in your notebook, for example:
# Temp_file_path_A = f"{BASE_PATH}/Temp/TempAnomalies"
OUTPUT_DIR = Path(Temp_file_path_A)

# Field separator in all input and output text files
SEP = "|"

# Input files (1..N). They are all expected to be located in OUTPUT_DIR.
INPUT_FILES = [
    "Net_Sales_or_Revenues.txt",
    "Cost_of_Goods_Sold_Excl_Depreciation.txt",
    "Total_Assets.txt",
    # add more file names here if needed ...
]

# Names for the value columns corresponding to INPUT_FILES (same length as INPUT_FILES)
VALUE_COLUMN_NAMES = [
    "rev",
    "cogs",
    "at",
    # add more names here, one for each input file ...
]

# Name of the final output file (will be written to OUTPUT_DIR)
OUTPUT_FILE = "data_Gp.txt"

# Column names in the input files (assumed to be identical in all files)
ID_COL = "ID"
PIT_DATE_COL = "PIT Date"
HIST_CURR_COL = "HistCurrency"
FISCAL_PER_COL = "FiscalPeriod"
VALUE_COL = "AnnPITValue"
PERIOD_COL = "AnnPITValue_Period"   # NEW: period label column

BASE_COLS = [ID_COL, PIT_DATE_COL, HIST_CURR_COL, FISCAL_PER_COL]

SAVE_INTERMEDIATE = False
# ==============


# --- SANITY CHECK ---
if not OUTPUT_DIR.exists():
    raise FileNotFoundError(
        f"OUTPUT_DIR does not exist:\n{OUTPUT_DIR}\n"
        f"Please make sure Temp_file_path_A is set correctly."
    )
print("Using data folder:", OUTPUT_DIR.resolve())
# ---------------------


def load_dataset(path: Path) -> pd.DataFrame:
    """
    Load a single dataset from path and keep only the relevant columns.

    The function:
    - checks if the file exists,
    - reads it using the configured separator,
    - checks that all required columns are present,
    - keeps AnnPITValue_Period if present (and creates it as NA if not),
    - converts PIT Date to datetime,
    - casts ID to string,
    - converts AnnPITValue to numeric.
    """
    if not path.exists():
        raise FileNotFoundError(f"File not found: {path}")

    df = pd.read_csv(path, sep=SEP)

    needed_cols = BASE_COLS + [VALUE_COL]
    missing = [c for c in needed_cols if c not in df.columns]
    if missing:
        raise ValueError(f"Required columns {missing} are missing in file: {path}")

    # Keep base + AnnPITValue + (optional) AnnPITValue_Period
    extra_cols = [PERIOD_COL] if PERIOD_COL in df.columns else []
    df = df[needed_cols + extra_cols].copy()

    # Ensure AnnPITValue_Period exists in all datasets
    if PERIOD_COL not in df.columns:
        df[PERIOD_COL] = pd.NA

    # Re-order columns for consistency
    df = df[BASE_COLS + [PERIOD_COL, VALUE_COL]]

    # Ensure consistent data types
    df[PIT_DATE_COL] = pd.to_datetime(df[PIT_DATE_COL])
    df[ID_COL] = df[ID_COL].astype(str)
    df[VALUE_COL] = pd.to_numeric(df[VALUE_COL], errors="coerce")

    return df


def build_base_frame(dfs: list[pd.DataFrame]) -> pd.DataFrame:
    """
    Build the base dataset from a list of dataframes.

    Steps:
    - stack the identifier columns (including AnnPITValue_Period) from all dataframes,
    - drop duplicate combinations,
    - enforce correct data types,
    - sort the base dataset by (ID, HistCurrency, FiscalPeriod, AnnPITValue_Period, PIT Date).

    The result is the "skeleton" upon which all value columns will be merged.
    """
    # Concatenate identifier columns from all datasets (base cols + period col)
    base = pd.concat(
        [df[BASE_COLS + [PERIOD_COL]] for df in dfs],
        ignore_index=True
    )

    # Remove duplicate rows of identifiers
    base = base.drop_duplicates().reset_index(drop=True)

    # Ensure consistent types
    base[PIT_DATE_COL] = pd.to_datetime(base[PIT_DATE_COL])
    base[ID_COL] = base[ID_COL].astype(str)

    # Sort by identifier columns and date
    sort_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL, PERIOD_COL, PIT_DATE_COL]
    base = base.sort_values(sort_cols).reset_index(drop=True)

    return base


def asof_merge_one(base: pd.DataFrame, df: pd.DataFrame, new_col_name: str) -> pd.DataFrame:
    """
    Perform an "as-of" merge of one dataset into the base dataframe.

    Concept:
    - For each combination of (ID, HistCurrency, FiscalPeriod)
      and for each PIT Date in the base table,
      we want the latest AnnPITValue from df with
          df.PIT Date <= base.PIT Date
      for the same (ID, HistCurrency, FiscalPeriod).
    - If df has a non-empty AnnPITValue_Period column, the as-of grouping
      is also done by AnnPITValue_Period.
      If AnnPITValue_Period is all missing (e.g. revenue), df is treated
      as period-agnostic and can "fit all" periods.

    Implementation (vectorized, without merge_asof):
      (same as before, but grouping keys dynamically include PERIOD_COL
       only for period-aware datasets)
    """
    # Determine if this dataset is period-aware (has any non-NA period labels)
    has_period = PERIOD_COL in df.columns and df[PERIOD_COL].notna().any()

    # Work on copies to avoid modifying original base/df
    base_tmp = base.copy()
    base_tmp[new_col_name] = pd.NA
    base_tmp["__marker"] = "base"

    # Keep only identifier columns and the value column from df, then rename
    if has_period:
        df_tmp = df[BASE_COLS + [PERIOD_COL, VALUE_COL]].copy()
    else:
        df_tmp = df[BASE_COLS + [VALUE_COL]].copy()
        # Ensure PERIOD_COL exists but remains NA (period-agnostic)
        df_tmp[PERIOD_COL] = pd.NA

    df_tmp = df_tmp.rename(columns={VALUE_COL: new_col_name})
    df_tmp["__marker"] = "df"

    # Ensure consistent types for safety
    base_tmp[ID_COL] = base_tmp[ID_COL].astype(str)
    df_tmp[ID_COL] = df_tmp[ID_COL].astype(str)
    base_tmp[PIT_DATE_COL] = pd.to_datetime(base_tmp[PIT_DATE_COL])
    df_tmp[PIT_DATE_COL] = pd.to_datetime(df_tmp[PIT_DATE_COL])

    base_tmp[new_col_name] = pd.to_numeric(base_tmp[new_col_name], errors="coerce")
    df_tmp[new_col_name] = pd.to_numeric(df_tmp[new_col_name], errors="coerce")

    # Concatenate base rows and df rows
    combined = pd.concat([base_tmp, df_tmp], ignore_index=True)

    # Define an order so that df rows come before base rows on the same PIT Date
    marker_order = {"df": 0, "base": 1}
    combined["__order"] = combined["__marker"].map(marker_order).astype("int8")

    # Build sort and group keys
    group_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL]
    sort_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL]

    if has_period:
        # Period-aware dataset: also group by AnnPITValue_Period
        group_cols.append(PERIOD_COL)
        sort_cols.append(PERIOD_COL)

    sort_cols += [PIT_DATE_COL, "__order"]

    # Sort accordingly
    combined = combined.sort_values(sort_cols)

    # Forward-fill the value within each group to implement "as-of" logic
    combined[new_col_name] = combined.groupby(group_cols)[new_col_name].ffill()

    # Keep only rows that belong to the base dataset
    result = combined[combined["__marker"] == "base"].copy()

    # Drop helper columns and sort final result
    result = result.drop(columns=["__marker", "__order"])
    result = result.sort_values(
        [ID_COL, HIST_CURR_COL, FISCAL_PER_COL, PERIOD_COL, PIT_DATE_COL]
    ).reset_index(drop=True)

    return result


def build_and_save_variable(
    input_files,
    value_column_names,
    output_file,
    save_intermediate: bool = False,
) -> Path:
    """
    Build the final "view" for a variable based on multiple input files and save it.

    Steps:
    1) Validate arguments (non-empty, same length for files and column names).
    2) Load and preprocess each input file.
    3) Build the base dataset with all unique identifier combinations
       (including AnnPITValue_Period).
    4) For each input dataframe, perform an as-of merge of its AnnPITValue into the base,
       using AnnPITValue_Period in the grouping if present in that dataset.
    5) Keep only the base columns and the value columns.
    6) Write the final result to output_file in OUTPUT_DIR.
    """
    if len(input_files) == 0:
        raise ValueError("No INPUT_FILES were provided.")
    if len(input_files) != len(value_column_names):
        raise ValueError("INPUT_FILES and VALUE_COLUMN_NAMES must have the same length.")

    start_total = time.time()

    # Build full paths
    paths = [OUTPUT_DIR / f for f in input_files]

    print("\n--- Loading input files ---")
    t0 = time.time()
    dfs = [load_dataset(p) for p in paths]
    print(f"Loading and preprocessing finished in {time.time() - t0:.1f} seconds.")

    # Unique firm-year counts per input dataset
    print("\n--- Unique firm-year (ID, FiscalPeriod) counts per input file ---")
    for path, df in zip(paths, dfs):
        n_firm_years = df[[ID_COL, FISCAL_PER_COL]].drop_duplicates().shape[0]
        print(f"{path.name}: {n_firm_years:,} unique (ID, FiscalPeriod) combinations")

    print("\n--- Building base dataset ---")
    t0 = time.time()
    base = build_base_frame(dfs)
    print(
        f"Base dataset has {len(base):,} rows and was built in "
        f"{time.time() - t0:.1f} seconds."
    )

    print("\n--- Starting as-of merges ---")
    result = base
    for idx, (df, col_name) in enumerate(zip(dfs, value_column_names), start=1):
        print(f"[{idx}/{len(dfs)}] Merging value column '{col_name}' ...")
        t_merge = time.time()
        result = asof_merge_one(result, df, col_name)
        print(
            f"    Done in {time.time() - t_merge:.1f} seconds. "
            f"Result currently has {len(result):,} rows."
        )

        if save_intermediate:
            stem = output_file.rsplit(".", 1)[0]
            temp_out = OUTPUT_DIR / f"{stem}_partial_{idx}.txt"
            result.to_csv(temp_out, sep=SEP, index=False)
            print(f"    Intermediate file written to: {temp_out}")

    # Keep only the base identifier columns (including period) and the value columns
    final_cols = BASE_COLS + [PERIOD_COL] + value_column_names
    result = result[final_cols]

    # Final stats on firm-years and missing values
    print("\n--- Final dataset statistics ---")
    mask_complete = result[value_column_names].notna().all(axis=1)
    complete_firm_years = (
        result.loc[mask_complete, [ID_COL, FISCAL_PER_COL]]
        .drop_duplicates()
    )
    n_complete_firm_years = complete_firm_years.shape[0]
    print(
        f"Unique (ID, FiscalPeriod) combinations with ALL value columns non-empty: "
        f"{n_complete_firm_years:,}"
    )

    print("\nEmpty (NaN) values per value column:")
    for col in value_column_names:
        n_missing = result[col].isna().sum()
        print(f"  - {col}: {n_missing:,} empty values")

    # Final output
    out_path = OUTPUT_DIR / output_file
    result.to_csv(out_path, sep=SEP, index=False)

    print(f"\nFinal view written to:\n{out_path.resolve()}")
    print(f"Total runtime: {time.time() - start_total:.1f} seconds.")

    return out_path


# --- Execution ---
out_path = build_and_save_variable(
    input_files=INPUT_FILES,
    value_column_names=VALUE_COLUMN_NAMES,
    output_file=OUTPUT_FILE,
    save_intermediate=SAVE_INTERMEDIATE,
)


Using data folder: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies

--- Loading input files ---
Loading and preprocessing finished in 6.5 seconds.

--- Unique firm-year (ID, FiscalPeriod) counts per input file ---
Net_Sales_or_Revenues.txt: 817,863 unique (ID, FiscalPeriod) combinations
Cost_of_Goods_Sold_Excl_Depreciation.txt: 818,491 unique (ID, FiscalPeriod) combinations
Total_Assets.txt: 871,391 unique (ID, FiscalPeriod) combinations

--- Building base dataset ---
Base dataset has 4,139,889 rows and was built in 7.5 seconds.

--- Starting as-of merges ---
[1/3] Merging value column 'rev' ...
    Done in 8.7 seconds. Result currently has 4,139,889 rows.
[2/3] Merging value column 'cogs' ...
    Done in 9.2 seconds. Result currently has 4,139,889 rows.
[3/3] Merging value column 'at' ...
    Done in 8.5 seconds. Result currently has 4,139,889 rows.

--- Final dataset statistics ---
Unique (ID, FiscalPeriod) combinations with ALL value columns non-empty: 745,149

Empty (NaN) 

#### Ig

In [17]:
"""
Summary:
This cell
1) reads multiple input files from OUTPUT_DIR,
2) builds a base table with all unique combinations of
   (ID, PIT Date, HistCurrency, FiscalPeriod, AnnPITValue_Period),
3) for each input file, performs an "as-of" merge:
   for every base row it takes the latest AnnPITValue from that dataset
   with the same (ID, HistCurrency, FiscalPeriod) and PIT Date <= base PIT Date.
   If the dataset has AnnPITValue_Period, the merge is also grouped by that
   period label; if it does NOT have AnnPITValue_Period, it is treated as
   period-agnostic ("can fit all" periods).
4) writes the final combined view to OUTPUT_FILE in OUTPUT_DIR.

Additional tracking added:
- For each imported dataset, the code prints how many unique
  firm-year combinations exist (unique ID × FiscalPeriod).
- After all merging is complete, the code counts how many
  firm-year combinations have **no missing values** across ANY
  of the added value columns.
- At the end, the code reports how many missing values exist for each
  value column (ca, cce, cl, etc.).

The final table has the columns:
    ID, PIT Date, HistCurrency, FiscalPeriod, AnnPITValue_Period,
    Value1, Value2, ..., ValueN
where each ValueX is defined by VALUE_COLUMN_NAMES in the config.
"""


from pathlib import Path
import time
import pandas as pd

# === CONFIG ===

# Folder where all input files are stored and where the output will be written.
# Temp_file_path_A must already be defined in your notebook, for example:
# Temp_file_path_A = f"{BASE_PATH}/Temp/TempAnomalies"
OUTPUT_DIR = Path(Temp_file_path_A)

# Field separator in all input and output text files
SEP = "|"

# Input files (1..N). They are all expected to be located in OUTPUT_DIR.
INPUT_FILES = [
    "Total_Assets.txt",
    "Inventories___Total.txt",
    # add more file names here if needed ...
]

# Names for the value columns corresponding to INPUT_FILES (same length as INPUT_FILES)
VALUE_COLUMN_NAMES = [
    "at",
    "inv",
    # add more names here, one for each input file ...
]

# Name of the final output file (will be written to OUTPUT_DIR)
OUTPUT_FILE = "data_Ig.txt"

# Column names in the input files (assumed to be identical in all files)
ID_COL = "ID"
PIT_DATE_COL = "PIT Date"
HIST_CURR_COL = "HistCurrency"
FISCAL_PER_COL = "FiscalPeriod"
VALUE_COL = "AnnPITValue"
PERIOD_COL = "AnnPITValue_Period"   # NEW: period label column

BASE_COLS = [ID_COL, PIT_DATE_COL, HIST_CURR_COL, FISCAL_PER_COL]

SAVE_INTERMEDIATE = False
# ==============


# --- SANITY CHECK ---
if not OUTPUT_DIR.exists():
    raise FileNotFoundError(
        f"OUTPUT_DIR does not exist:\n{OUTPUT_DIR}\n"
        f"Please make sure Temp_file_path_A is set correctly."
    )
print("Using data folder:", OUTPUT_DIR.resolve())
# ---------------------


def load_dataset(path: Path) -> pd.DataFrame:
    """
    Load a single dataset from path and keep only the relevant columns.

    The function:
    - checks if the file exists,
    - reads it using the configured separator,
    - checks that all required columns are present,
    - keeps AnnPITValue_Period if present (and creates it as NA if not),
    - converts PIT Date to datetime,
    - casts ID to string,
    - converts AnnPITValue to numeric.
    """
    if not path.exists():
        raise FileNotFoundError(f"File not found: {path}")

    df = pd.read_csv(path, sep=SEP)

    needed_cols = BASE_COLS + [VALUE_COL]
    missing = [c for c in needed_cols if c not in df.columns]
    if missing:
        raise ValueError(f"Required columns {missing} are missing in file: {path}")

    # Keep base + AnnPITValue + (optional) AnnPITValue_Period
    extra_cols = [PERIOD_COL] if PERIOD_COL in df.columns else []
    df = df[needed_cols + extra_cols].copy()

    # Ensure AnnPITValue_Period exists in all datasets
    if PERIOD_COL not in df.columns:
        df[PERIOD_COL] = pd.NA

    # Re-order columns for consistency
    df = df[BASE_COLS + [PERIOD_COL, VALUE_COL]]

    # Ensure consistent data types
    df[PIT_DATE_COL] = pd.to_datetime(df[PIT_DATE_COL])
    df[ID_COL] = df[ID_COL].astype(str)
    df[VALUE_COL] = pd.to_numeric(df[VALUE_COL], errors="coerce")

    return df


def build_base_frame(dfs: list[pd.DataFrame]) -> pd.DataFrame:
    """
    Build the base dataset from a list of dataframes.

    Steps:
    - stack the identifier columns (including AnnPITValue_Period) from all dataframes,
    - drop duplicate combinations,
    - enforce correct data types,
    - sort the base dataset by (ID, HistCurrency, FiscalPeriod, AnnPITValue_Period, PIT Date).

    The result is the "skeleton" upon which all value columns will be merged.
    """
    # Concatenate identifier columns from all datasets (base cols + period col)
    base = pd.concat(
        [df[BASE_COLS + [PERIOD_COL]] for df in dfs],
        ignore_index=True
    )

    # Remove duplicate rows of identifiers
    base = base.drop_duplicates().reset_index(drop=True)

    # Ensure consistent types
    base[PIT_DATE_COL] = pd.to_datetime(base[PIT_DATE_COL])
    base[ID_COL] = base[ID_COL].astype(str)

    # Sort by identifier columns and date
    sort_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL, PERIOD_COL, PIT_DATE_COL]
    base = base.sort_values(sort_cols).reset_index(drop=True)

    return base


def asof_merge_one(base: pd.DataFrame, df: pd.DataFrame, new_col_name: str) -> pd.DataFrame:
    """
    Perform an "as-of" merge of one dataset into the base dataframe.

    Concept:
    - For each combination of (ID, HistCurrency, FiscalPeriod)
      and for each PIT Date in the base table,
      we want the latest AnnPITValue from df with
          df.PIT Date <= base.PIT Date
      for the same (ID, HistCurrency, FiscalPeriod).
    - If df has a non-empty AnnPITValue_Period column, the as-of grouping
      is also done by AnnPITValue_Period.
      If AnnPITValue_Period is all missing (e.g. revenue), df is treated
      as period-agnostic and can "fit all" periods.

    Implementation (vectorized, without merge_asof):
      (same as before, but grouping keys dynamically include PERIOD_COL
       only for period-aware datasets)
    """
    # Determine if this dataset is period-aware (has any non-NA period labels)
    has_period = PERIOD_COL in df.columns and df[PERIOD_COL].notna().any()

    # Work on copies to avoid modifying original base/df
    base_tmp = base.copy()
    base_tmp[new_col_name] = pd.NA
    base_tmp["__marker"] = "base"

    # Keep only identifier columns and the value column from df, then rename
    if has_period:
        df_tmp = df[BASE_COLS + [PERIOD_COL, VALUE_COL]].copy()
    else:
        df_tmp = df[BASE_COLS + [VALUE_COL]].copy()
        # Ensure PERIOD_COL exists but remains NA (period-agnostic)
        df_tmp[PERIOD_COL] = pd.NA

    df_tmp = df_tmp.rename(columns={VALUE_COL: new_col_name})
    df_tmp["__marker"] = "df"

    # Ensure consistent types for safety
    base_tmp[ID_COL] = base_tmp[ID_COL].astype(str)
    df_tmp[ID_COL] = df_tmp[ID_COL].astype(str)
    base_tmp[PIT_DATE_COL] = pd.to_datetime(base_tmp[PIT_DATE_COL])
    df_tmp[PIT_DATE_COL] = pd.to_datetime(df_tmp[PIT_DATE_COL])

    base_tmp[new_col_name] = pd.to_numeric(base_tmp[new_col_name], errors="coerce")
    df_tmp[new_col_name] = pd.to_numeric(df_tmp[new_col_name], errors="coerce")

    # Concatenate base rows and df rows
    combined = pd.concat([base_tmp, df_tmp], ignore_index=True)

    # Define an order so that df rows come before base rows on the same PIT Date
    marker_order = {"df": 0, "base": 1}
    combined["__order"] = combined["__marker"].map(marker_order).astype("int8")

    # Build sort and group keys
    group_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL]
    sort_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL]

    if has_period:
        # Period-aware dataset: also group by AnnPITValue_Period
        group_cols.append(PERIOD_COL)
        sort_cols.append(PERIOD_COL)

    sort_cols += [PIT_DATE_COL, "__order"]

    # Sort accordingly
    combined = combined.sort_values(sort_cols)

    # Forward-fill the value within each group to implement "as-of" logic
    combined[new_col_name] = combined.groupby(group_cols)[new_col_name].ffill()

    # Keep only rows that belong to the base dataset
    result = combined[combined["__marker"] == "base"].copy()

    # Drop helper columns and sort final result
    result = result.drop(columns=["__marker", "__order"])
    result = result.sort_values(
        [ID_COL, HIST_CURR_COL, FISCAL_PER_COL, PERIOD_COL, PIT_DATE_COL]
    ).reset_index(drop=True)

    return result


def build_and_save_variable(
    input_files,
    value_column_names,
    output_file,
    save_intermediate: bool = False,
) -> Path:
    """
    Build the final "view" for a variable based on multiple input files and save it.

    Steps:
    1) Validate arguments (non-empty, same length for files and column names).
    2) Load and preprocess each input file.
    3) Build the base dataset with all unique identifier combinations
       (including AnnPITValue_Period).
    4) For each input dataframe, perform an as-of merge of its AnnPITValue into the base,
       using AnnPITValue_Period in the grouping if present in that dataset.
    5) Keep only the base columns and the value columns.
    6) Write the final result to output_file in OUTPUT_DIR.
    """
    if len(input_files) == 0:
        raise ValueError("No INPUT_FILES were provided.")
    if len(input_files) != len(value_column_names):
        raise ValueError("INPUT_FILES and VALUE_COLUMN_NAMES must have the same length.")

    start_total = time.time()

    # Build full paths
    paths = [OUTPUT_DIR / f for f in input_files]

    print("\n--- Loading input files ---")
    t0 = time.time()
    dfs = [load_dataset(p) for p in paths]
    print(f"Loading and preprocessing finished in {time.time() - t0:.1f} seconds.")

    # Unique firm-year counts per input dataset
    print("\n--- Unique firm-year (ID, FiscalPeriod) counts per input file ---")
    for path, df in zip(paths, dfs):
        n_firm_years = df[[ID_COL, FISCAL_PER_COL]].drop_duplicates().shape[0]
        print(f"{path.name}: {n_firm_years:,} unique (ID, FiscalPeriod) combinations")

    print("\n--- Building base dataset ---")
    t0 = time.time()
    base = build_base_frame(dfs)
    print(
        f"Base dataset has {len(base):,} rows and was built in "
        f"{time.time() - t0:.1f} seconds."
    )

    print("\n--- Starting as-of merges ---")
    result = base
    for idx, (df, col_name) in enumerate(zip(dfs, value_column_names), start=1):
        print(f"[{idx}/{len(dfs)}] Merging value column '{col_name}' ...")
        t_merge = time.time()
        result = asof_merge_one(result, df, col_name)
        print(
            f"    Done in {time.time() - t_merge:.1f} seconds. "
            f"Result currently has {len(result):,} rows."
        )

        if save_intermediate:
            stem = output_file.rsplit(".", 1)[0]
            temp_out = OUTPUT_DIR / f"{stem}_partial_{idx}.txt"
            result.to_csv(temp_out, sep=SEP, index=False)
            print(f"    Intermediate file written to: {temp_out}")

    # Keep only the base identifier columns (including period) and the value columns
    final_cols = BASE_COLS + [PERIOD_COL] + value_column_names
    result = result[final_cols]

    # Final stats on firm-years and missing values
    print("\n--- Final dataset statistics ---")
    mask_complete = result[value_column_names].notna().all(axis=1)
    complete_firm_years = (
        result.loc[mask_complete, [ID_COL, FISCAL_PER_COL]]
        .drop_duplicates()
    )
    n_complete_firm_years = complete_firm_years.shape[0]
    print(
        f"Unique (ID, FiscalPeriod) combinations with ALL value columns non-empty: "
        f"{n_complete_firm_years:,}"
    )

    print("\nEmpty (NaN) values per value column:")
    for col in value_column_names:
        n_missing = result[col].isna().sum()
        print(f"  - {col}: {n_missing:,} empty values")

    # Final output
    out_path = OUTPUT_DIR / output_file
    result.to_csv(out_path, sep=SEP, index=False)

    print(f"\nFinal view written to:\n{out_path.resolve()}")
    print(f"Total runtime: {time.time() - start_total:.1f} seconds.")

    return out_path


# --- Execution ---
out_path = build_and_save_variable(
    input_files=INPUT_FILES,
    value_column_names=VALUE_COLUMN_NAMES,
    output_file=OUTPUT_FILE,
    save_intermediate=SAVE_INTERMEDIATE,
)


Using data folder: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies

--- Loading input files ---
Loading and preprocessing finished in 4.6 seconds.

--- Unique firm-year (ID, FiscalPeriod) counts per input file ---
Total_Assets.txt: 871,391 unique (ID, FiscalPeriod) combinations
Inventories___Total.txt: 875,280 unique (ID, FiscalPeriod) combinations

--- Building base dataset ---
Base dataset has 2,549,446 rows and was built in 2.7 seconds.

--- Starting as-of merges ---
[1/2] Merging value column 'at' ...
    Done in 5.7 seconds. Result currently has 2,549,446 rows.
[2/2] Merging value column 'inv' ...
    Done in 5.7 seconds. Result currently has 2,549,446 rows.

--- Final dataset statistics ---
Unique (ID, FiscalPeriod) combinations with ALL value columns non-empty: 862,033

Empty (NaN) values per value column:
  - at: 58,231 empty values
  - inv: 286,582 empty values

Final view written to:
/home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/data_Ig.txt
Total runtime: 

#### Inv

In [18]:
"""
Summary:
This cell
1) reads multiple input files from OUTPUT_DIR,
2) builds a base table with all unique combinations of
   (ID, PIT Date, HistCurrency, FiscalPeriod, AnnPITValue_Period),
3) for each input file, performs an "as-of" merge:
   for every base row it takes the latest AnnPITValue from that dataset
   with the same (ID, HistCurrency, FiscalPeriod) and PIT Date <= base PIT Date.
   If the dataset has AnnPITValue_Period, the merge is also grouped by that
   period label; if it does NOT have AnnPITValue_Period, it is treated as
   period-agnostic ("can fit all" periods).
4) writes the final combined view to OUTPUT_FILE in OUTPUT_DIR.

Additional tracking added:
- For each imported dataset, the code prints how many unique
  firm-year combinations exist (unique ID × FiscalPeriod).
- After all merging is complete, the code counts how many
  firm-year combinations have **no missing values** across ANY
  of the added value columns.
- At the end, the code reports how many missing values exist for each
  value column (ca, cce, cl, etc.).

The final table has the columns:
    ID, PIT Date, HistCurrency, FiscalPeriod, AnnPITValue_Period,
    Value1, Value2, ..., ValueN
where each ValueX is defined by VALUE_COLUMN_NAMES in the config.
"""


from pathlib import Path
import time
import pandas as pd

# === CONFIG ===

# Folder where all input files are stored and where the output will be written.
# Temp_file_path_A must already be defined in your notebook, for example:
# Temp_file_path_A = f"{BASE_PATH}/Temp/TempAnomalies"
OUTPUT_DIR = Path(Temp_file_path_A)

# Field separator in all input and output text files
SEP = "|"

# Input files (1..N). They are all expected to be located in OUTPUT_DIR.
INPUT_FILES = [
    "Net_Sales_or_Revenues.txt",
    "Capital_Expenditures_Addtns_to_Fixed_Assets.txt",
    # add more file names here if needed ...
]

# Names for the value columns corresponding to INPUT_FILES (same length as INPUT_FILES)
VALUE_COLUMN_NAMES = [
    "rev",
    "capex",
    # add more names here, one for each input file ...
]

# Name of the final output file (will be written to OUTPUT_DIR)
OUTPUT_FILE = "data_Inv.txt"

# Column names in the input files (assumed to be identical in all files)
ID_COL = "ID"
PIT_DATE_COL = "PIT Date"
HIST_CURR_COL = "HistCurrency"
FISCAL_PER_COL = "FiscalPeriod"
VALUE_COL = "AnnPITValue"
PERIOD_COL = "AnnPITValue_Period"   # NEW: period label column

BASE_COLS = [ID_COL, PIT_DATE_COL, HIST_CURR_COL, FISCAL_PER_COL]

SAVE_INTERMEDIATE = False
# ==============


# --- SANITY CHECK ---
if not OUTPUT_DIR.exists():
    raise FileNotFoundError(
        f"OUTPUT_DIR does not exist:\n{OUTPUT_DIR}\n"
        f"Please make sure Temp_file_path_A is set correctly."
    )
print("Using data folder:", OUTPUT_DIR.resolve())
# ---------------------


def load_dataset(path: Path) -> pd.DataFrame:
    """
    Load a single dataset from path and keep only the relevant columns.

    The function:
    - checks if the file exists,
    - reads it using the configured separator,
    - checks that all required columns are present,
    - keeps AnnPITValue_Period if present (and creates it as NA if not),
    - converts PIT Date to datetime,
    - casts ID to string,
    - converts AnnPITValue to numeric.
    """
    if not path.exists():
        raise FileNotFoundError(f"File not found: {path}")

    df = pd.read_csv(path, sep=SEP)

    needed_cols = BASE_COLS + [VALUE_COL]
    missing = [c for c in needed_cols if c not in df.columns]
    if missing:
        raise ValueError(f"Required columns {missing} are missing in file: {path}")

    # Keep base + AnnPITValue + (optional) AnnPITValue_Period
    extra_cols = [PERIOD_COL] if PERIOD_COL in df.columns else []
    df = df[needed_cols + extra_cols].copy()

    # Ensure AnnPITValue_Period exists in all datasets
    if PERIOD_COL not in df.columns:
        df[PERIOD_COL] = pd.NA

    # Re-order columns for consistency
    df = df[BASE_COLS + [PERIOD_COL, VALUE_COL]]

    # Ensure consistent data types
    df[PIT_DATE_COL] = pd.to_datetime(df[PIT_DATE_COL])
    df[ID_COL] = df[ID_COL].astype(str)
    df[VALUE_COL] = pd.to_numeric(df[VALUE_COL], errors="coerce")

    return df


def build_base_frame(dfs: list[pd.DataFrame]) -> pd.DataFrame:
    """
    Build the base dataset from a list of dataframes.

    Steps:
    - stack the identifier columns (including AnnPITValue_Period) from all dataframes,
    - drop duplicate combinations,
    - enforce correct data types,
    - sort the base dataset by (ID, HistCurrency, FiscalPeriod, AnnPITValue_Period, PIT Date).

    The result is the "skeleton" upon which all value columns will be merged.
    """
    # Concatenate identifier columns from all datasets (base cols + period col)
    base = pd.concat(
        [df[BASE_COLS + [PERIOD_COL]] for df in dfs],
        ignore_index=True
    )

    # Remove duplicate rows of identifiers
    base = base.drop_duplicates().reset_index(drop=True)

    # Ensure consistent types
    base[PIT_DATE_COL] = pd.to_datetime(base[PIT_DATE_COL])
    base[ID_COL] = base[ID_COL].astype(str)

    # Sort by identifier columns and date
    sort_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL, PERIOD_COL, PIT_DATE_COL]
    base = base.sort_values(sort_cols).reset_index(drop=True)

    return base


def asof_merge_one(base: pd.DataFrame, df: pd.DataFrame, new_col_name: str) -> pd.DataFrame:
    """
    Perform an "as-of" merge of one dataset into the base dataframe.

    Concept:
    - For each combination of (ID, HistCurrency, FiscalPeriod)
      and for each PIT Date in the base table,
      we want the latest AnnPITValue from df with
          df.PIT Date <= base.PIT Date
      for the same (ID, HistCurrency, FiscalPeriod).
    - If df has a non-empty AnnPITValue_Period column, the as-of grouping
      is also done by AnnPITValue_Period.
      If AnnPITValue_Period is all missing (e.g. revenue), df is treated
      as period-agnostic and can "fit all" periods.

    Implementation (vectorized, without merge_asof):
      (same as before, but grouping keys dynamically include PERIOD_COL
       only for period-aware datasets)
    """
    # Determine if this dataset is period-aware (has any non-NA period labels)
    has_period = PERIOD_COL in df.columns and df[PERIOD_COL].notna().any()

    # Work on copies to avoid modifying original base/df
    base_tmp = base.copy()
    base_tmp[new_col_name] = pd.NA
    base_tmp["__marker"] = "base"

    # Keep only identifier columns and the value column from df, then rename
    if has_period:
        df_tmp = df[BASE_COLS + [PERIOD_COL, VALUE_COL]].copy()
    else:
        df_tmp = df[BASE_COLS + [VALUE_COL]].copy()
        # Ensure PERIOD_COL exists but remains NA (period-agnostic)
        df_tmp[PERIOD_COL] = pd.NA

    df_tmp = df_tmp.rename(columns={VALUE_COL: new_col_name})
    df_tmp["__marker"] = "df"

    # Ensure consistent types for safety
    base_tmp[ID_COL] = base_tmp[ID_COL].astype(str)
    df_tmp[ID_COL] = df_tmp[ID_COL].astype(str)
    base_tmp[PIT_DATE_COL] = pd.to_datetime(base_tmp[PIT_DATE_COL])
    df_tmp[PIT_DATE_COL] = pd.to_datetime(df_tmp[PIT_DATE_COL])

    base_tmp[new_col_name] = pd.to_numeric(base_tmp[new_col_name], errors="coerce")
    df_tmp[new_col_name] = pd.to_numeric(df_tmp[new_col_name], errors="coerce")

    # Concatenate base rows and df rows
    combined = pd.concat([base_tmp, df_tmp], ignore_index=True)

    # Define an order so that df rows come before base rows on the same PIT Date
    marker_order = {"df": 0, "base": 1}
    combined["__order"] = combined["__marker"].map(marker_order).astype("int8")

    # Build sort and group keys
    group_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL]
    sort_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL]

    if has_period:
        # Period-aware dataset: also group by AnnPITValue_Period
        group_cols.append(PERIOD_COL)
        sort_cols.append(PERIOD_COL)

    sort_cols += [PIT_DATE_COL, "__order"]

    # Sort accordingly
    combined = combined.sort_values(sort_cols)

    # Forward-fill the value within each group to implement "as-of" logic
    combined[new_col_name] = combined.groupby(group_cols)[new_col_name].ffill()

    # Keep only rows that belong to the base dataset
    result = combined[combined["__marker"] == "base"].copy()

    # Drop helper columns and sort final result
    result = result.drop(columns=["__marker", "__order"])
    result = result.sort_values(
        [ID_COL, HIST_CURR_COL, FISCAL_PER_COL, PERIOD_COL, PIT_DATE_COL]
    ).reset_index(drop=True)

    return result


def build_and_save_variable(
    input_files,
    value_column_names,
    output_file,
    save_intermediate: bool = False,
) -> Path:
    """
    Build the final "view" for a variable based on multiple input files and save it.

    Steps:
    1) Validate arguments (non-empty, same length for files and column names).
    2) Load and preprocess each input file.
    3) Build the base dataset with all unique identifier combinations
       (including AnnPITValue_Period).
    4) For each input dataframe, perform an as-of merge of its AnnPITValue into the base,
       using AnnPITValue_Period in the grouping if present in that dataset.
    5) Keep only the base columns and the value columns.
    6) Write the final result to output_file in OUTPUT_DIR.
    """
    if len(input_files) == 0:
        raise ValueError("No INPUT_FILES were provided.")
    if len(input_files) != len(value_column_names):
        raise ValueError("INPUT_FILES and VALUE_COLUMN_NAMES must have the same length.")

    start_total = time.time()

    # Build full paths
    paths = [OUTPUT_DIR / f for f in input_files]

    print("\n--- Loading input files ---")
    t0 = time.time()
    dfs = [load_dataset(p) for p in paths]
    print(f"Loading and preprocessing finished in {time.time() - t0:.1f} seconds.")

    # Unique firm-year counts per input dataset
    print("\n--- Unique firm-year (ID, FiscalPeriod) counts per input file ---")
    for path, df in zip(paths, dfs):
        n_firm_years = df[[ID_COL, FISCAL_PER_COL]].drop_duplicates().shape[0]
        print(f"{path.name}: {n_firm_years:,} unique (ID, FiscalPeriod) combinations")

    print("\n--- Building base dataset ---")
    t0 = time.time()
    base = build_base_frame(dfs)
    print(
        f"Base dataset has {len(base):,} rows and was built in "
        f"{time.time() - t0:.1f} seconds."
    )

    print("\n--- Starting as-of merges ---")
    result = base
    for idx, (df, col_name) in enumerate(zip(dfs, value_column_names), start=1):
        print(f"[{idx}/{len(dfs)}] Merging value column '{col_name}' ...")
        t_merge = time.time()
        result = asof_merge_one(result, df, col_name)
        print(
            f"    Done in {time.time() - t_merge:.1f} seconds. "
            f"Result currently has {len(result):,} rows."
        )

        if save_intermediate:
            stem = output_file.rsplit(".", 1)[0]
            temp_out = OUTPUT_DIR / f"{stem}_partial_{idx}.txt"
            result.to_csv(temp_out, sep=SEP, index=False)
            print(f"    Intermediate file written to: {temp_out}")

    # Keep only the base identifier columns (including period) and the value columns
    final_cols = BASE_COLS + [PERIOD_COL] + value_column_names
    result = result[final_cols]

    # Final stats on firm-years and missing values
    print("\n--- Final dataset statistics ---")
    mask_complete = result[value_column_names].notna().all(axis=1)
    complete_firm_years = (
        result.loc[mask_complete, [ID_COL, FISCAL_PER_COL]]
        .drop_duplicates()
    )
    n_complete_firm_years = complete_firm_years.shape[0]
    print(
        f"Unique (ID, FiscalPeriod) combinations with ALL value columns non-empty: "
        f"{n_complete_firm_years:,}"
    )

    print("\nEmpty (NaN) values per value column:")
    for col in value_column_names:
        n_missing = result[col].isna().sum()
        print(f"  - {col}: {n_missing:,} empty values")

    # Final output
    out_path = OUTPUT_DIR / output_file
    result.to_csv(out_path, sep=SEP, index=False)

    print(f"\nFinal view written to:\n{out_path.resolve()}")
    print(f"Total runtime: {time.time() - start_total:.1f} seconds.")

    return out_path


# --- Execution ---
out_path = build_and_save_variable(
    input_files=INPUT_FILES,
    value_column_names=VALUE_COLUMN_NAMES,
    output_file=OUTPUT_FILE,
    save_intermediate=SAVE_INTERMEDIATE,
)


Using data folder: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies

--- Loading input files ---
Loading and preprocessing finished in 3.1 seconds.

--- Unique firm-year (ID, FiscalPeriod) counts per input file ---
Net_Sales_or_Revenues.txt: 817,863 unique (ID, FiscalPeriod) combinations
Capital_Expenditures_Addtns_to_Fixed_Assets.txt: 790,635 unique (ID, FiscalPeriod) combinations

--- Building base dataset ---
Base dataset has 1,550,264 rows and was built in 3.8 seconds.

--- Starting as-of merges ---
[1/2] Merging value column 'rev' ...
    Done in 5.9 seconds. Result currently has 1,550,264 rows.
[2/2] Merging value column 'capex' ...
    Done in 4.7 seconds. Result currently has 1,550,264 rows.

--- Final dataset statistics ---
Unique (ID, FiscalPeriod) combinations with ALL value columns non-empty: 770,345

Empty (NaN) values per value column:
  - rev: 31,207 empty values
  - capex: 258,934 empty values

Final view written to:
/home/jovyan/work/hpool1/pseidel/test/Temp/Te

#### Ltg

In [19]:
"""
Summary:
This cell
1) reads multiple input files from OUTPUT_DIR,
2) builds a base table with all unique combinations of
   (ID, PIT Date, HistCurrency, FiscalPeriod, AnnPITValue_Period),
3) for each input file, performs an "as-of" merge:
   for every base row it takes the latest AnnPITValue from that dataset
   with the same (ID, HistCurrency, FiscalPeriod) and PIT Date <= base PIT Date.
   If the dataset has AnnPITValue_Period, the merge is also grouped by that
   period label; if it does NOT have AnnPITValue_Period, it is treated as
   period-agnostic ("can fit all" periods).
4) writes the final combined view to OUTPUT_FILE in OUTPUT_DIR.

Additional tracking added:
- For each imported dataset, the code prints how many unique
  firm-year combinations exist (unique ID × FiscalPeriod).
- After all merging is complete, the code counts how many
  firm-year combinations have **no missing values** across ANY
  of the added value columns.
- At the end, the code reports how many missing values exist for each
  value column (ca, cce, cl, etc.).

The final table has the columns:
    ID, PIT Date, HistCurrency, FiscalPeriod, AnnPITValue_Period,
    Value1, Value2, ..., ValueN
where each ValueX is defined by VALUE_COLUMN_NAMES in the config.
"""


from pathlib import Path
import time
import pandas as pd

# === CONFIG ===

# Folder where all input files are stored and where the output will be written.
# Temp_file_path_A must already be defined in your notebook, for example:
# Temp_file_path_A = f"{BASE_PATH}/Temp/TempAnomalies"
OUTPUT_DIR = Path(Temp_file_path_A)

# Field separator in all input and output text files
SEP = "|"

# Input files (1..N). They are all expected to be located in OUTPUT_DIR.
INPUT_FILES = [
    "Total_Assets.txt",
    "Current_Assets___Total.txt",
    "Cash__Short_Term_Investments.txt",
    "Current_Liabilities___Total.txt",
    "Short_Term_Debt__Current_Portion_of_LT_Debt.txt",
    "Income_Taxes_Payable.txt",
    "Depreciation_Depletion__Amortization.txt",
    "ReceivablesNet.txt",
    "Inventories___Total.txt",
    "Other_Current_Assets.txt",
    "Property_Plant__Equipment___Net.txt",
    "Other_Assets___Total.txt",
    "Accounts_Payable.txt",
    "Other_Current_Liabilities.txt",
    "Other_Liabilities.txt",

    # add more file names here if needed ...
]

# Names for the value columns corresponding to INPUT_FILES (same length as INPUT_FILES)
VALUE_COLUMN_NAMES = [
    "at",
    "ca",
    "cce",
    "cl",
    "std",
    "itp",
    "da",
    "ar",
    "inv",
    "oca",
    "ppe",
    "oa",
    "ap",
    "ocl",
    "ol",
    # add more names here, one for each input file ...
]

# Name of the final output file (will be written to OUTPUT_DIR)
OUTPUT_FILE = "data_Ltg.txt"

# Column names in the input files (assumed to be identical in all files)
ID_COL = "ID"
PIT_DATE_COL = "PIT Date"
HIST_CURR_COL = "HistCurrency"
FISCAL_PER_COL = "FiscalPeriod"
VALUE_COL = "AnnPITValue"
PERIOD_COL = "AnnPITValue_Period"   # NEW: period label column

BASE_COLS = [ID_COL, PIT_DATE_COL, HIST_CURR_COL, FISCAL_PER_COL]

SAVE_INTERMEDIATE = False
# ==============


# --- SANITY CHECK ---
if not OUTPUT_DIR.exists():
    raise FileNotFoundError(
        f"OUTPUT_DIR does not exist:\n{OUTPUT_DIR}\n"
        f"Please make sure Temp_file_path_A is set correctly."
    )
print("Using data folder:", OUTPUT_DIR.resolve())
# ---------------------


def load_dataset(path: Path) -> pd.DataFrame:
    """
    Load a single dataset from path and keep only the relevant columns.

    The function:
    - checks if the file exists,
    - reads it using the configured separator,
    - checks that all required columns are present,
    - keeps AnnPITValue_Period if present (and creates it as NA if not),
    - converts PIT Date to datetime,
    - casts ID to string,
    - converts AnnPITValue to numeric.
    """
    if not path.exists():
        raise FileNotFoundError(f"File not found: {path}")

    df = pd.read_csv(path, sep=SEP)

    needed_cols = BASE_COLS + [VALUE_COL]
    missing = [c for c in needed_cols if c not in df.columns]
    if missing:
        raise ValueError(f"Required columns {missing} are missing in file: {path}")

    # Keep base + AnnPITValue + (optional) AnnPITValue_Period
    extra_cols = [PERIOD_COL] if PERIOD_COL in df.columns else []
    df = df[needed_cols + extra_cols].copy()

    # Ensure AnnPITValue_Period exists in all datasets
    if PERIOD_COL not in df.columns:
        df[PERIOD_COL] = pd.NA

    # Re-order columns for consistency
    df = df[BASE_COLS + [PERIOD_COL, VALUE_COL]]

    # Ensure consistent data types
    df[PIT_DATE_COL] = pd.to_datetime(df[PIT_DATE_COL])
    df[ID_COL] = df[ID_COL].astype(str)
    df[VALUE_COL] = pd.to_numeric(df[VALUE_COL], errors="coerce")

    return df


def build_base_frame(dfs: list[pd.DataFrame]) -> pd.DataFrame:
    """
    Build the base dataset from a list of dataframes.

    Steps:
    - stack the identifier columns (including AnnPITValue_Period) from all dataframes,
    - drop duplicate combinations,
    - enforce correct data types,
    - sort the base dataset by (ID, HistCurrency, FiscalPeriod, AnnPITValue_Period, PIT Date).

    The result is the "skeleton" upon which all value columns will be merged.
    """
    # Concatenate identifier columns from all datasets (base cols + period col)
    base = pd.concat(
        [df[BASE_COLS + [PERIOD_COL]] for df in dfs],
        ignore_index=True
    )

    # Remove duplicate rows of identifiers
    base = base.drop_duplicates().reset_index(drop=True)

    # Ensure consistent types
    base[PIT_DATE_COL] = pd.to_datetime(base[PIT_DATE_COL])
    base[ID_COL] = base[ID_COL].astype(str)

    # Sort by identifier columns and date
    sort_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL, PERIOD_COL, PIT_DATE_COL]
    base = base.sort_values(sort_cols).reset_index(drop=True)

    return base


def asof_merge_one(base: pd.DataFrame, df: pd.DataFrame, new_col_name: str) -> pd.DataFrame:
    """
    Perform an "as-of" merge of one dataset into the base dataframe.

    Concept:
    - For each combination of (ID, HistCurrency, FiscalPeriod)
      and for each PIT Date in the base table,
      we want the latest AnnPITValue from df with
          df.PIT Date <= base.PIT Date
      for the same (ID, HistCurrency, FiscalPeriod).
    - If df has a non-empty AnnPITValue_Period column, the as-of grouping
      is also done by AnnPITValue_Period.
      If AnnPITValue_Period is all missing (e.g. revenue), df is treated
      as period-agnostic and can "fit all" periods.

    Implementation (vectorized, without merge_asof):
      (same as before, but grouping keys dynamically include PERIOD_COL
       only for period-aware datasets)
    """
    # Determine if this dataset is period-aware (has any non-NA period labels)
    has_period = PERIOD_COL in df.columns and df[PERIOD_COL].notna().any()

    # Work on copies to avoid modifying original base/df
    base_tmp = base.copy()
    base_tmp[new_col_name] = pd.NA
    base_tmp["__marker"] = "base"

    # Keep only identifier columns and the value column from df, then rename
    if has_period:
        df_tmp = df[BASE_COLS + [PERIOD_COL, VALUE_COL]].copy()
    else:
        df_tmp = df[BASE_COLS + [VALUE_COL]].copy()
        # Ensure PERIOD_COL exists but remains NA (period-agnostic)
        df_tmp[PERIOD_COL] = pd.NA

    df_tmp = df_tmp.rename(columns={VALUE_COL: new_col_name})
    df_tmp["__marker"] = "df"

    # Ensure consistent types for safety
    base_tmp[ID_COL] = base_tmp[ID_COL].astype(str)
    df_tmp[ID_COL] = df_tmp[ID_COL].astype(str)
    base_tmp[PIT_DATE_COL] = pd.to_datetime(base_tmp[PIT_DATE_COL])
    df_tmp[PIT_DATE_COL] = pd.to_datetime(df_tmp[PIT_DATE_COL])

    base_tmp[new_col_name] = pd.to_numeric(base_tmp[new_col_name], errors="coerce")
    df_tmp[new_col_name] = pd.to_numeric(df_tmp[new_col_name], errors="coerce")

    # Concatenate base rows and df rows
    combined = pd.concat([base_tmp, df_tmp], ignore_index=True)

    # Define an order so that df rows come before base rows on the same PIT Date
    marker_order = {"df": 0, "base": 1}
    combined["__order"] = combined["__marker"].map(marker_order).astype("int8")

    # Build sort and group keys
    group_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL]
    sort_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL]

    if has_period:
        # Period-aware dataset: also group by AnnPITValue_Period
        group_cols.append(PERIOD_COL)
        sort_cols.append(PERIOD_COL)

    sort_cols += [PIT_DATE_COL, "__order"]

    # Sort accordingly
    combined = combined.sort_values(sort_cols)

    # Forward-fill the value within each group to implement "as-of" logic
    combined[new_col_name] = combined.groupby(group_cols)[new_col_name].ffill()

    # Keep only rows that belong to the base dataset
    result = combined[combined["__marker"] == "base"].copy()

    # Drop helper columns and sort final result
    result = result.drop(columns=["__marker", "__order"])
    result = result.sort_values(
        [ID_COL, HIST_CURR_COL, FISCAL_PER_COL, PERIOD_COL, PIT_DATE_COL]
    ).reset_index(drop=True)

    return result


def build_and_save_variable(
    input_files,
    value_column_names,
    output_file,
    save_intermediate: bool = False,
) -> Path:
    """
    Build the final "view" for a variable based on multiple input files and save it.

    Steps:
    1) Validate arguments (non-empty, same length for files and column names).
    2) Load and preprocess each input file.
    3) Build the base dataset with all unique identifier combinations
       (including AnnPITValue_Period).
    4) For each input dataframe, perform an as-of merge of its AnnPITValue into the base,
       using AnnPITValue_Period in the grouping if present in that dataset.
    5) Keep only the base columns and the value columns.
    6) Write the final result to output_file in OUTPUT_DIR.
    """
    if len(input_files) == 0:
        raise ValueError("No INPUT_FILES were provided.")
    if len(input_files) != len(value_column_names):
        raise ValueError("INPUT_FILES and VALUE_COLUMN_NAMES must have the same length.")

    start_total = time.time()

    # Build full paths
    paths = [OUTPUT_DIR / f for f in input_files]

    print("\n--- Loading input files ---")
    t0 = time.time()
    dfs = [load_dataset(p) for p in paths]
    print(f"Loading and preprocessing finished in {time.time() - t0:.1f} seconds.")

    # Unique firm-year counts per input dataset
    print("\n--- Unique firm-year (ID, FiscalPeriod) counts per input file ---")
    for path, df in zip(paths, dfs):
        n_firm_years = df[[ID_COL, FISCAL_PER_COL]].drop_duplicates().shape[0]
        print(f"{path.name}: {n_firm_years:,} unique (ID, FiscalPeriod) combinations")

    print("\n--- Building base dataset ---")
    t0 = time.time()
    base = build_base_frame(dfs)
    print(
        f"Base dataset has {len(base):,} rows and was built in "
        f"{time.time() - t0:.1f} seconds."
    )

    print("\n--- Starting as-of merges ---")
    result = base
    for idx, (df, col_name) in enumerate(zip(dfs, value_column_names), start=1):
        print(f"[{idx}/{len(dfs)}] Merging value column '{col_name}' ...")
        t_merge = time.time()
        result = asof_merge_one(result, df, col_name)
        print(
            f"    Done in {time.time() - t_merge:.1f} seconds. "
            f"Result currently has {len(result):,} rows."
        )

        if save_intermediate:
            stem = output_file.rsplit(".", 1)[0]
            temp_out = OUTPUT_DIR / f"{stem}_partial_{idx}.txt"
            result.to_csv(temp_out, sep=SEP, index=False)
            print(f"    Intermediate file written to: {temp_out}")

    # Keep only the base identifier columns (including period) and the value columns
    final_cols = BASE_COLS + [PERIOD_COL] + value_column_names
    result = result[final_cols]

    # Final stats on firm-years and missing values
    print("\n--- Final dataset statistics ---")
    mask_complete = result[value_column_names].notna().all(axis=1)
    complete_firm_years = (
        result.loc[mask_complete, [ID_COL, FISCAL_PER_COL]]
        .drop_duplicates()
    )
    n_complete_firm_years = complete_firm_years.shape[0]
    print(
        f"Unique (ID, FiscalPeriod) combinations with ALL value columns non-empty: "
        f"{n_complete_firm_years:,}"
    )

    print("\nEmpty (NaN) values per value column:")
    for col in value_column_names:
        n_missing = result[col].isna().sum()
        print(f"  - {col}: {n_missing:,} empty values")

    # Final output
    out_path = OUTPUT_DIR / output_file
    result.to_csv(out_path, sep=SEP, index=False)

    print(f"\nFinal view written to:\n{out_path.resolve()}")
    print(f"Total runtime: {time.time() - start_total:.1f} seconds.")

    return out_path


# --- Execution ---
out_path = build_and_save_variable(
    input_files=INPUT_FILES,
    value_column_names=VALUE_COLUMN_NAMES,
    output_file=OUTPUT_FILE,
    save_intermediate=SAVE_INTERMEDIATE,
)


Using data folder: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies

--- Loading input files ---
Loading and preprocessing finished in 31.3 seconds.

--- Unique firm-year (ID, FiscalPeriod) counts per input file ---
Total_Assets.txt: 871,391 unique (ID, FiscalPeriod) combinations
Current_Assets___Total.txt: 876,857 unique (ID, FiscalPeriod) combinations
Cash__Short_Term_Investments.txt: 875,450 unique (ID, FiscalPeriod) combinations
Current_Liabilities___Total.txt: 878,053 unique (ID, FiscalPeriod) combinations
Short_Term_Debt__Current_Portion_of_LT_Debt.txt: 852,733 unique (ID, FiscalPeriod) combinations
Income_Taxes_Payable.txt: 559,826 unique (ID, FiscalPeriod) combinations
Depreciation_Depletion__Amortization.txt: 806,654 unique (ID, FiscalPeriod) combinations
ReceivablesNet.txt: 872,009 unique (ID, FiscalPeriod) combinations
Inventories___Total.txt: 875,280 unique (ID, FiscalPeriod) combinations
Other_Current_Assets.txt: 860,532 unique (ID, FiscalPeriod) combinations
Prope

#### Nca (Special Case with IVAO)

In [20]:
"""
Summary:
This cell
1) reads multiple input files from OUTPUT_DIR,
2) builds a base table with all unique combinations of
   (ID, PIT Date, HistCurrency, FiscalPeriod, AnnPITValue_Period),
3) for each input file, performs an "as-of" merge:
   for every base row it takes the latest AnnPITValue from that dataset
   with the same (ID, HistCurrency, FiscalPeriod) and PIT Date <= base PIT Date.
   If the dataset has AnnPITValue_Period, the merge is also grouped by that
   period label; if it does NOT have AnnPITValue_Period, it is treated as
   period-agnostic ("can fit all" periods),
4) writes the final combined view to OUTPUT_FILE in OUTPUT_DIR.

Additional tracking added:
- For each imported dataset, the code prints how many unique
  firm-year combinations exist (unique ID × FiscalPeriod).
- After all merging is complete, the code counts how many
  firm-year combinations are "complete" under the following rule:
    * All non-IVAO_ value columns must be non-missing.
    * Across all IVAO_ columns, it is enough if at least ONE IVAO_ column
      has a non-missing value (they are treated as a single logical block).
- At the end, the code reports how many missing values exist for each
  value column (IVAO_ columns included, individually).

The final table has the columns:
    ID, PIT Date, HistCurrency, FiscalPeriod, AnnPITValue_Period,
    Value1, Value2, ..., ValueN
where each ValueX is defined by VALUE_COLUMN_NAMES in the config.
"""

from pathlib import Path
import time
import pandas as pd

# === CONFIG ===

OUTPUT_DIR = Path(Temp_file_path_A)
SEP = "|"

INPUT_FILES = [
    "Other_Investments.txt",
    "Investments_in_Associated_Companies.txt",
    "Long_Term_Receivables.txt",
    "Investments_in_Sales__Direct_Financing_Leases.txt",
    "Unspecified_Other_Loans.txt",
    "Total_Assets.txt",
    "Current_Assets___Total.txt",
    "Total_Liabilities.txt",
    "Short_Term_Debt__Current_Portion_of_LT_Debt.txt",
    "Long_Term_Debt.txt",
]

VALUE_COLUMN_NAMES = [
    "IVAO_oi",
    "IVAO_iac",
    "IVAO_ltr",
    "IVAO_isdfl",
    "IVAO_uol",
    "at",
    "ca",
    "lt",
    "std",
    "ltd",
]

OPTIONAL_INPUT_FILES = {
    "Other_Investments.txt",
    "Investments_in_Associated_Companies.txt",
    "Long_Term_Receivables.txt",
    "Investments_in_Sales__Direct_Financing_Leases.txt",
    "Unspecified_Other_Loans.txt",
}

OUTPUT_FILE = "data_Nca.txt"

ID_COL = "ID"
PIT_DATE_COL = "PIT Date"
HIST_CURR_COL = "HistCurrency"
FISCAL_PER_COL = "FiscalPeriod"
VALUE_COL = "AnnPITValue"
PERIOD_COL = "AnnPITValue_Period"   # NEW: period label

BASE_COLS = [ID_COL, PIT_DATE_COL, HIST_CURR_COL, FISCAL_PER_COL]

SAVE_INTERMEDIATE = False
# ==============


# --- SANITY CHECK ---
if not OUTPUT_DIR.exists():
    raise FileNotFoundError(
        f"OUTPUT_DIR does not exist:\n{OUTPUT_DIR}\n"
        f"Please make sure Temp_file_path_A is set correctly."
    )

print("Using data folder:", OUTPUT_DIR.resolve())
# ---------------------


def load_dataset(path: Path) -> pd.DataFrame:
    """
    Loads a single dataset from disk and prepares the columns.
    Keeps AnnPITValue_Period if present; otherwise creates it as NA.
    """

    df = pd.read_csv(path, sep=SEP)

    needed_cols = BASE_COLS + [VALUE_COL]
    missing = [c for c in needed_cols if c not in df.columns]
    if missing:
        raise ValueError(f"Required columns {missing} are missing in file: {path}")

    extra_cols = [PERIOD_COL] if PERIOD_COL in df.columns else []
    df = df[needed_cols + extra_cols].copy()

    # Ensure AnnPITValue_Period exists
    if PERIOD_COL not in df.columns:
        df[PERIOD_COL] = pd.NA

    # Reorder for consistency
    df = df[BASE_COLS + [PERIOD_COL, VALUE_COL]]

    # Enforce consistent data types
    df[PIT_DATE_COL] = pd.to_datetime(df[PIT_DATE_COL])
    df[ID_COL] = df[ID_COL].astype(str)
    df[VALUE_COL] = pd.to_numeric(df[VALUE_COL], errors="coerce")

    return df


def build_base_frame(dfs: list[pd.DataFrame]) -> pd.DataFrame:
    """
    Builds the base frame with all unique identifier combinations,
    including AnnPITValue_Period.
    """

    base = pd.concat(
        [df[BASE_COLS + [PERIOD_COL]] for df in dfs],
        ignore_index=True
    )

    base = base.drop_duplicates().reset_index(drop=True)

    base[PIT_DATE_COL] = pd.to_datetime(base[PIT_DATE_COL])
    base[ID_COL] = base[ID_COL].astype(str)

    base = base.sort_values(
        [ID_COL, HIST_CURR_COL, FISCAL_PER_COL, PERIOD_COL, PIT_DATE_COL]
    ).reset_index(drop=True)

    return base


def asof_merge_one(base: pd.DataFrame, df: pd.DataFrame, new_col_name: str) -> pd.DataFrame:
    """
    Performs a single as-of merge of one dataset into the base table.

    Period logic:
    - If df has non-NA AnnPITValue_Period values → period-aware:
      grouping is (ID, HistCurrency, FiscalPeriod, AnnPITValue_Period).
    - If AnnPITValue_Period is all NA → period-agnostic:
      grouping is (ID, HistCurrency, FiscalPeriod) only, so values can
      be used for all periods.
    """

    has_period = PERIOD_COL in df.columns and df[PERIOD_COL].notna().any()

    base_tmp = base.copy()
    base_tmp[new_col_name] = pd.NA
    base_tmp["__marker"] = "base"

    if has_period:
        df_tmp = df[BASE_COLS + [PERIOD_COL, VALUE_COL]].copy()
    else:
        df_tmp = df[BASE_COLS + [VALUE_COL]].copy()
        df_tmp[PERIOD_COL] = pd.NA

    df_tmp = df_tmp.rename(columns={VALUE_COL: new_col_name})
    df_tmp["__marker"] = "df"

    # Combine
    combined = pd.concat([base_tmp, df_tmp], ignore_index=True)

    # Ordering for forward-fill
    combined["__order"] = combined["__marker"].map({"df": 0, "base": 1}).astype("int8")

    # Sort keys
    group_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL]
    sort_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL]

    if has_period:
        group_cols.append(PERIOD_COL)
        sort_cols.append(PERIOD_COL)

    sort_cols += [PIT_DATE_COL, "__order"]
    combined = combined.sort_values(sort_cols)

    # Forward-fill within group
    combined[new_col_name] = combined.groupby(group_cols)[new_col_name].ffill()

    result = combined[combined["__marker"] == "base"].copy()
    result = result.drop(columns=["__marker", "__order"]).reset_index(drop=True)

    return result


def build_and_save_variable(
    input_files,
    value_column_names,
    output_file,
    save_intermediate=False
):
    """
    Main function that loads, merges, and writes the final dataset.
    """

    paths = [OUTPUT_DIR / f for f in input_files]

    dfs = []
    missing_optional_files = []

    # Load all input datasets
    for file_name, path in zip(input_files, paths):
        if not path.exists():
            if file_name in OPTIONAL_INPUT_FILES:
                print(f"Optional file missing → filled with NaN: {file_name}")
                missing_optional_files.append(file_name)
                # Empty df with correct columns
                dfs.append(pd.DataFrame(columns=BASE_COLS + [PERIOD_COL, VALUE_COL]))
            else:
                raise FileNotFoundError(f"Required file missing: {path}")
        else:
            dfs.append(load_dataset(path))

    # Base frame
    base = build_base_frame(dfs)

    # Merge
    result = base
    for df, col_name in zip(dfs, value_column_names):
        result = asof_merge_one(result, df, col_name)

    # Keep identifier + period + value columns
    result = result[BASE_COLS + [PERIOD_COL] + value_column_names]

    # Write result
    out_path = OUTPUT_DIR / output_file
    result.to_csv(out_path, sep=SEP, index=False)

    print(f"\nFinal view written to:\n{out_path.resolve()}")

    if missing_optional_files:
        print("\nOptional files that were missing:")
        for f in missing_optional_files:
            print(f"  - {f}")
    else:
        print("\nNo optional files were missing.")

    return out_path


# --- Execution ---
out_path = build_and_save_variable(
    input_files=INPUT_FILES,
    value_column_names=VALUE_COLUMN_NAMES,
    output_file=OUTPUT_FILE,
    save_intermediate=SAVE_INTERMEDIATE,
)


Using data folder: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies


  combined = pd.concat([base_tmp, df_tmp], ignore_index=True)
  combined = pd.concat([base_tmp, df_tmp], ignore_index=True)
  combined = pd.concat([base_tmp, df_tmp], ignore_index=True)
  combined = pd.concat([base_tmp, df_tmp], ignore_index=True)
  combined = pd.concat([base_tmp, df_tmp], ignore_index=True)
  combined = pd.concat([base_tmp, df_tmp], ignore_index=True)
  combined = pd.concat([base_tmp, df_tmp], ignore_index=True)
  combined = pd.concat([base_tmp, df_tmp], ignore_index=True)
  combined = pd.concat([base_tmp, df_tmp], ignore_index=True)
  combined = pd.concat([base_tmp, df_tmp], ignore_index=True)



Final view written to:
/home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/data_Nca.txt

No optional files were missing.


#### Noa

In [21]:
"""
Summary:
This cell
1) reads multiple input files from OUTPUT_DIR,
2) builds a base table with all unique combinations of
   (ID, PIT Date, HistCurrency, FiscalPeriod, AnnPITValue_Period),
3) for each input file, performs an "as-of" merge:
   for every base row it takes the latest AnnPITValue from that dataset
   with the same (ID, HistCurrency, FiscalPeriod) and PIT Date <= base PIT Date.
   If the dataset has AnnPITValue_Period, the merge is also grouped by that
   period label; if it does NOT have AnnPITValue_Period, it is treated as
   period-agnostic ("can fit all" periods).
4) writes the final combined view to OUTPUT_FILE in OUTPUT_DIR.

Additional tracking added:
- For each imported dataset, the code prints how many unique
  firm-year combinations exist (unique ID × FiscalPeriod).
- After all merging is complete, the code counts how many
  firm-year combinations have **no missing values** across ANY
  of the added value columns.
- At the end, the code reports how many missing values exist for each
  value column (ca, cce, cl, etc.).

The final table has the columns:
    ID, PIT Date, HistCurrency, FiscalPeriod, AnnPITValue_Period,
    Value1, Value2, ..., ValueN
where each ValueX is defined by VALUE_COLUMN_NAMES in the config.
"""


from pathlib import Path
import time
import pandas as pd

# === CONFIG ===

# Folder where all input files are stored and where the output will be written.
# Temp_file_path_A must already be defined in your notebook, for example:
# Temp_file_path_A = f"{BASE_PATH}/Temp/TempAnomalies"
OUTPUT_DIR = Path(Temp_file_path_A)

# Field separator in all input and output text files
SEP = "|"

# Input files (1..N). They are all expected to be located in OUTPUT_DIR.
INPUT_FILES = [
    "Total_Assets.txt",
    "Cash__Short_Term_Investments.txt",
    "Long_Term_Debt.txt",
    "Minority_Interest.txt",
    "Preferred_Stock.txt",
    "Common_Equity.txt",

    # add more file names here if needed ...
]

# Names for the value columns corresponding to INPUT_FILES (same length as INPUT_FILES)
VALUE_COLUMN_NAMES = [
    "at",
    "cce",
    "ltd",
    "mi",
    "ps",
    "ce",
    # add more names here, one for each input file ...
]

# Name of the final output file (will be written to OUTPUT_DIR)
OUTPUT_FILE = "data_Noa.txt"

# Column names in the input files (assumed to be identical in all files)
ID_COL = "ID"
PIT_DATE_COL = "PIT Date"
HIST_CURR_COL = "HistCurrency"
FISCAL_PER_COL = "FiscalPeriod"
VALUE_COL = "AnnPITValue"
PERIOD_COL = "AnnPITValue_Period"   # NEW: period label column

BASE_COLS = [ID_COL, PIT_DATE_COL, HIST_CURR_COL, FISCAL_PER_COL]

SAVE_INTERMEDIATE = False
# ==============


# --- SANITY CHECK ---
if not OUTPUT_DIR.exists():
    raise FileNotFoundError(
        f"OUTPUT_DIR does not exist:\n{OUTPUT_DIR}\n"
        f"Please make sure Temp_file_path_A is set correctly."
    )
print("Using data folder:", OUTPUT_DIR.resolve())
# ---------------------


def load_dataset(path: Path) -> pd.DataFrame:
    """
    Load a single dataset from path and keep only the relevant columns.

    The function:
    - checks if the file exists,
    - reads it using the configured separator,
    - checks that all required columns are present,
    - keeps AnnPITValue_Period if present (and creates it as NA if not),
    - converts PIT Date to datetime,
    - casts ID to string,
    - converts AnnPITValue to numeric.
    """
    if not path.exists():
        raise FileNotFoundError(f"File not found: {path}")

    df = pd.read_csv(path, sep=SEP)

    needed_cols = BASE_COLS + [VALUE_COL]
    missing = [c for c in needed_cols if c not in df.columns]
    if missing:
        raise ValueError(f"Required columns {missing} are missing in file: {path}")

    # Keep base + AnnPITValue + (optional) AnnPITValue_Period
    extra_cols = [PERIOD_COL] if PERIOD_COL in df.columns else []
    df = df[needed_cols + extra_cols].copy()

    # Ensure AnnPITValue_Period exists in all datasets
    if PERIOD_COL not in df.columns:
        df[PERIOD_COL] = pd.NA

    # Re-order columns for consistency
    df = df[BASE_COLS + [PERIOD_COL, VALUE_COL]]

    # Ensure consistent data types
    df[PIT_DATE_COL] = pd.to_datetime(df[PIT_DATE_COL])
    df[ID_COL] = df[ID_COL].astype(str)
    df[VALUE_COL] = pd.to_numeric(df[VALUE_COL], errors="coerce")

    return df


def build_base_frame(dfs: list[pd.DataFrame]) -> pd.DataFrame:
    """
    Build the base dataset from a list of dataframes.

    Steps:
    - stack the identifier columns (including AnnPITValue_Period) from all dataframes,
    - drop duplicate combinations,
    - enforce correct data types,
    - sort the base dataset by (ID, HistCurrency, FiscalPeriod, AnnPITValue_Period, PIT Date).

    The result is the "skeleton" upon which all value columns will be merged.
    """
    # Concatenate identifier columns from all datasets (base cols + period col)
    base = pd.concat(
        [df[BASE_COLS + [PERIOD_COL]] for df in dfs],
        ignore_index=True
    )

    # Remove duplicate rows of identifiers
    base = base.drop_duplicates().reset_index(drop=True)

    # Ensure consistent types
    base[PIT_DATE_COL] = pd.to_datetime(base[PIT_DATE_COL])
    base[ID_COL] = base[ID_COL].astype(str)

    # Sort by identifier columns and date
    sort_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL, PERIOD_COL, PIT_DATE_COL]
    base = base.sort_values(sort_cols).reset_index(drop=True)

    return base


def asof_merge_one(base: pd.DataFrame, df: pd.DataFrame, new_col_name: str) -> pd.DataFrame:
    """
    Perform an "as-of" merge of one dataset into the base dataframe.

    Concept:
    - For each combination of (ID, HistCurrency, FiscalPeriod)
      and for each PIT Date in the base table,
      we want the latest AnnPITValue from df with
          df.PIT Date <= base.PIT Date
      for the same (ID, HistCurrency, FiscalPeriod).
    - If df has a non-empty AnnPITValue_Period column, the as-of grouping
      is also done by AnnPITValue_Period.
      If AnnPITValue_Period is all missing (e.g. revenue), df is treated
      as period-agnostic and can "fit all" periods.

    Implementation (vectorized, without merge_asof):
      (same as before, but grouping keys dynamically include PERIOD_COL
       only for period-aware datasets)
    """
    # Determine if this dataset is period-aware (has any non-NA period labels)
    has_period = PERIOD_COL in df.columns and df[PERIOD_COL].notna().any()

    # Work on copies to avoid modifying original base/df
    base_tmp = base.copy()
    base_tmp[new_col_name] = pd.NA
    base_tmp["__marker"] = "base"

    # Keep only identifier columns and the value column from df, then rename
    if has_period:
        df_tmp = df[BASE_COLS + [PERIOD_COL, VALUE_COL]].copy()
    else:
        df_tmp = df[BASE_COLS + [VALUE_COL]].copy()
        # Ensure PERIOD_COL exists but remains NA (period-agnostic)
        df_tmp[PERIOD_COL] = pd.NA

    df_tmp = df_tmp.rename(columns={VALUE_COL: new_col_name})
    df_tmp["__marker"] = "df"

    # Ensure consistent types for safety
    base_tmp[ID_COL] = base_tmp[ID_COL].astype(str)
    df_tmp[ID_COL] = df_tmp[ID_COL].astype(str)
    base_tmp[PIT_DATE_COL] = pd.to_datetime(base_tmp[PIT_DATE_COL])
    df_tmp[PIT_DATE_COL] = pd.to_datetime(df_tmp[PIT_DATE_COL])

    base_tmp[new_col_name] = pd.to_numeric(base_tmp[new_col_name], errors="coerce")
    df_tmp[new_col_name] = pd.to_numeric(df_tmp[new_col_name], errors="coerce")

    # Concatenate base rows and df rows
    combined = pd.concat([base_tmp, df_tmp], ignore_index=True)

    # Define an order so that df rows come before base rows on the same PIT Date
    marker_order = {"df": 0, "base": 1}
    combined["__order"] = combined["__marker"].map(marker_order).astype("int8")

    # Build sort and group keys
    group_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL]
    sort_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL]

    if has_period:
        # Period-aware dataset: also group by AnnPITValue_Period
        group_cols.append(PERIOD_COL)
        sort_cols.append(PERIOD_COL)

    sort_cols += [PIT_DATE_COL, "__order"]

    # Sort accordingly
    combined = combined.sort_values(sort_cols)

    # Forward-fill the value within each group to implement "as-of" logic
    combined[new_col_name] = combined.groupby(group_cols)[new_col_name].ffill()

    # Keep only rows that belong to the base dataset
    result = combined[combined["__marker"] == "base"].copy()

    # Drop helper columns and sort final result
    result = result.drop(columns=["__marker", "__order"])
    result = result.sort_values(
        [ID_COL, HIST_CURR_COL, FISCAL_PER_COL, PERIOD_COL, PIT_DATE_COL]
    ).reset_index(drop=True)

    return result


def build_and_save_variable(
    input_files,
    value_column_names,
    output_file,
    save_intermediate: bool = False,
) -> Path:
    """
    Build the final "view" for a variable based on multiple input files and save it.

    Steps:
    1) Validate arguments (non-empty, same length for files and column names).
    2) Load and preprocess each input file.
    3) Build the base dataset with all unique identifier combinations
       (including AnnPITValue_Period).
    4) For each input dataframe, perform an as-of merge of its AnnPITValue into the base,
       using AnnPITValue_Period in the grouping if present in that dataset.
    5) Keep only the base columns and the value columns.
    6) Write the final result to output_file in OUTPUT_DIR.
    """
    if len(input_files) == 0:
        raise ValueError("No INPUT_FILES were provided.")
    if len(input_files) != len(value_column_names):
        raise ValueError("INPUT_FILES and VALUE_COLUMN_NAMES must have the same length.")

    start_total = time.time()

    # Build full paths
    paths = [OUTPUT_DIR / f for f in input_files]

    print("\n--- Loading input files ---")
    t0 = time.time()
    dfs = [load_dataset(p) for p in paths]
    print(f"Loading and preprocessing finished in {time.time() - t0:.1f} seconds.")

    # Unique firm-year counts per input dataset
    print("\n--- Unique firm-year (ID, FiscalPeriod) counts per input file ---")
    for path, df in zip(paths, dfs):
        n_firm_years = df[[ID_COL, FISCAL_PER_COL]].drop_duplicates().shape[0]
        print(f"{path.name}: {n_firm_years:,} unique (ID, FiscalPeriod) combinations")

    print("\n--- Building base dataset ---")
    t0 = time.time()
    base = build_base_frame(dfs)
    print(
        f"Base dataset has {len(base):,} rows and was built in "
        f"{time.time() - t0:.1f} seconds."
    )

    print("\n--- Starting as-of merges ---")
    result = base
    for idx, (df, col_name) in enumerate(zip(dfs, value_column_names), start=1):
        print(f"[{idx}/{len(dfs)}] Merging value column '{col_name}' ...")
        t_merge = time.time()
        result = asof_merge_one(result, df, col_name)
        print(
            f"    Done in {time.time() - t_merge:.1f} seconds. "
            f"Result currently has {len(result):,} rows."
        )

        if save_intermediate:
            stem = output_file.rsplit(".", 1)[0]
            temp_out = OUTPUT_DIR / f"{stem}_partial_{idx}.txt"
            result.to_csv(temp_out, sep=SEP, index=False)
            print(f"    Intermediate file written to: {temp_out}")

    # Keep only the base identifier columns (including period) and the value columns
    final_cols = BASE_COLS + [PERIOD_COL] + value_column_names
    result = result[final_cols]

    # Final stats on firm-years and missing values
    print("\n--- Final dataset statistics ---")
    mask_complete = result[value_column_names].notna().all(axis=1)
    complete_firm_years = (
        result.loc[mask_complete, [ID_COL, FISCAL_PER_COL]]
        .drop_duplicates()
    )
    n_complete_firm_years = complete_firm_years.shape[0]
    print(
        f"Unique (ID, FiscalPeriod) combinations with ALL value columns non-empty: "
        f"{n_complete_firm_years:,}"
    )

    print("\nEmpty (NaN) values per value column:")
    for col in value_column_names:
        n_missing = result[col].isna().sum()
        print(f"  - {col}: {n_missing:,} empty values")

    # Final output
    out_path = OUTPUT_DIR / output_file
    result.to_csv(out_path, sep=SEP, index=False)

    print(f"\nFinal view written to:\n{out_path.resolve()}")
    print(f"Total runtime: {time.time() - start_total:.1f} seconds.")

    return out_path


# --- Execution ---
out_path = build_and_save_variable(
    input_files=INPUT_FILES,
    value_column_names=VALUE_COLUMN_NAMES,
    output_file=OUTPUT_FILE,
    save_intermediate=SAVE_INTERMEDIATE,
)


Using data folder: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies

--- Loading input files ---
Loading and preprocessing finished in 12.9 seconds.

--- Unique firm-year (ID, FiscalPeriod) counts per input file ---
Total_Assets.txt: 871,391 unique (ID, FiscalPeriod) combinations
Cash__Short_Term_Investments.txt: 875,450 unique (ID, FiscalPeriod) combinations
Long_Term_Debt.txt: 872,391 unique (ID, FiscalPeriod) combinations
Minority_Interest.txt: 850,691 unique (ID, FiscalPeriod) combinations
Preferred_Stock.txt: 861,971 unique (ID, FiscalPeriod) combinations
Common_Equity.txt: 883,328 unique (ID, FiscalPeriod) combinations

--- Building base dataset ---
Base dataset has 2,899,491 rows and was built in 5.4 seconds.

--- Starting as-of merges ---
[1/6] Merging value column 'at' ...
    Done in 5.8 seconds. Result currently has 2,899,491 rows.
[2/6] Merging value column 'cce' ...
    Done in 5.8 seconds. Result currently has 2,899,491 rows.
[3/6] Merging value column 'ltd' ...
 

#### Nwc

In [22]:
"""
Summary:
This cell
1) reads multiple input files from OUTPUT_DIR,
2) builds a base table with all unique combinations of
   (ID, PIT Date, HistCurrency, FiscalPeriod, AnnPITValue_Period),
3) for each input file, performs an "as-of" merge:
   for every base row it takes the latest AnnPITValue from that dataset
   with the same (ID, HistCurrency, FiscalPeriod) and PIT Date <= base PIT Date.
   If the dataset has AnnPITValue_Period, the merge is also grouped by that
   period label; if it does NOT have AnnPITValue_Period, it is treated as
   period-agnostic ("can fit all" periods).
4) writes the final combined view to OUTPUT_FILE in OUTPUT_DIR.

Additional tracking added:
- For each imported dataset, the code prints how many unique
  firm-year combinations exist (unique ID × FiscalPeriod).
- After all merging is complete, the code counts how many
  firm-year combinations have **no missing values** across ANY
  of the added value columns.
- At the end, the code reports how many missing values exist for each
  value column (ca, cce, cl, etc.).

The final table has the columns:
    ID, PIT Date, HistCurrency, FiscalPeriod, AnnPITValue_Period,
    Value1, Value2, ..., ValueN
where each ValueX is defined by VALUE_COLUMN_NAMES in the config.
"""


from pathlib import Path
import time
import pandas as pd

# === CONFIG ===

# Folder where all input files are stored and where the output will be written.
# Temp_file_path_A must already be defined in your notebook, for example:
# Temp_file_path_A = f"{BASE_PATH}/Temp/TempAnomalies"
OUTPUT_DIR = Path(Temp_file_path_A)

# Field separator in all input and output text files
SEP = "|"

# Input files (1..N). They are all expected to be located in OUTPUT_DIR.
INPUT_FILES = [
    "Total_Assets.txt",
    "Current_Assets___Total.txt",
    "Cash__Short_Term_Investments.txt",
    "Current_Liabilities___Total.txt",
    "Short_Term_Debt__Current_Portion_of_LT_Debt.txt",

    # add more file names here if needed ...
]

# Names for the value columns corresponding to INPUT_FILES (same length as INPUT_FILES)
VALUE_COLUMN_NAMES = [
    "at",
    "ca",
    "cce",
    "cl",
    "std",
    # add more names here, one for each input file ...
]

# Name of the final output file (will be written to OUTPUT_DIR)
OUTPUT_FILE = "data_Nwc.txt"

# Column names in the input files (assumed to be identical in all files)
ID_COL = "ID"
PIT_DATE_COL = "PIT Date"
HIST_CURR_COL = "HistCurrency"
FISCAL_PER_COL = "FiscalPeriod"
VALUE_COL = "AnnPITValue"
PERIOD_COL = "AnnPITValue_Period"   # NEW: period label column

BASE_COLS = [ID_COL, PIT_DATE_COL, HIST_CURR_COL, FISCAL_PER_COL]

SAVE_INTERMEDIATE = False
# ==============


# --- SANITY CHECK ---
if not OUTPUT_DIR.exists():
    raise FileNotFoundError(
        f"OUTPUT_DIR does not exist:\n{OUTPUT_DIR}\n"
        f"Please make sure Temp_file_path_A is set correctly."
    )
print("Using data folder:", OUTPUT_DIR.resolve())
# ---------------------


def load_dataset(path: Path) -> pd.DataFrame:
    """
    Load a single dataset from path and keep only the relevant columns.

    The function:
    - checks if the file exists,
    - reads it using the configured separator,
    - checks that all required columns are present,
    - keeps AnnPITValue_Period if present (and creates it as NA if not),
    - converts PIT Date to datetime,
    - casts ID to string,
    - converts AnnPITValue to numeric.
    """
    if not path.exists():
        raise FileNotFoundError(f"File not found: {path}")

    df = pd.read_csv(path, sep=SEP)

    needed_cols = BASE_COLS + [VALUE_COL]
    missing = [c for c in needed_cols if c not in df.columns]
    if missing:
        raise ValueError(f"Required columns {missing} are missing in file: {path}")

    # Keep base + AnnPITValue + (optional) AnnPITValue_Period
    extra_cols = [PERIOD_COL] if PERIOD_COL in df.columns else []
    df = df[needed_cols + extra_cols].copy()

    # Ensure AnnPITValue_Period exists in all datasets
    if PERIOD_COL not in df.columns:
        df[PERIOD_COL] = pd.NA

    # Re-order columns for consistency
    df = df[BASE_COLS + [PERIOD_COL, VALUE_COL]]

    # Ensure consistent data types
    df[PIT_DATE_COL] = pd.to_datetime(df[PIT_DATE_COL])
    df[ID_COL] = df[ID_COL].astype(str)
    df[VALUE_COL] = pd.to_numeric(df[VALUE_COL], errors="coerce")

    return df


def build_base_frame(dfs: list[pd.DataFrame]) -> pd.DataFrame:
    """
    Build the base dataset from a list of dataframes.

    Steps:
    - stack the identifier columns (including AnnPITValue_Period) from all dataframes,
    - drop duplicate combinations,
    - enforce correct data types,
    - sort the base dataset by (ID, HistCurrency, FiscalPeriod, AnnPITValue_Period, PIT Date).

    The result is the "skeleton" upon which all value columns will be merged.
    """
    # Concatenate identifier columns from all datasets (base cols + period col)
    base = pd.concat(
        [df[BASE_COLS + [PERIOD_COL]] for df in dfs],
        ignore_index=True
    )

    # Remove duplicate rows of identifiers
    base = base.drop_duplicates().reset_index(drop=True)

    # Ensure consistent types
    base[PIT_DATE_COL] = pd.to_datetime(base[PIT_DATE_COL])
    base[ID_COL] = base[ID_COL].astype(str)

    # Sort by identifier columns and date
    sort_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL, PERIOD_COL, PIT_DATE_COL]
    base = base.sort_values(sort_cols).reset_index(drop=True)

    return base


def asof_merge_one(base: pd.DataFrame, df: pd.DataFrame, new_col_name: str) -> pd.DataFrame:
    """
    Perform an "as-of" merge of one dataset into the base dataframe.

    Concept:
    - For each combination of (ID, HistCurrency, FiscalPeriod)
      and for each PIT Date in the base table,
      we want the latest AnnPITValue from df with
          df.PIT Date <= base.PIT Date
      for the same (ID, HistCurrency, FiscalPeriod).
    - If df has a non-empty AnnPITValue_Period column, the as-of grouping
      is also done by AnnPITValue_Period.
      If AnnPITValue_Period is all missing (e.g. revenue), df is treated
      as period-agnostic and can "fit all" periods.

    Implementation (vectorized, without merge_asof):
      (same as before, but grouping keys dynamically include PERIOD_COL
       only for period-aware datasets)
    """
    # Determine if this dataset is period-aware (has any non-NA period labels)
    has_period = PERIOD_COL in df.columns and df[PERIOD_COL].notna().any()

    # Work on copies to avoid modifying original base/df
    base_tmp = base.copy()
    base_tmp[new_col_name] = pd.NA
    base_tmp["__marker"] = "base"

    # Keep only identifier columns and the value column from df, then rename
    if has_period:
        df_tmp = df[BASE_COLS + [PERIOD_COL, VALUE_COL]].copy()
    else:
        df_tmp = df[BASE_COLS + [VALUE_COL]].copy()
        # Ensure PERIOD_COL exists but remains NA (period-agnostic)
        df_tmp[PERIOD_COL] = pd.NA

    df_tmp = df_tmp.rename(columns={VALUE_COL: new_col_name})
    df_tmp["__marker"] = "df"

    # Ensure consistent types for safety
    base_tmp[ID_COL] = base_tmp[ID_COL].astype(str)
    df_tmp[ID_COL] = df_tmp[ID_COL].astype(str)
    base_tmp[PIT_DATE_COL] = pd.to_datetime(base_tmp[PIT_DATE_COL])
    df_tmp[PIT_DATE_COL] = pd.to_datetime(df_tmp[PIT_DATE_COL])

    base_tmp[new_col_name] = pd.to_numeric(base_tmp[new_col_name], errors="coerce")
    df_tmp[new_col_name] = pd.to_numeric(df_tmp[new_col_name], errors="coerce")

    # Concatenate base rows and df rows
    combined = pd.concat([base_tmp, df_tmp], ignore_index=True)

    # Define an order so that df rows come before base rows on the same PIT Date
    marker_order = {"df": 0, "base": 1}
    combined["__order"] = combined["__marker"].map(marker_order).astype("int8")

    # Build sort and group keys
    group_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL]
    sort_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL]

    if has_period:
        # Period-aware dataset: also group by AnnPITValue_Period
        group_cols.append(PERIOD_COL)
        sort_cols.append(PERIOD_COL)

    sort_cols += [PIT_DATE_COL, "__order"]

    # Sort accordingly
    combined = combined.sort_values(sort_cols)

    # Forward-fill the value within each group to implement "as-of" logic
    combined[new_col_name] = combined.groupby(group_cols)[new_col_name].ffill()

    # Keep only rows that belong to the base dataset
    result = combined[combined["__marker"] == "base"].copy()

    # Drop helper columns and sort final result
    result = result.drop(columns=["__marker", "__order"])
    result = result.sort_values(
        [ID_COL, HIST_CURR_COL, FISCAL_PER_COL, PERIOD_COL, PIT_DATE_COL]
    ).reset_index(drop=True)

    return result


def build_and_save_variable(
    input_files,
    value_column_names,
    output_file,
    save_intermediate: bool = False,
) -> Path:
    """
    Build the final "view" for a variable based on multiple input files and save it.

    Steps:
    1) Validate arguments (non-empty, same length for files and column names).
    2) Load and preprocess each input file.
    3) Build the base dataset with all unique identifier combinations
       (including AnnPITValue_Period).
    4) For each input dataframe, perform an as-of merge of its AnnPITValue into the base,
       using AnnPITValue_Period in the grouping if present in that dataset.
    5) Keep only the base columns and the value columns.
    6) Write the final result to output_file in OUTPUT_DIR.
    """
    if len(input_files) == 0:
        raise ValueError("No INPUT_FILES were provided.")
    if len(input_files) != len(value_column_names):
        raise ValueError("INPUT_FILES and VALUE_COLUMN_NAMES must have the same length.")

    start_total = time.time()

    # Build full paths
    paths = [OUTPUT_DIR / f for f in input_files]

    print("\n--- Loading input files ---")
    t0 = time.time()
    dfs = [load_dataset(p) for p in paths]
    print(f"Loading and preprocessing finished in {time.time() - t0:.1f} seconds.")

    # Unique firm-year counts per input dataset
    print("\n--- Unique firm-year (ID, FiscalPeriod) counts per input file ---")
    for path, df in zip(paths, dfs):
        n_firm_years = df[[ID_COL, FISCAL_PER_COL]].drop_duplicates().shape[0]
        print(f"{path.name}: {n_firm_years:,} unique (ID, FiscalPeriod) combinations")

    print("\n--- Building base dataset ---")
    t0 = time.time()
    base = build_base_frame(dfs)
    print(
        f"Base dataset has {len(base):,} rows and was built in "
        f"{time.time() - t0:.1f} seconds."
    )

    print("\n--- Starting as-of merges ---")
    result = base
    for idx, (df, col_name) in enumerate(zip(dfs, value_column_names), start=1):
        print(f"[{idx}/{len(dfs)}] Merging value column '{col_name}' ...")
        t_merge = time.time()
        result = asof_merge_one(result, df, col_name)
        print(
            f"    Done in {time.time() - t_merge:.1f} seconds. "
            f"Result currently has {len(result):,} rows."
        )

        if save_intermediate:
            stem = output_file.rsplit(".", 1)[0]
            temp_out = OUTPUT_DIR / f"{stem}_partial_{idx}.txt"
            result.to_csv(temp_out, sep=SEP, index=False)
            print(f"    Intermediate file written to: {temp_out}")

    # Keep only the base identifier columns (including period) and the value columns
    final_cols = BASE_COLS + [PERIOD_COL] + value_column_names
    result = result[final_cols]

    # Final stats on firm-years and missing values
    print("\n--- Final dataset statistics ---")
    mask_complete = result[value_column_names].notna().all(axis=1)
    complete_firm_years = (
        result.loc[mask_complete, [ID_COL, FISCAL_PER_COL]]
        .drop_duplicates()
    )
    n_complete_firm_years = complete_firm_years.shape[0]
    print(
        f"Unique (ID, FiscalPeriod) combinations with ALL value columns non-empty: "
        f"{n_complete_firm_years:,}"
    )

    print("\nEmpty (NaN) values per value column:")
    for col in value_column_names:
        n_missing = result[col].isna().sum()
        print(f"  - {col}: {n_missing:,} empty values")

    # Final output
    out_path = OUTPUT_DIR / output_file
    result.to_csv(out_path, sep=SEP, index=False)

    print(f"\nFinal view written to:\n{out_path.resolve()}")
    print(f"Total runtime: {time.time() - start_total:.1f} seconds.")

    return out_path


# --- Execution ---
out_path = build_and_save_variable(
    input_files=INPUT_FILES,
    value_column_names=VALUE_COLUMN_NAMES,
    output_file=OUTPUT_FILE,
    save_intermediate=SAVE_INTERMEDIATE,
)


Using data folder: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies

--- Loading input files ---
Loading and preprocessing finished in 11.2 seconds.

--- Unique firm-year (ID, FiscalPeriod) counts per input file ---
Total_Assets.txt: 871,391 unique (ID, FiscalPeriod) combinations
Current_Assets___Total.txt: 876,857 unique (ID, FiscalPeriod) combinations
Cash__Short_Term_Investments.txt: 875,450 unique (ID, FiscalPeriod) combinations
Current_Liabilities___Total.txt: 878,053 unique (ID, FiscalPeriod) combinations
Short_Term_Debt__Current_Portion_of_LT_Debt.txt: 852,733 unique (ID, FiscalPeriod) combinations

--- Building base dataset ---
Base dataset has 2,721,575 rows and was built in 4.6 seconds.

--- Starting as-of merges ---
[1/5] Merging value column 'at' ...
    Done in 5.4 seconds. Result currently has 2,721,575 rows.
[2/5] Merging value column 'ca' ...
    Done in 5.6 seconds. Result currently has 2,721,575 rows.
[3/5] Merging value column 'cce' ...
    Done in 5.7 second

#### Ol

In [23]:
"""
Summary:
This cell
1) reads multiple input files from OUTPUT_DIR,
2) builds a base table with all unique combinations of
   (ID, PIT Date, HistCurrency, FiscalPeriod, AnnPITValue_Period),
3) for each input file, performs an "as-of" merge:
   for every base row it takes the latest AnnPITValue from that dataset
   with the same (ID, HistCurrency, FiscalPeriod) and PIT Date <= base PIT Date.
   If the dataset has AnnPITValue_Period, the merge is also grouped by that
   period label; if it does NOT have AnnPITValue_Period, it is treated as
   period-agnostic ("can fit all" periods).
4) writes the final combined view to OUTPUT_FILE in OUTPUT_DIR.

Additional tracking added:
- For each imported dataset, the code prints how many unique
  firm-year combinations exist (unique ID × FiscalPeriod).
- After all merging is complete, the code counts how many
  firm-year combinations have **no missing values** across ANY
  of the added value columns.
- At the end, the code reports how many missing values exist for each
  value column (ca, cce, cl, etc.).

The final table has the columns:
    ID, PIT Date, HistCurrency, FiscalPeriod, AnnPITValue_Period,
    Value1, Value2, ..., ValueN
where each ValueX is defined by VALUE_COLUMN_NAMES in the config.
"""


from pathlib import Path
import time
import pandas as pd

# === CONFIG ===

# Folder where all input files are stored and where the output will be written.
# Temp_file_path_A must already be defined in your notebook, for example:
# Temp_file_path_A = f"{BASE_PATH}/Temp/TempAnomalies"
OUTPUT_DIR = Path(Temp_file_path_A)

# Field separator in all input and output text files
SEP = "|"

# Input files (1..N). They are all expected to be located in OUTPUT_DIR.
INPUT_FILES = [
    "Total_Assets.txt",
    "Capital_Expenditures_Addtns_to_Fixed_Assets.txt",
    "Selling_General__Administrative_Expenses.txt",

    # add more file names here if needed ...
]

# Names for the value columns corresponding to INPUT_FILES (same length as INPUT_FILES)
VALUE_COLUMN_NAMES = [
    "at",
    "cogs",
    "sga",
    # add more names here, one for each input file ...
]

# Name of the final output file (will be written to OUTPUT_DIR)
OUTPUT_FILE = "data_Ol.txt"

# Column names in the input files (assumed to be identical in all files)
ID_COL = "ID"
PIT_DATE_COL = "PIT Date"
HIST_CURR_COL = "HistCurrency"
FISCAL_PER_COL = "FiscalPeriod"
VALUE_COL = "AnnPITValue"
PERIOD_COL = "AnnPITValue_Period"   # NEW: period label column

BASE_COLS = [ID_COL, PIT_DATE_COL, HIST_CURR_COL, FISCAL_PER_COL]

SAVE_INTERMEDIATE = False
# ==============


# --- SANITY CHECK ---
if not OUTPUT_DIR.exists():
    raise FileNotFoundError(
        f"OUTPUT_DIR does not exist:\n{OUTPUT_DIR}\n"
        f"Please make sure Temp_file_path_A is set correctly."
    )
print("Using data folder:", OUTPUT_DIR.resolve())
# ---------------------


def load_dataset(path: Path) -> pd.DataFrame:
    """
    Load a single dataset from path and keep only the relevant columns.

    The function:
    - checks if the file exists,
    - reads it using the configured separator,
    - checks that all required columns are present,
    - keeps AnnPITValue_Period if present (and creates it as NA if not),
    - converts PIT Date to datetime,
    - casts ID to string,
    - converts AnnPITValue to numeric.
    """
    if not path.exists():
        raise FileNotFoundError(f"File not found: {path}")

    df = pd.read_csv(path, sep=SEP)

    needed_cols = BASE_COLS + [VALUE_COL]
    missing = [c for c in needed_cols if c not in df.columns]
    if missing:
        raise ValueError(f"Required columns {missing} are missing in file: {path}")

    # Keep base + AnnPITValue + (optional) AnnPITValue_Period
    extra_cols = [PERIOD_COL] if PERIOD_COL in df.columns else []
    df = df[needed_cols + extra_cols].copy()

    # Ensure AnnPITValue_Period exists in all datasets
    if PERIOD_COL not in df.columns:
        df[PERIOD_COL] = pd.NA

    # Re-order columns for consistency
    df = df[BASE_COLS + [PERIOD_COL, VALUE_COL]]

    # Ensure consistent data types
    df[PIT_DATE_COL] = pd.to_datetime(df[PIT_DATE_COL])
    df[ID_COL] = df[ID_COL].astype(str)
    df[VALUE_COL] = pd.to_numeric(df[VALUE_COL], errors="coerce")

    return df


def build_base_frame(dfs: list[pd.DataFrame]) -> pd.DataFrame:
    """
    Build the base dataset from a list of dataframes.

    Steps:
    - stack the identifier columns (including AnnPITValue_Period) from all dataframes,
    - drop duplicate combinations,
    - enforce correct data types,
    - sort the base dataset by (ID, HistCurrency, FiscalPeriod, AnnPITValue_Period, PIT Date).

    The result is the "skeleton" upon which all value columns will be merged.
    """
    # Concatenate identifier columns from all datasets (base cols + period col)
    base = pd.concat(
        [df[BASE_COLS + [PERIOD_COL]] for df in dfs],
        ignore_index=True
    )

    # Remove duplicate rows of identifiers
    base = base.drop_duplicates().reset_index(drop=True)

    # Ensure consistent types
    base[PIT_DATE_COL] = pd.to_datetime(base[PIT_DATE_COL])
    base[ID_COL] = base[ID_COL].astype(str)

    # Sort by identifier columns and date
    sort_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL, PERIOD_COL, PIT_DATE_COL]
    base = base.sort_values(sort_cols).reset_index(drop=True)

    return base


def asof_merge_one(base: pd.DataFrame, df: pd.DataFrame, new_col_name: str) -> pd.DataFrame:
    """
    Perform an "as-of" merge of one dataset into the base dataframe.

    Concept:
    - For each combination of (ID, HistCurrency, FiscalPeriod)
      and for each PIT Date in the base table,
      we want the latest AnnPITValue from df with
          df.PIT Date <= base.PIT Date
      for the same (ID, HistCurrency, FiscalPeriod).
    - If df has a non-empty AnnPITValue_Period column, the as-of grouping
      is also done by AnnPITValue_Period.
      If AnnPITValue_Period is all missing (e.g. revenue), df is treated
      as period-agnostic and can "fit all" periods.

    Implementation (vectorized, without merge_asof):
      (same as before, but grouping keys dynamically include PERIOD_COL
       only for period-aware datasets)
    """
    # Determine if this dataset is period-aware (has any non-NA period labels)
    has_period = PERIOD_COL in df.columns and df[PERIOD_COL].notna().any()

    # Work on copies to avoid modifying original base/df
    base_tmp = base.copy()
    base_tmp[new_col_name] = pd.NA
    base_tmp["__marker"] = "base"

    # Keep only identifier columns and the value column from df, then rename
    if has_period:
        df_tmp = df[BASE_COLS + [PERIOD_COL, VALUE_COL]].copy()
    else:
        df_tmp = df[BASE_COLS + [VALUE_COL]].copy()
        # Ensure PERIOD_COL exists but remains NA (period-agnostic)
        df_tmp[PERIOD_COL] = pd.NA

    df_tmp = df_tmp.rename(columns={VALUE_COL: new_col_name})
    df_tmp["__marker"] = "df"

    # Ensure consistent types for safety
    base_tmp[ID_COL] = base_tmp[ID_COL].astype(str)
    df_tmp[ID_COL] = df_tmp[ID_COL].astype(str)
    base_tmp[PIT_DATE_COL] = pd.to_datetime(base_tmp[PIT_DATE_COL])
    df_tmp[PIT_DATE_COL] = pd.to_datetime(df_tmp[PIT_DATE_COL])

    base_tmp[new_col_name] = pd.to_numeric(base_tmp[new_col_name], errors="coerce")
    df_tmp[new_col_name] = pd.to_numeric(df_tmp[new_col_name], errors="coerce")

    # Concatenate base rows and df rows
    combined = pd.concat([base_tmp, df_tmp], ignore_index=True)

    # Define an order so that df rows come before base rows on the same PIT Date
    marker_order = {"df": 0, "base": 1}
    combined["__order"] = combined["__marker"].map(marker_order).astype("int8")

    # Build sort and group keys
    group_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL]
    sort_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL]

    if has_period:
        # Period-aware dataset: also group by AnnPITValue_Period
        group_cols.append(PERIOD_COL)
        sort_cols.append(PERIOD_COL)

    sort_cols += [PIT_DATE_COL, "__order"]

    # Sort accordingly
    combined = combined.sort_values(sort_cols)

    # Forward-fill the value within each group to implement "as-of" logic
    combined[new_col_name] = combined.groupby(group_cols)[new_col_name].ffill()

    # Keep only rows that belong to the base dataset
    result = combined[combined["__marker"] == "base"].copy()

    # Drop helper columns and sort final result
    result = result.drop(columns=["__marker", "__order"])
    result = result.sort_values(
        [ID_COL, HIST_CURR_COL, FISCAL_PER_COL, PERIOD_COL, PIT_DATE_COL]
    ).reset_index(drop=True)

    return result


def build_and_save_variable(
    input_files,
    value_column_names,
    output_file,
    save_intermediate: bool = False,
) -> Path:
    """
    Build the final "view" for a variable based on multiple input files and save it.

    Steps:
    1) Validate arguments (non-empty, same length for files and column names).
    2) Load and preprocess each input file.
    3) Build the base dataset with all unique identifier combinations
       (including AnnPITValue_Period).
    4) For each input dataframe, perform an as-of merge of its AnnPITValue into the base,
       using AnnPITValue_Period in the grouping if present in that dataset.
    5) Keep only the base columns and the value columns.
    6) Write the final result to output_file in OUTPUT_DIR.
    """
    if len(input_files) == 0:
        raise ValueError("No INPUT_FILES were provided.")
    if len(input_files) != len(value_column_names):
        raise ValueError("INPUT_FILES and VALUE_COLUMN_NAMES must have the same length.")

    start_total = time.time()

    # Build full paths
    paths = [OUTPUT_DIR / f for f in input_files]

    print("\n--- Loading input files ---")
    t0 = time.time()
    dfs = [load_dataset(p) for p in paths]
    print(f"Loading and preprocessing finished in {time.time() - t0:.1f} seconds.")

    # Unique firm-year counts per input dataset
    print("\n--- Unique firm-year (ID, FiscalPeriod) counts per input file ---")
    for path, df in zip(paths, dfs):
        n_firm_years = df[[ID_COL, FISCAL_PER_COL]].drop_duplicates().shape[0]
        print(f"{path.name}: {n_firm_years:,} unique (ID, FiscalPeriod) combinations")

    print("\n--- Building base dataset ---")
    t0 = time.time()
    base = build_base_frame(dfs)
    print(
        f"Base dataset has {len(base):,} rows and was built in "
        f"{time.time() - t0:.1f} seconds."
    )

    print("\n--- Starting as-of merges ---")
    result = base
    for idx, (df, col_name) in enumerate(zip(dfs, value_column_names), start=1):
        print(f"[{idx}/{len(dfs)}] Merging value column '{col_name}' ...")
        t_merge = time.time()
        result = asof_merge_one(result, df, col_name)
        print(
            f"    Done in {time.time() - t_merge:.1f} seconds. "
            f"Result currently has {len(result):,} rows."
        )

        if save_intermediate:
            stem = output_file.rsplit(".", 1)[0]
            temp_out = OUTPUT_DIR / f"{stem}_partial_{idx}.txt"
            result.to_csv(temp_out, sep=SEP, index=False)
            print(f"    Intermediate file written to: {temp_out}")

    # Keep only the base identifier columns (including period) and the value columns
    final_cols = BASE_COLS + [PERIOD_COL] + value_column_names
    result = result[final_cols]

    # Final stats on firm-years and missing values
    print("\n--- Final dataset statistics ---")
    mask_complete = result[value_column_names].notna().all(axis=1)
    complete_firm_years = (
        result.loc[mask_complete, [ID_COL, FISCAL_PER_COL]]
        .drop_duplicates()
    )
    n_complete_firm_years = complete_firm_years.shape[0]
    print(
        f"Unique (ID, FiscalPeriod) combinations with ALL value columns non-empty: "
        f"{n_complete_firm_years:,}"
    )

    print("\nEmpty (NaN) values per value column:")
    for col in value_column_names:
        n_missing = result[col].isna().sum()
        print(f"  - {col}: {n_missing:,} empty values")

    # Final output
    out_path = OUTPUT_DIR / output_file
    result.to_csv(out_path, sep=SEP, index=False)

    print(f"\nFinal view written to:\n{out_path.resolve()}")
    print(f"Total runtime: {time.time() - start_total:.1f} seconds.")

    return out_path


# --- Execution ---
out_path = build_and_save_variable(
    input_files=INPUT_FILES,
    value_column_names=VALUE_COLUMN_NAMES,
    output_file=OUTPUT_FILE,
    save_intermediate=SAVE_INTERMEDIATE,
)


Using data folder: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies

--- Loading input files ---
Loading and preprocessing finished in 5.1 seconds.

--- Unique firm-year (ID, FiscalPeriod) counts per input file ---
Total_Assets.txt: 871,391 unique (ID, FiscalPeriod) combinations
Capital_Expenditures_Addtns_to_Fixed_Assets.txt: 790,635 unique (ID, FiscalPeriod) combinations
Selling_General__Administrative_Expenses.txt: 721,610 unique (ID, FiscalPeriod) combinations

--- Building base dataset ---
Base dataset has 3,890,568 rows and was built in 5.8 seconds.

--- Starting as-of merges ---
[1/3] Merging value column 'at' ...
    Done in 8.1 seconds. Result currently has 3,890,568 rows.
[2/3] Merging value column 'cogs' ...
    Done in 7.2 seconds. Result currently has 3,890,568 rows.
[3/3] Merging value column 'sga' ...
    Done in 7.7 seconds. Result currently has 3,890,568 rows.

--- Final dataset statistics ---
Unique (ID, FiscalPeriod) combinations with ALL value columns non-em

#### Osc (Special Case with IFO)

In [24]:
"""
Summary:
This cell
1) reads multiple input files from OUTPUT_DIR,
2) builds a base table with all unique combinations of
   (ID, PIT Date, HistCurrency, FiscalPeriod, AnnPITValue_Period),
3) for each input file, performs an "as-of" merge:
   for every base row it takes the latest AnnPITValue from that dataset
   with the same (ID, HistCurrency, FiscalPeriod) and PIT Date <= base PIT Date.
   If the dataset has AnnPITValue_Period, the merge is also grouped by that
   period label; if it does NOT have AnnPITValue_Period, it is treated as
   period-agnostic ("can fit all" periods),
4) writes the final combined view to OUTPUT_FILE in OUTPUT_DIR.

Additional tracking added:
- For each imported dataset, the code prints how many unique
  firm-year combinations exist (unique ID × FiscalPeriod).
- After all merging is complete, the code counts how many
  firm-year combinations have **no missing values** across ANY
  of the added value columns.
- At the end, the code reports how many missing values exist for each
  value column (ca, cce, cl, etc.).

The final table has the columns:
    ID, PIT Date, HistCurrency, FiscalPeriod, AnnPITValue_Period,
    Value1, Value2, ..., ValueN
where each ValueX is defined by VALUE_COLUMN_NAMES in the config.
"""


from pathlib import Path
import time
import pandas as pd

# === CONFIG ===

OUTPUT_DIR = Path(Temp_file_path_A)
SEP = "|"

INPUT_FILES = [
    "Total_Assets.txt",
    "Total_Liabilities.txt",
    "Current_Assets___Total.txt",
    "Current_Liabilities___Total.txt",
    "Net_Income_Before_Extra_Items_Preferred_Divs.txt",
    "Funds_From_Operations.txt",
    "Extraordinary_Items.txt",
    "Disposal_of_Fixed_Assets.txt",
    "Funds_From_For_Other_Operating_Activities.txt",
    # add more file names here if needed ...
]

VALUE_COLUMN_NAMES = [
    "at",
    "tl",
    "ca",
    "cl",
    "ni_extra",
    "IFO_ffo",
    "IFO_ei",
    "IFO_dofa",
    "IFO_ffooa",
    # add more names here, one for each input file ...
]

OUTPUT_FILE = "data_Osc.txt"

ID_COL = "ID"
PIT_DATE_COL = "PIT Date"
HIST_CURR_COL = "HistCurrency"
FISCAL_PER_COL = "FiscalPeriod"
VALUE_COL = "AnnPITValue"
PERIOD_COL = "AnnPITValue_Period"   # NEW: period label

BASE_COLS = [ID_COL, PIT_DATE_COL, HIST_CURR_COL, FISCAL_PER_COL]

SAVE_INTERMEDIATE = False
# ==============


# --- SANITY CHECK ---
if not OUTPUT_DIR.exists():
    raise FileNotFoundError(
        f"OUTPUT_DIR does not exist:\n{OUTPUT_DIR}\n"
        f"Please make sure Temp_file_path_A is set correctly."
    )
print("Using data folder:", OUTPUT_DIR.resolve())
# ---------------------


def load_dataset(path: Path) -> pd.DataFrame:
    """
    Load a single dataset from path and keep only the relevant columns.

    The function:
    - checks if the file exists,
    - reads it using the configured separator,
    - checks that all required base + value columns are present,
    - keeps AnnPITValue_Period if present (or creates it as NA if not),
    - converts PIT Date to datetime,
    - casts ID to string,
    - converts AnnPITValue to numeric.
    """
    if not path.exists():
        raise FileNotFoundError(f"File not found: {path}")

    df = pd.read_csv(path, sep=SEP)

    needed_cols = BASE_COLS + [VALUE_COL]
    missing = [c for c in needed_cols if c not in df.columns]
    if missing:
        raise ValueError(f"Required columns {missing} are missing in file: {path}")

    extra_cols = [PERIOD_COL] if PERIOD_COL in df.columns else []
    df = df[needed_cols + extra_cols].copy()

    # Ensure AnnPITValue_Period exists
    if PERIOD_COL not in df.columns:
        df[PERIOD_COL] = pd.NA

    # Reorder to a consistent layout
    df = df[BASE_COLS + [PERIOD_COL, VALUE_COL]]

    # Ensure consistent data types
    df[PIT_DATE_COL] = pd.to_datetime(df[PIT_DATE_COL])
    df[ID_COL] = df[ID_COL].astype(str)
    df[VALUE_COL] = pd.to_numeric(df[VALUE_COL], errors="coerce")

    return df


def build_base_frame(dfs: list[pd.DataFrame]) -> pd.DataFrame:
    """
    Build the base dataset from a list of dataframes.

    Steps:
    - stack the identifier columns + AnnPITValue_Period from all dataframes,
    - drop duplicate combinations,
    - enforce correct data types,
    - sort the base dataset by (ID, HistCurrency, FiscalPeriod, AnnPITValue_Period, PIT Date).

    The result is the "skeleton" upon which all value columns will be merged.
    """
    base = pd.concat([df[BASE_COLS + [PERIOD_COL]] for df in dfs], ignore_index=True)

    base = base.drop_duplicates().reset_index(drop=True)

    base[PIT_DATE_COL] = pd.to_datetime(base[PIT_DATE_COL])
    base[ID_COL] = base[ID_COL].astype(str)

    sort_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL, PERIOD_COL, PIT_DATE_COL]
    base = base.sort_values(sort_cols).reset_index(drop=True)

    return base


def asof_merge_one(base: pd.DataFrame, df: pd.DataFrame, new_col_name: str) -> pd.DataFrame:
    """
    Perform an "as-of" merge of one dataset into the base dataframe.

    Concept:
    - For each combination of (ID, HistCurrency, FiscalPeriod) and PIT Date
      in the base table, we want the latest AnnPITValue from df with
          df.PIT Date <= base.PIT Date
      for the same (ID, HistCurrency, FiscalPeriod).
    - If df has non-NA AnnPITValue_Period values, we additionally require
      AnnPITValue_Period to match (period-aware as-of).
    - If df has AnnPITValue_Period all NA, we ignore the period column
      (period-agnostic) and the values can "fit all" periods.
    """
    # Determine whether this dataset is period-aware
    has_period = PERIOD_COL in df.columns and df[PERIOD_COL].notna().any()

    # Work on copies to avoid modifying input dataframes
    base_tmp = base.copy()
    base_tmp[new_col_name] = pd.NA
    base_tmp["__marker"] = "base"

    # Prepare df_tmp with appropriate columns
    if has_period:
        df_tmp = df[BASE_COLS + [PERIOD_COL, VALUE_COL]].copy()
    else:
        df_tmp = df[BASE_COLS + [VALUE_COL]].copy()
        # Ensure PERIOD_COL exists but remains NA (period-agnostic)
        df_tmp[PERIOD_COL] = pd.NA

    df_tmp = df_tmp.rename(columns={VALUE_COL: new_col_name})
    df_tmp["__marker"] = "df"

    # Ensure consistent types
    base_tmp[ID_COL] = base_tmp[ID_COL].astype(str)
    df_tmp[ID_COL] = df_tmp[ID_COL].astype(str)
    base_tmp[PIT_DATE_COL] = pd.to_datetime(base_tmp[PIT_DATE_COL])
    df_tmp[PIT_DATE_COL] = pd.to_datetime(df_tmp[PIT_DATE_COL])

    base_tmp[new_col_name] = pd.to_numeric(base_tmp[new_col_name], errors="coerce")
    df_tmp[new_col_name] = pd.to_numeric(df_tmp[new_col_name], errors="coerce")

    # Concatenate base and df rows
    combined = pd.concat([base_tmp, df_tmp], ignore_index=True)

    # Ensure df rows come before base rows on the same PIT Date
    marker_order = {"df": 0, "base": 1}
    combined["__order"] = combined["__marker"].map(marker_order).astype("int8")

    # Build group and sort columns
    group_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL]
    sort_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL]

    if has_period:
        group_cols.append(PERIOD_COL)
        sort_cols.append(PERIOD_COL)

    sort_cols += [PIT_DATE_COL, "__order"]

    # Sort and forward-fill
    combined = combined.sort_values(sort_cols)
    combined[new_col_name] = combined.groupby(group_cols)[new_col_name].ffill()

    # Keep only base rows
    result = combined[combined["__marker"] == "base"].copy()

    # Drop helper columns and resort
    result = result.drop(columns=["__marker", "__order"])
    result = result.sort_values(
        [ID_COL, HIST_CURR_COL, FISCAL_PER_COL, PERIOD_COL, PIT_DATE_COL]
    ).reset_index(drop=True)

    return result


def build_and_save_variable(
    input_files,
    value_column_names,
    output_file,
    save_intermediate: bool = False,
) -> Path:
    """
    Build the final "view" for a variable based on multiple input files and save it.
    """
    if len(input_files) == 0:
        raise ValueError("No INPUT_FILES were provided.")
    if len(input_files) != len(value_column_names):
        raise ValueError("INPUT_FILES and VALUE_COLUMN_NAMES must have the same length.")

    start_total = time.time()

    paths = [OUTPUT_DIR / f for f in input_files]

    print("\n--- Loading input files ---")
    t0 = time.time()
    dfs = [load_dataset(p) for p in paths]
    print(f"Loading and preprocessing finished in {time.time() - t0:.1f} seconds.")

    # Unique firm-year counts per input dataset
    print("\n--- Unique firm-year (ID, FiscalPeriod) counts per input file ---")
    for path, df in zip(paths, dfs):
        n_firm_years = df[[ID_COL, FISCAL_PER_COL]].drop_duplicates().shape[0]
        print(f"{path.name}: {n_firm_years:,} unique (ID, FiscalPeriod) combinations")

    print("\n--- Building base dataset ---")
    t0 = time.time()
    base = build_base_frame(dfs)
    print(
        f"Base dataset has {len(base):,} rows and was built in "
        f"{time.time() - t0:.1f} seconds."
    )

    print("\n--- Starting as-of merges ---")
    result = base
    for idx, (df, col_name) in enumerate(zip(dfs, value_column_names), start=1):
        print(f"[{idx}/{len(dfs)}] Merging value column '{col_name}' ...")
        t_merge = time.time()
        result = asof_merge_one(result, df, col_name)
        print(
            f"    Done in {time.time() - t_merge:.1f} seconds. "
            f"Result currently has {len(result):,} rows."
        )

        if save_intermediate:
            stem = output_file.rsplit(".", 1)[0]
            temp_out = OUTPUT_DIR / f"{stem}_partial_{idx}.txt"
            result.to_csv(temp_out, sep=SEP, index=False)
            print(f"    Intermediate file written to: {temp_out}")

    # Keep base identifier columns + period and the value columns
    final_cols = BASE_COLS + [PERIOD_COL] + value_column_names
    result = result[final_cols]

    # Final stats on firm-years and missing values
    print("\n--- Final dataset statistics ---")
    mask_complete = result[value_column_names].notna().all(axis=1)
    complete_firm_years = (
        result.loc[mask_complete, [ID_COL, FISCAL_PER_COL]]
        .drop_duplicates()
    )
    n_complete_firm_years = complete_firm_years.shape[0]
    print(
        f"Unique (ID, FiscalPeriod) combinations with ALL value columns non-empty: "
        f"{n_complete_firm_years:,}"
    )

    print("\nEmpty (NaN) values per value column:")
    for col in value_column_names:
        n_missing = result[col].isna().sum()
        print(f"  - {col}: {n_missing:,} empty values")

    out_path = OUTPUT_DIR / output_file
    result.to_csv(out_path, sep=SEP, index=False)

    print(f"\nFinal view written to:\n{out_path.resolve()}")
    print(f"Total runtime: {time.time() - start_total:.1f} seconds.")

    return out_path


# --- Execution ---
out_path = build_and_save_variable(
    input_files=INPUT_FILES,
    value_column_names=VALUE_COLUMN_NAMES,
    output_file=OUTPUT_FILE,
    save_intermediate=SAVE_INTERMEDIATE,
)


Using data folder: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies

--- Loading input files ---
Loading and preprocessing finished in 15.4 seconds.

--- Unique firm-year (ID, FiscalPeriod) counts per input file ---
Total_Assets.txt: 871,391 unique (ID, FiscalPeriod) combinations
Total_Liabilities.txt: 852,750 unique (ID, FiscalPeriod) combinations
Current_Assets___Total.txt: 876,857 unique (ID, FiscalPeriod) combinations
Current_Liabilities___Total.txt: 878,053 unique (ID, FiscalPeriod) combinations
Net_Income_Before_Extra_Items_Preferred_Divs.txt: 842,027 unique (ID, FiscalPeriod) combinations
Funds_From_Operations.txt: 828,759 unique (ID, FiscalPeriod) combinations
Extraordinary_Items.txt: 683,683 unique (ID, FiscalPeriod) combinations
Disposal_of_Fixed_Assets.txt: 648,468 unique (ID, FiscalPeriod) combinations
Funds_From_For_Other_Operating_Activities.txt: 670,513 unique (ID, FiscalPeriod) combinations

--- Building base dataset ---
Base dataset has 4,406,065 rows and was b

#### Pm

In [25]:
"""
Summary:
This cell
1) reads multiple input files from OUTPUT_DIR,
2) builds a base table with all unique combinations of
   (ID, PIT Date, HistCurrency, FiscalPeriod, AnnPITValue_Period),
3) for each input file, performs an "as-of" merge:
   for every base row it takes the latest AnnPITValue from that dataset
   with the same (ID, HistCurrency, FiscalPeriod) and PIT Date <= base PIT Date.
   If the dataset has AnnPITValue_Period, the merge is also grouped by that
   period label; if it does NOT have AnnPITValue_Period, it is treated as
   period-agnostic ("can fit all" periods).
4) writes the final combined view to OUTPUT_FILE in OUTPUT_DIR.

Additional tracking added:
- For each imported dataset, the code prints how many unique
  firm-year combinations exist (unique ID × FiscalPeriod).
- After all merging is complete, the code counts how many
  firm-year combinations have **no missing values** across ANY
  of the added value columns.
- At the end, the code reports how many missing values exist for each
  value column (ca, cce, cl, etc.).

The final table has the columns:
    ID, PIT Date, HistCurrency, FiscalPeriod, AnnPITValue_Period,
    Value1, Value2, ..., ValueN
where each ValueX is defined by VALUE_COLUMN_NAMES in the config.
"""


from pathlib import Path
import time
import pandas as pd

# === CONFIG ===

# Folder where all input files are stored and where the output will be written.
# Temp_file_path_A must already be defined in your notebook, for example:
# Temp_file_path_A = f"{BASE_PATH}/Temp/TempAnomalies"
OUTPUT_DIR = Path(Temp_file_path_A)

# Field separator in all input and output text files
SEP = "|"

# Input files (1..N). They are all expected to be located in OUTPUT_DIR.
INPUT_FILES = [
    "Net_Sales_or_Revenues.txt",
    "Cost_of_Goods_Sold_Excl_Depreciation.txt",
    # add more file names here if needed ...
]

# Names for the value columns corresponding to INPUT_FILES (same length as INPUT_FILES)
VALUE_COLUMN_NAMES = [
    "rev",
    "cogs",
    # add more names here, one for each input file ...
]

# Name of the final output file (will be written to OUTPUT_DIR)
OUTPUT_FILE = "data_Pm.txt"

# Column names in the input files (assumed to be identical in all files)
ID_COL = "ID"
PIT_DATE_COL = "PIT Date"
HIST_CURR_COL = "HistCurrency"
FISCAL_PER_COL = "FiscalPeriod"
VALUE_COL = "AnnPITValue"
PERIOD_COL = "AnnPITValue_Period"   # NEW: period label column

BASE_COLS = [ID_COL, PIT_DATE_COL, HIST_CURR_COL, FISCAL_PER_COL]

SAVE_INTERMEDIATE = False
# ==============


# --- SANITY CHECK ---
if not OUTPUT_DIR.exists():
    raise FileNotFoundError(
        f"OUTPUT_DIR does not exist:\n{OUTPUT_DIR}\n"
        f"Please make sure Temp_file_path_A is set correctly."
    )
print("Using data folder:", OUTPUT_DIR.resolve())
# ---------------------


def load_dataset(path: Path) -> pd.DataFrame:
    """
    Load a single dataset from path and keep only the relevant columns.

    The function:
    - checks if the file exists,
    - reads it using the configured separator,
    - checks that all required columns are present,
    - keeps AnnPITValue_Period if present (and creates it as NA if not),
    - converts PIT Date to datetime,
    - casts ID to string,
    - converts AnnPITValue to numeric.
    """
    if not path.exists():
        raise FileNotFoundError(f"File not found: {path}")

    df = pd.read_csv(path, sep=SEP)

    needed_cols = BASE_COLS + [VALUE_COL]
    missing = [c for c in needed_cols if c not in df.columns]
    if missing:
        raise ValueError(f"Required columns {missing} are missing in file: {path}")

    # Keep base + AnnPITValue + (optional) AnnPITValue_Period
    extra_cols = [PERIOD_COL] if PERIOD_COL in df.columns else []
    df = df[needed_cols + extra_cols].copy()

    # Ensure AnnPITValue_Period exists in all datasets
    if PERIOD_COL not in df.columns:
        df[PERIOD_COL] = pd.NA

    # Re-order columns for consistency
    df = df[BASE_COLS + [PERIOD_COL, VALUE_COL]]

    # Ensure consistent data types
    df[PIT_DATE_COL] = pd.to_datetime(df[PIT_DATE_COL])
    df[ID_COL] = df[ID_COL].astype(str)
    df[VALUE_COL] = pd.to_numeric(df[VALUE_COL], errors="coerce")

    return df


def build_base_frame(dfs: list[pd.DataFrame]) -> pd.DataFrame:
    """
    Build the base dataset from a list of dataframes.

    Steps:
    - stack the identifier columns (including AnnPITValue_Period) from all dataframes,
    - drop duplicate combinations,
    - enforce correct data types,
    - sort the base dataset by (ID, HistCurrency, FiscalPeriod, AnnPITValue_Period, PIT Date).

    The result is the "skeleton" upon which all value columns will be merged.
    """
    # Concatenate identifier columns from all datasets (base cols + period col)
    base = pd.concat(
        [df[BASE_COLS + [PERIOD_COL]] for df in dfs],
        ignore_index=True
    )

    # Remove duplicate rows of identifiers
    base = base.drop_duplicates().reset_index(drop=True)

    # Ensure consistent types
    base[PIT_DATE_COL] = pd.to_datetime(base[PIT_DATE_COL])
    base[ID_COL] = base[ID_COL].astype(str)

    # Sort by identifier columns and date
    sort_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL, PERIOD_COL, PIT_DATE_COL]
    base = base.sort_values(sort_cols).reset_index(drop=True)

    return base


def asof_merge_one(base: pd.DataFrame, df: pd.DataFrame, new_col_name: str) -> pd.DataFrame:
    """
    Perform an "as-of" merge of one dataset into the base dataframe.

    Concept:
    - For each combination of (ID, HistCurrency, FiscalPeriod)
      and for each PIT Date in the base table,
      we want the latest AnnPITValue from df with
          df.PIT Date <= base.PIT Date
      for the same (ID, HistCurrency, FiscalPeriod).
    - If df has a non-empty AnnPITValue_Period column, the as-of grouping
      is also done by AnnPITValue_Period.
      If AnnPITValue_Period is all missing (e.g. revenue), df is treated
      as period-agnostic and can "fit all" periods.

    Implementation (vectorized, without merge_asof):
      (same as before, but grouping keys dynamically include PERIOD_COL
       only for period-aware datasets)
    """
    # Determine if this dataset is period-aware (has any non-NA period labels)
    has_period = PERIOD_COL in df.columns and df[PERIOD_COL].notna().any()

    # Work on copies to avoid modifying original base/df
    base_tmp = base.copy()
    base_tmp[new_col_name] = pd.NA
    base_tmp["__marker"] = "base"

    # Keep only identifier columns and the value column from df, then rename
    if has_period:
        df_tmp = df[BASE_COLS + [PERIOD_COL, VALUE_COL]].copy()
    else:
        df_tmp = df[BASE_COLS + [VALUE_COL]].copy()
        # Ensure PERIOD_COL exists but remains NA (period-agnostic)
        df_tmp[PERIOD_COL] = pd.NA

    df_tmp = df_tmp.rename(columns={VALUE_COL: new_col_name})
    df_tmp["__marker"] = "df"

    # Ensure consistent types for safety
    base_tmp[ID_COL] = base_tmp[ID_COL].astype(str)
    df_tmp[ID_COL] = df_tmp[ID_COL].astype(str)
    base_tmp[PIT_DATE_COL] = pd.to_datetime(base_tmp[PIT_DATE_COL])
    df_tmp[PIT_DATE_COL] = pd.to_datetime(df_tmp[PIT_DATE_COL])

    base_tmp[new_col_name] = pd.to_numeric(base_tmp[new_col_name], errors="coerce")
    df_tmp[new_col_name] = pd.to_numeric(df_tmp[new_col_name], errors="coerce")

    # Concatenate base rows and df rows
    combined = pd.concat([base_tmp, df_tmp], ignore_index=True)

    # Define an order so that df rows come before base rows on the same PIT Date
    marker_order = {"df": 0, "base": 1}
    combined["__order"] = combined["__marker"].map(marker_order).astype("int8")

    # Build sort and group keys
    group_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL]
    sort_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL]

    if has_period:
        # Period-aware dataset: also group by AnnPITValue_Period
        group_cols.append(PERIOD_COL)
        sort_cols.append(PERIOD_COL)

    sort_cols += [PIT_DATE_COL, "__order"]

    # Sort accordingly
    combined = combined.sort_values(sort_cols)

    # Forward-fill the value within each group to implement "as-of" logic
    combined[new_col_name] = combined.groupby(group_cols)[new_col_name].ffill()

    # Keep only rows that belong to the base dataset
    result = combined[combined["__marker"] == "base"].copy()

    # Drop helper columns and sort final result
    result = result.drop(columns=["__marker", "__order"])
    result = result.sort_values(
        [ID_COL, HIST_CURR_COL, FISCAL_PER_COL, PERIOD_COL, PIT_DATE_COL]
    ).reset_index(drop=True)

    return result


def build_and_save_variable(
    input_files,
    value_column_names,
    output_file,
    save_intermediate: bool = False,
) -> Path:
    """
    Build the final "view" for a variable based on multiple input files and save it.

    Steps:
    1) Validate arguments (non-empty, same length for files and column names).
    2) Load and preprocess each input file.
    3) Build the base dataset with all unique identifier combinations
       (including AnnPITValue_Period).
    4) For each input dataframe, perform an as-of merge of its AnnPITValue into the base,
       using AnnPITValue_Period in the grouping if present in that dataset.
    5) Keep only the base columns and the value columns.
    6) Write the final result to output_file in OUTPUT_DIR.
    """
    if len(input_files) == 0:
        raise ValueError("No INPUT_FILES were provided.")
    if len(input_files) != len(value_column_names):
        raise ValueError("INPUT_FILES and VALUE_COLUMN_NAMES must have the same length.")

    start_total = time.time()

    # Build full paths
    paths = [OUTPUT_DIR / f for f in input_files]

    print("\n--- Loading input files ---")
    t0 = time.time()
    dfs = [load_dataset(p) for p in paths]
    print(f"Loading and preprocessing finished in {time.time() - t0:.1f} seconds.")

    # Unique firm-year counts per input dataset
    print("\n--- Unique firm-year (ID, FiscalPeriod) counts per input file ---")
    for path, df in zip(paths, dfs):
        n_firm_years = df[[ID_COL, FISCAL_PER_COL]].drop_duplicates().shape[0]
        print(f"{path.name}: {n_firm_years:,} unique (ID, FiscalPeriod) combinations")

    print("\n--- Building base dataset ---")
    t0 = time.time()
    base = build_base_frame(dfs)
    print(
        f"Base dataset has {len(base):,} rows and was built in "
        f"{time.time() - t0:.1f} seconds."
    )

    print("\n--- Starting as-of merges ---")
    result = base
    for idx, (df, col_name) in enumerate(zip(dfs, value_column_names), start=1):
        print(f"[{idx}/{len(dfs)}] Merging value column '{col_name}' ...")
        t_merge = time.time()
        result = asof_merge_one(result, df, col_name)
        print(
            f"    Done in {time.time() - t_merge:.1f} seconds. "
            f"Result currently has {len(result):,} rows."
        )

        if save_intermediate:
            stem = output_file.rsplit(".", 1)[0]
            temp_out = OUTPUT_DIR / f"{stem}_partial_{idx}.txt"
            result.to_csv(temp_out, sep=SEP, index=False)
            print(f"    Intermediate file written to: {temp_out}")

    # Keep only the base identifier columns (including period) and the value columns
    final_cols = BASE_COLS + [PERIOD_COL] + value_column_names
    result = result[final_cols]

    # Final stats on firm-years and missing values
    print("\n--- Final dataset statistics ---")
    mask_complete = result[value_column_names].notna().all(axis=1)
    complete_firm_years = (
        result.loc[mask_complete, [ID_COL, FISCAL_PER_COL]]
        .drop_duplicates()
    )
    n_complete_firm_years = complete_firm_years.shape[0]
    print(
        f"Unique (ID, FiscalPeriod) combinations with ALL value columns non-empty: "
        f"{n_complete_firm_years:,}"
    )

    print("\nEmpty (NaN) values per value column:")
    for col in value_column_names:
        n_missing = result[col].isna().sum()
        print(f"  - {col}: {n_missing:,} empty values")

    # Final output
    out_path = OUTPUT_DIR / output_file
    result.to_csv(out_path, sep=SEP, index=False)

    print(f"\nFinal view written to:\n{out_path.resolve()}")
    print(f"Total runtime: {time.time() - start_total:.1f} seconds.")

    return out_path


# --- Execution ---
out_path = build_and_save_variable(
    input_files=INPUT_FILES,
    value_column_names=VALUE_COLUMN_NAMES,
    output_file=OUTPUT_FILE,
    save_intermediate=SAVE_INTERMEDIATE,
)


Using data folder: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies

--- Loading input files ---
Loading and preprocessing finished in 4.1 seconds.

--- Unique firm-year (ID, FiscalPeriod) counts per input file ---
Net_Sales_or_Revenues.txt: 817,863 unique (ID, FiscalPeriod) combinations
Cost_of_Goods_Sold_Excl_Depreciation.txt: 818,491 unique (ID, FiscalPeriod) combinations

--- Building base dataset ---
Base dataset has 1,785,864 rows and was built in 5.1 seconds.

--- Starting as-of merges ---
[1/2] Merging value column 'rev' ...
    Done in 6.5 seconds. Result currently has 1,785,864 rows.
[2/2] Merging value column 'cogs' ...
    Done in 6.8 seconds. Result currently has 1,785,864 rows.

--- Final dataset statistics ---
Unique (ID, FiscalPeriod) combinations with ALL value columns non-empty: 795,439

Empty (NaN) values per value column:
  - rev: 36,843 empty values
  - cogs: 135,793 empty values

Final view written to:
/home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomali

#### Poa

In [26]:
"""
Summary:
This cell
1) reads multiple input files from OUTPUT_DIR,
2) builds a base table with all unique combinations of
   (ID, PIT Date, HistCurrency, FiscalPeriod, AnnPITValue_Period),
3) for each input file, performs an "as-of" merge:
   for every base row it takes the latest AnnPITValue from that dataset
   with the same (ID, HistCurrency, FiscalPeriod) and PIT Date <= base PIT Date.
   If the dataset has AnnPITValue_Period, the merge is also grouped by that
   period label; if it does NOT have AnnPITValue_Period, it is treated as
   period-agnostic ("can fit all" periods).
4) writes the final combined view to OUTPUT_FILE in OUTPUT_DIR.

Additional tracking added:
- For each imported dataset, the code prints how many unique
  firm-year combinations exist (unique ID × FiscalPeriod).
- After all merging is complete, the code counts how many
  firm-year combinations have **no missing values** across ANY
  of the added value columns.
- At the end, the code reports how many missing values exist for each
  value column (ca, cce, cl, etc.).

The final table has the columns:
    ID, PIT Date, HistCurrency, FiscalPeriod, AnnPITValue_Period,
    Value1, Value2, ..., ValueN
where each ValueX is defined by VALUE_COLUMN_NAMES in the config.
"""


from pathlib import Path
import time
import pandas as pd

# === CONFIG ===

# Folder where all input files are stored and where the output will be written.
# Temp_file_path_A must already be defined in your notebook, for example:
# Temp_file_path_A = f"{BASE_PATH}/Temp/TempAnomalies"
OUTPUT_DIR = Path(Temp_file_path_A)

# Field separator in all input and output text files
SEP = "|"

# Input files (1..N). They are all expected to be located in OUTPUT_DIR.
INPUT_FILES = [
    "Net_Income_Before_Extra_Items_Preferred_Divs.txt",
    "Net_Cash_Flow___Operating_Activities.txt",
    # add more file names here if needed ...
]

# Names for the value columns corresponding to INPUT_FILES (same length as INPUT_FILES)
VALUE_COLUMN_NAMES = [
    "ni_extra",
    "opcf",
    # add more names here, one for each input file ...
]

# Name of the final output file (will be written to OUTPUT_DIR)
OUTPUT_FILE = "data_Poa.txt"

# Column names in the input files (assumed to be identical in all files)
ID_COL = "ID"
PIT_DATE_COL = "PIT Date"
HIST_CURR_COL = "HistCurrency"
FISCAL_PER_COL = "FiscalPeriod"
VALUE_COL = "AnnPITValue"
PERIOD_COL = "AnnPITValue_Period"   # NEW: period label column

BASE_COLS = [ID_COL, PIT_DATE_COL, HIST_CURR_COL, FISCAL_PER_COL]

SAVE_INTERMEDIATE = False
# ==============


# --- SANITY CHECK ---
if not OUTPUT_DIR.exists():
    raise FileNotFoundError(
        f"OUTPUT_DIR does not exist:\n{OUTPUT_DIR}\n"
        f"Please make sure Temp_file_path_A is set correctly."
    )
print("Using data folder:", OUTPUT_DIR.resolve())
# ---------------------


def load_dataset(path: Path) -> pd.DataFrame:
    """
    Load a single dataset from path and keep only the relevant columns.

    The function:
    - checks if the file exists,
    - reads it using the configured separator,
    - checks that all required columns are present,
    - keeps AnnPITValue_Period if present (and creates it as NA if not),
    - converts PIT Date to datetime,
    - casts ID to string,
    - converts AnnPITValue to numeric.
    """
    if not path.exists():
        raise FileNotFoundError(f"File not found: {path}")

    df = pd.read_csv(path, sep=SEP)

    needed_cols = BASE_COLS + [VALUE_COL]
    missing = [c for c in needed_cols if c not in df.columns]
    if missing:
        raise ValueError(f"Required columns {missing} are missing in file: {path}")

    # Keep base + AnnPITValue + (optional) AnnPITValue_Period
    extra_cols = [PERIOD_COL] if PERIOD_COL in df.columns else []
    df = df[needed_cols + extra_cols].copy()

    # Ensure AnnPITValue_Period exists in all datasets
    if PERIOD_COL not in df.columns:
        df[PERIOD_COL] = pd.NA

    # Re-order columns for consistency
    df = df[BASE_COLS + [PERIOD_COL, VALUE_COL]]

    # Ensure consistent data types
    df[PIT_DATE_COL] = pd.to_datetime(df[PIT_DATE_COL])
    df[ID_COL] = df[ID_COL].astype(str)
    df[VALUE_COL] = pd.to_numeric(df[VALUE_COL], errors="coerce")

    return df


def build_base_frame(dfs: list[pd.DataFrame]) -> pd.DataFrame:
    """
    Build the base dataset from a list of dataframes.

    Steps:
    - stack the identifier columns (including AnnPITValue_Period) from all dataframes,
    - drop duplicate combinations,
    - enforce correct data types,
    - sort the base dataset by (ID, HistCurrency, FiscalPeriod, AnnPITValue_Period, PIT Date).

    The result is the "skeleton" upon which all value columns will be merged.
    """
    # Concatenate identifier columns from all datasets (base cols + period col)
    base = pd.concat(
        [df[BASE_COLS + [PERIOD_COL]] for df in dfs],
        ignore_index=True
    )

    # Remove duplicate rows of identifiers
    base = base.drop_duplicates().reset_index(drop=True)

    # Ensure consistent types
    base[PIT_DATE_COL] = pd.to_datetime(base[PIT_DATE_COL])
    base[ID_COL] = base[ID_COL].astype(str)

    # Sort by identifier columns and date
    sort_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL, PERIOD_COL, PIT_DATE_COL]
    base = base.sort_values(sort_cols).reset_index(drop=True)

    return base


def asof_merge_one(base: pd.DataFrame, df: pd.DataFrame, new_col_name: str) -> pd.DataFrame:
    """
    Perform an "as-of" merge of one dataset into the base dataframe.

    Concept:
    - For each combination of (ID, HistCurrency, FiscalPeriod)
      and for each PIT Date in the base table,
      we want the latest AnnPITValue from df with
          df.PIT Date <= base.PIT Date
      for the same (ID, HistCurrency, FiscalPeriod).
    - If df has a non-empty AnnPITValue_Period column, the as-of grouping
      is also done by AnnPITValue_Period.
      If AnnPITValue_Period is all missing (e.g. revenue), df is treated
      as period-agnostic and can "fit all" periods.

    Implementation (vectorized, without merge_asof):
      (same as before, but grouping keys dynamically include PERIOD_COL
       only for period-aware datasets)
    """
    # Determine if this dataset is period-aware (has any non-NA period labels)
    has_period = PERIOD_COL in df.columns and df[PERIOD_COL].notna().any()

    # Work on copies to avoid modifying original base/df
    base_tmp = base.copy()
    base_tmp[new_col_name] = pd.NA
    base_tmp["__marker"] = "base"

    # Keep only identifier columns and the value column from df, then rename
    if has_period:
        df_tmp = df[BASE_COLS + [PERIOD_COL, VALUE_COL]].copy()
    else:
        df_tmp = df[BASE_COLS + [VALUE_COL]].copy()
        # Ensure PERIOD_COL exists but remains NA (period-agnostic)
        df_tmp[PERIOD_COL] = pd.NA

    df_tmp = df_tmp.rename(columns={VALUE_COL: new_col_name})
    df_tmp["__marker"] = "df"

    # Ensure consistent types for safety
    base_tmp[ID_COL] = base_tmp[ID_COL].astype(str)
    df_tmp[ID_COL] = df_tmp[ID_COL].astype(str)
    base_tmp[PIT_DATE_COL] = pd.to_datetime(base_tmp[PIT_DATE_COL])
    df_tmp[PIT_DATE_COL] = pd.to_datetime(df_tmp[PIT_DATE_COL])

    base_tmp[new_col_name] = pd.to_numeric(base_tmp[new_col_name], errors="coerce")
    df_tmp[new_col_name] = pd.to_numeric(df_tmp[new_col_name], errors="coerce")

    # Concatenate base rows and df rows
    combined = pd.concat([base_tmp, df_tmp], ignore_index=True)

    # Define an order so that df rows come before base rows on the same PIT Date
    marker_order = {"df": 0, "base": 1}
    combined["__order"] = combined["__marker"].map(marker_order).astype("int8")

    # Build sort and group keys
    group_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL]
    sort_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL]

    if has_period:
        # Period-aware dataset: also group by AnnPITValue_Period
        group_cols.append(PERIOD_COL)
        sort_cols.append(PERIOD_COL)

    sort_cols += [PIT_DATE_COL, "__order"]

    # Sort accordingly
    combined = combined.sort_values(sort_cols)

    # Forward-fill the value within each group to implement "as-of" logic
    combined[new_col_name] = combined.groupby(group_cols)[new_col_name].ffill()

    # Keep only rows that belong to the base dataset
    result = combined[combined["__marker"] == "base"].copy()

    # Drop helper columns and sort final result
    result = result.drop(columns=["__marker", "__order"])
    result = result.sort_values(
        [ID_COL, HIST_CURR_COL, FISCAL_PER_COL, PERIOD_COL, PIT_DATE_COL]
    ).reset_index(drop=True)

    return result


def build_and_save_variable(
    input_files,
    value_column_names,
    output_file,
    save_intermediate: bool = False,
) -> Path:
    """
    Build the final "view" for a variable based on multiple input files and save it.

    Steps:
    1) Validate arguments (non-empty, same length for files and column names).
    2) Load and preprocess each input file.
    3) Build the base dataset with all unique identifier combinations
       (including AnnPITValue_Period).
    4) For each input dataframe, perform an as-of merge of its AnnPITValue into the base,
       using AnnPITValue_Period in the grouping if present in that dataset.
    5) Keep only the base columns and the value columns.
    6) Write the final result to output_file in OUTPUT_DIR.
    """
    if len(input_files) == 0:
        raise ValueError("No INPUT_FILES were provided.")
    if len(input_files) != len(value_column_names):
        raise ValueError("INPUT_FILES and VALUE_COLUMN_NAMES must have the same length.")

    start_total = time.time()

    # Build full paths
    paths = [OUTPUT_DIR / f for f in input_files]

    print("\n--- Loading input files ---")
    t0 = time.time()
    dfs = [load_dataset(p) for p in paths]
    print(f"Loading and preprocessing finished in {time.time() - t0:.1f} seconds.")

    # Unique firm-year counts per input dataset
    print("\n--- Unique firm-year (ID, FiscalPeriod) counts per input file ---")
    for path, df in zip(paths, dfs):
        n_firm_years = df[[ID_COL, FISCAL_PER_COL]].drop_duplicates().shape[0]
        print(f"{path.name}: {n_firm_years:,} unique (ID, FiscalPeriod) combinations")

    print("\n--- Building base dataset ---")
    t0 = time.time()
    base = build_base_frame(dfs)
    print(
        f"Base dataset has {len(base):,} rows and was built in "
        f"{time.time() - t0:.1f} seconds."
    )

    print("\n--- Starting as-of merges ---")
    result = base
    for idx, (df, col_name) in enumerate(zip(dfs, value_column_names), start=1):
        print(f"[{idx}/{len(dfs)}] Merging value column '{col_name}' ...")
        t_merge = time.time()
        result = asof_merge_one(result, df, col_name)
        print(
            f"    Done in {time.time() - t_merge:.1f} seconds. "
            f"Result currently has {len(result):,} rows."
        )

        if save_intermediate:
            stem = output_file.rsplit(".", 1)[0]
            temp_out = OUTPUT_DIR / f"{stem}_partial_{idx}.txt"
            result.to_csv(temp_out, sep=SEP, index=False)
            print(f"    Intermediate file written to: {temp_out}")

    # Keep only the base identifier columns (including period) and the value columns
    final_cols = BASE_COLS + [PERIOD_COL] + value_column_names
    result = result[final_cols]

    # Final stats on firm-years and missing values
    print("\n--- Final dataset statistics ---")
    mask_complete = result[value_column_names].notna().all(axis=1)
    complete_firm_years = (
        result.loc[mask_complete, [ID_COL, FISCAL_PER_COL]]
        .drop_duplicates()
    )
    n_complete_firm_years = complete_firm_years.shape[0]
    print(
        f"Unique (ID, FiscalPeriod) combinations with ALL value columns non-empty: "
        f"{n_complete_firm_years:,}"
    )

    print("\nEmpty (NaN) values per value column:")
    for col in value_column_names:
        n_missing = result[col].isna().sum()
        print(f"  - {col}: {n_missing:,} empty values")

    # Final output
    out_path = OUTPUT_DIR / output_file
    result.to_csv(out_path, sep=SEP, index=False)

    print(f"\nFinal view written to:\n{out_path.resolve()}")
    print(f"Total runtime: {time.time() - start_total:.1f} seconds.")

    return out_path


# --- Execution ---
out_path = build_and_save_variable(
    input_files=INPUT_FILES,
    value_column_names=VALUE_COLUMN_NAMES,
    output_file=OUTPUT_FILE,
    save_intermediate=SAVE_INTERMEDIATE,
)


Using data folder: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies

--- Loading input files ---
Loading and preprocessing finished in 3.1 seconds.

--- Unique firm-year (ID, FiscalPeriod) counts per input file ---
Net_Income_Before_Extra_Items_Preferred_Divs.txt: 842,027 unique (ID, FiscalPeriod) combinations
Net_Cash_Flow___Operating_Activities.txt: 751,599 unique (ID, FiscalPeriod) combinations

--- Building base dataset ---
Base dataset has 1,527,513 rows and was built in 3.9 seconds.

--- Starting as-of merges ---
[1/2] Merging value column 'ni_extra' ...
    Done in 5.8 seconds. Result currently has 1,527,513 rows.
[2/2] Merging value column 'opcf' ...
    Done in 4.8 seconds. Result currently has 1,527,513 rows.

--- Final dataset statistics ---
Unique (ID, FiscalPeriod) combinations with ALL value columns non-empty: 751,406

Empty (NaN) values per value column:
  - ni_extra: 12,508 empty values
  - opcf: 237,895 empty values

Final view written to:
/home/jovyan/work/hpo

#### Pro

In [27]:
"""
Summary:
This cell
1) reads multiple input files from OUTPUT_DIR,
2) builds a base table with all unique combinations of
   (ID, PIT Date, HistCurrency, FiscalPeriod, AnnPITValue_Period),
3) for each input file, performs an "as-of" merge:
   for every base row it takes the latest AnnPITValue from that dataset
   with the same (ID, HistCurrency, FiscalPeriod) and PIT Date <= base PIT Date.
   If the dataset has AnnPITValue_Period, the merge is also grouped by that
   period label; if it does NOT have AnnPITValue_Period, it is treated as
   period-agnostic ("can fit all" periods).
4) writes the final combined view to OUTPUT_FILE in OUTPUT_DIR.

Additional tracking added:
- For each imported dataset, the code prints how many unique
  firm-year combinations exist (unique ID × FiscalPeriod).
- After all merging is complete, the code counts how many
  firm-year combinations have **no missing values** across ANY
  of the added value columns.
- At the end, the code reports how many missing values exist for each
  value column (ca, cce, cl, etc.).

The final table has the columns:
    ID, PIT Date, HistCurrency, FiscalPeriod, AnnPITValue_Period,
    Value1, Value2, ..., ValueN
where each ValueX is defined by VALUE_COLUMN_NAMES in the config.
"""


from pathlib import Path
import time
import pandas as pd

# === CONFIG ===

# Folder where all input files are stored and where the output will be written.
# Temp_file_path_A must already be defined in your notebook, for example:
# Temp_file_path_A = f"{BASE_PATH}/Temp/TempAnomalies"
OUTPUT_DIR = Path(Temp_file_path_A)

# Field separator in all input and output text files
SEP = "|"

# Input files (1..N). They are all expected to be located in OUTPUT_DIR.
INPUT_FILES = [
    "Net_Income_Used_to_Calculate_Basic_EPS.txt",
    "Total_Assets.txt",
    # add more file names here if needed ...
]

# Names for the value columns corresponding to INPUT_FILES (same length as INPUT_FILES)
VALUE_COLUMN_NAMES = [
    "ni_eps",
    "at",
    # add more names here, one for each input file ...
]

# Name of the final output file (will be written to OUTPUT_DIR)
OUTPUT_FILE = "data_Pro.txt"

# Column names in the input files (assumed to be identical in all files)
ID_COL = "ID"
PIT_DATE_COL = "PIT Date"
HIST_CURR_COL = "HistCurrency"
FISCAL_PER_COL = "FiscalPeriod"
VALUE_COL = "AnnPITValue"
PERIOD_COL = "AnnPITValue_Period"   # NEW: period label column

BASE_COLS = [ID_COL, PIT_DATE_COL, HIST_CURR_COL, FISCAL_PER_COL]

SAVE_INTERMEDIATE = False
# ==============


# --- SANITY CHECK ---
if not OUTPUT_DIR.exists():
    raise FileNotFoundError(
        f"OUTPUT_DIR does not exist:\n{OUTPUT_DIR}\n"
        f"Please make sure Temp_file_path_A is set correctly."
    )
print("Using data folder:", OUTPUT_DIR.resolve())
# ---------------------


def load_dataset(path: Path) -> pd.DataFrame:
    """
    Load a single dataset from path and keep only the relevant columns.

    The function:
    - checks if the file exists,
    - reads it using the configured separator,
    - checks that all required columns are present,
    - keeps AnnPITValue_Period if present (and creates it as NA if not),
    - converts PIT Date to datetime,
    - casts ID to string,
    - converts AnnPITValue to numeric.
    """
    if not path.exists():
        raise FileNotFoundError(f"File not found: {path}")

    df = pd.read_csv(path, sep=SEP)

    needed_cols = BASE_COLS + [VALUE_COL]
    missing = [c for c in needed_cols if c not in df.columns]
    if missing:
        raise ValueError(f"Required columns {missing} are missing in file: {path}")

    # Keep base + AnnPITValue + (optional) AnnPITValue_Period
    extra_cols = [PERIOD_COL] if PERIOD_COL in df.columns else []
    df = df[needed_cols + extra_cols].copy()

    # Ensure AnnPITValue_Period exists in all datasets
    if PERIOD_COL not in df.columns:
        df[PERIOD_COL] = pd.NA

    # Re-order columns for consistency
    df = df[BASE_COLS + [PERIOD_COL, VALUE_COL]]

    # Ensure consistent data types
    df[PIT_DATE_COL] = pd.to_datetime(df[PIT_DATE_COL])
    df[ID_COL] = df[ID_COL].astype(str)
    df[VALUE_COL] = pd.to_numeric(df[VALUE_COL], errors="coerce")

    return df


def build_base_frame(dfs: list[pd.DataFrame]) -> pd.DataFrame:
    """
    Build the base dataset from a list of dataframes.

    Steps:
    - stack the identifier columns (including AnnPITValue_Period) from all dataframes,
    - drop duplicate combinations,
    - enforce correct data types,
    - sort the base dataset by (ID, HistCurrency, FiscalPeriod, AnnPITValue_Period, PIT Date).

    The result is the "skeleton" upon which all value columns will be merged.
    """
    # Concatenate identifier columns from all datasets (base cols + period col)
    base = pd.concat(
        [df[BASE_COLS + [PERIOD_COL]] for df in dfs],
        ignore_index=True
    )

    # Remove duplicate rows of identifiers
    base = base.drop_duplicates().reset_index(drop=True)

    # Ensure consistent types
    base[PIT_DATE_COL] = pd.to_datetime(base[PIT_DATE_COL])
    base[ID_COL] = base[ID_COL].astype(str)

    # Sort by identifier columns and date
    sort_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL, PERIOD_COL, PIT_DATE_COL]
    base = base.sort_values(sort_cols).reset_index(drop=True)

    return base


def asof_merge_one(base: pd.DataFrame, df: pd.DataFrame, new_col_name: str) -> pd.DataFrame:
    """
    Perform an "as-of" merge of one dataset into the base dataframe.

    Concept:
    - For each combination of (ID, HistCurrency, FiscalPeriod)
      and for each PIT Date in the base table,
      we want the latest AnnPITValue from df with
          df.PIT Date <= base.PIT Date
      for the same (ID, HistCurrency, FiscalPeriod).
    - If df has a non-empty AnnPITValue_Period column, the as-of grouping
      is also done by AnnPITValue_Period.
      If AnnPITValue_Period is all missing (e.g. revenue), df is treated
      as period-agnostic and can "fit all" periods.

    Implementation (vectorized, without merge_asof):
      (same as before, but grouping keys dynamically include PERIOD_COL
       only for period-aware datasets)
    """
    # Determine if this dataset is period-aware (has any non-NA period labels)
    has_period = PERIOD_COL in df.columns and df[PERIOD_COL].notna().any()

    # Work on copies to avoid modifying original base/df
    base_tmp = base.copy()
    base_tmp[new_col_name] = pd.NA
    base_tmp["__marker"] = "base"

    # Keep only identifier columns and the value column from df, then rename
    if has_period:
        df_tmp = df[BASE_COLS + [PERIOD_COL, VALUE_COL]].copy()
    else:
        df_tmp = df[BASE_COLS + [VALUE_COL]].copy()
        # Ensure PERIOD_COL exists but remains NA (period-agnostic)
        df_tmp[PERIOD_COL] = pd.NA

    df_tmp = df_tmp.rename(columns={VALUE_COL: new_col_name})
    df_tmp["__marker"] = "df"

    # Ensure consistent types for safety
    base_tmp[ID_COL] = base_tmp[ID_COL].astype(str)
    df_tmp[ID_COL] = df_tmp[ID_COL].astype(str)
    base_tmp[PIT_DATE_COL] = pd.to_datetime(base_tmp[PIT_DATE_COL])
    df_tmp[PIT_DATE_COL] = pd.to_datetime(df_tmp[PIT_DATE_COL])

    base_tmp[new_col_name] = pd.to_numeric(base_tmp[new_col_name], errors="coerce")
    df_tmp[new_col_name] = pd.to_numeric(df_tmp[new_col_name], errors="coerce")

    # Concatenate base rows and df rows
    combined = pd.concat([base_tmp, df_tmp], ignore_index=True)

    # Define an order so that df rows come before base rows on the same PIT Date
    marker_order = {"df": 0, "base": 1}
    combined["__order"] = combined["__marker"].map(marker_order).astype("int8")

    # Build sort and group keys
    group_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL]
    sort_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL]

    if has_period:
        # Period-aware dataset: also group by AnnPITValue_Period
        group_cols.append(PERIOD_COL)
        sort_cols.append(PERIOD_COL)

    sort_cols += [PIT_DATE_COL, "__order"]

    # Sort accordingly
    combined = combined.sort_values(sort_cols)

    # Forward-fill the value within each group to implement "as-of" logic
    combined[new_col_name] = combined.groupby(group_cols)[new_col_name].ffill()

    # Keep only rows that belong to the base dataset
    result = combined[combined["__marker"] == "base"].copy()

    # Drop helper columns and sort final result
    result = result.drop(columns=["__marker", "__order"])
    result = result.sort_values(
        [ID_COL, HIST_CURR_COL, FISCAL_PER_COL, PERIOD_COL, PIT_DATE_COL]
    ).reset_index(drop=True)

    return result


def build_and_save_variable(
    input_files,
    value_column_names,
    output_file,
    save_intermediate: bool = False,
) -> Path:
    """
    Build the final "view" for a variable based on multiple input files and save it.

    Steps:
    1) Validate arguments (non-empty, same length for files and column names).
    2) Load and preprocess each input file.
    3) Build the base dataset with all unique identifier combinations
       (including AnnPITValue_Period).
    4) For each input dataframe, perform an as-of merge of its AnnPITValue into the base,
       using AnnPITValue_Period in the grouping if present in that dataset.
    5) Keep only the base columns and the value columns.
    6) Write the final result to output_file in OUTPUT_DIR.
    """
    if len(input_files) == 0:
        raise ValueError("No INPUT_FILES were provided.")
    if len(input_files) != len(value_column_names):
        raise ValueError("INPUT_FILES and VALUE_COLUMN_NAMES must have the same length.")

    start_total = time.time()

    # Build full paths
    paths = [OUTPUT_DIR / f for f in input_files]

    print("\n--- Loading input files ---")
    t0 = time.time()
    dfs = [load_dataset(p) for p in paths]
    print(f"Loading and preprocessing finished in {time.time() - t0:.1f} seconds.")

    # Unique firm-year counts per input dataset
    print("\n--- Unique firm-year (ID, FiscalPeriod) counts per input file ---")
    for path, df in zip(paths, dfs):
        n_firm_years = df[[ID_COL, FISCAL_PER_COL]].drop_duplicates().shape[0]
        print(f"{path.name}: {n_firm_years:,} unique (ID, FiscalPeriod) combinations")

    print("\n--- Building base dataset ---")
    t0 = time.time()
    base = build_base_frame(dfs)
    print(
        f"Base dataset has {len(base):,} rows and was built in "
        f"{time.time() - t0:.1f} seconds."
    )

    print("\n--- Starting as-of merges ---")
    result = base
    for idx, (df, col_name) in enumerate(zip(dfs, value_column_names), start=1):
        print(f"[{idx}/{len(dfs)}] Merging value column '{col_name}' ...")
        t_merge = time.time()
        result = asof_merge_one(result, df, col_name)
        print(
            f"    Done in {time.time() - t_merge:.1f} seconds. "
            f"Result currently has {len(result):,} rows."
        )

        if save_intermediate:
            stem = output_file.rsplit(".", 1)[0]
            temp_out = OUTPUT_DIR / f"{stem}_partial_{idx}.txt"
            result.to_csv(temp_out, sep=SEP, index=False)
            print(f"    Intermediate file written to: {temp_out}")

    # Keep only the base identifier columns (including period) and the value columns
    final_cols = BASE_COLS + [PERIOD_COL] + value_column_names
    result = result[final_cols]

    # Final stats on firm-years and missing values
    print("\n--- Final dataset statistics ---")
    mask_complete = result[value_column_names].notna().all(axis=1)
    complete_firm_years = (
        result.loc[mask_complete, [ID_COL, FISCAL_PER_COL]]
        .drop_duplicates()
    )
    n_complete_firm_years = complete_firm_years.shape[0]
    print(
        f"Unique (ID, FiscalPeriod) combinations with ALL value columns non-empty: "
        f"{n_complete_firm_years:,}"
    )

    print("\nEmpty (NaN) values per value column:")
    for col in value_column_names:
        n_missing = result[col].isna().sum()
        print(f"  - {col}: {n_missing:,} empty values")

    # Final output
    out_path = OUTPUT_DIR / output_file
    result.to_csv(out_path, sep=SEP, index=False)

    print(f"\nFinal view written to:\n{out_path.resolve()}")
    print(f"Total runtime: {time.time() - start_total:.1f} seconds.")

    return out_path


# --- Execution ---
out_path = build_and_save_variable(
    input_files=INPUT_FILES,
    value_column_names=VALUE_COLUMN_NAMES,
    output_file=OUTPUT_FILE,
    save_intermediate=SAVE_INTERMEDIATE,
)


Using data folder: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies

--- Loading input files ---
Loading and preprocessing finished in 4.3 seconds.

--- Unique firm-year (ID, FiscalPeriod) counts per input file ---
Net_Income_Used_to_Calculate_Basic_EPS.txt: 842,036 unique (ID, FiscalPeriod) combinations
Total_Assets.txt: 871,391 unique (ID, FiscalPeriod) combinations

--- Building base dataset ---
Base dataset has 3,700,378 rows and was built in 4.8 seconds.

--- Starting as-of merges ---
[1/2] Merging value column 'ni_eps' ...
    Done in 8.0 seconds. Result currently has 3,700,378 rows.
[2/2] Merging value column 'at' ...
    Done in 7.6 seconds. Result currently has 3,700,378 rows.

--- Final dataset statistics ---
Unique (ID, FiscalPeriod) combinations with ALL value columns non-empty: 793,794

Empty (NaN) values per value column:
  - ni_eps: 1,369,132 empty values
  - at: 1,346,353 empty values

Final view written to:
/home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomali

#### Pta

In [28]:
"""
Summary:
This cell
1) reads multiple input files from OUTPUT_DIR,
2) builds a base table with all unique combinations of
   (ID, PIT Date, HistCurrency, FiscalPeriod, AnnPITValue_Period),
3) for each input file, performs an "as-of" merge:
   for every base row it takes the latest AnnPITValue from that dataset
   with the same (ID, HistCurrency, FiscalPeriod) and PIT Date <= base PIT Date.
   If the dataset has AnnPITValue_Period, the merge is also grouped by that
   period label; if it does NOT have AnnPITValue_Period, it is treated as
   period-agnostic ("can fit all" periods).
4) writes the final combined view to OUTPUT_FILE in OUTPUT_DIR.

Additional tracking added:
- For each imported dataset, the code prints how many unique
  firm-year combinations exist (unique ID × FiscalPeriod).
- After all merging is complete, the code counts how many
  firm-year combinations have **no missing values** across ANY
  of the added value columns.
- At the end, the code reports how many missing values exist for each
  value column (ca, cce, cl, etc.).

The final table has the columns:
    ID, PIT Date, HistCurrency, FiscalPeriod, AnnPITValue_Period,
    Value1, Value2, ..., ValueN
where each ValueX is defined by VALUE_COLUMN_NAMES in the config.
"""


from pathlib import Path
import time
import pandas as pd

# === CONFIG ===

# Folder where all input files are stored and where the output will be written.
# Temp_file_path_A must already be defined in your notebook, for example:
# Temp_file_path_A = f"{BASE_PATH}/Temp/TempAnomalies"
OUTPUT_DIR = Path(Temp_file_path_A)

# Field separator in all input and output text files
SEP = "|"

# Input files (1..N). They are all expected to be located in OUTPUT_DIR.
INPUT_FILES = [
    "Net_Income_Before_Extra_Items_Preferred_Divs.txt",
    "Net_Cash_Flow___Operating_Activities.txt",
    "Net_Cash_Flow___Investing.txt",
    "Net_Cash_Flow___Financing.txt",
    "Net_Proceeds_From_Sale_Issue_of_Com__Pfd.txt",
    "Com_Pfd_Redeemed_Retired_Converted_Etc..txt",
    "Cash_Dividends_Paid___Total.txt",
    # add more file names here if needed ...
]

# Names for the value columns corresponding to INPUT_FILES (same length as INPUT_FILES)
VALUE_COLUMN_NAMES = [
    "ni_extra",
    "opcf",
    "invcf",
    "fincf",
    "socaps",
    "pocaps",
    "div",

    # add more names here, one for each input file ...
]

# Name of the final output file (will be written to OUTPUT_DIR)
OUTPUT_FILE = "data_Pta.txt"

# Column names in the input files (assumed to be identical in all files)
ID_COL = "ID"
PIT_DATE_COL = "PIT Date"
HIST_CURR_COL = "HistCurrency"
FISCAL_PER_COL = "FiscalPeriod"
VALUE_COL = "AnnPITValue"
PERIOD_COL = "AnnPITValue_Period"   # NEW: period label column

BASE_COLS = [ID_COL, PIT_DATE_COL, HIST_CURR_COL, FISCAL_PER_COL]

SAVE_INTERMEDIATE = False
# ==============


# --- SANITY CHECK ---
if not OUTPUT_DIR.exists():
    raise FileNotFoundError(
        f"OUTPUT_DIR does not exist:\n{OUTPUT_DIR}\n"
        f"Please make sure Temp_file_path_A is set correctly."
    )
print("Using data folder:", OUTPUT_DIR.resolve())
# ---------------------


def load_dataset(path: Path) -> pd.DataFrame:
    """
    Load a single dataset from path and keep only the relevant columns.

    The function:
    - checks if the file exists,
    - reads it using the configured separator,
    - checks that all required columns are present,
    - keeps AnnPITValue_Period if present (and creates it as NA if not),
    - converts PIT Date to datetime,
    - casts ID to string,
    - converts AnnPITValue to numeric.
    """
    if not path.exists():
        raise FileNotFoundError(f"File not found: {path}")

    df = pd.read_csv(path, sep=SEP)

    needed_cols = BASE_COLS + [VALUE_COL]
    missing = [c for c in needed_cols if c not in df.columns]
    if missing:
        raise ValueError(f"Required columns {missing} are missing in file: {path}")

    # Keep base + AnnPITValue + (optional) AnnPITValue_Period
    extra_cols = [PERIOD_COL] if PERIOD_COL in df.columns else []
    df = df[needed_cols + extra_cols].copy()

    # Ensure AnnPITValue_Period exists in all datasets
    if PERIOD_COL not in df.columns:
        df[PERIOD_COL] = pd.NA

    # Re-order columns for consistency
    df = df[BASE_COLS + [PERIOD_COL, VALUE_COL]]

    # Ensure consistent data types
    df[PIT_DATE_COL] = pd.to_datetime(df[PIT_DATE_COL])
    df[ID_COL] = df[ID_COL].astype(str)
    df[VALUE_COL] = pd.to_numeric(df[VALUE_COL], errors="coerce")

    return df


def build_base_frame(dfs: list[pd.DataFrame]) -> pd.DataFrame:
    """
    Build the base dataset from a list of dataframes.

    Steps:
    - stack the identifier columns (including AnnPITValue_Period) from all dataframes,
    - drop duplicate combinations,
    - enforce correct data types,
    - sort the base dataset by (ID, HistCurrency, FiscalPeriod, AnnPITValue_Period, PIT Date).

    The result is the "skeleton" upon which all value columns will be merged.
    """
    # Concatenate identifier columns from all datasets (base cols + period col)
    base = pd.concat(
        [df[BASE_COLS + [PERIOD_COL]] for df in dfs],
        ignore_index=True
    )

    # Remove duplicate rows of identifiers
    base = base.drop_duplicates().reset_index(drop=True)

    # Ensure consistent types
    base[PIT_DATE_COL] = pd.to_datetime(base[PIT_DATE_COL])
    base[ID_COL] = base[ID_COL].astype(str)

    # Sort by identifier columns and date
    sort_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL, PERIOD_COL, PIT_DATE_COL]
    base = base.sort_values(sort_cols).reset_index(drop=True)

    return base


def asof_merge_one(base: pd.DataFrame, df: pd.DataFrame, new_col_name: str) -> pd.DataFrame:
    """
    Perform an "as-of" merge of one dataset into the base dataframe.

    Concept:
    - For each combination of (ID, HistCurrency, FiscalPeriod)
      and for each PIT Date in the base table,
      we want the latest AnnPITValue from df with
          df.PIT Date <= base.PIT Date
      for the same (ID, HistCurrency, FiscalPeriod).
    - If df has a non-empty AnnPITValue_Period column, the as-of grouping
      is also done by AnnPITValue_Period.
      If AnnPITValue_Period is all missing (e.g. revenue), df is treated
      as period-agnostic and can "fit all" periods.

    Implementation (vectorized, without merge_asof):
      (same as before, but grouping keys dynamically include PERIOD_COL
       only for period-aware datasets)
    """
    # Determine if this dataset is period-aware (has any non-NA period labels)
    has_period = PERIOD_COL in df.columns and df[PERIOD_COL].notna().any()

    # Work on copies to avoid modifying original base/df
    base_tmp = base.copy()
    base_tmp[new_col_name] = pd.NA
    base_tmp["__marker"] = "base"

    # Keep only identifier columns and the value column from df, then rename
    if has_period:
        df_tmp = df[BASE_COLS + [PERIOD_COL, VALUE_COL]].copy()
    else:
        df_tmp = df[BASE_COLS + [VALUE_COL]].copy()
        # Ensure PERIOD_COL exists but remains NA (period-agnostic)
        df_tmp[PERIOD_COL] = pd.NA

    df_tmp = df_tmp.rename(columns={VALUE_COL: new_col_name})
    df_tmp["__marker"] = "df"

    # Ensure consistent types for safety
    base_tmp[ID_COL] = base_tmp[ID_COL].astype(str)
    df_tmp[ID_COL] = df_tmp[ID_COL].astype(str)
    base_tmp[PIT_DATE_COL] = pd.to_datetime(base_tmp[PIT_DATE_COL])
    df_tmp[PIT_DATE_COL] = pd.to_datetime(df_tmp[PIT_DATE_COL])

    base_tmp[new_col_name] = pd.to_numeric(base_tmp[new_col_name], errors="coerce")
    df_tmp[new_col_name] = pd.to_numeric(df_tmp[new_col_name], errors="coerce")

    # Concatenate base rows and df rows
    combined = pd.concat([base_tmp, df_tmp], ignore_index=True)

    # Define an order so that df rows come before base rows on the same PIT Date
    marker_order = {"df": 0, "base": 1}
    combined["__order"] = combined["__marker"].map(marker_order).astype("int8")

    # Build sort and group keys
    group_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL]
    sort_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL]

    if has_period:
        # Period-aware dataset: also group by AnnPITValue_Period
        group_cols.append(PERIOD_COL)
        sort_cols.append(PERIOD_COL)

    sort_cols += [PIT_DATE_COL, "__order"]

    # Sort accordingly
    combined = combined.sort_values(sort_cols)

    # Forward-fill the value within each group to implement "as-of" logic
    combined[new_col_name] = combined.groupby(group_cols)[new_col_name].ffill()

    # Keep only rows that belong to the base dataset
    result = combined[combined["__marker"] == "base"].copy()

    # Drop helper columns and sort final result
    result = result.drop(columns=["__marker", "__order"])
    result = result.sort_values(
        [ID_COL, HIST_CURR_COL, FISCAL_PER_COL, PERIOD_COL, PIT_DATE_COL]
    ).reset_index(drop=True)

    return result


def build_and_save_variable(
    input_files,
    value_column_names,
    output_file,
    save_intermediate: bool = False,
) -> Path:
    """
    Build the final "view" for a variable based on multiple input files and save it.

    Steps:
    1) Validate arguments (non-empty, same length for files and column names).
    2) Load and preprocess each input file.
    3) Build the base dataset with all unique identifier combinations
       (including AnnPITValue_Period).
    4) For each input dataframe, perform an as-of merge of its AnnPITValue into the base,
       using AnnPITValue_Period in the grouping if present in that dataset.
    5) Keep only the base columns and the value columns.
    6) Write the final result to output_file in OUTPUT_DIR.
    """
    if len(input_files) == 0:
        raise ValueError("No INPUT_FILES were provided.")
    if len(input_files) != len(value_column_names):
        raise ValueError("INPUT_FILES and VALUE_COLUMN_NAMES must have the same length.")

    start_total = time.time()

    # Build full paths
    paths = [OUTPUT_DIR / f for f in input_files]

    print("\n--- Loading input files ---")
    t0 = time.time()
    dfs = [load_dataset(p) for p in paths]
    print(f"Loading and preprocessing finished in {time.time() - t0:.1f} seconds.")

    # Unique firm-year counts per input dataset
    print("\n--- Unique firm-year (ID, FiscalPeriod) counts per input file ---")
    for path, df in zip(paths, dfs):
        n_firm_years = df[[ID_COL, FISCAL_PER_COL]].drop_duplicates().shape[0]
        print(f"{path.name}: {n_firm_years:,} unique (ID, FiscalPeriod) combinations")

    print("\n--- Building base dataset ---")
    t0 = time.time()
    base = build_base_frame(dfs)
    print(
        f"Base dataset has {len(base):,} rows and was built in "
        f"{time.time() - t0:.1f} seconds."
    )

    print("\n--- Starting as-of merges ---")
    result = base
    for idx, (df, col_name) in enumerate(zip(dfs, value_column_names), start=1):
        print(f"[{idx}/{len(dfs)}] Merging value column '{col_name}' ...")
        t_merge = time.time()
        result = asof_merge_one(result, df, col_name)
        print(
            f"    Done in {time.time() - t_merge:.1f} seconds. "
            f"Result currently has {len(result):,} rows."
        )

        if save_intermediate:
            stem = output_file.rsplit(".", 1)[0]
            temp_out = OUTPUT_DIR / f"{stem}_partial_{idx}.txt"
            result.to_csv(temp_out, sep=SEP, index=False)
            print(f"    Intermediate file written to: {temp_out}")

    # Keep only the base identifier columns (including period) and the value columns
    final_cols = BASE_COLS + [PERIOD_COL] + value_column_names
    result = result[final_cols]

    # Final stats on firm-years and missing values
    print("\n--- Final dataset statistics ---")
    mask_complete = result[value_column_names].notna().all(axis=1)
    complete_firm_years = (
        result.loc[mask_complete, [ID_COL, FISCAL_PER_COL]]
        .drop_duplicates()
    )
    n_complete_firm_years = complete_firm_years.shape[0]
    print(
        f"Unique (ID, FiscalPeriod) combinations with ALL value columns non-empty: "
        f"{n_complete_firm_years:,}"
    )

    print("\nEmpty (NaN) values per value column:")
    for col in value_column_names:
        n_missing = result[col].isna().sum()
        print(f"  - {col}: {n_missing:,} empty values")

    # Final output
    out_path = OUTPUT_DIR / output_file
    result.to_csv(out_path, sep=SEP, index=False)

    print(f"\nFinal view written to:\n{out_path.resolve()}")
    print(f"Total runtime: {time.time() - start_total:.1f} seconds.")

    return out_path


# --- Execution ---
out_path = build_and_save_variable(
    input_files=INPUT_FILES,
    value_column_names=VALUE_COLUMN_NAMES,
    output_file=OUTPUT_FILE,
    save_intermediate=SAVE_INTERMEDIATE,
)


Using data folder: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies

--- Loading input files ---
Loading and preprocessing finished in 8.2 seconds.

--- Unique firm-year (ID, FiscalPeriod) counts per input file ---
Net_Income_Before_Extra_Items_Preferred_Divs.txt: 842,027 unique (ID, FiscalPeriod) combinations
Net_Cash_Flow___Operating_Activities.txt: 751,599 unique (ID, FiscalPeriod) combinations
Net_Cash_Flow___Investing.txt: 745,084 unique (ID, FiscalPeriod) combinations
Net_Cash_Flow___Financing.txt: 747,265 unique (ID, FiscalPeriod) combinations
Net_Proceeds_From_Sale_Issue_of_Com__Pfd.txt: 740,951 unique (ID, FiscalPeriod) combinations
Com_Pfd_Redeemed_Retired_Converted_Etc..txt: 665,679 unique (ID, FiscalPeriod) combinations
Cash_Dividends_Paid___Total.txt: 811,535 unique (ID, FiscalPeriod) combinations

--- Building base dataset ---
Base dataset has 1,765,455 rows and was built in 9.6 seconds.

--- Starting as-of merges ---
[1/7] Merging value column 'ni_extra' ...
    

#### Roe

In [29]:
"""
Summary:
This cell
1) reads multiple input files from OUTPUT_DIR,
2) builds a base table with all unique combinations of
   (ID, PIT Date, HistCurrency, FiscalPeriod, AnnPITValue_Period),
3) for each input file, performs an "as-of" merge:
   for every base row it takes the latest AnnPITValue from that dataset
   with the same (ID, HistCurrency, FiscalPeriod) and PIT Date <= base PIT Date.
   If the dataset has AnnPITValue_Period, the merge is also grouped by that
   period label; if it does NOT have AnnPITValue_Period, it is treated as
   period-agnostic ("can fit all" periods).
4) writes the final combined view to OUTPUT_FILE in OUTPUT_DIR.

Additional tracking added:
- For each imported dataset, the code prints how many unique
  firm-year combinations exist (unique ID × FiscalPeriod).
- After all merging is complete, the code counts how many
  firm-year combinations have **no missing values** across ANY
  of the added value columns.
- At the end, the code reports how many missing values exist for each
  value column (ca, cce, cl, etc.).

The final table has the columns:
    ID, PIT Date, HistCurrency, FiscalPeriod, AnnPITValue_Period,
    Value1, Value2, ..., ValueN
where each ValueX is defined by VALUE_COLUMN_NAMES in the config.
"""


from pathlib import Path
import time
import pandas as pd

# === CONFIG ===

# Folder where all input files are stored and where the output will be written.
# Temp_file_path_A must already be defined in your notebook, for example:
# Temp_file_path_A = f"{BASE_PATH}/Temp/TempAnomalies"
OUTPUT_DIR = Path(Temp_file_path_A)

# Field separator in all input and output text files
SEP = "|"

# Input files (1..N). They are all expected to be located in OUTPUT_DIR.
INPUT_FILES = [
    "Net_Income_Used_to_Calculate_Basic_EPS.txt",
    "Common_Equity.txt",
    "Deferred_Taxes.txt",
    # add more file names here if needed ...
]

# Names for the value columns corresponding to INPUT_FILES (same length as INPUT_FILES)
VALUE_COLUMN_NAMES = [
    "ni_eps",
    "ce",
    "dt",
    # add more names here, one for each input file ...
]

# Name of the final output file (will be written to OUTPUT_DIR)
OUTPUT_FILE = "data_Roe.txt"

# Column names in the input files (assumed to be identical in all files)
ID_COL = "ID"
PIT_DATE_COL = "PIT Date"
HIST_CURR_COL = "HistCurrency"
FISCAL_PER_COL = "FiscalPeriod"
VALUE_COL = "AnnPITValue"
PERIOD_COL = "AnnPITValue_Period"   # NEW: period label column

BASE_COLS = [ID_COL, PIT_DATE_COL, HIST_CURR_COL, FISCAL_PER_COL]

SAVE_INTERMEDIATE = False
# ==============


# --- SANITY CHECK ---
if not OUTPUT_DIR.exists():
    raise FileNotFoundError(
        f"OUTPUT_DIR does not exist:\n{OUTPUT_DIR}\n"
        f"Please make sure Temp_file_path_A is set correctly."
    )
print("Using data folder:", OUTPUT_DIR.resolve())
# ---------------------


def load_dataset(path: Path) -> pd.DataFrame:
    """
    Load a single dataset from path and keep only the relevant columns.

    The function:
    - checks if the file exists,
    - reads it using the configured separator,
    - checks that all required columns are present,
    - keeps AnnPITValue_Period if present (and creates it as NA if not),
    - converts PIT Date to datetime,
    - casts ID to string,
    - converts AnnPITValue to numeric.
    """
    if not path.exists():
        raise FileNotFoundError(f"File not found: {path}")

    df = pd.read_csv(path, sep=SEP)

    needed_cols = BASE_COLS + [VALUE_COL]
    missing = [c for c in needed_cols if c not in df.columns]
    if missing:
        raise ValueError(f"Required columns {missing} are missing in file: {path}")

    # Keep base + AnnPITValue + (optional) AnnPITValue_Period
    extra_cols = [PERIOD_COL] if PERIOD_COL in df.columns else []
    df = df[needed_cols + extra_cols].copy()

    # Ensure AnnPITValue_Period exists in all datasets
    if PERIOD_COL not in df.columns:
        df[PERIOD_COL] = pd.NA

    # Re-order columns for consistency
    df = df[BASE_COLS + [PERIOD_COL, VALUE_COL]]

    # Ensure consistent data types
    df[PIT_DATE_COL] = pd.to_datetime(df[PIT_DATE_COL])
    df[ID_COL] = df[ID_COL].astype(str)
    df[VALUE_COL] = pd.to_numeric(df[VALUE_COL], errors="coerce")

    return df


def build_base_frame(dfs: list[pd.DataFrame]) -> pd.DataFrame:
    """
    Build the base dataset from a list of dataframes.

    Steps:
    - stack the identifier columns (including AnnPITValue_Period) from all dataframes,
    - drop duplicate combinations,
    - enforce correct data types,
    - sort the base dataset by (ID, HistCurrency, FiscalPeriod, AnnPITValue_Period, PIT Date).

    The result is the "skeleton" upon which all value columns will be merged.
    """
    # Concatenate identifier columns from all datasets (base cols + period col)
    base = pd.concat(
        [df[BASE_COLS + [PERIOD_COL]] for df in dfs],
        ignore_index=True
    )

    # Remove duplicate rows of identifiers
    base = base.drop_duplicates().reset_index(drop=True)

    # Ensure consistent types
    base[PIT_DATE_COL] = pd.to_datetime(base[PIT_DATE_COL])
    base[ID_COL] = base[ID_COL].astype(str)

    # Sort by identifier columns and date
    sort_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL, PERIOD_COL, PIT_DATE_COL]
    base = base.sort_values(sort_cols).reset_index(drop=True)

    return base


def asof_merge_one(base: pd.DataFrame, df: pd.DataFrame, new_col_name: str) -> pd.DataFrame:
    """
    Perform an "as-of" merge of one dataset into the base dataframe.

    Concept:
    - For each combination of (ID, HistCurrency, FiscalPeriod)
      and for each PIT Date in the base table,
      we want the latest AnnPITValue from df with
          df.PIT Date <= base.PIT Date
      for the same (ID, HistCurrency, FiscalPeriod).
    - If df has a non-empty AnnPITValue_Period column, the as-of grouping
      is also done by AnnPITValue_Period.
      If AnnPITValue_Period is all missing (e.g. revenue), df is treated
      as period-agnostic and can "fit all" periods.

    Implementation (vectorized, without merge_asof):
      (same as before, but grouping keys dynamically include PERIOD_COL
       only for period-aware datasets)
    """
    # Determine if this dataset is period-aware (has any non-NA period labels)
    has_period = PERIOD_COL in df.columns and df[PERIOD_COL].notna().any()

    # Work on copies to avoid modifying original base/df
    base_tmp = base.copy()
    base_tmp[new_col_name] = pd.NA
    base_tmp["__marker"] = "base"

    # Keep only identifier columns and the value column from df, then rename
    if has_period:
        df_tmp = df[BASE_COLS + [PERIOD_COL, VALUE_COL]].copy()
    else:
        df_tmp = df[BASE_COLS + [VALUE_COL]].copy()
        # Ensure PERIOD_COL exists but remains NA (period-agnostic)
        df_tmp[PERIOD_COL] = pd.NA

    df_tmp = df_tmp.rename(columns={VALUE_COL: new_col_name})
    df_tmp["__marker"] = "df"

    # Ensure consistent types for safety
    base_tmp[ID_COL] = base_tmp[ID_COL].astype(str)
    df_tmp[ID_COL] = df_tmp[ID_COL].astype(str)
    base_tmp[PIT_DATE_COL] = pd.to_datetime(base_tmp[PIT_DATE_COL])
    df_tmp[PIT_DATE_COL] = pd.to_datetime(df_tmp[PIT_DATE_COL])

    base_tmp[new_col_name] = pd.to_numeric(base_tmp[new_col_name], errors="coerce")
    df_tmp[new_col_name] = pd.to_numeric(df_tmp[new_col_name], errors="coerce")

    # Concatenate base rows and df rows
    combined = pd.concat([base_tmp, df_tmp], ignore_index=True)

    # Define an order so that df rows come before base rows on the same PIT Date
    marker_order = {"df": 0, "base": 1}
    combined["__order"] = combined["__marker"].map(marker_order).astype("int8")

    # Build sort and group keys
    group_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL]
    sort_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL]

    if has_period:
        # Period-aware dataset: also group by AnnPITValue_Period
        group_cols.append(PERIOD_COL)
        sort_cols.append(PERIOD_COL)

    sort_cols += [PIT_DATE_COL, "__order"]

    # Sort accordingly
    combined = combined.sort_values(sort_cols)

    # Forward-fill the value within each group to implement "as-of" logic
    combined[new_col_name] = combined.groupby(group_cols)[new_col_name].ffill()

    # Keep only rows that belong to the base dataset
    result = combined[combined["__marker"] == "base"].copy()

    # Drop helper columns and sort final result
    result = result.drop(columns=["__marker", "__order"])
    result = result.sort_values(
        [ID_COL, HIST_CURR_COL, FISCAL_PER_COL, PERIOD_COL, PIT_DATE_COL]
    ).reset_index(drop=True)

    return result


def build_and_save_variable(
    input_files,
    value_column_names,
    output_file,
    save_intermediate: bool = False,
) -> Path:
    """
    Build the final "view" for a variable based on multiple input files and save it.

    Steps:
    1) Validate arguments (non-empty, same length for files and column names).
    2) Load and preprocess each input file.
    3) Build the base dataset with all unique identifier combinations
       (including AnnPITValue_Period).
    4) For each input dataframe, perform an as-of merge of its AnnPITValue into the base,
       using AnnPITValue_Period in the grouping if present in that dataset.
    5) Keep only the base columns and the value columns.
    6) Write the final result to output_file in OUTPUT_DIR.
    """
    if len(input_files) == 0:
        raise ValueError("No INPUT_FILES were provided.")
    if len(input_files) != len(value_column_names):
        raise ValueError("INPUT_FILES and VALUE_COLUMN_NAMES must have the same length.")

    start_total = time.time()

    # Build full paths
    paths = [OUTPUT_DIR / f for f in input_files]

    print("\n--- Loading input files ---")
    t0 = time.time()
    dfs = [load_dataset(p) for p in paths]
    print(f"Loading and preprocessing finished in {time.time() - t0:.1f} seconds.")

    # Unique firm-year counts per input dataset
    print("\n--- Unique firm-year (ID, FiscalPeriod) counts per input file ---")
    for path, df in zip(paths, dfs):
        n_firm_years = df[[ID_COL, FISCAL_PER_COL]].drop_duplicates().shape[0]
        print(f"{path.name}: {n_firm_years:,} unique (ID, FiscalPeriod) combinations")

    print("\n--- Building base dataset ---")
    t0 = time.time()
    base = build_base_frame(dfs)
    print(
        f"Base dataset has {len(base):,} rows and was built in "
        f"{time.time() - t0:.1f} seconds."
    )

    print("\n--- Starting as-of merges ---")
    result = base
    for idx, (df, col_name) in enumerate(zip(dfs, value_column_names), start=1):
        print(f"[{idx}/{len(dfs)}] Merging value column '{col_name}' ...")
        t_merge = time.time()
        result = asof_merge_one(result, df, col_name)
        print(
            f"    Done in {time.time() - t_merge:.1f} seconds. "
            f"Result currently has {len(result):,} rows."
        )

        if save_intermediate:
            stem = output_file.rsplit(".", 1)[0]
            temp_out = OUTPUT_DIR / f"{stem}_partial_{idx}.txt"
            result.to_csv(temp_out, sep=SEP, index=False)
            print(f"    Intermediate file written to: {temp_out}")

    # Keep only the base identifier columns (including period) and the value columns
    final_cols = BASE_COLS + [PERIOD_COL] + value_column_names
    result = result[final_cols]

    # Final stats on firm-years and missing values
    print("\n--- Final dataset statistics ---")
    mask_complete = result[value_column_names].notna().all(axis=1)
    complete_firm_years = (
        result.loc[mask_complete, [ID_COL, FISCAL_PER_COL]]
        .drop_duplicates()
    )
    n_complete_firm_years = complete_firm_years.shape[0]
    print(
        f"Unique (ID, FiscalPeriod) combinations with ALL value columns non-empty: "
        f"{n_complete_firm_years:,}"
    )

    print("\nEmpty (NaN) values per value column:")
    for col in value_column_names:
        n_missing = result[col].isna().sum()
        print(f"  - {col}: {n_missing:,} empty values")

    # Final output
    out_path = OUTPUT_DIR / output_file
    result.to_csv(out_path, sep=SEP, index=False)

    print(f"\nFinal view written to:\n{out_path.resolve()}")
    print(f"Total runtime: {time.time() - start_total:.1f} seconds.")

    return out_path


# --- Execution ---
out_path = build_and_save_variable(
    input_files=INPUT_FILES,
    value_column_names=VALUE_COLUMN_NAMES,
    output_file=OUTPUT_FILE,
    save_intermediate=SAVE_INTERMEDIATE,
)


Using data folder: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies

--- Loading input files ---
Loading and preprocessing finished in 5.7 seconds.

--- Unique firm-year (ID, FiscalPeriod) counts per input file ---
Net_Income_Used_to_Calculate_Basic_EPS.txt: 842,036 unique (ID, FiscalPeriod) combinations
Common_Equity.txt: 883,328 unique (ID, FiscalPeriod) combinations
Deferred_Taxes.txt: 750,382 unique (ID, FiscalPeriod) combinations

--- Building base dataset ---
Base dataset has 3,785,314 rows and was built in 5.3 seconds.

--- Starting as-of merges ---
[1/3] Merging value column 'ni_eps' ...
    Done in 7.8 seconds. Result currently has 3,785,314 rows.
[2/3] Merging value column 'ce' ...
    Done in 8.1 seconds. Result currently has 3,785,314 rows.
[3/3] Merging value column 'dt' ...
    Done in 7.3 seconds. Result currently has 3,785,314 rows.

--- Final dataset statistics ---
Unique (ID, FiscalPeriod) combinations with ALL value columns non-empty: 680,276

Empty (NaN) val

#### Rs (Yearly Adaption)

In [30]:
"""
Summary:
This cell
1) reads multiple input files from OUTPUT_DIR,
2) builds a base table with all unique combinations of
   (ID, PIT Date, HistCurrency, FiscalPeriod, AnnPITValue_Period),
3) for each input file, performs an "as-of" merge:
   for every base row it takes the latest AnnPITValue from that dataset
   with the same (ID, HistCurrency, FiscalPeriod) and PIT Date <= base PIT Date.
   If the dataset has AnnPITValue_Period, the merge is also grouped by that
   period label; if it does NOT have AnnPITValue_Period, it is treated as
   period-agnostic ("can fit all" periods).
4) writes the final combined view to OUTPUT_FILE in OUTPUT_DIR.

Additional tracking added:
- For each imported dataset, the code prints how many unique
  firm-year combinations exist (unique ID × FiscalPeriod).
- After all merging is complete, the code counts how many
  firm-year combinations have **no missing values** across ANY
  of the added value columns.
- At the end, the code reports how many missing values exist for each
  value column (ca, cce, cl, etc.).

The final table has the columns:
    ID, PIT Date, HistCurrency, FiscalPeriod, AnnPITValue_Period,
    Value1, Value2, ..., ValueN
where each ValueX is defined by VALUE_COLUMN_NAMES in the config.
"""


from pathlib import Path
import time
import pandas as pd

# === CONFIG ===

# Folder where all input files are stored and where the output will be written.
# Temp_file_path_A must already be defined in your notebook, for example:
# Temp_file_path_A = f"{BASE_PATH}/Temp/TempAnomalies"
OUTPUT_DIR = Path(Temp_file_path_A)

# Field separator in all input and output text files
SEP = "|"

# Input files (1..N). They are all expected to be located in OUTPUT_DIR.
INPUT_FILES = [
    "Net_Sales_or_Revenues.txt",
    # add more file names here if needed ...
]

# Names for the value columns corresponding to INPUT_FILES (same length as INPUT_FILES)
VALUE_COLUMN_NAMES = [
    "rev",
    # add more names here, one for each input file ...
]

# Name of the final output file (will be written to OUTPUT_DIR)
OUTPUT_FILE = "data_Rs.txt"

# Column names in the input files (assumed to be identical in all files)
ID_COL = "ID"
PIT_DATE_COL = "PIT Date"
HIST_CURR_COL = "HistCurrency"
FISCAL_PER_COL = "FiscalPeriod"
VALUE_COL = "AnnPITValue"
PERIOD_COL = "AnnPITValue_Period"   # NEW: period label column

BASE_COLS = [ID_COL, PIT_DATE_COL, HIST_CURR_COL, FISCAL_PER_COL]

SAVE_INTERMEDIATE = False
# ==============


# --- SANITY CHECK ---
if not OUTPUT_DIR.exists():
    raise FileNotFoundError(
        f"OUTPUT_DIR does not exist:\n{OUTPUT_DIR}\n"
        f"Please make sure Temp_file_path_A is set correctly."
    )
print("Using data folder:", OUTPUT_DIR.resolve())
# ---------------------


def load_dataset(path: Path) -> pd.DataFrame:
    """
    Load a single dataset from path and keep only the relevant columns.

    The function:
    - checks if the file exists,
    - reads it using the configured separator,
    - checks that all required columns are present,
    - keeps AnnPITValue_Period if present (and creates it as NA if not),
    - converts PIT Date to datetime,
    - casts ID to string,
    - converts AnnPITValue to numeric.
    """
    if not path.exists():
        raise FileNotFoundError(f"File not found: {path}")

    df = pd.read_csv(path, sep=SEP)

    needed_cols = BASE_COLS + [VALUE_COL]
    missing = [c for c in needed_cols if c not in df.columns]
    if missing:
        raise ValueError(f"Required columns {missing} are missing in file: {path}")

    # Keep base + AnnPITValue + (optional) AnnPITValue_Period
    extra_cols = [PERIOD_COL] if PERIOD_COL in df.columns else []
    df = df[needed_cols + extra_cols].copy()

    # Ensure AnnPITValue_Period exists in all datasets
    if PERIOD_COL not in df.columns:
        df[PERIOD_COL] = pd.NA

    # Re-order columns for consistency
    df = df[BASE_COLS + [PERIOD_COL, VALUE_COL]]

    # Ensure consistent data types
    df[PIT_DATE_COL] = pd.to_datetime(df[PIT_DATE_COL])
    df[ID_COL] = df[ID_COL].astype(str)
    df[VALUE_COL] = pd.to_numeric(df[VALUE_COL], errors="coerce")

    return df


def build_base_frame(dfs: list[pd.DataFrame]) -> pd.DataFrame:
    """
    Build the base dataset from a list of dataframes.

    Steps:
    - stack the identifier columns (including AnnPITValue_Period) from all dataframes,
    - drop duplicate combinations,
    - enforce correct data types,
    - sort the base dataset by (ID, HistCurrency, FiscalPeriod, AnnPITValue_Period, PIT Date).

    The result is the "skeleton" upon which all value columns will be merged.
    """
    # Concatenate identifier columns from all datasets (base cols + period col)
    base = pd.concat(
        [df[BASE_COLS + [PERIOD_COL]] for df in dfs],
        ignore_index=True
    )

    # Remove duplicate rows of identifiers
    base = base.drop_duplicates().reset_index(drop=True)

    # Ensure consistent types
    base[PIT_DATE_COL] = pd.to_datetime(base[PIT_DATE_COL])
    base[ID_COL] = base[ID_COL].astype(str)

    # Sort by identifier columns and date
    sort_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL, PERIOD_COL, PIT_DATE_COL]
    base = base.sort_values(sort_cols).reset_index(drop=True)

    return base


def asof_merge_one(base: pd.DataFrame, df: pd.DataFrame, new_col_name: str) -> pd.DataFrame:
    """
    Perform an "as-of" merge of one dataset into the base dataframe.

    Concept:
    - For each combination of (ID, HistCurrency, FiscalPeriod)
      and for each PIT Date in the base table,
      we want the latest AnnPITValue from df with
          df.PIT Date <= base.PIT Date
      for the same (ID, HistCurrency, FiscalPeriod).
    - If df has a non-empty AnnPITValue_Period column, the as-of grouping
      is also done by AnnPITValue_Period.
      If AnnPITValue_Period is all missing (e.g. revenue), df is treated
      as period-agnostic and can "fit all" periods.

    Implementation (vectorized, without merge_asof):
      (same as before, but grouping keys dynamically include PERIOD_COL
       only for period-aware datasets)
    """
    # Determine if this dataset is period-aware (has any non-NA period labels)
    has_period = PERIOD_COL in df.columns and df[PERIOD_COL].notna().any()

    # Work on copies to avoid modifying original base/df
    base_tmp = base.copy()
    base_tmp[new_col_name] = pd.NA
    base_tmp["__marker"] = "base"

    # Keep only identifier columns and the value column from df, then rename
    if has_period:
        df_tmp = df[BASE_COLS + [PERIOD_COL, VALUE_COL]].copy()
    else:
        df_tmp = df[BASE_COLS + [VALUE_COL]].copy()
        # Ensure PERIOD_COL exists but remains NA (period-agnostic)
        df_tmp[PERIOD_COL] = pd.NA

    df_tmp = df_tmp.rename(columns={VALUE_COL: new_col_name})
    df_tmp["__marker"] = "df"

    # Ensure consistent types for safety
    base_tmp[ID_COL] = base_tmp[ID_COL].astype(str)
    df_tmp[ID_COL] = df_tmp[ID_COL].astype(str)
    base_tmp[PIT_DATE_COL] = pd.to_datetime(base_tmp[PIT_DATE_COL])
    df_tmp[PIT_DATE_COL] = pd.to_datetime(df_tmp[PIT_DATE_COL])

    base_tmp[new_col_name] = pd.to_numeric(base_tmp[new_col_name], errors="coerce")
    df_tmp[new_col_name] = pd.to_numeric(df_tmp[new_col_name], errors="coerce")

    # Concatenate base rows and df rows
    combined = pd.concat([base_tmp, df_tmp], ignore_index=True)

    # Define an order so that df rows come before base rows on the same PIT Date
    marker_order = {"df": 0, "base": 1}
    combined["__order"] = combined["__marker"].map(marker_order).astype("int8")

    # Build sort and group keys
    group_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL]
    sort_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL]

    if has_period:
        # Period-aware dataset: also group by AnnPITValue_Period
        group_cols.append(PERIOD_COL)
        sort_cols.append(PERIOD_COL)

    sort_cols += [PIT_DATE_COL, "__order"]

    # Sort accordingly
    combined = combined.sort_values(sort_cols)

    # Forward-fill the value within each group to implement "as-of" logic
    combined[new_col_name] = combined.groupby(group_cols)[new_col_name].ffill()

    # Keep only rows that belong to the base dataset
    result = combined[combined["__marker"] == "base"].copy()

    # Drop helper columns and sort final result
    result = result.drop(columns=["__marker", "__order"])
    result = result.sort_values(
        [ID_COL, HIST_CURR_COL, FISCAL_PER_COL, PERIOD_COL, PIT_DATE_COL]
    ).reset_index(drop=True)

    return result


def build_and_save_variable(
    input_files,
    value_column_names,
    output_file,
    save_intermediate: bool = False,
) -> Path:
    """
    Build the final "view" for a variable based on multiple input files and save it.

    Steps:
    1) Validate arguments (non-empty, same length for files and column names).
    2) Load and preprocess each input file.
    3) Build the base dataset with all unique identifier combinations
       (including AnnPITValue_Period).
    4) For each input dataframe, perform an as-of merge of its AnnPITValue into the base,
       using AnnPITValue_Period in the grouping if present in that dataset.
    5) Keep only the base columns and the value columns.
    6) Write the final result to output_file in OUTPUT_DIR.
    """
    if len(input_files) == 0:
        raise ValueError("No INPUT_FILES were provided.")
    if len(input_files) != len(value_column_names):
        raise ValueError("INPUT_FILES and VALUE_COLUMN_NAMES must have the same length.")

    start_total = time.time()

    # Build full paths
    paths = [OUTPUT_DIR / f for f in input_files]

    print("\n--- Loading input files ---")
    t0 = time.time()
    dfs = [load_dataset(p) for p in paths]
    print(f"Loading and preprocessing finished in {time.time() - t0:.1f} seconds.")

    # Unique firm-year counts per input dataset
    print("\n--- Unique firm-year (ID, FiscalPeriod) counts per input file ---")
    for path, df in zip(paths, dfs):
        n_firm_years = df[[ID_COL, FISCAL_PER_COL]].drop_duplicates().shape[0]
        print(f"{path.name}: {n_firm_years:,} unique (ID, FiscalPeriod) combinations")

    print("\n--- Building base dataset ---")
    t0 = time.time()
    base = build_base_frame(dfs)
    print(
        f"Base dataset has {len(base):,} rows and was built in "
        f"{time.time() - t0:.1f} seconds."
    )

    print("\n--- Starting as-of merges ---")
    result = base
    for idx, (df, col_name) in enumerate(zip(dfs, value_column_names), start=1):
        print(f"[{idx}/{len(dfs)}] Merging value column '{col_name}' ...")
        t_merge = time.time()
        result = asof_merge_one(result, df, col_name)
        print(
            f"    Done in {time.time() - t_merge:.1f} seconds. "
            f"Result currently has {len(result):,} rows."
        )

        if save_intermediate:
            stem = output_file.rsplit(".", 1)[0]
            temp_out = OUTPUT_DIR / f"{stem}_partial_{idx}.txt"
            result.to_csv(temp_out, sep=SEP, index=False)
            print(f"    Intermediate file written to: {temp_out}")

    # Keep only the base identifier columns (including period) and the value columns
    final_cols = BASE_COLS + [PERIOD_COL] + value_column_names
    result = result[final_cols]

    # Final stats on firm-years and missing values
    print("\n--- Final dataset statistics ---")
    mask_complete = result[value_column_names].notna().all(axis=1)
    complete_firm_years = (
        result.loc[mask_complete, [ID_COL, FISCAL_PER_COL]]
        .drop_duplicates()
    )
    n_complete_firm_years = complete_firm_years.shape[0]
    print(
        f"Unique (ID, FiscalPeriod) combinations with ALL value columns non-empty: "
        f"{n_complete_firm_years:,}"
    )

    print("\nEmpty (NaN) values per value column:")
    for col in value_column_names:
        n_missing = result[col].isna().sum()
        print(f"  - {col}: {n_missing:,} empty values")

    # Final output
    out_path = OUTPUT_DIR / output_file
    result.to_csv(out_path, sep=SEP, index=False)

    print(f"\nFinal view written to:\n{out_path.resolve()}")
    print(f"Total runtime: {time.time() - start_total:.1f} seconds.")

    return out_path


# --- Execution ---
out_path = build_and_save_variable(
    input_files=INPUT_FILES,
    value_column_names=VALUE_COLUMN_NAMES,
    output_file=OUTPUT_FILE,
    save_intermediate=SAVE_INTERMEDIATE,
)


Using data folder: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies

--- Loading input files ---
Loading and preprocessing finished in 1.9 seconds.

--- Unique firm-year (ID, FiscalPeriod) counts per input file ---
Net_Sales_or_Revenues.txt: 817,863 unique (ID, FiscalPeriod) combinations

--- Building base dataset ---
Base dataset has 1,351,845 rows and was built in 1.5 seconds.

--- Starting as-of merges ---
[1/1] Merging value column 'rev' ...
    Done in 5.6 seconds. Result currently has 1,351,845 rows.

--- Final dataset statistics ---
Unique (ID, FiscalPeriod) combinations with ALL value columns non-empty: 817,863

Empty (NaN) values per value column:
  - rev: 0 empty values

Final view written to:
/home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/data_Rs.txt
Total runtime: 15.0 seconds.


#### Sg

In [31]:
"""
Summary:
This cell
1) reads multiple input files from OUTPUT_DIR,
2) builds a base table with all unique combinations of
   (ID, PIT Date, HistCurrency, FiscalPeriod, AnnPITValue_Period),
3) for each input file, performs an "as-of" merge:
   for every base row it takes the latest AnnPITValue from that dataset
   with the same (ID, HistCurrency, FiscalPeriod) and PIT Date <= base PIT Date.
   If the dataset has AnnPITValue_Period, the merge is also grouped by that
   period label; if it does NOT have AnnPITValue_Period, it is treated as
   period-agnostic ("can fit all" periods).
4) writes the final combined view to OUTPUT_FILE in OUTPUT_DIR.

Additional tracking added:
- For each imported dataset, the code prints how many unique
  firm-year combinations exist (unique ID × FiscalPeriod).
- After all merging is complete, the code counts how many
  firm-year combinations have **no missing values** across ANY
  of the added value columns.
- At the end, the code reports how many missing values exist for each
  value column (ca, cce, cl, etc.).

The final table has the columns:
    ID, PIT Date, HistCurrency, FiscalPeriod, AnnPITValue_Period,
    Value1, Value2, ..., ValueN
where each ValueX is defined by VALUE_COLUMN_NAMES in the config.
"""


from pathlib import Path
import time
import pandas as pd

# === CONFIG ===

# Folder where all input files are stored and where the output will be written.
# Temp_file_path_A must already be defined in your notebook, for example:
# Temp_file_path_A = f"{BASE_PATH}/Temp/TempAnomalies"
OUTPUT_DIR = Path(Temp_file_path_A)

# Field separator in all input and output text files
SEP = "|"

# Input files (1..N). They are all expected to be located in OUTPUT_DIR.
INPUT_FILES = [
    "Common_Equity.txt",
    "Deferred_Taxes.txt",
    # add more file names here if needed ...
]

# Names for the value columns corresponding to INPUT_FILES (same length as INPUT_FILES)
VALUE_COLUMN_NAMES = [
    "ce",
    "dt",
    # add more names here, one for each input file ...
]

# Name of the final output file (will be written to OUTPUT_DIR)
OUTPUT_FILE = "data_Sg.txt"

# Column names in the input files (assumed to be identical in all files)
ID_COL = "ID"
PIT_DATE_COL = "PIT Date"
HIST_CURR_COL = "HistCurrency"
FISCAL_PER_COL = "FiscalPeriod"
VALUE_COL = "AnnPITValue"
PERIOD_COL = "AnnPITValue_Period"   # NEW: period label column

BASE_COLS = [ID_COL, PIT_DATE_COL, HIST_CURR_COL, FISCAL_PER_COL]

SAVE_INTERMEDIATE = False
# ==============


# --- SANITY CHECK ---
if not OUTPUT_DIR.exists():
    raise FileNotFoundError(
        f"OUTPUT_DIR does not exist:\n{OUTPUT_DIR}\n"
        f"Please make sure Temp_file_path_A is set correctly."
    )
print("Using data folder:", OUTPUT_DIR.resolve())
# ---------------------


def load_dataset(path: Path) -> pd.DataFrame:
    """
    Load a single dataset from path and keep only the relevant columns.

    The function:
    - checks if the file exists,
    - reads it using the configured separator,
    - checks that all required columns are present,
    - keeps AnnPITValue_Period if present (and creates it as NA if not),
    - converts PIT Date to datetime,
    - casts ID to string,
    - converts AnnPITValue to numeric.
    """
    if not path.exists():
        raise FileNotFoundError(f"File not found: {path}")

    df = pd.read_csv(path, sep=SEP)

    needed_cols = BASE_COLS + [VALUE_COL]
    missing = [c for c in needed_cols if c not in df.columns]
    if missing:
        raise ValueError(f"Required columns {missing} are missing in file: {path}")

    # Keep base + AnnPITValue + (optional) AnnPITValue_Period
    extra_cols = [PERIOD_COL] if PERIOD_COL in df.columns else []
    df = df[needed_cols + extra_cols].copy()

    # Ensure AnnPITValue_Period exists in all datasets
    if PERIOD_COL not in df.columns:
        df[PERIOD_COL] = pd.NA

    # Re-order columns for consistency
    df = df[BASE_COLS + [PERIOD_COL, VALUE_COL]]

    # Ensure consistent data types
    df[PIT_DATE_COL] = pd.to_datetime(df[PIT_DATE_COL])
    df[ID_COL] = df[ID_COL].astype(str)
    df[VALUE_COL] = pd.to_numeric(df[VALUE_COL], errors="coerce")

    return df


def build_base_frame(dfs: list[pd.DataFrame]) -> pd.DataFrame:
    """
    Build the base dataset from a list of dataframes.

    Steps:
    - stack the identifier columns (including AnnPITValue_Period) from all dataframes,
    - drop duplicate combinations,
    - enforce correct data types,
    - sort the base dataset by (ID, HistCurrency, FiscalPeriod, AnnPITValue_Period, PIT Date).

    The result is the "skeleton" upon which all value columns will be merged.
    """
    # Concatenate identifier columns from all datasets (base cols + period col)
    base = pd.concat(
        [df[BASE_COLS + [PERIOD_COL]] for df in dfs],
        ignore_index=True
    )

    # Remove duplicate rows of identifiers
    base = base.drop_duplicates().reset_index(drop=True)

    # Ensure consistent types
    base[PIT_DATE_COL] = pd.to_datetime(base[PIT_DATE_COL])
    base[ID_COL] = base[ID_COL].astype(str)

    # Sort by identifier columns and date
    sort_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL, PERIOD_COL, PIT_DATE_COL]
    base = base.sort_values(sort_cols).reset_index(drop=True)

    return base


def asof_merge_one(base: pd.DataFrame, df: pd.DataFrame, new_col_name: str) -> pd.DataFrame:
    """
    Perform an "as-of" merge of one dataset into the base dataframe.

    Concept:
    - For each combination of (ID, HistCurrency, FiscalPeriod)
      and for each PIT Date in the base table,
      we want the latest AnnPITValue from df with
          df.PIT Date <= base.PIT Date
      for the same (ID, HistCurrency, FiscalPeriod).
    - If df has a non-empty AnnPITValue_Period column, the as-of grouping
      is also done by AnnPITValue_Period.
      If AnnPITValue_Period is all missing (e.g. revenue), df is treated
      as period-agnostic and can "fit all" periods.

    Implementation (vectorized, without merge_asof):
      (same as before, but grouping keys dynamically include PERIOD_COL
       only for period-aware datasets)
    """
    # Determine if this dataset is period-aware (has any non-NA period labels)
    has_period = PERIOD_COL in df.columns and df[PERIOD_COL].notna().any()

    # Work on copies to avoid modifying original base/df
    base_tmp = base.copy()
    base_tmp[new_col_name] = pd.NA
    base_tmp["__marker"] = "base"

    # Keep only identifier columns and the value column from df, then rename
    if has_period:
        df_tmp = df[BASE_COLS + [PERIOD_COL, VALUE_COL]].copy()
    else:
        df_tmp = df[BASE_COLS + [VALUE_COL]].copy()
        # Ensure PERIOD_COL exists but remains NA (period-agnostic)
        df_tmp[PERIOD_COL] = pd.NA

    df_tmp = df_tmp.rename(columns={VALUE_COL: new_col_name})
    df_tmp["__marker"] = "df"

    # Ensure consistent types for safety
    base_tmp[ID_COL] = base_tmp[ID_COL].astype(str)
    df_tmp[ID_COL] = df_tmp[ID_COL].astype(str)
    base_tmp[PIT_DATE_COL] = pd.to_datetime(base_tmp[PIT_DATE_COL])
    df_tmp[PIT_DATE_COL] = pd.to_datetime(df_tmp[PIT_DATE_COL])

    base_tmp[new_col_name] = pd.to_numeric(base_tmp[new_col_name], errors="coerce")
    df_tmp[new_col_name] = pd.to_numeric(df_tmp[new_col_name], errors="coerce")

    # Concatenate base rows and df rows
    combined = pd.concat([base_tmp, df_tmp], ignore_index=True)

    # Define an order so that df rows come before base rows on the same PIT Date
    marker_order = {"df": 0, "base": 1}
    combined["__order"] = combined["__marker"].map(marker_order).astype("int8")

    # Build sort and group keys
    group_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL]
    sort_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL]

    if has_period:
        # Period-aware dataset: also group by AnnPITValue_Period
        group_cols.append(PERIOD_COL)
        sort_cols.append(PERIOD_COL)

    sort_cols += [PIT_DATE_COL, "__order"]

    # Sort accordingly
    combined = combined.sort_values(sort_cols)

    # Forward-fill the value within each group to implement "as-of" logic
    combined[new_col_name] = combined.groupby(group_cols)[new_col_name].ffill()

    # Keep only rows that belong to the base dataset
    result = combined[combined["__marker"] == "base"].copy()

    # Drop helper columns and sort final result
    result = result.drop(columns=["__marker", "__order"])
    result = result.sort_values(
        [ID_COL, HIST_CURR_COL, FISCAL_PER_COL, PERIOD_COL, PIT_DATE_COL]
    ).reset_index(drop=True)

    return result


def build_and_save_variable(
    input_files,
    value_column_names,
    output_file,
    save_intermediate: bool = False,
) -> Path:
    """
    Build the final "view" for a variable based on multiple input files and save it.

    Steps:
    1) Validate arguments (non-empty, same length for files and column names).
    2) Load and preprocess each input file.
    3) Build the base dataset with all unique identifier combinations
       (including AnnPITValue_Period).
    4) For each input dataframe, perform an as-of merge of its AnnPITValue into the base,
       using AnnPITValue_Period in the grouping if present in that dataset.
    5) Keep only the base columns and the value columns.
    6) Write the final result to output_file in OUTPUT_DIR.
    """
    if len(input_files) == 0:
        raise ValueError("No INPUT_FILES were provided.")
    if len(input_files) != len(value_column_names):
        raise ValueError("INPUT_FILES and VALUE_COLUMN_NAMES must have the same length.")

    start_total = time.time()

    # Build full paths
    paths = [OUTPUT_DIR / f for f in input_files]

    print("\n--- Loading input files ---")
    t0 = time.time()
    dfs = [load_dataset(p) for p in paths]
    print(f"Loading and preprocessing finished in {time.time() - t0:.1f} seconds.")

    # Unique firm-year counts per input dataset
    print("\n--- Unique firm-year (ID, FiscalPeriod) counts per input file ---")
    for path, df in zip(paths, dfs):
        n_firm_years = df[[ID_COL, FISCAL_PER_COL]].drop_duplicates().shape[0]
        print(f"{path.name}: {n_firm_years:,} unique (ID, FiscalPeriod) combinations")

    print("\n--- Building base dataset ---")
    t0 = time.time()
    base = build_base_frame(dfs)
    print(
        f"Base dataset has {len(base):,} rows and was built in "
        f"{time.time() - t0:.1f} seconds."
    )

    print("\n--- Starting as-of merges ---")
    result = base
    for idx, (df, col_name) in enumerate(zip(dfs, value_column_names), start=1):
        print(f"[{idx}/{len(dfs)}] Merging value column '{col_name}' ...")
        t_merge = time.time()
        result = asof_merge_one(result, df, col_name)
        print(
            f"    Done in {time.time() - t_merge:.1f} seconds. "
            f"Result currently has {len(result):,} rows."
        )

        if save_intermediate:
            stem = output_file.rsplit(".", 1)[0]
            temp_out = OUTPUT_DIR / f"{stem}_partial_{idx}.txt"
            result.to_csv(temp_out, sep=SEP, index=False)
            print(f"    Intermediate file written to: {temp_out}")

    # Keep only the base identifier columns (including period) and the value columns
    final_cols = BASE_COLS + [PERIOD_COL] + value_column_names
    result = result[final_cols]

    # Final stats on firm-years and missing values
    print("\n--- Final dataset statistics ---")
    mask_complete = result[value_column_names].notna().all(axis=1)
    complete_firm_years = (
        result.loc[mask_complete, [ID_COL, FISCAL_PER_COL]]
        .drop_duplicates()
    )
    n_complete_firm_years = complete_firm_years.shape[0]
    print(
        f"Unique (ID, FiscalPeriod) combinations with ALL value columns non-empty: "
        f"{n_complete_firm_years:,}"
    )

    print("\nEmpty (NaN) values per value column:")
    for col in value_column_names:
        n_missing = result[col].isna().sum()
        print(f"  - {col}: {n_missing:,} empty values")

    # Final output
    out_path = OUTPUT_DIR / output_file
    result.to_csv(out_path, sep=SEP, index=False)

    print(f"\nFinal view written to:\n{out_path.resolve()}")
    print(f"Total runtime: {time.time() - start_total:.1f} seconds.")

    return out_path


# --- Execution ---
out_path = build_and_save_variable(
    input_files=INPUT_FILES,
    value_column_names=VALUE_COLUMN_NAMES,
    output_file=OUTPUT_FILE,
    save_intermediate=SAVE_INTERMEDIATE,
)


Using data folder: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies

--- Loading input files ---
Loading and preprocessing finished in 4.1 seconds.

--- Unique firm-year (ID, FiscalPeriod) counts per input file ---
Common_Equity.txt: 883,328 unique (ID, FiscalPeriod) combinations
Deferred_Taxes.txt: 750,382 unique (ID, FiscalPeriod) combinations

--- Building base dataset ---
Base dataset has 2,438,961 rows and was built in 2.4 seconds.

--- Starting as-of merges ---
[1/2] Merging value column 'ce' ...
    Done in 5.6 seconds. Result currently has 2,438,961 rows.
[2/2] Merging value column 'dt' ...
    Done in 4.8 seconds. Result currently has 2,438,961 rows.

--- Final dataset statistics ---
Unique (ID, FiscalPeriod) combinations with ALL value columns non-empty: 746,347

Empty (NaN) values per value column:
  - ce: 52,483 empty values
  - dt: 756,770 empty values

Final view written to:
/home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/data_Sg.txt
Total runtime: 29.9 s

#### Sli

In [32]:
"""
Summary:
This cell
1) reads multiple input files from OUTPUT_DIR,
2) builds a base table with all unique combinations of
   (ID, PIT Date, HistCurrency, FiscalPeriod, AnnPITValue_Period),
3) for each input file, performs an "as-of" merge:
   for every base row it takes the latest AnnPITValue from that dataset
   with the same (ID, HistCurrency, FiscalPeriod) and PIT Date <= base PIT Date.
   If the dataset has AnnPITValue_Period, the merge is also grouped by that
   period label; if it does NOT have AnnPITValue_Period, it is treated as
   period-agnostic ("can fit all" periods).
4) writes the final combined view to OUTPUT_FILE in OUTPUT_DIR.

Additional tracking added:
- For each imported dataset, the code prints how many unique
  firm-year combinations exist (unique ID × FiscalPeriod).
- After all merging is complete, the code counts how many
  firm-year combinations have **no missing values** across ANY
  of the added value columns.
- At the end, the code reports how many missing values exist for each
  value column (ca, cce, cl, etc.).

The final table has the columns:
    ID, PIT Date, HistCurrency, FiscalPeriod, AnnPITValue_Period,
    Value1, Value2, ..., ValueN
where each ValueX is defined by VALUE_COLUMN_NAMES in the config.
"""


from pathlib import Path
import time
import pandas as pd

# === CONFIG ===

# Folder where all input files are stored and where the output will be written.
# Temp_file_path_A must already be defined in your notebook, for example:
# Temp_file_path_A = f"{BASE_PATH}/Temp/TempAnomalies"
OUTPUT_DIR = Path(Temp_file_path_A)

# Field separator in all input and output text files
SEP = "|"

# Input files (1..N). They are all expected to be located in OUTPUT_DIR.
INPUT_FILES = [
    "Net_Sales_or_Revenues.txt",
    "Inventories___Total.txt",
    # add more file names here if needed ...
]

# Names for the value columns corresponding to INPUT_FILES (same length as INPUT_FILES)
VALUE_COLUMN_NAMES = [
    "rev",
    "inv",
    # add more names here, one for each input file ...
]

# Name of the final output file (will be written to OUTPUT_DIR)
OUTPUT_FILE = "data_Sli.txt"

# Column names in the input files (assumed to be identical in all files)
ID_COL = "ID"
PIT_DATE_COL = "PIT Date"
HIST_CURR_COL = "HistCurrency"
FISCAL_PER_COL = "FiscalPeriod"
VALUE_COL = "AnnPITValue"
PERIOD_COL = "AnnPITValue_Period"   # NEW: period label column

BASE_COLS = [ID_COL, PIT_DATE_COL, HIST_CURR_COL, FISCAL_PER_COL]

SAVE_INTERMEDIATE = False
# ==============


# --- SANITY CHECK ---
if not OUTPUT_DIR.exists():
    raise FileNotFoundError(
        f"OUTPUT_DIR does not exist:\n{OUTPUT_DIR}\n"
        f"Please make sure Temp_file_path_A is set correctly."
    )
print("Using data folder:", OUTPUT_DIR.resolve())
# ---------------------


def load_dataset(path: Path) -> pd.DataFrame:
    """
    Load a single dataset from path and keep only the relevant columns.

    The function:
    - checks if the file exists,
    - reads it using the configured separator,
    - checks that all required columns are present,
    - keeps AnnPITValue_Period if present (and creates it as NA if not),
    - converts PIT Date to datetime,
    - casts ID to string,
    - converts AnnPITValue to numeric.
    """
    if not path.exists():
        raise FileNotFoundError(f"File not found: {path}")

    df = pd.read_csv(path, sep=SEP)

    needed_cols = BASE_COLS + [VALUE_COL]
    missing = [c for c in needed_cols if c not in df.columns]
    if missing:
        raise ValueError(f"Required columns {missing} are missing in file: {path}")

    # Keep base + AnnPITValue + (optional) AnnPITValue_Period
    extra_cols = [PERIOD_COL] if PERIOD_COL in df.columns else []
    df = df[needed_cols + extra_cols].copy()

    # Ensure AnnPITValue_Period exists in all datasets
    if PERIOD_COL not in df.columns:
        df[PERIOD_COL] = pd.NA

    # Re-order columns for consistency
    df = df[BASE_COLS + [PERIOD_COL, VALUE_COL]]

    # Ensure consistent data types
    df[PIT_DATE_COL] = pd.to_datetime(df[PIT_DATE_COL])
    df[ID_COL] = df[ID_COL].astype(str)
    df[VALUE_COL] = pd.to_numeric(df[VALUE_COL], errors="coerce")

    return df


def build_base_frame(dfs: list[pd.DataFrame]) -> pd.DataFrame:
    """
    Build the base dataset from a list of dataframes.

    Steps:
    - stack the identifier columns (including AnnPITValue_Period) from all dataframes,
    - drop duplicate combinations,
    - enforce correct data types,
    - sort the base dataset by (ID, HistCurrency, FiscalPeriod, AnnPITValue_Period, PIT Date).

    The result is the "skeleton" upon which all value columns will be merged.
    """
    # Concatenate identifier columns from all datasets (base cols + period col)
    base = pd.concat(
        [df[BASE_COLS + [PERIOD_COL]] for df in dfs],
        ignore_index=True
    )

    # Remove duplicate rows of identifiers
    base = base.drop_duplicates().reset_index(drop=True)

    # Ensure consistent types
    base[PIT_DATE_COL] = pd.to_datetime(base[PIT_DATE_COL])
    base[ID_COL] = base[ID_COL].astype(str)

    # Sort by identifier columns and date
    sort_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL, PERIOD_COL, PIT_DATE_COL]
    base = base.sort_values(sort_cols).reset_index(drop=True)

    return base


def asof_merge_one(base: pd.DataFrame, df: pd.DataFrame, new_col_name: str) -> pd.DataFrame:
    """
    Perform an "as-of" merge of one dataset into the base dataframe.

    Concept:
    - For each combination of (ID, HistCurrency, FiscalPeriod)
      and for each PIT Date in the base table,
      we want the latest AnnPITValue from df with
          df.PIT Date <= base.PIT Date
      for the same (ID, HistCurrency, FiscalPeriod).
    - If df has a non-empty AnnPITValue_Period column, the as-of grouping
      is also done by AnnPITValue_Period.
      If AnnPITValue_Period is all missing (e.g. revenue), df is treated
      as period-agnostic and can "fit all" periods.

    Implementation (vectorized, without merge_asof):
      (same as before, but grouping keys dynamically include PERIOD_COL
       only for period-aware datasets)
    """
    # Determine if this dataset is period-aware (has any non-NA period labels)
    has_period = PERIOD_COL in df.columns and df[PERIOD_COL].notna().any()

    # Work on copies to avoid modifying original base/df
    base_tmp = base.copy()
    base_tmp[new_col_name] = pd.NA
    base_tmp["__marker"] = "base"

    # Keep only identifier columns and the value column from df, then rename
    if has_period:
        df_tmp = df[BASE_COLS + [PERIOD_COL, VALUE_COL]].copy()
    else:
        df_tmp = df[BASE_COLS + [VALUE_COL]].copy()
        # Ensure PERIOD_COL exists but remains NA (period-agnostic)
        df_tmp[PERIOD_COL] = pd.NA

    df_tmp = df_tmp.rename(columns={VALUE_COL: new_col_name})
    df_tmp["__marker"] = "df"

    # Ensure consistent types for safety
    base_tmp[ID_COL] = base_tmp[ID_COL].astype(str)
    df_tmp[ID_COL] = df_tmp[ID_COL].astype(str)
    base_tmp[PIT_DATE_COL] = pd.to_datetime(base_tmp[PIT_DATE_COL])
    df_tmp[PIT_DATE_COL] = pd.to_datetime(df_tmp[PIT_DATE_COL])

    base_tmp[new_col_name] = pd.to_numeric(base_tmp[new_col_name], errors="coerce")
    df_tmp[new_col_name] = pd.to_numeric(df_tmp[new_col_name], errors="coerce")

    # Concatenate base rows and df rows
    combined = pd.concat([base_tmp, df_tmp], ignore_index=True)

    # Define an order so that df rows come before base rows on the same PIT Date
    marker_order = {"df": 0, "base": 1}
    combined["__order"] = combined["__marker"].map(marker_order).astype("int8")

    # Build sort and group keys
    group_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL]
    sort_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL]

    if has_period:
        # Period-aware dataset: also group by AnnPITValue_Period
        group_cols.append(PERIOD_COL)
        sort_cols.append(PERIOD_COL)

    sort_cols += [PIT_DATE_COL, "__order"]

    # Sort accordingly
    combined = combined.sort_values(sort_cols)

    # Forward-fill the value within each group to implement "as-of" logic
    combined[new_col_name] = combined.groupby(group_cols)[new_col_name].ffill()

    # Keep only rows that belong to the base dataset
    result = combined[combined["__marker"] == "base"].copy()

    # Drop helper columns and sort final result
    result = result.drop(columns=["__marker", "__order"])
    result = result.sort_values(
        [ID_COL, HIST_CURR_COL, FISCAL_PER_COL, PERIOD_COL, PIT_DATE_COL]
    ).reset_index(drop=True)

    return result


def build_and_save_variable(
    input_files,
    value_column_names,
    output_file,
    save_intermediate: bool = False,
) -> Path:
    """
    Build the final "view" for a variable based on multiple input files and save it.

    Steps:
    1) Validate arguments (non-empty, same length for files and column names).
    2) Load and preprocess each input file.
    3) Build the base dataset with all unique identifier combinations
       (including AnnPITValue_Period).
    4) For each input dataframe, perform an as-of merge of its AnnPITValue into the base,
       using AnnPITValue_Period in the grouping if present in that dataset.
    5) Keep only the base columns and the value columns.
    6) Write the final result to output_file in OUTPUT_DIR.
    """
    if len(input_files) == 0:
        raise ValueError("No INPUT_FILES were provided.")
    if len(input_files) != len(value_column_names):
        raise ValueError("INPUT_FILES and VALUE_COLUMN_NAMES must have the same length.")

    start_total = time.time()

    # Build full paths
    paths = [OUTPUT_DIR / f for f in input_files]

    print("\n--- Loading input files ---")
    t0 = time.time()
    dfs = [load_dataset(p) for p in paths]
    print(f"Loading and preprocessing finished in {time.time() - t0:.1f} seconds.")

    # Unique firm-year counts per input dataset
    print("\n--- Unique firm-year (ID, FiscalPeriod) counts per input file ---")
    for path, df in zip(paths, dfs):
        n_firm_years = df[[ID_COL, FISCAL_PER_COL]].drop_duplicates().shape[0]
        print(f"{path.name}: {n_firm_years:,} unique (ID, FiscalPeriod) combinations")

    print("\n--- Building base dataset ---")
    t0 = time.time()
    base = build_base_frame(dfs)
    print(
        f"Base dataset has {len(base):,} rows and was built in "
        f"{time.time() - t0:.1f} seconds."
    )

    print("\n--- Starting as-of merges ---")
    result = base
    for idx, (df, col_name) in enumerate(zip(dfs, value_column_names), start=1):
        print(f"[{idx}/{len(dfs)}] Merging value column '{col_name}' ...")
        t_merge = time.time()
        result = asof_merge_one(result, df, col_name)
        print(
            f"    Done in {time.time() - t_merge:.1f} seconds. "
            f"Result currently has {len(result):,} rows."
        )

        if save_intermediate:
            stem = output_file.rsplit(".", 1)[0]
            temp_out = OUTPUT_DIR / f"{stem}_partial_{idx}.txt"
            result.to_csv(temp_out, sep=SEP, index=False)
            print(f"    Intermediate file written to: {temp_out}")

    # Keep only the base identifier columns (including period) and the value columns
    final_cols = BASE_COLS + [PERIOD_COL] + value_column_names
    result = result[final_cols]

    # Final stats on firm-years and missing values
    print("\n--- Final dataset statistics ---")
    mask_complete = result[value_column_names].notna().all(axis=1)
    complete_firm_years = (
        result.loc[mask_complete, [ID_COL, FISCAL_PER_COL]]
        .drop_duplicates()
    )
    n_complete_firm_years = complete_firm_years.shape[0]
    print(
        f"Unique (ID, FiscalPeriod) combinations with ALL value columns non-empty: "
        f"{n_complete_firm_years:,}"
    )

    print("\nEmpty (NaN) values per value column:")
    for col in value_column_names:
        n_missing = result[col].isna().sum()
        print(f"  - {col}: {n_missing:,} empty values")

    # Final output
    out_path = OUTPUT_DIR / output_file
    result.to_csv(out_path, sep=SEP, index=False)

    print(f"\nFinal view written to:\n{out_path.resolve()}")
    print(f"Total runtime: {time.time() - start_total:.1f} seconds.")

    return out_path


# --- Execution ---
out_path = build_and_save_variable(
    input_files=INPUT_FILES,
    value_column_names=VALUE_COLUMN_NAMES,
    output_file=OUTPUT_FILE,
    save_intermediate=SAVE_INTERMEDIATE,
)


Using data folder: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies

--- Loading input files ---
Loading and preprocessing finished in 4.2 seconds.

--- Unique firm-year (ID, FiscalPeriod) counts per input file ---
Net_Sales_or_Revenues.txt: 817,863 unique (ID, FiscalPeriod) combinations
Inventories___Total.txt: 875,280 unique (ID, FiscalPeriod) combinations

--- Building base dataset ---
Base dataset has 3,538,149 rows and was built in 4.9 seconds.

--- Starting as-of merges ---
[1/2] Merging value column 'rev' ...
    Done in 7.7 seconds. Result currently has 3,538,149 rows.
[2/2] Merging value column 'inv' ...
    Done in 7.2 seconds. Result currently has 3,538,149 rows.

--- Final dataset statistics ---
Unique (ID, FiscalPeriod) combinations with ALL value columns non-empty: 799,132

Empty (NaN) values per value column:
  - rev: 1,262,930 empty values
  - inv: 1,351,845 empty values

Final view written to:
/home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/data_Sli.tx

#### Slx

In [33]:
"""
Summary:
This cell
1) reads multiple input files from OUTPUT_DIR,
2) builds a base table with all unique combinations of
   (ID, PIT Date, HistCurrency, FiscalPeriod, AnnPITValue_Period),
3) for each input file, performs an "as-of" merge:
   for every base row it takes the latest AnnPITValue from that dataset
   with the same (ID, HistCurrency, FiscalPeriod) and PIT Date <= base PIT Date.
   If the dataset has AnnPITValue_Period, the merge is also grouped by that
   period label; if it does NOT have AnnPITValue_Period, it is treated as
   period-agnostic ("can fit all" periods).
4) writes the final combined view to OUTPUT_FILE in OUTPUT_DIR.

Additional tracking added:
- For each imported dataset, the code prints how many unique
  firm-year combinations exist (unique ID × FiscalPeriod).
- After all merging is complete, the code counts how many
  firm-year combinations have **no missing values** across ANY
  of the added value columns.
- At the end, the code reports how many missing values exist for each
  value column (ca, cce, cl, etc.).

The final table has the columns:
    ID, PIT Date, HistCurrency, FiscalPeriod, AnnPITValue_Period,
    Value1, Value2, ..., ValueN
where each ValueX is defined by VALUE_COLUMN_NAMES in the config.
"""


from pathlib import Path
import time
import pandas as pd

# === CONFIG ===

# Folder where all input files are stored and where the output will be written.
# Temp_file_path_A must already be defined in your notebook, for example:
# Temp_file_path_A = f"{BASE_PATH}/Temp/TempAnomalies"
OUTPUT_DIR = Path(Temp_file_path_A)

# Field separator in all input and output text files
SEP = "|"

# Input files (1..N). They are all expected to be located in OUTPUT_DIR.
INPUT_FILES = [
    "Net_Sales_or_Revenues.txt",
    "Selling_General__Administrative_Expenses.txt",
    # add more file names here if needed ...
]

# Names for the value columns corresponding to INPUT_FILES (same length as INPUT_FILES)
VALUE_COLUMN_NAMES = [
    "rev",
    "sga",
    # add more names here, one for each input file ...
]

# Name of the final output file (will be written to OUTPUT_DIR)
OUTPUT_FILE = "data_Slx.txt"

# Column names in the input files (assumed to be identical in all files)
ID_COL = "ID"
PIT_DATE_COL = "PIT Date"
HIST_CURR_COL = "HistCurrency"
FISCAL_PER_COL = "FiscalPeriod"
VALUE_COL = "AnnPITValue"
PERIOD_COL = "AnnPITValue_Period"   # NEW: period label column

BASE_COLS = [ID_COL, PIT_DATE_COL, HIST_CURR_COL, FISCAL_PER_COL]

SAVE_INTERMEDIATE = False
# ==============


# --- SANITY CHECK ---
if not OUTPUT_DIR.exists():
    raise FileNotFoundError(
        f"OUTPUT_DIR does not exist:\n{OUTPUT_DIR}\n"
        f"Please make sure Temp_file_path_A is set correctly."
    )
print("Using data folder:", OUTPUT_DIR.resolve())
# ---------------------


def load_dataset(path: Path) -> pd.DataFrame:
    """
    Load a single dataset from path and keep only the relevant columns.

    The function:
    - checks if the file exists,
    - reads it using the configured separator,
    - checks that all required columns are present,
    - keeps AnnPITValue_Period if present (and creates it as NA if not),
    - converts PIT Date to datetime,
    - casts ID to string,
    - converts AnnPITValue to numeric.
    """
    if not path.exists():
        raise FileNotFoundError(f"File not found: {path}")

    df = pd.read_csv(path, sep=SEP)

    needed_cols = BASE_COLS + [VALUE_COL]
    missing = [c for c in needed_cols if c not in df.columns]
    if missing:
        raise ValueError(f"Required columns {missing} are missing in file: {path}")

    # Keep base + AnnPITValue + (optional) AnnPITValue_Period
    extra_cols = [PERIOD_COL] if PERIOD_COL in df.columns else []
    df = df[needed_cols + extra_cols].copy()

    # Ensure AnnPITValue_Period exists in all datasets
    if PERIOD_COL not in df.columns:
        df[PERIOD_COL] = pd.NA

    # Re-order columns for consistency
    df = df[BASE_COLS + [PERIOD_COL, VALUE_COL]]

    # Ensure consistent data types
    df[PIT_DATE_COL] = pd.to_datetime(df[PIT_DATE_COL])
    df[ID_COL] = df[ID_COL].astype(str)
    df[VALUE_COL] = pd.to_numeric(df[VALUE_COL], errors="coerce")

    return df


def build_base_frame(dfs: list[pd.DataFrame]) -> pd.DataFrame:
    """
    Build the base dataset from a list of dataframes.

    Steps:
    - stack the identifier columns (including AnnPITValue_Period) from all dataframes,
    - drop duplicate combinations,
    - enforce correct data types,
    - sort the base dataset by (ID, HistCurrency, FiscalPeriod, AnnPITValue_Period, PIT Date).

    The result is the "skeleton" upon which all value columns will be merged.
    """
    # Concatenate identifier columns from all datasets (base cols + period col)
    base = pd.concat(
        [df[BASE_COLS + [PERIOD_COL]] for df in dfs],
        ignore_index=True
    )

    # Remove duplicate rows of identifiers
    base = base.drop_duplicates().reset_index(drop=True)

    # Ensure consistent types
    base[PIT_DATE_COL] = pd.to_datetime(base[PIT_DATE_COL])
    base[ID_COL] = base[ID_COL].astype(str)

    # Sort by identifier columns and date
    sort_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL, PERIOD_COL, PIT_DATE_COL]
    base = base.sort_values(sort_cols).reset_index(drop=True)

    return base


def asof_merge_one(base: pd.DataFrame, df: pd.DataFrame, new_col_name: str) -> pd.DataFrame:
    """
    Perform an "as-of" merge of one dataset into the base dataframe.

    Concept:
    - For each combination of (ID, HistCurrency, FiscalPeriod)
      and for each PIT Date in the base table,
      we want the latest AnnPITValue from df with
          df.PIT Date <= base.PIT Date
      for the same (ID, HistCurrency, FiscalPeriod).
    - If df has a non-empty AnnPITValue_Period column, the as-of grouping
      is also done by AnnPITValue_Period.
      If AnnPITValue_Period is all missing (e.g. revenue), df is treated
      as period-agnostic and can "fit all" periods.

    Implementation (vectorized, without merge_asof):
      (same as before, but grouping keys dynamically include PERIOD_COL
       only for period-aware datasets)
    """
    # Determine if this dataset is period-aware (has any non-NA period labels)
    has_period = PERIOD_COL in df.columns and df[PERIOD_COL].notna().any()

    # Work on copies to avoid modifying original base/df
    base_tmp = base.copy()
    base_tmp[new_col_name] = pd.NA
    base_tmp["__marker"] = "base"

    # Keep only identifier columns and the value column from df, then rename
    if has_period:
        df_tmp = df[BASE_COLS + [PERIOD_COL, VALUE_COL]].copy()
    else:
        df_tmp = df[BASE_COLS + [VALUE_COL]].copy()
        # Ensure PERIOD_COL exists but remains NA (period-agnostic)
        df_tmp[PERIOD_COL] = pd.NA

    df_tmp = df_tmp.rename(columns={VALUE_COL: new_col_name})
    df_tmp["__marker"] = "df"

    # Ensure consistent types for safety
    base_tmp[ID_COL] = base_tmp[ID_COL].astype(str)
    df_tmp[ID_COL] = df_tmp[ID_COL].astype(str)
    base_tmp[PIT_DATE_COL] = pd.to_datetime(base_tmp[PIT_DATE_COL])
    df_tmp[PIT_DATE_COL] = pd.to_datetime(df_tmp[PIT_DATE_COL])

    base_tmp[new_col_name] = pd.to_numeric(base_tmp[new_col_name], errors="coerce")
    df_tmp[new_col_name] = pd.to_numeric(df_tmp[new_col_name], errors="coerce")

    # Concatenate base rows and df rows
    combined = pd.concat([base_tmp, df_tmp], ignore_index=True)

    # Define an order so that df rows come before base rows on the same PIT Date
    marker_order = {"df": 0, "base": 1}
    combined["__order"] = combined["__marker"].map(marker_order).astype("int8")

    # Build sort and group keys
    group_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL]
    sort_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL]

    if has_period:
        # Period-aware dataset: also group by AnnPITValue_Period
        group_cols.append(PERIOD_COL)
        sort_cols.append(PERIOD_COL)

    sort_cols += [PIT_DATE_COL, "__order"]

    # Sort accordingly
    combined = combined.sort_values(sort_cols)

    # Forward-fill the value within each group to implement "as-of" logic
    combined[new_col_name] = combined.groupby(group_cols)[new_col_name].ffill()

    # Keep only rows that belong to the base dataset
    result = combined[combined["__marker"] == "base"].copy()

    # Drop helper columns and sort final result
    result = result.drop(columns=["__marker", "__order"])
    result = result.sort_values(
        [ID_COL, HIST_CURR_COL, FISCAL_PER_COL, PERIOD_COL, PIT_DATE_COL]
    ).reset_index(drop=True)

    return result


def build_and_save_variable(
    input_files,
    value_column_names,
    output_file,
    save_intermediate: bool = False,
) -> Path:
    """
    Build the final "view" for a variable based on multiple input files and save it.

    Steps:
    1) Validate arguments (non-empty, same length for files and column names).
    2) Load and preprocess each input file.
    3) Build the base dataset with all unique identifier combinations
       (including AnnPITValue_Period).
    4) For each input dataframe, perform an as-of merge of its AnnPITValue into the base,
       using AnnPITValue_Period in the grouping if present in that dataset.
    5) Keep only the base columns and the value columns.
    6) Write the final result to output_file in OUTPUT_DIR.
    """
    if len(input_files) == 0:
        raise ValueError("No INPUT_FILES were provided.")
    if len(input_files) != len(value_column_names):
        raise ValueError("INPUT_FILES and VALUE_COLUMN_NAMES must have the same length.")

    start_total = time.time()

    # Build full paths
    paths = [OUTPUT_DIR / f for f in input_files]

    print("\n--- Loading input files ---")
    t0 = time.time()
    dfs = [load_dataset(p) for p in paths]
    print(f"Loading and preprocessing finished in {time.time() - t0:.1f} seconds.")

    # Unique firm-year counts per input dataset
    print("\n--- Unique firm-year (ID, FiscalPeriod) counts per input file ---")
    for path, df in zip(paths, dfs):
        n_firm_years = df[[ID_COL, FISCAL_PER_COL]].drop_duplicates().shape[0]
        print(f"{path.name}: {n_firm_years:,} unique (ID, FiscalPeriod) combinations")

    print("\n--- Building base dataset ---")
    t0 = time.time()
    base = build_base_frame(dfs)
    print(
        f"Base dataset has {len(base):,} rows and was built in "
        f"{time.time() - t0:.1f} seconds."
    )

    print("\n--- Starting as-of merges ---")
    result = base
    for idx, (df, col_name) in enumerate(zip(dfs, value_column_names), start=1):
        print(f"[{idx}/{len(dfs)}] Merging value column '{col_name}' ...")
        t_merge = time.time()
        result = asof_merge_one(result, df, col_name)
        print(
            f"    Done in {time.time() - t_merge:.1f} seconds. "
            f"Result currently has {len(result):,} rows."
        )

        if save_intermediate:
            stem = output_file.rsplit(".", 1)[0]
            temp_out = OUTPUT_DIR / f"{stem}_partial_{idx}.txt"
            result.to_csv(temp_out, sep=SEP, index=False)
            print(f"    Intermediate file written to: {temp_out}")

    # Keep only the base identifier columns (including period) and the value columns
    final_cols = BASE_COLS + [PERIOD_COL] + value_column_names
    result = result[final_cols]

    # Final stats on firm-years and missing values
    print("\n--- Final dataset statistics ---")
    mask_complete = result[value_column_names].notna().all(axis=1)
    complete_firm_years = (
        result.loc[mask_complete, [ID_COL, FISCAL_PER_COL]]
        .drop_duplicates()
    )
    n_complete_firm_years = complete_firm_years.shape[0]
    print(
        f"Unique (ID, FiscalPeriod) combinations with ALL value columns non-empty: "
        f"{n_complete_firm_years:,}"
    )

    print("\nEmpty (NaN) values per value column:")
    for col in value_column_names:
        n_missing = result[col].isna().sum()
        print(f"  - {col}: {n_missing:,} empty values")

    # Final output
    out_path = OUTPUT_DIR / output_file
    result.to_csv(out_path, sep=SEP, index=False)

    print(f"\nFinal view written to:\n{out_path.resolve()}")
    print(f"Total runtime: {time.time() - start_total:.1f} seconds.")

    return out_path


# --- Execution ---
out_path = build_and_save_variable(
    input_files=INPUT_FILES,
    value_column_names=VALUE_COLUMN_NAMES,
    output_file=OUTPUT_FILE,
    save_intermediate=SAVE_INTERMEDIATE,
)


Using data folder: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies

--- Loading input files ---
Loading and preprocessing finished in 3.7 seconds.

--- Unique firm-year (ID, FiscalPeriod) counts per input file ---
Net_Sales_or_Revenues.txt: 817,863 unique (ID, FiscalPeriod) combinations
Selling_General__Administrative_Expenses.txt: 721,610 unique (ID, FiscalPeriod) combinations

--- Building base dataset ---
Base dataset has 1,694,448 rows and was built in 4.7 seconds.

--- Starting as-of merges ---
[1/2] Merging value column 'rev' ...
    Done in 6.3 seconds. Result currently has 1,694,448 rows.
[2/2] Merging value column 'sga' ...
    Done in 6.0 seconds. Result currently has 1,694,448 rows.

--- Final dataset statistics ---
Unique (ID, FiscalPeriod) combinations with ALL value columns non-empty: 699,730

Empty (NaN) values per value column:
  - rev: 37,706 empty values
  - sga: 402,105 empty values

Final view written to:
/home/jovyan/work/hpool1/pseidel/test/Temp/TempAnoma

#### Tx

In [34]:
"""
Summary:
This cell
1) reads multiple input files from OUTPUT_DIR,
2) builds a base table with all unique combinations of
   (ID, PIT Date, HistCurrency, FiscalPeriod, AnnPITValue_Period),
3) for each input file, performs an "as-of" merge:
   for every base row it takes the latest AnnPITValue from that dataset
   with the same (ID, HistCurrency, FiscalPeriod) and PIT Date <= base PIT Date.
   If the dataset has AnnPITValue_Period, the merge is also grouped by that
   period label; if it does NOT have AnnPITValue_Period, it is treated as
   period-agnostic ("can fit all" periods).
4) writes the final combined view to OUTPUT_FILE in OUTPUT_DIR.

Additional tracking added:
- For each imported dataset, the code prints how many unique
  firm-year combinations exist (unique ID × FiscalPeriod).
- After all merging is complete, the code counts how many
  firm-year combinations have **no missing values** across ANY
  of the added value columns.
- At the end, the code reports how many missing values exist for each
  value column (ca, cce, cl, etc.).

The final table has the columns:
    ID, PIT Date, HistCurrency, FiscalPeriod, AnnPITValue_Period,
    Value1, Value2, ..., ValueN
where each ValueX is defined by VALUE_COLUMN_NAMES in the config.
"""


from pathlib import Path
import time
import pandas as pd

# === CONFIG ===

# Folder where all input files are stored and where the output will be written.
# Temp_file_path_A must already be defined in your notebook, for example:
# Temp_file_path_A = f"{BASE_PATH}/Temp/TempAnomalies"
OUTPUT_DIR = Path(Temp_file_path_A)

# Field separator in all input and output text files
SEP = "|"

# Input files (1..N). They are all expected to be located in OUTPUT_DIR.
INPUT_FILES = [
    "Income_Taxes.txt",
    "Net_Income_Before_Extra_Items_Preferred_Divs.txt",
    # add more file names here if needed ...
]

# Names for the value columns corresponding to INPUT_FILES (same length as INPUT_FILES)
VALUE_COLUMN_NAMES = [
    "itx",
    "ni_extra",
    # add more names here, one for each input file ...
]

# Name of the final output file (will be written to OUTPUT_DIR)
OUTPUT_FILE = "data_Tx.txt"

# Column names in the input files (assumed to be identical in all files)
ID_COL = "ID"
PIT_DATE_COL = "PIT Date"
HIST_CURR_COL = "HistCurrency"
FISCAL_PER_COL = "FiscalPeriod"
VALUE_COL = "AnnPITValue"
PERIOD_COL = "AnnPITValue_Period"   # NEW: period label column

BASE_COLS = [ID_COL, PIT_DATE_COL, HIST_CURR_COL, FISCAL_PER_COL]

SAVE_INTERMEDIATE = False
# ==============


# --- SANITY CHECK ---
if not OUTPUT_DIR.exists():
    raise FileNotFoundError(
        f"OUTPUT_DIR does not exist:\n{OUTPUT_DIR}\n"
        f"Please make sure Temp_file_path_A is set correctly."
    )
print("Using data folder:", OUTPUT_DIR.resolve())
# ---------------------


def load_dataset(path: Path) -> pd.DataFrame:
    """
    Load a single dataset from path and keep only the relevant columns.

    The function:
    - checks if the file exists,
    - reads it using the configured separator,
    - checks that all required columns are present,
    - keeps AnnPITValue_Period if present (and creates it as NA if not),
    - converts PIT Date to datetime,
    - casts ID to string,
    - converts AnnPITValue to numeric.
    """
    if not path.exists():
        raise FileNotFoundError(f"File not found: {path}")

    df = pd.read_csv(path, sep=SEP)

    needed_cols = BASE_COLS + [VALUE_COL]
    missing = [c for c in needed_cols if c not in df.columns]
    if missing:
        raise ValueError(f"Required columns {missing} are missing in file: {path}")

    # Keep base + AnnPITValue + (optional) AnnPITValue_Period
    extra_cols = [PERIOD_COL] if PERIOD_COL in df.columns else []
    df = df[needed_cols + extra_cols].copy()

    # Ensure AnnPITValue_Period exists in all datasets
    if PERIOD_COL not in df.columns:
        df[PERIOD_COL] = pd.NA

    # Re-order columns for consistency
    df = df[BASE_COLS + [PERIOD_COL, VALUE_COL]]

    # Ensure consistent data types
    df[PIT_DATE_COL] = pd.to_datetime(df[PIT_DATE_COL])
    df[ID_COL] = df[ID_COL].astype(str)
    df[VALUE_COL] = pd.to_numeric(df[VALUE_COL], errors="coerce")

    return df


def build_base_frame(dfs: list[pd.DataFrame]) -> pd.DataFrame:
    """
    Build the base dataset from a list of dataframes.

    Steps:
    - stack the identifier columns (including AnnPITValue_Period) from all dataframes,
    - drop duplicate combinations,
    - enforce correct data types,
    - sort the base dataset by (ID, HistCurrency, FiscalPeriod, AnnPITValue_Period, PIT Date).

    The result is the "skeleton" upon which all value columns will be merged.
    """
    # Concatenate identifier columns from all datasets (base cols + period col)
    base = pd.concat(
        [df[BASE_COLS + [PERIOD_COL]] for df in dfs],
        ignore_index=True
    )

    # Remove duplicate rows of identifiers
    base = base.drop_duplicates().reset_index(drop=True)

    # Ensure consistent types
    base[PIT_DATE_COL] = pd.to_datetime(base[PIT_DATE_COL])
    base[ID_COL] = base[ID_COL].astype(str)

    # Sort by identifier columns and date
    sort_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL, PERIOD_COL, PIT_DATE_COL]
    base = base.sort_values(sort_cols).reset_index(drop=True)

    return base


def asof_merge_one(base: pd.DataFrame, df: pd.DataFrame, new_col_name: str) -> pd.DataFrame:
    """
    Perform an "as-of" merge of one dataset into the base dataframe.

    Concept:
    - For each combination of (ID, HistCurrency, FiscalPeriod)
      and for each PIT Date in the base table,
      we want the latest AnnPITValue from df with
          df.PIT Date <= base.PIT Date
      for the same (ID, HistCurrency, FiscalPeriod).
    - If df has a non-empty AnnPITValue_Period column, the as-of grouping
      is also done by AnnPITValue_Period.
      If AnnPITValue_Period is all missing (e.g. revenue), df is treated
      as period-agnostic and can "fit all" periods.

    Implementation (vectorized, without merge_asof):
      (same as before, but grouping keys dynamically include PERIOD_COL
       only for period-aware datasets)
    """
    # Determine if this dataset is period-aware (has any non-NA period labels)
    has_period = PERIOD_COL in df.columns and df[PERIOD_COL].notna().any()

    # Work on copies to avoid modifying original base/df
    base_tmp = base.copy()
    base_tmp[new_col_name] = pd.NA
    base_tmp["__marker"] = "base"

    # Keep only identifier columns and the value column from df, then rename
    if has_period:
        df_tmp = df[BASE_COLS + [PERIOD_COL, VALUE_COL]].copy()
    else:
        df_tmp = df[BASE_COLS + [VALUE_COL]].copy()
        # Ensure PERIOD_COL exists but remains NA (period-agnostic)
        df_tmp[PERIOD_COL] = pd.NA

    df_tmp = df_tmp.rename(columns={VALUE_COL: new_col_name})
    df_tmp["__marker"] = "df"

    # Ensure consistent types for safety
    base_tmp[ID_COL] = base_tmp[ID_COL].astype(str)
    df_tmp[ID_COL] = df_tmp[ID_COL].astype(str)
    base_tmp[PIT_DATE_COL] = pd.to_datetime(base_tmp[PIT_DATE_COL])
    df_tmp[PIT_DATE_COL] = pd.to_datetime(df_tmp[PIT_DATE_COL])

    base_tmp[new_col_name] = pd.to_numeric(base_tmp[new_col_name], errors="coerce")
    df_tmp[new_col_name] = pd.to_numeric(df_tmp[new_col_name], errors="coerce")

    # Concatenate base rows and df rows
    combined = pd.concat([base_tmp, df_tmp], ignore_index=True)

    # Define an order so that df rows come before base rows on the same PIT Date
    marker_order = {"df": 0, "base": 1}
    combined["__order"] = combined["__marker"].map(marker_order).astype("int8")

    # Build sort and group keys
    group_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL]
    sort_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL]

    if has_period:
        # Period-aware dataset: also group by AnnPITValue_Period
        group_cols.append(PERIOD_COL)
        sort_cols.append(PERIOD_COL)

    sort_cols += [PIT_DATE_COL, "__order"]

    # Sort accordingly
    combined = combined.sort_values(sort_cols)

    # Forward-fill the value within each group to implement "as-of" logic
    combined[new_col_name] = combined.groupby(group_cols)[new_col_name].ffill()

    # Keep only rows that belong to the base dataset
    result = combined[combined["__marker"] == "base"].copy()

    # Drop helper columns and sort final result
    result = result.drop(columns=["__marker", "__order"])
    result = result.sort_values(
        [ID_COL, HIST_CURR_COL, FISCAL_PER_COL, PERIOD_COL, PIT_DATE_COL]
    ).reset_index(drop=True)

    return result


def build_and_save_variable(
    input_files,
    value_column_names,
    output_file,
    save_intermediate: bool = False,
) -> Path:
    """
    Build the final "view" for a variable based on multiple input files and save it.

    Steps:
    1) Validate arguments (non-empty, same length for files and column names).
    2) Load and preprocess each input file.
    3) Build the base dataset with all unique identifier combinations
       (including AnnPITValue_Period).
    4) For each input dataframe, perform an as-of merge of its AnnPITValue into the base,
       using AnnPITValue_Period in the grouping if present in that dataset.
    5) Keep only the base columns and the value columns.
    6) Write the final result to output_file in OUTPUT_DIR.
    """
    if len(input_files) == 0:
        raise ValueError("No INPUT_FILES were provided.")
    if len(input_files) != len(value_column_names):
        raise ValueError("INPUT_FILES and VALUE_COLUMN_NAMES must have the same length.")

    start_total = time.time()

    # Build full paths
    paths = [OUTPUT_DIR / f for f in input_files]

    print("\n--- Loading input files ---")
    t0 = time.time()
    dfs = [load_dataset(p) for p in paths]
    print(f"Loading and preprocessing finished in {time.time() - t0:.1f} seconds.")

    # Unique firm-year counts per input dataset
    print("\n--- Unique firm-year (ID, FiscalPeriod) counts per input file ---")
    for path, df in zip(paths, dfs):
        n_firm_years = df[[ID_COL, FISCAL_PER_COL]].drop_duplicates().shape[0]
        print(f"{path.name}: {n_firm_years:,} unique (ID, FiscalPeriod) combinations")

    print("\n--- Building base dataset ---")
    t0 = time.time()
    base = build_base_frame(dfs)
    print(
        f"Base dataset has {len(base):,} rows and was built in "
        f"{time.time() - t0:.1f} seconds."
    )

    print("\n--- Starting as-of merges ---")
    result = base
    for idx, (df, col_name) in enumerate(zip(dfs, value_column_names), start=1):
        print(f"[{idx}/{len(dfs)}] Merging value column '{col_name}' ...")
        t_merge = time.time()
        result = asof_merge_one(result, df, col_name)
        print(
            f"    Done in {time.time() - t_merge:.1f} seconds. "
            f"Result currently has {len(result):,} rows."
        )

        if save_intermediate:
            stem = output_file.rsplit(".", 1)[0]
            temp_out = OUTPUT_DIR / f"{stem}_partial_{idx}.txt"
            result.to_csv(temp_out, sep=SEP, index=False)
            print(f"    Intermediate file written to: {temp_out}")

    # Keep only the base identifier columns (including period) and the value columns
    final_cols = BASE_COLS + [PERIOD_COL] + value_column_names
    result = result[final_cols]

    # Final stats on firm-years and missing values
    print("\n--- Final dataset statistics ---")
    mask_complete = result[value_column_names].notna().all(axis=1)
    complete_firm_years = (
        result.loc[mask_complete, [ID_COL, FISCAL_PER_COL]]
        .drop_duplicates()
    )
    n_complete_firm_years = complete_firm_years.shape[0]
    print(
        f"Unique (ID, FiscalPeriod) combinations with ALL value columns non-empty: "
        f"{n_complete_firm_years:,}"
    )

    print("\nEmpty (NaN) values per value column:")
    for col in value_column_names:
        n_missing = result[col].isna().sum()
        print(f"  - {col}: {n_missing:,} empty values")

    # Final output
    out_path = OUTPUT_DIR / output_file
    result.to_csv(out_path, sep=SEP, index=False)

    print(f"\nFinal view written to:\n{out_path.resolve()}")
    print(f"Total runtime: {time.time() - start_total:.1f} seconds.")

    return out_path


# --- Execution ---
out_path = build_and_save_variable(
    input_files=INPUT_FILES,
    value_column_names=VALUE_COLUMN_NAMES,
    output_file=OUTPUT_FILE,
    save_intermediate=SAVE_INTERMEDIATE,
)


Using data folder: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies

--- Loading input files ---
Loading and preprocessing finished in 3.6 seconds.

--- Unique firm-year (ID, FiscalPeriod) counts per input file ---
Income_Taxes.txt: 819,430 unique (ID, FiscalPeriod) combinations
Net_Income_Before_Extra_Items_Preferred_Divs.txt: 842,027 unique (ID, FiscalPeriod) combinations

--- Building base dataset ---
Base dataset has 1,451,441 rows and was built in 4.3 seconds.

--- Starting as-of merges ---
[1/2] Merging value column 'itx' ...
    Done in 5.2 seconds. Result currently has 1,451,441 rows.
[2/2] Merging value column 'ni_extra' ...
    Done in 5.7 seconds. Result currently has 1,451,441 rows.

--- Final dataset statistics ---
Unique (ID, FiscalPeriod) combinations with ALL value columns non-empty: 819,235

Empty (NaN) values per value column:
  - itx: 98,894 empty values
  - ni_extra: 11,540 empty values

Final view written to:
/home/jovyan/work/hpool1/pseidel/test/Temp/TempA

#### Txf

In [35]:
"""
Summary:
This cell
1) reads multiple input files from OUTPUT_DIR,
2) builds a base table with all unique combinations of
   (ID, PIT Date, HistCurrency, FiscalPeriod, AnnPITValue_Period),
3) for each input file, performs an "as-of" merge:
   for every base row it takes the latest AnnPITValue from that dataset
   with the same (ID, HistCurrency, FiscalPeriod) and PIT Date <= base PIT Date.
   If the dataset has AnnPITValue_Period, the merge is also grouped by that
   period label; if it does NOT have AnnPITValue_Period, it is treated as
   period-agnostic ("can fit all" periods).
4) writes the final combined view to OUTPUT_FILE in OUTPUT_DIR.

Additional tracking added:
- For each imported dataset, the code prints how many unique
  firm-year combinations exist (unique ID × FiscalPeriod).
- After all merging is complete, the code counts how many
  firm-year combinations have **no missing values** across ANY
  of the added value columns.
- At the end, the code reports how many missing values exist for each
  value column (ca, cce, cl, etc.).

The final table has the columns:
    ID, PIT Date, HistCurrency, FiscalPeriod, AnnPITValue_Period,
    Value1, Value2, ..., ValueN
where each ValueX is defined by VALUE_COLUMN_NAMES in the config.
"""


from pathlib import Path
import time
import pandas as pd

# === CONFIG ===

# Folder where all input files are stored and where the output will be written.
# Temp_file_path_A must already be defined in your notebook, for example:
# Temp_file_path_A = f"{BASE_PATH}/Temp/TempAnomalies"
OUTPUT_DIR = Path(Temp_file_path_A)

# Field separator in all input and output text files
SEP = "|"

# Input files (1..N). They are all expected to be located in OUTPUT_DIR.
INPUT_FILES = [
    "Net_Proceeds_From_Sale_Issue_of_Com__Pfd.txt",
    "Com_Pfd_Redeemed_Retired_Converted_Etc..txt",
    "Cash_Dividends_Paid___Total.txt",
    "Long_Term_Borrowings.txt",
    "Reduction_in_Long_Term_Debt.txt",
    "Total_Assets.txt",
    # add more file names here if needed ...
]

# Names for the value columns corresponding to INPUT_FILES (same length as INPUT_FILES)
VALUE_COLUMN_NAMES = [
    "socaps",
    "pocaps",
    "div",
    "diss",
    "dred",
    "at",

    # add more names here, one for each input file ...
]

# Name of the final output file (will be written to OUTPUT_DIR)
OUTPUT_FILE = "data_Txf.txt"

# Column names in the input files (assumed to be identical in all files)
ID_COL = "ID"
PIT_DATE_COL = "PIT Date"
HIST_CURR_COL = "HistCurrency"
FISCAL_PER_COL = "FiscalPeriod"
VALUE_COL = "AnnPITValue"
PERIOD_COL = "AnnPITValue_Period"   # NEW: period label column

BASE_COLS = [ID_COL, PIT_DATE_COL, HIST_CURR_COL, FISCAL_PER_COL]

SAVE_INTERMEDIATE = False
# ==============


# --- SANITY CHECK ---
if not OUTPUT_DIR.exists():
    raise FileNotFoundError(
        f"OUTPUT_DIR does not exist:\n{OUTPUT_DIR}\n"
        f"Please make sure Temp_file_path_A is set correctly."
    )
print("Using data folder:", OUTPUT_DIR.resolve())
# ---------------------


def load_dataset(path: Path) -> pd.DataFrame:
    """
    Load a single dataset from path and keep only the relevant columns.

    The function:
    - checks if the file exists,
    - reads it using the configured separator,
    - checks that all required columns are present,
    - keeps AnnPITValue_Period if present (and creates it as NA if not),
    - converts PIT Date to datetime,
    - casts ID to string,
    - converts AnnPITValue to numeric.
    """
    if not path.exists():
        raise FileNotFoundError(f"File not found: {path}")

    df = pd.read_csv(path, sep=SEP)

    needed_cols = BASE_COLS + [VALUE_COL]
    missing = [c for c in needed_cols if c not in df.columns]
    if missing:
        raise ValueError(f"Required columns {missing} are missing in file: {path}")

    # Keep base + AnnPITValue + (optional) AnnPITValue_Period
    extra_cols = [PERIOD_COL] if PERIOD_COL in df.columns else []
    df = df[needed_cols + extra_cols].copy()

    # Ensure AnnPITValue_Period exists in all datasets
    if PERIOD_COL not in df.columns:
        df[PERIOD_COL] = pd.NA

    # Re-order columns for consistency
    df = df[BASE_COLS + [PERIOD_COL, VALUE_COL]]

    # Ensure consistent data types
    df[PIT_DATE_COL] = pd.to_datetime(df[PIT_DATE_COL])
    df[ID_COL] = df[ID_COL].astype(str)
    df[VALUE_COL] = pd.to_numeric(df[VALUE_COL], errors="coerce")

    return df


def build_base_frame(dfs: list[pd.DataFrame]) -> pd.DataFrame:
    """
    Build the base dataset from a list of dataframes.

    Steps:
    - stack the identifier columns (including AnnPITValue_Period) from all dataframes,
    - drop duplicate combinations,
    - enforce correct data types,
    - sort the base dataset by (ID, HistCurrency, FiscalPeriod, AnnPITValue_Period, PIT Date).

    The result is the "skeleton" upon which all value columns will be merged.
    """
    # Concatenate identifier columns from all datasets (base cols + period col)
    base = pd.concat(
        [df[BASE_COLS + [PERIOD_COL]] for df in dfs],
        ignore_index=True
    )

    # Remove duplicate rows of identifiers
    base = base.drop_duplicates().reset_index(drop=True)

    # Ensure consistent types
    base[PIT_DATE_COL] = pd.to_datetime(base[PIT_DATE_COL])
    base[ID_COL] = base[ID_COL].astype(str)

    # Sort by identifier columns and date
    sort_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL, PERIOD_COL, PIT_DATE_COL]
    base = base.sort_values(sort_cols).reset_index(drop=True)

    return base


def asof_merge_one(base: pd.DataFrame, df: pd.DataFrame, new_col_name: str) -> pd.DataFrame:
    """
    Perform an "as-of" merge of one dataset into the base dataframe.

    Concept:
    - For each combination of (ID, HistCurrency, FiscalPeriod)
      and for each PIT Date in the base table,
      we want the latest AnnPITValue from df with
          df.PIT Date <= base.PIT Date
      for the same (ID, HistCurrency, FiscalPeriod).
    - If df has a non-empty AnnPITValue_Period column, the as-of grouping
      is also done by AnnPITValue_Period.
      If AnnPITValue_Period is all missing (e.g. revenue), df is treated
      as period-agnostic and can "fit all" periods.

    Implementation (vectorized, without merge_asof):
      (same as before, but grouping keys dynamically include PERIOD_COL
       only for period-aware datasets)
    """
    # Determine if this dataset is period-aware (has any non-NA period labels)
    has_period = PERIOD_COL in df.columns and df[PERIOD_COL].notna().any()

    # Work on copies to avoid modifying original base/df
    base_tmp = base.copy()
    base_tmp[new_col_name] = pd.NA
    base_tmp["__marker"] = "base"

    # Keep only identifier columns and the value column from df, then rename
    if has_period:
        df_tmp = df[BASE_COLS + [PERIOD_COL, VALUE_COL]].copy()
    else:
        df_tmp = df[BASE_COLS + [VALUE_COL]].copy()
        # Ensure PERIOD_COL exists but remains NA (period-agnostic)
        df_tmp[PERIOD_COL] = pd.NA

    df_tmp = df_tmp.rename(columns={VALUE_COL: new_col_name})
    df_tmp["__marker"] = "df"

    # Ensure consistent types for safety
    base_tmp[ID_COL] = base_tmp[ID_COL].astype(str)
    df_tmp[ID_COL] = df_tmp[ID_COL].astype(str)
    base_tmp[PIT_DATE_COL] = pd.to_datetime(base_tmp[PIT_DATE_COL])
    df_tmp[PIT_DATE_COL] = pd.to_datetime(df_tmp[PIT_DATE_COL])

    base_tmp[new_col_name] = pd.to_numeric(base_tmp[new_col_name], errors="coerce")
    df_tmp[new_col_name] = pd.to_numeric(df_tmp[new_col_name], errors="coerce")

    # Concatenate base rows and df rows
    combined = pd.concat([base_tmp, df_tmp], ignore_index=True)

    # Define an order so that df rows come before base rows on the same PIT Date
    marker_order = {"df": 0, "base": 1}
    combined["__order"] = combined["__marker"].map(marker_order).astype("int8")

    # Build sort and group keys
    group_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL]
    sort_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL]

    if has_period:
        # Period-aware dataset: also group by AnnPITValue_Period
        group_cols.append(PERIOD_COL)
        sort_cols.append(PERIOD_COL)

    sort_cols += [PIT_DATE_COL, "__order"]

    # Sort accordingly
    combined = combined.sort_values(sort_cols)

    # Forward-fill the value within each group to implement "as-of" logic
    combined[new_col_name] = combined.groupby(group_cols)[new_col_name].ffill()

    # Keep only rows that belong to the base dataset
    result = combined[combined["__marker"] == "base"].copy()

    # Drop helper columns and sort final result
    result = result.drop(columns=["__marker", "__order"])
    result = result.sort_values(
        [ID_COL, HIST_CURR_COL, FISCAL_PER_COL, PERIOD_COL, PIT_DATE_COL]
    ).reset_index(drop=True)

    return result


def build_and_save_variable(
    input_files,
    value_column_names,
    output_file,
    save_intermediate: bool = False,
) -> Path:
    """
    Build the final "view" for a variable based on multiple input files and save it.

    Steps:
    1) Validate arguments (non-empty, same length for files and column names).
    2) Load and preprocess each input file.
    3) Build the base dataset with all unique identifier combinations
       (including AnnPITValue_Period).
    4) For each input dataframe, perform an as-of merge of its AnnPITValue into the base,
       using AnnPITValue_Period in the grouping if present in that dataset.
    5) Keep only the base columns and the value columns.
    6) Write the final result to output_file in OUTPUT_DIR.
    """
    if len(input_files) == 0:
        raise ValueError("No INPUT_FILES were provided.")
    if len(input_files) != len(value_column_names):
        raise ValueError("INPUT_FILES and VALUE_COLUMN_NAMES must have the same length.")

    start_total = time.time()

    # Build full paths
    paths = [OUTPUT_DIR / f for f in input_files]

    print("\n--- Loading input files ---")
    t0 = time.time()
    dfs = [load_dataset(p) for p in paths]
    print(f"Loading and preprocessing finished in {time.time() - t0:.1f} seconds.")

    # Unique firm-year counts per input dataset
    print("\n--- Unique firm-year (ID, FiscalPeriod) counts per input file ---")
    for path, df in zip(paths, dfs):
        n_firm_years = df[[ID_COL, FISCAL_PER_COL]].drop_duplicates().shape[0]
        print(f"{path.name}: {n_firm_years:,} unique (ID, FiscalPeriod) combinations")

    print("\n--- Building base dataset ---")
    t0 = time.time()
    base = build_base_frame(dfs)
    print(
        f"Base dataset has {len(base):,} rows and was built in "
        f"{time.time() - t0:.1f} seconds."
    )

    print("\n--- Starting as-of merges ---")
    result = base
    for idx, (df, col_name) in enumerate(zip(dfs, value_column_names), start=1):
        print(f"[{idx}/{len(dfs)}] Merging value column '{col_name}' ...")
        t_merge = time.time()
        result = asof_merge_one(result, df, col_name)
        print(
            f"    Done in {time.time() - t_merge:.1f} seconds. "
            f"Result currently has {len(result):,} rows."
        )

        if save_intermediate:
            stem = output_file.rsplit(".", 1)[0]
            temp_out = OUTPUT_DIR / f"{stem}_partial_{idx}.txt"
            result.to_csv(temp_out, sep=SEP, index=False)
            print(f"    Intermediate file written to: {temp_out}")

    # Keep only the base identifier columns (including period) and the value columns
    final_cols = BASE_COLS + [PERIOD_COL] + value_column_names
    result = result[final_cols]

    # Final stats on firm-years and missing values
    print("\n--- Final dataset statistics ---")
    mask_complete = result[value_column_names].notna().all(axis=1)
    complete_firm_years = (
        result.loc[mask_complete, [ID_COL, FISCAL_PER_COL]]
        .drop_duplicates()
    )
    n_complete_firm_years = complete_firm_years.shape[0]
    print(
        f"Unique (ID, FiscalPeriod) combinations with ALL value columns non-empty: "
        f"{n_complete_firm_years:,}"
    )

    print("\nEmpty (NaN) values per value column:")
    for col in value_column_names:
        n_missing = result[col].isna().sum()
        print(f"  - {col}: {n_missing:,} empty values")

    # Final output
    out_path = OUTPUT_DIR / output_file
    result.to_csv(out_path, sep=SEP, index=False)

    print(f"\nFinal view written to:\n{out_path.resolve()}")
    print(f"Total runtime: {time.time() - start_total:.1f} seconds.")

    return out_path


# --- Execution ---
out_path = build_and_save_variable(
    input_files=INPUT_FILES,
    value_column_names=VALUE_COLUMN_NAMES,
    output_file=OUTPUT_FILE,
    save_intermediate=SAVE_INTERMEDIATE,
)


Using data folder: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies

--- Loading input files ---
Loading and preprocessing finished in 7.0 seconds.

--- Unique firm-year (ID, FiscalPeriod) counts per input file ---
Net_Proceeds_From_Sale_Issue_of_Com__Pfd.txt: 740,951 unique (ID, FiscalPeriod) combinations
Com_Pfd_Redeemed_Retired_Converted_Etc..txt: 665,679 unique (ID, FiscalPeriod) combinations
Cash_Dividends_Paid___Total.txt: 811,535 unique (ID, FiscalPeriod) combinations
Long_Term_Borrowings.txt: 633,163 unique (ID, FiscalPeriod) combinations
Reduction_in_Long_Term_Debt.txt: 658,636 unique (ID, FiscalPeriod) combinations
Total_Assets.txt: 871,391 unique (ID, FiscalPeriod) combinations

--- Building base dataset ---
Base dataset has 3,798,877 rows and was built in 7.8 seconds.

--- Starting as-of merges ---
[1/6] Merging value column 'socaps' ...
    Done in 6.6 seconds. Result currently has 3,798,877 rows.
[2/6] Merging value column 'pocaps' ...
    Done in 6.7 seconds. Res

## Calculate "In-between-variables" from Data (e.g. Working Capital for Acc)

### Acc

In [36]:
# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file (INPUT_FILE) from OUTPUT_DIR.
# 2) Creates a new column (NEW_COLUMN) based on a flexible arithmetic formula
#    defined in FORMULA_EXPRESSION using existing columns.
# 3) The formula is only applied to rows where ALL columns listed in
#    FORMULA_COLUMNS are non-null. Otherwise, the result is set to NaN.
# 4) The updated DataFrame is stored in `df_out` and written to disk as
#    "formula_<INPUT_FILE>" in the same folder.
#
# Examples for FORMULA_EXPRESSION:
# - "(at - xt) / ft"
# - "col1 + col2 + col3"
# - "(colA - colB) * colC"
# =============================================================================

from pathlib import Path
import pandas as pd
import numpy as np

# ========================= CONFIG ============================================
OUTPUT_DIR = Path(Temp_file_path_A)
SEP = "|"

INPUT_FILE = "data_Acc.txt"               # <-- Set your input file name here
NEW_COLUMN = "wc"             # <-- Name of the new computed column
FORMULA_EXPRESSION = "ca - cce - cl + std + itp - da"  # <-- Formula using column names
FORMULA_COLUMNS = ["ca", "cce", "cl", "std", "itp", "da"]   # <-- All columns required to be non-null
# ============================================================================

# 1) LOAD INPUT FILE
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

df = pd.read_csv(path, sep=SEP)

# 2) CHECK THAT ALL REQUIRED COLUMNS EXIST
missing_formula_cols = [c for c in FORMULA_COLUMNS if c not in df.columns]
if missing_formula_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing required columns for formula: {missing_formula_cols}"
    )

# 3) INITIALIZE NEW COLUMN WITH NaN
df_out = df.copy()
df_out[NEW_COLUMN] = np.nan

# 4) BUILD MASK WHERE ALL REQUIRED COLUMNS ARE NON-NULL
mask_all_present = df_out[FORMULA_COLUMNS].notna().all(axis=1)

# 5) APPLY FORMULA ONLY WHERE mask_all_present IS TRUE
# We use DataFrame.eval so column names can directly be used in FORMULA_EXPRESSION.
# To avoid evaluating on rows that are not allowed, we apply eval on the subset
# and then assign back.
df_out.loc[mask_all_present, NEW_COLUMN] = df_out.loc[mask_all_present].eval(
    FORMULA_EXPRESSION
)

# 6) SAVE RESULT AS "formula_<INPUT_FILE>" IN THE SAME FOLDER
formula_output_filename = f"{INPUT_FILE}"
formula_output_path = OUTPUT_DIR / formula_output_filename
df_out.to_csv(formula_output_path, sep=SEP, index=False)

print(f"New column '{NEW_COLUMN}' created based on formula:")
print("   ", FORMULA_EXPRESSION)
print("Rows used (all required columns non-null):", mask_all_present.sum())
print("Result was saved to:", formula_output_path)
print(df_out[[*FORMULA_COLUMNS, NEW_COLUMN]].head(20))


New column 'wc' created based on formula:
    ca - cce - cl + std + itp - da
Rows used (all required columns non-null): 632091
Result was saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/data_Acc.txt
            ca         cce          cl         std        itp         da  \
0   748.140365  167.849993  663.515421   54.137880  10.582409  38.094417   
1          NaN         NaN         NaN         NaN        NaN  38.094417   
2   864.906741  320.816779  693.189519   31.500444  12.508193  68.245694   
3          NaN         NaN         NaN         NaN        NaN  68.245694   
4   975.879591  167.728265  773.728435  104.964605   8.751102  89.546780   
5          NaN         NaN         NaN         NaN        NaN  89.546780   
6   762.389782  110.705844  852.251409  364.770602   9.041866  98.504938   
7          NaN         NaN         NaN         NaN        NaN  98.504938   
8   665.184633   70.212302  841.844876  293.363860   9.454759  84.007512   
9          NaN        

### At

#### Oa

In [37]:
# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file (INPUT_FILE) from OUTPUT_DIR.
# 2) Creates a new column (NEW_COLUMN) based on a flexible arithmetic formula
#    defined in FORMULA_EXPRESSION using existing columns.
# 3) The formula is only applied to rows where ALL columns listed in
#    FORMULA_COLUMNS are non-null. Otherwise, the result is set to NaN.
# 4) The updated DataFrame is stored in `df_out` and written to disk as
#    "formula_<INPUT_FILE>" in the same folder.
#
# Examples for FORMULA_EXPRESSION:
# - "(at - xt) / ft"
# - "col1 + col2 + col3"
# - "(colA - colB) * colC"
# =============================================================================

from pathlib import Path
import pandas as pd
import numpy as np

# ========================= CONFIG ============================================
OUTPUT_DIR = Path(Temp_file_path_A)
SEP = "|"

INPUT_FILE = "data_At.txt"               # <-- Set your input file name here
NEW_COLUMN = "oa"             # <-- Name of the new computed column
FORMULA_EXPRESSION = "at - cce"  # <-- Formula using column names
FORMULA_COLUMNS = ["at", "cce"]   # <-- All columns required to be non-null
# ============================================================================

# 1) LOAD INPUT FILE
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

df = pd.read_csv(path, sep=SEP)

# 2) CHECK THAT ALL REQUIRED COLUMNS EXIST
missing_formula_cols = [c for c in FORMULA_COLUMNS if c not in df.columns]
if missing_formula_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing required columns for formula: {missing_formula_cols}"
    )

# 3) INITIALIZE NEW COLUMN WITH NaN
df_out = df.copy()
df_out[NEW_COLUMN] = np.nan

# 4) BUILD MASK WHERE ALL REQUIRED COLUMNS ARE NON-NULL
mask_all_present = df_out[FORMULA_COLUMNS].notna().all(axis=1)

# 5) APPLY FORMULA ONLY WHERE mask_all_present IS TRUE
# We use DataFrame.eval so column names can directly be used in FORMULA_EXPRESSION.
# To avoid evaluating on rows that are not allowed, we apply eval on the subset
# and then assign back.
df_out.loc[mask_all_present, NEW_COLUMN] = df_out.loc[mask_all_present].eval(
    FORMULA_EXPRESSION
)

# 6) SAVE RESULT AS "formula_<INPUT_FILE>" IN THE SAME FOLDER
formula_output_filename = f"{INPUT_FILE}"
formula_output_path = OUTPUT_DIR / formula_output_filename
df_out.to_csv(formula_output_path, sep=SEP, index=False)

print(f"New column '{NEW_COLUMN}' created based on formula:")
print("   ", FORMULA_EXPRESSION)
print("Rows used (all required columns non-null):", mask_all_present.sum())
print("Result was saved to:", formula_output_path)
print(df_out[[*FORMULA_COLUMNS, NEW_COLUMN]].head(20))


New column 'oa' created based on formula:
    at - cce
Rows used (all required columns non-null): 2474750
Result was saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/data_At.txt
             at         cce           oa
0   1084.355949  167.849993   916.505956
1           NaN         NaN          NaN
2   1267.313578  320.816779   946.496799
3           NaN         NaN          NaN
4   1460.696865  167.728265  1292.968600
5           NaN         NaN          NaN
6   1304.537890  110.705844  1193.832046
7           NaN         NaN          NaN
8   1265.820386   70.212302  1195.608084
9           NaN         NaN          NaN
10  1225.163797   40.206606  1184.957191
11          NaN         NaN          NaN
12  1102.461138   45.182074  1057.279064
13          NaN         NaN          NaN
14   945.993798   18.238665   927.755133
15          NaN         NaN          NaN
16          NaN         NaN          NaN
17          NaN         NaN          NaN
18          NaN         N

#### Ol

In [38]:
# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file (INPUT_FILE) from OUTPUT_DIR.
# 2) Creates a new column (NEW_COLUMN) based on a flexible arithmetic formula
#    defined in FORMULA_EXPRESSION using existing columns.
# 3) The formula is only applied to rows where ALL columns listed in
#    FORMULA_COLUMNS are non-null. Otherwise, the result is set to NaN.
# 4) The updated DataFrame is stored in `df_out` and written to disk as
#    "formula_<INPUT_FILE>" in the same folder.
#
# Examples for FORMULA_EXPRESSION:
# - "(at - xt) / ft"
# - "col1 + col2 + col3"
# - "(colA - colB) * colC"
# =============================================================================

from pathlib import Path
import pandas as pd
import numpy as np

# ========================= CONFIG ============================================
OUTPUT_DIR = Path(Temp_file_path_A)
SEP = "|"

INPUT_FILE = "data_At.txt"               # <-- Set your input file name here
NEW_COLUMN = "ol"             # <-- Name of the new computed column
FORMULA_EXPRESSION = "at - ltd - mi - ps - ce"  # <-- Formula using column names
FORMULA_COLUMNS = ["at", "ltd", "mi", "ps", "ce"]   # <-- All columns required to be non-null
# ============================================================================

# 1) LOAD INPUT FILE
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

df = pd.read_csv(path, sep=SEP)

# 2) CHECK THAT ALL REQUIRED COLUMNS EXIST
missing_formula_cols = [c for c in FORMULA_COLUMNS if c not in df.columns]
if missing_formula_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing required columns for formula: {missing_formula_cols}"
    )

# 3) INITIALIZE NEW COLUMN WITH NaN
df_out = df.copy()
df_out[NEW_COLUMN] = np.nan

# 4) BUILD MASK WHERE ALL REQUIRED COLUMNS ARE NON-NULL
mask_all_present = df_out[FORMULA_COLUMNS].notna().all(axis=1)

# 5) APPLY FORMULA ONLY WHERE mask_all_present IS TRUE
# We use DataFrame.eval so column names can directly be used in FORMULA_EXPRESSION.
# To avoid evaluating on rows that are not allowed, we apply eval on the subset
# and then assign back.
df_out.loc[mask_all_present, NEW_COLUMN] = df_out.loc[mask_all_present].eval(
    FORMULA_EXPRESSION
)

# 6) SAVE RESULT AS "formula_<INPUT_FILE>" IN THE SAME FOLDER
formula_output_filename = f"{INPUT_FILE}"
formula_output_path = OUTPUT_DIR / formula_output_filename
df_out.to_csv(formula_output_path, sep=SEP, index=False)

print(f"New column '{NEW_COLUMN}' created based on formula:")
print("   ", FORMULA_EXPRESSION)
print("Rows used (all required columns non-null):", mask_all_present.sum())
print("Result was saved to:", formula_output_path)
print(df_out[[*FORMULA_COLUMNS, NEW_COLUMN]].head(20))


New column 'ol' created based on formula:
    at - ltd - mi - ps - ce
Rows used (all required columns non-null): 1811546
Result was saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/data_At.txt
             at         ltd        mi   ps          ce          ol
0   1084.355949   10.096319  0.000000  0.0  378.411011  695.848619
1           NaN         NaN       NaN  NaN         NaN         NaN
2   1267.313578  106.116741  0.117886  0.0  434.756779  726.322172
3           NaN         NaN       NaN  NaN         NaN         NaN
4   1460.696865  130.770231       NaN  0.0  518.204722         NaN
5           NaN         NaN       NaN  NaN         NaN         NaN
6   1304.537890   32.376442       NaN  0.0  386.142780         NaN
7           NaN         NaN       NaN  NaN         NaN         NaN
8   1265.820386    0.124015       NaN  0.0  387.109014         NaN
9           NaN         NaN       NaN  NaN         NaN         NaN
10  1225.163797   27.130200       NaN  0.0  292.5726

#### Noa

In [39]:
# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file (INPUT_FILE) from OUTPUT_DIR.
# 2) Creates a new column (NEW_COLUMN) based on a flexible arithmetic formula
#    defined in FORMULA_EXPRESSION using existing columns.
# 3) The formula is only applied to rows where ALL columns listed in
#    FORMULA_COLUMNS are non-null. Otherwise, the result is set to NaN.
# 4) The updated DataFrame is stored in `df_out` and written to disk as
#    "formula_<INPUT_FILE>" in the same folder.
#
# Examples for FORMULA_EXPRESSION:
# - "(at - xt) / ft"
# - "col1 + col2 + col3"
# - "(colA - colB) * colC"
# =============================================================================

from pathlib import Path
import pandas as pd
import numpy as np

# ========================= CONFIG ============================================
OUTPUT_DIR = Path(Temp_file_path_A)
SEP = "|"

INPUT_FILE = "data_At.txt"               # <-- Set your input file name here
NEW_COLUMN = "noa"             # <-- Name of the new computed column
FORMULA_EXPRESSION = "oa - ol"  # <-- Formula using column names
FORMULA_COLUMNS = ["oa", "ol"]   # <-- All columns required to be non-null
# ============================================================================

# 1) LOAD INPUT FILE
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

df = pd.read_csv(path, sep=SEP)

# 2) CHECK THAT ALL REQUIRED COLUMNS EXIST
missing_formula_cols = [c for c in FORMULA_COLUMNS if c not in df.columns]
if missing_formula_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing required columns for formula: {missing_formula_cols}"
    )

# 3) INITIALIZE NEW COLUMN WITH NaN
df_out = df.copy()
df_out[NEW_COLUMN] = np.nan

# 4) BUILD MASK WHERE ALL REQUIRED COLUMNS ARE NON-NULL
mask_all_present = df_out[FORMULA_COLUMNS].notna().all(axis=1)

# 5) APPLY FORMULA ONLY WHERE mask_all_present IS TRUE
# We use DataFrame.eval so column names can directly be used in FORMULA_EXPRESSION.
# To avoid evaluating on rows that are not allowed, we apply eval on the subset
# and then assign back.
df_out.loc[mask_all_present, NEW_COLUMN] = df_out.loc[mask_all_present].eval(
    FORMULA_EXPRESSION
)

# 6) SAVE RESULT AS "formula_<INPUT_FILE>" IN THE SAME FOLDER
formula_output_filename = f"{INPUT_FILE}"
formula_output_path = OUTPUT_DIR / formula_output_filename
df_out.to_csv(formula_output_path, sep=SEP, index=False)

print(f"New column '{NEW_COLUMN}' created based on formula:")
print("   ", FORMULA_EXPRESSION)
print("Rows used (all required columns non-null):", mask_all_present.sum())
print("Result was saved to:", formula_output_path)
print(df_out[[*FORMULA_COLUMNS, NEW_COLUMN]].head(20))


New column 'noa' created based on formula:
    oa - ol
Rows used (all required columns non-null): 1676871
Result was saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/data_At.txt
             oa          ol         noa
0    916.505956  695.848619  220.657337
1           NaN         NaN         NaN
2    946.496799  726.322172  220.174627
3           NaN         NaN         NaN
4   1292.968600         NaN         NaN
5           NaN         NaN         NaN
6   1193.832046         NaN         NaN
7           NaN         NaN         NaN
8   1195.608084         NaN         NaN
9           NaN         NaN         NaN
10  1184.957191         NaN         NaN
11          NaN         NaN         NaN
12  1057.279064         NaN         NaN
13          NaN         NaN         NaN
14   927.755133  818.085740  109.669393
15          NaN         NaN         NaN
16          NaN         NaN         NaN
17          NaN         NaN         NaN
18          NaN         NaN         NaN
19  

### Cat

#### Oa

In [40]:
# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file (INPUT_FILE) from OUTPUT_DIR.
# 2) Creates a new column (NEW_COLUMN) based on a flexible arithmetic formula
#    defined in FORMULA_EXPRESSION using existing columns.
# 3) The formula is only applied to rows where ALL columns listed in
#    FORMULA_COLUMNS are non-null. Otherwise, the result is set to NaN.
# 4) The updated DataFrame is stored in `df_out` and written to disk as
#    "formula_<INPUT_FILE>" in the same folder.
#
# Examples for FORMULA_EXPRESSION:
# - "(at - xt) / ft"
# - "col1 + col2 + col3"
# - "(colA - colB) * colC"
# =============================================================================

from pathlib import Path
import pandas as pd
import numpy as np

# ========================= CONFIG ============================================
OUTPUT_DIR = Path(Temp_file_path_A)
SEP = "|"

INPUT_FILE = "data_Cat.txt"               # <-- Set your input file name here
NEW_COLUMN = "oa"             # <-- Name of the new computed column
FORMULA_EXPRESSION = "at - cce"  # <-- Formula using column names
FORMULA_COLUMNS = ["at", "cce"]   # <-- All columns required to be non-null
# ============================================================================

# 1) LOAD INPUT FILE
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

df = pd.read_csv(path, sep=SEP)

# 2) CHECK THAT ALL REQUIRED COLUMNS EXIST
missing_formula_cols = [c for c in FORMULA_COLUMNS if c not in df.columns]
if missing_formula_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing required columns for formula: {missing_formula_cols}"
    )

# 3) INITIALIZE NEW COLUMN WITH NaN
df_out = df.copy()
df_out[NEW_COLUMN] = np.nan

# 4) BUILD MASK WHERE ALL REQUIRED COLUMNS ARE NON-NULL
mask_all_present = df_out[FORMULA_COLUMNS].notna().all(axis=1)

# 5) APPLY FORMULA ONLY WHERE mask_all_present IS TRUE
# We use DataFrame.eval so column names can directly be used in FORMULA_EXPRESSION.
# To avoid evaluating on rows that are not allowed, we apply eval on the subset
# and then assign back.
df_out.loc[mask_all_present, NEW_COLUMN] = df_out.loc[mask_all_present].eval(
    FORMULA_EXPRESSION
)

# 6) SAVE RESULT AS "formula_<INPUT_FILE>" IN THE SAME FOLDER
formula_output_filename = f"{INPUT_FILE}"
formula_output_path = OUTPUT_DIR / formula_output_filename
df_out.to_csv(formula_output_path, sep=SEP, index=False)

print(f"New column '{NEW_COLUMN}' created based on formula:")
print("   ", FORMULA_EXPRESSION)
print("Rows used (all required columns non-null):", mask_all_present.sum())
print("Result was saved to:", formula_output_path)
print(df_out[[*FORMULA_COLUMNS, NEW_COLUMN]].head(20))


New column 'oa' created based on formula:
    at - cce
Rows used (all required columns non-null): 2474750
Result was saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/data_Cat.txt
             at         cce           oa
0   1084.355949  167.849993   916.505956
1           NaN         NaN          NaN
2   1267.313578  320.816779   946.496799
3           NaN         NaN          NaN
4   1460.696865  167.728265  1292.968600
5           NaN         NaN          NaN
6   1304.537890  110.705844  1193.832046
7           NaN         NaN          NaN
8   1265.820386   70.212302  1195.608084
9           NaN         NaN          NaN
10  1225.163797   40.206606  1184.957191
11          NaN         NaN          NaN
12  1102.461138   45.182074  1057.279064
13          NaN         NaN          NaN
14   945.993798   18.238665   927.755133
15          NaN         NaN          NaN
16          NaN         NaN          NaN
17          NaN         NaN          NaN
18          NaN         

#### Ol

In [41]:
# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file (INPUT_FILE) from OUTPUT_DIR.
# 2) Creates a new column (NEW_COLUMN) based on a flexible arithmetic formula
#    defined in FORMULA_EXPRESSION using existing columns.
# 3) The formula is only applied to rows where ALL columns listed in
#    FORMULA_COLUMNS are non-null. Otherwise, the result is set to NaN.
# 4) The updated DataFrame is stored in `df_out` and written to disk as
#    "formula_<INPUT_FILE>" in the same folder.
#
# Examples for FORMULA_EXPRESSION:
# - "(at - xt) / ft"
# - "col1 + col2 + col3"
# - "(colA - colB) * colC"
# =============================================================================

from pathlib import Path
import pandas as pd
import numpy as np

# ========================= CONFIG ============================================
OUTPUT_DIR = Path(Temp_file_path_A)
SEP = "|"

INPUT_FILE = "data_Cat.txt"               # <-- Set your input file name here
NEW_COLUMN = "ol"             # <-- Name of the new computed column
FORMULA_EXPRESSION = "at - ltd - mi - ps - ce"  # <-- Formula using column names
FORMULA_COLUMNS = ["at", "ltd", "mi", "ps", "ce"]   # <-- All columns required to be non-null
# ============================================================================

# 1) LOAD INPUT FILE
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

df = pd.read_csv(path, sep=SEP)

# 2) CHECK THAT ALL REQUIRED COLUMNS EXIST
missing_formula_cols = [c for c in FORMULA_COLUMNS if c not in df.columns]
if missing_formula_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing required columns for formula: {missing_formula_cols}"
    )

# 3) INITIALIZE NEW COLUMN WITH NaN
df_out = df.copy()
df_out[NEW_COLUMN] = np.nan

# 4) BUILD MASK WHERE ALL REQUIRED COLUMNS ARE NON-NULL
mask_all_present = df_out[FORMULA_COLUMNS].notna().all(axis=1)

# 5) APPLY FORMULA ONLY WHERE mask_all_present IS TRUE
# We use DataFrame.eval so column names can directly be used in FORMULA_EXPRESSION.
# To avoid evaluating on rows that are not allowed, we apply eval on the subset
# and then assign back.
df_out.loc[mask_all_present, NEW_COLUMN] = df_out.loc[mask_all_present].eval(
    FORMULA_EXPRESSION
)

# 6) SAVE RESULT AS "formula_<INPUT_FILE>" IN THE SAME FOLDER
formula_output_filename = f"{INPUT_FILE}"
formula_output_path = OUTPUT_DIR / formula_output_filename
df_out.to_csv(formula_output_path, sep=SEP, index=False)

print(f"New column '{NEW_COLUMN}' created based on formula:")
print("   ", FORMULA_EXPRESSION)
print("Rows used (all required columns non-null):", mask_all_present.sum())
print("Result was saved to:", formula_output_path)
print(df_out[[*FORMULA_COLUMNS, NEW_COLUMN]].head(20))


New column 'ol' created based on formula:
    at - ltd - mi - ps - ce
Rows used (all required columns non-null): 1811546
Result was saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/data_Cat.txt
             at         ltd        mi   ps          ce          ol
0   1084.355949   10.096319  0.000000  0.0  378.411011  695.848619
1           NaN         NaN       NaN  NaN         NaN         NaN
2   1267.313578  106.116741  0.117886  0.0  434.756779  726.322172
3           NaN         NaN       NaN  NaN         NaN         NaN
4   1460.696865  130.770231       NaN  0.0  518.204722         NaN
5           NaN         NaN       NaN  NaN         NaN         NaN
6   1304.537890   32.376442       NaN  0.0  386.142780         NaN
7           NaN         NaN       NaN  NaN         NaN         NaN
8   1265.820386    0.124015       NaN  0.0  387.109014         NaN
9           NaN         NaN       NaN  NaN         NaN         NaN
10  1225.163797   27.130200       NaN  0.0  292.572

#### Noa

In [42]:
# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file (INPUT_FILE) from OUTPUT_DIR.
# 2) Creates a new column (NEW_COLUMN) based on a flexible arithmetic formula
#    defined in FORMULA_EXPRESSION using existing columns.
# 3) The formula is only applied to rows where ALL columns listed in
#    FORMULA_COLUMNS are non-null. Otherwise, the result is set to NaN.
# 4) The updated DataFrame is stored in `df_out` and written to disk as
#    "formula_<INPUT_FILE>" in the same folder.
#
# Examples for FORMULA_EXPRESSION:
# - "(at - xt) / ft"
# - "col1 + col2 + col3"
# - "(colA - colB) * colC"
# =============================================================================

from pathlib import Path
import pandas as pd
import numpy as np

# ========================= CONFIG ============================================
OUTPUT_DIR = Path(Temp_file_path_A)
SEP = "|"

INPUT_FILE = "data_Cat.txt"               # <-- Set your input file name here
NEW_COLUMN = "noa"             # <-- Name of the new computed column
FORMULA_EXPRESSION = "oa - ol"  # <-- Formula using column names
FORMULA_COLUMNS = ["oa", "ol"]   # <-- All columns required to be non-null
# ============================================================================

# 1) LOAD INPUT FILE
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

df = pd.read_csv(path, sep=SEP)

# 2) CHECK THAT ALL REQUIRED COLUMNS EXIST
missing_formula_cols = [c for c in FORMULA_COLUMNS if c not in df.columns]
if missing_formula_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing required columns for formula: {missing_formula_cols}"
    )

# 3) INITIALIZE NEW COLUMN WITH NaN
df_out = df.copy()
df_out[NEW_COLUMN] = np.nan

# 4) BUILD MASK WHERE ALL REQUIRED COLUMNS ARE NON-NULL
mask_all_present = df_out[FORMULA_COLUMNS].notna().all(axis=1)

# 5) APPLY FORMULA ONLY WHERE mask_all_present IS TRUE
# We use DataFrame.eval so column names can directly be used in FORMULA_EXPRESSION.
# To avoid evaluating on rows that are not allowed, we apply eval on the subset
# and then assign back.
df_out.loc[mask_all_present, NEW_COLUMN] = df_out.loc[mask_all_present].eval(
    FORMULA_EXPRESSION
)

# 6) SAVE RESULT AS "formula_<INPUT_FILE>" IN THE SAME FOLDER
formula_output_filename = f"{INPUT_FILE}"
formula_output_path = OUTPUT_DIR / formula_output_filename
df_out.to_csv(formula_output_path, sep=SEP, index=False)

print(f"New column '{NEW_COLUMN}' created based on formula:")
print("   ", FORMULA_EXPRESSION)
print("Rows used (all required columns non-null):", mask_all_present.sum())
print("Result was saved to:", formula_output_path)
print(df_out[[*FORMULA_COLUMNS, NEW_COLUMN]].head(20))


New column 'noa' created based on formula:
    oa - ol
Rows used (all required columns non-null): 1676871
Result was saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/data_Cat.txt
             oa          ol         noa
0    916.505956  695.848619  220.657337
1           NaN         NaN         NaN
2    946.496799  726.322172  220.174627
3           NaN         NaN         NaN
4   1292.968600         NaN         NaN
5           NaN         NaN         NaN
6   1193.832046         NaN         NaN
7           NaN         NaN         NaN
8   1195.608084         NaN         NaN
9           NaN         NaN         NaN
10  1184.957191         NaN         NaN
11          NaN         NaN         NaN
12  1057.279064         NaN         NaN
13          NaN         NaN         NaN
14   927.755133  818.085740  109.669393
15          NaN         NaN         NaN
16          NaN         NaN         NaN
17          NaN         NaN         NaN
18          NaN         NaN         NaN
19 

### Cpm

In [43]:
# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file (INPUT_FILE) from OUTPUT_DIR.
# 2) Creates a new column (NEW_COLUMN) based on a flexible arithmetic formula
#    defined in FORMULA_EXPRESSION using existing columns.
# 3) The formula is only applied to rows where ALL columns listed in
#    FORMULA_COLUMNS are non-null. Otherwise, the result is set to NaN.
# 4) The updated DataFrame is stored in `df_out` and written to disk as
#    "formula_<INPUT_FILE>" in the same folder.
#
# Examples for FORMULA_EXPRESSION:
# - "(at - xt) / ft"
# - "col1 + col2 + col3"
# - "(colA - colB) * colC"
# =============================================================================

from pathlib import Path
import pandas as pd
import numpy as np

# ========================= CONFIG ============================================
OUTPUT_DIR = Path(Temp_file_path_A)
SEP = "|"

INPUT_FILE = "data_Cpm.txt"               # <-- Set your input file name here
NEW_COLUMN = "pm"             # <-- Name of the new computed column
FORMULA_EXPRESSION = "(rev - cogs) / rev"  # <-- Formula using column names
FORMULA_COLUMNS = ["rev", "cogs"]   # <-- All columns required to be non-null
# ============================================================================

# 1) LOAD INPUT FILE
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

df = pd.read_csv(path, sep=SEP)

# 2) CHECK THAT ALL REQUIRED COLUMNS EXIST
missing_formula_cols = [c for c in FORMULA_COLUMNS if c not in df.columns]
if missing_formula_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing required columns for formula: {missing_formula_cols}"
    )

# 3) INITIALIZE NEW COLUMN WITH NaN
df_out = df.copy()
df_out[NEW_COLUMN] = np.nan

# 4) BUILD MASK WHERE ALL REQUIRED COLUMNS ARE NON-NULL
mask_all_present = df_out[FORMULA_COLUMNS].notna().all(axis=1)

# 5) APPLY FORMULA ONLY WHERE mask_all_present IS TRUE
# We use DataFrame.eval so column names can directly be used in FORMULA_EXPRESSION.
# To avoid evaluating on rows that are not allowed, we apply eval on the subset
# and then assign back.
df_out.loc[mask_all_present, NEW_COLUMN] = df_out.loc[mask_all_present].eval(
    FORMULA_EXPRESSION
)

# 6) SAVE RESULT AS "formula_<INPUT_FILE>" IN THE SAME FOLDER
formula_output_filename = f"{INPUT_FILE}"
formula_output_path = OUTPUT_DIR / formula_output_filename
df_out.to_csv(formula_output_path, sep=SEP, index=False)

print(f"New column '{NEW_COLUMN}' created based on formula:")
print("   ", FORMULA_EXPRESSION)
print("Rows used (all required columns non-null):", mask_all_present.sum())
print("Result was saved to:", formula_output_path)
print(df_out[[*FORMULA_COLUMNS, NEW_COLUMN]].head(20))


New column 'pm' created based on formula:
    (rev - cogs) / rev
Rows used (all required columns non-null): 1613228
Result was saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/data_Cpm.txt
            rev         cogs        pm
0   1707.334371  1415.675687  0.170827
1   1921.517802  1621.318491  0.156230
2   2263.705199  1968.910031  0.130227
3   1447.331160  1270.982588  0.121844
4   1217.952697  1007.403728  0.172871
5    693.936079   578.488211  0.166367
6    693.936079   512.071639  0.262077
7    609.279252   453.554285  0.255589
8    559.817319          NaN       NaN
9    501.495166   294.514572  0.412727
10          NaN     0.056176       NaN
11          NaN     0.339360       NaN
12     8.063120     6.484731  0.195754
13   212.478635   197.252834  0.071658
14   341.614996   330.298911  0.033125
15   466.963321   343.942030  0.263450
16   473.429402   353.821975  0.252640
17   528.266494   392.873539  0.256297
18   590.330351   435.289043  0.262635
19   567.4943

### Inv

In [44]:
# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file (INPUT_FILE) from OUTPUT_DIR.
# 2) Creates a new column (NEW_COLUMN) based on a flexible arithmetic formula
#    defined in FORMULA_EXPRESSION using existing columns.
# 3) The formula is only applied to rows where ALL columns listed in
#    FORMULA_COLUMNS are non-null. Otherwise, the result is set to NaN.
# 4) The updated DataFrame is stored in `df_out` and written to disk as
#    "formula_<INPUT_FILE>" in the same folder.
#
# Examples for FORMULA_EXPRESSION:
# - "(at - xt) / ft"
# - "col1 + col2 + col3"
# - "(colA - colB) * colC"
# =============================================================================

from pathlib import Path
import pandas as pd
import numpy as np

# ========================= CONFIG ============================================
OUTPUT_DIR = Path(Temp_file_path_A)
SEP = "|"

INPUT_FILE = "data_Inv.txt"               # <-- Set your input file name here
NEW_COLUMN = "ce"             # <-- Name of the new computed column
FORMULA_EXPRESSION = "capex / rev"  # <-- Formula using column names
FORMULA_COLUMNS = ["capex", "rev"]   # <-- All columns required to be non-null
# ============================================================================

# 1) LOAD INPUT FILE
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

df = pd.read_csv(path, sep=SEP)

# 2) CHECK THAT ALL REQUIRED COLUMNS EXIST
missing_formula_cols = [c for c in FORMULA_COLUMNS if c not in df.columns]
if missing_formula_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing required columns for formula: {missing_formula_cols}"
    )

# 3) INITIALIZE NEW COLUMN WITH NaN
df_out = df.copy()
df_out[NEW_COLUMN] = np.nan

# 4) BUILD MASK WHERE ALL REQUIRED COLUMNS ARE NON-NULL
mask_all_present = df_out[FORMULA_COLUMNS].notna().all(axis=1)

# 5) APPLY FORMULA ONLY WHERE mask_all_present IS TRUE
# We use DataFrame.eval so column names can directly be used in FORMULA_EXPRESSION.
# To avoid evaluating on rows that are not allowed, we apply eval on the subset
# and then assign back.
df_out.loc[mask_all_present, NEW_COLUMN] = df_out.loc[mask_all_present].eval(
    FORMULA_EXPRESSION
)

# 6) SAVE RESULT AS "formula_<INPUT_FILE>" IN THE SAME FOLDER
formula_output_filename = f"{INPUT_FILE}"
formula_output_path = OUTPUT_DIR / formula_output_filename
df_out.to_csv(formula_output_path, sep=SEP, index=False)

print(f"New column '{NEW_COLUMN}' created based on formula:")
print("   ", FORMULA_EXPRESSION)
print("Rows used (all required columns non-null):", mask_all_present.sum())
print("Result was saved to:", formula_output_path)
print(df_out[[*FORMULA_COLUMNS, NEW_COLUMN]].head(20))


New column 'ce' created based on formula:
    capex / rev
Rows used (all required columns non-null): 1260123
Result was saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/data_Inv.txt
         capex          rev        ce
0    24.714956  1707.334371  0.014476
1    63.338554  1921.517802  0.032963
2    85.214171  2263.705199  0.037644
3   106.865080  1447.331160  0.073836
4    53.811107  1217.952697  0.044182
5    45.147594   693.936079  0.065060
6    46.740995   609.279252  0.076715
7          NaN   559.817319       NaN
8    23.600276   501.495166  0.047060
9          NaN     8.063120       NaN
10         NaN   212.478635       NaN
11    2.984188   341.614996  0.008736
12   12.138323   466.963321  0.025994
13   21.196735   473.429402  0.044773
14   17.974979   528.266494  0.034026
15   37.822982   590.330351  0.064071
16   24.328241   567.494357  0.042870
17   33.391428   600.805773  0.055578
18   41.963815   672.248976  0.062423
19   41.963815   672.248976  0.062423


### Ltg

#### Noa

In [45]:
# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file (INPUT_FILE) from OUTPUT_DIR.
# 2) Creates a new column (NEW_COLUMN) based on a flexible arithmetic formula
#    defined in FORMULA_EXPRESSION using existing columns.
# 3) The formula is only applied to rows where ALL columns listed in
#    FORMULA_COLUMNS are non-null. Otherwise, the result is set to NaN.
# 4) The updated DataFrame is stored in `df_out` and written to disk as
#    "formula_<INPUT_FILE>" in the same folder.
#
# Examples for FORMULA_EXPRESSION:
# - "(at - xt) / ft"
# - "col1 + col2 + col3"
# - "(colA - colB) * colC"
# =============================================================================

from pathlib import Path
import pandas as pd
import numpy as np

# ========================= CONFIG ============================================
OUTPUT_DIR = Path(Temp_file_path_A)
SEP = "|"

INPUT_FILE = "data_Ltg.txt"               # <-- Set your input file name here
NEW_COLUMN = "noa"             # <-- Name of the new computed column
FORMULA_EXPRESSION = "(ar + inv + oca + ppe + oa - ap - ocl - ol) / at"  # <-- Formula using column names
FORMULA_COLUMNS = ["ar", "inv", "oca", "ppe", "oa", "ap", "ocl", "ol", "at"]   # <-- All columns required to be non-null
# ============================================================================

# 1) LOAD INPUT FILE
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

df = pd.read_csv(path, sep=SEP)

# 2) CHECK THAT ALL REQUIRED COLUMNS EXIST
missing_formula_cols = [c for c in FORMULA_COLUMNS if c not in df.columns]
if missing_formula_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing required columns for formula: {missing_formula_cols}"
    )

# 3) INITIALIZE NEW COLUMN WITH NaN
df_out = df.copy()
df_out[NEW_COLUMN] = np.nan

# 4) BUILD MASK WHERE ALL REQUIRED COLUMNS ARE NON-NULL
mask_all_present = df_out[FORMULA_COLUMNS].notna().all(axis=1)

# 5) APPLY FORMULA ONLY WHERE mask_all_present IS TRUE
# We use DataFrame.eval so column names can directly be used in FORMULA_EXPRESSION.
# To avoid evaluating on rows that are not allowed, we apply eval on the subset
# and then assign back.
df_out.loc[mask_all_present, NEW_COLUMN] = df_out.loc[mask_all_present].eval(
    FORMULA_EXPRESSION
)

# 6) SAVE RESULT AS "formula_<INPUT_FILE>" IN THE SAME FOLDER
formula_output_filename = f"{INPUT_FILE}"
formula_output_path = OUTPUT_DIR / formula_output_filename
df_out.to_csv(formula_output_path, sep=SEP, index=False)

print(f"New column '{NEW_COLUMN}' created based on formula:")
print("   ", FORMULA_EXPRESSION)
print("Rows used (all required columns non-null):", mask_all_present.sum())
print("Result was saved to:", formula_output_path)
print(df_out[[*FORMULA_COLUMNS, NEW_COLUMN]].head(20))


New column 'noa' created based on formula:
    (ar + inv + oca + ppe + oa - ap - ocl - ol) / at
Rows used (all required columns non-null): 1111788
Result was saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/data_Ltg.txt
            ar         inv        oca         ppe         oa          ap  \
0   341.028066  237.409600   1.852706  247.049311  34.493544  380.421036   
1          NaN         NaN        NaN         NaN        NaN         NaN   
2   329.248663  201.182443  13.658856  334.155739  45.444753  369.240530   
3          NaN         NaN        NaN         NaN        NaN         NaN   
4   490.270995  289.925405  27.954926  391.012073  59.092809  401.581395   
5          NaN         NaN        NaN         NaN        NaN         NaN   
6   390.538840  223.628192  37.516906  466.898780  27.082113  228.545754   
7          NaN         NaN        NaN         NaN        NaN         NaN   
8   377.209917  177.096926  40.665488  418.603235  57.544877  264.087772   
9 

#### Wc

In [46]:
# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file (INPUT_FILE) from OUTPUT_DIR.
# 2) Creates a new column (NEW_COLUMN) based on a flexible arithmetic formula
#    defined in FORMULA_EXPRESSION using existing columns.
# 3) The formula is only applied to rows where ALL columns listed in
#    FORMULA_COLUMNS are non-null. Otherwise, the result is set to NaN.
# 4) The updated DataFrame is stored in `df_out` and written to disk as
#    "formula_<INPUT_FILE>" in the same folder.
#
# Examples for FORMULA_EXPRESSION:
# - "(at - xt) / ft"
# - "col1 + col2 + col3"
# - "(colA - colB) * colC"
# =============================================================================

from pathlib import Path
import pandas as pd
import numpy as np

# ========================= CONFIG ============================================
OUTPUT_DIR = Path(Temp_file_path_A)
SEP = "|"

INPUT_FILE = "data_Ltg.txt"               # <-- Set your input file name here
NEW_COLUMN = "wc"             # <-- Name of the new computed column
FORMULA_EXPRESSION = "ca - cce - cl + std + itp - da"  # <-- Formula using column names
FORMULA_COLUMNS = ["ca", "cce", "cl", "std", "itp", "da"]   # <-- All columns required to be non-null
# ============================================================================

# 1) LOAD INPUT FILE
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

df = pd.read_csv(path, sep=SEP)

# 2) CHECK THAT ALL REQUIRED COLUMNS EXIST
missing_formula_cols = [c for c in FORMULA_COLUMNS if c not in df.columns]
if missing_formula_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing required columns for formula: {missing_formula_cols}"
    )

# 3) INITIALIZE NEW COLUMN WITH NaN
df_out = df.copy()
df_out[NEW_COLUMN] = np.nan

# 4) BUILD MASK WHERE ALL REQUIRED COLUMNS ARE NON-NULL
mask_all_present = df_out[FORMULA_COLUMNS].notna().all(axis=1)

# 5) APPLY FORMULA ONLY WHERE mask_all_present IS TRUE
# We use DataFrame.eval so column names can directly be used in FORMULA_EXPRESSION.
# To avoid evaluating on rows that are not allowed, we apply eval on the subset
# and then assign back.
df_out.loc[mask_all_present, NEW_COLUMN] = df_out.loc[mask_all_present].eval(
    FORMULA_EXPRESSION
)

# 6) SAVE RESULT AS "formula_<INPUT_FILE>" IN THE SAME FOLDER
formula_output_filename = f"{INPUT_FILE}"
formula_output_path = OUTPUT_DIR / formula_output_filename
df_out.to_csv(formula_output_path, sep=SEP, index=False)

print(f"New column '{NEW_COLUMN}' created based on formula:")
print("   ", FORMULA_EXPRESSION)
print("Rows used (all required columns non-null):", mask_all_present.sum())
print("Result was saved to:", formula_output_path)
print(df_out[[*FORMULA_COLUMNS, NEW_COLUMN]].head(20))


New column 'wc' created based on formula:
    ca - cce - cl + std + itp - da
Rows used (all required columns non-null): 805508
Result was saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/data_Ltg.txt
            ca         cce          cl         std        itp         da  \
0   748.140365  167.849993  663.515421   54.137880  10.582409  38.094417   
1          NaN         NaN         NaN         NaN        NaN  38.094417   
2   864.906741  320.816779  693.189519   31.500444  12.508193  68.245694   
3          NaN         NaN         NaN         NaN        NaN  68.245694   
4   975.879591  167.728265  773.728435  104.964605   8.751102  89.546780   
5          NaN         NaN         NaN         NaN        NaN  89.546780   
6   762.389782  110.705844  852.251409  364.770602   9.041866  98.504938   
7          NaN         NaN         NaN         NaN        NaN  98.504938   
8   665.184633   70.212302  841.844876  293.363860   9.454759  84.007512   
9          NaN        

### Nca

#### Ivao (Special Case for Sum)

In [47]:
# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file (INPUT_FILE) from OUTPUT_DIR.
# 2) Creates a new column (NEW_COLUMN) based on a flexible arithmetic formula
#    defined in FORMULA_EXPRESSION using existing columns.
# 3) The formula is only applied to rows where ALL columns listed in
#    FORMULA_COLUMNS are non-null. Otherwise, the result is set to NaN.
# 4) The updated DataFrame is stored in `df_out` and written to disk as
#    "formula_<INPUT_FILE>" in the same folder.
#
# Examples for FORMULA_EXPRESSION:
# - "(at - xt) / ft"
# - "col1 + col2 + col3"
# - "(colA - colB) * colC"
# =============================================================================

from pathlib import Path
import pandas as pd
import numpy as np

# ========================= CONFIG ============================================
OUTPUT_DIR = Path(Temp_file_path_A)
SEP = "|"

INPUT_FILE = "data_Nca.txt"               # <-- Set your input file name here
NEW_COLUMN = "ivao"             # <-- Name of the new computed column
FORMULA_EXPRESSION = "IVAO_oi + IVAO_iac + IVAO_ltr + IVAO_isdfl + IVAO_uol"  # <-- Formula using column names
FORMULA_COLUMNS = ["IVAO_oi", "IVAO_iac", "IVAO_ltr", "IVAO_isdfl", "IVAO_uol"]   # <-- All columns required to be non-null
# ============================================================================

# 1) LOAD INPUT FILE
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

df = pd.read_csv(path, sep=SEP)

# 2) CHECK THAT ALL REQUIRED COLUMNS EXIST
missing_formula_cols = [c for c in FORMULA_COLUMNS if c not in df.columns]
if missing_formula_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing required columns for formula: {missing_formula_cols}"
    )

# 3) COPY DF AND APPLY FORMULA USING 0 FOR NaN VALUES
df_out = df.copy()

# Fill only the necessary columns with 0 for the formula evaluation
df_filled = df_out[FORMULA_COLUMNS].fillna(0)

# Evaluate formula on df_filled
df_out[NEW_COLUMN] = df_filled.eval(FORMULA_EXPRESSION)

# 4) SAVE RESULT AS "<INPUT_FILE>" IN THE SAME FOLDER
output_path = OUTPUT_DIR / INPUT_FILE
df_out.to_csv(output_path, sep=SEP, index=False)

print(f"New column '{NEW_COLUMN}' created based on formula (NaN treated as 0):")
print("   ", FORMULA_EXPRESSION)
print("Rows processed:", len(df_out))
print("Result was saved to:", output_path)
print(df_out[[*FORMULA_COLUMNS, NEW_COLUMN]].head(20))


New column 'ivao' created based on formula (NaN treated as 0):
    IVAO_oi + IVAO_iac + IVAO_ltr + IVAO_isdfl + IVAO_uol
Rows processed: 3108973
Result was saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/data_Nca.txt
      IVAO_oi   IVAO_iac   IVAO_ltr  IVAO_isdfl  IVAO_uol        ivao
0         NaN  52.290086   2.382643         NaN       NaN   54.672729
1         NaN  12.353737  10.452608         NaN       NaN   22.806345
2         NaN  15.324648  19.387744         NaN       NaN   34.712392
3         NaN  14.287399  33.879816         NaN       NaN   48.167215
4         NaN  74.116172  50.371469         NaN       NaN  124.487641
5         NaN  65.308715  28.137731         NaN       NaN   93.446446
6         NaN  73.291530  40.098311         NaN       NaN  113.389841
7         NaN        NaN        NaN         NaN       NaN    0.000000
8         NaN  43.957836  33.687150         NaN       NaN   77.644986
9         NaN   0.000000        NaN         NaN       NaN    0.0

#### Oa

In [48]:
# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file (INPUT_FILE) from OUTPUT_DIR.
# 2) Creates a new column (NEW_COLUMN) based on a flexible arithmetic formula
#    defined in FORMULA_EXPRESSION using existing columns.
# 3) The formula is only applied to rows where ALL columns listed in
#    FORMULA_COLUMNS are non-null. Otherwise, the result is set to NaN.
# 4) The updated DataFrame is stored in `df_out` and written to disk as
#    "formula_<INPUT_FILE>" in the same folder.
#
# Examples for FORMULA_EXPRESSION:
# - "(at - xt) / ft"
# - "col1 + col2 + col3"
# - "(colA - colB) * colC"
# =============================================================================

from pathlib import Path
import pandas as pd
import numpy as np

# ========================= CONFIG ============================================
OUTPUT_DIR = Path(Temp_file_path_A)
SEP = "|"

INPUT_FILE = "data_Nca.txt"               # <-- Set your input file name here
NEW_COLUMN = "oa"             # <-- Name of the new computed column
FORMULA_EXPRESSION = "at - ca - ivao - lt + std + ltd"  # <-- Formula using column names
FORMULA_COLUMNS = ["at", "ca", "ivao", "lt", "std", "ltd"]   # <-- All columns required to be non-null
# ============================================================================

# 1) LOAD INPUT FILE
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

df = pd.read_csv(path, sep=SEP)

# 2) CHECK THAT ALL REQUIRED COLUMNS EXIST
missing_formula_cols = [c for c in FORMULA_COLUMNS if c not in df.columns]
if missing_formula_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing required columns for formula: {missing_formula_cols}"
    )

# 3) INITIALIZE NEW COLUMN WITH NaN
df_out = df.copy()
df_out[NEW_COLUMN] = np.nan

# 4) BUILD MASK WHERE ALL REQUIRED COLUMNS ARE NON-NULL
mask_all_present = df_out[FORMULA_COLUMNS].notna().all(axis=1)

# 5) APPLY FORMULA ONLY WHERE mask_all_present IS TRUE
# We use DataFrame.eval so column names can directly be used in FORMULA_EXPRESSION.
# To avoid evaluating on rows that are not allowed, we apply eval on the subset
# and then assign back.
df_out.loc[mask_all_present, NEW_COLUMN] = df_out.loc[mask_all_present].eval(
    FORMULA_EXPRESSION
)

# 6) SAVE RESULT AS "formula_<INPUT_FILE>" IN THE SAME FOLDER
formula_output_filename = f"{INPUT_FILE}"
formula_output_path = OUTPUT_DIR / formula_output_filename
df_out.to_csv(formula_output_path, sep=SEP, index=False)

print(f"New column '{NEW_COLUMN}' created based on formula:")
print("   ", FORMULA_EXPRESSION)
print("Rows used (all required columns non-null):", mask_all_present.sum())
print("Result was saved to:", formula_output_path)
print(df_out[[*FORMULA_COLUMNS, NEW_COLUMN]].head(20))


New column 'oa' created based on formula:
    at - ca - ivao - lt + std + ltd
Rows used (all required columns non-null): 2093162
Result was saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/data_Nca.txt
             at          ca        ivao           lt         std         ltd  \
0   1084.355949  748.140365   54.672729   705.944938   54.137880   10.096319   
1   1267.313578  864.906741   22.806345   832.438913   31.500444  106.116741   
2   1460.696865  975.879591   34.712392   942.485069  104.964605  130.770231   
3   1304.537890  762.389782   48.167215   918.389404  364.770602   32.376442   
4   1265.820386  665.184633  124.487641   878.706712  293.363860    0.124015   
5   1225.163797  615.713580   93.446446   932.585018  354.393108   27.130200   
6   1102.461138  486.157681  113.389841  1029.779408  427.774122   18.295417   
7    945.993798  433.210510    0.000000   833.640821  196.429136   15.555081   
8    945.993798  433.210510   77.644986   833.640821  196.42

### Noa

#### Oa

In [49]:
# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file (INPUT_FILE) from OUTPUT_DIR.
# 2) Creates a new column (NEW_COLUMN) based on a flexible arithmetic formula
#    defined in FORMULA_EXPRESSION using existing columns.
# 3) The formula is only applied to rows where ALL columns listed in
#    FORMULA_COLUMNS are non-null. Otherwise, the result is set to NaN.
# 4) The updated DataFrame is stored in `df_out` and written to disk as
#    "formula_<INPUT_FILE>" in the same folder.
#
# Examples for FORMULA_EXPRESSION:
# - "(at - xt) / ft"
# - "col1 + col2 + col3"
# - "(colA - colB) * colC"
# =============================================================================

from pathlib import Path
import pandas as pd
import numpy as np

# ========================= CONFIG ============================================
OUTPUT_DIR = Path(Temp_file_path_A)
SEP = "|"

INPUT_FILE = "data_Noa.txt"               # <-- Set your input file name here
NEW_COLUMN = "oa"             # <-- Name of the new computed column
FORMULA_EXPRESSION = "at - cce"  # <-- Formula using column names
FORMULA_COLUMNS = ["at", "cce"]   # <-- All columns required to be non-null
# ============================================================================

# 1) LOAD INPUT FILE
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

df = pd.read_csv(path, sep=SEP)

# 2) CHECK THAT ALL REQUIRED COLUMNS EXIST
missing_formula_cols = [c for c in FORMULA_COLUMNS if c not in df.columns]
if missing_formula_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing required columns for formula: {missing_formula_cols}"
    )

# 3) INITIALIZE NEW COLUMN WITH NaN
df_out = df.copy()
df_out[NEW_COLUMN] = np.nan

# 4) BUILD MASK WHERE ALL REQUIRED COLUMNS ARE NON-NULL
mask_all_present = df_out[FORMULA_COLUMNS].notna().all(axis=1)

# 5) APPLY FORMULA ONLY WHERE mask_all_present IS TRUE
# We use DataFrame.eval so column names can directly be used in FORMULA_EXPRESSION.
# To avoid evaluating on rows that are not allowed, we apply eval on the subset
# and then assign back.
df_out.loc[mask_all_present, NEW_COLUMN] = df_out.loc[mask_all_present].eval(
    FORMULA_EXPRESSION
)

# 6) SAVE RESULT AS "formula_<INPUT_FILE>" IN THE SAME FOLDER
formula_output_filename = f"{INPUT_FILE}"
formula_output_path = OUTPUT_DIR / formula_output_filename
df_out.to_csv(formula_output_path, sep=SEP, index=False)

print(f"New column '{NEW_COLUMN}' created based on formula:")
print("   ", FORMULA_EXPRESSION)
print("Rows used (all required columns non-null):", mask_all_present.sum())
print("Result was saved to:", formula_output_path)
print(df_out[[*FORMULA_COLUMNS, NEW_COLUMN]].head(20))


New column 'oa' created based on formula:
    at - cce
Rows used (all required columns non-null): 2474750
Result was saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/data_Noa.txt
             at         cce           oa
0   1084.355949  167.849993   916.505956
1   1267.313578  320.816779   946.496799
2   1460.696865  167.728265  1292.968600
3   1304.537890  110.705844  1193.832046
4   1265.820386   70.212302  1195.608084
5   1225.163797   40.206606  1184.957191
6   1102.461138   45.182074  1057.279064
7    945.993798   18.238665   927.755133
8           NaN         NaN          NaN
9           NaN         NaN          NaN
10    23.439270    0.346747    23.092523
11   577.207234    6.375740   570.831494
12  1132.139668   14.410069  1117.729599
13  1242.711417   24.837275  1217.874142
14  1373.608080   36.502948  1337.105132
15  1263.968574   30.218984  1233.749590
16  1301.804038   26.510843  1275.293195
17   820.980063   11.388751   809.591312
18   903.394131   29.985

#### Ol

In [50]:
# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file (INPUT_FILE) from OUTPUT_DIR.
# 2) Creates a new column (NEW_COLUMN) based on a flexible arithmetic formula
#    defined in FORMULA_EXPRESSION using existing columns.
# 3) The formula is only applied to rows where ALL columns listed in
#    FORMULA_COLUMNS are non-null. Otherwise, the result is set to NaN.
# 4) The updated DataFrame is stored in `df_out` and written to disk as
#    "formula_<INPUT_FILE>" in the same folder.
#
# Examples for FORMULA_EXPRESSION:
# - "(at - xt) / ft"
# - "col1 + col2 + col3"
# - "(colA - colB) * colC"
# =============================================================================

from pathlib import Path
import pandas as pd
import numpy as np

# ========================= CONFIG ============================================
OUTPUT_DIR = Path(Temp_file_path_A)
SEP = "|"

INPUT_FILE = "data_Noa.txt"               # <-- Set your input file name here
NEW_COLUMN = "ol"             # <-- Name of the new computed column
FORMULA_EXPRESSION = "at - ltd - mi - ps - ce"  # <-- Formula using column names
FORMULA_COLUMNS = ["at", "ltd", "mi", "ps", "ce"]   # <-- All columns required to be non-null
# ============================================================================

# 1) LOAD INPUT FILE
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

df = pd.read_csv(path, sep=SEP)

# 2) CHECK THAT ALL REQUIRED COLUMNS EXIST
missing_formula_cols = [c for c in FORMULA_COLUMNS if c not in df.columns]
if missing_formula_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing required columns for formula: {missing_formula_cols}"
    )

# 3) INITIALIZE NEW COLUMN WITH NaN
df_out = df.copy()
df_out[NEW_COLUMN] = np.nan

# 4) BUILD MASK WHERE ALL REQUIRED COLUMNS ARE NON-NULL
mask_all_present = df_out[FORMULA_COLUMNS].notna().all(axis=1)

# 5) APPLY FORMULA ONLY WHERE mask_all_present IS TRUE
# We use DataFrame.eval so column names can directly be used in FORMULA_EXPRESSION.
# To avoid evaluating on rows that are not allowed, we apply eval on the subset
# and then assign back.
df_out.loc[mask_all_present, NEW_COLUMN] = df_out.loc[mask_all_present].eval(
    FORMULA_EXPRESSION
)

# 6) SAVE RESULT AS "formula_<INPUT_FILE>" IN THE SAME FOLDER
formula_output_filename = f"{INPUT_FILE}"
formula_output_path = OUTPUT_DIR / formula_output_filename
df_out.to_csv(formula_output_path, sep=SEP, index=False)

print(f"New column '{NEW_COLUMN}' created based on formula:")
print("   ", FORMULA_EXPRESSION)
print("Rows used (all required columns non-null):", mask_all_present.sum())
print("Result was saved to:", formula_output_path)
print(df_out[[*FORMULA_COLUMNS, NEW_COLUMN]].head(20))


New column 'ol' created based on formula:
    at - ltd - mi - ps - ce
Rows used (all required columns non-null): 1811546
Result was saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/data_Noa.txt
             at         ltd        mi        ps          ce          ol
0   1084.355949   10.096319  0.000000  0.000000  378.411011  695.848619
1   1267.313578  106.116741  0.117886  0.000000  434.756779  726.322172
2   1460.696865  130.770231       NaN  0.000000  518.204722         NaN
3   1304.537890   32.376442       NaN  0.000000  386.142780         NaN
4   1265.820386    0.124015       NaN  0.000000  387.109014         NaN
5   1225.163797   27.130200       NaN  0.000000  292.572607         NaN
6   1102.461138   18.295417       NaN  0.000000   72.674506         NaN
7    945.993798   15.555081  0.000000  0.000000  112.352977  818.085740
8           NaN    0.042286  0.000000       NaN    0.129111         NaN
9           NaN    0.167289  0.000000       NaN    0.613125         

#### Noa

In [51]:
# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file (INPUT_FILE) from OUTPUT_DIR.
# 2) Creates a new column (NEW_COLUMN) based on a flexible arithmetic formula
#    defined in FORMULA_EXPRESSION using existing columns.
# 3) The formula is only applied to rows where ALL columns listed in
#    FORMULA_COLUMNS are non-null. Otherwise, the result is set to NaN.
# 4) The updated DataFrame is stored in `df_out` and written to disk as
#    "formula_<INPUT_FILE>" in the same folder.
#
# Examples for FORMULA_EXPRESSION:
# - "(at - xt) / ft"
# - "col1 + col2 + col3"
# - "(colA - colB) * colC"
# =============================================================================

from pathlib import Path
import pandas as pd
import numpy as np

# ========================= CONFIG ============================================
OUTPUT_DIR = Path(Temp_file_path_A)
SEP = "|"

INPUT_FILE = "data_Noa.txt"               # <-- Set your input file name here
NEW_COLUMN = "noa"             # <-- Name of the new computed column
FORMULA_EXPRESSION = "oa - ol"  # <-- Formula using column names
FORMULA_COLUMNS = ["oa", "ol"]   # <-- All columns required to be non-null
# ============================================================================

# 1) LOAD INPUT FILE
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

df = pd.read_csv(path, sep=SEP)

# 2) CHECK THAT ALL REQUIRED COLUMNS EXIST
missing_formula_cols = [c for c in FORMULA_COLUMNS if c not in df.columns]
if missing_formula_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing required columns for formula: {missing_formula_cols}"
    )

# 3) INITIALIZE NEW COLUMN WITH NaN
df_out = df.copy()
df_out[NEW_COLUMN] = np.nan

# 4) BUILD MASK WHERE ALL REQUIRED COLUMNS ARE NON-NULL
mask_all_present = df_out[FORMULA_COLUMNS].notna().all(axis=1)

# 5) APPLY FORMULA ONLY WHERE mask_all_present IS TRUE
# We use DataFrame.eval so column names can directly be used in FORMULA_EXPRESSION.
# To avoid evaluating on rows that are not allowed, we apply eval on the subset
# and then assign back.
df_out.loc[mask_all_present, NEW_COLUMN] = df_out.loc[mask_all_present].eval(
    FORMULA_EXPRESSION
)

# 6) SAVE RESULT AS "formula_<INPUT_FILE>" IN THE SAME FOLDER
formula_output_filename = f"{INPUT_FILE}"
formula_output_path = OUTPUT_DIR / formula_output_filename
df_out.to_csv(formula_output_path, sep=SEP, index=False)

print(f"New column '{NEW_COLUMN}' created based on formula:")
print("   ", FORMULA_EXPRESSION)
print("Rows used (all required columns non-null):", mask_all_present.sum())
print("Result was saved to:", formula_output_path)
print(df_out[[*FORMULA_COLUMNS, NEW_COLUMN]].head(20))


New column 'noa' created based on formula:
    oa - ol
Rows used (all required columns non-null): 1676871
Result was saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/data_Noa.txt
             oa          ol         noa
0    916.505956  695.848619  220.657337
1    946.496799  726.322172  220.174627
2   1292.968600         NaN         NaN
3   1193.832046         NaN         NaN
4   1195.608084         NaN         NaN
5   1184.957191         NaN         NaN
6   1057.279064         NaN         NaN
7    927.755133  818.085740  109.669393
8           NaN         NaN         NaN
9           NaN         NaN         NaN
10    23.092523         NaN         NaN
11   570.831494  114.275464  456.556030
12  1117.729599  297.337037  820.392562
13  1217.874142  284.431322  933.442820
14  1337.105132  383.020060  954.085072
15  1233.749590  318.885597  914.863993
16  1275.293195  310.640199  964.652996
17   809.591312  298.013730  511.577582
18   873.408968  245.839375  627.569593
19 

### Nwc

In [52]:
# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file (INPUT_FILE) from OUTPUT_DIR.
# 2) Creates a new column (NEW_COLUMN) based on a flexible arithmetic formula
#    defined in FORMULA_EXPRESSION using existing columns.
# 3) The formula is only applied to rows where ALL columns listed in
#    FORMULA_COLUMNS are non-null. Otherwise, the result is set to NaN.
# 4) The updated DataFrame is stored in `df_out` and written to disk as
#    "formula_<INPUT_FILE>" in the same folder.
#
# Examples for FORMULA_EXPRESSION:
# - "(at - xt) / ft"
# - "col1 + col2 + col3"
# - "(colA - colB) * colC"
# =============================================================================

from pathlib import Path
import pandas as pd
import numpy as np

# ========================= CONFIG ============================================
OUTPUT_DIR = Path(Temp_file_path_A)
SEP = "|"

INPUT_FILE = "data_Nwc.txt"               # <-- Set your input file name here
NEW_COLUMN = "nwc"             # <-- Name of the new computed column
FORMULA_EXPRESSION = "ca - cce - cl + std"  # <-- Formula using column names
FORMULA_COLUMNS = ["ca", "cce", "cl", "std"]   # <-- All columns required to be non-null
# ============================================================================

# 1) LOAD INPUT FILE
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

df = pd.read_csv(path, sep=SEP)

# 2) CHECK THAT ALL REQUIRED COLUMNS EXIST
missing_formula_cols = [c for c in FORMULA_COLUMNS if c not in df.columns]
if missing_formula_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing required columns for formula: {missing_formula_cols}"
    )

# 3) INITIALIZE NEW COLUMN WITH NaN
df_out = df.copy()
df_out[NEW_COLUMN] = np.nan

# 4) BUILD MASK WHERE ALL REQUIRED COLUMNS ARE NON-NULL
mask_all_present = df_out[FORMULA_COLUMNS].notna().all(axis=1)

# 5) APPLY FORMULA ONLY WHERE mask_all_present IS TRUE
# We use DataFrame.eval so column names can directly be used in FORMULA_EXPRESSION.
# To avoid evaluating on rows that are not allowed, we apply eval on the subset
# and then assign back.
df_out.loc[mask_all_present, NEW_COLUMN] = df_out.loc[mask_all_present].eval(
    FORMULA_EXPRESSION
)

# 6) SAVE RESULT AS "formula_<INPUT_FILE>" IN THE SAME FOLDER
formula_output_filename = f"{INPUT_FILE}"
formula_output_path = OUTPUT_DIR / formula_output_filename
df_out.to_csv(formula_output_path, sep=SEP, index=False)

print(f"New column '{NEW_COLUMN}' created based on formula:")
print("   ", FORMULA_EXPRESSION)
print("Rows used (all required columns non-null):", mask_all_present.sum())
print("Result was saved to:", formula_output_path)
print(df_out[[*FORMULA_COLUMNS, NEW_COLUMN]].head(20))


New column 'nwc' created based on formula:
    ca - cce - cl + std
Rows used (all required columns non-null): 1923094
Result was saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/data_Nwc.txt
            ca         cce          cl         std         nwc
0   748.140365  167.849993  663.515421   54.137880  -29.087169
1   864.906741  320.816779  693.189519   31.500444 -117.599113
2   975.879591  167.728265  773.728435  104.964605  139.387496
3   762.389782  110.705844  852.251409  364.770602  164.203131
4   665.184633   70.212302  841.844876  293.363860   46.491315
5   615.713580   40.206606  876.621681  354.393108   53.278401
6   486.157681   45.182074  978.028694  427.774122 -109.278965
7   433.210510   18.238665  777.957861  196.429136 -166.556880
8     0.031517         NaN    0.019879         NaN         NaN
9     0.148791         NaN    0.136090         NaN         NaN
10    4.175760    0.346747    3.380897         NaN         NaN
11   75.003871    6.375740  114.275

### Osc

#### Ifo (Special Case for Sum)

In [53]:
# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file (INPUT_FILE) from OUTPUT_DIR.
# 2) Creates a new column (NEW_COLUMN) based on a flexible arithmetic formula
#    defined in FORMULA_EXPRESSION using existing columns.
# 3) The formula is only applied to rows where ALL columns listed in
#    FORMULA_COLUMNS are non-null. Otherwise, the result is set to NaN.
# 4) The updated DataFrame is stored in `df_out` and written to disk as
#    "formula_<INPUT_FILE>" in the same folder.
#
# Examples for FORMULA_EXPRESSION:
# - "(at - xt) / ft"
# - "col1 + col2 + col3"
# - "(colA - colB) * colC"
# =============================================================================

from pathlib import Path
import pandas as pd
import numpy as np

# ========================= CONFIG ============================================
OUTPUT_DIR = Path(Temp_file_path_A)
SEP = "|"

INPUT_FILE = "data_Osc.txt"               # <-- Set your input file name here
NEW_COLUMN = "ifo"             # <-- Name of the new computed column
FORMULA_EXPRESSION = "IFO_ffo + IFO_ei + IFO_dofa + IFO_ffooa"  # <-- Formula using column names
FORMULA_COLUMNS = ["IFO_ffo", "IFO_ei", "IFO_dofa", "IFO_ffooa"]   # <-- All columns required to be non-null
# ============================================================================

# 1) LOAD INPUT FILE
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

df = pd.read_csv(path, sep=SEP)

# 2) CHECK THAT ALL REQUIRED COLUMNS EXIST
missing_formula_cols = [c for c in FORMULA_COLUMNS if c not in df.columns]
if missing_formula_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing required columns for formula: {missing_formula_cols}"
    )

# 3) COPY DF AND APPLY FORMULA USING 0 FOR NaN VALUES
df_out = df.copy()

# Fill only the necessary columns with 0 for the formula evaluation
df_filled = df_out[FORMULA_COLUMNS].fillna(0)

# Evaluate formula on df_filled
df_out[NEW_COLUMN] = df_filled.eval(FORMULA_EXPRESSION)

# 4) SAVE RESULT AS "<INPUT_FILE>" IN THE SAME FOLDER
output_path = OUTPUT_DIR / INPUT_FILE
df_out.to_csv(output_path, sep=SEP, index=False)

print(f"New column '{NEW_COLUMN}' created based on formula (NaN treated as 0):")
print("   ", FORMULA_EXPRESSION)
print("Rows processed:", len(df_out))
print("Result was saved to:", output_path)
print(df_out[[*FORMULA_COLUMNS, NEW_COLUMN]].head(20))


New column 'ifo' created based on formula (NaN treated as 0):
    IFO_ffo + IFO_ei + IFO_dofa + IFO_ffooa
Rows processed: 4406065
Result was saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/data_Osc.txt
       IFO_ffo     IFO_ei  IFO_dofa   IFO_ffooa         ifo
0    70.253522   0.000000       0.0  -87.320244  -17.066722
1    70.253522   0.000000       0.0  -87.320244  -17.066722
2   240.925386  -8.619558       0.0  200.312706  432.618534
3   240.925386  -8.619558       0.0  200.312706  432.618534
4   317.659199 -25.858138       0.0   47.756416  339.557477
5   317.659199 -25.858138       0.0   47.756416  339.557477
6   112.426119 -83.558991       0.0  361.917728  390.784856
7   112.426119 -83.558991       0.0  361.917728  390.784856
8   129.039000 -13.904000       0.0  246.214426  361.349426
9   129.039000 -13.904000       0.0  246.214426  361.349426
10  132.910697 -17.775388       0.0  246.214426  361.349735
11         NaN -16.955000       0.0  168.673517  151.718517

### Roe

In [54]:
# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file (INPUT_FILE) from OUTPUT_DIR.
# 2) Creates a new column (NEW_COLUMN) based on a flexible arithmetic formula
#    defined in FORMULA_EXPRESSION using existing columns.
# 3) The formula is only applied to rows where ALL columns listed in
#    FORMULA_COLUMNS are non-null. Otherwise, the result is set to NaN.
# 4) The updated DataFrame is stored in `df_out` and written to disk as
#    "formula_<INPUT_FILE>" in the same folder.
#
# Examples for FORMULA_EXPRESSION:
# - "(at - xt) / ft"
# - "col1 + col2 + col3"
# - "(colA - colB) * colC"
# =============================================================================

from pathlib import Path
import pandas as pd
import numpy as np

# ========================= CONFIG ============================================
OUTPUT_DIR = Path(Temp_file_path_A)
SEP = "|"

INPUT_FILE = "data_Roe.txt"               # <-- Set your input file name here
NEW_COLUMN = "be"             # <-- Name of the new computed column
FORMULA_EXPRESSION = "ce + dt"  # <-- Formula using column names
FORMULA_COLUMNS = ["ce", "dt"]   # <-- All columns required to be non-null
# ============================================================================

# 1) LOAD INPUT FILE
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

df = pd.read_csv(path, sep=SEP)

# 2) CHECK THAT ALL REQUIRED COLUMNS EXIST
missing_formula_cols = [c for c in FORMULA_COLUMNS if c not in df.columns]
if missing_formula_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing required columns for formula: {missing_formula_cols}"
    )

# 3) INITIALIZE NEW COLUMN WITH NaN
df_out = df.copy()
df_out[NEW_COLUMN] = np.nan

# 4) BUILD MASK WHERE ALL REQUIRED COLUMNS ARE NON-NULL
mask_all_present = df_out[FORMULA_COLUMNS].notna().all(axis=1)

# 5) APPLY FORMULA ONLY WHERE mask_all_present IS TRUE
# We use DataFrame.eval so column names can directly be used in FORMULA_EXPRESSION.
# To avoid evaluating on rows that are not allowed, we apply eval on the subset
# and then assign back.
df_out.loc[mask_all_present, NEW_COLUMN] = df_out.loc[mask_all_present].eval(
    FORMULA_EXPRESSION
)

# 6) SAVE RESULT AS "formula_<INPUT_FILE>" IN THE SAME FOLDER
formula_output_filename = f"{INPUT_FILE}"
formula_output_path = OUTPUT_DIR / formula_output_filename
df_out.to_csv(formula_output_path, sep=SEP, index=False)

print(f"New column '{NEW_COLUMN}' created based on formula:")
print("   ", FORMULA_EXPRESSION)
print("Rows used (all required columns non-null):", mask_all_present.sum())
print("Result was saved to:", formula_output_path)
print(df_out[[*FORMULA_COLUMNS, NEW_COLUMN]].head(20))


New column 'be' created based on formula:
    ce + dt
Rows used (all required columns non-null): 1629708
Result was saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/data_Roe.txt
            ce        dt          be
0   378.411011       NaN         NaN
1          NaN       NaN         NaN
2   434.756779       NaN         NaN
3          NaN       NaN         NaN
4   518.204722       NaN         NaN
5          NaN       NaN         NaN
6   386.142780       NaN         NaN
7          NaN       NaN         NaN
8   387.109014       NaN         NaN
9          NaN       NaN         NaN
10  292.572607  0.000000  292.572607
11         NaN       NaN         NaN
12   72.674506  1.902010   74.576516
13         NaN       NaN         NaN
14  112.352977  1.252426  113.605403
15         NaN       NaN         NaN
16         NaN       NaN         NaN
17    0.129111       NaN         NaN
18         NaN       NaN         NaN
19    0.613125       NaN         NaN


### Sg

In [55]:
# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file (INPUT_FILE) from OUTPUT_DIR.
# 2) Creates a new column (NEW_COLUMN) based on a flexible arithmetic formula
#    defined in FORMULA_EXPRESSION using existing columns.
# 3) The formula is only applied to rows where ALL columns listed in
#    FORMULA_COLUMNS are non-null. Otherwise, the result is set to NaN.
# 4) The updated DataFrame is stored in `df_out` and written to disk as
#    "formula_<INPUT_FILE>" in the same folder.
#
# Examples for FORMULA_EXPRESSION:
# - "(at - xt) / ft"
# - "col1 + col2 + col3"
# - "(colA - colB) * colC"
# =============================================================================

from pathlib import Path
import pandas as pd
import numpy as np

# ========================= CONFIG ============================================
OUTPUT_DIR = Path(Temp_file_path_A)
SEP = "|"

INPUT_FILE = "data_Sg.txt"               # <-- Set your input file name here
NEW_COLUMN = "be"             # <-- Name of the new computed column
FORMULA_EXPRESSION = "ce + dt"  # <-- Formula using column names
FORMULA_COLUMNS = ["ce", "dt"]   # <-- All columns required to be non-null
# ============================================================================

# 1) LOAD INPUT FILE
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

df = pd.read_csv(path, sep=SEP)

# 2) CHECK THAT ALL REQUIRED COLUMNS EXIST
missing_formula_cols = [c for c in FORMULA_COLUMNS if c not in df.columns]
if missing_formula_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing required columns for formula: {missing_formula_cols}"
    )

# 3) INITIALIZE NEW COLUMN WITH NaN
df_out = df.copy()
df_out[NEW_COLUMN] = np.nan

# 4) BUILD MASK WHERE ALL REQUIRED COLUMNS ARE NON-NULL
mask_all_present = df_out[FORMULA_COLUMNS].notna().all(axis=1)

# 5) APPLY FORMULA ONLY WHERE mask_all_present IS TRUE
# We use DataFrame.eval so column names can directly be used in FORMULA_EXPRESSION.
# To avoid evaluating on rows that are not allowed, we apply eval on the subset
# and then assign back.
df_out.loc[mask_all_present, NEW_COLUMN] = df_out.loc[mask_all_present].eval(
    FORMULA_EXPRESSION
)

# 6) SAVE RESULT AS "formula_<INPUT_FILE>" IN THE SAME FOLDER
formula_output_filename = f"{INPUT_FILE}"
formula_output_path = OUTPUT_DIR / formula_output_filename
df_out.to_csv(formula_output_path, sep=SEP, index=False)

print(f"New column '{NEW_COLUMN}' created based on formula:")
print("   ", FORMULA_EXPRESSION)
print("Rows used (all required columns non-null):", mask_all_present.sum())
print("Result was saved to:", formula_output_path)
print(df_out[[*FORMULA_COLUMNS, NEW_COLUMN]].head(20))


New column 'be' created based on formula:
    ce + dt
Rows used (all required columns non-null): 1629708
Result was saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/data_Sg.txt
            ce        dt          be
0   378.411011       NaN         NaN
1   434.756779       NaN         NaN
2   518.204722       NaN         NaN
3   386.142780       NaN         NaN
4   387.109014       NaN         NaN
5   292.572607  0.000000  292.572607
6    72.674506  1.902010   74.576516
7   112.352977  1.252426  113.605403
8     0.129111       NaN         NaN
9     0.613125       NaN         NaN
10   16.638718       NaN         NaN
11  437.845511       NaN         NaN
12  805.404692  0.000000  805.404692
13  899.303060       NaN         NaN
14  920.553592       NaN         NaN
15  742.381585       NaN         NaN
16  796.586597       NaN         NaN
17  320.569078  0.000000  320.569078
18  350.219678  0.000000  350.219678
19  419.494337  8.414000  427.908337


## Add Required Lagged Columns

### Acc

In [56]:
from pathlib import Path
import pandas as pd

# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file from OUTPUT_DIR.
# 2) Computes, for each column in VALUE_COLUMNS, a lagged column "<col>_lag1".
#    - For each (ID, HistCurrency) group, sorted by PIT Date and FiscalPeriod,
#      "<col>_lag1" is the most recent known value of <col> for
#      FiscalPeriod - 1 *and the same AnnPITValue_Period* based on all PIT
#      updates observed up to that row.
#      Example: 2020 Q1 will lag to 2019 Q1 (same period label), not to any
#      value from 2020.
# 3) Keeps all rows (no deletions), only converts types and appends lag columns.
# 4) Saves the resulting DataFrame to the same folder as:
#       processed_<INPUT_FILE>
#    e.g. INPUT_FILE = "ag.txt"  ->  "processed_ag.txt"
#
# The final result is stored in the variable `result`.
# =============================================================================

# ================= CONFIG =================
OUTPUT_DIR = Path(Temp_file_path_A)   # Base directory for input/output files
SEP = "|"                             # Delimiter used in the text files

INPUT_FILE = "data_Acc.txt"          # Name of the input file to read
VALUE_COLUMNS = ["at", "wc"]         # Columns for which to compute the lag "<col>_lag1"

PERIOD_COL = "AnnPITValue_Period"    # NEW: period label used for lag (Q1, Q2, A, etc.)
# ==========================================


# =============================================================================
# 1) LOAD INPUT FILE (NO ROWS DROPPED)
# =============================================================================
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

# Read raw data
df = pd.read_csv(path, sep=SEP)

# Required base columns for the PIT logic
required_base_cols = ["ID", "PIT Date", "HistCurrency", "FiscalPeriod", PERIOD_COL]
missing_base = [c for c in required_base_cols if c not in df.columns]
if missing_base:
    raise ValueError(f"{INPUT_FILE}: missing required base columns {missing_base}")

# Check that all requested value columns exist
missing_value_cols = [c for c in VALUE_COLUMNS if c not in df.columns]
if missing_value_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing value columns specified in VALUE_COLUMNS: {missing_value_cols}"
    )

# Type casting only, no row drops
df["PIT Date"] = pd.to_datetime(df["PIT Date"], errors="coerce")
df["FiscalPeriod"] = pd.to_numeric(df["FiscalPeriod"], errors="coerce")
# Make sure period labels are strings (or NaN)
df[PERIOD_COL] = df[PERIOD_COL].astype("string")

# Work on a copy to keep the original df untouched
result = df.copy()

print("Input dataset loaded into `result` (no rows dropped).")
print("Columns in result:")
print(list(result.columns))


# =============================================================================
# 2) COMPUTE LAG COLUMNS "<col>_lag1" FOR EACH COLUMN IN VALUE_COLUMNS
# =============================================================================
# NEW LOGIC:
# For each group (ID, HistCurrency) along the PIT timeline:
# - We maintain, for each (FiscalPeriod, AnnPITValue_Period) pair, the latest PIT
#   and latest known values for VALUE_COLUMNS.
# - For a row with (FP = t, period = P), the lag "<col>_lag1" is the last known
#   value of <col> for (FP = t-1, period = P) based on all PIT updates observed
#   up to (and including) the current PIT Date.
#
# If AnnPITValue_Period is missing, we fall back to using only FiscalPeriod
# (i.e., we treat the period label as None).

df_calc = result.copy()

def compute_lags_for_group(group: pd.DataFrame) -> pd.DataFrame:
    """
    Compute lagged values "<col>_lag1" for all columns in VALUE_COLUMNS
    within a single (ID, HistCurrency) group, using:
      - FiscalPeriod (year), and
      - AnnPITValue_Period (period label) for matching the lag.
    """
    # Sort chronologically so updates are processed in correct temporal order
    group = group.sort_values(["PIT Date", "FiscalPeriod"], ascending=[True, True])

    # last_for_fp stores, for each (FiscalPeriod, PeriodLabel), the latest PIT
    # and the known values for VALUE_COLUMNS.
    # Structure:
    #   last_for_fp = {
    #       (fp, period_label): (last_pit, {col: last_value_for_col, ...})
    #   }
    last_for_fp = {}

    # For each value column, we collect the lag values row by row
    lag_values = {col: [] for col in VALUE_COLUMNS}

    # Iterate through rows in PIT and FiscalPeriod order
    for _, row in group.iterrows():
        pit = row["PIT Date"]
        fp = row["FiscalPeriod"]
        period_label_raw = row.get(PERIOD_COL, pd.NA)
        # Use None as key for missing labels to keep keys hashable and consistent
        period_label = None if pd.isna(period_label_raw) else str(period_label_raw)

        # 1) Determine lag values for each column in VALUE_COLUMNS
        for col in VALUE_COLUMNS:
            lag_val = None
            if pd.notna(fp):
                target_fp = fp - 1
                target_key = (target_fp, period_label)
                info = last_for_fp.get(target_key)
                if info is not None:
                    # info[1] is the dict of last known values for that target (FP, period_label)
                    lag_val = info[1].get(col)
            lag_values[col].append(lag_val)

        # 2) Update our knowledge for the current (FiscalPeriod, period_label)
        if pd.notna(fp) and pd.notna(pit):
            key_curr = (fp, period_label)
            prev = last_for_fp.get(key_curr)

            # Previously stored values for this key (if any)
            prev_values = prev[1] if prev is not None else {}

            # Create a copy so we do not mutate the previous dict reference
            updated_values = dict(prev_values)

            # Update with any non-NaN values from this row
            for col in VALUE_COLUMNS:
                v = row[col]
                if pd.notna(v):
                    updated_values[col] = v

            # Only overwrite if this PIT is newer or equal to the previous PIT
            if prev is None or pit >= prev[0]:
                last_for_fp[key_curr] = (pit, updated_values)

    # 3) Attach lag columns "<col>_lag1" to the group DataFrame
    for col in VALUE_COLUMNS:
        group[f"{col}_lag1"] = lag_values[col]

    return group


# Apply the lag computation per (ID, HistCurrency) group
df_calc = (
    df_calc
    .groupby(["ID", "HistCurrency"], dropna=False, group_keys=False)
    .apply(compute_lags_for_group)
)

# Optionally convert FiscalPeriod back to Int64 (nullable integer)
df_calc["FiscalPeriod"] = pd.to_numeric(df_calc["FiscalPeriod"], errors="coerce").astype("Int64")

# Final result
result = df_calc

print("\nLag columns created and added to `result`:")
print([c for c in result.columns if any(c == v or c == f"{v}_lag1" for v in VALUE_COLUMNS)])

# Optional preview
cols_to_show = [
    "ID", "HistCurrency", "PIT Date", "FiscalPeriod", PERIOD_COL
] + VALUE_COLUMNS + [f"{c}_lag1" for c in VALUE_COLUMNS]
print(result[cols_to_show].head(40))


# =============================================================================
# 3) SAVE RESULTING FILE AS "processed_<INPUT_FILE>" IN THE SAME FOLDER
# =============================================================================
output_filename = f"processed_{INPUT_FILE}"
output_path = OUTPUT_DIR / output_filename

result.to_csv(output_path, sep=SEP, index=False)

print(f"\nProcessed file saved to: {output_path}")


Input dataset loaded into `result` (no rows dropped).
Columns in result:
['ID', 'PIT Date', 'HistCurrency', 'FiscalPeriod', 'AnnPITValue_Period', 'at', 'ca', 'cce', 'cl', 'std', 'itp', 'da', 'wc']


  .apply(compute_lags_for_group)
  .apply(compute_lags_for_group)



Lag columns created and added to `result`:
['at', 'wc', 'at_lag1', 'wc_lag1']
           ID HistCurrency   PIT Date  FiscalPeriod AnnPITValue_Period  \
0   C02500770          Ars 1995-12-29          1992                  A   
1   C02500770          Ars 1995-12-29          1992               <NA>   
2   C02500770          Ars 1995-12-29          1993                  A   
3   C02500770          Ars 1995-12-29          1993               <NA>   
4   C02500770          Ars 1995-12-29          1994                  A   
5   C02500770          Ars 1995-12-29          1994               <NA>   
6   C02500770          Ars 1996-05-03          1995                  A   
7   C02500770          Ars 1996-05-03          1995               <NA>   
8   C02500770          Ars 1998-07-03          1996                  A   
9   C02500770          Ars 1998-07-03          1996               <NA>   
10  C02500770          Ars 1998-07-03          1997                  A   
11  C02500770          Ars 1998-0

### Ag

In [57]:
from pathlib import Path
import pandas as pd

# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file from OUTPUT_DIR.
# 2) Computes, for each column in VALUE_COLUMNS, a lagged column "<col>_lag1".
#    - For each (ID, HistCurrency) group, sorted by PIT Date and FiscalPeriod,
#      "<col>_lag1" is the most recent known value of <col> for
#      FiscalPeriod - 1 *and the same AnnPITValue_Period* based on all PIT
#      updates observed up to that row.
#      Example: 2020 Q1 will lag to 2019 Q1 (same period label), not to any
#      value from 2020.
# 3) Keeps all rows (no deletions), only converts types and appends lag columns.
# 4) Saves the resulting DataFrame to the same folder as:
#       processed_<INPUT_FILE>
#    e.g. INPUT_FILE = "ag.txt"  ->  "processed_ag.txt"
#
# The final result is stored in the variable `result`.
# =============================================================================

# ================= CONFIG =================
OUTPUT_DIR = Path(Temp_file_path_A)   # Base directory for input/output files
SEP = "|"                             # Delimiter used in the text files

INPUT_FILE = "data_Ag.txt"                 # Name of the input file to read
VALUE_COLUMNS = ["at"]         # Columns for which to compute the lag "<col>_lag1"

PERIOD_COL = "AnnPITValue_Period"    # NEW: period label used for lag (Q1, Q2, A, etc.)
# ==========================================


# =============================================================================
# 1) LOAD INPUT FILE (NO ROWS DROPPED)
# =============================================================================
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

# Read raw data
df = pd.read_csv(path, sep=SEP)

# Required base columns for the PIT logic
required_base_cols = ["ID", "PIT Date", "HistCurrency", "FiscalPeriod", PERIOD_COL]
missing_base = [c for c in required_base_cols if c not in df.columns]
if missing_base:
    raise ValueError(f"{INPUT_FILE}: missing required base columns {missing_base}")

# Check that all requested value columns exist
missing_value_cols = [c for c in VALUE_COLUMNS if c not in df.columns]
if missing_value_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing value columns specified in VALUE_COLUMNS: {missing_value_cols}"
    )

# Type casting only, no row drops
df["PIT Date"] = pd.to_datetime(df["PIT Date"], errors="coerce")
df["FiscalPeriod"] = pd.to_numeric(df["FiscalPeriod"], errors="coerce")
# Make sure period labels are strings (or NaN)
df[PERIOD_COL] = df[PERIOD_COL].astype("string")

# Work on a copy to keep the original df untouched
result = df.copy()

print("Input dataset loaded into `result` (no rows dropped).")
print("Columns in result:")
print(list(result.columns))


# =============================================================================
# 2) COMPUTE LAG COLUMNS "<col>_lag1" FOR EACH COLUMN IN VALUE_COLUMNS
# =============================================================================
# NEW LOGIC:
# For each group (ID, HistCurrency) along the PIT timeline:
# - We maintain, for each (FiscalPeriod, AnnPITValue_Period) pair, the latest PIT
#   and latest known values for VALUE_COLUMNS.
# - For a row with (FP = t, period = P), the lag "<col>_lag1" is the last known
#   value of <col> for (FP = t-1, period = P) based on all PIT updates observed
#   up to (and including) the current PIT Date.
#
# If AnnPITValue_Period is missing, we fall back to using only FiscalPeriod
# (i.e., we treat the period label as None).

df_calc = result.copy()

def compute_lags_for_group(group: pd.DataFrame) -> pd.DataFrame:
    """
    Compute lagged values "<col>_lag1" for all columns in VALUE_COLUMNS
    within a single (ID, HistCurrency) group, using:
      - FiscalPeriod (year), and
      - AnnPITValue_Period (period label) for matching the lag.
    """
    # Sort chronologically so updates are processed in correct temporal order
    group = group.sort_values(["PIT Date", "FiscalPeriod"], ascending=[True, True])

    # last_for_fp stores, for each (FiscalPeriod, PeriodLabel), the latest PIT
    # and the known values for VALUE_COLUMNS.
    # Structure:
    #   last_for_fp = {
    #       (fp, period_label): (last_pit, {col: last_value_for_col, ...})
    #   }
    last_for_fp = {}

    # For each value column, we collect the lag values row by row
    lag_values = {col: [] for col in VALUE_COLUMNS}

    # Iterate through rows in PIT and FiscalPeriod order
    for _, row in group.iterrows():
        pit = row["PIT Date"]
        fp = row["FiscalPeriod"]
        period_label_raw = row.get(PERIOD_COL, pd.NA)
        # Use None as key for missing labels to keep keys hashable and consistent
        period_label = None if pd.isna(period_label_raw) else str(period_label_raw)

        # 1) Determine lag values for each column in VALUE_COLUMNS
        for col in VALUE_COLUMNS:
            lag_val = None
            if pd.notna(fp):
                target_fp = fp - 1
                target_key = (target_fp, period_label)
                info = last_for_fp.get(target_key)
                if info is not None:
                    # info[1] is the dict of last known values for that target (FP, period_label)
                    lag_val = info[1].get(col)
            lag_values[col].append(lag_val)

        # 2) Update our knowledge for the current (FiscalPeriod, period_label)
        if pd.notna(fp) and pd.notna(pit):
            key_curr = (fp, period_label)
            prev = last_for_fp.get(key_curr)

            # Previously stored values for this key (if any)
            prev_values = prev[1] if prev is not None else {}

            # Create a copy so we do not mutate the previous dict reference
            updated_values = dict(prev_values)

            # Update with any non-NaN values from this row
            for col in VALUE_COLUMNS:
                v = row[col]
                if pd.notna(v):
                    updated_values[col] = v

            # Only overwrite if this PIT is newer or equal to the previous PIT
            if prev is None or pit >= prev[0]:
                last_for_fp[key_curr] = (pit, updated_values)

    # 3) Attach lag columns "<col>_lag1" to the group DataFrame
    for col in VALUE_COLUMNS:
        group[f"{col}_lag1"] = lag_values[col]

    return group


# Apply the lag computation per (ID, HistCurrency) group
df_calc = (
    df_calc
    .groupby(["ID", "HistCurrency"], dropna=False, group_keys=False)
    .apply(compute_lags_for_group)
)

# Optionally convert FiscalPeriod back to Int64 (nullable integer)
df_calc["FiscalPeriod"] = pd.to_numeric(df_calc["FiscalPeriod"], errors="coerce").astype("Int64")

# Final result
result = df_calc

print("\nLag columns created and added to `result`:")
print([c for c in result.columns if any(c == v or c == f"{v}_lag1" for v in VALUE_COLUMNS)])

# Optional preview
cols_to_show = [
    "ID", "HistCurrency", "PIT Date", "FiscalPeriod", PERIOD_COL
] + VALUE_COLUMNS + [f"{c}_lag1" for c in VALUE_COLUMNS]
print(result[cols_to_show].head(40))


# =============================================================================
# 3) SAVE RESULTING FILE AS "processed_<INPUT_FILE>" IN THE SAME FOLDER
# =============================================================================
output_filename = f"processed_{INPUT_FILE}"
output_path = OUTPUT_DIR / output_filename

result.to_csv(output_path, sep=SEP, index=False)

print(f"\nProcessed file saved to: {output_path}")


Input dataset loaded into `result` (no rows dropped).
Columns in result:
['ID', 'PIT Date', 'HistCurrency', 'FiscalPeriod', 'AnnPITValue_Period', 'at']


  .apply(compute_lags_for_group)
  .apply(compute_lags_for_group)



Lag columns created and added to `result`:
['at', 'at_lag1']
           ID HistCurrency   PIT Date  FiscalPeriod AnnPITValue_Period  \
0   C02500770          Ars 1995-12-29          1992                  A   
1   C02500770          Ars 1995-12-29          1993                  A   
2   C02500770          Ars 1995-12-29          1994                  A   
3   C02500770          Ars 1996-05-03          1995                  A   
4   C02500770          Ars 1998-07-03          1996                  A   
5   C02500770          Ars 1998-07-03          1997                  A   
6   C02500770          Ars 1999-10-01          1998                  A   
7   C02500770          Ars 2000-05-19          1999                  A   
8   C02520200          Ars 1996-05-03          1989                  A   
9   C02520200          Ars 1996-05-03          1990                  A   
10  C02520200          Ars 1996-05-03          1991                  A   
11  C02520200          Ars 1996-05-03          199

### At

In [58]:
from pathlib import Path
import pandas as pd

# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file from OUTPUT_DIR.
# 2) Computes, for each column in VALUE_COLUMNS, a lagged column "<col>_lag1".
#    - For each (ID, HistCurrency) group, sorted by PIT Date and FiscalPeriod,
#      "<col>_lag1" is the most recent known value of <col> for
#      FiscalPeriod - 1 *and the same AnnPITValue_Period* based on all PIT
#      updates observed up to that row.
#      Example: 2020 Q1 will lag to 2019 Q1 (same period label), not to any
#      value from 2020.
# 3) Keeps all rows (no deletions), only converts types and appends lag columns.
# 4) Saves the resulting DataFrame to the same folder as:
#       processed_<INPUT_FILE>
#    e.g. INPUT_FILE = "ag.txt"  ->  "processed_ag.txt"
#
# The final result is stored in the variable `result`.
# =============================================================================

# ================= CONFIG =================
OUTPUT_DIR = Path(Temp_file_path_A)   # Base directory for input/output files
SEP = "|"                             # Delimiter used in the text files

INPUT_FILE = "data_At.txt"                 # Name of the input file to read
VALUE_COLUMNS = ["noa"]         # Columns for which to compute the lag "<col>_lag1"

PERIOD_COL = "AnnPITValue_Period"    # NEW: period label used for lag (Q1, Q2, A, etc.)
# ==========================================


# =============================================================================
# 1) LOAD INPUT FILE (NO ROWS DROPPED)
# =============================================================================
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

# Read raw data
df = pd.read_csv(path, sep=SEP)

# Required base columns for the PIT logic
required_base_cols = ["ID", "PIT Date", "HistCurrency", "FiscalPeriod", PERIOD_COL]
missing_base = [c for c in required_base_cols if c not in df.columns]
if missing_base:
    raise ValueError(f"{INPUT_FILE}: missing required base columns {missing_base}")

# Check that all requested value columns exist
missing_value_cols = [c for c in VALUE_COLUMNS if c not in df.columns]
if missing_value_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing value columns specified in VALUE_COLUMNS: {missing_value_cols}"
    )

# Type casting only, no row drops
df["PIT Date"] = pd.to_datetime(df["PIT Date"], errors="coerce")
df["FiscalPeriod"] = pd.to_numeric(df["FiscalPeriod"], errors="coerce")
# Make sure period labels are strings (or NaN)
df[PERIOD_COL] = df[PERIOD_COL].astype("string")

# Work on a copy to keep the original df untouched
result = df.copy()

print("Input dataset loaded into `result` (no rows dropped).")
print("Columns in result:")
print(list(result.columns))


# =============================================================================
# 2) COMPUTE LAG COLUMNS "<col>_lag1" FOR EACH COLUMN IN VALUE_COLUMNS
# =============================================================================
# NEW LOGIC:
# For each group (ID, HistCurrency) along the PIT timeline:
# - We maintain, for each (FiscalPeriod, AnnPITValue_Period) pair, the latest PIT
#   and latest known values for VALUE_COLUMNS.
# - For a row with (FP = t, period = P), the lag "<col>_lag1" is the last known
#   value of <col> for (FP = t-1, period = P) based on all PIT updates observed
#   up to (and including) the current PIT Date.
#
# If AnnPITValue_Period is missing, we fall back to using only FiscalPeriod
# (i.e., we treat the period label as None).

df_calc = result.copy()

def compute_lags_for_group(group: pd.DataFrame) -> pd.DataFrame:
    """
    Compute lagged values "<col>_lag1" for all columns in VALUE_COLUMNS
    within a single (ID, HistCurrency) group, using:
      - FiscalPeriod (year), and
      - AnnPITValue_Period (period label) for matching the lag.
    """
    # Sort chronologically so updates are processed in correct temporal order
    group = group.sort_values(["PIT Date", "FiscalPeriod"], ascending=[True, True])

    # last_for_fp stores, for each (FiscalPeriod, PeriodLabel), the latest PIT
    # and the known values for VALUE_COLUMNS.
    # Structure:
    #   last_for_fp = {
    #       (fp, period_label): (last_pit, {col: last_value_for_col, ...})
    #   }
    last_for_fp = {}

    # For each value column, we collect the lag values row by row
    lag_values = {col: [] for col in VALUE_COLUMNS}

    # Iterate through rows in PIT and FiscalPeriod order
    for _, row in group.iterrows():
        pit = row["PIT Date"]
        fp = row["FiscalPeriod"]
        period_label_raw = row.get(PERIOD_COL, pd.NA)
        # Use None as key for missing labels to keep keys hashable and consistent
        period_label = None if pd.isna(period_label_raw) else str(period_label_raw)

        # 1) Determine lag values for each column in VALUE_COLUMNS
        for col in VALUE_COLUMNS:
            lag_val = None
            if pd.notna(fp):
                target_fp = fp - 1
                target_key = (target_fp, period_label)
                info = last_for_fp.get(target_key)
                if info is not None:
                    # info[1] is the dict of last known values for that target (FP, period_label)
                    lag_val = info[1].get(col)
            lag_values[col].append(lag_val)

        # 2) Update our knowledge for the current (FiscalPeriod, period_label)
        if pd.notna(fp) and pd.notna(pit):
            key_curr = (fp, period_label)
            prev = last_for_fp.get(key_curr)

            # Previously stored values for this key (if any)
            prev_values = prev[1] if prev is not None else {}

            # Create a copy so we do not mutate the previous dict reference
            updated_values = dict(prev_values)

            # Update with any non-NaN values from this row
            for col in VALUE_COLUMNS:
                v = row[col]
                if pd.notna(v):
                    updated_values[col] = v

            # Only overwrite if this PIT is newer or equal to the previous PIT
            if prev is None or pit >= prev[0]:
                last_for_fp[key_curr] = (pit, updated_values)

    # 3) Attach lag columns "<col>_lag1" to the group DataFrame
    for col in VALUE_COLUMNS:
        group[f"{col}_lag1"] = lag_values[col]

    return group


# Apply the lag computation per (ID, HistCurrency) group
df_calc = (
    df_calc
    .groupby(["ID", "HistCurrency"], dropna=False, group_keys=False)
    .apply(compute_lags_for_group)
)

# Optionally convert FiscalPeriod back to Int64 (nullable integer)
df_calc["FiscalPeriod"] = pd.to_numeric(df_calc["FiscalPeriod"], errors="coerce").astype("Int64")

# Final result
result = df_calc

print("\nLag columns created and added to `result`:")
print([c for c in result.columns if any(c == v or c == f"{v}_lag1" for v in VALUE_COLUMNS)])

# Optional preview
cols_to_show = [
    "ID", "HistCurrency", "PIT Date", "FiscalPeriod", PERIOD_COL
] + VALUE_COLUMNS + [f"{c}_lag1" for c in VALUE_COLUMNS]
print(result[cols_to_show].head(40))


# =============================================================================
# 3) SAVE RESULTING FILE AS "processed_<INPUT_FILE>" IN THE SAME FOLDER
# =============================================================================
output_filename = f"processed_{INPUT_FILE}"
output_path = OUTPUT_DIR / output_filename

result.to_csv(output_path, sep=SEP, index=False)

print(f"\nProcessed file saved to: {output_path}")


Input dataset loaded into `result` (no rows dropped).
Columns in result:
['ID', 'PIT Date', 'HistCurrency', 'FiscalPeriod', 'AnnPITValue_Period', 'rev', 'at', 'cce', 'ltd', 'mi', 'ps', 'ce', 'oa', 'ol', 'noa']


  .apply(compute_lags_for_group)
  .apply(compute_lags_for_group)



Lag columns created and added to `result`:
['noa', 'noa_lag1']
           ID HistCurrency   PIT Date  FiscalPeriod AnnPITValue_Period  \
0   C02500770          Ars 1995-12-29          1992                  A   
1   C02500770          Ars 1995-12-29          1992               <NA>   
2   C02500770          Ars 1995-12-29          1993                  A   
3   C02500770          Ars 1995-12-29          1993               <NA>   
4   C02500770          Ars 1995-12-29          1994                  A   
5   C02500770          Ars 1995-12-29          1994               <NA>   
6   C02500770          Ars 1996-05-03          1995                  A   
7   C02500770          Ars 1996-05-03          1995               <NA>   
8   C02500770          Ars 1998-07-03          1996                  A   
9   C02500770          Ars 1998-07-03          1996               <NA>   
10  C02500770          Ars 1998-07-03          1997                  A   
11  C02500770          Ars 1998-07-03          1

### Cat

In [59]:
from pathlib import Path
import pandas as pd

# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file from OUTPUT_DIR.
# 2) Computes, for each column in VALUE_COLUMNS, a lagged column "<col>_lag1".
#    - For each (ID, HistCurrency) group, sorted by PIT Date and FiscalPeriod,
#      "<col>_lag1" is the most recent known value of <col> for
#      FiscalPeriod - 1 *and the same AnnPITValue_Period* based on all PIT
#      updates observed up to that row.
#      Example: 2020 Q1 will lag to 2019 Q1 (same period label), not to any
#      value from 2020.
# 3) Keeps all rows (no deletions), only converts types and appends lag columns.
# 4) Saves the resulting DataFrame to the same folder as:
#       processed_<INPUT_FILE>
#    e.g. INPUT_FILE = "ag.txt"  ->  "processed_ag.txt"
#
# The final result is stored in the variable `result`.
# =============================================================================

# ================= CONFIG =================
OUTPUT_DIR = Path(Temp_file_path_A)   # Base directory for input/output files
SEP = "|"                             # Delimiter used in the text files

INPUT_FILE = "data_Cat.txt"                 # Name of the input file to read
VALUE_COLUMNS = ["noa"]         # Columns for which to compute the lag "<col>_lag1"

PERIOD_COL = "AnnPITValue_Period"    # NEW: period label used for lag (Q1, Q2, A, etc.)
# ==========================================


# =============================================================================
# 1) LOAD INPUT FILE (NO ROWS DROPPED)
# =============================================================================
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

# Read raw data
df = pd.read_csv(path, sep=SEP)

# Required base columns for the PIT logic
required_base_cols = ["ID", "PIT Date", "HistCurrency", "FiscalPeriod", PERIOD_COL]
missing_base = [c for c in required_base_cols if c not in df.columns]
if missing_base:
    raise ValueError(f"{INPUT_FILE}: missing required base columns {missing_base}")

# Check that all requested value columns exist
missing_value_cols = [c for c in VALUE_COLUMNS if c not in df.columns]
if missing_value_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing value columns specified in VALUE_COLUMNS: {missing_value_cols}"
    )

# Type casting only, no row drops
df["PIT Date"] = pd.to_datetime(df["PIT Date"], errors="coerce")
df["FiscalPeriod"] = pd.to_numeric(df["FiscalPeriod"], errors="coerce")
# Make sure period labels are strings (or NaN)
df[PERIOD_COL] = df[PERIOD_COL].astype("string")

# Work on a copy to keep the original df untouched
result = df.copy()

print("Input dataset loaded into `result` (no rows dropped).")
print("Columns in result:")
print(list(result.columns))


# =============================================================================
# 2) COMPUTE LAG COLUMNS "<col>_lag1" FOR EACH COLUMN IN VALUE_COLUMNS
# =============================================================================
# NEW LOGIC:
# For each group (ID, HistCurrency) along the PIT timeline:
# - We maintain, for each (FiscalPeriod, AnnPITValue_Period) pair, the latest PIT
#   and latest known values for VALUE_COLUMNS.
# - For a row with (FP = t, period = P), the lag "<col>_lag1" is the last known
#   value of <col> for (FP = t-1, period = P) based on all PIT updates observed
#   up to (and including) the current PIT Date.
#
# If AnnPITValue_Period is missing, we fall back to using only FiscalPeriod
# (i.e., we treat the period label as None).

df_calc = result.copy()

def compute_lags_for_group(group: pd.DataFrame) -> pd.DataFrame:
    """
    Compute lagged values "<col>_lag1" for all columns in VALUE_COLUMNS
    within a single (ID, HistCurrency) group, using:
      - FiscalPeriod (year), and
      - AnnPITValue_Period (period label) for matching the lag.
    """
    # Sort chronologically so updates are processed in correct temporal order
    group = group.sort_values(["PIT Date", "FiscalPeriod"], ascending=[True, True])

    # last_for_fp stores, for each (FiscalPeriod, PeriodLabel), the latest PIT
    # and the known values for VALUE_COLUMNS.
    # Structure:
    #   last_for_fp = {
    #       (fp, period_label): (last_pit, {col: last_value_for_col, ...})
    #   }
    last_for_fp = {}

    # For each value column, we collect the lag values row by row
    lag_values = {col: [] for col in VALUE_COLUMNS}

    # Iterate through rows in PIT and FiscalPeriod order
    for _, row in group.iterrows():
        pit = row["PIT Date"]
        fp = row["FiscalPeriod"]
        period_label_raw = row.get(PERIOD_COL, pd.NA)
        # Use None as key for missing labels to keep keys hashable and consistent
        period_label = None if pd.isna(period_label_raw) else str(period_label_raw)

        # 1) Determine lag values for each column in VALUE_COLUMNS
        for col in VALUE_COLUMNS:
            lag_val = None
            if pd.notna(fp):
                target_fp = fp - 1
                target_key = (target_fp, period_label)
                info = last_for_fp.get(target_key)
                if info is not None:
                    # info[1] is the dict of last known values for that target (FP, period_label)
                    lag_val = info[1].get(col)
            lag_values[col].append(lag_val)

        # 2) Update our knowledge for the current (FiscalPeriod, period_label)
        if pd.notna(fp) and pd.notna(pit):
            key_curr = (fp, period_label)
            prev = last_for_fp.get(key_curr)

            # Previously stored values for this key (if any)
            prev_values = prev[1] if prev is not None else {}

            # Create a copy so we do not mutate the previous dict reference
            updated_values = dict(prev_values)

            # Update with any non-NaN values from this row
            for col in VALUE_COLUMNS:
                v = row[col]
                if pd.notna(v):
                    updated_values[col] = v

            # Only overwrite if this PIT is newer or equal to the previous PIT
            if prev is None or pit >= prev[0]:
                last_for_fp[key_curr] = (pit, updated_values)

    # 3) Attach lag columns "<col>_lag1" to the group DataFrame
    for col in VALUE_COLUMNS:
        group[f"{col}_lag1"] = lag_values[col]

    return group


# Apply the lag computation per (ID, HistCurrency) group
df_calc = (
    df_calc
    .groupby(["ID", "HistCurrency"], dropna=False, group_keys=False)
    .apply(compute_lags_for_group)
)

# Optionally convert FiscalPeriod back to Int64 (nullable integer)
df_calc["FiscalPeriod"] = pd.to_numeric(df_calc["FiscalPeriod"], errors="coerce").astype("Int64")

# Final result
result = df_calc

print("\nLag columns created and added to `result`:")
print([c for c in result.columns if any(c == v or c == f"{v}_lag1" for v in VALUE_COLUMNS)])

# Optional preview
cols_to_show = [
    "ID", "HistCurrency", "PIT Date", "FiscalPeriod", PERIOD_COL
] + VALUE_COLUMNS + [f"{c}_lag1" for c in VALUE_COLUMNS]
print(result[cols_to_show].head(40))


# =============================================================================
# 3) SAVE RESULTING FILE AS "processed_<INPUT_FILE>" IN THE SAME FOLDER
# =============================================================================
output_filename = f"processed_{INPUT_FILE}"
output_path = OUTPUT_DIR / output_filename

result.to_csv(output_path, sep=SEP, index=False)

print(f"\nProcessed file saved to: {output_path}")


Input dataset loaded into `result` (no rows dropped).
Columns in result:
['ID', 'PIT Date', 'HistCurrency', 'FiscalPeriod', 'AnnPITValue_Period', 'rev', 'at', 'cce', 'ltd', 'mi', 'ps', 'ce', 'oa', 'ol', 'noa']


  .apply(compute_lags_for_group)
  .apply(compute_lags_for_group)



Lag columns created and added to `result`:
['noa', 'noa_lag1']
           ID HistCurrency   PIT Date  FiscalPeriod AnnPITValue_Period  \
0   C02500770          Ars 1995-12-29          1992                  A   
1   C02500770          Ars 1995-12-29          1992               <NA>   
2   C02500770          Ars 1995-12-29          1993                  A   
3   C02500770          Ars 1995-12-29          1993               <NA>   
4   C02500770          Ars 1995-12-29          1994                  A   
5   C02500770          Ars 1995-12-29          1994               <NA>   
6   C02500770          Ars 1996-05-03          1995                  A   
7   C02500770          Ars 1996-05-03          1995               <NA>   
8   C02500770          Ars 1998-07-03          1996                  A   
9   C02500770          Ars 1998-07-03          1996               <NA>   
10  C02500770          Ars 1998-07-03          1997                  A   
11  C02500770          Ars 1998-07-03          1

### Cpm

In [60]:
from pathlib import Path
import pandas as pd

# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file from OUTPUT_DIR.
# 2) Computes, for each column in VALUE_COLUMNS, a lagged column "<col>_lag1".
#    - For each (ID, HistCurrency) group, sorted by PIT Date and FiscalPeriod,
#      "<col>_lag1" is the most recent known value of <col> for
#      FiscalPeriod - 1 *and the same AnnPITValue_Period* based on all PIT
#      updates observed up to that row.
#      Example: 2020 Q1 will lag to 2019 Q1 (same period label), not to any
#      value from 2020.
# 3) Keeps all rows (no deletions), only converts types and appends lag columns.
# 4) Saves the resulting DataFrame to the same folder as:
#       processed_<INPUT_FILE>
#    e.g. INPUT_FILE = "ag.txt"  ->  "processed_ag.txt"
#
# The final result is stored in the variable `result`.
# =============================================================================

# ================= CONFIG =================
OUTPUT_DIR = Path(Temp_file_path_A)   # Base directory for input/output files
SEP = "|"                             # Delimiter used in the text files

INPUT_FILE = "data_Cpm.txt"                 # Name of the input file to read
VALUE_COLUMNS = ["pm"]         # Columns for which to compute the lag "<col>_lag1"

PERIOD_COL = "AnnPITValue_Period"    # NEW: period label used for lag (Q1, Q2, A, etc.)
# ==========================================


# =============================================================================
# 1) LOAD INPUT FILE (NO ROWS DROPPED)
# =============================================================================
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

# Read raw data
df = pd.read_csv(path, sep=SEP)

# Required base columns for the PIT logic
required_base_cols = ["ID", "PIT Date", "HistCurrency", "FiscalPeriod", PERIOD_COL]
missing_base = [c for c in required_base_cols if c not in df.columns]
if missing_base:
    raise ValueError(f"{INPUT_FILE}: missing required base columns {missing_base}")

# Check that all requested value columns exist
missing_value_cols = [c for c in VALUE_COLUMNS if c not in df.columns]
if missing_value_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing value columns specified in VALUE_COLUMNS: {missing_value_cols}"
    )

# Type casting only, no row drops
df["PIT Date"] = pd.to_datetime(df["PIT Date"], errors="coerce")
df["FiscalPeriod"] = pd.to_numeric(df["FiscalPeriod"], errors="coerce")
# Make sure period labels are strings (or NaN)
df[PERIOD_COL] = df[PERIOD_COL].astype("string")

# Work on a copy to keep the original df untouched
result = df.copy()

print("Input dataset loaded into `result` (no rows dropped).")
print("Columns in result:")
print(list(result.columns))


# =============================================================================
# 2) COMPUTE LAG COLUMNS "<col>_lag1" FOR EACH COLUMN IN VALUE_COLUMNS
# =============================================================================
# NEW LOGIC:
# For each group (ID, HistCurrency) along the PIT timeline:
# - We maintain, for each (FiscalPeriod, AnnPITValue_Period) pair, the latest PIT
#   and latest known values for VALUE_COLUMNS.
# - For a row with (FP = t, period = P), the lag "<col>_lag1" is the last known
#   value of <col> for (FP = t-1, period = P) based on all PIT updates observed
#   up to (and including) the current PIT Date.
#
# If AnnPITValue_Period is missing, we fall back to using only FiscalPeriod
# (i.e., we treat the period label as None).

df_calc = result.copy()

def compute_lags_for_group(group: pd.DataFrame) -> pd.DataFrame:
    """
    Compute lagged values "<col>_lag1" for all columns in VALUE_COLUMNS
    within a single (ID, HistCurrency) group, using:
      - FiscalPeriod (year), and
      - AnnPITValue_Period (period label) for matching the lag.
    """
    # Sort chronologically so updates are processed in correct temporal order
    group = group.sort_values(["PIT Date", "FiscalPeriod"], ascending=[True, True])

    # last_for_fp stores, for each (FiscalPeriod, PeriodLabel), the latest PIT
    # and the known values for VALUE_COLUMNS.
    # Structure:
    #   last_for_fp = {
    #       (fp, period_label): (last_pit, {col: last_value_for_col, ...})
    #   }
    last_for_fp = {}

    # For each value column, we collect the lag values row by row
    lag_values = {col: [] for col in VALUE_COLUMNS}

    # Iterate through rows in PIT and FiscalPeriod order
    for _, row in group.iterrows():
        pit = row["PIT Date"]
        fp = row["FiscalPeriod"]
        period_label_raw = row.get(PERIOD_COL, pd.NA)
        # Use None as key for missing labels to keep keys hashable and consistent
        period_label = None if pd.isna(period_label_raw) else str(period_label_raw)

        # 1) Determine lag values for each column in VALUE_COLUMNS
        for col in VALUE_COLUMNS:
            lag_val = None
            if pd.notna(fp):
                target_fp = fp - 1
                target_key = (target_fp, period_label)
                info = last_for_fp.get(target_key)
                if info is not None:
                    # info[1] is the dict of last known values for that target (FP, period_label)
                    lag_val = info[1].get(col)
            lag_values[col].append(lag_val)

        # 2) Update our knowledge for the current (FiscalPeriod, period_label)
        if pd.notna(fp) and pd.notna(pit):
            key_curr = (fp, period_label)
            prev = last_for_fp.get(key_curr)

            # Previously stored values for this key (if any)
            prev_values = prev[1] if prev is not None else {}

            # Create a copy so we do not mutate the previous dict reference
            updated_values = dict(prev_values)

            # Update with any non-NaN values from this row
            for col in VALUE_COLUMNS:
                v = row[col]
                if pd.notna(v):
                    updated_values[col] = v

            # Only overwrite if this PIT is newer or equal to the previous PIT
            if prev is None or pit >= prev[0]:
                last_for_fp[key_curr] = (pit, updated_values)

    # 3) Attach lag columns "<col>_lag1" to the group DataFrame
    for col in VALUE_COLUMNS:
        group[f"{col}_lag1"] = lag_values[col]

    return group


# Apply the lag computation per (ID, HistCurrency) group
df_calc = (
    df_calc
    .groupby(["ID", "HistCurrency"], dropna=False, group_keys=False)
    .apply(compute_lags_for_group)
)

# Optionally convert FiscalPeriod back to Int64 (nullable integer)
df_calc["FiscalPeriod"] = pd.to_numeric(df_calc["FiscalPeriod"], errors="coerce").astype("Int64")

# Final result
result = df_calc

print("\nLag columns created and added to `result`:")
print([c for c in result.columns if any(c == v or c == f"{v}_lag1" for v in VALUE_COLUMNS)])

# Optional preview
cols_to_show = [
    "ID", "HistCurrency", "PIT Date", "FiscalPeriod", PERIOD_COL
] + VALUE_COLUMNS + [f"{c}_lag1" for c in VALUE_COLUMNS]
print(result[cols_to_show].head(40))


# =============================================================================
# 3) SAVE RESULTING FILE AS "processed_<INPUT_FILE>" IN THE SAME FOLDER
# =============================================================================
output_filename = f"processed_{INPUT_FILE}"
output_path = OUTPUT_DIR / output_filename

result.to_csv(output_path, sep=SEP, index=False)

print(f"\nProcessed file saved to: {output_path}")


Input dataset loaded into `result` (no rows dropped).
Columns in result:
['ID', 'PIT Date', 'HistCurrency', 'FiscalPeriod', 'AnnPITValue_Period', 'rev', 'cogs', 'pm']


  .apply(compute_lags_for_group)
  .apply(compute_lags_for_group)



Lag columns created and added to `result`:
['pm', 'pm_lag1']
           ID HistCurrency   PIT Date  FiscalPeriod AnnPITValue_Period  \
0   C02500770          Ars 1995-12-29          1992               <NA>   
1   C02500770          Ars 1995-12-29          1993               <NA>   
2   C02500770          Ars 1995-12-29          1994               <NA>   
3   C02500770          Ars 1996-05-03          1995               <NA>   
4   C02500770          Ars 1998-07-03          1996               <NA>   
5   C02500770          Ars 1998-07-03          1997               <NA>   
7   C02500770          Ars 1999-10-01          1998               <NA>   
6   C02500770          Ars 1999-10-08          1997               <NA>   
8   C02500770          Ars 2000-05-19          1999               <NA>   
9   C02500770          Ars 2000-05-26          1999               <NA>   
10  C02520200          Ars 1996-05-03          1987               <NA>   
11  C02520200          Ars 1996-05-03          198

### Ec

In [61]:
from pathlib import Path
import pandas as pd

# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file from OUTPUT_DIR.
# 2) Computes, for each column in VALUE_COLUMNS, a lagged column "<col>_lag1".
#    - For each (ID, HistCurrency) group, sorted by PIT Date and FiscalPeriod,
#      "<col>_lag1" is the most recent known value of <col> for
#      FiscalPeriod - 1 *and the same AnnPITValue_Period* based on all PIT
#      updates observed up to that row.
#      Example: 2020 Q1 will lag to 2019 Q1 (same period label), not to any
#      value from 2020.
# 3) Keeps all rows (no deletions), only converts types and appends lag columns.
# 4) Saves the resulting DataFrame to the same folder as:
#       processed_<INPUT_FILE>
#    e.g. INPUT_FILE = "ag.txt"  ->  "processed_ag.txt"
#
# The final result is stored in the variable `result`.
# =============================================================================

# ================= CONFIG =================
OUTPUT_DIR = Path(Temp_file_path_A)   # Base directory for input/output files
SEP = "|"                             # Delimiter used in the text files

INPUT_FILE = "data_Ec.txt"           # Name of the input file to read
VALUE_COLUMNS = ["eps"]            # Columns for which to compute the lags "<col>_lagk"

# Number of prior FiscalPeriods to compute lags for:
#   1 -> only t-1
#   2 -> t-1 and t-2
#   ...
MAX_LAG_PERIODS = 2
# ==========================================


# =============================================================================
# 1) LOAD INPUT FILE (NO ROWS DROPPED)
# =============================================================================
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

# Read raw data
df = pd.read_csv(path, sep=SEP)

# Required base columns for the PIT logic
required_base_cols = ["ID", "PIT Date", "HistCurrency", "FiscalPeriod"]
missing_base = [c for c in required_base_cols if c not in df.columns]
if missing_base:
    raise ValueError(f"{INPUT_FILE}: missing required base columns {missing_base}")

# Check that all requested value columns exist
missing_value_cols = [c for c in VALUE_COLUMNS if c not in df.columns]
if missing_value_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing value columns specified in VALUE_COLUMNS: {missing_value_cols}"
    )

# Type casting only, no row drops
df["PIT Date"] = pd.to_datetime(df["PIT Date"], errors="coerce")
df["FiscalPeriod"] = pd.to_numeric(df["FiscalPeriod"], errors="coerce")

# Work on a copy to keep the original df untouched
result = df.copy()

print("Input dataset loaded into `result` (no rows dropped).")
print("Columns in result:")
print(list(result.columns))


# =============================================================================
# 2) COMPUTE LAG COLUMNS "<col>_lagk" FOR EACH COLUMN IN VALUE_COLUMNS
# =============================================================================
# Logic:
# For each group (ID, HistCurrency) along the PIT timeline:
# - We maintain, for each FiscalPeriod, the latest known PIT and the latest
#   known values for all columns in VALUE_COLUMNS.
# - For a row with FiscalPeriod = t, the lag column "<col>_lagk" is defined as
#   the last known value of <col> for FiscalPeriod = t-k based on all PIT
#   updates observed up to (and including) the current PIT Date.
#
# Important:
# - Multiple PIT updates for the same FiscalPeriod are handled.
# - Corrections to earlier FiscalPeriods update the historical state only
#   for subsequent PIT dates, not retroactively.

df_calc = result.copy()

def compute_lags_for_group(group: pd.DataFrame) -> pd.DataFrame:
    """
    Compute lagged values "<col>_lagk" for all columns in VALUE_COLUMNS
    within a single (ID, HistCurrency) group, for k=1..MAX_LAG_PERIODS.
    """
    # Sort chronologically so updates are processed in correct temporal order
    group = group.sort_values(["PIT Date", "FiscalPeriod"], ascending=[True, True])

    # last_for_fp stores, for each FiscalPeriod, the latest PIT and the known values
    # Structure:
    #   last_for_fp = {
    #       fp: (last_pit, {col: last_value_for_col, ...})
    #   }
    last_for_fp = {}

    # For each value column and lag k, we collect the lag values row by row
    # Structure:
    #   lag_values[col][k] = list of lagged values for that col and that k
    lag_values = {
        col: {k: [] for k in range(1, MAX_LAG_PERIODS + 1)}
        for col in VALUE_COLUMNS
    }

    # Iterate through rows in PIT and FiscalPeriod order
    for _, row in group.iterrows():
        pit = row["PIT Date"]
        fp = row["FiscalPeriod"]

        # 1) Determine lag values for each column in VALUE_COLUMNS and each k
        for col in VALUE_COLUMNS:
            for k in range(1, MAX_LAG_PERIODS + 1):
                lag_val = None
                if pd.notna(fp):
                    target_fp = fp - k
                    info = last_for_fp.get(target_fp)
                    if info is not None:
                        # info[1] is the dict of last known values for that target FiscalPeriod
                        lag_val = info[1].get(col)
                lag_values[col][k].append(lag_val)

        # 2) Update our knowledge for the current FiscalPeriod fp
        if pd.notna(fp) and pd.notna(pit):
            prev = last_for_fp.get(fp)

            # Previously stored values for this FiscalPeriod (if any)
            prev_values = prev[1] if prev is not None else {}

            # Create a copy so we do not mutate the previous dict reference
            updated_values = dict(prev_values)

            # Update with any non-NaN values from this row
            for col in VALUE_COLUMNS:
                v = row[col]
                if pd.notna(v):
                    updated_values[col] = v

            # Only overwrite if this PIT is newer or equal to the previous PIT
            if prev is None or pit >= prev[0]:
                last_for_fp[fp] = (pit, updated_values)

    # 3) Attach lag columns "<col>_lagk" to the group DataFrame
    for col in VALUE_COLUMNS:
        for k in range(1, MAX_LAG_PERIODS + 1):
            group[f"{col}_lag{k}"] = lag_values[col][k]

    return group


# Apply the lag computation per (ID, HistCurrency) group
df_calc = (
    df_calc
    .groupby(["ID", "HistCurrency"], dropna=False, group_keys=False)
    .apply(compute_lags_for_group)
)

# Optionally convert FiscalPeriod back to Int64 (nullable integer)
df_calc["FiscalPeriod"] = pd.to_numeric(df_calc["FiscalPeriod"], errors="coerce").astype("Int64")

# Final result
result = df_calc

print("\nLag columns created and added to `result`:")
created_cols = []
for v in VALUE_COLUMNS:
    created_cols.append(v)
    for k in range(1, MAX_LAG_PERIODS + 1):
        created_cols.append(f"{v}_lag{k}")
print(created_cols)

# Optional preview
cols_to_show = (
    ["ID", "HistCurrency", "PIT Date", "FiscalPeriod"]
    + VALUE_COLUMNS
    + [f"{c}_lag{k}" for c in VALUE_COLUMNS for k in range(1, MAX_LAG_PERIODS + 1)]
)
print(result[cols_to_show].head(40))


# =============================================================================
# 3) SAVE RESULTING FILE AS "processed_<INPUT_FILE>" IN THE SAME FOLDER
# =============================================================================
output_filename = f"processed_{INPUT_FILE}"
output_path = OUTPUT_DIR / output_filename

result.to_csv(output_path, sep=SEP, index=False)

print(f"\nProcessed file saved to: {output_path}")


Input dataset loaded into `result` (no rows dropped).
Columns in result:
['ID', 'PIT Date', 'HistCurrency', 'FiscalPeriod', 'AnnPITValue_Period', 'eps']


  .apply(compute_lags_for_group)
  .apply(compute_lags_for_group)



Lag columns created and added to `result`:
['eps', 'eps_lag1', 'eps_lag2']
           ID HistCurrency   PIT Date  FiscalPeriod      eps  eps_lag1  \
0   C02500770          Ars 1998-10-02          1992  0.87000       NaN   
3   C02500770          Ars 1998-10-02          1993  0.82000   0.87000   
6   C02500770          Ars 1998-10-02          1994  0.99024   0.82000   
8   C02500770          Ars 1998-10-02          1995 -0.39000   0.99024   
11  C02500770          Ars 1998-10-02          1996  0.35000  -0.39000   
13  C02500770          Ars 1998-10-02          1997 -0.57000   0.35000   
1   C02500770          Ars 1999-07-30          1992  0.88000       NaN   
4   C02500770          Ars 1999-07-30          1993  0.83000   0.88000   
14  C02500770          Ars 1999-07-30          1997 -0.58000   0.35000   
2   C02500770          Ars 1999-10-01          1992  0.87691       NaN   
5   C02500770          Ars 1999-10-01          1993  0.82611   0.87691   
7   C02500770          Ars 1999-10-0

### Es (Yearly Adaption)

In [62]:
from pathlib import Path
import pandas as pd

# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file from OUTPUT_DIR.
# 2) Computes, for each column in VALUE_COLUMNS, lagged columns "<col>_lagk"
#    for k = 1..MAX_LAG_PERIODS.
#    - For each (ID, HistCurrency) group, sorted by PIT Date and FiscalPeriod,
#      "<col>_lagk" is the most recent known value of <col> for
#      FiscalPeriod - k *and the same AnnPITValue_Period* based on all PIT
#      updates observed up to that row.
#      Example: for a row 2020 Q1, "rev_lag1" comes from 2019 Q1, not 2019 A.
# 3) Keeps all rows (no deletions), only converts types and appends lag columns.
# 4) Saves the resulting DataFrame to the same folder as:
#       processed_<INPUT_FILE>
# =============================================================================

# ================= CONFIG =================
OUTPUT_DIR = Path(Temp_file_path_A)   # Base directory for input/output files
SEP = "|"                             # Delimiter used in the text files

INPUT_FILE = "data_Es.txt"            # Name of the input file to read
VALUE_COLUMNS = ["eps"]               # Columns for which to compute the lags "<col>_lagk"

# Number of prior FiscalPeriods to compute lags for:
#   1 -> only t-1
#   2 -> t-1 and t-2
#   ...
MAX_LAG_PERIODS = 4

# Period label column (Q1, Q2, A, etc.)
PERIOD_COL = "AnnPITValue_Period"
# ==========================================


# =============================================================================
# 1) LOAD INPUT FILE (NO ROWS DROPPED)
# =============================================================================
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

# Read raw data
df = pd.read_csv(path, sep=SEP)

# Required base columns for the PIT logic
required_base_cols = ["ID", "PIT Date", "HistCurrency", "FiscalPeriod", PERIOD_COL]
missing_base = [c for c in required_base_cols if c not in df.columns]
if missing_base:
    raise ValueError(f"{INPUT_FILE}: missing required base columns {missing_base}")

# Check that all requested value columns exist
missing_value_cols = [c for c in VALUE_COLUMNS if c not in df.columns]
if missing_value_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing value columns specified in VALUE_COLUMNS: {missing_value_cols}"
    )

# Type casting only, no row drops
df["PIT Date"] = pd.to_datetime(df["PIT Date"], errors="coerce")
df["FiscalPeriod"] = pd.to_numeric(df["FiscalPeriod"], errors="coerce")
df[PERIOD_COL] = df[PERIOD_COL].astype("string")

# Work on a copy to keep the original df untouched
result = df.copy()

print("Input dataset loaded into `result` (no rows dropped).")
print("Columns in result:")
print(list(result.columns))


# =============================================================================
# 2) COMPUTE LAG COLUMNS "<col>_lagk" FOR EACH COLUMN IN VALUE_COLUMNS
# =============================================================================
# New logic:
# For each group (ID, HistCurrency) along the PIT timeline:
# - We maintain, for each (FiscalPeriod, AnnPITValue_Period) pair, the latest PIT
#   and the latest known values for all columns in VALUE_COLUMNS.
# - For a row with (FP = t, period = P), "<col>_lagk" comes from (FP = t-k, period = P)
#   based on all PIT updates up to that row.
#
# If AnnPITValue_Period is missing, we treat the period label as None and thus
# fall back to the old "by FiscalPeriod only" behaviour for that row.

df_calc = result.copy()

def compute_lags_for_group(group: pd.DataFrame) -> pd.DataFrame:
    """
    Compute lagged values "<col>_lagk" for all columns in VALUE_COLUMNS
    within a single (ID, HistCurrency) group, for k=1..MAX_LAG_PERIODS,
    keyed by (FiscalPeriod, AnnPITValue_Period).
    """
    # Sort chronologically so updates are processed in correct temporal order
    group = group.sort_values(["PIT Date", "FiscalPeriod"], ascending=[True, True])

    # last_for_fp stores, for each (FiscalPeriod, PeriodLabel), the latest PIT
    # and the known values.
    #   last_for_fp = {
    #       (fp, period_label): (last_pit, {col: last_value_for_col, ...})
    #   }
    last_for_fp = {}

    # For each value column and lag k, collect lag values row by row
    lag_values = {
        col: {k: [] for k in range(1, MAX_LAG_PERIODS + 1)}
        for col in VALUE_COLUMNS
    }

    # Iterate through rows in PIT and FiscalPeriod order
    for _, row in group.iterrows():
        pit = row["PIT Date"]
        fp = row["FiscalPeriod"]
        period_raw = row.get(PERIOD_COL, pd.NA)
        period_label = None if pd.isna(period_raw) else str(period_raw)

        # 1) Determine lag values for each column and each k
        for col in VALUE_COLUMNS:
            for k in range(1, MAX_LAG_PERIODS + 1):
                lag_val = None
                if pd.notna(fp):
                    target_fp = fp - k
                    target_key = (target_fp, period_label)
                    info = last_for_fp.get(target_key)
                    if info is not None:
                        # info[1] is the dict of last known values for that (target_fp, period_label)
                        lag_val = info[1].get(col)
                lag_values[col][k].append(lag_val)

        # 2) Update our knowledge for the current (FiscalPeriod, period_label)
        if pd.notna(fp) and pd.notna(pit):
            key_curr = (fp, period_label)
            prev = last_for_fp.get(key_curr)

            # Previously stored values for this key (if any)
            prev_values = prev[1] if prev is not None else {}

            # Copy so we don't mutate previous dict reference
            updated_values = dict(prev_values)

            # Update with any non-NaN values from this row
            for col in VALUE_COLUMNS:
                v = row[col]
                if pd.notna(v):
                    updated_values[col] = v

            # Only overwrite if this PIT is newer or equal to the previous PIT
            if prev is None or pit >= prev[0]:
                last_for_fp[key_curr] = (pit, updated_values)

    # 3) Attach lag columns "<col>_lagk" to the group DataFrame
    for col in VALUE_COLUMNS:
        for k in range(1, MAX_LAG_PERIODS + 1):
            group[f"{col}_lag{k}"] = lag_values[col][k]

    return group


# Apply the lag computation per (ID, HistCurrency) group
df_calc = (
    df_calc
    .groupby(["ID", "HistCurrency"], dropna=False, group_keys=False)
    .apply(compute_lags_for_group)
)

# Optionally convert FiscalPeriod back to Int64 (nullable integer)
df_calc["FiscalPeriod"] = pd.to_numeric(df_calc["FiscalPeriod"], errors="coerce").astype("Int64")

# Final result
result = df_calc

print("\nLag columns created and added to `result`:")
created_cols = []
for v in VALUE_COLUMNS:
    created_cols.append(v)
    for k in range(1, MAX_LAG_PERIODS + 1):
        created_cols.append(f"{v}_lag{k}")
print(created_cols)

# Optional preview
cols_to_show = (
    ["ID", "HistCurrency", "PIT Date", "FiscalPeriod", PERIOD_COL]
    + VALUE_COLUMNS
    + [f"{c}_lag{k}" for c in VALUE_COLUMNS for k in range(1, MAX_LAG_PERIODS + 1)]
)
print(result[cols_to_show].head(40))


# =============================================================================
# 3) SAVE RESULTING FILE AS "processed_<INPUT_FILE>" IN THE SAME FOLDER
# =============================================================================
output_filename = f"processed_{INPUT_FILE}"
output_path = OUTPUT_DIR / output_filename

result.to_csv(output_path, sep=SEP, index=False)

print(f"\nProcessed file saved to: {output_path}")


Input dataset loaded into `result` (no rows dropped).
Columns in result:
['ID', 'PIT Date', 'HistCurrency', 'FiscalPeriod', 'AnnPITValue_Period', 'eps']


  .apply(compute_lags_for_group)
  .apply(compute_lags_for_group)



Lag columns created and added to `result`:
['eps', 'eps_lag1', 'eps_lag2', 'eps_lag3', 'eps_lag4']
           ID HistCurrency   PIT Date  FiscalPeriod AnnPITValue_Period  \
0   C02500770          Ars 1998-10-02          1992               <NA>   
3   C02500770          Ars 1998-10-02          1993               <NA>   
6   C02500770          Ars 1998-10-02          1994               <NA>   
8   C02500770          Ars 1998-10-02          1995               <NA>   
11  C02500770          Ars 1998-10-02          1996               <NA>   
13  C02500770          Ars 1998-10-02          1997               <NA>   
1   C02500770          Ars 1999-07-30          1992               <NA>   
4   C02500770          Ars 1999-07-30          1993               <NA>   
14  C02500770          Ars 1999-07-30          1997               <NA>   
2   C02500770          Ars 1999-10-01          1992               <NA>   
5   C02500770          Ars 1999-10-01          1993               <NA>   
7   C0250077

### Ig

In [63]:
from pathlib import Path
import pandas as pd

# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file from OUTPUT_DIR.
# 2) Computes, for each column in VALUE_COLUMNS, lagged columns "<col>_lagk"
#    for k = 1..MAX_LAG_PERIODS.
#    - For each (ID, HistCurrency) group, sorted by PIT Date and FiscalPeriod,
#      "<col>_lagk" is the most recent known value of <col> for
#      FiscalPeriod - k *and the same AnnPITValue_Period* based on all PIT
#      updates observed up to that row.
#      Example: for a row 2020 Q1, "<col>_lag1" is the value from 2019 Q1,
#      not from 2019 A or 2020 anything.
# 3) Keeps all rows (no deletions), only converts types and appends lag columns.
# 4) Saves the resulting DataFrame to the same folder as:
#       processed_<INPUT_FILE>
#    e.g. INPUT_FILE = "ag.txt"  ->  "processed_ag.txt"
#
# The final result is stored in the variable `result`.
# =============================================================================

# ================= CONFIG =================
OUTPUT_DIR = Path(Temp_file_path_A)   # Base directory for input/output files
SEP = "|"                             # Delimiter used in the text files

INPUT_FILE = "data_Ig.txt"            # Name of the input file to read
VALUE_COLUMNS = ["inv", "at"]         # Columns for which to compute the lags "<col>_lagk"

# Number of prior FiscalPeriods to compute lags for:
#   1 -> only t-1
#   2 -> t-1 and t-2
#   ...
MAX_LAG_PERIODS = 1

# NEW: period label column (Q1, Q2, A, etc.)
PERIOD_COL = "AnnPITValue_Period"
# ==========================================


# =============================================================================
# 1) LOAD INPUT FILE (NO ROWS DROPPED)
# =============================================================================
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

# Read raw data
df = pd.read_csv(path, sep=SEP)

# Required base columns for the PIT logic
required_base_cols = ["ID", "PIT Date", "HistCurrency", "FiscalPeriod", PERIOD_COL]
missing_base = [c for c in required_base_cols if c not in df.columns]
if missing_base:
    raise ValueError(f"{INPUT_FILE}: missing required base columns {missing_base}")

# Check that all requested value columns exist
missing_value_cols = [c for c in VALUE_COLUMNS if c not in df.columns]
if missing_value_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing value columns specified in VALUE_COLUMNS: {missing_value_cols}"
    )

# Type casting only, no row drops
df["PIT Date"] = pd.to_datetime(df["PIT Date"], errors="coerce")
df["FiscalPeriod"] = pd.to_numeric(df["FiscalPeriod"], errors="coerce")
df[PERIOD_COL] = df[PERIOD_COL].astype("string")

# Work on a copy to keep the original df untouched
result = df.copy()

print("Input dataset loaded into `result` (no rows dropped).")
print("Columns in result:")
print(list(result.columns))


# =============================================================================
# 2) COMPUTE LAG COLUMNS "<col>_lagk" FOR EACH COLUMN IN VALUE_COLUMNS
# =============================================================================
# NEW LOGIC:
# For each group (ID, HistCurrency) along the PIT timeline:
# - We maintain, for each (FiscalPeriod, AnnPITValue_Period) pair, the latest PIT
#   and the latest known values for all columns in VALUE_COLUMNS.
# - For a row with (FiscalPeriod = t, period = P), the lag column "<col>_lagk"
#   is defined as the last known value of <col> for (FiscalPeriod = t-k,
#   period = P) based on all PIT updates observed up to (and including) the
#   current PIT Date.
#
# If AnnPITValue_Period is missing, we fall back to using only FiscalPeriod
# (i.e., we treat the period label as None).

df_calc = result.copy()

def compute_lags_for_group(group: pd.DataFrame) -> pd.DataFrame:
    """
    Compute lagged values "<col>_lagk" for all columns in VALUE_COLUMNS
    within a single (ID, HistCurrency) group, for k=1..MAX_LAG_PERIODS,
    keyed by (FiscalPeriod, AnnPITValue_Period).
    """
    # Sort chronologically so updates are processed in correct temporal order
    group = group.sort_values(["PIT Date", "FiscalPeriod"], ascending=[True, True])

    # last_for_fp stores, for each (FiscalPeriod, PeriodLabel), the latest PIT
    # and the known values.
    # Structure:
    #   last_for_fp = {
    #       (fp, period_label): (last_pit, {col: last_value_for_col, ...})
    #   }
    last_for_fp = {}

    # For each value column and lag k, we collect lag values row by row
    lag_values = {
        col: {k: [] for k in range(1, MAX_LAG_PERIODS + 1)}
        for col in VALUE_COLUMNS
    }

    # Iterate through rows in PIT and FiscalPeriod order
    for _, row in group.iterrows():
        pit = row["PIT Date"]
        fp = row["FiscalPeriod"]
        period_raw = row.get(PERIOD_COL, pd.NA)
        period_label = None if pd.isna(period_raw) else str(period_raw)

        # 1) Determine lag values for each column in VALUE_COLUMNS and each k
        for col in VALUE_COLUMNS:
            for k in range(1, MAX_LAG_PERIODS + 1):
                lag_val = None
                if pd.notna(fp):
                    target_fp = fp - k
                    target_key = (target_fp, period_label)
                    info = last_for_fp.get(target_key)
                    if info is not None:
                        # info[1] is the dict of last known values for that (target_fp, period_label)
                        lag_val = info[1].get(col)
                lag_values[col][k].append(lag_val)

        # 2) Update our knowledge for the current (FiscalPeriod, period_label)
        if pd.notna(fp) and pd.notna(pit):
            key_curr = (fp, period_label)
            prev = last_for_fp.get(key_curr)

            # Previously stored values for this key (if any)
            prev_values = prev[1] if prev is not None else {}

            # Create a copy so we do not mutate the previous dict reference
            updated_values = dict(prev_values)

            # Update with any non-NaN values from this row
            for col in VALUE_COLUMNS:
                v = row[col]
                if pd.notna(v):
                    updated_values[col] = v

            # Only overwrite if this PIT is newer or equal to the previous PIT
            if prev is None or pit >= prev[0]:
                last_for_fp[key_curr] = (pit, updated_values)

    # 3) Attach lag columns "<col>_lagk" to the group DataFrame
    for col in VALUE_COLUMNS:
        for k in range(1, MAX_LAG_PERIODS + 1):
            group[f"{col}_lag{k}"] = lag_values[col][k]

    return group


# Apply the lag computation per (ID, HistCurrency) group
df_calc = (
    df_calc
    .groupby(["ID", "HistCurrency"], dropna=False, group_keys=False)
    .apply(compute_lags_for_group)
)

# Optionally convert FiscalPeriod back to Int64 (nullable integer)
df_calc["FiscalPeriod"] = pd.to_numeric(df_calc["FiscalPeriod"], errors="coerce").astype("Int64")

# Final result
result = df_calc

print("\nLag columns created and added to `result`:")
created_cols = []
for v in VALUE_COLUMNS:
    created_cols.append(v)
    for k in range(1, MAX_LAG_PERIODS + 1):
        created_cols.append(f"{v}_lag{k}")
print(created_cols)

# Optional preview
cols_to_show = (
    ["ID", "HistCurrency", "PIT Date", "FiscalPeriod", PERIOD_COL]
    + VALUE_COLUMNS
    + [f"{c}_lag{k}" for c in VALUE_COLUMNS for k in range(1, MAX_LAG_PERIODS + 1)]
)
print(result[cols_to_show].head(40))


# =============================================================================
# 3) SAVE RESULTING FILE AS "processed_<INPUT_FILE>" IN THE SAME FOLDER
# =============================================================================
output_filename = f"processed_{INPUT_FILE}"
output_path = OUTPUT_DIR / output_filename

result.to_csv(output_path, sep=SEP, index=False)

print(f"\nProcessed file saved to: {output_path}")


Input dataset loaded into `result` (no rows dropped).
Columns in result:
['ID', 'PIT Date', 'HistCurrency', 'FiscalPeriod', 'AnnPITValue_Period', 'at', 'inv']


  .apply(compute_lags_for_group)
  .apply(compute_lags_for_group)



Lag columns created and added to `result`:
['inv', 'inv_lag1', 'at', 'at_lag1']
           ID HistCurrency   PIT Date  FiscalPeriod AnnPITValue_Period  \
0   C02500770          Ars 1995-12-29          1992                  A   
1   C02500770          Ars 1995-12-29          1993                  A   
2   C02500770          Ars 1995-12-29          1994                  A   
3   C02500770          Ars 1996-05-03          1995                  A   
4   C02500770          Ars 1998-07-03          1996                  A   
5   C02500770          Ars 1998-07-03          1997                  A   
6   C02500770          Ars 1999-10-01          1998                  A   
7   C02500770          Ars 2000-05-19          1999                  A   
8   C02500770          Ars 2000-05-26          1999                  A   
9   C02520200          Ars 1996-05-03          1987                  A   
10  C02520200          Ars 1996-05-03          1988                  A   
11  C02520200          Ars 1996

### Inv

In [64]:
from pathlib import Path
import pandas as pd

# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file from OUTPUT_DIR.
# 2) Computes, for each column in VALUE_COLUMNS, lagged columns "<col>_lagk"
#    for k = 1..MAX_LAG_PERIODS.
#    - For each (ID, HistCurrency) group, sorted by PIT Date and FiscalPeriod,
#      "<col>_lagk" is the most recent known value of <col> for
#      FiscalPeriod - k *and the same AnnPITValue_Period* based on all PIT
#      updates observed up to that row.
#      Example: for a row 2020 Q1, "<col>_lag1" is the value from 2019 Q1,
#      not from 2019 A or 2020 anything.
# 3) Keeps all rows (no deletions), only converts types and appends lag columns.
# 4) Saves the resulting DataFrame to the same folder as:
#       processed_<INPUT_FILE>
#    e.g. INPUT_FILE = "ag.txt"  ->  "processed_ag.txt"
#
# The final result is stored in the variable `result`.
# =============================================================================

# ================= CONFIG =================
OUTPUT_DIR = Path(Temp_file_path_A)   # Base directory for input/output files
SEP = "|"                             # Delimiter used in the text files

INPUT_FILE = "data_Inv.txt"           # Name of the input file to read
VALUE_COLUMNS = ["ce"]            # Columns for which to compute the lags "<col>_lagk"

# Number of prior FiscalPeriods to compute lags for:
#   1 -> only t-1
#   2 -> t-1 and t-2
#   ...
MAX_LAG_PERIODS = 3

# NEW: period label column (Q1, Q2, A, etc.)
PERIOD_COL = "AnnPITValue_Period"
# ==========================================


# =============================================================================
# 1) LOAD INPUT FILE (NO ROWS DROPPED)
# =============================================================================
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

# Read raw data
df = pd.read_csv(path, sep=SEP)

# Required base columns for the PIT logic
required_base_cols = ["ID", "PIT Date", "HistCurrency", "FiscalPeriod", PERIOD_COL]
missing_base = [c for c in required_base_cols if c not in df.columns]
if missing_base:
    raise ValueError(f"{INPUT_FILE}: missing required base columns {missing_base}")

# Check that all requested value columns exist
missing_value_cols = [c for c in VALUE_COLUMNS if c not in df.columns]
if missing_value_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing value columns specified in VALUE_COLUMNS: {missing_value_cols}"
    )

# Type casting only, no row drops
df["PIT Date"] = pd.to_datetime(df["PIT Date"], errors="coerce")
df["FiscalPeriod"] = pd.to_numeric(df["FiscalPeriod"], errors="coerce")
df[PERIOD_COL] = df[PERIOD_COL].astype("string")

# Work on a copy to keep the original df untouched
result = df.copy()

print("Input dataset loaded into `result` (no rows dropped).")
print("Columns in result:")
print(list(result.columns))


# =============================================================================
# 2) COMPUTE LAG COLUMNS "<col>_lagk" FOR EACH COLUMN IN VALUE_COLUMNS
# =============================================================================
# NEW LOGIC:
# For each group (ID, HistCurrency) along the PIT timeline:
# - We maintain, for each (FiscalPeriod, AnnPITValue_Period) pair, the latest PIT
#   and the latest known values for all columns in VALUE_COLUMNS.
# - For a row with (FiscalPeriod = t, period = P), the lag column "<col>_lagk"
#   is defined as the last known value of <col> for (FiscalPeriod = t-k,
#   period = P) based on all PIT updates observed up to (and including) the
#   current PIT Date.
#
# If AnnPITValue_Period is missing, we fall back to using only FiscalPeriod
# (i.e., we treat the period label as None).

df_calc = result.copy()

def compute_lags_for_group(group: pd.DataFrame) -> pd.DataFrame:
    """
    Compute lagged values "<col>_lagk" for all columns in VALUE_COLUMNS
    within a single (ID, HistCurrency) group, for k=1..MAX_LAG_PERIODS,
    keyed by (FiscalPeriod, AnnPITValue_Period).
    """
    # Sort chronologically so updates are processed in correct temporal order
    group = group.sort_values(["PIT Date", "FiscalPeriod"], ascending=[True, True])

    # last_for_fp stores, for each (FiscalPeriod, PeriodLabel), the latest PIT
    # and the known values.
    # Structure:
    #   last_for_fp = {
    #       (fp, period_label): (last_pit, {col: last_value_for_col, ...})
    #   }
    last_for_fp = {}

    # For each value column and lag k, we collect lag values row by row
    lag_values = {
        col: {k: [] for k in range(1, MAX_LAG_PERIODS + 1)}
        for col in VALUE_COLUMNS
    }

    # Iterate through rows in PIT and FiscalPeriod order
    for _, row in group.iterrows():
        pit = row["PIT Date"]
        fp = row["FiscalPeriod"]
        period_raw = row.get(PERIOD_COL, pd.NA)
        period_label = None if pd.isna(period_raw) else str(period_raw)

        # 1) Determine lag values for each column in VALUE_COLUMNS and each k
        for col in VALUE_COLUMNS:
            for k in range(1, MAX_LAG_PERIODS + 1):
                lag_val = None
                if pd.notna(fp):
                    target_fp = fp - k
                    target_key = (target_fp, period_label)
                    info = last_for_fp.get(target_key)
                    if info is not None:
                        # info[1] is the dict of last known values for that (target_fp, period_label)
                        lag_val = info[1].get(col)
                lag_values[col][k].append(lag_val)

        # 2) Update our knowledge for the current (FiscalPeriod, period_label)
        if pd.notna(fp) and pd.notna(pit):
            key_curr = (fp, period_label)
            prev = last_for_fp.get(key_curr)

            # Previously stored values for this key (if any)
            prev_values = prev[1] if prev is not None else {}

            # Create a copy so we do not mutate the previous dict reference
            updated_values = dict(prev_values)

            # Update with any non-NaN values from this row
            for col in VALUE_COLUMNS:
                v = row[col]
                if pd.notna(v):
                    updated_values[col] = v

            # Only overwrite if this PIT is newer or equal to the previous PIT
            if prev is None or pit >= prev[0]:
                last_for_fp[key_curr] = (pit, updated_values)

    # 3) Attach lag columns "<col>_lagk" to the group DataFrame
    for col in VALUE_COLUMNS:
        for k in range(1, MAX_LAG_PERIODS + 1):
            group[f"{col}_lag{k}"] = lag_values[col][k]

    return group


# Apply the lag computation per (ID, HistCurrency) group
df_calc = (
    df_calc
    .groupby(["ID", "HistCurrency"], dropna=False, group_keys=False)
    .apply(compute_lags_for_group)
)

# Optionally convert FiscalPeriod back to Int64 (nullable integer)
df_calc["FiscalPeriod"] = pd.to_numeric(df_calc["FiscalPeriod"], errors="coerce").astype("Int64")

# Final result
result = df_calc

print("\nLag columns created and added to `result`:")
created_cols = []
for v in VALUE_COLUMNS:
    created_cols.append(v)
    for k in range(1, MAX_LAG_PERIODS + 1):
        created_cols.append(f"{v}_lag{k}")
print(created_cols)

# Optional preview
cols_to_show = (
    ["ID", "HistCurrency", "PIT Date", "FiscalPeriod", PERIOD_COL]
    + VALUE_COLUMNS
    + [f"{c}_lag{k}" for c in VALUE_COLUMNS for k in range(1, MAX_LAG_PERIODS + 1)]
)
print(result[cols_to_show].head(40))


# =============================================================================
# 3) SAVE RESULTING FILE AS "processed_<INPUT_FILE>" IN THE SAME FOLDER
# =============================================================================
output_filename = f"processed_{INPUT_FILE}"
output_path = OUTPUT_DIR / output_filename

result.to_csv(output_path, sep=SEP, index=False)

print(f"\nProcessed file saved to: {output_path}")


Input dataset loaded into `result` (no rows dropped).
Columns in result:
['ID', 'PIT Date', 'HistCurrency', 'FiscalPeriod', 'AnnPITValue_Period', 'rev', 'capex', 'ce']


  .apply(compute_lags_for_group)
  .apply(compute_lags_for_group)



Lag columns created and added to `result`:
['ce', 'ce_lag1', 'ce_lag2', 'ce_lag3']
           ID HistCurrency   PIT Date  FiscalPeriod AnnPITValue_Period  \
0   C02500770          Ars 1995-12-29          1992               <NA>   
1   C02500770          Ars 1995-12-29          1993               <NA>   
2   C02500770          Ars 1995-12-29          1994               <NA>   
3   C02500770          Ars 1996-05-03          1995               <NA>   
4   C02500770          Ars 1998-07-03          1996               <NA>   
5   C02500770          Ars 1998-07-03          1997               <NA>   
6   C02500770          Ars 1999-10-01          1998               <NA>   
7   C02500770          Ars 2000-05-19          1999               <NA>   
8   C02500770          Ars 2000-05-26          1999               <NA>   
9   C02520200          Ars 1996-05-03          1989               <NA>   
10  C02520200          Ars 1996-05-03          1990               <NA>   
11  C02520200          Ars 1

### Ltg

In [65]:
from pathlib import Path
import pandas as pd

# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file from OUTPUT_DIR.
# 2) Computes, for each column in VALUE_COLUMNS, lagged columns "<col>_lagk"
#    for k = 1..MAX_LAG_PERIODS.
#    - For each (ID, HistCurrency) group, sorted by PIT Date and FiscalPeriod,
#      "<col>_lagk" is the most recent known value of <col> for
#      FiscalPeriod - k *and the same AnnPITValue_Period* based on all PIT
#      updates observed up to that row.
#      Example: for a row 2020 Q1, "<col>_lag1" is the value from 2019 Q1,
#      not from 2019 A or 2020 anything.
# 3) Keeps all rows (no deletions), only converts types and appends lag columns.
# 4) Saves the resulting DataFrame to the same folder as:
#       processed_<INPUT_FILE>
#    e.g. INPUT_FILE = "ag.txt"  ->  "processed_ag.txt"
#
# The final result is stored in the variable `result`.
# =============================================================================

# ================= CONFIG =================
OUTPUT_DIR = Path(Temp_file_path_A)   # Base directory for input/output files
SEP = "|"                             # Delimiter used in the text files

INPUT_FILE = "data_Ltg.txt"           # Name of the input file to read
VALUE_COLUMNS = ["noa", "at", "wc"]            # Columns for which to compute the lags "<col>_lagk"

# Number of prior FiscalPeriods to compute lags for:
#   1 -> only t-1
#   2 -> t-1 and t-2
#   ...
MAX_LAG_PERIODS = 1

# NEW: period label column (Q1, Q2, A, etc.)
PERIOD_COL = "AnnPITValue_Period"
# ==========================================


# =============================================================================
# 1) LOAD INPUT FILE (NO ROWS DROPPED)
# =============================================================================
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

# Read raw data
df = pd.read_csv(path, sep=SEP)

# Required base columns for the PIT logic
required_base_cols = ["ID", "PIT Date", "HistCurrency", "FiscalPeriod", PERIOD_COL]
missing_base = [c for c in required_base_cols if c not in df.columns]
if missing_base:
    raise ValueError(f"{INPUT_FILE}: missing required base columns {missing_base}")

# Check that all requested value columns exist
missing_value_cols = [c for c in VALUE_COLUMNS if c not in df.columns]
if missing_value_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing value columns specified in VALUE_COLUMNS: {missing_value_cols}"
    )

# Type casting only, no row drops
df["PIT Date"] = pd.to_datetime(df["PIT Date"], errors="coerce")
df["FiscalPeriod"] = pd.to_numeric(df["FiscalPeriod"], errors="coerce")
df[PERIOD_COL] = df[PERIOD_COL].astype("string")

# Work on a copy to keep the original df untouched
result = df.copy()

print("Input dataset loaded into `result` (no rows dropped).")
print("Columns in result:")
print(list(result.columns))


# =============================================================================
# 2) COMPUTE LAG COLUMNS "<col>_lagk" FOR EACH COLUMN IN VALUE_COLUMNS
# =============================================================================
# NEW LOGIC:
# For each group (ID, HistCurrency) along the PIT timeline:
# - We maintain, for each (FiscalPeriod, AnnPITValue_Period) pair, the latest PIT
#   and the latest known values for all columns in VALUE_COLUMNS.
# - For a row with (FiscalPeriod = t, period = P), the lag column "<col>_lagk"
#   is defined as the last known value of <col> for (FiscalPeriod = t-k,
#   period = P) based on all PIT updates observed up to (and including) the
#   current PIT Date.
#
# If AnnPITValue_Period is missing, we fall back to using only FiscalPeriod
# (i.e., we treat the period label as None).

df_calc = result.copy()

def compute_lags_for_group(group: pd.DataFrame) -> pd.DataFrame:
    """
    Compute lagged values "<col>_lagk" for all columns in VALUE_COLUMNS
    within a single (ID, HistCurrency) group, for k=1..MAX_LAG_PERIODS,
    keyed by (FiscalPeriod, AnnPITValue_Period).
    """
    # Sort chronologically so updates are processed in correct temporal order
    group = group.sort_values(["PIT Date", "FiscalPeriod"], ascending=[True, True])

    # last_for_fp stores, for each (FiscalPeriod, PeriodLabel), the latest PIT
    # and the known values.
    # Structure:
    #   last_for_fp = {
    #       (fp, period_label): (last_pit, {col: last_value_for_col, ...})
    #   }
    last_for_fp = {}

    # For each value column and lag k, we collect lag values row by row
    lag_values = {
        col: {k: [] for k in range(1, MAX_LAG_PERIODS + 1)}
        for col in VALUE_COLUMNS
    }

    # Iterate through rows in PIT and FiscalPeriod order
    for _, row in group.iterrows():
        pit = row["PIT Date"]
        fp = row["FiscalPeriod"]
        period_raw = row.get(PERIOD_COL, pd.NA)
        period_label = None if pd.isna(period_raw) else str(period_raw)

        # 1) Determine lag values for each column in VALUE_COLUMNS and each k
        for col in VALUE_COLUMNS:
            for k in range(1, MAX_LAG_PERIODS + 1):
                lag_val = None
                if pd.notna(fp):
                    target_fp = fp - k
                    target_key = (target_fp, period_label)
                    info = last_for_fp.get(target_key)
                    if info is not None:
                        # info[1] is the dict of last known values for that (target_fp, period_label)
                        lag_val = info[1].get(col)
                lag_values[col][k].append(lag_val)

        # 2) Update our knowledge for the current (FiscalPeriod, period_label)
        if pd.notna(fp) and pd.notna(pit):
            key_curr = (fp, period_label)
            prev = last_for_fp.get(key_curr)

            # Previously stored values for this key (if any)
            prev_values = prev[1] if prev is not None else {}

            # Create a copy so we do not mutate the previous dict reference
            updated_values = dict(prev_values)

            # Update with any non-NaN values from this row
            for col in VALUE_COLUMNS:
                v = row[col]
                if pd.notna(v):
                    updated_values[col] = v

            # Only overwrite if this PIT is newer or equal to the previous PIT
            if prev is None or pit >= prev[0]:
                last_for_fp[key_curr] = (pit, updated_values)

    # 3) Attach lag columns "<col>_lagk" to the group DataFrame
    for col in VALUE_COLUMNS:
        for k in range(1, MAX_LAG_PERIODS + 1):
            group[f"{col}_lag{k}"] = lag_values[col][k]

    return group


# Apply the lag computation per (ID, HistCurrency) group
df_calc = (
    df_calc
    .groupby(["ID", "HistCurrency"], dropna=False, group_keys=False)
    .apply(compute_lags_for_group)
)

# Optionally convert FiscalPeriod back to Int64 (nullable integer)
df_calc["FiscalPeriod"] = pd.to_numeric(df_calc["FiscalPeriod"], errors="coerce").astype("Int64")

# Final result
result = df_calc

print("\nLag columns created and added to `result`:")
created_cols = []
for v in VALUE_COLUMNS:
    created_cols.append(v)
    for k in range(1, MAX_LAG_PERIODS + 1):
        created_cols.append(f"{v}_lag{k}")
print(created_cols)

# Optional preview
cols_to_show = (
    ["ID", "HistCurrency", "PIT Date", "FiscalPeriod", PERIOD_COL]
    + VALUE_COLUMNS
    + [f"{c}_lag{k}" for c in VALUE_COLUMNS for k in range(1, MAX_LAG_PERIODS + 1)]
)
print(result[cols_to_show].head(40))


# =============================================================================
# 3) SAVE RESULTING FILE AS "processed_<INPUT_FILE>" IN THE SAME FOLDER
# =============================================================================
output_filename = f"processed_{INPUT_FILE}"
output_path = OUTPUT_DIR / output_filename

result.to_csv(output_path, sep=SEP, index=False)

print(f"\nProcessed file saved to: {output_path}")


Input dataset loaded into `result` (no rows dropped).
Columns in result:
['ID', 'PIT Date', 'HistCurrency', 'FiscalPeriod', 'AnnPITValue_Period', 'at', 'ca', 'cce', 'cl', 'std', 'itp', 'da', 'ar', 'inv', 'oca', 'ppe', 'oa', 'ap', 'ocl', 'ol', 'noa', 'wc']


  .apply(compute_lags_for_group)
  .apply(compute_lags_for_group)



Lag columns created and added to `result`:
['noa', 'noa_lag1', 'at', 'at_lag1', 'wc', 'wc_lag1']
           ID HistCurrency   PIT Date  FiscalPeriod AnnPITValue_Period  \
0   C02500770          Ars 1995-12-29          1992                  A   
1   C02500770          Ars 1995-12-29          1992               <NA>   
2   C02500770          Ars 1995-12-29          1993                  A   
3   C02500770          Ars 1995-12-29          1993               <NA>   
4   C02500770          Ars 1995-12-29          1994                  A   
5   C02500770          Ars 1995-12-29          1994               <NA>   
6   C02500770          Ars 1996-05-03          1995                  A   
7   C02500770          Ars 1996-05-03          1995               <NA>   
8   C02500770          Ars 1998-07-03          1996                  A   
9   C02500770          Ars 1998-07-03          1996               <NA>   
10  C02500770          Ars 1998-07-03          1997                  A   
11  C02500770 

### Nca

In [66]:
from pathlib import Path
import pandas as pd

# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file from OUTPUT_DIR.
# 2) Computes, for each column in VALUE_COLUMNS, lagged columns "<col>_lagk"
#    for k = 1..MAX_LAG_PERIODS.
#    - For each (ID, HistCurrency) group, sorted by PIT Date and FiscalPeriod,
#      "<col>_lagk" is the most recent known value of <col> for
#      FiscalPeriod - k *and the same AnnPITValue_Period* based on all PIT
#      updates observed up to that row.
#      Example: for a row 2020 Q1, "<col>_lag1" is the value from 2019 Q1,
#      not from 2019 A or 2020 anything.
# 3) Keeps all rows (no deletions), only converts types and appends lag columns.
# 4) Saves the resulting DataFrame to the same folder as:
#       processed_<INPUT_FILE>
#    e.g. INPUT_FILE = "ag.txt"  ->  "processed_ag.txt"
#
# The final result is stored in the variable `result`.
# =============================================================================

# ================= CONFIG =================
OUTPUT_DIR = Path(Temp_file_path_A)   # Base directory for input/output files
SEP = "|"                             # Delimiter used in the text files

INPUT_FILE = "data_Nca.txt"           # Name of the input file to read
VALUE_COLUMNS = ["oa", "at"]            # Columns for which to compute the lags "<col>_lagk"

# Number of prior FiscalPeriods to compute lags for:
#   1 -> only t-1
#   2 -> t-1 and t-2
#   ...
MAX_LAG_PERIODS = 1

# NEW: period label column (Q1, Q2, A, etc.)
PERIOD_COL = "AnnPITValue_Period"
# ==========================================


# =============================================================================
# 1) LOAD INPUT FILE (NO ROWS DROPPED)
# =============================================================================
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

# Read raw data
df = pd.read_csv(path, sep=SEP)

# Required base columns for the PIT logic
required_base_cols = ["ID", "PIT Date", "HistCurrency", "FiscalPeriod", PERIOD_COL]
missing_base = [c for c in required_base_cols if c not in df.columns]
if missing_base:
    raise ValueError(f"{INPUT_FILE}: missing required base columns {missing_base}")

# Check that all requested value columns exist
missing_value_cols = [c for c in VALUE_COLUMNS if c not in df.columns]
if missing_value_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing value columns specified in VALUE_COLUMNS: {missing_value_cols}"
    )

# Type casting only, no row drops
df["PIT Date"] = pd.to_datetime(df["PIT Date"], errors="coerce")
df["FiscalPeriod"] = pd.to_numeric(df["FiscalPeriod"], errors="coerce")
df[PERIOD_COL] = df[PERIOD_COL].astype("string")

# Work on a copy to keep the original df untouched
result = df.copy()

print("Input dataset loaded into `result` (no rows dropped).")
print("Columns in result:")
print(list(result.columns))


# =============================================================================
# 2) COMPUTE LAG COLUMNS "<col>_lagk" FOR EACH COLUMN IN VALUE_COLUMNS
# =============================================================================
# NEW LOGIC:
# For each group (ID, HistCurrency) along the PIT timeline:
# - We maintain, for each (FiscalPeriod, AnnPITValue_Period) pair, the latest PIT
#   and the latest known values for all columns in VALUE_COLUMNS.
# - For a row with (FiscalPeriod = t, period = P), the lag column "<col>_lagk"
#   is defined as the last known value of <col> for (FiscalPeriod = t-k,
#   period = P) based on all PIT updates observed up to (and including) the
#   current PIT Date.
#
# If AnnPITValue_Period is missing, we fall back to using only FiscalPeriod
# (i.e., we treat the period label as None).

df_calc = result.copy()

def compute_lags_for_group(group: pd.DataFrame) -> pd.DataFrame:
    """
    Compute lagged values "<col>_lagk" for all columns in VALUE_COLUMNS
    within a single (ID, HistCurrency) group, for k=1..MAX_LAG_PERIODS,
    keyed by (FiscalPeriod, AnnPITValue_Period).
    """
    # Sort chronologically so updates are processed in correct temporal order
    group = group.sort_values(["PIT Date", "FiscalPeriod"], ascending=[True, True])

    # last_for_fp stores, for each (FiscalPeriod, PeriodLabel), the latest PIT
    # and the known values.
    # Structure:
    #   last_for_fp = {
    #       (fp, period_label): (last_pit, {col: last_value_for_col, ...})
    #   }
    last_for_fp = {}

    # For each value column and lag k, we collect lag values row by row
    lag_values = {
        col: {k: [] for k in range(1, MAX_LAG_PERIODS + 1)}
        for col in VALUE_COLUMNS
    }

    # Iterate through rows in PIT and FiscalPeriod order
    for _, row in group.iterrows():
        pit = row["PIT Date"]
        fp = row["FiscalPeriod"]
        period_raw = row.get(PERIOD_COL, pd.NA)
        period_label = None if pd.isna(period_raw) else str(period_raw)

        # 1) Determine lag values for each column in VALUE_COLUMNS and each k
        for col in VALUE_COLUMNS:
            for k in range(1, MAX_LAG_PERIODS + 1):
                lag_val = None
                if pd.notna(fp):
                    target_fp = fp - k
                    target_key = (target_fp, period_label)
                    info = last_for_fp.get(target_key)
                    if info is not None:
                        # info[1] is the dict of last known values for that (target_fp, period_label)
                        lag_val = info[1].get(col)
                lag_values[col][k].append(lag_val)

        # 2) Update our knowledge for the current (FiscalPeriod, period_label)
        if pd.notna(fp) and pd.notna(pit):
            key_curr = (fp, period_label)
            prev = last_for_fp.get(key_curr)

            # Previously stored values for this key (if any)
            prev_values = prev[1] if prev is not None else {}

            # Create a copy so we do not mutate the previous dict reference
            updated_values = dict(prev_values)

            # Update with any non-NaN values from this row
            for col in VALUE_COLUMNS:
                v = row[col]
                if pd.notna(v):
                    updated_values[col] = v

            # Only overwrite if this PIT is newer or equal to the previous PIT
            if prev is None or pit >= prev[0]:
                last_for_fp[key_curr] = (pit, updated_values)

    # 3) Attach lag columns "<col>_lagk" to the group DataFrame
    for col in VALUE_COLUMNS:
        for k in range(1, MAX_LAG_PERIODS + 1):
            group[f"{col}_lag{k}"] = lag_values[col][k]

    return group


# Apply the lag computation per (ID, HistCurrency) group
df_calc = (
    df_calc
    .groupby(["ID", "HistCurrency"], dropna=False, group_keys=False)
    .apply(compute_lags_for_group)
)

# Optionally convert FiscalPeriod back to Int64 (nullable integer)
df_calc["FiscalPeriod"] = pd.to_numeric(df_calc["FiscalPeriod"], errors="coerce").astype("Int64")

# Final result
result = df_calc

print("\nLag columns created and added to `result`:")
created_cols = []
for v in VALUE_COLUMNS:
    created_cols.append(v)
    for k in range(1, MAX_LAG_PERIODS + 1):
        created_cols.append(f"{v}_lag{k}")
print(created_cols)

# Optional preview
cols_to_show = (
    ["ID", "HistCurrency", "PIT Date", "FiscalPeriod", PERIOD_COL]
    + VALUE_COLUMNS
    + [f"{c}_lag{k}" for c in VALUE_COLUMNS for k in range(1, MAX_LAG_PERIODS + 1)]
)
print(result[cols_to_show].head(40))


# =============================================================================
# 3) SAVE RESULTING FILE AS "processed_<INPUT_FILE>" IN THE SAME FOLDER
# =============================================================================
output_filename = f"processed_{INPUT_FILE}"
output_path = OUTPUT_DIR / output_filename

result.to_csv(output_path, sep=SEP, index=False)

print(f"\nProcessed file saved to: {output_path}")


Input dataset loaded into `result` (no rows dropped).
Columns in result:
['ID', 'PIT Date', 'HistCurrency', 'FiscalPeriod', 'AnnPITValue_Period', 'IVAO_oi', 'IVAO_iac', 'IVAO_ltr', 'IVAO_isdfl', 'IVAO_uol', 'at', 'ca', 'lt', 'std', 'ltd', 'ivao', 'oa']


  .apply(compute_lags_for_group)
  .apply(compute_lags_for_group)



Lag columns created and added to `result`:
['oa', 'oa_lag1', 'at', 'at_lag1']
           ID HistCurrency   PIT Date  FiscalPeriod AnnPITValue_Period  \
0   C02500770          Ars 1995-12-29          1992                  A   
1   C02500770          Ars 1995-12-29          1993                  A   
2   C02500770          Ars 1995-12-29          1994                  A   
3   C02500770          Ars 1996-05-03          1995                  A   
4   C02500770          Ars 1998-07-03          1996                  A   
5   C02500770          Ars 1998-07-03          1997                  A   
6   C02500770          Ars 1999-10-01          1998                  A   
7   C02500770          Ars 2000-05-19          1999                  A   
8   C02500770          Ars 2000-05-26          1999                  A   
9   C02520200          Ars 1996-05-03          1987                  A   
10  C02520200          Ars 1996-05-03          1988                  A   
11  C02520200          Ars 1996-0

### Noa

In [67]:
from pathlib import Path
import pandas as pd

# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file from OUTPUT_DIR.
# 2) Computes, for each column in VALUE_COLUMNS, lagged columns "<col>_lagk"
#    for k = 1..MAX_LAG_PERIODS.
#    - For each (ID, HistCurrency) group, sorted by PIT Date and FiscalPeriod,
#      "<col>_lagk" is the most recent known value of <col> for
#      FiscalPeriod - k *and the same AnnPITValue_Period* based on all PIT
#      updates observed up to that row.
#      Example: for a row 2020 Q1, "<col>_lag1" is the value from 2019 Q1,
#      not from 2019 A or 2020 anything.
# 3) Keeps all rows (no deletions), only converts types and appends lag columns.
# 4) Saves the resulting DataFrame to the same folder as:
#       processed_<INPUT_FILE>
#    e.g. INPUT_FILE = "ag.txt"  ->  "processed_ag.txt"
#
# The final result is stored in the variable `result`.
# =============================================================================
# ================= CONFIG =================
OUTPUT_DIR = Path(Temp_file_path_A)   # Base directory for input/output files
SEP = "|"                             # Delimiter used in the text files

INPUT_FILE = "data_Noa.txt"           # Name of the input file to read
VALUE_COLUMNS = ["at"]            # Columns for which to compute the lags "<col>_lagk"

## Number of prior FiscalPeriods to compute lags for:
#   1 -> only t-1
#   2 -> t-1 and t-2
#   ...
MAX_LAG_PERIODS = 1

# NEW: period label column (Q1, Q2, A, etc.)
PERIOD_COL = "AnnPITValue_Period"
# ==========================================


# =============================================================================
# 1) LOAD INPUT FILE (NO ROWS DROPPED)
# =============================================================================
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

# Read raw data
df = pd.read_csv(path, sep=SEP)

# Required base columns for the PIT logic
required_base_cols = ["ID", "PIT Date", "HistCurrency", "FiscalPeriod", PERIOD_COL]
missing_base = [c for c in required_base_cols if c not in df.columns]
if missing_base:
    raise ValueError(f"{INPUT_FILE}: missing required base columns {missing_base}")

# Check that all requested value columns exist
missing_value_cols = [c for c in VALUE_COLUMNS if c not in df.columns]
if missing_value_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing value columns specified in VALUE_COLUMNS: {missing_value_cols}"
    )

# Type casting only, no row drops
df["PIT Date"] = pd.to_datetime(df["PIT Date"], errors="coerce")
df["FiscalPeriod"] = pd.to_numeric(df["FiscalPeriod"], errors="coerce")
df[PERIOD_COL] = df[PERIOD_COL].astype("string")

# Work on a copy to keep the original df untouched
result = df.copy()

print("Input dataset loaded into `result` (no rows dropped).")
print("Columns in result:")
print(list(result.columns))


# =============================================================================
# 2) COMPUTE LAG COLUMNS "<col>_lagk" FOR EACH COLUMN IN VALUE_COLUMNS
# =============================================================================
# NEW LOGIC:
# For each group (ID, HistCurrency) along the PIT timeline:
# - We maintain, for each (FiscalPeriod, AnnPITValue_Period) pair, the latest PIT
#   and the latest known values for all columns in VALUE_COLUMNS.
# - For a row with (FiscalPeriod = t, period = P), the lag column "<col>_lagk"
#   is defined as the last known value of <col> for (FiscalPeriod = t-k,
#   period = P) based on all PIT updates observed up to (and including) the
#   current PIT Date.
#
# If AnnPITValue_Period is missing, we fall back to using only FiscalPeriod
# (i.e., we treat the period label as None).

df_calc = result.copy()

def compute_lags_for_group(group: pd.DataFrame) -> pd.DataFrame:
    """
    Compute lagged values "<col>_lagk" for all columns in VALUE_COLUMNS
    within a single (ID, HistCurrency) group, for k=1..MAX_LAG_PERIODS,
    keyed by (FiscalPeriod, AnnPITValue_Period).
    """
    # Sort chronologically so updates are processed in correct temporal order
    group = group.sort_values(["PIT Date", "FiscalPeriod"], ascending=[True, True])

    # last_for_fp stores, for each (FiscalPeriod, PeriodLabel), the latest PIT
    # and the known values.
    # Structure:
    #   last_for_fp = {
    #       (fp, period_label): (last_pit, {col: last_value_for_col, ...})
    #   }
    last_for_fp = {}

    # For each value column and lag k, we collect lag values row by row
    lag_values = {
        col: {k: [] for k in range(1, MAX_LAG_PERIODS + 1)}
        for col in VALUE_COLUMNS
    }

    # Iterate through rows in PIT and FiscalPeriod order
    for _, row in group.iterrows():
        pit = row["PIT Date"]
        fp = row["FiscalPeriod"]
        period_raw = row.get(PERIOD_COL, pd.NA)
        period_label = None if pd.isna(period_raw) else str(period_raw)

        # 1) Determine lag values for each column in VALUE_COLUMNS and each k
        for col in VALUE_COLUMNS:
            for k in range(1, MAX_LAG_PERIODS + 1):
                lag_val = None
                if pd.notna(fp):
                    target_fp = fp - k
                    target_key = (target_fp, period_label)
                    info = last_for_fp.get(target_key)
                    if info is not None:
                        # info[1] is the dict of last known values for that (target_fp, period_label)
                        lag_val = info[1].get(col)
                lag_values[col][k].append(lag_val)

        # 2) Update our knowledge for the current (FiscalPeriod, period_label)
        if pd.notna(fp) and pd.notna(pit):
            key_curr = (fp, period_label)
            prev = last_for_fp.get(key_curr)

            # Previously stored values for this key (if any)
            prev_values = prev[1] if prev is not None else {}

            # Create a copy so we do not mutate the previous dict reference
            updated_values = dict(prev_values)

            # Update with any non-NaN values from this row
            for col in VALUE_COLUMNS:
                v = row[col]
                if pd.notna(v):
                    updated_values[col] = v

            # Only overwrite if this PIT is newer or equal to the previous PIT
            if prev is None or pit >= prev[0]:
                last_for_fp[key_curr] = (pit, updated_values)

    # 3) Attach lag columns "<col>_lagk" to the group DataFrame
    for col in VALUE_COLUMNS:
        for k in range(1, MAX_LAG_PERIODS + 1):
            group[f"{col}_lag{k}"] = lag_values[col][k]

    return group


# Apply the lag computation per (ID, HistCurrency) group
df_calc = (
    df_calc
    .groupby(["ID", "HistCurrency"], dropna=False, group_keys=False)
    .apply(compute_lags_for_group)
)

# Optionally convert FiscalPeriod back to Int64 (nullable integer)
df_calc["FiscalPeriod"] = pd.to_numeric(df_calc["FiscalPeriod"], errors="coerce").astype("Int64")

# Final result
result = df_calc

print("\nLag columns created and added to `result`:")
created_cols = []
for v in VALUE_COLUMNS:
    created_cols.append(v)
    for k in range(1, MAX_LAG_PERIODS + 1):
        created_cols.append(f"{v}_lag{k}")
print(created_cols)

# Optional preview
cols_to_show = (
    ["ID", "HistCurrency", "PIT Date", "FiscalPeriod", PERIOD_COL]
    + VALUE_COLUMNS
    + [f"{c}_lag{k}" for c in VALUE_COLUMNS for k in range(1, MAX_LAG_PERIODS + 1)]
)
print(result[cols_to_show].head(40))


# =============================================================================
# 3) SAVE RESULTING FILE AS "processed_<INPUT_FILE>" IN THE SAME FOLDER
# =============================================================================
output_filename = f"processed_{INPUT_FILE}"
output_path = OUTPUT_DIR / output_filename

result.to_csv(output_path, sep=SEP, index=False)

print(f"\nProcessed file saved to: {output_path}")


Input dataset loaded into `result` (no rows dropped).
Columns in result:
['ID', 'PIT Date', 'HistCurrency', 'FiscalPeriod', 'AnnPITValue_Period', 'at', 'cce', 'ltd', 'mi', 'ps', 'ce', 'oa', 'ol', 'noa']


  .apply(compute_lags_for_group)
  .apply(compute_lags_for_group)



Lag columns created and added to `result`:
['at', 'at_lag1']
           ID HistCurrency   PIT Date  FiscalPeriod AnnPITValue_Period  \
0   C02500770          Ars 1995-12-29          1992                  A   
1   C02500770          Ars 1995-12-29          1993                  A   
2   C02500770          Ars 1995-12-29          1994                  A   
3   C02500770          Ars 1996-05-03          1995                  A   
4   C02500770          Ars 1998-07-03          1996                  A   
5   C02500770          Ars 1998-07-03          1997                  A   
6   C02500770          Ars 1999-10-01          1998                  A   
7   C02500770          Ars 2000-05-19          1999                  A   
8   C02520200          Ars 1996-05-03          1987                  A   
9   C02520200          Ars 1996-05-03          1988                  A   
10  C02520200          Ars 1996-05-03          1989                  A   
11  C02520200          Ars 1996-05-03          199

### Nwc

In [68]:
from pathlib import Path
import pandas as pd

# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file from OUTPUT_DIR.
# 2) Computes, for each column in VALUE_COLUMNS, lagged columns "<col>_lagk"
#    for k = 1..MAX_LAG_PERIODS.
#    - For each (ID, HistCurrency) group, sorted by PIT Date and FiscalPeriod,
#      "<col>_lagk" is the most recent known value of <col> for
#      FiscalPeriod - k *and the same AnnPITValue_Period* based on all PIT
#      updates observed up to that row.
#      Example: for a row 2020 Q1, "<col>_lag1" is the value from 2019 Q1,
#      not from 2019 A or 2020 anything.
# 3) Keeps all rows (no deletions), only converts types and appends lag columns.
# 4) Saves the resulting DataFrame to the same folder as:
#       processed_<INPUT_FILE>
#    e.g. INPUT_FILE = "ag.txt"  ->  "processed_ag.txt"
#
# The final result is stored in the variable `result`.
# =============================================================================

# ================= CONFIG =================
OUTPUT_DIR = Path(Temp_file_path_A)   # Base directory for input/output files
SEP = "|"                             # Delimiter used in the text files

INPUT_FILE = "data_Nwc.txt"           # Name of the input file to read
VALUE_COLUMNS = ["nwc", "at"]            # Columns for which to compute the lags "<col>_lagk"

# Number of prior FiscalPeriods to compute lags for:
#   1 -> only t-1
#   2 -> t-1 and t-2
#   ...
MAX_LAG_PERIODS = 1

# NEW: period label column (Q1, Q2, A, etc.)
PERIOD_COL = "AnnPITValue_Period"
# ==========================================


# =============================================================================
# 1) LOAD INPUT FILE (NO ROWS DROPPED)
# =============================================================================
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

# Read raw data
df = pd.read_csv(path, sep=SEP)

# Required base columns for the PIT logic
required_base_cols = ["ID", "PIT Date", "HistCurrency", "FiscalPeriod", PERIOD_COL]
missing_base = [c for c in required_base_cols if c not in df.columns]
if missing_base:
    raise ValueError(f"{INPUT_FILE}: missing required base columns {missing_base}")

# Check that all requested value columns exist
missing_value_cols = [c for c in VALUE_COLUMNS if c not in df.columns]
if missing_value_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing value columns specified in VALUE_COLUMNS: {missing_value_cols}"
    )

# Type casting only, no row drops
df["PIT Date"] = pd.to_datetime(df["PIT Date"], errors="coerce")
df["FiscalPeriod"] = pd.to_numeric(df["FiscalPeriod"], errors="coerce")
df[PERIOD_COL] = df[PERIOD_COL].astype("string")

# Work on a copy to keep the original df untouched
result = df.copy()

print("Input dataset loaded into `result` (no rows dropped).")
print("Columns in result:")
print(list(result.columns))


# =============================================================================
# 2) COMPUTE LAG COLUMNS "<col>_lagk" FOR EACH COLUMN IN VALUE_COLUMNS
# =============================================================================
# NEW LOGIC:
# For each group (ID, HistCurrency) along the PIT timeline:
# - We maintain, for each (FiscalPeriod, AnnPITValue_Period) pair, the latest PIT
#   and the latest known values for all columns in VALUE_COLUMNS.
# - For a row with (FiscalPeriod = t, period = P), the lag column "<col>_lagk"
#   is defined as the last known value of <col> for (FiscalPeriod = t-k,
#   period = P) based on all PIT updates observed up to (and including) the
#   current PIT Date.
#
# If AnnPITValue_Period is missing, we fall back to using only FiscalPeriod
# (i.e., we treat the period label as None).

df_calc = result.copy()

def compute_lags_for_group(group: pd.DataFrame) -> pd.DataFrame:
    """
    Compute lagged values "<col>_lagk" for all columns in VALUE_COLUMNS
    within a single (ID, HistCurrency) group, for k=1..MAX_LAG_PERIODS,
    keyed by (FiscalPeriod, AnnPITValue_Period).
    """
    # Sort chronologically so updates are processed in correct temporal order
    group = group.sort_values(["PIT Date", "FiscalPeriod"], ascending=[True, True])

    # last_for_fp stores, for each (FiscalPeriod, PeriodLabel), the latest PIT
    # and the known values.
    # Structure:
    #   last_for_fp = {
    #       (fp, period_label): (last_pit, {col: last_value_for_col, ...})
    #   }
    last_for_fp = {}

    # For each value column and lag k, we collect lag values row by row
    lag_values = {
        col: {k: [] for k in range(1, MAX_LAG_PERIODS + 1)}
        for col in VALUE_COLUMNS
    }

    # Iterate through rows in PIT and FiscalPeriod order
    for _, row in group.iterrows():
        pit = row["PIT Date"]
        fp = row["FiscalPeriod"]
        period_raw = row.get(PERIOD_COL, pd.NA)
        period_label = None if pd.isna(period_raw) else str(period_raw)

        # 1) Determine lag values for each column in VALUE_COLUMNS and each k
        for col in VALUE_COLUMNS:
            for k in range(1, MAX_LAG_PERIODS + 1):
                lag_val = None
                if pd.notna(fp):
                    target_fp = fp - k
                    target_key = (target_fp, period_label)
                    info = last_for_fp.get(target_key)
                    if info is not None:
                        # info[1] is the dict of last known values for that (target_fp, period_label)
                        lag_val = info[1].get(col)
                lag_values[col][k].append(lag_val)

        # 2) Update our knowledge for the current (FiscalPeriod, period_label)
        if pd.notna(fp) and pd.notna(pit):
            key_curr = (fp, period_label)
            prev = last_for_fp.get(key_curr)

            # Previously stored values for this key (if any)
            prev_values = prev[1] if prev is not None else {}

            # Create a copy so we do not mutate the previous dict reference
            updated_values = dict(prev_values)

            # Update with any non-NaN values from this row
            for col in VALUE_COLUMNS:
                v = row[col]
                if pd.notna(v):
                    updated_values[col] = v

            # Only overwrite if this PIT is newer or equal to the previous PIT
            if prev is None or pit >= prev[0]:
                last_for_fp[key_curr] = (pit, updated_values)

    # 3) Attach lag columns "<col>_lagk" to the group DataFrame
    for col in VALUE_COLUMNS:
        for k in range(1, MAX_LAG_PERIODS + 1):
            group[f"{col}_lag{k}"] = lag_values[col][k]

    return group


# Apply the lag computation per (ID, HistCurrency) group
df_calc = (
    df_calc
    .groupby(["ID", "HistCurrency"], dropna=False, group_keys=False)
    .apply(compute_lags_for_group)
)

# Optionally convert FiscalPeriod back to Int64 (nullable integer)
df_calc["FiscalPeriod"] = pd.to_numeric(df_calc["FiscalPeriod"], errors="coerce").astype("Int64")

# Final result
result = df_calc

print("\nLag columns created and added to `result`:")
created_cols = []
for v in VALUE_COLUMNS:
    created_cols.append(v)
    for k in range(1, MAX_LAG_PERIODS + 1):
        created_cols.append(f"{v}_lag{k}")
print(created_cols)

# Optional preview
cols_to_show = (
    ["ID", "HistCurrency", "PIT Date", "FiscalPeriod", PERIOD_COL]
    + VALUE_COLUMNS
    + [f"{c}_lag{k}" for c in VALUE_COLUMNS for k in range(1, MAX_LAG_PERIODS + 1)]
)
print(result[cols_to_show].head(40))


# =============================================================================
# 3) SAVE RESULTING FILE AS "processed_<INPUT_FILE>" IN THE SAME FOLDER
# =============================================================================
output_filename = f"processed_{INPUT_FILE}"
output_path = OUTPUT_DIR / output_filename

result.to_csv(output_path, sep=SEP, index=False)

print(f"\nProcessed file saved to: {output_path}")


Input dataset loaded into `result` (no rows dropped).
Columns in result:
['ID', 'PIT Date', 'HistCurrency', 'FiscalPeriod', 'AnnPITValue_Period', 'at', 'ca', 'cce', 'cl', 'std', 'nwc']


  .apply(compute_lags_for_group)
  .apply(compute_lags_for_group)



Lag columns created and added to `result`:
['nwc', 'nwc_lag1', 'at', 'at_lag1']
           ID HistCurrency   PIT Date  FiscalPeriod AnnPITValue_Period  \
0   C02500770          Ars 1995-12-29          1992                  A   
1   C02500770          Ars 1995-12-29          1993                  A   
2   C02500770          Ars 1995-12-29          1994                  A   
3   C02500770          Ars 1996-05-03          1995                  A   
4   C02500770          Ars 1998-07-03          1996                  A   
5   C02500770          Ars 1998-07-03          1997                  A   
6   C02500770          Ars 1999-10-01          1998                  A   
7   C02500770          Ars 2000-05-19          1999                  A   
8   C02520200          Ars 1996-05-03          1987                  A   
9   C02520200          Ars 1996-05-03          1988                  A   
10  C02520200          Ars 1996-05-03          1989                  A   
11  C02520200          Ars 1996

### Osc

In [69]:
from pathlib import Path
import pandas as pd

# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file from OUTPUT_DIR.
# 2) Computes, for each column in VALUE_COLUMNS, lagged columns "<col>_lagk"
#    for k = 1..MAX_LAG_PERIODS.
#    - For each (ID, HistCurrency) group, sorted by PIT Date and FiscalPeriod,
#      "<col>_lagk" is the most recent known value of <col> for
#      FiscalPeriod - k *and the same AnnPITValue_Period* based on all PIT
#      updates observed up to that row.
#      Example: for a row 2020 Q1, "<col>_lag1" is the value from 2019 Q1,
#      not from 2019 A or 2020 anything.
# 3) Keeps all rows (no deletions), only converts types and appends lag columns.
# 4) Saves the resulting DataFrame to the same folder as:
#       processed_<INPUT_FILE>
#    e.g. INPUT_FILE = "ag.txt"  ->  "processed_ag.txt"
#
# The final result is stored in the variable `result`.
# =============================================================================

# ================= CONFIG =================
OUTPUT_DIR = Path(Temp_file_path_A)   # Base directory for input/output files
SEP = "|"                             # Delimiter used in the text files

INPUT_FILE = "data_Osc.txt"           # Name of the input file to read
VALUE_COLUMNS = ["ni_extra"]            # Columns for which to compute the lags "<col>_lagk"

# Number of prior FiscalPeriods to compute lags for:
#   1 -> only t-1
#   2 -> t-1 and t-2
#   ...
MAX_LAG_PERIODS = 1

# NEW: period label column (Q1, Q2, A, etc.)
PERIOD_COL = "AnnPITValue_Period"
# ==========================================


# =============================================================================
# 1) LOAD INPUT FILE (NO ROWS DROPPED)
# =============================================================================
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

# Read raw data
df = pd.read_csv(path, sep=SEP)

# Required base columns for the PIT logic
required_base_cols = ["ID", "PIT Date", "HistCurrency", "FiscalPeriod", PERIOD_COL]
missing_base = [c for c in required_base_cols if c not in df.columns]
if missing_base:
    raise ValueError(f"{INPUT_FILE}: missing required base columns {missing_base}")

# Check that all requested value columns exist
missing_value_cols = [c for c in VALUE_COLUMNS if c not in df.columns]
if missing_value_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing value columns specified in VALUE_COLUMNS: {missing_value_cols}"
    )

# Type casting only, no row drops
df["PIT Date"] = pd.to_datetime(df["PIT Date"], errors="coerce")
df["FiscalPeriod"] = pd.to_numeric(df["FiscalPeriod"], errors="coerce")
df[PERIOD_COL] = df[PERIOD_COL].astype("string")

# Work on a copy to keep the original df untouched
result = df.copy()

print("Input dataset loaded into `result` (no rows dropped).")
print("Columns in result:")
print(list(result.columns))


# =============================================================================
# 2) COMPUTE LAG COLUMNS "<col>_lagk" FOR EACH COLUMN IN VALUE_COLUMNS
# =============================================================================
# NEW LOGIC:
# For each group (ID, HistCurrency) along the PIT timeline:
# - We maintain, for each (FiscalPeriod, AnnPITValue_Period) pair, the latest PIT
#   and the latest known values for all columns in VALUE_COLUMNS.
# - For a row with (FiscalPeriod = t, period = P), the lag column "<col>_lagk"
#   is defined as the last known value of <col> for (FiscalPeriod = t-k,
#   period = P) based on all PIT updates observed up to (and including) the
#   current PIT Date.
#
# If AnnPITValue_Period is missing, we fall back to using only FiscalPeriod
# (i.e., we treat the period label as None).

df_calc = result.copy()

def compute_lags_for_group(group: pd.DataFrame) -> pd.DataFrame:
    """
    Compute lagged values "<col>_lagk" for all columns in VALUE_COLUMNS
    within a single (ID, HistCurrency) group, for k=1..MAX_LAG_PERIODS,
    keyed by (FiscalPeriod, AnnPITValue_Period).
    """
    # Sort chronologically so updates are processed in correct temporal order
    group = group.sort_values(["PIT Date", "FiscalPeriod"], ascending=[True, True])

    # last_for_fp stores, for each (FiscalPeriod, PeriodLabel), the latest PIT
    # and the known values.
    # Structure:
    #   last_for_fp = {
    #       (fp, period_label): (last_pit, {col: last_value_for_col, ...})
    #   }
    last_for_fp = {}

    # For each value column and lag k, we collect lag values row by row
    lag_values = {
        col: {k: [] for k in range(1, MAX_LAG_PERIODS + 1)}
        for col in VALUE_COLUMNS
    }

    # Iterate through rows in PIT and FiscalPeriod order
    for _, row in group.iterrows():
        pit = row["PIT Date"]
        fp = row["FiscalPeriod"]
        period_raw = row.get(PERIOD_COL, pd.NA)
        period_label = None if pd.isna(period_raw) else str(period_raw)

        # 1) Determine lag values for each column in VALUE_COLUMNS and each k
        for col in VALUE_COLUMNS:
            for k in range(1, MAX_LAG_PERIODS + 1):
                lag_val = None
                if pd.notna(fp):
                    target_fp = fp - k
                    target_key = (target_fp, period_label)
                    info = last_for_fp.get(target_key)
                    if info is not None:
                        # info[1] is the dict of last known values for that (target_fp, period_label)
                        lag_val = info[1].get(col)
                lag_values[col][k].append(lag_val)

        # 2) Update our knowledge for the current (FiscalPeriod, period_label)
        if pd.notna(fp) and pd.notna(pit):
            key_curr = (fp, period_label)
            prev = last_for_fp.get(key_curr)

            # Previously stored values for this key (if any)
            prev_values = prev[1] if prev is not None else {}

            # Create a copy so we do not mutate the previous dict reference
            updated_values = dict(prev_values)

            # Update with any non-NaN values from this row
            for col in VALUE_COLUMNS:
                v = row[col]
                if pd.notna(v):
                    updated_values[col] = v

            # Only overwrite if this PIT is newer or equal to the previous PIT
            if prev is None or pit >= prev[0]:
                last_for_fp[key_curr] = (pit, updated_values)

    # 3) Attach lag columns "<col>_lagk" to the group DataFrame
    for col in VALUE_COLUMNS:
        for k in range(1, MAX_LAG_PERIODS + 1):
            group[f"{col}_lag{k}"] = lag_values[col][k]

    return group


# Apply the lag computation per (ID, HistCurrency) group
df_calc = (
    df_calc
    .groupby(["ID", "HistCurrency"], dropna=False, group_keys=False)
    .apply(compute_lags_for_group)
)

# Optionally convert FiscalPeriod back to Int64 (nullable integer)
df_calc["FiscalPeriod"] = pd.to_numeric(df_calc["FiscalPeriod"], errors="coerce").astype("Int64")

# Final result
result = df_calc

print("\nLag columns created and added to `result`:")
created_cols = []
for v in VALUE_COLUMNS:
    created_cols.append(v)
    for k in range(1, MAX_LAG_PERIODS + 1):
        created_cols.append(f"{v}_lag{k}")
print(created_cols)

# Optional preview
cols_to_show = (
    ["ID", "HistCurrency", "PIT Date", "FiscalPeriod", PERIOD_COL]
    + VALUE_COLUMNS
    + [f"{c}_lag{k}" for c in VALUE_COLUMNS for k in range(1, MAX_LAG_PERIODS + 1)]
)
print(result[cols_to_show].head(40))


# =============================================================================
# 3) SAVE RESULTING FILE AS "processed_<INPUT_FILE>" IN THE SAME FOLDER
# =============================================================================
output_filename = f"processed_{INPUT_FILE}"
output_path = OUTPUT_DIR / output_filename

result.to_csv(output_path, sep=SEP, index=False)

print(f"\nProcessed file saved to: {output_path}")


Input dataset loaded into `result` (no rows dropped).
Columns in result:
['ID', 'PIT Date', 'HistCurrency', 'FiscalPeriod', 'AnnPITValue_Period', 'at', 'tl', 'ca', 'cl', 'ni_extra', 'IFO_ffo', 'IFO_ei', 'IFO_dofa', 'IFO_ffooa', 'ifo']


  .apply(compute_lags_for_group)
  .apply(compute_lags_for_group)



Lag columns created and added to `result`:
['ni_extra', 'ni_extra_lag1']
           ID HistCurrency   PIT Date  FiscalPeriod AnnPITValue_Period  \
0   C02500770          Ars 1995-12-29          1992                  A   
1   C02500770          Ars 1995-12-29          1992               <NA>   
2   C02500770          Ars 1995-12-29          1993                  A   
3   C02500770          Ars 1995-12-29          1993               <NA>   
4   C02500770          Ars 1995-12-29          1994                  A   
5   C02500770          Ars 1995-12-29          1994               <NA>   
6   C02500770          Ars 1996-05-03          1995                  A   
7   C02500770          Ars 1996-05-03          1995               <NA>   
8   C02500770          Ars 1998-07-03          1996                  A   
9   C02500770          Ars 1998-07-03          1996               <NA>   
11  C02500770          Ars 1998-07-03          1997                  A   
12  C02500770          Ars 1998-07-03 

### Pro

In [70]:
from pathlib import Path
import pandas as pd

# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file from OUTPUT_DIR.
# 2) Computes, for each column in VALUE_COLUMNS, lagged columns "<col>_lagk"
#    for k = 1..MAX_LAG_PERIODS.
#    - For each (ID, HistCurrency) group, sorted by PIT Date and FiscalPeriod,
#      "<col>_lagk" is the most recent known value of <col> for
#      FiscalPeriod - k *and the same AnnPITValue_Period* based on all PIT
#      updates observed up to that row.
#      Example: for a row 2020 Q1, "<col>_lag1" is the value from 2019 Q1,
#      not from 2019 A or 2020 anything.
# 3) Keeps all rows (no deletions), only converts types and appends lag columns.
# 4) Saves the resulting DataFrame to the same folder as:
#       processed_<INPUT_FILE>
#    e.g. INPUT_FILE = "ag.txt"  ->  "processed_ag.txt"
#
# The final result is stored in the variable `result`.
# =============================================================================

# ================= CONFIG =================
OUTPUT_DIR = Path(Temp_file_path_A)   # Base directory for input/output files
SEP = "|"                             # Delimiter used in the text files

INPUT_FILE = "data_Pro.txt"           # Name of the input file to read
VALUE_COLUMNS = ["at"]            # Columns for which to compute the lags "<col>_lagk"

# Number of prior FiscalPeriods to compute lags for:
#   1 -> only t-1
#   2 -> t-1 and t-2
#   ...
MAX_LAG_PERIODS = 1

# NEW: period label column (Q1, Q2, A, etc.)
PERIOD_COL = "AnnPITValue_Period"
# ==========================================


# =============================================================================
# 1) LOAD INPUT FILE (NO ROWS DROPPED)
# =============================================================================
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

# Read raw data
df = pd.read_csv(path, sep=SEP)

# Required base columns for the PIT logic
required_base_cols = ["ID", "PIT Date", "HistCurrency", "FiscalPeriod", PERIOD_COL]
missing_base = [c for c in required_base_cols if c not in df.columns]
if missing_base:
    raise ValueError(f"{INPUT_FILE}: missing required base columns {missing_base}")

# Check that all requested value columns exist
missing_value_cols = [c for c in VALUE_COLUMNS if c not in df.columns]
if missing_value_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing value columns specified in VALUE_COLUMNS: {missing_value_cols}"
    )

# Type casting only, no row drops
df["PIT Date"] = pd.to_datetime(df["PIT Date"], errors="coerce")
df["FiscalPeriod"] = pd.to_numeric(df["FiscalPeriod"], errors="coerce")
df[PERIOD_COL] = df[PERIOD_COL].astype("string")

# Work on a copy to keep the original df untouched
result = df.copy()

print("Input dataset loaded into `result` (no rows dropped).")
print("Columns in result:")
print(list(result.columns))


# =============================================================================
# 2) COMPUTE LAG COLUMNS "<col>_lagk" FOR EACH COLUMN IN VALUE_COLUMNS
# =============================================================================
# NEW LOGIC:
# For each group (ID, HistCurrency) along the PIT timeline:
# - We maintain, for each (FiscalPeriod, AnnPITValue_Period) pair, the latest PIT
#   and the latest known values for all columns in VALUE_COLUMNS.
# - For a row with (FiscalPeriod = t, period = P), the lag column "<col>_lagk"
#   is defined as the last known value of <col> for (FiscalPeriod = t-k,
#   period = P) based on all PIT updates observed up to (and including) the
#   current PIT Date.
#
# If AnnPITValue_Period is missing, we fall back to using only FiscalPeriod
# (i.e., we treat the period label as None).

df_calc = result.copy()

def compute_lags_for_group(group: pd.DataFrame) -> pd.DataFrame:
    """
    Compute lagged values "<col>_lagk" for all columns in VALUE_COLUMNS
    within a single (ID, HistCurrency) group, for k=1..MAX_LAG_PERIODS,
    keyed by (FiscalPeriod, AnnPITValue_Period).
    """
    # Sort chronologically so updates are processed in correct temporal order
    group = group.sort_values(["PIT Date", "FiscalPeriod"], ascending=[True, True])

    # last_for_fp stores, for each (FiscalPeriod, PeriodLabel), the latest PIT
    # and the known values.
    # Structure:
    #   last_for_fp = {
    #       (fp, period_label): (last_pit, {col: last_value_for_col, ...})
    #   }
    last_for_fp = {}

    # For each value column and lag k, we collect lag values row by row
    lag_values = {
        col: {k: [] for k in range(1, MAX_LAG_PERIODS + 1)}
        for col in VALUE_COLUMNS
    }

    # Iterate through rows in PIT and FiscalPeriod order
    for _, row in group.iterrows():
        pit = row["PIT Date"]
        fp = row["FiscalPeriod"]
        period_raw = row.get(PERIOD_COL, pd.NA)
        period_label = None if pd.isna(period_raw) else str(period_raw)

        # 1) Determine lag values for each column in VALUE_COLUMNS and each k
        for col in VALUE_COLUMNS:
            for k in range(1, MAX_LAG_PERIODS + 1):
                lag_val = None
                if pd.notna(fp):
                    target_fp = fp - k
                    target_key = (target_fp, period_label)
                    info = last_for_fp.get(target_key)
                    if info is not None:
                        # info[1] is the dict of last known values for that (target_fp, period_label)
                        lag_val = info[1].get(col)
                lag_values[col][k].append(lag_val)

        # 2) Update our knowledge for the current (FiscalPeriod, period_label)
        if pd.notna(fp) and pd.notna(pit):
            key_curr = (fp, period_label)
            prev = last_for_fp.get(key_curr)

            # Previously stored values for this key (if any)
            prev_values = prev[1] if prev is not None else {}

            # Create a copy so we do not mutate the previous dict reference
            updated_values = dict(prev_values)

            # Update with any non-NaN values from this row
            for col in VALUE_COLUMNS:
                v = row[col]
                if pd.notna(v):
                    updated_values[col] = v

            # Only overwrite if this PIT is newer or equal to the previous PIT
            if prev is None or pit >= prev[0]:
                last_for_fp[key_curr] = (pit, updated_values)

    # 3) Attach lag columns "<col>_lagk" to the group DataFrame
    for col in VALUE_COLUMNS:
        for k in range(1, MAX_LAG_PERIODS + 1):
            group[f"{col}_lag{k}"] = lag_values[col][k]

    return group


# Apply the lag computation per (ID, HistCurrency) group
df_calc = (
    df_calc
    .groupby(["ID", "HistCurrency"], dropna=False, group_keys=False)
    .apply(compute_lags_for_group)
)

# Optionally convert FiscalPeriod back to Int64 (nullable integer)
df_calc["FiscalPeriod"] = pd.to_numeric(df_calc["FiscalPeriod"], errors="coerce").astype("Int64")

# Final result
result = df_calc

print("\nLag columns created and added to `result`:")
created_cols = []
for v in VALUE_COLUMNS:
    created_cols.append(v)
    for k in range(1, MAX_LAG_PERIODS + 1):
        created_cols.append(f"{v}_lag{k}")
print(created_cols)

# Optional preview
cols_to_show = (
    ["ID", "HistCurrency", "PIT Date", "FiscalPeriod", PERIOD_COL]
    + VALUE_COLUMNS
    + [f"{c}_lag{k}" for c in VALUE_COLUMNS for k in range(1, MAX_LAG_PERIODS + 1)]
)
print(result[cols_to_show].head(40))


# =============================================================================
# 3) SAVE RESULTING FILE AS "processed_<INPUT_FILE>" IN THE SAME FOLDER
# =============================================================================
output_filename = f"processed_{INPUT_FILE}"
output_path = OUTPUT_DIR / output_filename

result.to_csv(output_path, sep=SEP, index=False)

print(f"\nProcessed file saved to: {output_path}")


Input dataset loaded into `result` (no rows dropped).
Columns in result:
['ID', 'PIT Date', 'HistCurrency', 'FiscalPeriod', 'AnnPITValue_Period', 'ni_eps', 'at']


  .apply(compute_lags_for_group)
  .apply(compute_lags_for_group)



Lag columns created and added to `result`:
['at', 'at_lag1']
           ID HistCurrency   PIT Date  FiscalPeriod AnnPITValue_Period  \
0   C02500770          Ars 1995-12-29          1992                  A   
1   C02500770          Ars 1995-12-29          1992               <NA>   
2   C02500770          Ars 1995-12-29          1993                  A   
3   C02500770          Ars 1995-12-29          1993               <NA>   
4   C02500770          Ars 1995-12-29          1994                  A   
5   C02500770          Ars 1995-12-29          1994               <NA>   
6   C02500770          Ars 1996-05-03          1995                  A   
7   C02500770          Ars 1996-05-03          1995               <NA>   
8   C02500770          Ars 1998-07-03          1996                  A   
9   C02500770          Ars 1998-07-03          1996               <NA>   
10  C02500770          Ars 1998-07-03          1997                  A   
11  C02500770          Ars 1998-07-03          199

### Rs (Yearly Adaption)

In [71]:
from pathlib import Path
import pandas as pd

# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file from OUTPUT_DIR.
# 2) Computes, for each column in VALUE_COLUMNS, lagged columns "<col>_lagk"
#    for k = 1..MAX_LAG_PERIODS.
#    - For each (ID, HistCurrency) group, sorted by PIT Date and FiscalPeriod,
#      "<col>_lagk" is the most recent known value of <col> for
#      FiscalPeriod - k *and the same AnnPITValue_Period* based on all PIT
#      updates observed up to that row.
#      Example: for a row 2020 Q1, "rev_lag1" comes from 2019 Q1, not 2019 A.
# 3) Keeps all rows (no deletions), only converts types and appends lag columns.
# 4) Saves the resulting DataFrame to the same folder as:
#       processed_<INPUT_FILE>
# =============================================================================

# ================= CONFIG =================
OUTPUT_DIR = Path(Temp_file_path_A)   # Base directory for input/output files
SEP = "|"                             # Delimiter used in the text files

INPUT_FILE = "data_Rs.txt"            # Name of the input file to read
VALUE_COLUMNS = ["rev"]               # Columns for which to compute the lags "<col>_lagk"

# Number of prior FiscalPeriods to compute lags for:
#   1 -> only t-1
#   2 -> t-1 and t-2
#   ...
MAX_LAG_PERIODS = 4

# Period label column (Q1, Q2, A, etc.)
PERIOD_COL = "AnnPITValue_Period"
# ==========================================


# =============================================================================
# 1) LOAD INPUT FILE (NO ROWS DROPPED)
# =============================================================================
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

# Read raw data
df = pd.read_csv(path, sep=SEP)

# Required base columns for the PIT logic
required_base_cols = ["ID", "PIT Date", "HistCurrency", "FiscalPeriod", PERIOD_COL]
missing_base = [c for c in required_base_cols if c not in df.columns]
if missing_base:
    raise ValueError(f"{INPUT_FILE}: missing required base columns {missing_base}")

# Check that all requested value columns exist
missing_value_cols = [c for c in VALUE_COLUMNS if c not in df.columns]
if missing_value_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing value columns specified in VALUE_COLUMNS: {missing_value_cols}"
    )

# Type casting only, no row drops
df["PIT Date"] = pd.to_datetime(df["PIT Date"], errors="coerce")
df["FiscalPeriod"] = pd.to_numeric(df["FiscalPeriod"], errors="coerce")
df[PERIOD_COL] = df[PERIOD_COL].astype("string")

# Work on a copy to keep the original df untouched
result = df.copy()

print("Input dataset loaded into `result` (no rows dropped).")
print("Columns in result:")
print(list(result.columns))


# =============================================================================
# 2) COMPUTE LAG COLUMNS "<col>_lagk" FOR EACH COLUMN IN VALUE_COLUMNS
# =============================================================================
# New logic:
# For each group (ID, HistCurrency) along the PIT timeline:
# - We maintain, for each (FiscalPeriod, AnnPITValue_Period) pair, the latest PIT
#   and the latest known values for all columns in VALUE_COLUMNS.
# - For a row with (FP = t, period = P), "<col>_lagk" comes from (FP = t-k, period = P)
#   based on all PIT updates up to that row.
#
# If AnnPITValue_Period is missing, we treat the period label as None and thus
# fall back to the old "by FiscalPeriod only" behaviour for that row.

df_calc = result.copy()

def compute_lags_for_group(group: pd.DataFrame) -> pd.DataFrame:
    """
    Compute lagged values "<col>_lagk" for all columns in VALUE_COLUMNS
    within a single (ID, HistCurrency) group, for k=1..MAX_LAG_PERIODS,
    keyed by (FiscalPeriod, AnnPITValue_Period).
    """
    # Sort chronologically so updates are processed in correct temporal order
    group = group.sort_values(["PIT Date", "FiscalPeriod"], ascending=[True, True])

    # last_for_fp stores, for each (FiscalPeriod, PeriodLabel), the latest PIT
    # and the known values.
    #   last_for_fp = {
    #       (fp, period_label): (last_pit, {col: last_value_for_col, ...})
    #   }
    last_for_fp = {}

    # For each value column and lag k, collect lag values row by row
    lag_values = {
        col: {k: [] for k in range(1, MAX_LAG_PERIODS + 1)}
        for col in VALUE_COLUMNS
    }

    # Iterate through rows in PIT and FiscalPeriod order
    for _, row in group.iterrows():
        pit = row["PIT Date"]
        fp = row["FiscalPeriod"]
        period_raw = row.get(PERIOD_COL, pd.NA)
        period_label = None if pd.isna(period_raw) else str(period_raw)

        # 1) Determine lag values for each column and each k
        for col in VALUE_COLUMNS:
            for k in range(1, MAX_LAG_PERIODS + 1):
                lag_val = None
                if pd.notna(fp):
                    target_fp = fp - k
                    target_key = (target_fp, period_label)
                    info = last_for_fp.get(target_key)
                    if info is not None:
                        # info[1] is the dict of last known values for that (target_fp, period_label)
                        lag_val = info[1].get(col)
                lag_values[col][k].append(lag_val)

        # 2) Update our knowledge for the current (FiscalPeriod, period_label)
        if pd.notna(fp) and pd.notna(pit):
            key_curr = (fp, period_label)
            prev = last_for_fp.get(key_curr)

            # Previously stored values for this key (if any)
            prev_values = prev[1] if prev is not None else {}

            # Copy so we don't mutate previous dict reference
            updated_values = dict(prev_values)

            # Update with any non-NaN values from this row
            for col in VALUE_COLUMNS:
                v = row[col]
                if pd.notna(v):
                    updated_values[col] = v

            # Only overwrite if this PIT is newer or equal to the previous PIT
            if prev is None or pit >= prev[0]:
                last_for_fp[key_curr] = (pit, updated_values)

    # 3) Attach lag columns "<col>_lagk" to the group DataFrame
    for col in VALUE_COLUMNS:
        for k in range(1, MAX_LAG_PERIODS + 1):
            group[f"{col}_lag{k}"] = lag_values[col][k]

    return group


# Apply the lag computation per (ID, HistCurrency) group
df_calc = (
    df_calc
    .groupby(["ID", "HistCurrency"], dropna=False, group_keys=False)
    .apply(compute_lags_for_group)
)

# Optionally convert FiscalPeriod back to Int64 (nullable integer)
df_calc["FiscalPeriod"] = pd.to_numeric(df_calc["FiscalPeriod"], errors="coerce").astype("Int64")

# Final result
result = df_calc

print("\nLag columns created and added to `result`:")
created_cols = []
for v in VALUE_COLUMNS:
    created_cols.append(v)
    for k in range(1, MAX_LAG_PERIODS + 1):
        created_cols.append(f"{v}_lag{k}")
print(created_cols)

# Optional preview
cols_to_show = (
    ["ID", "HistCurrency", "PIT Date", "FiscalPeriod", PERIOD_COL]
    + VALUE_COLUMNS
    + [f"{c}_lag{k}" for c in VALUE_COLUMNS for k in range(1, MAX_LAG_PERIODS + 1)]
)
print(result[cols_to_show].head(40))


# =============================================================================
# 3) SAVE RESULTING FILE AS "processed_<INPUT_FILE>" IN THE SAME FOLDER
# =============================================================================
output_filename = f"processed_{INPUT_FILE}"
output_path = OUTPUT_DIR / output_filename

result.to_csv(output_path, sep=SEP, index=False)

print(f"\nProcessed file saved to: {output_path}")


Input dataset loaded into `result` (no rows dropped).
Columns in result:
['ID', 'PIT Date', 'HistCurrency', 'FiscalPeriod', 'AnnPITValue_Period', 'rev']


  .apply(compute_lags_for_group)
  .apply(compute_lags_for_group)



Lag columns created and added to `result`:
['rev', 'rev_lag1', 'rev_lag2', 'rev_lag3', 'rev_lag4']
           ID HistCurrency   PIT Date  FiscalPeriod AnnPITValue_Period  \
0   C02500770          Ars 1995-12-29          1992               <NA>   
1   C02500770          Ars 1995-12-29          1993               <NA>   
2   C02500770          Ars 1995-12-29          1994               <NA>   
3   C02500770          Ars 1996-05-03          1995               <NA>   
4   C02500770          Ars 1998-07-03          1996               <NA>   
5   C02500770          Ars 1998-07-03          1997               <NA>   
6   C02500770          Ars 1999-10-01          1998               <NA>   
7   C02500770          Ars 2000-05-19          1999               <NA>   
8   C02500770          Ars 2000-05-26          1999               <NA>   
9   C02520200          Ars 1996-05-03          1989               <NA>   
10  C02520200          Ars 1996-05-03          1990               <NA>   
11  C0252020

### Sg

In [72]:
from pathlib import Path
import pandas as pd

# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file from OUTPUT_DIR.
# 2) Computes, for each column in VALUE_COLUMNS, lagged columns "<col>_lagk"
#    for k = 1..MAX_LAG_PERIODS.
#    - For each (ID, HistCurrency) group, sorted by PIT Date and FiscalPeriod,
#      "<col>_lagk" is the most recent known value of <col> for
#      FiscalPeriod - k *and the same AnnPITValue_Period* based on all PIT
#      updates observed up to that row.
#      Example: for a row 2020 Q1, "<col>_lag1" is the value from 2019 Q1,
#      not from 2019 A or 2020 anything.
# 3) Keeps all rows (no deletions), only converts types and appends lag columns.
# 4) Saves the resulting DataFrame to the same folder as:
#       processed_<INPUT_FILE>
#    e.g. INPUT_FILE = "ag.txt"  ->  "processed_ag.txt"
#
# The final result is stored in the variable `result`.
# =============================================================================

# ================= CONFIG =================
OUTPUT_DIR = Path(Temp_file_path_A)   # Base directory for input/output files
SEP = "|"                             # Delimiter used in the text files

INPUT_FILE = "data_Sg.txt"           # Name of the input file to read
VALUE_COLUMNS = ["be"]            # Columns for which to compute the lags "<col>_lagk"

# Number of prior FiscalPeriods to compute lags for:
#   1 -> only t-1
#   2 -> t-1 and t-2
#   ...
MAX_LAG_PERIODS = 1

# NEW: period label column (Q1, Q2, A, etc.)
PERIOD_COL = "AnnPITValue_Period"
# ==========================================


# =============================================================================
# 1) LOAD INPUT FILE (NO ROWS DROPPED)
# =============================================================================
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

# Read raw data
df = pd.read_csv(path, sep=SEP)

# Required base columns for the PIT logic
required_base_cols = ["ID", "PIT Date", "HistCurrency", "FiscalPeriod", PERIOD_COL]
missing_base = [c for c in required_base_cols if c not in df.columns]
if missing_base:
    raise ValueError(f"{INPUT_FILE}: missing required base columns {missing_base}")

# Check that all requested value columns exist
missing_value_cols = [c for c in VALUE_COLUMNS if c not in df.columns]
if missing_value_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing value columns specified in VALUE_COLUMNS: {missing_value_cols}"
    )

# Type casting only, no row drops
df["PIT Date"] = pd.to_datetime(df["PIT Date"], errors="coerce")
df["FiscalPeriod"] = pd.to_numeric(df["FiscalPeriod"], errors="coerce")
df[PERIOD_COL] = df[PERIOD_COL].astype("string")

# Work on a copy to keep the original df untouched
result = df.copy()

print("Input dataset loaded into `result` (no rows dropped).")
print("Columns in result:")
print(list(result.columns))


# =============================================================================
# 2) COMPUTE LAG COLUMNS "<col>_lagk" FOR EACH COLUMN IN VALUE_COLUMNS
# =============================================================================
# NEW LOGIC:
# For each group (ID, HistCurrency) along the PIT timeline:
# - We maintain, for each (FiscalPeriod, AnnPITValue_Period) pair, the latest PIT
#   and the latest known values for all columns in VALUE_COLUMNS.
# - For a row with (FiscalPeriod = t, period = P), the lag column "<col>_lagk"
#   is defined as the last known value of <col> for (FiscalPeriod = t-k,
#   period = P) based on all PIT updates observed up to (and including) the
#   current PIT Date.
#
# If AnnPITValue_Period is missing, we fall back to using only FiscalPeriod
# (i.e., we treat the period label as None).

df_calc = result.copy()

def compute_lags_for_group(group: pd.DataFrame) -> pd.DataFrame:
    """
    Compute lagged values "<col>_lagk" for all columns in VALUE_COLUMNS
    within a single (ID, HistCurrency) group, for k=1..MAX_LAG_PERIODS,
    keyed by (FiscalPeriod, AnnPITValue_Period).
    """
    # Sort chronologically so updates are processed in correct temporal order
    group = group.sort_values(["PIT Date", "FiscalPeriod"], ascending=[True, True])

    # last_for_fp stores, for each (FiscalPeriod, PeriodLabel), the latest PIT
    # and the known values.
    # Structure:
    #   last_for_fp = {
    #       (fp, period_label): (last_pit, {col: last_value_for_col, ...})
    #   }
    last_for_fp = {}

    # For each value column and lag k, we collect lag values row by row
    lag_values = {
        col: {k: [] for k in range(1, MAX_LAG_PERIODS + 1)}
        for col in VALUE_COLUMNS
    }

    # Iterate through rows in PIT and FiscalPeriod order
    for _, row in group.iterrows():
        pit = row["PIT Date"]
        fp = row["FiscalPeriod"]
        period_raw = row.get(PERIOD_COL, pd.NA)
        period_label = None if pd.isna(period_raw) else str(period_raw)

        # 1) Determine lag values for each column in VALUE_COLUMNS and each k
        for col in VALUE_COLUMNS:
            for k in range(1, MAX_LAG_PERIODS + 1):
                lag_val = None
                if pd.notna(fp):
                    target_fp = fp - k
                    target_key = (target_fp, period_label)
                    info = last_for_fp.get(target_key)
                    if info is not None:
                        # info[1] is the dict of last known values for that (target_fp, period_label)
                        lag_val = info[1].get(col)
                lag_values[col][k].append(lag_val)

        # 2) Update our knowledge for the current (FiscalPeriod, period_label)
        if pd.notna(fp) and pd.notna(pit):
            key_curr = (fp, period_label)
            prev = last_for_fp.get(key_curr)

            # Previously stored values for this key (if any)
            prev_values = prev[1] if prev is not None else {}

            # Create a copy so we do not mutate the previous dict reference
            updated_values = dict(prev_values)

            # Update with any non-NaN values from this row
            for col in VALUE_COLUMNS:
                v = row[col]
                if pd.notna(v):
                    updated_values[col] = v

            # Only overwrite if this PIT is newer or equal to the previous PIT
            if prev is None or pit >= prev[0]:
                last_for_fp[key_curr] = (pit, updated_values)

    # 3) Attach lag columns "<col>_lagk" to the group DataFrame
    for col in VALUE_COLUMNS:
        for k in range(1, MAX_LAG_PERIODS + 1):
            group[f"{col}_lag{k}"] = lag_values[col][k]

    return group


# Apply the lag computation per (ID, HistCurrency) group
df_calc = (
    df_calc
    .groupby(["ID", "HistCurrency"], dropna=False, group_keys=False)
    .apply(compute_lags_for_group)
)

# Optionally convert FiscalPeriod back to Int64 (nullable integer)
df_calc["FiscalPeriod"] = pd.to_numeric(df_calc["FiscalPeriod"], errors="coerce").astype("Int64")

# Final result
result = df_calc

print("\nLag columns created and added to `result`:")
created_cols = []
for v in VALUE_COLUMNS:
    created_cols.append(v)
    for k in range(1, MAX_LAG_PERIODS + 1):
        created_cols.append(f"{v}_lag{k}")
print(created_cols)

# Optional preview
cols_to_show = (
    ["ID", "HistCurrency", "PIT Date", "FiscalPeriod", PERIOD_COL]
    + VALUE_COLUMNS
    + [f"{c}_lag{k}" for c in VALUE_COLUMNS for k in range(1, MAX_LAG_PERIODS + 1)]
)
print(result[cols_to_show].head(40))


# =============================================================================
# 3) SAVE RESULTING FILE AS "processed_<INPUT_FILE>" IN THE SAME FOLDER
# =============================================================================
output_filename = f"processed_{INPUT_FILE}"
output_path = OUTPUT_DIR / output_filename

result.to_csv(output_path, sep=SEP, index=False)

print(f"\nProcessed file saved to: {output_path}")


Input dataset loaded into `result` (no rows dropped).
Columns in result:
['ID', 'PIT Date', 'HistCurrency', 'FiscalPeriod', 'AnnPITValue_Period', 'ce', 'dt', 'be']


  .apply(compute_lags_for_group)
  .apply(compute_lags_for_group)



Lag columns created and added to `result`:
['be', 'be_lag1']
           ID HistCurrency   PIT Date  FiscalPeriod AnnPITValue_Period  \
0   C02500770          Ars 1995-12-29          1992                  A   
1   C02500770          Ars 1995-12-29          1993                  A   
2   C02500770          Ars 1995-12-29          1994                  A   
3   C02500770          Ars 1996-05-03          1995                  A   
4   C02500770          Ars 1998-07-03          1996                  A   
5   C02500770          Ars 1998-07-03          1997                  A   
6   C02500770          Ars 1999-10-01          1998                  A   
7   C02500770          Ars 2000-05-19          1999                  A   
8   C02520200          Ars 1996-05-03          1987                  A   
9   C02520200          Ars 1996-05-03          1988                  A   
10  C02520200          Ars 1996-05-03          1989                  A   
11  C02520200          Ars 1996-05-03          199

### Sli

In [73]:
from pathlib import Path
import pandas as pd

# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file from OUTPUT_DIR.
# 2) Computes, for each column in VALUE_COLUMNS, lagged columns "<col>_lagk"
#    for k = 1..MAX_LAG_PERIODS.
#    - For each (ID, HistCurrency) group, sorted by PIT Date and FiscalPeriod,
#      "<col>_lagk" is the most recent known value of <col> for
#      FiscalPeriod - k *and the same AnnPITValue_Period* based on all PIT
#      updates observed up to that row.
#      Example: for a row 2020 Q1, "<col>_lag1" is the value from 2019 Q1,
#      not from 2019 A or 2020 anything.
# 3) Keeps all rows (no deletions), only converts types and appends lag columns.
# 4) Saves the resulting DataFrame to the same folder as:
#       processed_<INPUT_FILE>
#    e.g. INPUT_FILE = "ag.txt"  ->  "processed_ag.txt"
#
# The final result is stored in the variable `result`.
# =============================================================================

# ================= CONFIG =================
OUTPUT_DIR = Path(Temp_file_path_A)   # Base directory for input/output files
SEP = "|"                             # Delimiter used in the text files

INPUT_FILE = "data_Sli.txt"           # Name of the input file to read
VALUE_COLUMNS = ["rev", "inv"]            # Columns for which to compute the lags "<col>_lagk"

# Number of prior FiscalPeriods to compute lags for:
#   1 -> only t-1
#   2 -> t-1 and t-2
#   ...
MAX_LAG_PERIODS = 2

# NEW: period label column (Q1, Q2, A, etc.)
PERIOD_COL = "AnnPITValue_Period"
# ==========================================


# =============================================================================
# 1) LOAD INPUT FILE (NO ROWS DROPPED)
# =============================================================================
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

# Read raw data
df = pd.read_csv(path, sep=SEP)

# Required base columns for the PIT logic
required_base_cols = ["ID", "PIT Date", "HistCurrency", "FiscalPeriod", PERIOD_COL]
missing_base = [c for c in required_base_cols if c not in df.columns]
if missing_base:
    raise ValueError(f"{INPUT_FILE}: missing required base columns {missing_base}")

# Check that all requested value columns exist
missing_value_cols = [c for c in VALUE_COLUMNS if c not in df.columns]
if missing_value_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing value columns specified in VALUE_COLUMNS: {missing_value_cols}"
    )

# Type casting only, no row drops
df["PIT Date"] = pd.to_datetime(df["PIT Date"], errors="coerce")
df["FiscalPeriod"] = pd.to_numeric(df["FiscalPeriod"], errors="coerce")
df[PERIOD_COL] = df[PERIOD_COL].astype("string")

# Work on a copy to keep the original df untouched
result = df.copy()

print("Input dataset loaded into `result` (no rows dropped).")
print("Columns in result:")
print(list(result.columns))


# =============================================================================
# 2) COMPUTE LAG COLUMNS "<col>_lagk" FOR EACH COLUMN IN VALUE_COLUMNS
# =============================================================================
# NEW LOGIC:
# For each group (ID, HistCurrency) along the PIT timeline:
# - We maintain, for each (FiscalPeriod, AnnPITValue_Period) pair, the latest PIT
#   and the latest known values for all columns in VALUE_COLUMNS.
# - For a row with (FiscalPeriod = t, period = P), the lag column "<col>_lagk"
#   is defined as the last known value of <col> for (FiscalPeriod = t-k,
#   period = P) based on all PIT updates observed up to (and including) the
#   current PIT Date.
#
# If AnnPITValue_Period is missing, we fall back to using only FiscalPeriod
# (i.e., we treat the period label as None).

df_calc = result.copy()

def compute_lags_for_group(group: pd.DataFrame) -> pd.DataFrame:
    """
    Compute lagged values "<col>_lagk" for all columns in VALUE_COLUMNS
    within a single (ID, HistCurrency) group, for k=1..MAX_LAG_PERIODS,
    keyed by (FiscalPeriod, AnnPITValue_Period).
    """
    # Sort chronologically so updates are processed in correct temporal order
    group = group.sort_values(["PIT Date", "FiscalPeriod"], ascending=[True, True])

    # last_for_fp stores, for each (FiscalPeriod, PeriodLabel), the latest PIT
    # and the known values.
    # Structure:
    #   last_for_fp = {
    #       (fp, period_label): (last_pit, {col: last_value_for_col, ...})
    #   }
    last_for_fp = {}

    # For each value column and lag k, we collect lag values row by row
    lag_values = {
        col: {k: [] for k in range(1, MAX_LAG_PERIODS + 1)}
        for col in VALUE_COLUMNS
    }

    # Iterate through rows in PIT and FiscalPeriod order
    for _, row in group.iterrows():
        pit = row["PIT Date"]
        fp = row["FiscalPeriod"]
        period_raw = row.get(PERIOD_COL, pd.NA)
        period_label = None if pd.isna(period_raw) else str(period_raw)

        # 1) Determine lag values for each column in VALUE_COLUMNS and each k
        for col in VALUE_COLUMNS:
            for k in range(1, MAX_LAG_PERIODS + 1):
                lag_val = None
                if pd.notna(fp):
                    target_fp = fp - k
                    target_key = (target_fp, period_label)
                    info = last_for_fp.get(target_key)
                    if info is not None:
                        # info[1] is the dict of last known values for that (target_fp, period_label)
                        lag_val = info[1].get(col)
                lag_values[col][k].append(lag_val)

        # 2) Update our knowledge for the current (FiscalPeriod, period_label)
        if pd.notna(fp) and pd.notna(pit):
            key_curr = (fp, period_label)
            prev = last_for_fp.get(key_curr)

            # Previously stored values for this key (if any)
            prev_values = prev[1] if prev is not None else {}

            # Create a copy so we do not mutate the previous dict reference
            updated_values = dict(prev_values)

            # Update with any non-NaN values from this row
            for col in VALUE_COLUMNS:
                v = row[col]
                if pd.notna(v):
                    updated_values[col] = v

            # Only overwrite if this PIT is newer or equal to the previous PIT
            if prev is None or pit >= prev[0]:
                last_for_fp[key_curr] = (pit, updated_values)

    # 3) Attach lag columns "<col>_lagk" to the group DataFrame
    for col in VALUE_COLUMNS:
        for k in range(1, MAX_LAG_PERIODS + 1):
            group[f"{col}_lag{k}"] = lag_values[col][k]

    return group


# Apply the lag computation per (ID, HistCurrency) group
df_calc = (
    df_calc
    .groupby(["ID", "HistCurrency"], dropna=False, group_keys=False)
    .apply(compute_lags_for_group)
)

# Optionally convert FiscalPeriod back to Int64 (nullable integer)
df_calc["FiscalPeriod"] = pd.to_numeric(df_calc["FiscalPeriod"], errors="coerce").astype("Int64")

# Final result
result = df_calc

print("\nLag columns created and added to `result`:")
created_cols = []
for v in VALUE_COLUMNS:
    created_cols.append(v)
    for k in range(1, MAX_LAG_PERIODS + 1):
        created_cols.append(f"{v}_lag{k}")
print(created_cols)

# Optional preview
cols_to_show = (
    ["ID", "HistCurrency", "PIT Date", "FiscalPeriod", PERIOD_COL]
    + VALUE_COLUMNS
    + [f"{c}_lag{k}" for c in VALUE_COLUMNS for k in range(1, MAX_LAG_PERIODS + 1)]
)
print(result[cols_to_show].head(40))


# =============================================================================
# 3) SAVE RESULTING FILE AS "processed_<INPUT_FILE>" IN THE SAME FOLDER
# =============================================================================
output_filename = f"processed_{INPUT_FILE}"
output_path = OUTPUT_DIR / output_filename

result.to_csv(output_path, sep=SEP, index=False)

print(f"\nProcessed file saved to: {output_path}")


Input dataset loaded into `result` (no rows dropped).
Columns in result:
['ID', 'PIT Date', 'HistCurrency', 'FiscalPeriod', 'AnnPITValue_Period', 'rev', 'inv']


  .apply(compute_lags_for_group)
  .apply(compute_lags_for_group)



Lag columns created and added to `result`:
['rev', 'rev_lag1', 'rev_lag2', 'inv', 'inv_lag1', 'inv_lag2']
           ID HistCurrency   PIT Date  FiscalPeriod AnnPITValue_Period  \
0   C02500770          Ars 1995-12-29          1992                  A   
1   C02500770          Ars 1995-12-29          1992               <NA>   
2   C02500770          Ars 1995-12-29          1993                  A   
3   C02500770          Ars 1995-12-29          1993               <NA>   
4   C02500770          Ars 1995-12-29          1994                  A   
5   C02500770          Ars 1995-12-29          1994               <NA>   
6   C02500770          Ars 1996-05-03          1995                  A   
7   C02500770          Ars 1996-05-03          1995               <NA>   
8   C02500770          Ars 1998-07-03          1996                  A   
9   C02500770          Ars 1998-07-03          1996               <NA>   
10  C02500770          Ars 1998-07-03          1997                  A   
11  C

### Slx

In [74]:
from pathlib import Path
import pandas as pd

# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file from OUTPUT_DIR.
# 2) Computes, for each column in VALUE_COLUMNS, lagged columns "<col>_lagk"
#    for k = 1..MAX_LAG_PERIODS.
#    - For each (ID, HistCurrency) group, sorted by PIT Date and FiscalPeriod,
#      "<col>_lagk" is the most recent known value of <col> for
#      FiscalPeriod - k *and the same AnnPITValue_Period* based on all PIT
#      updates observed up to that row.
#      Example: for a row 2020 Q1, "<col>_lag1" is the value from 2019 Q1,
#      not from 2019 A or 2020 anything.
# 3) Keeps all rows (no deletions), only converts types and appends lag columns.
# 4) Saves the resulting DataFrame to the same folder as:
#       processed_<INPUT_FILE>
#    e.g. INPUT_FILE = "ag.txt"  ->  "processed_ag.txt"
#
# The final result is stored in the variable `result`.
# =============================================================================

# ================= CONFIG =================
OUTPUT_DIR = Path(Temp_file_path_A)   # Base directory for input/output files
SEP = "|"                             # Delimiter used in the text files

INPUT_FILE = "data_Slx.txt"           # Name of the input file to read
VALUE_COLUMNS = ["rev", "sga"]            # Columns for which to compute the lags "<col>_lagk"

# Number of prior FiscalPeriods to compute lags for:
#   1 -> only t-1
#   2 -> t-1 and t-2
#   ...
MAX_LAG_PERIODS = 2

# NEW: period label column (Q1, Q2, A, etc.)
PERIOD_COL = "AnnPITValue_Period"
# ==========================================


# =============================================================================
# 1) LOAD INPUT FILE (NO ROWS DROPPED)
# =============================================================================
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

# Read raw data
df = pd.read_csv(path, sep=SEP)

# Required base columns for the PIT logic
required_base_cols = ["ID", "PIT Date", "HistCurrency", "FiscalPeriod", PERIOD_COL]
missing_base = [c for c in required_base_cols if c not in df.columns]
if missing_base:
    raise ValueError(f"{INPUT_FILE}: missing required base columns {missing_base}")

# Check that all requested value columns exist
missing_value_cols = [c for c in VALUE_COLUMNS if c not in df.columns]
if missing_value_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing value columns specified in VALUE_COLUMNS: {missing_value_cols}"
    )

# Type casting only, no row drops
df["PIT Date"] = pd.to_datetime(df["PIT Date"], errors="coerce")
df["FiscalPeriod"] = pd.to_numeric(df["FiscalPeriod"], errors="coerce")
df[PERIOD_COL] = df[PERIOD_COL].astype("string")

# Work on a copy to keep the original df untouched
result = df.copy()

print("Input dataset loaded into `result` (no rows dropped).")
print("Columns in result:")
print(list(result.columns))


# =============================================================================
# 2) COMPUTE LAG COLUMNS "<col>_lagk" FOR EACH COLUMN IN VALUE_COLUMNS
# =============================================================================
# NEW LOGIC:
# For each group (ID, HistCurrency) along the PIT timeline:
# - We maintain, for each (FiscalPeriod, AnnPITValue_Period) pair, the latest PIT
#   and the latest known values for all columns in VALUE_COLUMNS.
# - For a row with (FiscalPeriod = t, period = P), the lag column "<col>_lagk"
#   is defined as the last known value of <col> for (FiscalPeriod = t-k,
#   period = P) based on all PIT updates observed up to (and including) the
#   current PIT Date.
#
# If AnnPITValue_Period is missing, we fall back to using only FiscalPeriod
# (i.e., we treat the period label as None).

df_calc = result.copy()

def compute_lags_for_group(group: pd.DataFrame) -> pd.DataFrame:
    """
    Compute lagged values "<col>_lagk" for all columns in VALUE_COLUMNS
    within a single (ID, HistCurrency) group, for k=1..MAX_LAG_PERIODS,
    keyed by (FiscalPeriod, AnnPITValue_Period).
    """
    # Sort chronologically so updates are processed in correct temporal order
    group = group.sort_values(["PIT Date", "FiscalPeriod"], ascending=[True, True])

    # last_for_fp stores, for each (FiscalPeriod, PeriodLabel), the latest PIT
    # and the known values.
    # Structure:
    #   last_for_fp = {
    #       (fp, period_label): (last_pit, {col: last_value_for_col, ...})
    #   }
    last_for_fp = {}

    # For each value column and lag k, we collect lag values row by row
    lag_values = {
        col: {k: [] for k in range(1, MAX_LAG_PERIODS + 1)}
        for col in VALUE_COLUMNS
    }

    # Iterate through rows in PIT and FiscalPeriod order
    for _, row in group.iterrows():
        pit = row["PIT Date"]
        fp = row["FiscalPeriod"]
        period_raw = row.get(PERIOD_COL, pd.NA)
        period_label = None if pd.isna(period_raw) else str(period_raw)

        # 1) Determine lag values for each column in VALUE_COLUMNS and each k
        for col in VALUE_COLUMNS:
            for k in range(1, MAX_LAG_PERIODS + 1):
                lag_val = None
                if pd.notna(fp):
                    target_fp = fp - k
                    target_key = (target_fp, period_label)
                    info = last_for_fp.get(target_key)
                    if info is not None:
                        # info[1] is the dict of last known values for that (target_fp, period_label)
                        lag_val = info[1].get(col)
                lag_values[col][k].append(lag_val)

        # 2) Update our knowledge for the current (FiscalPeriod, period_label)
        if pd.notna(fp) and pd.notna(pit):
            key_curr = (fp, period_label)
            prev = last_for_fp.get(key_curr)

            # Previously stored values for this key (if any)
            prev_values = prev[1] if prev is not None else {}

            # Create a copy so we do not mutate the previous dict reference
            updated_values = dict(prev_values)

            # Update with any non-NaN values from this row
            for col in VALUE_COLUMNS:
                v = row[col]
                if pd.notna(v):
                    updated_values[col] = v

            # Only overwrite if this PIT is newer or equal to the previous PIT
            if prev is None or pit >= prev[0]:
                last_for_fp[key_curr] = (pit, updated_values)

    # 3) Attach lag columns "<col>_lagk" to the group DataFrame
    for col in VALUE_COLUMNS:
        for k in range(1, MAX_LAG_PERIODS + 1):
            group[f"{col}_lag{k}"] = lag_values[col][k]

    return group


# Apply the lag computation per (ID, HistCurrency) group
df_calc = (
    df_calc
    .groupby(["ID", "HistCurrency"], dropna=False, group_keys=False)
    .apply(compute_lags_for_group)
)

# Optionally convert FiscalPeriod back to Int64 (nullable integer)
df_calc["FiscalPeriod"] = pd.to_numeric(df_calc["FiscalPeriod"], errors="coerce").astype("Int64")

# Final result
result = df_calc

print("\nLag columns created and added to `result`:")
created_cols = []
for v in VALUE_COLUMNS:
    created_cols.append(v)
    for k in range(1, MAX_LAG_PERIODS + 1):
        created_cols.append(f"{v}_lag{k}")
print(created_cols)

# Optional preview
cols_to_show = (
    ["ID", "HistCurrency", "PIT Date", "FiscalPeriod", PERIOD_COL]
    + VALUE_COLUMNS
    + [f"{c}_lag{k}" for c in VALUE_COLUMNS for k in range(1, MAX_LAG_PERIODS + 1)]
)
print(result[cols_to_show].head(40))


# =============================================================================
# 3) SAVE RESULTING FILE AS "processed_<INPUT_FILE>" IN THE SAME FOLDER
# =============================================================================
output_filename = f"processed_{INPUT_FILE}"
output_path = OUTPUT_DIR / output_filename

result.to_csv(output_path, sep=SEP, index=False)

print(f"\nProcessed file saved to: {output_path}")


Input dataset loaded into `result` (no rows dropped).
Columns in result:
['ID', 'PIT Date', 'HistCurrency', 'FiscalPeriod', 'AnnPITValue_Period', 'rev', 'sga']


  .apply(compute_lags_for_group)
  .apply(compute_lags_for_group)



Lag columns created and added to `result`:
['rev', 'rev_lag1', 'rev_lag2', 'sga', 'sga_lag1', 'sga_lag2']
           ID HistCurrency   PIT Date  FiscalPeriod AnnPITValue_Period  \
0   C02500770          Ars 1995-12-29          1992               <NA>   
1   C02500770          Ars 1995-12-29          1993               <NA>   
2   C02500770          Ars 1995-12-29          1994               <NA>   
3   C02500770          Ars 1996-05-03          1995               <NA>   
4   C02500770          Ars 1998-07-03          1996               <NA>   
5   C02500770          Ars 1998-07-03          1997               <NA>   
7   C02500770          Ars 1999-10-01          1998               <NA>   
6   C02500770          Ars 1999-10-08          1997               <NA>   
8   C02500770          Ars 2000-05-19          1999               <NA>   
9   C02500770          Ars 2000-05-26          1999               <NA>   
10  C02520200          Ars 1996-05-03          1988               <NA>   
11  C

### Txf

In [75]:
from pathlib import Path
import pandas as pd

# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file from OUTPUT_DIR.
# 2) Computes, for each column in VALUE_COLUMNS, lagged columns "<col>_lagk"
#    for k = 1..MAX_LAG_PERIODS.
#    - For each (ID, HistCurrency) group, sorted by PIT Date and FiscalPeriod,
#      "<col>_lagk" is the most recent known value of <col> for
#      FiscalPeriod - k *and the same AnnPITValue_Period* based on all PIT
#      updates observed up to that row.
#      Example: for a row 2020 Q1, "<col>_lag1" is the value from 2019 Q1,
#      not from 2019 A or 2020 anything.
# 3) Keeps all rows (no deletions), only converts types and appends lag columns.
# 4) Saves the resulting DataFrame to the same folder as:
#       processed_<INPUT_FILE>
#    e.g. INPUT_FILE = "ag.txt"  ->  "processed_ag.txt"
#
# The final result is stored in the variable `result`.
# =============================================================================

# ================= CONFIG =================
OUTPUT_DIR = Path(Temp_file_path_A)   # Base directory for input/output files
SEP = "|"                             # Delimiter used in the text files

INPUT_FILE = "data_Txf.txt"           # Name of the input file to read
VALUE_COLUMNS = ["at"]            # Columns for which to compute the lags "<col>_lagk"

# Number of prior FiscalPeriods to compute lags for:
#   1 -> only t-1
#   2 -> t-1 and t-2
#   ...
MAX_LAG_PERIODS = 2

# NEW: period label column (Q1, Q2, A, etc.)
PERIOD_COL = "AnnPITValue_Period"
# ==========================================


# =============================================================================
# 1) LOAD INPUT FILE (NO ROWS DROPPED)
# =============================================================================
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

# Read raw data
df = pd.read_csv(path, sep=SEP)

# Required base columns for the PIT logic
required_base_cols = ["ID", "PIT Date", "HistCurrency", "FiscalPeriod", PERIOD_COL]
missing_base = [c for c in required_base_cols if c not in df.columns]
if missing_base:
    raise ValueError(f"{INPUT_FILE}: missing required base columns {missing_base}")

# Check that all requested value columns exist
missing_value_cols = [c for c in VALUE_COLUMNS if c not in df.columns]
if missing_value_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing value columns specified in VALUE_COLUMNS: {missing_value_cols}"
    )

# Type casting only, no row drops
df["PIT Date"] = pd.to_datetime(df["PIT Date"], errors="coerce")
df["FiscalPeriod"] = pd.to_numeric(df["FiscalPeriod"], errors="coerce")
df[PERIOD_COL] = df[PERIOD_COL].astype("string")

# Work on a copy to keep the original df untouched
result = df.copy()

print("Input dataset loaded into `result` (no rows dropped).")
print("Columns in result:")
print(list(result.columns))


# =============================================================================
# 2) COMPUTE LAG COLUMNS "<col>_lagk" FOR EACH COLUMN IN VALUE_COLUMNS
# =============================================================================
# NEW LOGIC:
# For each group (ID, HistCurrency) along the PIT timeline:
# - We maintain, for each (FiscalPeriod, AnnPITValue_Period) pair, the latest PIT
#   and the latest known values for all columns in VALUE_COLUMNS.
# - For a row with (FiscalPeriod = t, period = P), the lag column "<col>_lagk"
#   is defined as the last known value of <col> for (FiscalPeriod = t-k,
#   period = P) based on all PIT updates observed up to (and including) the
#   current PIT Date.
#
# If AnnPITValue_Period is missing, we fall back to using only FiscalPeriod
# (i.e., we treat the period label as None).

df_calc = result.copy()

def compute_lags_for_group(group: pd.DataFrame) -> pd.DataFrame:
    """
    Compute lagged values "<col>_lagk" for all columns in VALUE_COLUMNS
    within a single (ID, HistCurrency) group, for k=1..MAX_LAG_PERIODS,
    keyed by (FiscalPeriod, AnnPITValue_Period).
    """
    # Sort chronologically so updates are processed in correct temporal order
    group = group.sort_values(["PIT Date", "FiscalPeriod"], ascending=[True, True])

    # last_for_fp stores, for each (FiscalPeriod, PeriodLabel), the latest PIT
    # and the known values.
    # Structure:
    #   last_for_fp = {
    #       (fp, period_label): (last_pit, {col: last_value_for_col, ...})
    #   }
    last_for_fp = {}

    # For each value column and lag k, we collect lag values row by row
    lag_values = {
        col: {k: [] for k in range(1, MAX_LAG_PERIODS + 1)}
        for col in VALUE_COLUMNS
    }

    # Iterate through rows in PIT and FiscalPeriod order
    for _, row in group.iterrows():
        pit = row["PIT Date"]
        fp = row["FiscalPeriod"]
        period_raw = row.get(PERIOD_COL, pd.NA)
        period_label = None if pd.isna(period_raw) else str(period_raw)

        # 1) Determine lag values for each column in VALUE_COLUMNS and each k
        for col in VALUE_COLUMNS:
            for k in range(1, MAX_LAG_PERIODS + 1):
                lag_val = None
                if pd.notna(fp):
                    target_fp = fp - k
                    target_key = (target_fp, period_label)
                    info = last_for_fp.get(target_key)
                    if info is not None:
                        # info[1] is the dict of last known values for that (target_fp, period_label)
                        lag_val = info[1].get(col)
                lag_values[col][k].append(lag_val)

        # 2) Update our knowledge for the current (FiscalPeriod, period_label)
        if pd.notna(fp) and pd.notna(pit):
            key_curr = (fp, period_label)
            prev = last_for_fp.get(key_curr)

            # Previously stored values for this key (if any)
            prev_values = prev[1] if prev is not None else {}

            # Create a copy so we do not mutate the previous dict reference
            updated_values = dict(prev_values)

            # Update with any non-NaN values from this row
            for col in VALUE_COLUMNS:
                v = row[col]
                if pd.notna(v):
                    updated_values[col] = v

            # Only overwrite if this PIT is newer or equal to the previous PIT
            if prev is None or pit >= prev[0]:
                last_for_fp[key_curr] = (pit, updated_values)

    # 3) Attach lag columns "<col>_lagk" to the group DataFrame
    for col in VALUE_COLUMNS:
        for k in range(1, MAX_LAG_PERIODS + 1):
            group[f"{col}_lag{k}"] = lag_values[col][k]

    return group


# Apply the lag computation per (ID, HistCurrency) group
df_calc = (
    df_calc
    .groupby(["ID", "HistCurrency"], dropna=False, group_keys=False)
    .apply(compute_lags_for_group)
)

# Optionally convert FiscalPeriod back to Int64 (nullable integer)
df_calc["FiscalPeriod"] = pd.to_numeric(df_calc["FiscalPeriod"], errors="coerce").astype("Int64")

# Final result
result = df_calc

print("\nLag columns created and added to `result`:")
created_cols = []
for v in VALUE_COLUMNS:
    created_cols.append(v)
    for k in range(1, MAX_LAG_PERIODS + 1):
        created_cols.append(f"{v}_lag{k}")
print(created_cols)

# Optional preview
cols_to_show = (
    ["ID", "HistCurrency", "PIT Date", "FiscalPeriod", PERIOD_COL]
    + VALUE_COLUMNS
    + [f"{c}_lag{k}" for c in VALUE_COLUMNS for k in range(1, MAX_LAG_PERIODS + 1)]
)
print(result[cols_to_show].head(40))


# =============================================================================
# 3) SAVE RESULTING FILE AS "processed_<INPUT_FILE>" IN THE SAME FOLDER
# =============================================================================
output_filename = f"processed_{INPUT_FILE}"
output_path = OUTPUT_DIR / output_filename

result.to_csv(output_path, sep=SEP, index=False)

print(f"\nProcessed file saved to: {output_path}")


Input dataset loaded into `result` (no rows dropped).
Columns in result:
['ID', 'PIT Date', 'HistCurrency', 'FiscalPeriod', 'AnnPITValue_Period', 'socaps', 'pocaps', 'div', 'diss', 'dred', 'at']


  .apply(compute_lags_for_group)
  .apply(compute_lags_for_group)



Lag columns created and added to `result`:
['at', 'at_lag1', 'at_lag2']
           ID HistCurrency   PIT Date  FiscalPeriod AnnPITValue_Period  \
0   C02500770          Ars 1995-12-29          1992                  A   
1   C02500770          Ars 1995-12-29          1992               <NA>   
2   C02500770          Ars 1995-12-29          1993                  A   
3   C02500770          Ars 1995-12-29          1993               <NA>   
4   C02500770          Ars 1995-12-29          1994                  A   
5   C02500770          Ars 1995-12-29          1994               <NA>   
6   C02500770          Ars 1996-05-03          1995                  A   
7   C02500770          Ars 1996-05-03          1995               <NA>   
8   C02500770          Ars 1996-05-31          1995               <NA>   
9   C02500770          Ars 1998-07-03          1996                  A   
10  C02500770          Ars 1998-07-03          1996               <NA>   
11  C02500770          Ars 1998-07-03  

## Calculate Anomalies

### Acc

In [76]:
from pathlib import Path
import pandas as pd
import numpy as np

# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file (INPUT_FILE) from OUTPUT_DIR.
# 2) Creates a new column (NEW_COLUMN) based on a flexible arithmetic formula
#    defined in FORMULA_EXPRESSION using existing columns.
# 3) The formula is only applied to rows where ALL columns listed in
#    FORMULA_COLUMNS are non-null. Otherwise, the result is set to NaN.
# 4) The updated DataFrame is stored in `df_out`.
# 5) A new column (NEW_COLUMN) is computed using DataFrame.eval.
# 6) All columns except "ID", "PIT Date", "HistCurrency", "FiscalPeriod",
#    and NEW_COLUMN are dropped.
# 7) The output filename is defined via config.
# 8) The result is written to disk using the configured output name.
# =============================================================================

# ========================= CONFIG ============================================
OUTPUT_DIR = Path(Temp_file_path_A)
SEP = "|"

INPUT_FILE = "processed_data_Acc.txt"               # <-- Input file
OUTPUT_FILE_NAME = "ACC.txt"  # <-- Output file

NEW_COLUMN = "acc"                             # <-- Name of the new computed column
FORMULA_EXPRESSION = "(wc - wc_lag1) / (0.5 * (at + at_lag1))"
FORMULA_COLUMNS = ["wc", "wc_lag1", "at", "at_lag1"]
# ============================================================================

# 1) LOAD INPUT FILE
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

df = pd.read_csv(path, sep=SEP)

# 2) CHECK THAT ALL REQUIRED COLUMNS EXIST
missing_formula_cols = [c for c in FORMULA_COLUMNS if c not in df.columns]
if missing_formula_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing required columns for formula: {missing_formula_cols}"
    )

# 3) INITIALIZE NEW COLUMN WITH NaN
df_out = df.copy()
df_out[NEW_COLUMN] = np.nan

# 4) BUILD MASK WHERE ALL REQUIRED COLUMNS ARE NON-NULL
mask_all_present = df_out[FORMULA_COLUMNS].notna().all(axis=1)

# 5) APPLY FORMULA ONLY WHERE mask_all_present IS TRUE
df_out.loc[mask_all_present, NEW_COLUMN] = df_out.loc[mask_all_present].eval(
    FORMULA_EXPRESSION,
    engine="python",
)

# 6) DROP ALL COLUMNS EXCEPT THE REQUIRED ONES
KEEP_COLUMNS = ["ID", "PIT Date", "HistCurrency", "FiscalPeriod", NEW_COLUMN]

missing_keep_cols = [c for c in KEEP_COLUMNS if c not in df_out.columns]
if missing_keep_cols:
    raise ValueError(f"Columns expected to be kept but not found: {missing_keep_cols}")

df_out = df_out[KEEP_COLUMNS]

# 7) DEFINE OUTPUT FILE NAME (FROM CONFIG)
formula_output_path = OUTPUT_DIR / OUTPUT_FILE_NAME

# 8) SAVE RESULT
df_out.to_csv(formula_output_path, sep=SEP, index=False)

print(f"New column '{NEW_COLUMN}' created based on formula:")
print("   ", FORMULA_EXPRESSION)
print("Rows used (all required columns non-null):", mask_all_present.sum())
print("Result was saved to:", formula_output_path)
print(df_out.head(20))


New column 'acc' created based on formula:
    (wc - wc_lag1) / (0.5 * (at + at_lag1))
Rows used (all required columns non-null): 527155
Result was saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/ACC.txt
           ID    PIT Date HistCurrency  FiscalPeriod       acc
0   C02500770  1995-12-29          Ars          1992       NaN
1   C02500770  1995-12-29          Ars          1992       NaN
2   C02500770  1995-12-29          Ars          1993 -0.099280
3   C02500770  1995-12-29          Ars          1993       NaN
4   C02500770  1995-12-29          Ars          1994  0.170035
5   C02500770  1995-12-29          Ars          1994       NaN
6   C02500770  1996-05-03          Ars          1995  0.011679
7   C02500770  1996-05-03          Ars          1995       NaN
8   C02500770  1998-07-03          Ars          1996 -0.079990
9   C02500770  1998-07-03          Ars          1996       NaN
10  C02500770  1998-07-03          Ars          1997  0.009323
11  C02500770  1998-0

### Ag

In [77]:
from pathlib import Path
import pandas as pd
import numpy as np

# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file (INPUT_FILE) from OUTPUT_DIR.
# 2) Creates a new column (NEW_COLUMN) based on a flexible arithmetic formula
#    defined in FORMULA_EXPRESSION using existing columns.
# 3) The formula is only applied to rows where ALL columns listed in
#    FORMULA_COLUMNS are non-null. Otherwise, the result is set to NaN.
# 4) The updated DataFrame is stored in `df_out`.
# 5) A new column (NEW_COLUMN) is computed using DataFrame.eval.
# 6) All columns except "ID", "PIT Date", "HistCurrency", "FiscalPeriod",
#    and NEW_COLUMN are dropped.
# 7) The output filename is defined via config.
# 8) The result is written to disk using the configured output name.
# =============================================================================

# ========================= CONFIG ============================================
OUTPUT_DIR = Path(Temp_file_path_A)
SEP = "|"

INPUT_FILE = "processed_data_Ag.txt"               # <-- Input file
OUTPUT_FILE_NAME = "AG.txt"  # <-- Output file

NEW_COLUMN = "ag"                             # <-- Name of the new computed column
FORMULA_EXPRESSION = "(at - at_lag1) / at_lag1"
FORMULA_COLUMNS = ["at", "at_lag1"]
# ============================================================================

# 1) LOAD INPUT FILE
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

df = pd.read_csv(path, sep=SEP)

# 2) CHECK THAT ALL REQUIRED COLUMNS EXIST
missing_formula_cols = [c for c in FORMULA_COLUMNS if c not in df.columns]
if missing_formula_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing required columns for formula: {missing_formula_cols}"
    )

# 3) INITIALIZE NEW COLUMN WITH NaN
df_out = df.copy()
df_out[NEW_COLUMN] = np.nan

# 4) BUILD MASK WHERE ALL REQUIRED COLUMNS ARE NON-NULL
mask_all_present = df_out[FORMULA_COLUMNS].notna().all(axis=1)

# 5) APPLY FORMULA ONLY WHERE mask_all_present IS TRUE
df_out.loc[mask_all_present, NEW_COLUMN] = df_out.loc[mask_all_present].eval(
    FORMULA_EXPRESSION,
    engine="python",
)

# 6) DROP ALL COLUMNS EXCEPT THE REQUIRED ONES
KEEP_COLUMNS = ["ID", "PIT Date", "HistCurrency", "FiscalPeriod", NEW_COLUMN]

missing_keep_cols = [c for c in KEEP_COLUMNS if c not in df_out.columns]
if missing_keep_cols:
    raise ValueError(f"Columns expected to be kept but not found: {missing_keep_cols}")

df_out = df_out[KEEP_COLUMNS]

# 7) DEFINE OUTPUT FILE NAME (FROM CONFIG)
formula_output_path = OUTPUT_DIR / OUTPUT_FILE_NAME

# 8) SAVE RESULT
df_out.to_csv(formula_output_path, sep=SEP, index=False)

print(f"New column '{NEW_COLUMN}' created based on formula:")
print("   ", FORMULA_EXPRESSION)
print("Rows used (all required columns non-null):", mask_all_present.sum())
print("Result was saved to:", formula_output_path)
print(df_out.head(20))


New column 'ag' created based on formula:
    (at - at_lag1) / at_lag1
Rows used (all required columns non-null): 2074894
Result was saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/AG.txt
           ID    PIT Date HistCurrency  FiscalPeriod         ag
0   C02500770  1995-12-29          Ars          1992        NaN
1   C02500770  1995-12-29          Ars          1993   0.168725
2   C02500770  1995-12-29          Ars          1994   0.152593
3   C02500770  1996-05-03          Ars          1995  -0.106907
4   C02500770  1998-07-03          Ars          1996  -0.029679
5   C02500770  1998-07-03          Ars          1997  -0.032119
6   C02500770  1999-10-01          Ars          1998  -0.100152
7   C02500770  2000-05-19          Ars          1999  -0.141925
8   C02520200  1996-05-03          Ars          1989        NaN
9   C02520200  1996-05-03          Ars          1990  23.625648
10  C02520200  1996-05-03          Ars          1991   0.961409
11  C02520200  1996-05-03

### At

In [78]:
from pathlib import Path
import pandas as pd
import numpy as np

# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file (INPUT_FILE) from OUTPUT_DIR.
# 2) Creates a new column (NEW_COLUMN) based on a flexible arithmetic formula
#    defined in FORMULA_EXPRESSION using existing columns.
# 3) The formula is only applied to rows where ALL columns listed in
#    FORMULA_COLUMNS are non-null. Otherwise, the result is set to NaN.
# 4) The updated DataFrame is stored in `df_out`.
# 5) A new column (NEW_COLUMN) is computed using DataFrame.eval.
# 6) All columns except "ID", "PIT Date", "HistCurrency", "FiscalPeriod",
#    and NEW_COLUMN are dropped.
# 7) The output filename is defined via config.
# 8) The result is written to disk using the configured output name.
# =============================================================================

# ========================= CONFIG ============================================
OUTPUT_DIR = Path(Temp_file_path_A)
SEP = "|"

INPUT_FILE = "processed_data_At.txt"               # <-- Input file
OUTPUT_FILE_NAME = "AT.txt"  # <-- Output file

NEW_COLUMN = "at"                             # <-- Name of the new computed column
FORMULA_EXPRESSION = "rev / (0.5 * (noa + noa_lag1))"
FORMULA_COLUMNS = ["rev", "noa_lag1", "noa"]
# ============================================================================

# 1) LOAD INPUT FILE
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

df = pd.read_csv(path, sep=SEP)

# 2) CHECK THAT ALL REQUIRED COLUMNS EXIST
missing_formula_cols = [c for c in FORMULA_COLUMNS if c not in df.columns]
if missing_formula_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing required columns for formula: {missing_formula_cols}"
    )

# 3) INITIALIZE NEW COLUMN WITH NaN
df_out = df.copy()
df_out[NEW_COLUMN] = np.nan

# 4) BUILD MASK WHERE ALL REQUIRED COLUMNS ARE NON-NULL
mask_all_present = df_out[FORMULA_COLUMNS].notna().all(axis=1)

# 5) APPLY FORMULA ONLY WHERE mask_all_present IS TRUE
df_out.loc[mask_all_present, NEW_COLUMN] = df_out.loc[mask_all_present].eval(
    FORMULA_EXPRESSION,
    engine="python",
)

# 6) DROP ALL COLUMNS EXCEPT THE REQUIRED ONES
KEEP_COLUMNS = ["ID", "PIT Date", "HistCurrency", "FiscalPeriod", NEW_COLUMN]

missing_keep_cols = [c for c in KEEP_COLUMNS if c not in df_out.columns]
if missing_keep_cols:
    raise ValueError(f"Columns expected to be kept but not found: {missing_keep_cols}")

df_out = df_out[KEEP_COLUMNS]

# 7) DEFINE OUTPUT FILE NAME (FROM CONFIG)
formula_output_path = OUTPUT_DIR / OUTPUT_FILE_NAME

# 8) SAVE RESULT
df_out.to_csv(formula_output_path, sep=SEP, index=False)

print(f"New column '{NEW_COLUMN}' created based on formula:")
print("   ", FORMULA_EXPRESSION)
print("Rows used (all required columns non-null):", mask_all_present.sum())
print("Result was saved to:", formula_output_path)
print(df_out.head(20))


New column 'at' created based on formula:
    rev / (0.5 * (noa + noa_lag1))
Rows used (all required columns non-null): 816843
Result was saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/AT.txt
           ID    PIT Date HistCurrency  FiscalPeriod        at
0   C02500770  1995-12-29          Ars          1992       NaN
1   C02500770  1995-12-29          Ars          1992       NaN
2   C02500770  1995-12-29          Ars          1993  8.717688
3   C02500770  1995-12-29          Ars          1993       NaN
4   C02500770  1995-12-29          Ars          1994       NaN
5   C02500770  1995-12-29          Ars          1994       NaN
6   C02500770  1996-05-03          Ars          1995       NaN
7   C02500770  1996-05-03          Ars          1995       NaN
8   C02500770  1998-07-03          Ars          1996       NaN
9   C02500770  1998-07-03          Ars          1996       NaN
10  C02500770  1998-07-03          Ars          1997       NaN
11  C02500770  1998-07-03       

### Cat

#### 1) Calculate AT

In [79]:
from pathlib import Path
import pandas as pd
import numpy as np

# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file (INPUT_FILE) from OUTPUT_DIR.
# 2) Creates a new column (NEW_COLUMN) based on a flexible arithmetic formula
#    defined in FORMULA_EXPRESSION using existing columns.
# 3) The formula is only applied to rows where ALL columns listed in
#    FORMULA_COLUMNS are non-null. Otherwise, the result is set to NaN.
# 4) The updated DataFrame is stored in `df_out`.
# 5) A new column (NEW_COLUMN) is computed using DataFrame.eval.
# 6) All columns except "ID", "PIT Date", "HistCurrency", "FiscalPeriod",
#    and NEW_COLUMN are dropped.
# 7) The output filename is defined via config.
# 8) The result is written to disk using the configured output name.
# =============================================================================

# ========================= CONFIG ============================================
OUTPUT_DIR = Path(Temp_file_path_A)
SEP = "|"

INPUT_FILE = "processed_data_Cat.txt"               # <-- Input file
OUTPUT_FILE_NAME = "processed_data_Cat.txt"  # <-- Output file

NEW_COLUMN = "at"                             # <-- Name of the new computed column
FORMULA_EXPRESSION = "rev / (0.5 * (noa + noa_lag1))"
FORMULA_COLUMNS = ["rev", "noa_lag1", "noa"]
# ============================================================================

# 1) LOAD INPUT FILE
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

df = pd.read_csv(path, sep=SEP)

# 2) CHECK THAT ALL REQUIRED COLUMNS EXIST
missing_formula_cols = [c for c in FORMULA_COLUMNS if c not in df.columns]
if missing_formula_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing required columns for formula: {missing_formula_cols}"
    )

# 3) INITIALIZE NEW COLUMN WITH NaN
df_out = df.copy()
df_out[NEW_COLUMN] = np.nan

# 4) BUILD MASK WHERE ALL REQUIRED COLUMNS ARE NON-NULL
mask_all_present = df_out[FORMULA_COLUMNS].notna().all(axis=1)

# 5) APPLY FORMULA ONLY WHERE mask_all_present IS TRUE
df_out.loc[mask_all_present, NEW_COLUMN] = df_out.loc[mask_all_present].eval(
    FORMULA_EXPRESSION,
    engine="python",
)

# 7) DEFINE OUTPUT FILE NAME (FROM CONFIG)
formula_output_path = OUTPUT_DIR / OUTPUT_FILE_NAME

# 8) SAVE RESULT
df_out.to_csv(formula_output_path, sep=SEP, index=False)

print(f"New column '{NEW_COLUMN}' created based on formula:")
print("   ", FORMULA_EXPRESSION)
print("Rows used (all required columns non-null):", mask_all_present.sum())
print("Result was saved to:", formula_output_path)
print(df_out.head(20))


New column 'at' created based on formula:
    rev / (0.5 * (noa + noa_lag1))
Rows used (all required columns non-null): 816843
Result was saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/processed_data_Cat.txt
           ID    PIT Date HistCurrency  FiscalPeriod AnnPITValue_Period  \
0   C02500770  1995-12-29          Ars          1992                  A   
1   C02500770  1995-12-29          Ars          1992                NaN   
2   C02500770  1995-12-29          Ars          1993                  A   
3   C02500770  1995-12-29          Ars          1993                NaN   
4   C02500770  1995-12-29          Ars          1994                  A   
5   C02500770  1995-12-29          Ars          1994                NaN   
6   C02500770  1996-05-03          Ars          1995                  A   
7   C02500770  1996-05-03          Ars          1995                NaN   
8   C02500770  1998-07-03          Ars          1996                  A   
9   C02500770  1998-07

#### 2) Get Lagged AT

In [80]:
from pathlib import Path
import pandas as pd

# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file from OUTPUT_DIR.
# 2) Computes, for each column in VALUE_COLUMNS, lagged columns "<col>_lagk"
#    for k = 1..MAX_LAG_PERIODS.
#    - For each (ID, HistCurrency) group, sorted by PIT Date and FiscalPeriod,
#      "<col>_lagk" is the most recent known value of <col> for
#      FiscalPeriod - k *and the same AnnPITValue_Period* based on all PIT
#      updates observed up to that row.
#      Example: for a row 2020 Q1, "<col>_lag1" is the value from 2019 Q1,
#      not from 2019 A or 2020 anything.
# 3) Keeps all rows (no deletions), only converts types and appends lag columns.
# 4) Saves the resulting DataFrame to the same folder as:
#       processed_<INPUT_FILE>
#    e.g. INPUT_FILE = "ag.txt"  ->  "processed_ag.txt"
#
# The final result is stored in the variable `result`.
# =============================================================================

# ================= CONFIG =================
OUTPUT_DIR = Path(Temp_file_path_A)   # Base directory for input/output files
SEP = "|"                             # Delimiter used in the text files

INPUT_FILE = "processed_data_Cat.txt"                 # Input file
OUTPUT_FILE_NAME = "processed_data_Cat.txt" # <-- CONFIGURABLE OUTPUT FILE

VALUE_COLUMNS = ["at"]                # Columns for which to compute the lags "<col>_lagk"

# Number of prior FiscalPeriods to compute lags for:
#   1 -> only t-1
#   2 -> t-1 and t-2
#   ...
MAX_LAG_PERIODS = 1

# NEW: period label column (Q1, Q2, A, etc.)
PERIOD_COL = "AnnPITValue_Period"
# ==========================================


# =============================================================================
# 1) LOAD INPUT FILE (NO ROWS DROPPED)
# =============================================================================
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

# Read raw data
df = pd.read_csv(path, sep=SEP)

# Required base columns for the PIT logic
required_base_cols = ["ID", "PIT Date", "HistCurrency", "FiscalPeriod", PERIOD_COL]
missing_base = [c for c in required_base_cols if c not in df.columns]
if missing_base:
    raise ValueError(f"{INPUT_FILE}: missing required base columns {missing_base}")

# Check that all requested value columns exist
missing_value_cols = [c for c in VALUE_COLUMNS if c not in df.columns]
if missing_value_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing value columns specified in VALUE_COLUMNS: {missing_value_cols}"
    )

# Type casting only, no row drops
df["PIT Date"] = pd.to_datetime(df["PIT Date"], errors="coerce")
df["FiscalPeriod"] = pd.to_numeric(df["FiscalPeriod"], errors="coerce")
df[PERIOD_COL] = df[PERIOD_COL].astype("string")

# Work on a copy to keep the original df untouched
result = df.copy()

print("Input dataset loaded into `result` (no rows dropped).")
print("Columns in result:")
print(list(result.columns))


# =============================================================================
# 2) COMPUTE LAG COLUMNS "<col>_lagk" FOR EACH COLUMN IN VALUE_COLUMNS
# =============================================================================
# NEW LOGIC:
# For each group (ID, HistCurrency) along the PIT timeline:
# - We maintain, for each (FiscalPeriod, AnnPITValue_Period) pair, the latest PIT
#   and the latest known values for all columns in VALUE_COLUMNS.
# - For a row with (FiscalPeriod = t, period = P), the lag column "<col>_lagk"
#   is defined as the last known value of <col> for (FiscalPeriod = t-k,
#   period = P) based on all PIT updates observed up to (and including) the
#   current PIT Date.
#
# If AnnPITValue_Period is missing, we fall back to using only FiscalPeriod
# (i.e., we treat the period label as None).

df_calc = result.copy()

def compute_lags_for_group(group: pd.DataFrame) -> pd.DataFrame:
    """
    Compute lagged values "<col>_lagk" for all columns in VALUE_COLUMNS
    within a single (ID, HistCurrency) group, for k=1..MAX_LAG_PERIODS,
    keyed by (FiscalPeriod, AnnPITValue_Period).
    """
    # Sort chronologically so updates are processed in correct temporal order
    group = group.sort_values(["PIT Date", "FiscalPeriod"], ascending=[True, True])

    # last_for_fp stores, for each (FiscalPeriod, PeriodLabel), the latest PIT
    # and the known values.
    # Structure:
    #   last_for_fp = {
    #       (fp, period_label): (last_pit, {col: last_value_for_col, ...})
    #   }
    last_for_fp = {}

    # For each value column and lag k, we collect lag values row by row
    lag_values = {
        col: {k: [] for k in range(1, MAX_LAG_PERIODS + 1)}
        for col in VALUE_COLUMNS
    }

    # Iterate through rows in PIT and FiscalPeriod order
    for _, row in group.iterrows():
        pit = row["PIT Date"]
        fp = row["FiscalPeriod"]
        period_raw = row.get(PERIOD_COL, pd.NA)
        period_label = None if pd.isna(period_raw) else str(period_raw)

        # 1) Determine lag values for each column in VALUE_COLUMNS and each k
        for col in VALUE_COLUMNS:
            for k in range(1, MAX_LAG_PERIODS + 1):
                lag_val = None
                if pd.notna(fp):
                    target_fp = fp - k
                    target_key = (target_fp, period_label)
                    info = last_for_fp.get(target_key)
                    if info is not None:
                        # info[1] is the dict of last known values for that (target_fp, period_label)
                        lag_val = info[1].get(col)
                lag_values[col][k].append(lag_val)

        # 2) Update our knowledge for the current (FiscalPeriod, period_label)
        if pd.notna(fp) and pd.notna(pit):
            key_curr = (fp, period_label)
            prev = last_for_fp.get(key_curr)

            # Previously stored values for this key (if any)
            prev_values = prev[1] if prev is not None else {}

            # Create a copy so we do not mutate the previous dict reference
            updated_values = dict(prev_values)

            # Update with any non-NaN values from this row
            for col in VALUE_COLUMNS:
                v = row[col]
                if pd.notna(v):
                    updated_values[col] = v

            # Only overwrite if this PIT is newer or equal to the previous PIT
            if prev is None or pit >= prev[0]:
                last_for_fp[key_curr] = (pit, updated_values)

    # 3) Attach lag columns "<col>_lagk" to the group DataFrame
    for col in VALUE_COLUMNS:
        for k in range(1, MAX_LAG_PERIODS + 1):
            group[f"{col}_lag{k}"] = lag_values[col][k]

    return group


# Apply the lag computation per (ID, HistCurrency) group
df_calc = (
    df_calc
    .groupby(["ID", "HistCurrency"], dropna=False, group_keys=False)
    .apply(compute_lags_for_group)
)

# Optionally convert FiscalPeriod back to Int64 (nullable integer)
df_calc["FiscalPeriod"] = pd.to_numeric(df_calc["FiscalPeriod"], errors="coerce").astype("Int64")

# Final result
result = df_calc

print("\nLag columns created and added to `result`:")
created_cols = []
for v in VALUE_COLUMNS:
    created_cols.append(v)
    for k in range(1, MAX_LAG_PERIODS + 1):
        created_cols.append(f"{v}_lag{k}")
print(created_cols)

# Optional preview
cols_to_show = (
    ["ID", "HistCurrency", "PIT Date", "FiscalPeriod", PERIOD_COL]
    + VALUE_COLUMNS
    + [f"{c}_lag{k}" for c in VALUE_COLUMNS for k in range(1, MAX_LAG_PERIODS + 1)]
)
print(result[cols_to_show].head(40))


# =============================================================================
# 3) SAVE RESULTING FILE AS "processed_<INPUT_FILE>" IN THE SAME FOLDER
# =============================================================================
output_filename = f"{INPUT_FILE}"
output_path = OUTPUT_DIR / output_filename

result.to_csv(output_path, sep=SEP, index=False)

print(f"\nProcessed file saved to: {output_path}")


Input dataset loaded into `result` (no rows dropped).
Columns in result:
['ID', 'PIT Date', 'HistCurrency', 'FiscalPeriod', 'AnnPITValue_Period', 'rev', 'at', 'cce', 'ltd', 'mi', 'ps', 'ce', 'oa', 'ol', 'noa', 'noa_lag1']


  .apply(compute_lags_for_group)
  .apply(compute_lags_for_group)



Lag columns created and added to `result`:
['at', 'at_lag1']
           ID HistCurrency   PIT Date  FiscalPeriod AnnPITValue_Period  \
0   C02500770          Ars 1995-12-29          1992                  A   
1   C02500770          Ars 1995-12-29          1992               <NA>   
2   C02500770          Ars 1995-12-29          1993                  A   
3   C02500770          Ars 1995-12-29          1993               <NA>   
4   C02500770          Ars 1995-12-29          1994                  A   
5   C02500770          Ars 1995-12-29          1994               <NA>   
6   C02500770          Ars 1996-05-03          1995                  A   
7   C02500770          Ars 1996-05-03          1995               <NA>   
8   C02500770          Ars 1998-07-03          1996                  A   
9   C02500770          Ars 1998-07-03          1996               <NA>   
10  C02500770          Ars 1998-07-03          1997                  A   
11  C02500770          Ars 1998-07-03          199

#### 3) Calculate CAT

In [81]:
from pathlib import Path
import pandas as pd
import numpy as np

# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file (INPUT_FILE) from OUTPUT_DIR.
# 2) Creates a new column (NEW_COLUMN) based on a flexible arithmetic formula
#    defined in FORMULA_EXPRESSION using existing columns.
# 3) The formula is only applied to rows where ALL columns listed in
#    FORMULA_COLUMNS are non-null. Otherwise, the result is set to NaN.
# 4) The updated DataFrame is stored in `df_out`.
# 5) A new column (NEW_COLUMN) is computed using DataFrame.eval.
# 6) All columns except "ID", "PIT Date", "HistCurrency", "FiscalPeriod",
#    and NEW_COLUMN are dropped.
# 7) The output filename is defined via config.
# 8) The result is written to disk using the configured output name.
# =============================================================================

# ========================= CONFIG ============================================
OUTPUT_DIR = Path(Temp_file_path_A)
SEP = "|"

INPUT_FILE = "processed_data_Cat.txt"               # <-- Input file
OUTPUT_FILE_NAME = "CAT.txt"  # <-- Output file

NEW_COLUMN = "cat"                             # <-- Name of the new computed column
FORMULA_EXPRESSION = "at - at_lag1"
FORMULA_COLUMNS = ["at", "at_lag1"]
# ============================================================================

# 1) LOAD INPUT FILE
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

df = pd.read_csv(path, sep=SEP)

# 2) CHECK THAT ALL REQUIRED COLUMNS EXIST
missing_formula_cols = [c for c in FORMULA_COLUMNS if c not in df.columns]
if missing_formula_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing required columns for formula: {missing_formula_cols}"
    )

# 3) INITIALIZE NEW COLUMN WITH NaN
df_out = df.copy()
df_out[NEW_COLUMN] = np.nan

# 4) BUILD MASK WHERE ALL REQUIRED COLUMNS ARE NON-NULL
mask_all_present = df_out[FORMULA_COLUMNS].notna().all(axis=1)

# 5) APPLY FORMULA ONLY WHERE mask_all_present IS TRUE
df_out.loc[mask_all_present, NEW_COLUMN] = df_out.loc[mask_all_present].eval(
    FORMULA_EXPRESSION,
    engine="python",
)

# 6) DROP ALL COLUMNS EXCEPT THE REQUIRED ONES
KEEP_COLUMNS = ["ID", "PIT Date", "HistCurrency", "FiscalPeriod", NEW_COLUMN]

missing_keep_cols = [c for c in KEEP_COLUMNS if c not in df_out.columns]
if missing_keep_cols:
    raise ValueError(f"Columns expected to be kept but not found: {missing_keep_cols}")

df_out = df_out[KEEP_COLUMNS]

# 7) DEFINE OUTPUT FILE NAME (FROM CONFIG)
formula_output_path = OUTPUT_DIR / OUTPUT_FILE_NAME

# 8) SAVE RESULT
df_out.to_csv(formula_output_path, sep=SEP, index=False)

print(f"New column '{NEW_COLUMN}' created based on formula:")
print("   ", FORMULA_EXPRESSION)
print("Rows used (all required columns non-null):", mask_all_present.sum())
print("Result was saved to:", formula_output_path)
print(df_out.head(20))


New column 'cat' created based on formula:
    at - at_lag1
Rows used (all required columns non-null): 712657
Result was saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/CAT.txt
           ID    PIT Date HistCurrency  FiscalPeriod  cat
0   C02500770  1995-12-29          Ars          1992  NaN
1   C02500770  1995-12-29          Ars          1992  NaN
2   C02500770  1995-12-29          Ars          1993  NaN
3   C02500770  1995-12-29          Ars          1993  NaN
4   C02500770  1995-12-29          Ars          1994  NaN
5   C02500770  1995-12-29          Ars          1994  NaN
6   C02500770  1996-05-03          Ars          1995  NaN
7   C02500770  1996-05-03          Ars          1995  NaN
8   C02500770  1998-07-03          Ars          1996  NaN
9   C02500770  1998-07-03          Ars          1996  NaN
10  C02500770  1998-07-03          Ars          1997  NaN
11  C02500770  1998-07-03          Ars          1997  NaN
12  C02500770  1999-10-01          Ars          19

### Cpm

In [82]:
from pathlib import Path
import pandas as pd
import numpy as np

# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file (INPUT_FILE) from OUTPUT_DIR.
# 2) Creates a new column (NEW_COLUMN) based on a flexible arithmetic formula
#    defined in FORMULA_EXPRESSION using existing columns.
# 3) The formula is only applied to rows where ALL columns listed in
#    FORMULA_COLUMNS are non-null. Otherwise, the result is set to NaN.
# 4) The updated DataFrame is stored in `df_out`.
# 5) A new column (NEW_COLUMN) is computed using DataFrame.eval.
# 6) All columns except "ID", "PIT Date", "HistCurrency", "FiscalPeriod",
#    and NEW_COLUMN are dropped.
# 7) The output filename is defined via config.
# 8) The result is written to disk using the configured output name.
# =============================================================================

# ========================= CONFIG ============================================
OUTPUT_DIR = Path(Temp_file_path_A)
SEP = "|"

INPUT_FILE = "processed_data_Cpm.txt"               # <-- Input file
OUTPUT_FILE_NAME = "CPM.txt"  # <-- Output file

NEW_COLUMN = "cpm"                             # <-- Name of the new computed column
FORMULA_EXPRESSION = "pm - pm_lag1"
FORMULA_COLUMNS = ["pm", "pm_lag1"]
# ============================================================================

# 1) LOAD INPUT FILE
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

df = pd.read_csv(path, sep=SEP)

# 2) CHECK THAT ALL REQUIRED COLUMNS EXIST
missing_formula_cols = [c for c in FORMULA_COLUMNS if c not in df.columns]
if missing_formula_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing required columns for formula: {missing_formula_cols}"
    )

# 3) INITIALIZE NEW COLUMN WITH NaN
df_out = df.copy()
df_out[NEW_COLUMN] = np.nan

# 4) BUILD MASK WHERE ALL REQUIRED COLUMNS ARE NON-NULL
mask_all_present = df_out[FORMULA_COLUMNS].notna().all(axis=1)

# 5) APPLY FORMULA ONLY WHERE mask_all_present IS TRUE
df_out.loc[mask_all_present, NEW_COLUMN] = df_out.loc[mask_all_present].eval(
    FORMULA_EXPRESSION,
    engine="python",
)

# 6) DROP ALL COLUMNS EXCEPT THE REQUIRED ONES
KEEP_COLUMNS = ["ID", "PIT Date", "HistCurrency", "FiscalPeriod", NEW_COLUMN]

missing_keep_cols = [c for c in KEEP_COLUMNS if c not in df_out.columns]
if missing_keep_cols:
    raise ValueError(f"Columns expected to be kept but not found: {missing_keep_cols}")

df_out = df_out[KEEP_COLUMNS]

# 7) DEFINE OUTPUT FILE NAME (FROM CONFIG)
formula_output_path = OUTPUT_DIR / OUTPUT_FILE_NAME

# 8) SAVE RESULT
df_out.to_csv(formula_output_path, sep=SEP, index=False)

print(f"New column '{NEW_COLUMN}' created based on formula:")
print("   ", FORMULA_EXPRESSION)
print("Rows used (all required columns non-null):", mask_all_present.sum())
print("Result was saved to:", formula_output_path)
print(df_out.head(20))


New column 'cpm' created based on formula:
    pm - pm_lag1
Rows used (all required columns non-null): 1495014
Result was saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/CPM.txt
           ID    PIT Date HistCurrency  FiscalPeriod       cpm
0   C02500770  1995-12-29          Ars          1992       NaN
1   C02500770  1995-12-29          Ars          1993 -0.014597
2   C02500770  1995-12-29          Ars          1994 -0.026003
3   C02500770  1996-05-03          Ars          1995 -0.008383
4   C02500770  1998-07-03          Ars          1996  0.051027
5   C02500770  1998-07-03          Ars          1997 -0.006505
6   C02500770  1999-10-01          Ars          1998  0.089222
7   C02500770  1999-10-08          Ars          1997  0.089205
8   C02500770  2000-05-19          Ars          1999       NaN
9   C02500770  2000-05-26          Ars          1999  0.157138
10  C02520200  1996-05-03          Ars          1987       NaN
11  C02520200  1996-05-03          Ars         

### Ec

#### 1) Calculate EG

In [83]:
from pathlib import Path
import pandas as pd
import numpy as np

# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file (INPUT_FILE) from OUTPUT_DIR.
# 2) Creates a new column (NEW_COLUMN) based on a flexible arithmetic formula
#    defined in FORMULA_EXPRESSION using existing columns.
# 3) The formula is only applied to rows where:
#      - ALL columns listed in FORMULA_COLUMNS are non-null, AND
#      - eps_y and eps_y_lag1 are either both negative or both positive.
#    Otherwise, the result is set to NaN.
# 4) The updated DataFrame is stored in `df_out`.
# 5) A new column (NEW_COLUMN) is computed using DataFrame.eval.
# 6) All columns except "ID", "PIT Date", "HistCurrency", "FiscalPeriod",
#    and NEW_COLUMN are dropped.
# 7) The output filename is defined via config.
# 8) The result is written to disk using the configured output name.
# =============================================================================

# ========================= CONFIG ============================================
OUTPUT_DIR = Path(Temp_file_path_A)  # assumes Temp_file_path_A is defined elsewhere
SEP = "|"

INPUT_FILE = "processed_data_Ec.txt"               # <-- Input file
OUTPUT_FILE_NAME = "processed_data_Ec.txt"             # <-- Output file

NEW_COLUMN = "eg"                             # <-- Name of the new computed column
FORMULA_EXPRESSION = "(eps - eps_lag1) / (0.5 * (eps_lag1 + eps_lag2))"
FORMULA_COLUMNS = ["eps", "eps_lag1", "eps_lag2"]
# ============================================================================

# 1) LOAD INPUT FILE
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

df = pd.read_csv(path, sep=SEP)

# 2) CHECK THAT ALL REQUIRED COLUMNS EXIST
missing_formula_cols = [c for c in FORMULA_COLUMNS if c not in df.columns]
if missing_formula_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing required columns for formula: {missing_formula_cols}"
    )

# 3) INITIALIZE NEW COLUMN WITH NaN
df_out = df.copy()
df_out[NEW_COLUMN] = np.nan

# 4) BUILD MASK WHERE ALL REQUIRED COLUMNS ARE NON-NULL
mask_all_present = df_out[FORMULA_COLUMNS].notna().all(axis=1)

# 4a) BUILD MASK WHERE eps_y AND eps_y_lag1 HAVE THE SAME SIGN (both + or both -)
mask_same_sign = (df_out["eps"] * df_out["eps_lag1"]) > 0

# 4b) FINAL MASK: NON-NULL AND SAME SIGN
final_mask = mask_all_present & mask_same_sign

# 5) APPLY FORMULA ONLY WHERE final_mask IS TRUE
df_out.loc[final_mask, NEW_COLUMN] = df_out.loc[final_mask].eval(
    FORMULA_EXPRESSION,
    engine="python",
)

# 7) DEFINE OUTPUT FILE NAME (FROM CONFIG)
formula_output_path = OUTPUT_DIR / OUTPUT_FILE_NAME

# 8) SAVE RESULT
df_out.to_csv(formula_output_path, sep=SEP, index=False)

print(f"New column '{NEW_COLUMN}' created based on formula:")
print("   ", FORMULA_EXPRESSION)
print("Rows used (all required columns non-null & same sign):", final_mask.sum())
print("Result was saved to:", formula_output_path)
print(df_out.head(20))


New column 'eg' created based on formula:
    (eps - eps_lag1) / (0.5 * (eps_lag1 + eps_lag2))
Rows used (all required columns non-null & same sign): 1376979
Result was saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/processed_data_Ec.txt
           ID    PIT Date HistCurrency  FiscalPeriod  AnnPITValue_Period  \
0   C02500770  1998-10-02          Ars          1992                 NaN   
1   C02500770  1998-10-02          Ars          1993                 NaN   
2   C02500770  1998-10-02          Ars          1994                 NaN   
3   C02500770  1998-10-02          Ars          1995                 NaN   
4   C02500770  1998-10-02          Ars          1996                 NaN   
5   C02500770  1998-10-02          Ars          1997                 NaN   
6   C02500770  1999-07-30          Ars          1992                 NaN   
7   C02500770  1999-07-30          Ars          1993                 NaN   
8   C02500770  1999-07-30          Ars          1997      

#### 2) Get Lagged EG

In [84]:
from pathlib import Path
import pandas as pd

# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file from OUTPUT_DIR.
# 2) Computes, for each column in VALUE_COLUMNS, lagged columns "<col>_lagk"
#    for k = 1..MAX_LAG_PERIODS.
#    - For each (ID, HistCurrency) group, sorted by PIT Date and FiscalPeriod,
#      "<col>_lagk" is the most recent known value of <col> for
#      FiscalPeriod - k *and the same AnnPITValue_Period* based on all PIT
#      updates observed up to that row.
#      Example: for a row 2020 Q1, "<col>_lag1" is the value from 2019 Q1,
#      not from 2019 A or 2020 anything.
# 3) Keeps all rows (no deletions), only converts types and appends lag columns.
# 4) Saves the resulting DataFrame to the same folder as:
#       processed_<INPUT_FILE>
#    e.g. INPUT_FILE = "ag.txt"  ->  "processed_ag.txt"
#
# The final result is stored in the variable `result`.
# =============================================================================

# ================= CONFIG =================
OUTPUT_DIR = Path(Temp_file_path_A)   # Base directory for input/output files
SEP = "|"                             # Delimiter used in the text files

INPUT_FILE = "processed_data_Ec.txt"           # Name of the input file to read
OUTPUT_FILE_NAME = "processed_data_Ec.txt" # <-- CONFIGURABLE OUTPUT FILE

VALUE_COLUMNS = ["eg"]            # Columns for which to compute the lags "<col>_lagk"

# Number of prior FiscalPeriods to compute lags for:
#   1 -> only t-1
#   2 -> t-1 and t-2
#   ...
MAX_LAG_PERIODS = 4

# NEW: period label column (Q1, Q2, A, etc.)
PERIOD_COL = "AnnPITValue_Period"
# ==========================================


# =============================================================================
# 1) LOAD INPUT FILE (NO ROWS DROPPED)
# =============================================================================
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

# Read raw data
df = pd.read_csv(path, sep=SEP)

# Required base columns for the PIT logic
required_base_cols = ["ID", "PIT Date", "HistCurrency", "FiscalPeriod", PERIOD_COL]
missing_base = [c for c in required_base_cols if c not in df.columns]
if missing_base:
    raise ValueError(f"{INPUT_FILE}: missing required base columns {missing_base}")

# Check that all requested value columns exist
missing_value_cols = [c for c in VALUE_COLUMNS if c not in df.columns]
if missing_value_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing value columns specified in VALUE_COLUMNS: {missing_value_cols}"
    )

# Type casting only, no row drops
df["PIT Date"] = pd.to_datetime(df["PIT Date"], errors="coerce")
df["FiscalPeriod"] = pd.to_numeric(df["FiscalPeriod"], errors="coerce")
df[PERIOD_COL] = df[PERIOD_COL].astype("string")

# Work on a copy to keep the original df untouched
result = df.copy()

print("Input dataset loaded into `result` (no rows dropped).")
print("Columns in result:")
print(list(result.columns))


# =============================================================================
# 2) COMPUTE LAG COLUMNS "<col>_lagk" FOR EACH COLUMN IN VALUE_COLUMNS
# =============================================================================
# NEW LOGIC:
# For each group (ID, HistCurrency) along the PIT timeline:
# - We maintain, for each (FiscalPeriod, AnnPITValue_Period) pair, the latest PIT
#   and the latest known values for all columns in VALUE_COLUMNS.
# - For a row with (FiscalPeriod = t, period = P), the lag column "<col>_lagk"
#   is defined as the last known value of <col> for (FiscalPeriod = t-k,
#   period = P) based on all PIT updates observed up to (and including) the
#   current PIT Date.
#
# If AnnPITValue_Period is missing, we fall back to using only FiscalPeriod
# (i.e., we treat the period label as None).

df_calc = result.copy()

def compute_lags_for_group(group: pd.DataFrame) -> pd.DataFrame:
    """
    Compute lagged values "<col>_lagk" for all columns in VALUE_COLUMNS
    within a single (ID, HistCurrency) group, for k=1..MAX_LAG_PERIODS,
    keyed by (FiscalPeriod, AnnPITValue_Period).
    """
    # Sort chronologically so updates are processed in correct temporal order
    group = group.sort_values(["PIT Date", "FiscalPeriod"], ascending=[True, True])

    # last_for_fp stores, for each (FiscalPeriod, PeriodLabel), the latest PIT
    # and the known values.
    # Structure:
    #   last_for_fp = {
    #       (fp, period_label): (last_pit, {col: last_value_for_col, ...})
    #   }
    last_for_fp = {}

    # For each value column and lag k, we collect lag values row by row
    lag_values = {
        col: {k: [] for k in range(1, MAX_LAG_PERIODS + 1)}
        for col in VALUE_COLUMNS
    }

    # Iterate through rows in PIT and FiscalPeriod order
    for _, row in group.iterrows():
        pit = row["PIT Date"]
        fp = row["FiscalPeriod"]
        period_raw = row.get(PERIOD_COL, pd.NA)
        period_label = None if pd.isna(period_raw) else str(period_raw)

        # 1) Determine lag values for each column in VALUE_COLUMNS and each k
        for col in VALUE_COLUMNS:
            for k in range(1, MAX_LAG_PERIODS + 1):
                lag_val = None
                if pd.notna(fp):
                    target_fp = fp - k
                    target_key = (target_fp, period_label)
                    info = last_for_fp.get(target_key)
                    if info is not None:
                        # info[1] is the dict of last known values for that (target_fp, period_label)
                        lag_val = info[1].get(col)
                lag_values[col][k].append(lag_val)

        # 2) Update our knowledge for the current (FiscalPeriod, period_label)
        if pd.notna(fp) and pd.notna(pit):
            key_curr = (fp, period_label)
            prev = last_for_fp.get(key_curr)

            # Previously stored values for this key (if any)
            prev_values = prev[1] if prev is not None else {}

            # Create a copy so we do not mutate the previous dict reference
            updated_values = dict(prev_values)

            # Update with any non-NaN values from this row
            for col in VALUE_COLUMNS:
                v = row[col]
                if pd.notna(v):
                    updated_values[col] = v

            # Only overwrite if this PIT is newer or equal to the previous PIT
            if prev is None or pit >= prev[0]:
                last_for_fp[key_curr] = (pit, updated_values)

    # 3) Attach lag columns "<col>_lagk" to the group DataFrame
    for col in VALUE_COLUMNS:
        for k in range(1, MAX_LAG_PERIODS + 1):
            group[f"{col}_lag{k}"] = lag_values[col][k]

    return group


# Apply the lag computation per (ID, HistCurrency) group
df_calc = (
    df_calc
    .groupby(["ID", "HistCurrency"], dropna=False, group_keys=False)
    .apply(compute_lags_for_group)
)

# Optionally convert FiscalPeriod back to Int64 (nullable integer)
df_calc["FiscalPeriod"] = pd.to_numeric(df_calc["FiscalPeriod"], errors="coerce").astype("Int64")

# Final result
result = df_calc

print("\nLag columns created and added to `result`:")
created_cols = []
for v in VALUE_COLUMNS:
    created_cols.append(v)
    for k in range(1, MAX_LAG_PERIODS + 1):
        created_cols.append(f"{v}_lag{k}")
print(created_cols)

# Optional preview
cols_to_show = (
    ["ID", "HistCurrency", "PIT Date", "FiscalPeriod", PERIOD_COL]
    + VALUE_COLUMNS
    + [f"{c}_lag{k}" for c in VALUE_COLUMNS for k in range(1, MAX_LAG_PERIODS + 1)]
)
print(result[cols_to_show].head(40))


# =============================================================================
# 3) SAVE RESULTING FILE AS "processed_<INPUT_FILE>" IN THE SAME FOLDER
# =============================================================================
output_filename = f"{INPUT_FILE}"
output_path = OUTPUT_DIR / output_filename

result.to_csv(output_path, sep=SEP, index=False)

print(f"\nProcessed file saved to: {output_path}")


Input dataset loaded into `result` (no rows dropped).
Columns in result:
['ID', 'PIT Date', 'HistCurrency', 'FiscalPeriod', 'AnnPITValue_Period', 'eps', 'eps_lag1', 'eps_lag2', 'eg']


  .apply(compute_lags_for_group)
  .apply(compute_lags_for_group)



Lag columns created and added to `result`:
['eg', 'eg_lag1', 'eg_lag2', 'eg_lag3', 'eg_lag4']
           ID HistCurrency   PIT Date  FiscalPeriod AnnPITValue_Period  \
0   C02500770          Ars 1998-10-02          1992               <NA>   
1   C02500770          Ars 1998-10-02          1993               <NA>   
2   C02500770          Ars 1998-10-02          1994               <NA>   
3   C02500770          Ars 1998-10-02          1995               <NA>   
4   C02500770          Ars 1998-10-02          1996               <NA>   
5   C02500770          Ars 1998-10-02          1997               <NA>   
6   C02500770          Ars 1999-07-30          1992               <NA>   
7   C02500770          Ars 1999-07-30          1993               <NA>   
8   C02500770          Ars 1999-07-30          1997               <NA>   
9   C02500770          Ars 1999-10-01          1992               <NA>   
10  C02500770          Ars 1999-10-01          1993               <NA>   
11  C02500770    

#### 3) Calculate EC

In [85]:
from pathlib import Path
import pandas as pd
import numpy as np

# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file (INPUT_FILE) from OUTPUT_DIR.
# 2) Creates a new column (NEW_COLUMN) based on a flexible arithmetic formula
#    defined in FORMULA_EXPRESSION using existing columns.
# 3) The formula is only applied to rows where ALL columns listed in
#    FORMULA_COLUMNS are non-null. Otherwise, the result is set to NaN.
# 4) The updated DataFrame is stored in `df_out`.
# 5) A new column (NEW_COLUMN) is computed using DataFrame.eval.
# 6) All columns except "ID", "PIT Date", "HistCurrency", "FiscalPeriod",
#    and NEW_COLUMN are dropped.
# 7) The output filename is defined via config.
# 8) The result is written to disk using the configured output name.
# =============================================================================

# ========================= CONFIG ============================================
OUTPUT_DIR = Path(Temp_file_path_A)
SEP = "|"

INPUT_FILE = "processed_data_Ec.txt"               # <-- Input file
OUTPUT_FILE_NAME = "EC.txt"  # <-- Output file

NEW_COLUMN = "ec"                             # <-- Name of the new computed column
FORMULA_EXPRESSION = "0.2 * (eg + eg_lag1 + eg_lag2 + eg_lag3 + eg_lag4)"
FORMULA_COLUMNS = ["eg", "eg_lag1", "eg_lag2", "eg_lag3", "eg_lag4"]
# ============================================================================

# 1) LOAD INPUT FILE
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

df = pd.read_csv(path, sep=SEP)

# 2) CHECK THAT ALL REQUIRED COLUMNS EXIST
missing_formula_cols = [c for c in FORMULA_COLUMNS if c not in df.columns]
if missing_formula_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing required columns for formula: {missing_formula_cols}"
    )

# 3) INITIALIZE NEW COLUMN WITH NaN
df_out = df.copy()
df_out[NEW_COLUMN] = np.nan

# 4) BUILD MASK WHERE ALL REQUIRED COLUMNS ARE NON-NULL
mask_all_present = df_out[FORMULA_COLUMNS].notna().all(axis=1)

# 5) APPLY FORMULA ONLY WHERE mask_all_present IS TRUE
df_out.loc[mask_all_present, NEW_COLUMN] = df_out.loc[mask_all_present].eval(
    FORMULA_EXPRESSION,
    engine="python",
)

# 6) DROP ALL COLUMNS EXCEPT THE REQUIRED ONES
KEEP_COLUMNS = ["ID", "PIT Date", "HistCurrency", "FiscalPeriod", NEW_COLUMN]

missing_keep_cols = [c for c in KEEP_COLUMNS if c not in df_out.columns]
if missing_keep_cols:
    raise ValueError(f"Columns expected to be kept but not found: {missing_keep_cols}")

df_out = df_out[KEEP_COLUMNS]

# 7) DEFINE OUTPUT FILE NAME (FROM CONFIG)
formula_output_path = OUTPUT_DIR / OUTPUT_FILE_NAME

# 8) SAVE RESULT
df_out.to_csv(formula_output_path, sep=SEP, index=False)

print(f"New column '{NEW_COLUMN}' created based on formula:")
print("   ", FORMULA_EXPRESSION)
print("Rows used (all required columns non-null):", mask_all_present.sum())
print("Result was saved to:", formula_output_path)
print(df_out.head(20))


New column 'ec' created based on formula:
    0.2 * (eg + eg_lag1 + eg_lag2 + eg_lag3 + eg_lag4)
Rows used (all required columns non-null): 640521
Result was saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/EC.txt
           ID    PIT Date HistCurrency  FiscalPeriod  ec
0   C02500770  1998-10-02          Ars          1992 NaN
1   C02500770  1998-10-02          Ars          1993 NaN
2   C02500770  1998-10-02          Ars          1994 NaN
3   C02500770  1998-10-02          Ars          1995 NaN
4   C02500770  1998-10-02          Ars          1996 NaN
5   C02500770  1998-10-02          Ars          1997 NaN
6   C02500770  1999-07-30          Ars          1992 NaN
7   C02500770  1999-07-30          Ars          1993 NaN
8   C02500770  1999-07-30          Ars          1997 NaN
9   C02500770  1999-10-01          Ars          1992 NaN
10  C02500770  1999-10-01          Ars          1993 NaN
11  C02500770  1999-10-01          Ars          1994 NaN
12  C02500770  1999-10-01  

### Es

#### Drift

In [86]:
# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file (INPUT_FILE) from OUTPUT_DIR.
# 2) Creates a new column (NEW_COLUMN = "Drift") representing the expected
#    annual EPS change ("drift") at time t.
#
# Definition of Drift:
# --------------------
# Drift_t is defined as the mean of the prior three year-to-year EPS changes,
# using only information available strictly before time t:
#
#   Drift_t = [ (eps_{t-1} - eps_{t-2})
#             + (eps_{t-2} - eps_{t-3})
#             + (eps_{t-3} - eps_{t-4}) ] / 3
#
# Column conventions:
#   eps        = EPS at time t
#   eps_lag1   = EPS at time t-1
#   eps_lag2   = EPS at time t-2
#   eps_lag3   = EPS at time t-3
#   eps_lag4   = EPS at time t-4
#
# Implementation details:
# -----------------------
# - Drift is computed ONLY for rows where eps_lag1 .. eps_lag4 are all non-null.
# - If any required lag is missing, Drift is set to NaN.
# - No partial averages, fallbacks, or tolerance logic are applied.
# - The updated DataFrame overwrites the original file on disk.
# =============================================================================

from pathlib import Path
import pandas as pd
import numpy as np

# ========================= CONFIG ============================================
OUTPUT_DIR = Path(Temp_file_path_A)
SEP = "|"

INPUT_FILE = "processed_data_Es.txt"   # Input file name
NEW_COLUMN = "Drift"                   # Name of the computed column

# Required EPS level columns (see summary for definitions)
REQUIRED_COLUMNS = ["eps_lag1", "eps_lag2", "eps_lag3", "eps_lag4"]
# ============================================================================

# 1) LOAD INPUT FILE
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

df = pd.read_csv(path, sep=SEP)

# 2) CHECK THAT ALL REQUIRED COLUMNS EXIST
missing_cols = [c for c in REQUIRED_COLUMNS if c not in df.columns]
if missing_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing required columns for drift calculation: {missing_cols}"
    )

# 3) INITIALIZE OUTPUT DATAFRAME AND NEW COLUMN
df_out = df.copy()
df_out[NEW_COLUMN] = np.nan

# 4) BUILD MASK: ALL REQUIRED LAGS MUST BE NON-NULL
mask_all_present = df_out[REQUIRED_COLUMNS].notna().all(axis=1)

# 5) COMPUTE PRIOR EPS CHANGES
d1 = df_out["eps_lag1"] - df_out["eps_lag2"]  # eps_{t-1} - eps_{t-2}
d2 = df_out["eps_lag2"] - df_out["eps_lag3"]  # eps_{t-2} - eps_{t-3}
d3 = df_out["eps_lag3"] - df_out["eps_lag4"]  # eps_{t-3} - eps_{t-4}

# 6) COMPUTE DRIFT (ONLY WHERE ALL INPUTS ARE PRESENT)
df_out.loc[mask_all_present, NEW_COLUMN] = (
    d1 + d2 + d3
).loc[mask_all_present] / 3.0

# 7) SAVE RESULT (OVERWRITE INPUT FILE)
output_path = OUTPUT_DIR / INPUT_FILE
df_out.to_csv(output_path, sep=SEP, index=False)

# 8) LOG SUMMARY
print(f"New column '{NEW_COLUMN}' created as mean of prior three EPS changes.")
print("Rows with all required non-null lags used:", int(mask_all_present.sum()))
print("Result was saved to:", output_path)
print(df_out[REQUIRED_COLUMNS + [NEW_COLUMN]].head(20))


New column 'Drift' created as mean of prior three EPS changes.
Rows with all required non-null lags used: 1381544
Result was saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/processed_data_Es.txt
    eps_lag1  eps_lag2  eps_lag3  eps_lag4     Drift
0        NaN       NaN       NaN       NaN       NaN
1    0.87000       NaN       NaN       NaN       NaN
2    0.82000   0.87000       NaN       NaN       NaN
3    0.99024   0.82000   0.87000       NaN       NaN
4   -0.39000   0.99024   0.82000   0.87000 -0.420000
5    0.35000  -0.39000   0.99024   0.82000 -0.156667
6        NaN       NaN       NaN       NaN       NaN
7    0.88000       NaN       NaN       NaN       NaN
8    0.35000  -0.39000   0.99024   0.83000 -0.160000
9        NaN       NaN       NaN       NaN       NaN
10   0.87691       NaN       NaN       NaN       NaN
11   0.82611   0.87691       NaN       NaN       NaN
12   0.99024   0.82611   0.87691       NaN       NaN
13  -0.38913   0.99024   0.82611   0.87691 -

#### sd

In [87]:
# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file (INPUT_FILE) from OUTPUT_DIR.
# 2) Creates a new column (NEW_COLUMN = "sd") representing the SAMPLE standard
#    deviation (ddof=1) of the prior-three EPS CHANGE deviations from Drift.
#
# Context / consistency with Drift:
# ---------------------------------
# You defined:
#   eps        = EPS at time t
#   eps_lag1   = EPS at time t-1
#   eps_lag2   = EPS at time t-2
#   eps_lag3   = EPS at time t-3
#   eps_lag4   = EPS at time t-4
#
# Drift_t is assumed to already exist in the file and is defined as:
#   Drift_t = mean( (eps_{t-1}-eps_{t-2}), (eps_{t-2}-eps_{t-3}), (eps_{t-3}-eps_{t-4}) )
#
# SD definition:
# --------------
# Let the three prior changes be:
#   d1 = eps_lag1 - eps_lag2   # (t-1) - (t-2)
#   d2 = eps_lag2 - eps_lag3   # (t-2) - (t-3)
#   d3 = eps_lag3 - eps_lag4   # (t-3) - (t-4)
#
# Then:
#   sd_t = sample_std( [d1 - Drift_t, d2 - Drift_t, d3 - Drift_t] )  with ddof=1
#
# Implementation details:
# -----------------------
# - sd is computed ONLY for rows where eps_lag1..eps_lag4 are all non-null AND
#   Drift is non-null.
# - If any required input is missing, sd is set to NaN.
# - No partial computations, fallbacks, or tolerance logic are applied.
# - The updated DataFrame overwrites the original file on disk.
# =============================================================================

from pathlib import Path
import pandas as pd
import numpy as np

# ========================= CONFIG ============================================
OUTPUT_DIR = Path(Temp_file_path_A)
SEP = "|"

INPUT_FILE = "processed_data_Es.txt"  # File that already contains 'Drift'
NEW_COLUMN = "sd"                     # Name of the new computed column

# Required EPS level columns to build the three prior changes
LEVEL_COLUMNS = ["eps_lag1", "eps_lag2", "eps_lag3", "eps_lag4"]
DRIFT_COL = "Drift"
# ============================================================================

# 1) LOAD INPUT FILE
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

df = pd.read_csv(path, sep=SEP)

# 2) CHECK THAT ALL REQUIRED COLUMNS EXIST
missing = [c for c in (LEVEL_COLUMNS + [DRIFT_COL]) if c not in df.columns]
if missing:
    raise ValueError(
        f"{INPUT_FILE}: missing required columns for sd calculation: {missing}"
    )

# 3) INITIALIZE OUTPUT DATAFRAME AND NEW COLUMN
df_out = df.copy()
df_out[NEW_COLUMN] = np.nan

# 4) ENSURE NUMERIC TYPES (COERCE NON-NUMERIC TO NaN)
df_out[DRIFT_COL] = pd.to_numeric(df_out[DRIFT_COL], errors="coerce")
for c in LEVEL_COLUMNS:
    df_out[c] = pd.to_numeric(df_out[c], errors="coerce")

# 5) BUILD MASK: ALL REQUIRED LAGS MUST BE NON-NULL AND Drift MUST BE NON-NULL
mask_all_present = df_out[LEVEL_COLUMNS].notna().all(axis=1) & df_out[DRIFT_COL].notna()

# 6) COMPUTE THE THREE PRIOR EPS CHANGES (d1, d2, d3)
d1 = df_out["eps_lag1"] - df_out["eps_lag2"]  # eps_{t-1} - eps_{t-2}
d2 = df_out["eps_lag2"] - df_out["eps_lag3"]  # eps_{t-2} - eps_{t-3}
d3 = df_out["eps_lag3"] - df_out["eps_lag4"]  # eps_{t-3} - eps_{t-4}

# 7) COMPUTE SAMPLE STANDARD DEVIATION OF (di - Drift), i in {1,2,3}
#    We form a 3-column DataFrame of deviations and then take row-wise std(ddof=1).
dev = pd.DataFrame(
    {
        "dev_d1": d1 - df_out[DRIFT_COL],
        "dev_d2": d2 - df_out[DRIFT_COL],
        "dev_d3": d3 - df_out[DRIFT_COL],
    },
    index=df_out.index,
)

df_out.loc[mask_all_present, NEW_COLUMN] = dev.loc[mask_all_present].std(axis=1, ddof=1)

# 8) SAVE RESULT (OVERWRITE INPUT FILE)
output_path = OUTPUT_DIR / INPUT_FILE
df_out.to_csv(output_path, sep=SEP, index=False)

# 9) LOG SUMMARY
print(
    f"New column '{NEW_COLUMN}' created as sample std (ddof=1) of [(d1-Drift),(d2-Drift),(d3-Drift)], "
    f"where d1=eps_lag1-eps_lag2, d2=eps_lag2-eps_lag3, d3=eps_lag3-eps_lag4."
)
print("Rows with all required non-null lags and non-null Drift used:", int(mask_all_present.sum()))
print("Result was saved to:", output_path)
print(df_out[LEVEL_COLUMNS + [DRIFT_COL, NEW_COLUMN]].head(20))


New column 'sd' created as sample std (ddof=1) of [(d1-Drift),(d2-Drift),(d3-Drift)], where d1=eps_lag1-eps_lag2, d2=eps_lag2-eps_lag3, d3=eps_lag3-eps_lag4.
Rows with all required non-null lags and non-null Drift used: 1381544
Result was saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/processed_data_Es.txt
    eps_lag1  eps_lag2  eps_lag3  eps_lag4     Drift        sd
0        NaN       NaN       NaN       NaN       NaN       NaN
1    0.87000       NaN       NaN       NaN       NaN       NaN
2    0.82000   0.87000       NaN       NaN       NaN       NaN
3    0.99024   0.82000   0.87000       NaN       NaN       NaN
4   -0.39000   0.99024   0.82000   0.87000 -0.420000  0.838852
5    0.35000  -0.39000   0.99024   0.82000 -0.156667  1.097272
6        NaN       NaN       NaN       NaN       NaN       NaN
7    0.88000       NaN       NaN       NaN       NaN       NaN
8    0.35000  -0.39000   0.99024   0.83000 -0.160000  1.095796
9        NaN       NaN       NaN       NaN

#### Es

In [88]:
from pathlib import Path
import pandas as pd
import numpy as np

# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file (INPUT_FILE) from OUTPUT_DIR.
# 2) Creates a new column (NEW_COLUMN = "es") as a standardized EPS surprise:
#
#       es_t = ( (eps_t - eps_{t-1}) - Drift_t ) / sd_t
#
#    where:
#      - eps        = eps_t
#      - eps_lag1   = eps_{t-1}
#      - Drift      = mean of prior EPS changes (already computed in the file)
#      - sd         = sample std (ddof=1) of prior-change deviations from Drift
#                    (already computed in the file)
#
# 3) The formula is applied ONLY when ALL required inputs are non-null AND
#    sd is non-zero (checking 5 decimal places). Otherwise, es is set to NaN.
# 4) The updated DataFrame is stored in `df_out`.
# 5) The formula is computed using DataFrame.eval (engine="python").
# 6) All columns except "ID", "PIT Date", "HistCurrency", "FiscalPeriod",
#    and NEW_COLUMN are dropped.
# 7) The result is written to disk under OUTPUT_FILE_NAME.
#
# Notes:
# - No partial computations are performed: if any input is missing, output is NaN.
# - An explicit guard is applied to avoid division by zero (sd rounded to 5 decimals == 0).
# - If the expression evaluates to non-finite values (inf/-inf), they are set to NaN.
# =============================================================================

# ========================= CONFIG ============================================
OUTPUT_DIR = Path(Temp_file_path_A)
SEP = "|"

INPUT_FILE = "processed_data_Es.txt"      # Input file (must already contain Drift and sd)
OUTPUT_FILE_NAME = "ES.txt"               # Output file

NEW_COLUMN = "es"                         # Name of the computed column
FORMULA_EXPRESSION = "((eps - eps_lag1) - Drift) / sd"
FORMULA_COLUMNS = ["eps", "eps_lag1", "Drift", "sd"]
# ============================================================================

# 1) LOAD INPUT FILE
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

df = pd.read_csv(path, sep=SEP)

# 2) CHECK THAT ALL REQUIRED COLUMNS EXIST
missing_formula_cols = [c for c in FORMULA_COLUMNS if c not in df.columns]
if missing_formula_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing required columns for formula: {missing_formula_cols}"
    )

# 3) INITIALIZE NEW COLUMN WITH NaN
df_out = df.copy()
df_out[NEW_COLUMN] = np.nan

# 4) ENSURE NUMERIC TYPES (COERCE NON-NUMERIC TO NaN)
for c in FORMULA_COLUMNS:
    df_out[c] = pd.to_numeric(df_out[c], errors="coerce")

# 5) BUILD MASK WHERE:
#    - all required inputs are non-null, AND
#    - sd is non-zero (TREATING ANYTHING < 0.00001 AS ZERO)
#    We round 'sd' to 5 decimals; if that is 0, we treat it as 0.
mask_all_present = (
    df_out[FORMULA_COLUMNS].notna().all(axis=1) 
    & (df_out["sd"].round(5) != 0)
)

# 6) APPLY FORMULA ONLY WHERE mask_all_present IS TRUE
df_out.loc[mask_all_present, NEW_COLUMN] = df_out.loc[mask_all_present].eval(
    FORMULA_EXPRESSION,
    engine="python",
)

# 7) SANITIZE NON-FINITE RESULTS (inf/-inf) -> NaN
#    This can happen if sd is extremely close to zero or due to numeric issues.
df_out.loc[~np.isfinite(df_out[NEW_COLUMN]), NEW_COLUMN] = np.nan

# 8) DROP ALL COLUMNS EXCEPT THE REQUIRED ONES
KEEP_COLUMNS = ["ID", "PIT Date", "HistCurrency", "FiscalPeriod", NEW_COLUMN]

missing_keep_cols = [c for c in KEEP_COLUMNS if c not in df_out.columns]
if missing_keep_cols:
    raise ValueError(f"Columns expected to be kept but not found: {missing_keep_cols}")

df_out = df_out[KEEP_COLUMNS]

# 9) DEFINE OUTPUT FILE PATH (FROM CONFIG)
formula_output_path = OUTPUT_DIR / OUTPUT_FILE_NAME

# 10) SAVE RESULT
df_out.to_csv(formula_output_path, sep=SEP, index=False)

# 11) PRINT SUMMARY
print(f"New column '{NEW_COLUMN}' created based on formula:")
print("   ", FORMULA_EXPRESSION)
print("Rows used (inputs present and sd != 0 at 5 decimals):", int(mask_all_present.sum()))
print("Result was saved to:", formula_output_path)
print(df_out.head(20))

New column 'es' created based on formula:
    ((eps - eps_lag1) - Drift) / sd
Rows used (inputs present and sd != 0 at 5 decimals): 1380848
Result was saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/ES.txt
           ID    PIT Date HistCurrency  FiscalPeriod        es
0   C02500770  1998-10-02          Ars          1992       NaN
1   C02500770  1998-10-02          Ars          1993       NaN
2   C02500770  1998-10-02          Ars          1994       NaN
3   C02500770  1998-10-02          Ars          1995       NaN
4   C02500770  1998-10-02          Ars          1996  1.382843
5   C02500770  1998-10-02          Ars          1997 -0.695665
6   C02500770  1999-07-30          Ars          1992       NaN
7   C02500770  1999-07-30          Ars          1993       NaN
8   C02500770  1999-07-30          Ars          1997 -0.702685
9   C02500770  1999-10-01          Ars          1992       NaN
10  C02500770  1999-10-01          Ars          1993       NaN
11  C02500770  1999

### Gp

In [89]:
from pathlib import Path
import pandas as pd
import numpy as np

# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file (INPUT_FILE) from OUTPUT_DIR.
# 2) Creates a new column (NEW_COLUMN) based on a flexible arithmetic formula
#    defined in FORMULA_EXPRESSION using existing columns.
# 3) The formula is only applied to rows where ALL columns listed in
#    FORMULA_COLUMNS are non-null. Otherwise, the result is set to NaN.
# 4) The updated DataFrame is stored in `df_out`.
# 5) A new column (NEW_COLUMN) is computed using DataFrame.eval.
# 6) All columns except "ID", "PIT Date", "HistCurrency", "FiscalPeriod",
#    and NEW_COLUMN are dropped.
# 7) The output filename is defined via config.
# 8) The result is written to disk using the configured output name.
# =============================================================================

# ========================= CONFIG ============================================
OUTPUT_DIR = Path(Temp_file_path_A)
SEP = "|"

INPUT_FILE = "data_Gp.txt"               # <-- Input file
OUTPUT_FILE_NAME = "GP.txt"  # <-- Output file

NEW_COLUMN = "gp"                             # <-- Name of the new computed column
FORMULA_EXPRESSION = "(rev - cogs) / (at)"
FORMULA_COLUMNS = ["rev", "cogs", "at"]
# ============================================================================

# 1) LOAD INPUT FILE
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

df = pd.read_csv(path, sep=SEP)

# 2) CHECK THAT ALL REQUIRED COLUMNS EXIST
missing_formula_cols = [c for c in FORMULA_COLUMNS if c not in df.columns]
if missing_formula_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing required columns for formula: {missing_formula_cols}"
    )

# 3) INITIALIZE NEW COLUMN WITH NaN
df_out = df.copy()
df_out[NEW_COLUMN] = np.nan

# 4) BUILD MASK WHERE ALL REQUIRED COLUMNS ARE NON-NULL
mask_all_present = df_out[FORMULA_COLUMNS].notna().all(axis=1)

# 5) APPLY FORMULA ONLY WHERE mask_all_present IS TRUE
df_out.loc[mask_all_present, NEW_COLUMN] = df_out.loc[mask_all_present].eval(
    FORMULA_EXPRESSION,
    engine="python",
)

# 6) DROP ALL COLUMNS EXCEPT THE REQUIRED ONES
KEEP_COLUMNS = ["ID", "PIT Date", "HistCurrency", "FiscalPeriod", NEW_COLUMN]

missing_keep_cols = [c for c in KEEP_COLUMNS if c not in df_out.columns]
if missing_keep_cols:
    raise ValueError(f"Columns expected to be kept but not found: {missing_keep_cols}")

df_out = df_out[KEEP_COLUMNS]

# 7) DEFINE OUTPUT FILE NAME (FROM CONFIG)
formula_output_path = OUTPUT_DIR / OUTPUT_FILE_NAME

# 8) SAVE RESULT
df_out.to_csv(formula_output_path, sep=SEP, index=False)

print(f"New column '{NEW_COLUMN}' created based on formula:")
print("   ", FORMULA_EXPRESSION)
print("Rows used (all required columns non-null):", mask_all_present.sum())
print("Result was saved to:", formula_output_path)
print(df_out.head(20))


New column 'gp' created based on formula:
    (rev - cogs) / (at)
Rows used (all required columns non-null): 904364
Result was saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/GP.txt
           ID    PIT Date HistCurrency  FiscalPeriod        gp
0   C02500770  1995-12-29          Ars          1992  0.268970
1   C02500770  1995-12-29          Ars          1992       NaN
2   C02500770  1995-12-29          Ars          1993  0.236878
3   C02500770  1995-12-29          Ars          1993       NaN
4   C02500770  1995-12-29          Ars          1994  0.201818
5   C02500770  1995-12-29          Ars          1994       NaN
6   C02500770  1996-05-03          Ars          1995  0.135181
7   C02500770  1996-05-03          Ars          1995       NaN
8   C02500770  1998-07-03          Ars          1996  0.166334
9   C02500770  1998-07-03          Ars          1996       NaN
10  C02500770  1998-07-03          Ars          1997  0.094231
11  C02500770  1998-07-03          Ars     

### Ig

In [90]:
from pathlib import Path
import pandas as pd
import numpy as np

# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file (INPUT_FILE) from OUTPUT_DIR.
# 2) Creates a new column (NEW_COLUMN) based on a flexible arithmetic formula
#    defined in FORMULA_EXPRESSION using existing columns.
# 3) The formula is only applied to rows where ALL columns listed in
#    FORMULA_COLUMNS are non-null. Otherwise, the result is set to NaN.
# 4) The updated DataFrame is stored in `df_out`.
# 5) A new column (NEW_COLUMN) is computed using DataFrame.eval.
# 6) All columns except "ID", "PIT Date", "HistCurrency", "FiscalPeriod",
#    and NEW_COLUMN are dropped.
# 7) The output filename is defined via config.
# 8) The result is written to disk using the configured output name.
# =============================================================================

# ========================= CONFIG ============================================
OUTPUT_DIR = Path(Temp_file_path_A)
SEP = "|"

INPUT_FILE = "processed_data_Ig.txt"               # <-- Input file
OUTPUT_FILE_NAME = "IG.txt"  # <-- Output file

NEW_COLUMN = "ig"                             # <-- Name of the new computed column
FORMULA_EXPRESSION = "(inv - inv_lag1) / (0.5 * (at + at_lag1))"
FORMULA_COLUMNS = ["inv", "inv_lag1", "at", "at_lag1"]
# ============================================================================

# 1) LOAD INPUT FILE
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

df = pd.read_csv(path, sep=SEP)

# 2) CHECK THAT ALL REQUIRED COLUMNS EXIST
missing_formula_cols = [c for c in FORMULA_COLUMNS if c not in df.columns]
if missing_formula_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing required columns for formula: {missing_formula_cols}"
    )

# 3) INITIALIZE NEW COLUMN WITH NaN
df_out = df.copy()
df_out[NEW_COLUMN] = np.nan

# 4) BUILD MASK WHERE ALL REQUIRED COLUMNS ARE NON-NULL
mask_all_present = df_out[FORMULA_COLUMNS].notna().all(axis=1)

# 5) APPLY FORMULA ONLY WHERE mask_all_present IS TRUE
df_out.loc[mask_all_present, NEW_COLUMN] = df_out.loc[mask_all_present].eval(
    FORMULA_EXPRESSION,
    engine="python",
)

# 6) DROP ALL COLUMNS EXCEPT THE REQUIRED ONES
KEEP_COLUMNS = ["ID", "PIT Date", "HistCurrency", "FiscalPeriod", NEW_COLUMN]

missing_keep_cols = [c for c in KEEP_COLUMNS if c not in df_out.columns]
if missing_keep_cols:
    raise ValueError(f"Columns expected to be kept but not found: {missing_keep_cols}")

df_out = df_out[KEEP_COLUMNS]

# 7) DEFINE OUTPUT FILE NAME (FROM CONFIG)
formula_output_path = OUTPUT_DIR / OUTPUT_FILE_NAME

# 8) SAVE RESULT
df_out.to_csv(formula_output_path, sep=SEP, index=False)

print(f"New column '{NEW_COLUMN}' created based on formula:")
print("   ", FORMULA_EXPRESSION)
print("Rows used (all required columns non-null):", mask_all_present.sum())
print("Result was saved to:", formula_output_path)
print(df_out.head(20))


New column 'ig' created based on formula:
    (inv - inv_lag1) / (0.5 * (at + at_lag1))
Rows used (all required columns non-null): 1884025
Result was saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/IG.txt
           ID    PIT Date HistCurrency  FiscalPeriod        ig
0   C02500770  1995-12-29          Ars          1992       NaN
1   C02500770  1995-12-29          Ars          1993 -0.030810
2   C02500770  1995-12-29          Ars          1994  0.065061
3   C02500770  1996-05-03          Ars          1995 -0.047951
4   C02500770  1998-07-03          Ars          1996 -0.036206
5   C02500770  1998-07-03          Ars          1997 -0.002569
6   C02500770  1999-10-01          Ars          1998  0.026443
7   C02500770  2000-05-19          Ars          1999       NaN
8   C02500770  2000-05-26          Ars          1999 -0.045453
9   C02520200  1996-05-03          Ars          1987       NaN
10  C02520200  1996-05-03          Ars          1988       NaN
11  C02520200  1996-

### Inv

In [91]:
from pathlib import Path
import pandas as pd
import numpy as np

# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file (INPUT_FILE) from OUTPUT_DIR.
# 2) Creates a new column (NEW_COLUMN) based on a flexible arithmetic formula
#    defined in FORMULA_EXPRESSION using existing columns.
# 3) The formula is only applied to rows where ALL columns listed in
#    FORMULA_COLUMNS are non-null. Otherwise, the result is set to NaN.
# 4) The updated DataFrame is stored in `df_out`.
# 5) A new column (NEW_COLUMN) is computed using DataFrame.eval.
# 6) All columns except "ID", "PIT Date", "HistCurrency", "FiscalPeriod",
#    and NEW_COLUMN are dropped.
# 7) The output filename is defined via config.
# 8) The result is written to disk using the configured output name.
# =============================================================================

# ========================= CONFIG ============================================
OUTPUT_DIR = Path(Temp_file_path_A)
SEP = "|"

INPUT_FILE = "processed_data_Inv.txt"               # <-- Input file
OUTPUT_FILE_NAME = "INV.txt"  # <-- Output file

NEW_COLUMN = "inv"                             # <-- Name of the new computed column
FORMULA_EXPRESSION = "ce / ((1/3) * (ce_lag1 + ce_lag2 + ce_lag3))"
FORMULA_COLUMNS = ["ce", "ce_lag1", "ce_lag2", "ce_lag3"]
# ============================================================================

# 1) LOAD INPUT FILE
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

df = pd.read_csv(path, sep=SEP)

# 2) CHECK THAT ALL REQUIRED COLUMNS EXIST
missing_formula_cols = [c for c in FORMULA_COLUMNS if c not in df.columns]
if missing_formula_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing required columns for formula: {missing_formula_cols}"
    )

# 3) INITIALIZE NEW COLUMN WITH NaN
df_out = df.copy()
df_out[NEW_COLUMN] = np.nan

# 4) BUILD MASK WHERE ALL REQUIRED COLUMNS ARE NON-NULL
mask_all_present = df_out[FORMULA_COLUMNS].notna().all(axis=1)

# 5) APPLY FORMULA ONLY WHERE mask_all_present IS TRUE
df_out.loc[mask_all_present, NEW_COLUMN] = df_out.loc[mask_all_present].eval(
    FORMULA_EXPRESSION,
    engine="python",
)

# 6) DROP ALL COLUMNS EXCEPT THE REQUIRED ONES
KEEP_COLUMNS = ["ID", "PIT Date", "HistCurrency", "FiscalPeriod", NEW_COLUMN]

missing_keep_cols = [c for c in KEEP_COLUMNS if c not in df_out.columns]
if missing_keep_cols:
    raise ValueError(f"Columns expected to be kept but not found: {missing_keep_cols}")

df_out = df_out[KEEP_COLUMNS]

# 7) DEFINE OUTPUT FILE NAME (FROM CONFIG)
formula_output_path = OUTPUT_DIR / OUTPUT_FILE_NAME

# 8) SAVE RESULT
df_out.to_csv(formula_output_path, sep=SEP, index=False)

print(f"New column '{NEW_COLUMN}' created based on formula:")
print("   ", FORMULA_EXPRESSION)
print("Rows used (all required columns non-null):", mask_all_present.sum())
print("Result was saved to:", formula_output_path)
print(df_out.head(20))


New column 'inv' created based on formula:
    ce / ((1/3) * (ce_lag1 + ce_lag2 + ce_lag3))
Rows used (all required columns non-null): 988891
Result was saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/INV.txt
           ID    PIT Date HistCurrency  FiscalPeriod       inv
0   C02500770  1995-12-29          Ars          1992       NaN
1   C02500770  1995-12-29          Ars          1993       NaN
2   C02500770  1995-12-29          Ars          1994       NaN
3   C02500770  1996-05-03          Ars          1995  2.603457
4   C02500770  1998-07-03          Ars          1996  0.917631
5   C02500770  1998-07-03          Ars          1997  1.253880
6   C02500770  1999-10-01          Ars          1998  1.257093
7   C02500770  2000-05-19          Ars          1999       NaN
8   C02500770  2000-05-26          Ars          1999  0.759205
9   C02520200  1996-05-03          Ars          1989       NaN
10  C02520200  1996-05-03          Ars          1990       NaN
11  C02520200  1

### Ltg

#### 1) Calculate ACC

In [92]:
from pathlib import Path
import pandas as pd
import numpy as np

# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file (INPUT_FILE) from OUTPUT_DIR.
# 2) Creates a new column (NEW_COLUMN) based on a flexible arithmetic formula
#    defined in FORMULA_EXPRESSION using existing columns.
# 3) The formula is only applied to rows where ALL columns listed in
#    FORMULA_COLUMNS are non-null. Otherwise, the result is set to NaN.
# 4) The updated DataFrame is stored in `df_out`.
# 5) A new column (NEW_COLUMN) is computed using DataFrame.eval.
# 6) All columns except "ID", "PIT Date", "HistCurrency", "FiscalPeriod",
#    and NEW_COLUMN are dropped.
# 7) The output filename is defined via config.
# 8) The result is written to disk using the configured output name.
# =============================================================================

# ========================= CONFIG ============================================
OUTPUT_DIR = Path(Temp_file_path_A)
SEP = "|"

INPUT_FILE = "processed_data_Ltg.txt"               # <-- Input file
OUTPUT_FILE_NAME = "processed_data_Ltg.txt"  # <-- Output file

NEW_COLUMN = "acc"                             # <-- Name of the new computed column
FORMULA_EXPRESSION = "(wc - wc_lag1) / (0.5 * (at + at_lag1))"
FORMULA_COLUMNS = ["wc", "wc_lag1", "at", "at_lag1"]
# ============================================================================

# 1) LOAD INPUT FILE
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

df = pd.read_csv(path, sep=SEP)

# 2) CHECK THAT ALL REQUIRED COLUMNS EXIST
missing_formula_cols = [c for c in FORMULA_COLUMNS if c not in df.columns]
if missing_formula_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing required columns for formula: {missing_formula_cols}"
    )

# 3) INITIALIZE NEW COLUMN WITH NaN
df_out = df.copy()
df_out[NEW_COLUMN] = np.nan

# 4) BUILD MASK WHERE ALL REQUIRED COLUMNS ARE NON-NULL
mask_all_present = df_out[FORMULA_COLUMNS].notna().all(axis=1)

# 5) APPLY FORMULA ONLY WHERE mask_all_present IS TRUE
df_out.loc[mask_all_present, NEW_COLUMN] = df_out.loc[mask_all_present].eval(
    FORMULA_EXPRESSION,
    engine="python",
)

# 7) DEFINE OUTPUT FILE NAME (FROM CONFIG)
formula_output_path = OUTPUT_DIR / OUTPUT_FILE_NAME

# 8) SAVE RESULT
df_out.to_csv(formula_output_path, sep=SEP, index=False)

print(f"New column '{NEW_COLUMN}' created based on formula:")
print("   ", FORMULA_EXPRESSION)
print("Rows used (all required columns non-null):", mask_all_present.sum())
print("Result was saved to:", formula_output_path)
print(df_out.head(20))


New column 'acc' created based on formula:
    (wc - wc_lag1) / (0.5 * (at + at_lag1))
Rows used (all required columns non-null): 676068
Result was saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/processed_data_Ltg.txt
           ID    PIT Date HistCurrency  FiscalPeriod AnnPITValue_Period  \
0   C02500770  1995-12-29          Ars          1992                  A   
1   C02500770  1995-12-29          Ars          1992                NaN   
2   C02500770  1995-12-29          Ars          1993                  A   
3   C02500770  1995-12-29          Ars          1993                NaN   
4   C02500770  1995-12-29          Ars          1994                  A   
5   C02500770  1995-12-29          Ars          1994                NaN   
6   C02500770  1996-05-03          Ars          1995                  A   
7   C02500770  1996-05-03          Ars          1995                NaN   
8   C02500770  1998-07-03          Ars          1996                  A   
9   C0250077

#### 2) Calculate Ltg

In [93]:
from pathlib import Path
import pandas as pd
import numpy as np

# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file (INPUT_FILE) from OUTPUT_DIR.
# 2) Creates a new column (NEW_COLUMN) based on a flexible arithmetic formula
#    defined in FORMULA_EXPRESSION using existing columns.
# 3) The formula is only applied to rows where ALL columns listed in
#    FORMULA_COLUMNS are non-null. Otherwise, the result is set to NaN.
# 4) The updated DataFrame is stored in `df_out`.
# 5) A new column (NEW_COLUMN) is computed using DataFrame.eval.
# 6) All columns except "ID", "PIT Date", "HistCurrency", "FiscalPeriod",
#    and NEW_COLUMN are dropped.
# 7) The output filename is defined via config.
# 8) The result is written to disk using the configured output name.
# =============================================================================

# ========================= CONFIG ============================================
OUTPUT_DIR = Path(Temp_file_path_A)
SEP = "|"

INPUT_FILE = "processed_data_Ltg.txt"               # <-- Input file
OUTPUT_FILE_NAME = "LTG.txt"  # <-- Output file

NEW_COLUMN = "ltg"                             # <-- Name of the new computed column
FORMULA_EXPRESSION = "noa - noa_lag1 - acc"
FORMULA_COLUMNS = ["noa", "noa_lag1", "acc"]
# ============================================================================

# 1) LOAD INPUT FILE
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

df = pd.read_csv(path, sep=SEP)

# 2) CHECK THAT ALL REQUIRED COLUMNS EXIST
missing_formula_cols = [c for c in FORMULA_COLUMNS if c not in df.columns]
if missing_formula_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing required columns for formula: {missing_formula_cols}"
    )

# 3) INITIALIZE NEW COLUMN WITH NaN
df_out = df.copy()
df_out[NEW_COLUMN] = np.nan

# 4) BUILD MASK WHERE ALL REQUIRED COLUMNS ARE NON-NULL
mask_all_present = df_out[FORMULA_COLUMNS].notna().all(axis=1)

# 5) APPLY FORMULA ONLY WHERE mask_all_present IS TRUE
df_out.loc[mask_all_present, NEW_COLUMN] = df_out.loc[mask_all_present].eval(
    FORMULA_EXPRESSION,
    engine="python",
)

# 6) DROP ALL COLUMNS EXCEPT THE REQUIRED ONES
KEEP_COLUMNS = ["ID", "PIT Date", "HistCurrency", "FiscalPeriod", NEW_COLUMN]

missing_keep_cols = [c for c in KEEP_COLUMNS if c not in df_out.columns]
if missing_keep_cols:
    raise ValueError(f"Columns expected to be kept but not found: {missing_keep_cols}")

df_out = df_out[KEEP_COLUMNS]

# 7) DEFINE OUTPUT FILE NAME (FROM CONFIG)
formula_output_path = OUTPUT_DIR / OUTPUT_FILE_NAME

# 8) SAVE RESULT
df_out.to_csv(formula_output_path, sep=SEP, index=False)

print(f"New column '{NEW_COLUMN}' created based on formula:")
print("   ", FORMULA_EXPRESSION)
print("Rows used (all required columns non-null):", mask_all_present.sum())
print("Result was saved to:", formula_output_path)
print(df_out.head(20))


New column 'ltg' created based on formula:
    noa - noa_lag1 - acc
Rows used (all required columns non-null): 421940
Result was saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/LTG.txt
           ID    PIT Date HistCurrency  FiscalPeriod       ltg
0   C02500770  1995-12-29          Ars          1992       NaN
1   C02500770  1995-12-29          Ars          1992       NaN
2   C02500770  1995-12-29          Ars          1993  0.073143
3   C02500770  1995-12-29          Ars          1993       NaN
4   C02500770  1995-12-29          Ars          1994  0.011120
5   C02500770  1995-12-29          Ars          1994       NaN
6   C02500770  1996-05-03          Ars          1995  0.087325
7   C02500770  1996-05-03          Ars          1995       NaN
8   C02500770  1998-07-03          Ars          1996 -0.013402
9   C02500770  1998-07-03          Ars          1996       NaN
10  C02500770  1998-07-03          Ars          1997  0.041760
11  C02500770  1998-07-03          Ars  

### Nca

In [94]:
from pathlib import Path
import pandas as pd
import numpy as np

# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file (INPUT_FILE) from OUTPUT_DIR.
# 2) Creates a new column (NEW_COLUMN) based on a flexible arithmetic formula
#    defined in FORMULA_EXPRESSION using existing columns.
# 3) The formula is only applied to rows where ALL columns listed in
#    FORMULA_COLUMNS are non-null. Otherwise, the result is set to NaN.
# 4) The updated DataFrame is stored in `df_out`.
# 5) A new column (NEW_COLUMN) is computed using DataFrame.eval.
# 6) All columns except "ID", "PIT Date", "HistCurrency", "FiscalPeriod",
#    and NEW_COLUMN are dropped.
# 7) The output filename is defined via config.
# 8) The result is written to disk using the configured output name.
# =============================================================================

# ========================= CONFIG ============================================
OUTPUT_DIR = Path(Temp_file_path_A)
SEP = "|"

INPUT_FILE = "processed_data_Nca.txt"               # <-- Input file
OUTPUT_FILE_NAME = "NCA.txt"  # <-- Output file

NEW_COLUMN = "nca"                             # <-- Name of the new computed column
FORMULA_EXPRESSION = "(oa - oa_lag1) / (0.5 * (at + at_lag1))"
FORMULA_COLUMNS = ["oa", "oa_lag1", "at", "at_lag1"]
# ============================================================================

# 1) LOAD INPUT FILE
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

df = pd.read_csv(path, sep=SEP)

# 2) CHECK THAT ALL REQUIRED COLUMNS EXIST
missing_formula_cols = [c for c in FORMULA_COLUMNS if c not in df.columns]
if missing_formula_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing required columns for formula: {missing_formula_cols}"
    )

# 3) INITIALIZE NEW COLUMN WITH NaN
df_out = df.copy()
df_out[NEW_COLUMN] = np.nan

# 4) BUILD MASK WHERE ALL REQUIRED COLUMNS ARE NON-NULL
mask_all_present = df_out[FORMULA_COLUMNS].notna().all(axis=1)

# 5) APPLY FORMULA ONLY WHERE mask_all_present IS TRUE
df_out.loc[mask_all_present, NEW_COLUMN] = df_out.loc[mask_all_present].eval(
    FORMULA_EXPRESSION,
    engine="python",
)

# 6) DROP ALL COLUMNS EXCEPT THE REQUIRED ONES
KEEP_COLUMNS = ["ID", "PIT Date", "HistCurrency", "FiscalPeriod", NEW_COLUMN]

missing_keep_cols = [c for c in KEEP_COLUMNS if c not in df_out.columns]
if missing_keep_cols:
    raise ValueError(f"Columns expected to be kept but not found: {missing_keep_cols}")

df_out = df_out[KEEP_COLUMNS]

# 7) DEFINE OUTPUT FILE NAME (FROM CONFIG)
formula_output_path = OUTPUT_DIR / OUTPUT_FILE_NAME

# 8) SAVE RESULT
df_out.to_csv(formula_output_path, sep=SEP, index=False)

print(f"New column '{NEW_COLUMN}' created based on formula:")
print("   ", FORMULA_EXPRESSION)
print("Rows used (all required columns non-null):", mask_all_present.sum())
print("Result was saved to:", formula_output_path)
print(df_out.head(20))


New column 'nca' created based on formula:
    (oa - oa_lag1) / (0.5 * (at + at_lag1))
Rows used (all required columns non-null): 1638112
Result was saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/NCA.txt
           ID    PIT Date HistCurrency  FiscalPeriod       nca
0   C02500770  1995-12-29          Ars          1992       NaN
1   C02500770  1995-12-29          Ars          1993  0.038225
2   C02500770  1995-12-29          Ars          1994  0.042944
3   C02500770  1996-05-03          Ars          1995  0.165906
4   C02500770  1998-07-03          Ars          1996 -0.063656
5   C02500770  1998-07-03          Ars          1997  0.059425
6   C02500770  1999-10-01          Ars          1998 -0.039300
7   C02500770  2000-05-19          Ars          1999 -0.027413
8   C02500770  2000-05-26          Ars          1999 -0.103221
9   C02520200  1996-05-03          Ars          1987       NaN
10  C02520200  1996-05-03          Ars          1988       NaN
11  C02520200  1996-

### Noa

In [95]:
from pathlib import Path
import pandas as pd
import numpy as np

# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file (INPUT_FILE) from OUTPUT_DIR.
# 2) Creates a new column (NEW_COLUMN) based on a flexible arithmetic formula
#    defined in FORMULA_EXPRESSION using existing columns.
# 3) The formula is only applied to rows where ALL columns listed in
#    FORMULA_COLUMNS are non-null. Otherwise, the result is set to NaN.
# 4) The updated DataFrame is stored in `df_out`.
# 5) A new column (NEW_COLUMN) is computed using DataFrame.eval.
# 6) All columns except "ID", "PIT Date", "HistCurrency", "FiscalPeriod",
#    and NEW_COLUMN are dropped.
# 7) The output filename is defined via config.
# 8) The result is written to disk using the configured output name.
# =============================================================================

# ========================= CONFIG ============================================
OUTPUT_DIR = Path(Temp_file_path_A)
SEP = "|"

INPUT_FILE = "processed_data_Noa.txt"               # <-- Input file
OUTPUT_FILE_NAME = "NOA.txt"  # <-- Output file

NEW_COLUMN = "noa_anomaly"                             # <-- Name of the new computed column
FORMULA_EXPRESSION = "noa / at_lag1"
FORMULA_COLUMNS = ["noa", "at_lag1"]
# ============================================================================

# 1) LOAD INPUT FILE
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

df = pd.read_csv(path, sep=SEP)

# 2) CHECK THAT ALL REQUIRED COLUMNS EXIST
missing_formula_cols = [c for c in FORMULA_COLUMNS if c not in df.columns]
if missing_formula_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing required columns for formula: {missing_formula_cols}"
    )

# 3) INITIALIZE NEW COLUMN WITH NaN
df_out = df.copy()
df_out[NEW_COLUMN] = np.nan

# 4) BUILD MASK WHERE ALL REQUIRED COLUMNS ARE NON-NULL
mask_all_present = df_out[FORMULA_COLUMNS].notna().all(axis=1)

# 5) APPLY FORMULA ONLY WHERE mask_all_present IS TRUE
df_out.loc[mask_all_present, NEW_COLUMN] = df_out.loc[mask_all_present].eval(
    FORMULA_EXPRESSION,
    engine="python",
)

# 6) DROP ALL COLUMNS EXCEPT THE REQUIRED ONES
KEEP_COLUMNS = ["ID", "PIT Date", "HistCurrency", "FiscalPeriod", NEW_COLUMN]

missing_keep_cols = [c for c in KEEP_COLUMNS if c not in df_out.columns]
if missing_keep_cols:
    raise ValueError(f"Columns expected to be kept but not found: {missing_keep_cols}")

df_out = df_out[KEEP_COLUMNS]

# 7) DEFINE OUTPUT FILE NAME (FROM CONFIG)
formula_output_path = OUTPUT_DIR / OUTPUT_FILE_NAME

# 8) SAVE RESULT
df_out.to_csv(formula_output_path, sep=SEP, index=False)

print(f"New column '{NEW_COLUMN}' created based on formula:")
print("   ", FORMULA_EXPRESSION)
print("Rows used (all required columns non-null):", mask_all_present.sum())
print("Result was saved to:", formula_output_path)
print(df_out.head(20))


New column 'noa_anomaly' created based on formula:
    noa / at_lag1
Rows used (all required columns non-null): 1531325
Result was saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/NOA.txt
           ID    PIT Date HistCurrency  FiscalPeriod  noa_anomaly
0   C02500770  1995-12-29          Ars          1992          NaN
1   C02500770  1995-12-29          Ars          1993     0.203046
2   C02500770  1995-12-29          Ars          1994          NaN
3   C02500770  1996-05-03          Ars          1995          NaN
4   C02500770  1998-07-03          Ars          1996          NaN
5   C02500770  1998-07-03          Ars          1997          NaN
6   C02500770  1999-10-01          Ars          1998          NaN
7   C02500770  2000-05-19          Ars          1999     0.099477
8   C02520200  1996-05-03          Ars          1987          NaN
9   C02520200  1996-05-03          Ars          1988          NaN
10  C02520200  1996-05-03          Ars          1989          NaN
11

### Nwc

In [96]:
from pathlib import Path
import pandas as pd
import numpy as np

# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file (INPUT_FILE) from OUTPUT_DIR.
# 2) Creates a new column (NEW_COLUMN) based on a flexible arithmetic formula
#    defined in FORMULA_EXPRESSION using existing columns.
# 3) The formula is only applied to rows where ALL columns listed in
#    FORMULA_COLUMNS are non-null. Otherwise, the result is set to NaN.
# 4) The updated DataFrame is stored in `df_out`.
# 5) A new column (NEW_COLUMN) is computed using DataFrame.eval.
# 6) All columns except "ID", "PIT Date", "HistCurrency", "FiscalPeriod",
#    and NEW_COLUMN are dropped.
# 7) The output filename is defined via config.
# 8) The result is written to disk using the configured output name.
# =============================================================================

# ========================= CONFIG ============================================
OUTPUT_DIR = Path(Temp_file_path_A)
SEP = "|"

INPUT_FILE = "processed_data_Nwc.txt"               # <-- Input file
OUTPUT_FILE_NAME = "NWC.txt"  # <-- Output file

NEW_COLUMN = "nwc_anomaly"                             # <-- Name of the new computed column
FORMULA_EXPRESSION = "(nwc - nwc_lag1) / (0.5 * (at + at_lag1))"
FORMULA_COLUMNS = ["nwc", "nwc_lag1", "at", "at_lag1"]
# ============================================================================

# 1) LOAD INPUT FILE
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

df = pd.read_csv(path, sep=SEP)

# 2) CHECK THAT ALL REQUIRED COLUMNS EXIST
missing_formula_cols = [c for c in FORMULA_COLUMNS if c not in df.columns]
if missing_formula_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing required columns for formula: {missing_formula_cols}"
    )

# 3) INITIALIZE NEW COLUMN WITH NaN
df_out = df.copy()
df_out[NEW_COLUMN] = np.nan

# 4) BUILD MASK WHERE ALL REQUIRED COLUMNS ARE NON-NULL
mask_all_present = df_out[FORMULA_COLUMNS].notna().all(axis=1)

# 5) APPLY FORMULA ONLY WHERE mask_all_present IS TRUE
df_out.loc[mask_all_present, NEW_COLUMN] = df_out.loc[mask_all_present].eval(
    FORMULA_EXPRESSION,
    engine="python",
)

# 6) DROP ALL COLUMNS EXCEPT THE REQUIRED ONES
KEEP_COLUMNS = ["ID", "PIT Date", "HistCurrency", "FiscalPeriod", NEW_COLUMN]

missing_keep_cols = [c for c in KEEP_COLUMNS if c not in df_out.columns]
if missing_keep_cols:
    raise ValueError(f"Columns expected to be kept but not found: {missing_keep_cols}")

df_out = df_out[KEEP_COLUMNS]

# 7) DEFINE OUTPUT FILE NAME (FROM CONFIG)
formula_output_path = OUTPUT_DIR / OUTPUT_FILE_NAME

# 8) SAVE RESULT
df_out.to_csv(formula_output_path, sep=SEP, index=False)

print(f"New column '{NEW_COLUMN}' created based on formula:")
print("   ", FORMULA_EXPRESSION)
print("Rows used (all required columns non-null):", mask_all_present.sum())
print("Result was saved to:", formula_output_path)
print(df_out.head(20))


New column 'nwc_anomaly' created based on formula:
    (nwc - nwc_lag1) / (0.5 * (at + at_lag1))
Rows used (all required columns non-null): 1486513
Result was saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/NWC.txt
           ID    PIT Date HistCurrency  FiscalPeriod  nwc_anomaly
0   C02500770  1995-12-29          Ars          1992          NaN
1   C02500770  1995-12-29          Ars          1993    -0.075276
2   C02500770  1995-12-29          Ars          1994     0.188406
3   C02500770  1996-05-03          Ars          1995     0.017948
4   C02500770  1998-07-03          Ars          1996    -0.091592
5   C02500770  1998-07-03          Ars          1997     0.005449
6   C02500770  1999-10-01          Ars          1998    -0.139677
7   C02500770  2000-05-19          Ars          1999    -0.055923
8   C02520200  1996-05-03          Ars          1987          NaN
9   C02520200  1996-05-03          Ars          1988          NaN
10  C02520200  1996-05-03          Ars  

### Ol

In [97]:
from pathlib import Path
import pandas as pd
import numpy as np

# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file (INPUT_FILE) from OUTPUT_DIR.
# 2) Creates a new column (NEW_COLUMN) based on a flexible arithmetic formula
#    defined in FORMULA_EXPRESSION using existing columns.
# 3) The formula is only applied to rows where ALL columns listed in
#    FORMULA_COLUMNS are non-null. Otherwise, the result is set to NaN.
# 4) The updated DataFrame is stored in `df_out`.
# 5) A new column (NEW_COLUMN) is computed using DataFrame.eval.
# 6) All columns except "ID", "PIT Date", "HistCurrency", "FiscalPeriod",
#    and NEW_COLUMN are dropped.
# 7) The output filename is defined via config.
# 8) The result is written to disk using the configured output name.
# =============================================================================

# ========================= CONFIG ============================================
OUTPUT_DIR = Path(Temp_file_path_A)
SEP = "|"

INPUT_FILE = "data_Ol.txt"               # <-- Input file
OUTPUT_FILE_NAME = "OL.txt"  # <-- Output file

NEW_COLUMN = "ol"                             # <-- Name of the new computed column
FORMULA_EXPRESSION = "(cogs + sga) / at"
FORMULA_COLUMNS = ["cogs", "sga", "at"]
# ============================================================================

# 1) LOAD INPUT FILE
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

df = pd.read_csv(path, sep=SEP)

# 2) CHECK THAT ALL REQUIRED COLUMNS EXIST
missing_formula_cols = [c for c in FORMULA_COLUMNS if c not in df.columns]
if missing_formula_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing required columns for formula: {missing_formula_cols}"
    )

# 3) INITIALIZE NEW COLUMN WITH NaN
df_out = df.copy()
df_out[NEW_COLUMN] = np.nan

# 4) BUILD MASK WHERE ALL REQUIRED COLUMNS ARE NON-NULL
mask_all_present = df_out[FORMULA_COLUMNS].notna().all(axis=1)

# 5) APPLY FORMULA ONLY WHERE mask_all_present IS TRUE
df_out.loc[mask_all_present, NEW_COLUMN] = df_out.loc[mask_all_present].eval(
    FORMULA_EXPRESSION,
    engine="python",
)

# 6) DROP ALL COLUMNS EXCEPT THE REQUIRED ONES
KEEP_COLUMNS = ["ID", "PIT Date", "HistCurrency", "FiscalPeriod", NEW_COLUMN]

missing_keep_cols = [c for c in KEEP_COLUMNS if c not in df_out.columns]
if missing_keep_cols:
    raise ValueError(f"Columns expected to be kept but not found: {missing_keep_cols}")

df_out = df_out[KEEP_COLUMNS]

# 7) DEFINE OUTPUT FILE NAME (FROM CONFIG)
formula_output_path = OUTPUT_DIR / OUTPUT_FILE_NAME

# 8) SAVE RESULT
df_out.to_csv(formula_output_path, sep=SEP, index=False)

print(f"New column '{NEW_COLUMN}' created based on formula:")
print("   ", FORMULA_EXPRESSION)
print("Rows used (all required columns non-null):", mask_all_present.sum())
print("Result was saved to:", formula_output_path)
print(df_out.head(20))


New column 'ol' created based on formula:
    (cogs + sga) / at
Rows used (all required columns non-null): 616148
Result was saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/OL.txt
           ID    PIT Date HistCurrency  FiscalPeriod        ol
0   C02500770  1995-12-29          Ars          1992  0.181059
1   C02500770  1995-12-29          Ars          1992       NaN
2   C02500770  1995-12-29          Ars          1993  0.199546
3   C02500770  1995-12-29          Ars          1993       NaN
4   C02500770  1995-12-29          Ars          1994  0.147251
5   C02500770  1995-12-29          Ars          1994       NaN
6   C02500770  1996-05-03          Ars          1995  0.187383
7   C02500770  1996-05-03          Ars          1995       NaN
8   C02500770  1998-07-03          Ars          1996  0.120374
9   C02500770  1998-07-03          Ars          1996       NaN
10  C02500770  1998-07-03          Ars          1997  0.108487
11  C02500770  1998-07-03          Ars       

### Osc

In [98]:
from pathlib import Path
import pandas as pd
import numpy as np

# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file (INPUT_FILE) from OUTPUT_DIR.
# 2) Computes indicator variables I and J:
#       I = 1 if tl > at, else 0
#       J = 1 if ni_eps + ni_eps_lag1 < 0, else 0
# 3) Creates a new column (NEW_COLUMN) based on a flexible arithmetic formula
#    defined in FORMULA_EXPRESSION using existing columns (including I, J).
# 4) The formula is only applied to rows where ALL columns listed in
#    FORMULA_COLUMNS are non-null. Otherwise, the result is set to NaN.
# 5) The updated DataFrame is stored in `df_out`.
# 6) All columns except "ID", "PIT Date", "HistCurrency", "FiscalPeriod",
#    and NEW_COLUMN are dropped.
# 7) The output filename is defined via config.
# 8) The result is written to disk using the configured output name.
# =============================================================================

# ========================= CONFIG ============================================
OUTPUT_DIR = Path(Temp_file_path_A)  # e.g. Temp_file_path_A = "/path/to/dir"
SEP = "|"

INPUT_FILE = "processed_data_Osc.txt"   # <-- Input file
OUTPUT_FILE_NAME = "OSC.txt"            # <-- Output file

NEW_COLUMN = "osc"                      # <-- Name of the new computed column

# NOTE:
# - use log(at) instead of ln(at); pandas.eval uses log() for natural log
# - abs() works as absolute value
FORMULA_EXPRESSION = (
    "- 1.32"
    " - 0.407 * log(at)"
    " + 6.03 * (tl / at)"
    " - 1.43 * ((ca - cl) / at)"
    " + 0.076 * (cl / ca)"
    " - 1.72 * I"
    " - 2.37 * (ni_extra / at)"
    " - 1.83 * (ifo / tl)"
    " + 0.285 * J"
    " - 0.521 * ((ni_extra - ni_extra_lag1) / (abs(ni_extra) + abs(ni_extra_lag1)))"
)

# Columns that must be non-null for the formula to be applied
FORMULA_COLUMNS = ["at", "tl", "ca", "cl", "ifo", "ni_extra", "ni_extra_lag1"]
# ============================================================================

# 1) LOAD INPUT FILE
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

df = pd.read_csv(path, sep=SEP)

# 2) CHECK THAT ALL REQUIRED COLUMNS EXIST
missing_formula_cols = [c for c in FORMULA_COLUMNS if c not in df.columns]
if missing_formula_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing required columns for formula: {missing_formula_cols}"
    )

# 3) PREPARE OUTPUT DATAFRAME AND INDICATORS
df_out = df.copy()

# Indicator variables:
# I = 1 if tl > at, else 0
df_out["I"] = (df_out["tl"] > df_out["at"]).astype(int)

# J = 1 if ni_extra + ni_extra_lag1 < 0, else 0
df_out["J"] = ((df_out["ni_extra"] + df_out["ni_extra_lag1"]) < 0).astype(int)

# Initialize new column with NaN
df_out[NEW_COLUMN] = np.nan

# 4) BUILD MASK WHERE ALL REQUIRED COLUMNS ARE NON-NULL
mask_all_present = df_out[FORMULA_COLUMNS].notna().all(axis=1)

# 5) APPLY FORMULA ONLY WHERE mask_all_present IS TRUE
df_out.loc[mask_all_present, NEW_COLUMN] = df_out.loc[mask_all_present].eval(
    FORMULA_EXPRESSION,
    engine="python",
)

# 6) DROP ALL COLUMNS EXCEPT THE REQUIRED ONES
KEEP_COLUMNS = ["ID", "PIT Date", "HistCurrency", "FiscalPeriod", NEW_COLUMN]

missing_keep_cols = [c for c in KEEP_COLUMNS if c not in df_out.columns]
if missing_keep_cols:
    raise ValueError(f"Columns expected to be kept but not found: {missing_keep_cols}")

df_out = df_out[KEEP_COLUMNS]

# 7) DEFINE OUTPUT FILE NAME (FROM CONFIG)
formula_output_path = OUTPUT_DIR / OUTPUT_FILE_NAME

# 8) SAVE RESULT
df_out.to_csv(formula_output_path, sep=SEP, index=False)

print(f"New column '{NEW_COLUMN}' created based on formula:")
print("   ", FORMULA_EXPRESSION)
print("Rows used (all required columns non-null):", mask_all_present.sum())
print("Result was saved to:", formula_output_path)
print(df_out.head(20))

  result = getattr(ufunc, method)(*inputs, **kwargs)


New column 'osc' created based on formula:
    - 1.32 - 0.407 * log(at) + 6.03 * (tl / at) - 1.43 * ((ca - cl) / at) + 0.076 * (cl / ca) - 1.72 * I - 2.37 * (ni_extra / at) - 1.83 * (ifo / tl) + 0.285 * J - 0.521 * ((ni_extra - ni_extra_lag1) / (abs(ni_extra) + abs(ni_extra_lag1)))
Rows used (all required columns non-null): 987762
Result was saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/OSC.txt
           ID    PIT Date HistCurrency  FiscalPeriod       osc
0   C02500770  1995-12-29          Ars          1992       NaN
1   C02500770  1995-12-29          Ars          1992       NaN
2   C02500770  1995-12-29          Ars          1993 -1.507886
3   C02500770  1995-12-29          Ars          1993       NaN
4   C02500770  1995-12-29          Ars          1994 -1.418356
5   C02500770  1995-12-29          Ars          1994       NaN
6   C02500770  1996-05-03          Ars          1995  0.010150
7   C02500770  1996-05-03          Ars          1995       NaN
8   C02500770 

### Pm

In [99]:
from pathlib import Path
import pandas as pd
import numpy as np

# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file (INPUT_FILE) from OUTPUT_DIR.
# 2) Creates a new column (NEW_COLUMN) based on a flexible arithmetic formula
#    defined in FORMULA_EXPRESSION using existing columns.
# 3) The formula is only applied to rows where ALL columns listed in
#    FORMULA_COLUMNS are non-null. Otherwise, the result is set to NaN.
# 4) The updated DataFrame is stored in `df_out`.
# 5) A new column (NEW_COLUMN) is computed using DataFrame.eval.
# 6) All columns except "ID", "PIT Date", "HistCurrency", "FiscalPeriod",
#    and NEW_COLUMN are dropped.
# 7) The output filename is defined via config.
# 8) The result is written to disk using the configured output name.
# =============================================================================

# ========================= CONFIG ============================================
OUTPUT_DIR = Path(Temp_file_path_A)
SEP = "|"

INPUT_FILE = "data_Pm.txt"               # <-- Input file
OUTPUT_FILE_NAME = "PM.txt"  # <-- Output file

NEW_COLUMN = "pm"                             # <-- Name of the new computed column
FORMULA_EXPRESSION = "(rev - cogs) / rev"
FORMULA_COLUMNS = ["rev", "cogs"]
# ============================================================================

# 1) LOAD INPUT FILE
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

df = pd.read_csv(path, sep=SEP)

# 2) CHECK THAT ALL REQUIRED COLUMNS EXIST
missing_formula_cols = [c for c in FORMULA_COLUMNS if c not in df.columns]
if missing_formula_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing required columns for formula: {missing_formula_cols}"
    )

# 3) INITIALIZE NEW COLUMN WITH NaN
df_out = df.copy()
df_out[NEW_COLUMN] = np.nan

# 4) BUILD MASK WHERE ALL REQUIRED COLUMNS ARE NON-NULL
mask_all_present = df_out[FORMULA_COLUMNS].notna().all(axis=1)

# 5) APPLY FORMULA ONLY WHERE mask_all_present IS TRUE
df_out.loc[mask_all_present, NEW_COLUMN] = df_out.loc[mask_all_present].eval(
    FORMULA_EXPRESSION,
    engine="python",
)

# 6) DROP ALL COLUMNS EXCEPT THE REQUIRED ONES
KEEP_COLUMNS = ["ID", "PIT Date", "HistCurrency", "FiscalPeriod", NEW_COLUMN]

missing_keep_cols = [c for c in KEEP_COLUMNS if c not in df_out.columns]
if missing_keep_cols:
    raise ValueError(f"Columns expected to be kept but not found: {missing_keep_cols}")

df_out = df_out[KEEP_COLUMNS]

# 7) DEFINE OUTPUT FILE NAME (FROM CONFIG)
formula_output_path = OUTPUT_DIR / OUTPUT_FILE_NAME

# 8) SAVE RESULT
df_out.to_csv(formula_output_path, sep=SEP, index=False)

print(f"New column '{NEW_COLUMN}' created based on formula:")
print("   ", FORMULA_EXPRESSION)
print("Rows used (all required columns non-null):", mask_all_present.sum())
print("Result was saved to:", formula_output_path)
print(df_out.head(20))


New column 'pm' created based on formula:
    (rev - cogs) / rev
Rows used (all required columns non-null): 1613228
Result was saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/PM.txt
           ID    PIT Date HistCurrency  FiscalPeriod        pm
0   C02500770  1995-12-29          Ars          1992  0.170827
1   C02500770  1995-12-29          Ars          1993  0.156230
2   C02500770  1995-12-29          Ars          1994  0.130227
3   C02500770  1996-05-03          Ars          1995  0.121844
4   C02500770  1998-07-03          Ars          1996  0.172871
5   C02500770  1998-07-03          Ars          1997  0.166367
6   C02500770  1999-10-08          Ars          1997  0.262077
7   C02500770  1999-10-01          Ars          1998  0.255589
8   C02500770  2000-05-19          Ars          1999       NaN
9   C02500770  2000-05-26          Ars          1999  0.412727
10  C02520200  1996-05-03          Ars          1987       NaN
11  C02520200  1996-05-03          Ars     

### Poa

In [100]:
from pathlib import Path
import pandas as pd
import numpy as np

# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file (INPUT_FILE) from OUTPUT_DIR.
# 2) Creates a new column (NEW_COLUMN) based on a flexible arithmetic formula
#    defined in FORMULA_EXPRESSION using existing columns.
# 3) The formula is only applied to rows where ALL columns listed in
#    FORMULA_COLUMNS are non-null. Otherwise, the result is set to NaN.
# 4) The updated DataFrame is stored in `df_out`.
# 5) A new column (NEW_COLUMN) is computed using DataFrame.eval.
# 6) All columns except "ID", "PIT Date", "HistCurrency", "FiscalPeriod",
#    and NEW_COLUMN are dropped.
# 7) The output filename is defined via config.
# 8) The result is written to disk using the configured output name.
# =============================================================================

# ========================= CONFIG ============================================
OUTPUT_DIR = Path(Temp_file_path_A)
SEP = "|"

INPUT_FILE = "data_Poa.txt"               # <-- Input file
OUTPUT_FILE_NAME = "POA.txt"  # <-- Output file

NEW_COLUMN = "poa"                             # <-- Name of the new computed column
FORMULA_EXPRESSION = "(ni_extra - opcf) / abs(ni_extra)"
FORMULA_COLUMNS = ["ni_extra", "opcf"]
# ============================================================================

# 1) LOAD INPUT FILE
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

df = pd.read_csv(path, sep=SEP)

# 2) CHECK THAT ALL REQUIRED COLUMNS EXIST
missing_formula_cols = [c for c in FORMULA_COLUMNS if c not in df.columns]
if missing_formula_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing required columns for formula: {missing_formula_cols}"
    )

# 3) INITIALIZE NEW COLUMN WITH NaN
df_out = df.copy()
df_out[NEW_COLUMN] = np.nan

# 4) BUILD MASK WHERE ALL REQUIRED COLUMNS ARE NON-NULL
mask_all_present = df_out[FORMULA_COLUMNS].notna().all(axis=1)

# 5) APPLY FORMULA ONLY WHERE mask_all_present IS TRUE
df_out.loc[mask_all_present, NEW_COLUMN] = df_out.loc[mask_all_present].eval(
    FORMULA_EXPRESSION,
    engine="python",
)

# 6) DROP ALL COLUMNS EXCEPT THE REQUIRED ONES
KEEP_COLUMNS = ["ID", "PIT Date", "HistCurrency", "FiscalPeriod", NEW_COLUMN]

missing_keep_cols = [c for c in KEEP_COLUMNS if c not in df_out.columns]
if missing_keep_cols:
    raise ValueError(f"Columns expected to be kept but not found: {missing_keep_cols}")

df_out = df_out[KEEP_COLUMNS]

# 7) DEFINE OUTPUT FILE NAME (FROM CONFIG)
formula_output_path = OUTPUT_DIR / OUTPUT_FILE_NAME

# 8) SAVE RESULT
df_out.to_csv(formula_output_path, sep=SEP, index=False)

print(f"New column '{NEW_COLUMN}' created based on formula:")
print("   ", FORMULA_EXPRESSION)
print("Rows used (all required columns non-null):", mask_all_present.sum())
print("Result was saved to:", formula_output_path)
print(df_out.head(20))


New column 'poa' created based on formula:
    (ni_extra - opcf) / abs(ni_extra)
Rows used (all required columns non-null): 1277110
Result was saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/POA.txt
           ID    PIT Date HistCurrency  FiscalPeriod        poa
0   C02500770  1995-12-29          Ars          1992        NaN
1   C02500770  1995-12-29          Ars          1993        NaN
2   C02500770  1995-12-29          Ars          1994        NaN
3   C02500770  1996-05-03          Ars          1995        NaN
4   C02500770  1998-07-03          Ars          1996        NaN
5   C02500770  1998-07-03          Ars          1997        NaN
6   C02500770  1999-10-01          Ars          1998        NaN
7   C02500770  2000-05-19          Ars          1999        NaN
8   C02500770  2000-05-26          Ars          1999        NaN
9   C02520200  1996-05-03          Ars          1987        NaN
10  C02520200  1996-05-03          Ars          1988        NaN
11  C02520200 

### Pro

In [101]:
from pathlib import Path
import pandas as pd
import numpy as np

# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file (INPUT_FILE) from OUTPUT_DIR.
# 2) Creates a new column (NEW_COLUMN) based on a flexible arithmetic formula
#    defined in FORMULA_EXPRESSION using existing columns.
# 3) The formula is only applied to rows where ALL columns listed in
#    FORMULA_COLUMNS are non-null. Otherwise, the result is set to NaN.
# 4) The updated DataFrame is stored in `df_out`.
# 5) A new column (NEW_COLUMN) is computed using DataFrame.eval.
# 6) All columns except "ID", "PIT Date", "HistCurrency", "FiscalPeriod",
#    and NEW_COLUMN are dropped.
# 7) The output filename is defined via config.
# 8) The result is written to disk using the configured output name.
# =============================================================================

# ========================= CONFIG ============================================
OUTPUT_DIR = Path(Temp_file_path_A)
SEP = "|"

INPUT_FILE = "processed_data_Pro.txt"               # <-- Input file
OUTPUT_FILE_NAME = "PRO.txt"  # <-- Output file

NEW_COLUMN = "pro"                             # <-- Name of the new computed column
FORMULA_EXPRESSION = "ni_eps / at_lag1"
FORMULA_COLUMNS = ["ni_eps", "at_lag1"]
# ============================================================================

# 1) LOAD INPUT FILE
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

df = pd.read_csv(path, sep=SEP)

# 2) CHECK THAT ALL REQUIRED COLUMNS EXIST
missing_formula_cols = [c for c in FORMULA_COLUMNS if c not in df.columns]
if missing_formula_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing required columns for formula: {missing_formula_cols}"
    )

# 3) INITIALIZE NEW COLUMN WITH NaN
df_out = df.copy()
df_out[NEW_COLUMN] = np.nan

# 4) BUILD MASK WHERE ALL REQUIRED COLUMNS ARE NON-NULL
mask_all_present = df_out[FORMULA_COLUMNS].notna().all(axis=1)

# 5) APPLY FORMULA ONLY WHERE mask_all_present IS TRUE
df_out.loc[mask_all_present, NEW_COLUMN] = df_out.loc[mask_all_present].eval(
    FORMULA_EXPRESSION,
    engine="python",
)

# 6) DROP ALL COLUMNS EXCEPT THE REQUIRED ONES
KEEP_COLUMNS = ["ID", "PIT Date", "HistCurrency", "FiscalPeriod", NEW_COLUMN]

missing_keep_cols = [c for c in KEEP_COLUMNS if c not in df_out.columns]
if missing_keep_cols:
    raise ValueError(f"Columns expected to be kept but not found: {missing_keep_cols}")

df_out = df_out[KEEP_COLUMNS]

# 7) DEFINE OUTPUT FILE NAME (FROM CONFIG)
formula_output_path = OUTPUT_DIR / OUTPUT_FILE_NAME

# 8) SAVE RESULT
df_out.to_csv(formula_output_path, sep=SEP, index=False)

print(f"New column '{NEW_COLUMN}' created based on formula:")
print("   ", FORMULA_EXPRESSION)
print("Rows used (all required columns non-null):", mask_all_present.sum())
print("Result was saved to:", formula_output_path)
print(df_out.head(20))


New column 'pro' created based on formula:
    ni_eps / at_lag1
Rows used (all required columns non-null): 897896
Result was saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/PRO.txt
           ID    PIT Date HistCurrency  FiscalPeriod       pro
0   C02500770  1995-12-29          Ars          1992       NaN
1   C02500770  1995-12-29          Ars          1992       NaN
2   C02500770  1995-12-29          Ars          1993  0.085055
3   C02500770  1995-12-29          Ars          1993       NaN
4   C02500770  1995-12-29          Ars          1994  0.087236
5   C02500770  1995-12-29          Ars          1994       NaN
6   C02500770  1996-05-03          Ars          1995 -0.029745
7   C02500770  1996-05-03          Ars          1995       NaN
8   C02500770  1998-07-03          Ars          1996  0.030040
9   C02500770  1998-07-03          Ars          1996       NaN
10  C02500770  1998-07-03          Ars          1997 -0.060711
11  C02500770  1998-07-03          Ars      

### Pta

In [102]:
from pathlib import Path
import pandas as pd
import numpy as np

# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file (INPUT_FILE) from OUTPUT_DIR.
# 2) Creates a new column (NEW_COLUMN) based on a flexible arithmetic formula
#    defined in FORMULA_EXPRESSION using existing columns.
# 3) The formula is applied only to rows where the REQUIRED columns are non-null
#    (ni_extra, opcf). Optional columns (socaps, pocaps, div, fincf, invcf)
#    may be NaN and are treated as 0 for the calculation.
# 4) The updated DataFrame is stored in `df_out`.
# 5) A new column (NEW_COLUMN) is computed using DataFrame.eval.
# 6) All columns except "ID", "PIT Date", "HistCurrency", "FiscalPeriod",
#    and NEW_COLUMN are dropped.
# 7) The output filename is defined via config.
# 8) The result is written to disk using the configured output name.
# =============================================================================

# ========================= CONFIG ============================================
OUTPUT_DIR = Path(Temp_file_path_A)
SEP = "|"

INPUT_FILE = "data_Pta.txt"               # <-- Input file
OUTPUT_FILE_NAME = "PTA.txt"              # <-- Output file

NEW_COLUMN = "pta"                        # <-- Name of the new computed column
FORMULA_EXPRESSION = (
    "(ni_extra + socaps - pocaps - div - opcf - fincf + invcf) / abs(ni_extra)"
)

# Columns that participate in the formula (must exist as columns)
FORMULA_COLUMNS = ["ni_extra", "socaps", "pocaps", "div", "opcf", "fincf", "invcf"]

# REQUIRED: must be non-null for PTA to be computed
REQUIRED_COLUMNS = ["ni_extra", "opcf", "fincf", "invcf", "socaps", "pocaps", "div"]

# OPTIONAL: may be NaN, will be treated as 0 in the formula
OPTIONAL_COLUMNS = []
# ============================================================================

# 1) LOAD INPUT FILE
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

df = pd.read_csv(path, sep=SEP)

# 2) CHECK THAT ALL REQUIRED COLUMNS FOR THE FORMULA EXIST
missing_formula_cols = [c for c in FORMULA_COLUMNS if c not in df.columns]
if missing_formula_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing required columns for formula: {missing_formula_cols}"
    )

# 3) INITIALIZE NEW COLUMN WITH NaN
df_out = df.copy()
df_out[NEW_COLUMN] = np.nan

# 4) BUILD MASK:
#    - REQUIRED columns must be non-null
#    - ni_extra must be non-zero (avoid division by zero)
mask_required_present = df_out[REQUIRED_COLUMNS].notna().all(axis=1)
mask_valid = mask_required_present & (df_out["ni_extra"] != 0)

# 5) TEMPORARILY FILL OPTIONAL COLUMNS WITH 0 FOR CALCULATION
df_tmp = df_out.copy()
df_tmp[OPTIONAL_COLUMNS] = df_tmp[OPTIONAL_COLUMNS].fillna(0)

#    APPLY FORMULA ONLY WHERE mask_valid IS TRUE
df_out.loc[mask_valid, NEW_COLUMN] = df_tmp.loc[mask_valid].eval(
    FORMULA_EXPRESSION,
    engine="python",
)

# 6) DROP ALL COLUMNS EXCEPT THE REQUIRED ONES
KEEP_COLUMNS = ["ID", "PIT Date", "HistCurrency", "FiscalPeriod", NEW_COLUMN]

missing_keep_cols = [c for c in KEEP_COLUMNS if c not in df_out.columns]
if missing_keep_cols:
    raise ValueError(f"Columns expected to be kept but not found: {missing_keep_cols}")

df_out = df_out[KEEP_COLUMNS]

# 7) DEFINE OUTPUT FILE NAME (FROM CONFIG)
formula_output_path = OUTPUT_DIR / OUTPUT_FILE_NAME

# 8) SAVE RESULT
df_out.to_csv(formula_output_path, sep=SEP, index=False)

print(f"New column '{NEW_COLUMN}' created based on formula:")
print("   ", FORMULA_EXPRESSION)
print("Rows with valid inputs (ni_extra & opcf non-null and ni_extra != 0):", mask_valid.sum())
print("Result was saved to:", formula_output_path)
print(df_out.head(20))


New column 'pta' created based on formula:
    (ni_extra + socaps - pocaps - div - opcf - fincf + invcf) / abs(ni_extra)
Rows with valid inputs (ni_extra & opcf non-null and ni_extra != 0): 998787
Result was saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/PTA.txt
           ID    PIT Date HistCurrency  FiscalPeriod       pta
0   C02500770  1995-12-29          Ars          1992       NaN
1   C02500770  1995-12-29          Ars          1993       NaN
2   C02500770  1995-12-29          Ars          1994       NaN
3   C02500770  1996-05-03          Ars          1995       NaN
4   C02500770  1996-05-31          Ars          1995       NaN
5   C02500770  1998-07-03          Ars          1996       NaN
6   C02500770  1998-07-03          Ars          1997       NaN
7   C02500770  1999-10-01          Ars          1998       NaN
8   C02500770  2000-05-19          Ars          1999       NaN
9   C02500770  2000-05-26          Ars          1999       NaN
10  C02520200  1996-05-0

### Roe

In [103]:
from pathlib import Path
import pandas as pd
import numpy as np

# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file (INPUT_FILE) from OUTPUT_DIR.
# 2) Creates a new column (NEW_COLUMN) based on a flexible arithmetic formula
#    defined in FORMULA_EXPRESSION using existing columns.
# 3) The formula is only applied to rows where ALL columns listed in
#    FORMULA_COLUMNS are non-null. Otherwise, the result is set to NaN.
# 4) The updated DataFrame is stored in `df_out`.
# 5) A new column (NEW_COLUMN) is computed using DataFrame.eval.
# 6) All columns except "ID", "PIT Date", "HistCurrency", "FiscalPeriod",
#    and NEW_COLUMN are dropped.
# 7) The output filename is defined via config.
# 8) The result is written to disk using the configured output name.
# =============================================================================

# ========================= CONFIG ============================================
OUTPUT_DIR = Path(Temp_file_path_A)
SEP = "|"

INPUT_FILE = "data_Roe.txt"               # <-- Input file
OUTPUT_FILE_NAME = "ROE.txt"  # <-- Output file

NEW_COLUMN = "roe"                             # <-- Name of the new computed column
FORMULA_EXPRESSION = "ni_eps / be"
FORMULA_COLUMNS = ["ni_eps", "be"]
# ============================================================================

# 1) LOAD INPUT FILE
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

df = pd.read_csv(path, sep=SEP)

# 2) CHECK THAT ALL REQUIRED COLUMNS EXIST
missing_formula_cols = [c for c in FORMULA_COLUMNS if c not in df.columns]
if missing_formula_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing required columns for formula: {missing_formula_cols}"
    )

# 3) INITIALIZE NEW COLUMN WITH NaN
df_out = df.copy()
df_out[NEW_COLUMN] = np.nan

# 4) BUILD MASK WHERE ALL REQUIRED COLUMNS ARE NON-NULL
mask_all_present = df_out[FORMULA_COLUMNS].notna().all(axis=1)

# 5) APPLY FORMULA ONLY WHERE mask_all_present IS TRUE
df_out.loc[mask_all_present, NEW_COLUMN] = df_out.loc[mask_all_present].eval(
    FORMULA_EXPRESSION,
    engine="python",
)

# 6) DROP ALL COLUMNS EXCEPT THE REQUIRED ONES
KEEP_COLUMNS = ["ID", "PIT Date", "HistCurrency", "FiscalPeriod", NEW_COLUMN]

missing_keep_cols = [c for c in KEEP_COLUMNS if c not in df_out.columns]
if missing_keep_cols:
    raise ValueError(f"Columns expected to be kept but not found: {missing_keep_cols}")

df_out = df_out[KEEP_COLUMNS]

# 7) DEFINE OUTPUT FILE NAME (FROM CONFIG)
formula_output_path = OUTPUT_DIR / OUTPUT_FILE_NAME

# 8) SAVE RESULT
df_out.to_csv(formula_output_path, sep=SEP, index=False)

print(f"New column '{NEW_COLUMN}' created based on formula:")
print("   ", FORMULA_EXPRESSION)
print("Rows used (all required columns non-null):", mask_all_present.sum())
print("Result was saved to:", formula_output_path)
print(df_out.head(20))


New column 'roe' created based on formula:
    ni_eps / be
Rows used (all required columns non-null): 808418
Result was saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/ROE.txt
           ID    PIT Date HistCurrency  FiscalPeriod       roe
0   C02500770  1995-12-29          Ars          1992       NaN
1   C02500770  1995-12-29          Ars          1992       NaN
2   C02500770  1995-12-29          Ars          1993       NaN
3   C02500770  1995-12-29          Ars          1993       NaN
4   C02500770  1995-12-29          Ars          1994       NaN
5   C02500770  1995-12-29          Ars          1994       NaN
6   C02500770  1996-05-03          Ars          1995       NaN
7   C02500770  1996-05-03          Ars          1995       NaN
8   C02500770  1998-07-03          Ars          1996       NaN
9   C02500770  1998-07-03          Ars          1996       NaN
10  C02500770  1998-07-03          Ars          1997 -0.262669
11  C02500770  1998-07-03          Ars          1

### Rs

#### Drift

In [104]:
# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file (INPUT_FILE) from OUTPUT_DIR.
# 2) Creates a new column (NEW_COLUMN = "Drift") representing the expected
#    annual Revenue change ("drift") at time t.
#
# Definition of Drift:
# --------------------
# Drift_t is defined as the mean of the prior three year-to-year Revenue changes,
# using only information available strictly before time t:
#
#   Drift_t = [ (rev_{t-1} - rev_{t-2})
#             + (rev_{t-2} - rev_{t-3})
#             + (rev_{t-3} - rev_{t-4}) ] / 3
#
# Column conventions:
#   rev        = Revenue at time t
#   rev_lag1   = Revenue at time t-1
#   rev_lag2   = Revenue at time t-2
#   rev_lag3   = Revenue at time t-3
#   rev_lag4   = Revenue at time t-4
#
# Implementation details:
# -----------------------
# - Drift is computed ONLY for rows where rev_lag1 .. rev_lag4 are all non-null.
# - If any required lag is missing, Drift is set to NaN.
# - No partial averages, fallbacks, or tolerance logic are applied.
# - The updated DataFrame overwrites the original file on disk.
# =============================================================================

from pathlib import Path
import pandas as pd
import numpy as np

# ========================= CONFIG ============================================
OUTPUT_DIR = Path(Temp_file_path_A)
SEP = "|"

INPUT_FILE = "processed_data_Rs.txt"   # Input file name
NEW_COLUMN = "Drift"                   # Name of the computed column

# Required Revenue level columns (see summary for definitions)
REQUIRED_COLUMNS = ["rev_lag1", "rev_lag2", "rev_lag3", "rev_lag4"]
# ============================================================================

# 1) LOAD INPUT FILE
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

df = pd.read_csv(path, sep=SEP)

# 2) CHECK THAT ALL REQUIRED COLUMNS EXIST
missing_cols = [c for c in REQUIRED_COLUMNS if c not in df.columns]
if missing_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing required columns for drift calculation: {missing_cols}"
    )

# 3) INITIALIZE OUTPUT DATAFRAME AND NEW COLUMN
df_out = df.copy()
df_out[NEW_COLUMN] = np.nan

# 4) BUILD MASK: ALL REQUIRED LAGS MUST BE NON-NULL
mask_all_present = df_out[REQUIRED_COLUMNS].notna().all(axis=1)

# 5) COMPUTE PRIOR Revenue CHANGES
d1 = df_out["rev_lag1"] - df_out["rev_lag2"]  # rev_{t-1} - rev_{t-2}
d2 = df_out["rev_lag2"] - df_out["rev_lag3"]  # rev_{t-2} - rev_{t-3}
d3 = df_out["rev_lag3"] - df_out["rev_lag4"]  # rev_{t-3} - rev_{t-4}

# 6) COMPUTE DRIFT (ONLY WHERE ALL INPUTS ARE PRESENT)
df_out.loc[mask_all_present, NEW_COLUMN] = (
    d1 + d2 + d3
).loc[mask_all_present] / 3.0

# 7) SAVE RESULT (OVERWRITE INPUT FILE)
output_path = OUTPUT_DIR / INPUT_FILE
df_out.to_csv(output_path, sep=SEP, index=False)

# 8) LOG SUMMARY
print(f"New column '{NEW_COLUMN}' created as mean of prior three Revenue changes.")
print("Rows with all required non-null lags used:", int(mask_all_present.sum()))
print("Result was saved to:", output_path)
print(df_out[REQUIRED_COLUMNS + [NEW_COLUMN]].head(20))


New column 'Drift' created as mean of prior three Revenue changes.
Rows with all required non-null lags used: 1026549
Result was saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/processed_data_Rs.txt
       rev_lag1     rev_lag2     rev_lag3     rev_lag4       Drift
0           NaN          NaN          NaN          NaN         NaN
1   1707.334371          NaN          NaN          NaN         NaN
2   1921.517802  1707.334371          NaN          NaN         NaN
3   2263.705199  1921.517802  1707.334371          NaN         NaN
4   1447.331160  2263.705199  1921.517802  1707.334371  -86.667737
5   1217.952697  1447.331160  2263.705199  1921.517802 -234.521702
6    693.936079  1217.952697  1447.331160  2263.705199 -523.256373
7    609.279252   693.936079  1217.952697  1447.331160 -279.350636
8    609.279252   693.936079  1217.952697  1447.331160 -279.350636
9           NaN          NaN          NaN          NaN         NaN
10     8.063120          NaN          NaN    

#### sd

In [105]:
# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file (INPUT_FILE) from OUTPUT_DIR.
# 2) Creates a new column (NEW_COLUMN = "sd") representing the SAMPLE standard
#    deviation (ddof=1) of the prior-three Revenue CHANGE deviations from Drift.
#
# Context / consistency with Drift:
# ---------------------------------
# You defined:
#   rev        = Revenue at time t
#   rev_lag1   = Revenue at time t-1
#   rev_lag2   = Revenue at time t-2
#   rev_lag3   = Revenue at time t-3
#   rev_lag4   = Revenue at time t-4
#
# Drift_t is assumed to already exist in the file and is defined as:
#   Drift_t = mean( (rev_{t-1}-rev_{t-2}), (rev_{t-2}-rev_{t-3}), (rev_{t-3}-rev_{t-4}) )
#
# SD definition:
# --------------
# Let the three prior changes be:
#   d1 = rev_lag1 - rev_lag2   # (t-1) - (t-2)
#   d2 = rev_lag2 - rev_lag3   # (t-2) - (t-3)
#   d3 = rev_lag3 - rev_lag4   # (t-3) - (t-4)
#
# Then:
#   sd_t = sample_std( [d1 - Drift_t, d2 - Drift_t, d3 - Drift_t] )  with ddof=1
#
# Implementation details:
# -----------------------
# - sd is computed ONLY for rows where erev_lag1..rev_lag4 are all non-null AND
#   Drift is non-null.
# - If any required input is missing, sd is set to NaN.
# - No partial computations, fallbacks, or tolerance logic are applied.
# - The updated DataFrame overwrites the original file on disk.
# =============================================================================

from pathlib import Path
import pandas as pd
import numpy as np

# ========================= CONFIG ============================================
OUTPUT_DIR = Path(Temp_file_path_A)
SEP = "|"

INPUT_FILE = "processed_data_Rs.txt"  # File that already contains 'Drift'
NEW_COLUMN = "sd"                     # Name of the new computed column

# Required Revenue level columns to build the three prior changes
LEVEL_COLUMNS = ["rev_lag1", "rev_lag2", "rev_lag3", "rev_lag4"]
DRIFT_COL = "Drift"
# ============================================================================

# 1) LOAD INPUT FILE
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

df = pd.read_csv(path, sep=SEP)

# 2) CHECK THAT ALL REQUIRED COLUMNS EXIST
missing = [c for c in (LEVEL_COLUMNS + [DRIFT_COL]) if c not in df.columns]
if missing:
    raise ValueError(
        f"{INPUT_FILE}: missing required columns for sd calculation: {missing}"
    )

# 3) INITIALIZE OUTPUT DATAFRAME AND NEW COLUMN
df_out = df.copy()
df_out[NEW_COLUMN] = np.nan

# 4) ENSURE NUMERIC TYPES (COERCE NON-NUMERIC TO NaN)
df_out[DRIFT_COL] = pd.to_numeric(df_out[DRIFT_COL], errors="coerce")
for c in LEVEL_COLUMNS:
    df_out[c] = pd.to_numeric(df_out[c], errors="coerce")

# 5) BUILD MASK: ALL REQUIRED LAGS MUST BE NON-NULL AND Drift MUST BE NON-NULL
mask_all_present = df_out[LEVEL_COLUMNS].notna().all(axis=1) & df_out[DRIFT_COL].notna()

# 6) COMPUTE THE THREE PRIOR Revenue CHANGES (d1, d2, d3)
d1 = df_out["rev_lag1"] - df_out["rev_lag2"]  # rev_{t-1} - rev_{t-2}
d2 = df_out["rev_lag2"] - df_out["rev_lag3"]  # rev_{t-2} - rev_{t-3}
d3 = df_out["rev_lag3"] - df_out["rev_lag4"]  # rev_{t-3} - rev_{t-4}

# 7) COMPUTE SAMPLE STANDARD DEVIATION OF (di - Drift), i in {1,2,3}
#    We form a 3-column DataFrame of deviations and then take row-wise std(ddof=1).
dev = pd.DataFrame(
    {
        "dev_d1": d1 - df_out[DRIFT_COL],
        "dev_d2": d2 - df_out[DRIFT_COL],
        "dev_d3": d3 - df_out[DRIFT_COL],
    },
    index=df_out.index,
)

df_out.loc[mask_all_present, NEW_COLUMN] = dev.loc[mask_all_present].std(axis=1, ddof=1)

# 8) SAVE RESULT (OVERWRITE INPUT FILE)
output_path = OUTPUT_DIR / INPUT_FILE
df_out.to_csv(output_path, sep=SEP, index=False)

# 9) LOG SUMMARY
print(
    f"New column '{NEW_COLUMN}' created as sample std (ddof=1) of [(d1-Drift),(d2-Drift),(d3-Drift)], "
    f"where d1=rev_lag1-rev_lag2, d2=rev_lag2-rev_lag3, d3=rev_lag3-rev_lag4."
)
print("Rows with all required non-null lags and non-null Drift used:", int(mask_all_present.sum()))
print("Result was saved to:", output_path)
print(df_out[LEVEL_COLUMNS + [DRIFT_COL, NEW_COLUMN]].head(20))


New column 'sd' created as sample std (ddof=1) of [(d1-Drift),(d2-Drift),(d3-Drift)], where d1=rev_lag1-rev_lag2, d2=rev_lag2-rev_lag3, d3=rev_lag3-rev_lag4.
Rows with all required non-null lags and non-null Drift used: 1026549
Result was saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/processed_data_Rs.txt
       rev_lag1     rev_lag2     rev_lag3     rev_lag4       Drift          sd
0           NaN          NaN          NaN          NaN         NaN         NaN
1   1707.334371          NaN          NaN          NaN         NaN         NaN
2   1921.517802  1707.334371          NaN          NaN         NaN         NaN
3   2263.705199  1921.517802  1707.334371          NaN         NaN         NaN
4   1447.331160  2263.705199  1921.517802  1707.334371  -86.667737  635.176920
5   1217.952697  1447.331160  2263.705199  1921.517802 -234.521702  579.297842
6    693.936079  1217.952697  1447.331160  2263.705199 -523.256373  293.498526
7    609.279252   693.936079  1217.95269

#### Rs

In [106]:
from pathlib import Path
import pandas as pd
import numpy as np

# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file (INPUT_FILE) from OUTPUT_DIR.
# 2) Creates a new column (NEW_COLUMN = "rs") as a standardized Revenue surprise:
#
#       rs_t = ( (rev_t - rev_{t-1}) - Drift_t ) / sd_t
#
#    where:
#      - rev        = rev_t
#      - rev_lag1   = rev_{t-1}
#      - Drift      = mean of prior Revenue changes (already computed in the file)
#      - sd         = sample std (ddof=1) of prior-change deviations from Drift
#                    (already computed in the file)
#
# 3) The formula is applied ONLY when ALL required inputs are non-null AND
#    sd is non-zero. Otherwise, es is set to NaN.
# 4) The updated DataFrame is stored in `df_out`.
# 5) The formula is computed using DataFrame.eval (engine="python").
# 6) All columns except "ID", "PIT Date", "HistCurrency", "FiscalPeriod",
#    and NEW_COLUMN are dropped.
# 7) The result is written to disk under OUTPUT_FILE_NAME.
#
# Notes:
# - No partial computations are performed: if any input is missing, output is NaN.
# - An explicit guard is applied to avoid division by zero (sd == 0).
# - If the expression evaluates to non-finite values (inf/-inf), they are set to NaN.
# =============================================================================

# ========================= CONFIG ============================================
OUTPUT_DIR = Path(Temp_file_path_A)
SEP = "|"

INPUT_FILE = "processed_data_Rs.txt"      # Input file (must already contain Drift and sd)
OUTPUT_FILE_NAME = "RS.txt"               # Output file

NEW_COLUMN = "rs"                         # Name of the computed column
FORMULA_EXPRESSION = "((rev - rev_lag1) - Drift) / sd"
FORMULA_COLUMNS = ["rev", "rev_lag1", "Drift", "sd"]
# ============================================================================

# 1) LOAD INPUT FILE
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

df = pd.read_csv(path, sep=SEP)

# 2) CHECK THAT ALL REQUIRED COLUMNS EXIST
missing_formula_cols = [c for c in FORMULA_COLUMNS if c not in df.columns]
if missing_formula_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing required columns for formula: {missing_formula_cols}"
    )

# 3) INITIALIZE NEW COLUMN WITH NaN
df_out = df.copy()
df_out[NEW_COLUMN] = np.nan

# 4) ENSURE NUMERIC TYPES (COERCE NON-NUMERIC TO NaN)
for c in FORMULA_COLUMNS:
    df_out[c] = pd.to_numeric(df_out[c], errors="coerce")

# 5) BUILD MASK WHERE:
#    - all required inputs are non-null, AND
#    - sd is non-zero (avoid division by zero)
mask_all_present = df_out[FORMULA_COLUMNS].notna().all(axis=1) & (df_out["sd"] != 0)

# 6) APPLY FORMULA ONLY WHERE mask_all_present IS TRUE
df_out.loc[mask_all_present, NEW_COLUMN] = df_out.loc[mask_all_present].eval(
    FORMULA_EXPRESSION,
    engine="python",
)

# 7) SANITIZE NON-FINITE RESULTS (inf/-inf) -> NaN
#    This can happen if sd is extremely close to zero or due to numeric issues.
df_out.loc[~np.isfinite(df_out[NEW_COLUMN]), NEW_COLUMN] = np.nan

# 8) DROP ALL COLUMNS EXCEPT THE REQUIRED ONES
KEEP_COLUMNS = ["ID", "PIT Date", "HistCurrency", "FiscalPeriod", NEW_COLUMN]

missing_keep_cols = [c for c in KEEP_COLUMNS if c not in df_out.columns]
if missing_keep_cols:
    raise ValueError(f"Columns expected to be kept but not found: {missing_keep_cols}")

df_out = df_out[KEEP_COLUMNS]

# 9) DEFINE OUTPUT FILE PATH (FROM CONFIG)
formula_output_path = OUTPUT_DIR / OUTPUT_FILE_NAME

# 10) SAVE RESULT
df_out.to_csv(formula_output_path, sep=SEP, index=False)

# 11) PRINT SUMMARY
print(f"New column '{NEW_COLUMN}' created based on formula:")
print("   ", FORMULA_EXPRESSION)
print("Rows used (all required inputs non-null and sd != 0):", int(mask_all_present.sum()))
print("Result was saved to:", formula_output_path)
print(df_out.head(20))


New column 'rs' created based on formula:
    ((rev - rev_lag1) - Drift) / sd
Rows used (all required inputs non-null and sd != 0): 1007091
Result was saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/RS.txt
           ID    PIT Date HistCurrency  FiscalPeriod        rs
0   C02500770  1995-12-29          Ars          1992       NaN
1   C02500770  1995-12-29          Ars          1993       NaN
2   C02500770  1995-12-29          Ars          1994       NaN
3   C02500770  1996-05-03          Ars          1995       NaN
4   C02500770  1998-07-03          Ars          1996 -0.224679
5   C02500770  1998-07-03          Ars          1997 -0.499734
6   C02500770  1999-10-01          Ars          1998  1.494384
7   C02500770  2000-05-19          Ars          1999  1.026737
8   C02500770  2000-05-26          Ars          1999  0.766257
9   C02520200  1996-05-03          Ars          1989       NaN
10  C02520200  1996-05-03          Ars          1990       NaN
11  C02520200  1996

### Sg

In [107]:
from pathlib import Path
import pandas as pd
import numpy as np

# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file (INPUT_FILE) from OUTPUT_DIR.
# 2) Creates a new column (NEW_COLUMN) based on a flexible arithmetic formula
#    defined in FORMULA_EXPRESSION using existing columns.
# 3) The formula is only applied to rows where ALL columns listed in
#    FORMULA_COLUMNS are non-null. Otherwise, the result is set to NaN.
# 4) The updated DataFrame is stored in `df_out`.
# 5) A new column (NEW_COLUMN) is computed using DataFrame.eval.
# 6) All columns except "ID", "PIT Date", "HistCurrency", "FiscalPeriod",
#    and NEW_COLUMN are dropped.
# 7) The output filename is defined via config.
# 8) The result is written to disk using the configured output name.
# =============================================================================

# ========================= CONFIG ============================================
OUTPUT_DIR = Path(Temp_file_path_A)
SEP = "|"

INPUT_FILE = "processed_data_Sg.txt"               # <-- Input file
OUTPUT_FILE_NAME = "SG.txt"  # <-- Output file

NEW_COLUMN = "sg"                             # <-- Name of the new computed column
FORMULA_EXPRESSION = "(be / be_lag1) - 1"
FORMULA_COLUMNS = ["be_lag1", "be"]
# ============================================================================

# 1) LOAD INPUT FILE
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

df = pd.read_csv(path, sep=SEP)

# 2) CHECK THAT ALL REQUIRED COLUMNS EXIST
missing_formula_cols = [c for c in FORMULA_COLUMNS if c not in df.columns]
if missing_formula_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing required columns for formula: {missing_formula_cols}"
    )

# 3) INITIALIZE NEW COLUMN WITH NaN
df_out = df.copy()
df_out[NEW_COLUMN] = np.nan

# 4) BUILD MASK WHERE ALL REQUIRED COLUMNS ARE NON-NULL
mask_all_present = df_out[FORMULA_COLUMNS].notna().all(axis=1)

# 5) APPLY FORMULA ONLY WHERE mask_all_present IS TRUE
df_out.loc[mask_all_present, NEW_COLUMN] = df_out.loc[mask_all_present].eval(
    FORMULA_EXPRESSION,
    engine="python",
)

# 6) DROP ALL COLUMNS EXCEPT THE REQUIRED ONES
KEEP_COLUMNS = ["ID", "PIT Date", "HistCurrency", "FiscalPeriod", NEW_COLUMN]

missing_keep_cols = [c for c in KEEP_COLUMNS if c not in df_out.columns]
if missing_keep_cols:
    raise ValueError(f"Columns expected to be kept but not found: {missing_keep_cols}")

df_out = df_out[KEEP_COLUMNS]

# 7) DEFINE OUTPUT FILE NAME (FROM CONFIG)
formula_output_path = OUTPUT_DIR / OUTPUT_FILE_NAME

# 8) SAVE RESULT
df_out.to_csv(formula_output_path, sep=SEP, index=False)

print(f"New column '{NEW_COLUMN}' created based on formula:")
print("   ", FORMULA_EXPRESSION)
print("Rows used (all required columns non-null):", mask_all_present.sum())
print("Result was saved to:", formula_output_path)
print(df_out.head(20))


New column 'sg' created based on formula:
    (be / be_lag1) - 1
Rows used (all required columns non-null): 1286154
Result was saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/SG.txt
           ID    PIT Date HistCurrency  FiscalPeriod        sg
0   C02500770  1995-12-29          Ars          1992       NaN
1   C02500770  1995-12-29          Ars          1993       NaN
2   C02500770  1995-12-29          Ars          1994       NaN
3   C02500770  1996-05-03          Ars          1995       NaN
4   C02500770  1998-07-03          Ars          1996       NaN
5   C02500770  1998-07-03          Ars          1997       NaN
6   C02500770  1999-10-01          Ars          1998 -0.745101
7   C02500770  2000-05-19          Ars          1999  0.523340
8   C02520200  1996-05-03          Ars          1987       NaN
9   C02520200  1996-05-03          Ars          1988       NaN
10  C02520200  1996-05-03          Ars          1989       NaN
11  C02520200  1996-05-03          Ars     

### Sli

#### 1) Calculate Sag

In [108]:
from pathlib import Path
import pandas as pd
import numpy as np

# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file (INPUT_FILE) from OUTPUT_DIR.
# 2) Creates a new column (NEW_COLUMN) based on a flexible arithmetic formula
#    defined in FORMULA_EXPRESSION using existing columns.
# 3) The formula is only applied to rows where ALL columns listed in
#    FORMULA_COLUMNS are non-null. Otherwise, the result is set to NaN.
# 4) The updated DataFrame is stored in `df_out`.
# 5) A new column (NEW_COLUMN) is computed using DataFrame.eval.
# 6) All columns except "ID", "PIT Date", "HistCurrency", "FiscalPeriod",
#    and NEW_COLUMN are dropped.
# 7) The output filename is defined via config.
# 8) The result is written to disk using the configured output name.
# =============================================================================

# ========================= CONFIG ============================================
OUTPUT_DIR = Path(Temp_file_path_A)
SEP = "|"

INPUT_FILE = "processed_data_Sli.txt"               # <-- Input file
OUTPUT_FILE_NAME = "processed_data_Sli.txt"  # <-- Output file

NEW_COLUMN = "sag"                             # <-- Name of the new computed column
FORMULA_EXPRESSION = "(rev - 0.5 * (rev_lag1 + rev_lag2)) / (0.5 * (rev_lag1 + rev_lag2))"
FORMULA_COLUMNS = ["rev", "rev_lag1", "rev_lag2"]
# ============================================================================

# 1) LOAD INPUT FILE
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

df = pd.read_csv(path, sep=SEP)

# 2) CHECK THAT ALL REQUIRED COLUMNS EXIST
missing_formula_cols = [c for c in FORMULA_COLUMNS if c not in df.columns]
if missing_formula_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing required columns for formula: {missing_formula_cols}"
    )

# 3) INITIALIZE NEW COLUMN WITH NaN
df_out = df.copy()
df_out[NEW_COLUMN] = np.nan

# 4) BUILD MASK WHERE ALL REQUIRED COLUMNS ARE NON-NULL
mask_all_present = df_out[FORMULA_COLUMNS].notna().all(axis=1)

# 5) APPLY FORMULA ONLY WHERE mask_all_present IS TRUE
df_out.loc[mask_all_present, NEW_COLUMN] = df_out.loc[mask_all_present].eval(
    FORMULA_EXPRESSION,
    engine="python",
)

# 7) DEFINE OUTPUT FILE NAME (FROM CONFIG)
formula_output_path = OUTPUT_DIR / OUTPUT_FILE_NAME

# 8) SAVE RESULT
df_out.to_csv(formula_output_path, sep=SEP, index=False)

print(f"New column '{NEW_COLUMN}' created based on formula:")
print("   ", FORMULA_EXPRESSION)
print("Rows used (all required columns non-null):", mask_all_present.sum())
print("Result was saved to:", formula_output_path)
print(df_out.head(20))


New column 'sag' created based on formula:
    (rev - 0.5 * (rev_lag1 + rev_lag2)) / (0.5 * (rev_lag1 + rev_lag2))
Rows used (all required columns non-null): 1933038
Result was saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/processed_data_Sli.txt
           ID    PIT Date HistCurrency  FiscalPeriod AnnPITValue_Period  \
0   C02500770  1995-12-29          Ars          1992                  A   
1   C02500770  1995-12-29          Ars          1992                NaN   
2   C02500770  1995-12-29          Ars          1993                  A   
3   C02500770  1995-12-29          Ars          1993                NaN   
4   C02500770  1995-12-29          Ars          1994                  A   
5   C02500770  1995-12-29          Ars          1994                NaN   
6   C02500770  1996-05-03          Ars          1995                  A   
7   C02500770  1996-05-03          Ars          1995                NaN   
8   C02500770  1998-07-03          Ars          1996      

#### 2) Calculate Ivg

In [109]:
from pathlib import Path
import pandas as pd
import numpy as np

# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file (INPUT_FILE) from OUTPUT_DIR.
# 2) Creates a new column (NEW_COLUMN) based on a flexible arithmetic formula
#    defined in FORMULA_EXPRESSION using existing columns.
# 3) The formula is only applied to rows where ALL columns listed in
#    FORMULA_COLUMNS are non-null. Otherwise, the result is set to NaN.
# 4) The updated DataFrame is stored in `df_out`.
# 5) A new column (NEW_COLUMN) is computed using DataFrame.eval.
# 6) All columns except "ID", "PIT Date", "HistCurrency", "FiscalPeriod",
#    and NEW_COLUMN are dropped.
# 7) The output filename is defined via config.
# 8) The result is written to disk using the configured output name.
# =============================================================================

# ========================= CONFIG ============================================
OUTPUT_DIR = Path(Temp_file_path_A)
SEP = "|"

INPUT_FILE = "processed_data_Sli.txt"               # <-- Input file
OUTPUT_FILE_NAME = "processed_data_Sli.txt"  # <-- Output file

NEW_COLUMN = "ivg"                             # <-- Name of the new computed column
FORMULA_EXPRESSION = "(inv - 0.5 * (inv_lag1 + inv_lag2)) / (0.5 * (inv_lag1 + inv_lag2))"
FORMULA_COLUMNS = ["inv", "inv_lag1", "inv_lag2"]
# ============================================================================

# 1) LOAD INPUT FILE
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

df = pd.read_csv(path, sep=SEP)

# 2) CHECK THAT ALL REQUIRED COLUMNS EXIST
missing_formula_cols = [c for c in FORMULA_COLUMNS if c not in df.columns]
if missing_formula_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing required columns for formula: {missing_formula_cols}"
    )

# 3) INITIALIZE NEW COLUMN WITH NaN
df_out = df.copy()
df_out[NEW_COLUMN] = np.nan

# 4) BUILD MASK WHERE ALL REQUIRED COLUMNS ARE NON-NULL
mask_all_present = df_out[FORMULA_COLUMNS].notna().all(axis=1)

# 5) APPLY FORMULA ONLY WHERE mask_all_present IS TRUE
df_out.loc[mask_all_present, NEW_COLUMN] = df_out.loc[mask_all_present].eval(
    FORMULA_EXPRESSION,
    engine="python",
)

# 7) DEFINE OUTPUT FILE NAME (FROM CONFIG)
formula_output_path = OUTPUT_DIR / OUTPUT_FILE_NAME

# 8) SAVE RESULT
df_out.to_csv(formula_output_path, sep=SEP, index=False)

print(f"New column '{NEW_COLUMN}' created based on formula:")
print("   ", FORMULA_EXPRESSION)
print("Rows used (all required columns non-null):", mask_all_present.sum())
print("Result was saved to:", formula_output_path)
print(df_out.head(20))


New column 'ivg' created based on formula:
    (inv - 0.5 * (inv_lag1 + inv_lag2)) / (0.5 * (inv_lag1 + inv_lag2))
Rows used (all required columns non-null): 1636079
Result was saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/processed_data_Sli.txt
           ID    PIT Date HistCurrency  FiscalPeriod AnnPITValue_Period  \
0   C02500770  1995-12-29          Ars          1992                  A   
1   C02500770  1995-12-29          Ars          1992                NaN   
2   C02500770  1995-12-29          Ars          1993                  A   
3   C02500770  1995-12-29          Ars          1993                NaN   
4   C02500770  1995-12-29          Ars          1994                  A   
5   C02500770  1995-12-29          Ars          1994                NaN   
6   C02500770  1996-05-03          Ars          1995                  A   
7   C02500770  1996-05-03          Ars          1995                NaN   
8   C02500770  1998-07-03          Ars          1996      

#### 3) Calculate Sli

In [110]:
from pathlib import Path
import pandas as pd
import numpy as np

# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file (INPUT_FILE) from OUTPUT_DIR.
# 2) Creates a new column (NEW_COLUMN) based on a flexible arithmetic formula
#    defined in FORMULA_EXPRESSION using existing columns.
# 3) The formula is only applied to rows where ALL columns listed in
#    FORMULA_COLUMNS are non-null. Otherwise, the result is set to NaN.
# 4) The updated DataFrame is stored in `df_out`.
# 5) A new column (NEW_COLUMN) is computed using DataFrame.eval.
# 6) All columns except "ID", "PIT Date", "HistCurrency", "FiscalPeriod",
#    and NEW_COLUMN are dropped.
# 7) The output filename is defined via config.
# 8) The result is written to disk using the configured output name.
# =============================================================================

# ========================= CONFIG ============================================
OUTPUT_DIR = Path(Temp_file_path_A)
SEP = "|"

INPUT_FILE = "processed_data_Sli.txt"               # <-- Input file
OUTPUT_FILE_NAME = "SLI.txt"  # <-- Output file

NEW_COLUMN = "sli"                             # <-- Name of the new computed column
FORMULA_EXPRESSION = "sag - ivg"
FORMULA_COLUMNS = ["sag", "ivg"]
# ============================================================================

# 1) LOAD INPUT FILE
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

df = pd.read_csv(path, sep=SEP)

# 2) CHECK THAT ALL REQUIRED COLUMNS EXIST
missing_formula_cols = [c for c in FORMULA_COLUMNS if c not in df.columns]
if missing_formula_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing required columns for formula: {missing_formula_cols}"
    )

# 3) INITIALIZE NEW COLUMN WITH NaN
df_out = df.copy()
df_out[NEW_COLUMN] = np.nan

# 4) BUILD MASK WHERE ALL REQUIRED COLUMNS ARE NON-NULL
mask_all_present = df_out[FORMULA_COLUMNS].notna().all(axis=1)

# 5) APPLY FORMULA ONLY WHERE mask_all_present IS TRUE
df_out.loc[mask_all_present, NEW_COLUMN] = df_out.loc[mask_all_present].eval(
    FORMULA_EXPRESSION,
    engine="python",
)

# 6) DROP ALL COLUMNS EXCEPT THE REQUIRED ONES
KEEP_COLUMNS = ["ID", "PIT Date", "HistCurrency", "FiscalPeriod", NEW_COLUMN]

missing_keep_cols = [c for c in KEEP_COLUMNS if c not in df_out.columns]
if missing_keep_cols:
    raise ValueError(f"Columns expected to be kept but not found: {missing_keep_cols}")

df_out = df_out[KEEP_COLUMNS]

# 7) DEFINE OUTPUT FILE NAME (FROM CONFIG)
formula_output_path = OUTPUT_DIR / OUTPUT_FILE_NAME

# 8) SAVE RESULT
df_out.to_csv(formula_output_path, sep=SEP, index=False)

print(f"New column '{NEW_COLUMN}' created based on formula:")
print("   ", FORMULA_EXPRESSION)
print("Rows used (all required columns non-null):", mask_all_present.sum())
print("Result was saved to:", formula_output_path)
print(df_out.head(20))


New column 'sli' created based on formula:
    sag - ivg
Rows used (all required columns non-null): 680711
Result was saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/SLI.txt
           ID    PIT Date HistCurrency  FiscalPeriod       sli
0   C02500770  1995-12-29          Ars          1992       NaN
1   C02500770  1995-12-29          Ars          1992       NaN
2   C02500770  1995-12-29          Ars          1993       NaN
3   C02500770  1995-12-29          Ars          1993       NaN
4   C02500770  1995-12-29          Ars          1994 -0.074458
5   C02500770  1995-12-29          Ars          1994       NaN
6   C02500770  1996-05-03          Ars          1995 -0.219070
7   C02500770  1996-05-03          Ars          1995       NaN
8   C02500770  1998-07-03          Ars          1996 -0.033297
9   C02500770  1998-07-03          Ars          1996       NaN
10  C02500770  1998-07-03          Ars          1997 -0.347189
11  C02500770  1998-07-03          Ars          199

### Slx

#### 1) Calculate Sag

In [111]:
from pathlib import Path
import pandas as pd
import numpy as np

# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file (INPUT_FILE) from OUTPUT_DIR.
# 2) Creates a new column (NEW_COLUMN) based on a flexible arithmetic formula
#    defined in FORMULA_EXPRESSION using existing columns.
# 3) The formula is only applied to rows where ALL columns listed in
#    FORMULA_COLUMNS are non-null. Otherwise, the result is set to NaN.
# 4) The updated DataFrame is stored in `df_out`.
# 5) A new column (NEW_COLUMN) is computed using DataFrame.eval.
# 6) All columns except "ID", "PIT Date", "HistCurrency", "FiscalPeriod",
#    and NEW_COLUMN are dropped.
# 7) The output filename is defined via config.
# 8) The result is written to disk using the configured output name.
# =============================================================================

# ========================= CONFIG ============================================
OUTPUT_DIR = Path(Temp_file_path_A)
SEP = "|"

INPUT_FILE = "processed_data_Slx.txt"               # <-- Input file
OUTPUT_FILE_NAME = "processed_data_Slx.txt"  # <-- Output file

NEW_COLUMN = "sag"                             # <-- Name of the new computed column
FORMULA_EXPRESSION = "(rev - 0.5 * (rev_lag1 + rev_lag2)) / (0.5 * (rev_lag1 + rev_lag2))"
FORMULA_COLUMNS = ["rev", "rev_lag1", "rev_lag2"]
# ============================================================================

# 1) LOAD INPUT FILE
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

df = pd.read_csv(path, sep=SEP)

# 2) CHECK THAT ALL REQUIRED COLUMNS EXIST
missing_formula_cols = [c for c in FORMULA_COLUMNS if c not in df.columns]
if missing_formula_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing required columns for formula: {missing_formula_cols}"
    )

# 3) INITIALIZE NEW COLUMN WITH NaN
df_out = df.copy()
df_out[NEW_COLUMN] = np.nan

# 4) BUILD MASK WHERE ALL REQUIRED COLUMNS ARE NON-NULL
mask_all_present = df_out[FORMULA_COLUMNS].notna().all(axis=1)

# 5) APPLY FORMULA ONLY WHERE mask_all_present IS TRUE
df_out.loc[mask_all_present, NEW_COLUMN] = df_out.loc[mask_all_present].eval(
    FORMULA_EXPRESSION,
    engine="python",
)

# 7) DEFINE OUTPUT FILE NAME (FROM CONFIG)
formula_output_path = OUTPUT_DIR / OUTPUT_FILE_NAME

# 8) SAVE RESULT
df_out.to_csv(formula_output_path, sep=SEP, index=False)

print(f"New column '{NEW_COLUMN}' created based on formula:")
print("   ", FORMULA_EXPRESSION)
print("Rows used (all required columns non-null):", mask_all_present.sum())
print("Result was saved to:", formula_output_path)
print(df_out.head(20))


New column 'sag' created based on formula:
    (rev - 0.5 * (rev_lag1 + rev_lag2)) / (0.5 * (rev_lag1 + rev_lag2))
Rows used (all required columns non-null): 1481572
Result was saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/processed_data_Slx.txt
           ID    PIT Date HistCurrency  FiscalPeriod  AnnPITValue_Period  \
0   C02500770  1995-12-29          Ars          1992                 NaN   
1   C02500770  1995-12-29          Ars          1993                 NaN   
2   C02500770  1995-12-29          Ars          1994                 NaN   
3   C02500770  1996-05-03          Ars          1995                 NaN   
4   C02500770  1998-07-03          Ars          1996                 NaN   
5   C02500770  1998-07-03          Ars          1997                 NaN   
6   C02500770  1999-10-01          Ars          1998                 NaN   
7   C02500770  1999-10-08          Ars          1997                 NaN   
8   C02500770  2000-05-19          Ars          1

#### 2) Calculate Xg

In [112]:
from pathlib import Path
import pandas as pd
import numpy as np

# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file (INPUT_FILE) from OUTPUT_DIR.
# 2) Creates a new column (NEW_COLUMN) based on a flexible arithmetic formula
#    defined in FORMULA_EXPRESSION using existing columns.
# 3) The formula is only applied to rows where ALL columns listed in
#    FORMULA_COLUMNS are non-null. Otherwise, the result is set to NaN.
# 4) The updated DataFrame is stored in `df_out`.
# 5) A new column (NEW_COLUMN) is computed using DataFrame.eval.
# 6) All columns except "ID", "PIT Date", "HistCurrency", "FiscalPeriod",
#    and NEW_COLUMN are dropped.
# 7) The output filename is defined via config.
# 8) The result is written to disk using the configured output name.
# =============================================================================

# ========================= CONFIG ============================================
OUTPUT_DIR = Path(Temp_file_path_A)
SEP = "|"

INPUT_FILE = "processed_data_Slx.txt"               # <-- Input file
OUTPUT_FILE_NAME = "processed_data_Slx.txt"  # <-- Output file

NEW_COLUMN = "xg"                             # <-- Name of the new computed column
FORMULA_EXPRESSION = "(sga - 0.5 * (sga_lag1 + sga_lag2)) / (0.5 * (sga_lag1 + sga_lag2))"
FORMULA_COLUMNS = ["sga", "sga_lag1", "sga_lag2"]
# ============================================================================

# 1) LOAD INPUT FILE
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

df = pd.read_csv(path, sep=SEP)

# 2) CHECK THAT ALL REQUIRED COLUMNS EXIST
missing_formula_cols = [c for c in FORMULA_COLUMNS if c not in df.columns]
if missing_formula_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing required columns for formula: {missing_formula_cols}"
    )

# 3) INITIALIZE NEW COLUMN WITH NaN
df_out = df.copy()
df_out[NEW_COLUMN] = np.nan

# 4) BUILD MASK WHERE ALL REQUIRED COLUMNS ARE NON-NULL
mask_all_present = df_out[FORMULA_COLUMNS].notna().all(axis=1)

# 5) APPLY FORMULA ONLY WHERE mask_all_present IS TRUE
df_out.loc[mask_all_present, NEW_COLUMN] = df_out.loc[mask_all_present].eval(
    FORMULA_EXPRESSION,
    engine="python",
)

# 7) DEFINE OUTPUT FILE NAME (FROM CONFIG)
formula_output_path = OUTPUT_DIR / OUTPUT_FILE_NAME

# 8) SAVE RESULT
df_out.to_csv(formula_output_path, sep=SEP, index=False)

print(f"New column '{NEW_COLUMN}' created based on formula:")
print("   ", FORMULA_EXPRESSION)
print("Rows used (all required columns non-null):", mask_all_present.sum())
print("Result was saved to:", formula_output_path)
print(df_out.head(20))


New column 'xg' created based on formula:
    (sga - 0.5 * (sga_lag1 + sga_lag2)) / (0.5 * (sga_lag1 + sga_lag2))
Rows used (all required columns non-null): 1128389
Result was saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/processed_data_Slx.txt
           ID    PIT Date HistCurrency  FiscalPeriod  AnnPITValue_Period  \
0   C02500770  1995-12-29          Ars          1992                 NaN   
1   C02500770  1995-12-29          Ars          1993                 NaN   
2   C02500770  1995-12-29          Ars          1994                 NaN   
3   C02500770  1996-05-03          Ars          1995                 NaN   
4   C02500770  1998-07-03          Ars          1996                 NaN   
5   C02500770  1998-07-03          Ars          1997                 NaN   
6   C02500770  1999-10-01          Ars          1998                 NaN   
7   C02500770  1999-10-08          Ars          1997                 NaN   
8   C02500770  2000-05-19          Ars          19

#### 3) Calculate Slx

In [113]:
from pathlib import Path
import pandas as pd
import numpy as np

# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file (INPUT_FILE) from OUTPUT_DIR.
# 2) Creates a new column (NEW_COLUMN) based on a flexible arithmetic formula
#    defined in FORMULA_EXPRESSION using existing columns.
# 3) The formula is only applied to rows where ALL columns listed in
#    FORMULA_COLUMNS are non-null. Otherwise, the result is set to NaN.
# 4) The updated DataFrame is stored in `df_out`.
# 5) A new column (NEW_COLUMN) is computed using DataFrame.eval.
# 6) All columns except "ID", "PIT Date", "HistCurrency", "FiscalPeriod",
#    and NEW_COLUMN are dropped.
# 7) The output filename is defined via config.
# 8) The result is written to disk using the configured output name.
# =============================================================================

# ========================= CONFIG ============================================
OUTPUT_DIR = Path(Temp_file_path_A)
SEP = "|"

INPUT_FILE = "processed_data_Slx.txt"               # <-- Input file
OUTPUT_FILE_NAME = "SLX.txt"  # <-- Output file

NEW_COLUMN = "slx"                             # <-- Name of the new computed column
FORMULA_EXPRESSION = "sag - xg"
FORMULA_COLUMNS = ["sag", "xg"]
# ============================================================================

# 1) LOAD INPUT FILE
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

df = pd.read_csv(path, sep=SEP)

# 2) CHECK THAT ALL REQUIRED COLUMNS EXIST
missing_formula_cols = [c for c in FORMULA_COLUMNS if c not in df.columns]
if missing_formula_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing required columns for formula: {missing_formula_cols}"
    )

# 3) INITIALIZE NEW COLUMN WITH NaN
df_out = df.copy()
df_out[NEW_COLUMN] = np.nan

# 4) BUILD MASK WHERE ALL REQUIRED COLUMNS ARE NON-NULL
mask_all_present = df_out[FORMULA_COLUMNS].notna().all(axis=1)

# 5) APPLY FORMULA ONLY WHERE mask_all_present IS TRUE
df_out.loc[mask_all_present, NEW_COLUMN] = df_out.loc[mask_all_present].eval(
    FORMULA_EXPRESSION,
    engine="python",
)

# 6) DROP ALL COLUMNS EXCEPT THE REQUIRED ONES
KEEP_COLUMNS = ["ID", "PIT Date", "HistCurrency", "FiscalPeriod", NEW_COLUMN]

missing_keep_cols = [c for c in KEEP_COLUMNS if c not in df_out.columns]
if missing_keep_cols:
    raise ValueError(f"Columns expected to be kept but not found: {missing_keep_cols}")

df_out = df_out[KEEP_COLUMNS]

# 7) DEFINE OUTPUT FILE NAME (FROM CONFIG)
formula_output_path = OUTPUT_DIR / OUTPUT_FILE_NAME

# 8) SAVE RESULT
df_out.to_csv(formula_output_path, sep=SEP, index=False)

print(f"New column '{NEW_COLUMN}' created based on formula:")
print("   ", FORMULA_EXPRESSION)
print("Rows used (all required columns non-null):", mask_all_present.sum())
print("Result was saved to:", formula_output_path)
print(df_out.head(20))


New column 'slx' created based on formula:
    sag - xg
Rows used (all required columns non-null): 1053800
Result was saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/SLX.txt
           ID    PIT Date HistCurrency  FiscalPeriod       slx
0   C02500770  1995-12-29          Ars          1992       NaN
1   C02500770  1995-12-29          Ars          1993       NaN
2   C02500770  1995-12-29          Ars          1994  0.528419
3   C02500770  1996-05-03          Ars          1995 -0.169804
4   C02500770  1998-07-03          Ars          1996 -0.080627
5   C02500770  1998-07-03          Ars          1997 -0.222608
6   C02500770  1999-10-01          Ars          1998 -0.003509
7   C02500770  1999-10-08          Ars          1997 -0.138541
8   C02500770  2000-05-19          Ars          1999       NaN
9   C02500770  2000-05-26          Ars          1999 -0.038820
10  C02520200  1996-05-03          Ars          1988       NaN
11  C02520200  1996-05-03          Ars          198

### Tx

In [114]:
from pathlib import Path
import pandas as pd
import numpy as np

# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file (INPUT_FILE) from OUTPUT_DIR.
# 2) Creates a new column (NEW_COLUMN) based on a flexible arithmetic formula
#    defined in FORMULA_EXPRESSION using existing columns.
# 3) The formula is only applied to rows where ALL columns listed in
#    FORMULA_COLUMNS are non-null. Otherwise, the result is set to NaN.
# 4) The updated DataFrame is stored in `df_out`.
# 5) A new column (NEW_COLUMN) is computed using DataFrame.eval.
# 6) All columns except "ID", "PIT Date", "HistCurrency", "FiscalPeriod",
#    and NEW_COLUMN are dropped.
# 7) The output filename is defined via config.
# 8) The result is written to disk using the configured output name.
# =============================================================================

# ========================= CONFIG ============================================
OUTPUT_DIR = Path(Temp_file_path_A)
SEP = "|"

INPUT_FILE = "data_Tx.txt"               # <-- Input file
OUTPUT_FILE_NAME = "TX.txt"  # <-- Output file

NEW_COLUMN = "tx"                             # <-- Name of the new computed column
FORMULA_EXPRESSION = "itx / (0.35 * ni_extra)"
FORMULA_COLUMNS = ["itx", "ni_extra"]
# ============================================================================

# 1) LOAD INPUT FILE
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

df = pd.read_csv(path, sep=SEP)

# 2) CHECK THAT ALL REQUIRED COLUMNS EXIST
missing_formula_cols = [c for c in FORMULA_COLUMNS if c not in df.columns]
if missing_formula_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing required columns for formula: {missing_formula_cols}"
    )

# 3) INITIALIZE NEW COLUMN WITH NaN
df_out = df.copy()
df_out[NEW_COLUMN] = np.nan

# 4) BUILD MASK WHERE ALL REQUIRED COLUMNS ARE NON-NULL
mask_all_present = df_out[FORMULA_COLUMNS].notna().all(axis=1)

# 5) APPLY FORMULA ONLY WHERE mask_all_present IS TRUE
df_out.loc[mask_all_present, NEW_COLUMN] = df_out.loc[mask_all_present].eval(
    FORMULA_EXPRESSION,
    engine="python",
)

# 6) DROP ALL COLUMNS EXCEPT THE REQUIRED ONES
KEEP_COLUMNS = ["ID", "PIT Date", "HistCurrency", "FiscalPeriod", NEW_COLUMN]

missing_keep_cols = [c for c in KEEP_COLUMNS if c not in df_out.columns]
if missing_keep_cols:
    raise ValueError(f"Columns expected to be kept but not found: {missing_keep_cols}")

df_out = df_out[KEEP_COLUMNS]

# 7) DEFINE OUTPUT FILE NAME (FROM CONFIG)
formula_output_path = OUTPUT_DIR / OUTPUT_FILE_NAME

# 8) SAVE RESULT
df_out.to_csv(formula_output_path, sep=SEP, index=False)

print(f"New column '{NEW_COLUMN}' created based on formula:")
print("   ", FORMULA_EXPRESSION)
print("Rows used (all required columns non-null):", mask_all_present.sum())
print("Result was saved to:", formula_output_path)
print(df_out.head(20))


New column 'tx' created based on formula:
    itx / (0.35 * ni_extra)
Rows used (all required columns non-null): 1341007
Result was saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/TX.txt
           ID    PIT Date HistCurrency  FiscalPeriod        tx
0   C02500770  1995-12-29          Ars          1992  0.718369
1   C02500770  1995-12-29          Ars          1993  0.619786
2   C02500770  1995-12-29          Ars          1994  0.694892
3   C02500770  1996-05-03          Ars          1995 -0.311765
4   C02500770  1998-07-03          Ars          1996  0.184203
5   C02500770  1998-07-03          Ars          1997       NaN
6   C02500770  1999-10-01          Ars          1998 -0.000245
7   C02500770  2000-05-19          Ars          1999       NaN
8   C02500770  2000-05-26          Ars          1999 -0.086758
9   C02520200  1996-05-03          Ars          1987       NaN
10  C02520200  1996-05-03          Ars          1988       NaN
11  C02520200  1996-05-03          Ars

### Txf

In [115]:
from pathlib import Path
import pandas as pd
import numpy as np

# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file (INPUT_FILE) from OUTPUT_DIR.
# 2) Creates a new column (NEW_COLUMN) based on a flexible arithmetic formula
#    defined in FORMULA_EXPRESSION using existing columns.
# 3) The formula is only applied to rows where ALL columns listed in
#    FORMULA_COLUMNS are non-null. Otherwise, the result is set to NaN.
# 4) The updated DataFrame is stored in `df_out`.
# 5) A new column (NEW_COLUMN) is computed using DataFrame.eval.
# 6) All columns except "ID", "PIT Date", "HistCurrency", "FiscalPeriod",
#    and NEW_COLUMN are dropped.
# 7) The output filename is defined via config.
# 8) The result is written to disk using the configured output name.
# =============================================================================

# ========================= CONFIG ============================================
OUTPUT_DIR = Path(Temp_file_path_A)
SEP = "|"

INPUT_FILE = "processed_data_Txf.txt"               # <-- Input file
OUTPUT_FILE_NAME = "TXF.txt"  # <-- Output file

NEW_COLUMN = "txf"                             # <-- Name of the new computed column
FORMULA_EXPRESSION = "(socaps - pocaps - div + diss - dred) / (0.5 * (at_lag1 + at_lag2))"
FORMULA_COLUMNS = ["socaps", "pocaps", "div", "diss", "dred", "at_lag1", "at_lag2"]
# ============================================================================

# 1) LOAD INPUT FILE
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

df = pd.read_csv(path, sep=SEP)

# 2) CHECK THAT ALL REQUIRED COLUMNS EXIST
missing_formula_cols = [c for c in FORMULA_COLUMNS if c not in df.columns]
if missing_formula_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing required columns for formula: {missing_formula_cols}"
    )

# 3) INITIALIZE NEW COLUMN WITH NaN
df_out = df.copy()
df_out[NEW_COLUMN] = np.nan

# 4) BUILD MASK WHERE ALL REQUIRED COLUMNS ARE NON-NULL
mask_all_present = df_out[FORMULA_COLUMNS].notna().all(axis=1)

# 5) APPLY FORMULA ONLY WHERE mask_all_present IS TRUE
df_out.loc[mask_all_present, NEW_COLUMN] = df_out.loc[mask_all_present].eval(
    FORMULA_EXPRESSION,
    engine="python",
)

# 6) DROP ALL COLUMNS EXCEPT THE REQUIRED ONES
KEEP_COLUMNS = ["ID", "PIT Date", "HistCurrency", "FiscalPeriod", NEW_COLUMN]

missing_keep_cols = [c for c in KEEP_COLUMNS if c not in df_out.columns]
if missing_keep_cols:
    raise ValueError(f"Columns expected to be kept but not found: {missing_keep_cols}")

df_out = df_out[KEEP_COLUMNS]

# 7) DEFINE OUTPUT FILE NAME (FROM CONFIG)
formula_output_path = OUTPUT_DIR / OUTPUT_FILE_NAME

# 8) SAVE RESULT
df_out.to_csv(formula_output_path, sep=SEP, index=False)

print(f"New column '{NEW_COLUMN}' created based on formula:")
print("   ", FORMULA_EXPRESSION)
print("Rows used (all required columns non-null):", mask_all_present.sum())
print("Result was saved to:", formula_output_path)
print(df_out.head(20))


New column 'txf' created based on formula:
    (socaps - pocaps - div + diss - dred) / (0.5 * (at_lag1 + at_lag2))
Rows used (all required columns non-null): 339351
Result was saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/TXF.txt
           ID    PIT Date HistCurrency  FiscalPeriod       txf
0   C02500770  1995-12-29          Ars          1992       NaN
1   C02500770  1995-12-29          Ars          1992       NaN
2   C02500770  1995-12-29          Ars          1993       NaN
3   C02500770  1995-12-29          Ars          1993       NaN
4   C02500770  1995-12-29          Ars          1994 -0.006296
5   C02500770  1995-12-29          Ars          1994       NaN
6   C02500770  1996-05-03          Ars          1995       NaN
7   C02500770  1996-05-03          Ars          1995       NaN
8   C02500770  1996-05-31          Ars          1995       NaN
9   C02500770  1998-07-03          Ars          1996  0.000000
10  C02500770  1998-07-03          Ars          1996    

## Clean Anomaly Data

In [116]:
from pathlib import Path
import numpy as np
import pandas as pd

# =============================================================================
# SUMMARY OF THIS SCRIPT
# -----------------------------------------------------------------------------
# This script performs standardized cleaning and filtering of firm-level
# financial indicator datasets. It includes detailed DATA LOSS TRACKING.
#
# Steps:
# 1. Loads the dataset.
# 2. Tracks Row/Firm-Year counts at every step.
# 3. Filters out countries with fewer than MIN_UNIQUE_IDS_PER_COUNTRY firms.
# 4. Determines target column (base stem or anomaly).
# 5. Cleans target column (Numeric, remove NaN/Inf).
# 6. Keeps only the newest FiscalPeriod per firm.
# 7. Applies anomaly filters from Bowles et al. (ES, POA, PTA).
# 8. Removes duplicates (retries earliest PIT Date).
# 9. Trims top/bottom 0.5% of values per country.
# 10. Saves output and prints a detailed Loss Report.
# =============================================================================


# =============================================================================
# CONFIGURATION
# =============================================================================

# Base directory containing all raw input text files
# Ensure 'Temp_file_path_A' is defined in your environment
DATA_DIR = Path(Temp_file_path_A)

# Explicit list of datasets to process
input_files = [
    DATA_DIR / "ACC.txt",
    DATA_DIR / "AG.txt",
    DATA_DIR / "AT.txt",
    DATA_DIR / "CAT.txt",
    DATA_DIR / "CPM.txt",
    DATA_DIR / "EC.txt",
    DATA_DIR / "ES.txt",
    DATA_DIR / "GP.txt",
    DATA_DIR / "IG.txt",
    DATA_DIR / "INV.txt",
    DATA_DIR / "LTG.txt",
    DATA_DIR / "NCA.txt",
    DATA_DIR / "NOA.txt",
    DATA_DIR / "NWC.txt",
    DATA_DIR / "OL.txt",
    DATA_DIR / "OSC.txt",
    DATA_DIR / "PM.txt",
    DATA_DIR / "POA.txt",
    DATA_DIR / "PRO.txt",
    DATA_DIR / "PTA.txt",
    DATA_DIR / "ROE.txt",
    DATA_DIR / "RS.txt",
    #DATA_DIR / "SAG.txt",
    DATA_DIR / "SG.txt",
    DATA_DIR / "SLI.txt",
    DATA_DIR / "SLX.txt",
    DATA_DIR / "TX.txt",
    DATA_DIR / "TXF.txt",
]

# Minimum number of distinct firms required per country
MIN_UNIQUE_IDS_PER_COUNTRY = 30


# =============================================================================
# HELPER: TRACKING FUNCTION
# =============================================================================
def get_counts(df):
    """
    Returns a tuple: (Total Rows, Unique Firm-Years).
    Unique Firm-Years are defined as unique combinations of (ID, FiscalPeriod).
    """
    if df.empty:
        return 0, 0
    
    rows = len(df)
    
    # Count unique (ID, FiscalPeriod) pairs
    if "ID" in df.columns and "FiscalPeriod" in df.columns:
        # Create a temporary view to ensure we are counting valid numeric periods
        # (matches the logic used in cleaning)
        temp_fp = pd.to_numeric(df["FiscalPeriod"], errors="coerce")
        valid_mask = temp_fp.notna()
        
        unique_fy = (
            df.loc[valid_mask, ["ID", "FiscalPeriod"]]
            .drop_duplicates(subset=["ID", "FiscalPeriod"])
            .shape[0]
        )
    else:
        unique_fy = 0
        
    return rows, unique_fy


# =============================================================================
# CLEANING FUNCTION (Verified & Updated)
# =============================================================================
def clean_single_result(df: pd.DataFrame, target_col: str, tracker_list: list) -> pd.DataFrame:
    """
    Performs column cleaning and tracks data loss inside the function.
    """
    # Work on a defensive copy
    df = df.copy()

    # --- 1. Validate Column ---
    if target_col not in df.columns:
        raise KeyError(
            f"Expected target column '{target_col}' not found. "
            f"Available columns: {list(df.columns)}"
        )

    print(f"  Using target column: '{target_col}'")

    # --- 2. Numeric Conversion & Infinity Check ---
    df[target_col] = pd.to_numeric(df[target_col], errors="coerce")

    mask_inf = df[target_col].isin([np.inf, -np.inf])
    if mask_inf.any():
        print(f"  Dropping {mask_inf.sum()} rows with ±infinity in '{target_col}'.")
        df = df[~mask_inf].reset_index(drop=True)
    
    # LOG STEP
    r, fy = get_counts(df)
    tracker_list.append({"Step": "Clean: Remove Infinity", "Rows": r, "FirmYears": fy})

    # --- 3. Drop Missing/Empty ---
    before_rows = len(df)
    df = df[df[target_col].notna() & (df[target_col] != "")].reset_index(drop=True)
    print(f"  Dropped {before_rows - len(df)} empty rows in '{target_col}'.")
    
    # LOG STEP
    r, fy = get_counts(df)
    tracker_list.append({"Step": "Clean: Remove NaNs/Empty", "Rows": r, "FirmYears": fy})

    # --- 4. FiscalPeriod Processing ---
    if "FiscalPeriod" not in df.columns:
        raise KeyError("Column 'FiscalPeriod' is missing.")
    df["FiscalPeriod"] = pd.to_numeric(df["FiscalPeriod"], errors="coerce")
    
    # Drop rows where FiscalPeriod became NaN
    df = df[df["FiscalPeriod"].notna()].reset_index(drop=True)

    # LOG STEP
    r, fy = get_counts(df)
    tracker_list.append({"Step": "Clean: Valid FiscalPeriod", "Rows": r, "FirmYears": fy})

    # =========================================================================
    # FILTER 1: DEDUPLICATE ON DATE (Handle same-day conflicts)
    # =========================================================================
    if "PIT Date" in df.columns:
        df["PIT Date"] = pd.to_datetime(df["PIT Date"], errors="coerce")
        df = df.dropna(subset=["PIT Date"])

        # Sort by ID -> Date -> FiscalPeriod (Ascending)
        # Result: If multiple rows have the SAME ID and SAME Date, 
        # the one with the highest (newest) FiscalPeriod is last.
        df = df.sort_values(by=["ID", "PIT Date", "FiscalPeriod"])
        
        # Keep ONLY the last row for each unique (ID, Date) pair
        df = df.drop_duplicates(subset=["ID", "PIT Date"], keep="last").reset_index(drop=True)

        # LOG STEP
        r, fy = get_counts(df)
        tracker_list.append({"Step": "Filter: Best FP per Date", "Rows": r, "FirmYears": fy})
    else:
        print("  WARNING: 'PIT Date' missing. Skipping Date-level dedup.")

    # =========================================================================
    # FILTER 2: STALE DATA CHECK (Handle Zombie/Digitized Data)
    # =========================================================================
    # Logic: If (PIT Date - Fiscal Year End) > 3 Years (1095 Days), DROP IT.
    # This specifically targets rows like: Date=2022, FiscalPeriod=1986.
    if "PIT Date" in df.columns:
        # Create a temporary FYE date (Assume Dec 31 of that Fiscal Period)
        temp_years = df["FiscalPeriod"].astype(int)
        
        fye_dates = pd.to_datetime(
            temp_years.astype(str) + "-12-31", 
            format="%Y-%m-%d", 
            errors="coerce"
        )
        
        # Calculate lag in days
        lag_days = (df["PIT Date"] - fye_dates).dt.days
        
        # Keep if Fresh (<= 3 years). We do NOT drop negative lags here 
        # (interim data is usually fine here, or handled elsewhere).
        mask_fresh = lag_days <= 1095
        
        dropped_count = (~mask_fresh).sum()
        df = df[mask_fresh].reset_index(drop=True)
        
        if dropped_count > 0:
            print(f"  Dropped {dropped_count} stale rows (>3 years old).")

        # LOG STEP
        r, fy = get_counts(df)
        tracker_list.append({"Step": "Filter: Stale (>3 Yrs)", "Rows": r, "FirmYears": fy})

    
    # Cast FiscalPeriod to Int64 for clean formatting
    df["FiscalPeriod"] = df["FiscalPeriod"].astype("Int64")

    return df


# =============================================================================
# MAIN PROCESSING LOOP
# =============================================================================

# Collector for all dropped countries across datasets
country_removal_log = []

for input_path in input_files:
    print("\n" + "=" * 80)
    print(f"Processing file: {input_path.name}")
    
    # Initialize tracker for this file
    file_tracker = []

    try:
        # Load dataset
        df_in = pd.read_csv(input_path, sep="|")
        
        # LOG: Initial Load
        r, fy = get_counts(df_in)
        file_tracker.append({"Step": "1. Raw Load", "Rows": r, "FirmYears": fy})
        print(f"  Rows BEFORE: {r}")
        print(f"  Firm-years BEFORE: {fy}")

        # ---------------------------------------------------------------------
        # 1. COUNTRY FILTERING
        # ---------------------------------------------------------------------
        df_in["country_code"] = df_in["ID"].astype(str).str[1:4]

        unique_ids_per_country = (
            df_in.groupby("country_code")["ID"]
            .nunique()
            .reset_index(name="unique_ids")
        )

        countries_below = unique_ids_per_country[
            unique_ids_per_country["unique_ids"] < MIN_UNIQUE_IDS_PER_COUNTRY
        ]

        for _, row in countries_below.iterrows():
            country_removal_log.append({
                "dataset": input_path.name,
                "country_code": row["country_code"],
                "unique_ids": int(row["unique_ids"]),
            })

        valid_countries = unique_ids_per_country[
            unique_ids_per_country["unique_ids"] >= MIN_UNIQUE_IDS_PER_COUNTRY
        ]["country_code"]

        df_in = df_in[df_in["country_code"].isin(valid_countries)].copy()
        
        # NOTE: We drop 'country_code' here for cleanliness, 
        # but we will need to regenerate it later for the trimming step.
        df_in.drop(columns=["country_code"], inplace=True)

        # LOG: Country Filter
        r, fy = get_counts(df_in)
        file_tracker.append({"Step": "2. Country Filter", "Rows": r, "FirmYears": fy})

        if df_in.empty:
            print("  WARNING: Entire dataset removed by country filter.")
            continue

        # ---------------------------------------------------------------------
        # 2. TARGET COLUMN DETECTION AND NORMALIZATION
        # ---------------------------------------------------------------------
        base_target = input_path.stem.lower()
        anomaly_target = f"{base_target}_anomaly"
        normalized_target = base_target

        if base_target in df_in.columns:
            target_col = base_target
            print(f"  Target column found: '{target_col}'")
        elif anomaly_target in df_in.columns:
            target_col = anomaly_target
            print(f"  Base target '{base_target}' not found. Using '{target_col}'.")
        else:
            raise KeyError(f"Neither '{base_target}' nor '{anomaly_target}' found.")

        # ---------------------------------------------------------------------
        # 3. APPLY CLEANING (With Internal Tracking)
        # ---------------------------------------------------------------------
        # We pass file_tracker so the function can append steps directly
        result_clean = clean_single_result(df_in, target_col, file_tracker)

        if target_col != normalized_target and target_col in result_clean.columns:
            result_clean = result_clean.rename(columns={target_col: normalized_target})
            print(f"  Renamed column '{target_col}' to '{normalized_target}'.")

        # ---------------------------------------------------------------------
        # 4. ANOMALY-SPECIFIC FILTERS
        # ---------------------------------------------------------------------
        value_col = normalized_target
        filter_name = "None"

        if value_col.lower() == "es":
            before = len(result_clean)
            result_clean = result_clean[result_clean[value_col] <= 600].reset_index(drop=True)
            filter_name = "ES (<= 600)"
            print(f"  ES filter applied: {before - len(result_clean)} rows dropped.")

        elif value_col.lower() == "poa":
            before = len(result_clean)
            result_clean = result_clean[
                (result_clean[value_col] <= 6000) & (result_clean[value_col] >= -50)
            ].reset_index(drop=True)
            filter_name = "POA (-50 to 6000)"
            print(f"  POA filter applied: {before - len(result_clean)} rows dropped.")

        elif value_col.lower() == "pta":
            before = len(result_clean)
            result_clean = result_clean[
                (result_clean[value_col] <= 6000) & (result_clean[value_col] >= -20)
            ].reset_index(drop=True)
            filter_name = "PTA (-20 to 6000)"
            print(f"  PTA filter applied: {before - len(result_clean)} rows dropped.")

        # LOG: Anomaly Filter
        r, fy = get_counts(result_clean)
        file_tracker.append({"Step": f"3. Anomaly Filter [{filter_name}]", "Rows": r, "FirmYears": fy})

        # ---------------------------------------------------------------------
        # 5. REMOVE DUPLICATES (Earliest PIT Date)
        # ---------------------------------------------------------------------
        if "PIT Date" in result_clean.columns:
            result_clean["PIT Date"] = pd.to_datetime(result_clean["PIT Date"], errors="coerce")
            
            result_clean = result_clean.sort_values(
                by=["ID", "FiscalPeriod", value_col, "PIT Date"]
            )

            before_dups = len(result_clean)
            result_clean = result_clean.drop_duplicates(
                subset=["ID", "FiscalPeriod", value_col],
                keep="first"
            ).reset_index(drop=True)
            print(f"  Removed {before_dups - len(result_clean)} duplicates.")
            
            # LOG: Dedup
            r, fy = get_counts(result_clean)
            file_tracker.append({"Step": "4. Dedup (Earliest PIT)", "Rows": r, "FirmYears": fy})
        else:
            r, fy = get_counts(result_clean)
            file_tracker.append({"Step": "4. Dedup (Skipped - No PIT)", "Rows": r, "FirmYears": fy})

        # ---------------------------------------------------------------------
        # 6. [NEW] OUTLIER TRIMMING (Top/Bottom 0.5% per Country)
        # ---------------------------------------------------------------------
        # Re-derive country_code because it was dropped earlier
        result_clean["country_code"] = result_clean["ID"].astype(str).str[1:4]
        
        # Calculate quantiles per country
        # 0.5% = 0.005, 99.5% = 0.995
        lower_q = result_clean.groupby("country_code")[value_col].transform(lambda x: x.quantile(0.005))
        upper_q = result_clean.groupby("country_code")[value_col].transform(lambda x: x.quantile(0.995))
        
        before_trim = len(result_clean)
        
        # Filter: Keep rows that are between lower and upper bounds (inclusive)
        # Note: Using inclusive bounds. Adjust to exclusive if strictly desired.
        mask_keep = (result_clean[value_col] >= lower_q) & (result_clean[value_col] <= upper_q)
        result_clean = result_clean[mask_keep].reset_index(drop=True)
        
        # Clean up helper column
        result_clean.drop(columns=["country_code"], inplace=True)
        
        print(f"  Trimmed {before_trim - len(result_clean)} outlier rows (Top/Bottom 0.5% per country).")
        
        # LOG: Trim
        r, fy = get_counts(result_clean)
        file_tracker.append({"Step": "5. Outlier Trim (0.5%)", "Rows": r, "FirmYears": fy})

        # ---------------------------------------------------------------------
        # PRINT LOSS REPORT
        # ---------------------------------------------------------------------
        df_tracker = pd.DataFrame(file_tracker)
        
        # Calculate drops
        df_tracker["Rows Dropped"] = df_tracker["Rows"].diff().fillna(0) * -1
        df_tracker["FY Dropped"] = df_tracker["FirmYears"].diff().fillna(0) * -1
        
        print("\n" + "-"*40)
        print("DATA LOSS ANALYSIS REPORT")
        print("-" * 40)
        print(df_tracker[["Step", "Rows", "Rows Dropped", "FirmYears", "FY Dropped"]].to_string(index=False))
        print("-" * 40)

        # ---------------------------------------------------------------------
        # SAVE CLEANED OUTPUT
        # ---------------------------------------------------------------------
        output_path = input_path.with_name(f"{input_path.stem}_clean{input_path.suffix}")
        result_clean.to_csv(output_path, sep="|", index=False)
        print(f"  Saved to: {output_path}")

        # ---------------------------------------------------------------------
        # PREVIEW
        # ---------------------------------------------------------------------
        preview_cols = [
            c for c in ["ID", "HistCurrency", "PIT Date", "FiscalPeriod", value_col]
            if c in result_clean.columns
        ]
        print("\nPreview:")
        print(result_clean[preview_cols].head(5))

    except Exception as e:
        print(f"  ERROR processing {input_path.name}: {e}")

# =============================================================================
# COUNTRY REMOVAL SUMMARY (GLOBAL)
# =============================================================================
if country_removal_log:
    country_removal_log_df = (
        pd.DataFrame(country_removal_log)
        .sort_values(by=["dataset", "country_code"])
        .reset_index(drop=True)
    )
    print("\nSUMMARY OF REMOVED COUNTRIES (INSUFFICIENT UNIQUE FIRMS):")
    print(country_removal_log_df)
else:
    print("\nNo countries were removed in any dataset.")


Processing file: ACC.txt
  Rows BEFORE: 4218013
  Firm-years BEFORE: 886391
  Target column found: 'acc'
  Using target column: 'acc'
  Dropping 5 rows with ±infinity in 'acc'.
  Dropped 3666407 empty rows in 'acc'.
  Dropped 5265 stale rows (>3 years old).
  Removed 1601 duplicates.
  Trimmed 4456 outlier rows (Top/Bottom 0.5% per country).

----------------------------------------
DATA LOSS ANALYSIS REPORT
----------------------------------------
                     Step    Rows  Rows Dropped  FirmYears  FY Dropped
              1. Raw Load 4218013          -0.0     886391        -0.0
        2. Country Filter 4190972       27041.0     880071      6320.0
   Clean: Remove Infinity 4190967           5.0     880071        -0.0
 Clean: Remove NaNs/Empty  524560     3666407.0     398911    481160.0
Clean: Valid FiscalPeriod  524560          -0.0     398911        -0.0
 Filter: Best FP per Date  446396       78164.0     364241     34670.0
   Filter: Stale (>3 Yrs)  441131        5265.0  

# 3.0. Factors

## Data Preparation

### Create Base Dataset (Fundamentals, PIT Logic)

In [117]:
"""
Summary:
This cell
1) reads multiple input files from OUTPUT_DIR,
2) builds a base table with all unique combinations of
   (ID, PIT Date, HistCurrency, FiscalPeriod, AnnPITValue_Period),
3) for each input file, performs an "as-of" merge:
   for every base row it takes the latest AnnPITValue from that dataset
   with the same (ID, HistCurrency, FiscalPeriod) and PIT Date <= base PIT Date.
   If the dataset has AnnPITValue_Period, the merge is also grouped by that
   period label; if it does NOT have AnnPITValue_Period, it is treated as
   period-agnostic ("can fit all" periods).
4) writes the final combined view to OUTPUT_FILE in OUTPUT_DIR.

Additional tracking added:
- For each imported dataset, the code prints how many unique
  firm-year combinations exist (unique ID × FiscalPeriod).
- After all merging is complete, the code counts how many
  firm-year combinations have **no missing values** across ANY
  of the added value columns.
- At the end, the code reports how many missing values exist for each
  value column (ca, cce, cl, etc.).

The final table has the columns:
    ID, PIT Date, HistCurrency, FiscalPeriod, AnnPITValue_Period,
    Value1, Value2, ..., ValueN
where each ValueX is defined by VALUE_COLUMN_NAMES in the config.
"""

from pathlib import Path
import time
import pandas as pd

# === CONFIG ===

OUTPUT_DIR = Path(Temp_file_path_A)
SEP = "|"

INPUT_FILES = [
    "Total_Assets.txt",
    "Net_Sales_or_Revenues.txt",
    "Cost_of_Goods_Sold_Excl_Depreciation.txt",
    "Selling_General__Administrative_Expenses.txt",
    "Interest_Expense___Total.txt",
    "Common_Equity.txt",
    "Deferred_Taxes.txt",

    
    # add more file names here if needed ...
]

VALUE_COLUMN_NAMES = [
    "at",
    "rev",
    "cogs",
    "sga",
    "int",
    "ce",
    "dt",
    
    # add more names here, one for each input file ...
]

OUTPUT_FILE = "data_Factors.txt"

ID_COL = "ID"
PIT_DATE_COL = "PIT Date"
HIST_CURR_COL = "HistCurrency"
FISCAL_PER_COL = "FiscalPeriod"
VALUE_COL = "AnnPITValue"
PERIOD_COL = "AnnPITValue_Period"   # NEW: period label column

BASE_COLS = [ID_COL, PIT_DATE_COL, HIST_CURR_COL, FISCAL_PER_COL]

SAVE_INTERMEDIATE = False
# ==============


# --- SANITY CHECK ---
if not OUTPUT_DIR.exists():
    raise FileNotFoundError(
        f"OUTPUT_DIR does not exist:\n{OUTPUT_DIR}\n"
        f"Please make sure Temp_file_path_A is set correctly."
    )
print("Using data folder:", OUTPUT_DIR.resolve())
# ---------------------


def load_dataset(path: Path) -> pd.DataFrame:
    """
    Load a single dataset from path and keep only the relevant columns.

    The function:
    - checks if the file exists,
    - reads it using the configured separator,
    - checks that all required columns are present,
    - keeps AnnPITValue_Period if present (and creates it as NA if not),
    - converts PIT Date to datetime,
    - casts ID to string,
    - converts AnnPITValue to numeric.
    """
    if not path.exists():
        raise FileNotFoundError(f"File not found: {path}")

    df = pd.read_csv(path, sep=SEP)

    needed_cols = BASE_COLS + [VALUE_COL]
    missing = [c for c in needed_cols if c not in df.columns]
    if missing:
        raise ValueError(f"Required columns {missing} are missing in file: {path}")

    # Keep base + AnnPITValue + (optional) AnnPITValue_Period
    extra_cols = [PERIOD_COL] if PERIOD_COL in df.columns else []
    df = df[needed_cols + extra_cols].copy()

    # Ensure AnnPITValue_Period exists in all datasets
    if PERIOD_COL not in df.columns:
        df[PERIOD_COL] = pd.NA

    # Re-order columns for consistency
    df = df[BASE_COLS + [PERIOD_COL, VALUE_COL]]

    # Ensure consistent data types
    df[PIT_DATE_COL] = pd.to_datetime(df[PIT_DATE_COL])
    df[ID_COL] = df[ID_COL].astype(str)
    df[VALUE_COL] = pd.to_numeric(df[VALUE_COL], errors="coerce")

    return df


def build_base_frame(dfs: list[pd.DataFrame]) -> pd.DataFrame:
    """
    Build the base dataset from a list of dataframes.

    Steps:
    - stack the identifier columns (including AnnPITValue_Period) from all dataframes,
    - drop duplicate combinations,
    - enforce correct data types,
    - sort the base dataset by (ID, HistCurrency, FiscalPeriod, AnnPITValue_Period, PIT Date).

    The result is the "skeleton" upon which all value columns will be merged.
    """
    # Concatenate identifier columns from all datasets (base cols + period col)
    base = pd.concat(
        [df[BASE_COLS + [PERIOD_COL]] for df in dfs],
        ignore_index=True
    )

    # Remove duplicate rows of identifiers
    base = base.drop_duplicates().reset_index(drop=True)

    # Ensure consistent types
    base[PIT_DATE_COL] = pd.to_datetime(base[PIT_DATE_COL])
    base[ID_COL] = base[ID_COL].astype(str)

    # Sort by identifier columns and date
    sort_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL, PERIOD_COL, PIT_DATE_COL]
    base = base.sort_values(sort_cols).reset_index(drop=True)

    return base


def asof_merge_one(base: pd.DataFrame, df: pd.DataFrame, new_col_name: str) -> pd.DataFrame:
    """
    Perform an "as-of" merge of one dataset into the base dataframe.

    Concept:
    - For each combination of (ID, HistCurrency, FiscalPeriod)
      and for each PIT Date in the base table,
      we want the latest AnnPITValue from df with
          df.PIT Date <= base.PIT Date
      for the same (ID, HistCurrency, FiscalPeriod).
    - If df has a non-empty AnnPITValue_Period column, the as-of grouping
      is also done by AnnPITValue_Period.
      If AnnPITValue_Period is all missing (e.g. revenue), df is treated
      as period-agnostic and can "fit all" periods.

    Implementation (vectorized, without merge_asof):
      (same as before, but grouping keys dynamically include PERIOD_COL
       only for period-aware datasets)
    """
    # Determine if this dataset is period-aware (has any non-NA period labels)
    has_period = PERIOD_COL in df.columns and df[PERIOD_COL].notna().any()

    # Work on copies to avoid modifying original base/df
    base_tmp = base.copy()
    base_tmp[new_col_name] = pd.NA
    base_tmp["__marker"] = "base"

    # Keep only identifier columns and the value column from df, then rename
    if has_period:
        df_tmp = df[BASE_COLS + [PERIOD_COL, VALUE_COL]].copy()
    else:
        df_tmp = df[BASE_COLS + [VALUE_COL]].copy()
        # Ensure PERIOD_COL exists but remains NA (period-agnostic)
        df_tmp[PERIOD_COL] = pd.NA

    df_tmp = df_tmp.rename(columns={VALUE_COL: new_col_name})
    df_tmp["__marker"] = "df"

    # Ensure consistent types for safety
    base_tmp[ID_COL] = base_tmp[ID_COL].astype(str)
    df_tmp[ID_COL] = df_tmp[ID_COL].astype(str)
    base_tmp[PIT_DATE_COL] = pd.to_datetime(base_tmp[PIT_DATE_COL])
    df_tmp[PIT_DATE_COL] = pd.to_datetime(df_tmp[PIT_DATE_COL])

    base_tmp[new_col_name] = pd.to_numeric(base_tmp[new_col_name], errors="coerce")
    df_tmp[new_col_name] = pd.to_numeric(df_tmp[new_col_name], errors="coerce")

    # Concatenate base rows and df rows
    combined = pd.concat([base_tmp, df_tmp], ignore_index=True)

    # Define an order so that df rows come before base rows on the same PIT Date
    marker_order = {"df": 0, "base": 1}
    combined["__order"] = combined["__marker"].map(marker_order).astype("int8")

    # Build sort and group keys
    group_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL]
    sort_cols = [ID_COL, HIST_CURR_COL, FISCAL_PER_COL]

    if has_period:
        # Period-aware dataset: also group by AnnPITValue_Period
        group_cols.append(PERIOD_COL)
        sort_cols.append(PERIOD_COL)

    sort_cols += [PIT_DATE_COL, "__order"]

    # Sort accordingly
    combined = combined.sort_values(sort_cols)

    # Forward-fill the value within each group to implement "as-of" logic
    combined[new_col_name] = combined.groupby(group_cols)[new_col_name].ffill()

    # Keep only rows that belong to the base dataset
    result = combined[combined["__marker"] == "base"].copy()

    # Drop helper columns and sort final result
    result = result.drop(columns=["__marker", "__order"])
    result = result.sort_values(
        [ID_COL, HIST_CURR_COL, FISCAL_PER_COL, PERIOD_COL, PIT_DATE_COL]
    ).reset_index(drop=True)

    return result


def build_and_save_variable(
    input_files,
    value_column_names,
    output_file,
    save_intermediate: bool = False,
) -> Path:
    """
    Build the final "view" for a variable based on multiple input files and save it.

    Steps:
    1) Validate arguments (non-empty, same length for files and column names).
    2) Load and preprocess each input file.
    3) Build the base dataset with all unique identifier combinations
       (including AnnPITValue_Period).
    4) For each input dataframe, perform an as-of merge of its AnnPITValue into the base,
       using AnnPITValue_Period in the grouping if present in that dataset.
    5) Keep only the base columns and the value columns.
    6) Write the final result to output_file in OUTPUT_DIR.
    """
    if len(input_files) == 0:
        raise ValueError("No INPUT_FILES were provided.")
    if len(input_files) != len(value_column_names):
        raise ValueError("INPUT_FILES and VALUE_COLUMN_NAMES must have the same length.")

    start_total = time.time()

    # Build full paths
    paths = [OUTPUT_DIR / f for f in input_files]

    print("\n--- Loading input files ---")
    t0 = time.time()
    dfs = [load_dataset(p) for p in paths]
    print(f"Loading and preprocessing finished in {time.time() - t0:.1f} seconds.")

    # Unique firm-year counts per input dataset
    print("\n--- Unique firm-year (ID, FiscalPeriod) counts per input file ---")
    for path, df in zip(paths, dfs):
        n_firm_years = df[[ID_COL, FISCAL_PER_COL]].drop_duplicates().shape[0]
        print(f"{path.name}: {n_firm_years:,} unique (ID, FiscalPeriod) combinations")

    print("\n--- Building base dataset ---")
    t0 = time.time()
    base = build_base_frame(dfs)
    print(
        f"Base dataset has {len(base):,} rows and was built in "
        f"{time.time() - t0:.1f} seconds."
    )

    print("\n--- Starting as-of merges ---")
    result = base
    for idx, (df, col_name) in enumerate(zip(dfs, value_column_names), start=1):
        print(f"[{idx}/{len(dfs)}] Merging value column '{col_name}' ...")
        t_merge = time.time()
        result = asof_merge_one(result, df, col_name)
        print(
            f"    Done in {time.time() - t_merge:.1f} seconds. "
            f"Result currently has {len(result):,} rows."
        )

        if save_intermediate:
            stem = output_file.rsplit(".", 1)[0]
            temp_out = OUTPUT_DIR / f"{stem}_partial_{idx}.txt"
            result.to_csv(temp_out, sep=SEP, index=False)
            print(f"    Intermediate file written to: {temp_out}")

    # Keep only the base identifier columns (including period) and the value columns
    final_cols = BASE_COLS + [PERIOD_COL] + value_column_names
    result = result[final_cols]

    # Final stats on firm-years and missing values
    print("\n--- Final dataset statistics ---")
    mask_complete = result[value_column_names].notna().all(axis=1)
    complete_firm_years = (
        result.loc[mask_complete, [ID_COL, FISCAL_PER_COL]]
        .drop_duplicates()
    )
    n_complete_firm_years = complete_firm_years.shape[0]
    print(
        f"Unique (ID, FiscalPeriod) combinations with ALL value columns non-empty: "
        f"{n_complete_firm_years:,}"
    )

    print("\nEmpty (NaN) values per value column:")
    for col in value_column_names:
        n_missing = result[col].isna().sum()
        print(f"  - {col}: {n_missing:,} empty values")

    # Final output
    out_path = OUTPUT_DIR / output_file
    result.to_csv(out_path, sep=SEP, index=False)

    print(f"\nFinal view written to:\n{out_path.resolve()}")
    print(f"Total runtime: {time.time() - start_total:.1f} seconds.")

    return out_path


# --- Execution ---
out_path = build_and_save_variable(
    input_files=INPUT_FILES,
    value_column_names=VALUE_COLUMN_NAMES,
    output_file=OUTPUT_FILE,
    save_intermediate=SAVE_INTERMEDIATE,
)


Using data folder: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies

--- Loading input files ---
Loading and preprocessing finished in 12.1 seconds.

--- Unique firm-year (ID, FiscalPeriod) counts per input file ---
Total_Assets.txt: 871,391 unique (ID, FiscalPeriod) combinations
Net_Sales_or_Revenues.txt: 817,863 unique (ID, FiscalPeriod) combinations
Cost_of_Goods_Sold_Excl_Depreciation.txt: 818,491 unique (ID, FiscalPeriod) combinations
Selling_General__Administrative_Expenses.txt: 721,610 unique (ID, FiscalPeriod) combinations
Interest_Expense___Total.txt: 108,197 unique (ID, FiscalPeriod) combinations
Common_Equity.txt: 883,328 unique (ID, FiscalPeriod) combinations
Deferred_Taxes.txt: 750,382 unique (ID, FiscalPeriod) combinations

--- Building base dataset ---
Base dataset has 4,436,437 rows and was built in 10.6 seconds.

--- Starting as-of merges ---
[1/7] Merging value column 'at' ...
    Done in 8.7 seconds. Result currently has 4,436,437 rows.
[2/7] Merging value co

### Calculate In Between Variables

In [118]:
# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file (INPUT_FILE) from OUTPUT_DIR.
# 2) Creates a new column 'be'.
# 3) LOGIC:
#    - be = ce + dt (if both exist)
#    - be = ce      (if ce exists, but dt is missing)
#    - be = NaN     (if ce is missing, even if dt exists)
# 4) The updated DataFrame is saved to disk.
# =============================================================================

from pathlib import Path
import pandas as pd
import numpy as np

# ========================= CONFIG ============================================
OUTPUT_DIR = Path(Temp_file_path_A)
SEP = "|"

INPUT_FILE = "data_Factors.txt"               
NEW_COLUMN = "be"
# ============================================================================

# 1) LOAD INPUT FILE
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

df = pd.read_csv(path, sep=SEP)

# 2) CHECK THAT MANDATORY COLUMNS EXIST IN DATAFRAME
# 'ce' is strictly required for the calculation logic to valid
required_cols = ["ce", "dt"]
missing_cols = [c for c in required_cols if c not in df.columns]
if missing_cols:
    raise ValueError(f"{INPUT_FILE}: missing columns: {missing_cols}")

# 3) INITIALIZE NEW COLUMN WITH NaN
df_out = df.copy()
df_out[NEW_COLUMN] = np.nan

# 4) DEFINE CALCULATION LOGIC
# Rule: Calculation is valid ONLY if 'ce' is not null.
mask_valid_rows = df_out["ce"].notna()

# Create a temporary 'dt' series where NaNs are treated as 0.
# We do not overwrite the original 'dt' in df_out, we just use this for the math.
dt_safe_for_math = df_out["dt"].fillna(0)

# 5) APPLY CALCULATION
# We take 'ce' and add the safe version of 'dt'.
# Because we filter with df_out.loc[mask_valid_rows], rows where 'ce' is NaN 
# remain NaN in the NEW_COLUMN.
df_out.loc[mask_valid_rows, NEW_COLUMN] = (
    df_out.loc[mask_valid_rows, "ce"] + dt_safe_for_math.loc[mask_valid_rows]
)

# 6) SAVE RESULT
formula_output_filename = f"{INPUT_FILE}"
formula_output_path = OUTPUT_DIR / formula_output_filename
df_out.to_csv(formula_output_path, sep=SEP, index=False)

print(f"New column '{NEW_COLUMN}' created.")
print("Logic applied: be = ce + dt (where dt is 0 if missing). If ce missing -> NaN.")
print("Rows calculated:", mask_valid_rows.sum())
print("Result was saved to:", formula_output_path)
print("-" * 30)
print(df_out[["ce", "dt", NEW_COLUMN]].head(20))

New column 'be' created.
Logic applied: be = ce + dt (where dt is 0 if missing). If ce missing -> NaN.
Rows calculated: 2420143
Result was saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/data_Factors.txt
------------------------------
            ce        dt          be
0   378.411011       NaN  378.411011
1          NaN       NaN         NaN
2   434.756779       NaN  434.756779
3          NaN       NaN         NaN
4   518.204722       NaN  518.204722
5          NaN       NaN         NaN
6   386.142780       NaN  386.142780
7          NaN       NaN         NaN
8   387.109014       NaN  387.109014
9          NaN       NaN         NaN
10  292.572607  0.000000  292.572607
11         NaN       NaN         NaN
12         NaN       NaN         NaN
13   72.674506  1.902010   74.576516
14         NaN       NaN         NaN
15  112.352977  1.252426  113.605403
16         NaN       NaN         NaN
17         NaN       NaN         NaN
18    0.129111       NaN    0.129111
19    

### Add Lagged Columns (PIT Logic)

In [119]:
from pathlib import Path
import pandas as pd

# =============================================================================
# SUMMARY
# --------
# This cell:
# 1) Loads an input file from OUTPUT_DIR.
# 2) Computes, for each column in VALUE_COLUMNS, a lagged column "<col>_lag1".
#    - For each (ID, HistCurrency) group, sorted by PIT Date and FiscalPeriod,
#      "<col>_lag1" is the most recent known value of <col> for
#      FiscalPeriod - 1 *and the same AnnPITValue_Period* based on all PIT
#      updates observed up to that row.
#      Example: 2020 Q1 will lag to 2019 Q1 (same period label), not to any
#      value from 2020.
# 3) Keeps all rows (no deletions), only converts types and appends lag columns.
# 4) Saves the resulting DataFrame to the same folder as:
#       processed_<INPUT_FILE>
#    e.g. INPUT_FILE = "ag.txt"  ->  "processed_ag.txt"
#
# The final result is stored in the variable `result`.
# =============================================================================

# ================= CONFIG =================
OUTPUT_DIR = Path(Temp_file_path_A)   # Base directory for input/output files
SEP = "|"                             # Delimiter used in the text files

INPUT_FILE = "data_Factors.txt"          # Name of the input file to read
VALUE_COLUMNS = ["at"]         # Columns for which to compute the lag "<col>_lag1"

PERIOD_COL = "AnnPITValue_Period"    # NEW: period label used for lag (Q1, Q2, A, etc.)
# ==========================================


# =============================================================================
# 1) LOAD INPUT FILE (NO ROWS DROPPED)
# =============================================================================
path = OUTPUT_DIR / INPUT_FILE
if not path.exists():
    raise FileNotFoundError(f"{INPUT_FILE}: file not found in {OUTPUT_DIR}")

# Read raw data
df = pd.read_csv(path, sep=SEP)

# Required base columns for the PIT logic
required_base_cols = ["ID", "PIT Date", "HistCurrency", "FiscalPeriod", PERIOD_COL]
missing_base = [c for c in required_base_cols if c not in df.columns]
if missing_base:
    raise ValueError(f"{INPUT_FILE}: missing required base columns {missing_base}")

# Check that all requested value columns exist
missing_value_cols = [c for c in VALUE_COLUMNS if c not in df.columns]
if missing_value_cols:
    raise ValueError(
        f"{INPUT_FILE}: missing value columns specified in VALUE_COLUMNS: {missing_value_cols}"
    )

# Type casting only, no row drops
df["PIT Date"] = pd.to_datetime(df["PIT Date"], errors="coerce")
df["FiscalPeriod"] = pd.to_numeric(df["FiscalPeriod"], errors="coerce")
# Make sure period labels are strings (or NaN)
df[PERIOD_COL] = df[PERIOD_COL].astype("string")

# Work on a copy to keep the original df untouched
result = df.copy()

print("Input dataset loaded into `result` (no rows dropped).")
print("Columns in result:")
print(list(result.columns))


# =============================================================================
# 2) COMPUTE LAG COLUMNS "<col>_lag1" FOR EACH COLUMN IN VALUE_COLUMNS
# =============================================================================
# NEW LOGIC:
# For each group (ID, HistCurrency) along the PIT timeline:
# - We maintain, for each (FiscalPeriod, AnnPITValue_Period) pair, the latest PIT
#   and latest known values for VALUE_COLUMNS.
# - For a row with (FP = t, period = P), the lag "<col>_lag1" is the last known
#   value of <col> for (FP = t-1, period = P) based on all PIT updates observed
#   up to (and including) the current PIT Date.
#
# If AnnPITValue_Period is missing, we fall back to using only FiscalPeriod
# (i.e., we treat the period label as None).

df_calc = result.copy()

def compute_lags_for_group(group: pd.DataFrame) -> pd.DataFrame:
    """
    Compute lagged values "<col>_lag1" for all columns in VALUE_COLUMNS
    within a single (ID, HistCurrency) group, using:
      - FiscalPeriod (year), and
      - AnnPITValue_Period (period label) for matching the lag.
    """
    # Sort chronologically so updates are processed in correct temporal order
    group = group.sort_values(["PIT Date", "FiscalPeriod"], ascending=[True, True])

    # last_for_fp stores, for each (FiscalPeriod, PeriodLabel), the latest PIT
    # and the known values for VALUE_COLUMNS.
    # Structure:
    #   last_for_fp = {
    #       (fp, period_label): (last_pit, {col: last_value_for_col, ...})
    #   }
    last_for_fp = {}

    # For each value column, we collect the lag values row by row
    lag_values = {col: [] for col in VALUE_COLUMNS}

    # Iterate through rows in PIT and FiscalPeriod order
    for _, row in group.iterrows():
        pit = row["PIT Date"]
        fp = row["FiscalPeriod"]
        period_label_raw = row.get(PERIOD_COL, pd.NA)
        # Use None as key for missing labels to keep keys hashable and consistent
        period_label = None if pd.isna(period_label_raw) else str(period_label_raw)

        # 1) Determine lag values for each column in VALUE_COLUMNS
        for col in VALUE_COLUMNS:
            lag_val = None
            if pd.notna(fp):
                target_fp = fp - 1
                target_key = (target_fp, period_label)
                info = last_for_fp.get(target_key)
                if info is not None:
                    # info[1] is the dict of last known values for that target (FP, period_label)
                    lag_val = info[1].get(col)
            lag_values[col].append(lag_val)

        # 2) Update our knowledge for the current (FiscalPeriod, period_label)
        if pd.notna(fp) and pd.notna(pit):
            key_curr = (fp, period_label)
            prev = last_for_fp.get(key_curr)

            # Previously stored values for this key (if any)
            prev_values = prev[1] if prev is not None else {}

            # Create a copy so we do not mutate the previous dict reference
            updated_values = dict(prev_values)

            # Update with any non-NaN values from this row
            for col in VALUE_COLUMNS:
                v = row[col]
                if pd.notna(v):
                    updated_values[col] = v

            # Only overwrite if this PIT is newer or equal to the previous PIT
            if prev is None or pit >= prev[0]:
                last_for_fp[key_curr] = (pit, updated_values)

    # 3) Attach lag columns "<col>_lag1" to the group DataFrame
    for col in VALUE_COLUMNS:
        group[f"{col}_lag1"] = lag_values[col]

    return group


# Apply the lag computation per (ID, HistCurrency) group
df_calc = (
    df_calc
    .groupby(["ID", "HistCurrency"], dropna=False, group_keys=False)
    .apply(compute_lags_for_group)
)

# Optionally convert FiscalPeriod back to Int64 (nullable integer)
df_calc["FiscalPeriod"] = pd.to_numeric(df_calc["FiscalPeriod"], errors="coerce").astype("Int64")

# Final result
result = df_calc

print("\nLag columns created and added to `result`:")
print([c for c in result.columns if any(c == v or c == f"{v}_lag1" for v in VALUE_COLUMNS)])

# Optional preview
cols_to_show = [
    "ID", "HistCurrency", "PIT Date", "FiscalPeriod", PERIOD_COL
] + VALUE_COLUMNS + [f"{c}_lag1" for c in VALUE_COLUMNS]
print(result[cols_to_show].head(40))


# =============================================================================
# 3) SAVE RESULTING FILE AS "processed_<INPUT_FILE>" IN THE SAME FOLDER
# =============================================================================
output_filename = f"processed_{INPUT_FILE}"
output_path = OUTPUT_DIR / output_filename

result.to_csv(output_path, sep=SEP, index=False)

print(f"\nProcessed file saved to: {output_path}")


Input dataset loaded into `result` (no rows dropped).
Columns in result:
['ID', 'PIT Date', 'HistCurrency', 'FiscalPeriod', 'AnnPITValue_Period', 'at', 'rev', 'cogs', 'sga', 'int', 'ce', 'dt', 'be']


  .apply(compute_lags_for_group)
  .apply(compute_lags_for_group)



Lag columns created and added to `result`:
['at', 'at_lag1']
           ID HistCurrency   PIT Date  FiscalPeriod AnnPITValue_Period  \
0   C02500770          Ars 1995-12-29          1992                  A   
1   C02500770          Ars 1995-12-29          1992               <NA>   
2   C02500770          Ars 1995-12-29          1993                  A   
3   C02500770          Ars 1995-12-29          1993               <NA>   
4   C02500770          Ars 1995-12-29          1994                  A   
5   C02500770          Ars 1995-12-29          1994               <NA>   
6   C02500770          Ars 1996-05-03          1995                  A   
7   C02500770          Ars 1996-05-03          1995               <NA>   
8   C02500770          Ars 1998-07-03          1996                  A   
9   C02500770          Ars 1998-07-03          1996               <NA>   
10  C02500770          Ars 1998-07-03          1997                  A   
11  C02500770          Ars 1998-07-03          199

### Merge with Market Data (PIT Logic, Forward Filled until Update)

In [120]:
# =============================================================================
# MERGE BENCHMARK WITH FACTORS (Forward Fill / As-Of Logic)
#
# Inputs:
#   - FF_Benchmark_data_clean.txt (Benchmark - Daily)
#   - processed_data_Factors.txt (Factors - Sparse)
#   - (REMOVED) CurrencyMapping.txt
#
# Logic:
#   1. Factors Deduplication (Simplified priority logic).
#   2. MERGE ASOF (Backward): For every Benchmark day, find the most recent 
#      past Factor row for that ID. This "writes forward" the data.
#   3. (REMOVED) Post-Merge Filter
#   4. Track dropped rows.
#
# Output:
#   - FF_Benchmark_Factors_Merged_Clean.txt
#   - DroppedFromFFMerge.txt
# =============================================================================

import pandas as pd
import numpy as np
import gc

# =====================================================================
# CONFIGURATION
# =====================================================================
benchmark_file = f"{Temp_file_path_DP}/FF_Benchmark_data_clean.txt"
factors_file   = f"{Temp_file_path_A}/processed_data_Factors.txt"
# mapping_file   = f"{Input_file_path}/CurrencyMapping.txt" # REMOVED

merged_clean_file = f"{Temp_file_path_A}/FF_Benchmark_Factors_Merged_Clean.txt"
dropped_log_file  = f"{Temp_file_path_A}/DroppedFromFFMerge.txt"

print(f"Benchmark: {benchmark_file}")
print(f"Factors:   {factors_file}")
# print(f"Mapping:   {mapping_file}")
print(f"Output:    {merged_clean_file}")

# =====================================================================
# STEP 1: Load Currency Mapping (SKIPPED)
# =====================================================================
# print("\nLoading Currency Mapping...")
# map_df = pd.read_csv(...)
# valid_country_hist_set = ...
# valid_full_combo_set   = ...
# print(f"Loaded {len(map_df)} mapping rules.")

# =====================================================================
# STEP 2: Load & Deduplicate Factors
# =====================================================================
print("\nLoading and Deduplicating Factors Data...")
df_fact = pd.read_csv(factors_file, sep="|", dtype="string", engine="c")

# 2a. Convert Date (Crucial for merge_asof)
print("Converting Factor dates to datetime...")
df_fact["PIT Date"] = pd.to_datetime(df_fact["PIT Date"])

# 2b. Sorting Helpers
df_fact["_Country"] = df_fact["ID"].str[1:4]
df_fact["_FiscalPeriod_Num"] = pd.to_numeric(df_fact["FiscalPeriod"], errors='coerce').fillna(-9999)
# Fix for boolean ambiguity: fillna(False) before casting
is_a_mask = (df_fact["AnnPITValue_Period"] == "A").fillna(False)
df_fact["_IsAnnPIT_A"] = is_a_mask.astype(int)

# ### REMOVED: Check valid history currency (Requires mapping file)
# fact_tuples = zip(df_fact["_Country"], df_fact["HistCurrency"])
# df_fact["_IsValidHistCurr"] = [1 if t in valid_country_hist_set else 0 for t in fact_tuples]

# 2c. Sort and Deduplicate
print("Sorting factors to resolve duplicates...")
# REMOVED "_IsValidHistCurr" from sort keys
df_fact.sort_values(
    by=["ID", "PIT Date", "_FiscalPeriod_Num", "_IsAnnPIT_A"], # removed _IsValidHistCurr
    ascending=[True, True, False, False],                      # removed False
    inplace=True
)

print("Dropping duplicates (keeping top row per ID x PIT Date)...")
df_fact_dedup = df_fact.drop_duplicates(subset=["ID", "PIT Date"], keep="first")
print(f"Unique Factor Dates (ID/Date pairs): {len(df_fact_dedup):,}")

# Clean up helpers
# REMOVED "_IsValidHistCurr" from drops
cols_to_drop = ["_Country", "_FiscalPeriod_Num", "_IsAnnPIT_A"]
df_fact_dedup = df_fact_dedup.drop(columns=cols_to_drop)

# Sort strictly by Date for merge_asof
df_fact_dedup = df_fact_dedup.sort_values("PIT Date")

del df_fact, is_a_mask # removed fact_tuples
gc.collect()

# =====================================================================
# STEP 3: Load Benchmark & Merge As-Of (Forward Fill)
# =====================================================================
print("\nLoading Benchmark Data...")
df_bench = pd.read_csv(benchmark_file, sep="|", dtype="string", engine="c")
print(f"Benchmark rows: {len(df_bench):,}")

print("Converting Benchmark dates to datetime...")
df_bench["DayDate"] = pd.to_datetime(df_bench["DayDate"])

print("Sorting Benchmark by DayDate (Required for merge_asof)...")
df_bench = df_bench.sort_values("DayDate")

print("Merging (As-Of Backward: Writing Forward)...")
# merge_asof does NOT support 'indicator=True', so we must detect drops manually later.
merged = pd.merge_asof(
    df_bench,
    df_fact_dedup,
    left_on="DayDate",
    right_on="PIT Date",
    by="ID",                # Match strictly on ID
    direction="backward"    # Look for last available date <= Benchmark Date
)

# Free input memory
del df_bench, df_fact_dedup
gc.collect()

# =====================================================================
# STEP 4: Post-Merge Filtering
# =====================================================================
print("\nApplying Post-Merge Filters...")

# Initialize DropReason as OBJECT type to prevent FutureWarning
merged["DropReason"] = np.nan
merged["DropReason"] = merged["DropReason"].astype("object")

# Condition A: No Factor Data Found
merged.loc[merged["PIT Date"].isna(), "DropReason"] = "No Factor Data (Prior to first PIT Date)"

# ### REMOVED: Condition B (Invalid Final Currency Combo) entirely

# =====================================================================
# STEP 5: Split and Save
# =====================================================================
print("\nSplitting Clean vs Dropped data...")

# Clean rows (DropReason is NaN)
df_clean = merged[merged["DropReason"].isna()].copy()
df_clean.drop(columns=["DropReason"], inplace=True)

# Dropped rows
df_dropped = merged[merged["DropReason"].notna()].copy()

print(f"Saving Clean Data ({len(df_clean):,}) rows to: {merged_clean_file}")
df_clean.to_csv(merged_clean_file, sep="|", index=False)

print(f"Saving Dropped Data ({len(df_dropped):,}) rows to: {dropped_log_file}")
if not df_dropped.empty:
    df_dropped.to_csv(dropped_log_file, sep="|", index=False)
else:
    with open(dropped_log_file, "w") as f:
        f.write("|".join(merged.columns) + "\n")

# =====================================================================
# STEP 6: Final Stats
# =====================================================================
print("\n=== FINAL SUMMARY ===")
print(f"Total Processed: {len(merged):,}")
print(f"Clean Rows:      {len(df_clean):,}")
print(f"Dropped Rows:    {len(df_dropped):,}")
print(f"Unique IDs:      {df_clean['ID'].nunique():,}")

del merged, df_clean, df_dropped # removed map_df
gc.collect()

Benchmark: /home/jovyan/work/hpool1/pseidel/test/Temp/TempDataPreparation/FF_Benchmark_data_clean.txt
Factors:   /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/processed_data_Factors.txt
Output:    /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/FF_Benchmark_Factors_Merged_Clean.txt

Loading and Deduplicating Factors Data...
Converting Factor dates to datetime...
Sorting factors to resolve duplicates...
Dropping duplicates (keeping top row per ID x PIT Date)...
Unique Factor Dates (ID/Date pairs): 2,679,366

Loading Benchmark Data...
Benchmark rows: 149,520,735
Converting Benchmark dates to datetime...
Sorting Benchmark by DayDate (Required for merge_asof)...
Merging (As-Of Backward: Writing Forward)...

Applying Post-Merge Filters...

Splitting Clean vs Dropped data...
Saving Clean Data (128,221,039) rows to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/FF_Benchmark_Factors_Merged_Clean.txt
Saving Dropped Data (21,299,696) rows to: /home/jovyan/work/hpoo

0

### Get Replication Subset

In [121]:
import pandas as pd
import os

# -----------------------------------------------------------------------------
# Setup paths
# -----------------------------------------------------------------------------
# Make sure Temp_file_path_A is defined in your environment before running this.
# Example: Temp_file_path_A = '/content/drive/My Drive/Data' 

input_filename = "FF_Benchmark_Factors_Merged_Clean.txt"
input_path = f"{Temp_file_path_A}/{input_filename}"

# Create the output filename: "FF_Benchmark_Factors_Merged_Clean_Replication.txt"
# We split the extension to insert the suffix correctly.
output_path = f"{Temp_file_path_A}/FF_Benchmark_Factors_Merged_Clean_Replication.txt"

# -----------------------------------------------------------------------------
# Data Extraction
# -----------------------------------------------------------------------------
try:
    # Read the file using the pipe separator
    df = pd.read_csv(input_path, sep='|')
    
    # Filter for Country = 124 or Country = 840
    filtered_df = df[df['Country'].isin([124, 840])].copy()
    
    # Save the filtered data to the new path
    # We use sep='|' here as well to maintain the original file format
    filtered_df.to_csv(output_path, sep='|', index=False)
    
    print(f"Successfully saved filtered data to:\n{output_path}")

    # -----------------------------------------------------------------------------
    # Safety Check
    # -----------------------------------------------------------------------------
    print("\n--- Safety Check ---")
    print("Unique 'Country' values in the output file:")
    unique_countries = filtered_df['Country'].unique()
    print(unique_countries)
    
    # Validation logic
    if all(c in [124, 840] for c in unique_countries):
        print("Check Passed: Only countries 124 and 840 are present.")
    else:
        print("Check Failed: Unexpected countries found.")

except FileNotFoundError:
    print(f"Error: The file was not found at {input_path}")
except KeyError:
    print("Error: The column 'Country' was not found in the input file. Check your column names.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

Successfully saved filtered data to:
/home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/FF_Benchmark_Factors_Merged_Clean_Replication.txt

--- Safety Check ---
Unique 'Country' values in the output file:
[840 124]
Check Passed: Only countries 124 and 840 are present.


## Daily

### Ohne B/M Country

In [122]:
import numpy as np
import pandas as pd
import os
import sys
import warnings

# Suppress "All-NaN slice encountered" warnings
warnings.filterwarnings('ignore', r'All-NaN slice encountered')

# ==============================================================================
# 1. CONFIGURATION & SETUP
# ==============================================================================
INPUT_FILENAME = "FF_Benchmark_Factors_Merged_Clean.txt"  

# ### CHANGE: Updated output filename
OUTPUT_FILENAME = "Factors_Daily_Country.txt"

try:
    path_base = f"{Temp_file_path_A}"
except NameError:
    path_base = "." 

INPUT_FILE = os.path.join(path_base, INPUT_FILENAME)
OUTPUT_FILE = os.path.join(path_base, OUTPUT_FILENAME)

# Column Mapping
COLS = {
    'date': 'DayDate', 'id': 'ID', 'country': 'Country',
    'pcur': 'PCUR', 'hist_curr': 'HistCurrency',
    'ret': 'ret_bps', 
    'mv': 'MV_USD', # ### CHANGE: Switched from MV_LC to MV_USD
    'rf': 'rf',
    'be': 'be', 'op': 'op', 'inv': 'inv', 'mom': 'mom_signal',
    # Ingredients
    'rev': 'rev', 'cogs': 'cogs', 'sga': 'sga', 'int': 'int',
    'at': 'at', 'at_lag1': 'at_lag1'
}

# Global counter for progress tracking
group_counter = 0
total_groups = 0

# ==============================================================================
# 2. HELPER FUNCTIONS
# ==============================================================================
def assign_daily_portfolios(slice_df):
    global group_counter
    group_counter += 1
    if group_counter % 100 == 0 or group_counter == total_groups:
        sys.stdout.write(f"\rProcessing Group {group_counter} / {total_groups} ({(group_counter/total_groups)*100:.1f}%)")
        sys.stdout.flush()

    if len(slice_df) < 2:
        return slice_df 

    # 1. SIZE ASSIGNMENT
    slice_df = slice_df.sort_values(COLS['mv'], ascending=False)
    cum_mv_pct = slice_df[COLS['mv']].cumsum() / slice_df[COLS['mv']].sum()
    
    is_big = cum_mv_pct <= 0.90
    slice_df['Size_Port'] = np.where(is_big, 'B', 'S')
    mask_b = is_big

    # 2. STYLE ASSIGNMENTS
    def assign_style(col_name, out_col_name, labels=['L', 'N', 'H']):
        vals = slice_df[col_name].values
        big_vals = vals[mask_b]
        
        if len(big_vals[~np.isnan(big_vals)]) == 0:
            p30, p70 = np.nanpercentile(vals, 30), np.nanpercentile(vals, 70)
        else:
            p30, p70 = np.nanpercentile(big_vals, 30), np.nanpercentile(big_vals, 70)
        
        conditions = [vals <= p30, (vals > p30) & (vals < p70), vals >= p70]
        slice_df[out_col_name] = np.select(conditions, labels, default=None)

    # ### CHANGE: Removed B/M sorting block
    # if 'bm' in slice_df.columns: 
    #     assign_style('bm', 'BM_Port', ['L', 'N', 'H'])

    if 'op' in slice_df.columns: 
        assign_style('op', 'OP_Port', ['W', 'N', 'R'])
    if 'inv' in slice_df.columns: 
        assign_style('inv', 'Inv_Port', ['C', 'N', 'A'])
    if 'mom_signal' in slice_df.columns: 
        assign_style('mom_signal', 'Mom_Port', ['L', 'N', 'H'])
        
    return slice_df

def calc_vw_ret(df_sub):
    if len(df_sub) == 0: return np.nan
    val = np.average(df_sub['ret_decimal'], weights=df_sub[COLS['mv']])
    return val

def pivot_portfolios(port_series, port_col_name, base_keys):
    df_p = port_series.reset_index()
    df_p['Port_Label'] = df_p['Size_Port'] + '_' + df_p[port_col_name]
    return df_p.pivot_table(index=base_keys, columns='Port_Label', values=port_series.name)

def calc_mkt_rf(df_sub):
    if len(df_sub) == 0 or df_sub[COLS['mv']].sum() == 0:
        return pd.Series({'Mkt': np.nan, 'RF': np.nan})
    mkt = np.average(df_sub['ret_decimal'], weights=df_sub[COLS['mv']])
    rf_col = COLS['rf']
    if rf_col in df_sub.columns:
        rf = df_sub[rf_col].iloc[0]
    else:
        rf = 0.0
    return pd.Series({'Mkt': mkt, 'RF': rf})

# ==============================================================================
# 3. EXECUTION: LOAD & PREP
# ==============================================================================
print(f"Loading input file: {INPUT_FILE}...")
df = pd.read_csv(INPUT_FILE, sep='|', dtype={'ID': str, 'Country': str, 'PCUR': str, 'HistCurrency': str})
df[COLS['date']] = pd.to_datetime(df[COLS['date']])

if COLS['ret'] in df.columns:
    df['ret_decimal'] = df[COLS['ret']] / 10000.0
else:
    raise ValueError(f"Could not find Return column '{COLS['ret']}'")

for c in [COLS['mv'], COLS['rf'], COLS['be']]:
    if c in df.columns: df[c] = pd.to_numeric(df[c], errors='coerce')

print("Calculating variables...")

# A) Operating Profitability (OP)
if 'op' not in df.columns:
    mandatory_op_cols = [COLS['rev'], COLS['cogs'], COLS['sga'], COLS['be']]
    if all(c in df.columns for c in mandatory_op_cols):
        rev = df[COLS['rev']]
        cogs = df[COLS['cogs']]
        sga = df[COLS['sga']]
        be = df[COLS['be']]
        if COLS['int'] in df.columns:
            interest = df[COLS['int']].fillna(0)
        else:
            interest = 0
        df['op'] = (rev - cogs - sga - interest) / be

# B) Investment (Inv)
if 'inv' not in df.columns:
    if all(c in df.columns for c in [COLS['at'], COLS['at_lag1']]):
        df['inv'] = (df[COLS['at']] - df[COLS['at_lag1']]) / df[COLS['at_lag1']]

# ### CHANGE: Removed B/M Calculation block
# if 'bm' not in df.columns: ...

# D) Momentum 
print("Calculating Momentum (Forced Recalculation)...")
df = df.sort_values([COLS['id'], COLS['pcur'], COLS['date']])
df['log_ret'] = np.log1p(df['ret_decimal'].fillna(0))
df['mom_signal'] = df.groupby([COLS['id'], COLS['pcur']])['log_ret'].transform(
    lambda x: x.rolling(window=250, min_periods=100).sum().shift(21)
)
df['mom_signal'] = np.exp(df['mom_signal']) - 1

# Prep for Assignment
subset_cols = [c for c in [COLS['mv'], 'ret_decimal'] if c in df.columns]
df_clean = df.dropna(subset=subset_cols).copy()

# ==============================================================================
# 4. EXECUTION: ASSIGN PORTFOLIOS
# ==============================================================================
print("Assigning Portfolios...")

# ### CHANGE: Removed PCUR and HistCurrency from grouping keys
group_keys = [COLS['date'], COLS['country']]

unique_groups = df_clean[group_keys].drop_duplicates()
total_groups = len(unique_groups)
print(f"Total groups to process: {total_groups}")

df_tagged = df_clean.groupby(group_keys).apply(assign_daily_portfolios)
print("\nPortfolio Assignment Done.")

# ==============================================================================
# 5. EXECUTION: AGGREGATE RETURNS
# ==============================================================================
print("Calculating Base Portfolio Returns...")

df_tagged = df_tagged.reset_index(drop=True)
# ### CHANGE: Updated base_keys to match group_keys
base_keys = [COLS['date'], COLS['country']]

# ### CHANGE: Removed Size-BM Portfolios calculation
# print("   ... Size-BM Sorts")
# port_bm = ...

# 2. Size-OP Portfolios
print("   ... Size-OP Sorts")
port_op = df_tagged.dropna(subset=['Size_Port', 'OP_Port'])\
    .groupby(base_keys + ['Size_Port', 'OP_Port'])\
    .apply(calc_vw_ret, include_groups=False)
port_op.name = 'Ret_OP'

# 3. Size-Inv Portfolios
print("   ... Size-Inv Sorts")
port_inv = df_tagged.dropna(subset=['Size_Port', 'Inv_Port'])\
    .groupby(base_keys + ['Size_Port', 'Inv_Port'])\
    .apply(calc_vw_ret, include_groups=False)
port_inv.name = 'Ret_Inv'

# 4. Size-Mom Portfolios
print("   ... Size-Mom Sorts")
port_mom = df_tagged.dropna(subset=['Size_Port', 'Mom_Port'])\
    .groupby(base_keys + ['Size_Port', 'Mom_Port'])\
    .apply(calc_vw_ret, include_groups=False)
port_mom.name = 'Ret_Mom'

print("Pivoting data...")
# ### CHANGE: Removed BM pivot
# df_p_bm = pivot_portfolios(port_bm, 'BM_Port', base_keys)
df_p_op = pivot_portfolios(port_op, 'OP_Port', base_keys)
df_p_inv = pivot_portfolios(port_inv, 'Inv_Port', base_keys)
df_p_mom = pivot_portfolios(port_mom, 'Mom_Port', base_keys)

# ==============================================================================
# 6. EXECUTION: FACTOR CONSTRUCTION
# ==============================================================================
print("Constructing Factors...")

# ### CHANGE: Use OP index instead of BM index
factors = pd.DataFrame(index=df_p_op.index)
smb_components = pd.DataFrame(index=df_p_op.index)

# ### CHANGE: Removed HML & SMB (BM) Construction
# factors['HML'] = ...

# 2. RMW & SMB (OP)
if all(c in df_p_op.columns for c in ['S_R', 'B_R', 'S_W', 'B_W']):
    factors['RMW'] = 0.5 * (df_p_op['S_R'] + df_p_op['B_R']) - \
                     0.5 * (df_p_op['S_W'] + df_p_op['B_W'])
                     
    smb_components['SMB_OP'] = (df_p_op['S_R'] + df_p_op['S_N'] + df_p_op['S_W']) / 3 - \
                               (df_p_op['B_R'] + df_p_op['B_N'] + df_p_op['B_W']) / 3
else:
    factors['RMW'] = np.nan
    smb_components['SMB_OP'] = np.nan

# 3. CMA & SMB (Inv)
if all(c in df_p_inv.columns for c in ['S_C', 'B_C', 'S_A', 'B_A']):
    factors['CMA'] = 0.5 * (df_p_inv['S_C'] + df_p_inv['B_C']) - \
                     0.5 * (df_p_inv['S_A'] + df_p_inv['B_A'])
                     
    smb_components['SMB_INV'] = (df_p_inv['S_C'] + df_p_inv['S_N'] + df_p_inv['S_A']) / 3 - \
                                (df_p_inv['B_C'] + df_p_inv['B_N'] + df_p_inv['B_A']) / 3
else:
    factors['CMA'] = np.nan
    smb_components['SMB_INV'] = np.nan

# 4. MOM
if all(c in df_p_mom.columns for c in ['S_H', 'B_H', 'S_L', 'B_L']):
    factors['Mom'] = 0.5 * (df_p_mom['S_H'] + df_p_mom['B_H']) - \
                     0.5 * (df_p_mom['S_L'] + df_p_mom['B_L'])
else:
    factors['Mom'] = np.nan

# 5. CONSOLIDATE SMB
# ### CHANGE: Updated to only average OP and INV (excluded BM)
factors['SMB'] = smb_components[['SMB_OP', 'SMB_INV']].mean(axis=1)

# 6. MARKET RETURN
print("   ... Calculating Market Return")
mkt_info = df_tagged.groupby(base_keys).apply(calc_mkt_rf, include_groups=False)

factors = factors.join(mkt_info)
factors['Mkt-RF'] = factors['Mkt'] - factors['RF']

# Finalize structure
factors_final = factors.reset_index()

# ==============================================================================
# 7. OUTPUT & VERIFICATION
# ==============================================================================
print(f"Saving Final Factors to TXT: {OUTPUT_FILE}")
factors_final.to_csv(OUTPUT_FILE, sep='|', index=False)

print("-" * 80)
print("VERIFICATION CHECK")
print("-" * 80)

# Filter for the specific check (Reduced filters since PCUR/HistCurrency are gone)
mask_ver = (
    (factors_final[COLS['date']] >= '2000-07-31') &
    (factors_final[COLS['date']] <= '2000-08-09') &
    (factors_final['Country'] == '280') 
)

verification_subset = factors_final[mask_ver]
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
print(verification_subset)

print("\nProcess Complete.")

Loading input file: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/FF_Benchmark_Factors_Merged_Clean.txt...
Calculating variables...
Calculating Momentum (Forced Recalculation)...


  result = getattr(ufunc, method)(*inputs, **kwargs)


Assigning Portfolios...
Total groups to process: 426248
Processing Group 426248 / 426248 (100.0%)

  df_tagged = df_clean.groupby(group_keys).apply(assign_daily_portfolios)



Portfolio Assignment Done.
Calculating Base Portfolio Returns...
   ... Size-OP Sorts
   ... Size-Inv Sorts
   ... Size-Mom Sorts
Pivoting data...
Constructing Factors...
   ... Calculating Market Return
Saving Final Factors to TXT: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/Factors_Daily_Country.txt
--------------------------------------------------------------------------------
VERIFICATION CHECK
--------------------------------------------------------------------------------
         DayDate Country       RMW       CMA       Mom       SMB       Mkt      RF    Mkt-RF
74356 2000-07-31     280 -0.008872 -0.019262 -0.001383 -0.017977  0.014813  0.0002  0.014613
74401 2000-08-01     280 -0.007279 -0.001813  0.001504  0.002332 -0.005353  0.0002 -0.005553
74446 2000-08-02     280 -0.011713  0.005775 -0.010608  0.003091 -0.017643  0.0002 -0.017843
74491 2000-08-03     280  0.001477  0.018914 -0.025176  0.004143 -0.024798  0.0002 -0.024998
74536 2000-08-04     280  0.005786  0

### Ohne B/M Global

In [123]:
import numpy as np
import pandas as pd
import os
import sys
import warnings

# Suppress "All-NaN slice encountered" warnings
warnings.filterwarnings('ignore', r'All-NaN slice encountered')

# ==============================================================================
# 1. CONFIGURATION & SETUP
# ==============================================================================
INPUT_FILENAME = "FF_Benchmark_Factors_Merged_Clean.txt"  

# ### CHANGE: Filename for Global version
OUTPUT_FILENAME = "Factors_Daily_Global.txt"

try:
    path_base = f"{Temp_file_path_A}"
except NameError:
    path_base = "." 

INPUT_FILE = os.path.join(path_base, INPUT_FILENAME)
OUTPUT_FILE = os.path.join(path_base, OUTPUT_FILENAME)

# Column Mapping
COLS = {
    'date': 'DayDate', 'id': 'ID', 'country': 'Country',
    'pcur': 'PCUR', 'hist_curr': 'HistCurrency',
    'ret': 'ret_bps', 
    'mv': 'MV_USD', # Keeping MV_USD as requested previously
    'rf': 'rf',
    'be': 'be', 'op': 'op', 'inv': 'inv', 'mom': 'mom_signal',
    'rev': 'rev', 'cogs': 'cogs', 'sga': 'sga', 'int': 'int',
    'at': 'at', 'at_lag1': 'at_lag1'
}

group_counter = 0
total_groups = 0

# ==============================================================================
# 2. HELPER FUNCTIONS
# ==============================================================================
def assign_daily_portfolios(slice_df):
    global group_counter
    group_counter += 1
    if group_counter % 100 == 0 or group_counter == total_groups:
        sys.stdout.write(f"\rProcessing Group {group_counter} / {total_groups} ({(group_counter/total_groups)*100:.1f}%)")
        sys.stdout.flush()

    if len(slice_df) < 2:
        return slice_df 

    # 1. SIZE ASSIGNMENT (Global Sort)
    slice_df = slice_df.sort_values(COLS['mv'], ascending=False)
    cum_mv_pct = slice_df[COLS['mv']].cumsum() / slice_df[COLS['mv']].sum()
    
    is_big = cum_mv_pct <= 0.90
    slice_df['Size_Port'] = np.where(is_big, 'B', 'S')
    mask_b = is_big

    # 2. STYLE ASSIGNMENTS
    def assign_style(col_name, out_col_name, labels=['L', 'N', 'H']):
        vals = slice_df[col_name].values
        big_vals = vals[mask_b]
        
        if len(big_vals[~np.isnan(big_vals)]) == 0:
            p30, p70 = np.nanpercentile(vals, 30), np.nanpercentile(vals, 70)
        else:
            p30, p70 = np.nanpercentile(big_vals, 30), np.nanpercentile(big_vals, 70)
        
        conditions = [vals <= p30, (vals > p30) & (vals < p70), vals >= p70]
        slice_df[out_col_name] = np.select(conditions, labels, default=None)

    # (B/M Removed)
    if 'op' in slice_df.columns: 
        assign_style('op', 'OP_Port', ['W', 'N', 'R'])
    if 'inv' in slice_df.columns: 
        assign_style('inv', 'Inv_Port', ['C', 'N', 'A'])
    if 'mom_signal' in slice_df.columns: 
        assign_style('mom_signal', 'Mom_Port', ['L', 'N', 'H'])
        
    return slice_df

def calc_vw_ret(df_sub):
    if len(df_sub) == 0: return np.nan
    val = np.average(df_sub['ret_decimal'], weights=df_sub[COLS['mv']])
    return val

def pivot_portfolios(port_series, port_col_name, base_keys):
    df_p = port_series.reset_index()
    df_p['Port_Label'] = df_p['Size_Port'] + '_' + df_p[port_col_name]
    return df_p.pivot_table(index=base_keys, columns='Port_Label', values=port_series.name)

def calc_mkt_rf(df_sub):
    if len(df_sub) == 0 or df_sub[COLS['mv']].sum() == 0:
        return pd.Series({'Mkt': np.nan, 'RF': np.nan})
    mkt = np.average(df_sub['ret_decimal'], weights=df_sub[COLS['mv']])
    rf_col = COLS['rf']
    if rf_col in df_sub.columns:
        rf = df_sub[rf_col].iloc[0]
    else:
        rf = 0.0
    return pd.Series({'Mkt': mkt, 'RF': rf})

# ==============================================================================
# 3. EXECUTION: LOAD & PREP
# ==============================================================================
print(f"Loading input file: {INPUT_FILE}...")
df = pd.read_csv(INPUT_FILE, sep='|', dtype={'ID': str, 'Country': str, 'PCUR': str, 'HistCurrency': str})
df[COLS['date']] = pd.to_datetime(df[COLS['date']])

if COLS['ret'] in df.columns:
    df['ret_decimal'] = df[COLS['ret']] / 10000.0
else:
    raise ValueError(f"Could not find Return column '{COLS['ret']}'")

for c in [COLS['mv'], COLS['rf'], COLS['be']]:
    if c in df.columns: df[c] = pd.to_numeric(df[c], errors='coerce')

print("Calculating variables...")
# A) OP
if 'op' not in df.columns:
    mandatory_op_cols = [COLS['rev'], COLS['cogs'], COLS['sga'], COLS['be']]
    if all(c in df.columns for c in mandatory_op_cols):
        rev = df[COLS['rev']]
        cogs = df[COLS['cogs']]
        sga = df[COLS['sga']]
        be = df[COLS['be']]
        interest = df[COLS['int']].fillna(0) if COLS['int'] in df.columns else 0
        df['op'] = (rev - cogs - sga - interest) / be

# B) Inv
if 'inv' not in df.columns:
    if all(c in df.columns for c in [COLS['at'], COLS['at_lag1']]):
        df['inv'] = (df[COLS['at']] - df[COLS['at_lag1']]) / df[COLS['at_lag1']]

# D) Momentum 
print("Calculating Momentum (Forced Recalculation)...")
df = df.sort_values([COLS['id'], COLS['pcur'], COLS['date']])
df['log_ret'] = np.log1p(df['ret_decimal'].fillna(0))
df['mom_signal'] = df.groupby([COLS['id'], COLS['pcur']])['log_ret'].transform(
    lambda x: x.rolling(window=250, min_periods=100).sum().shift(21)
)
df['mom_signal'] = np.exp(df['mom_signal']) - 1

# Prep
subset_cols = [c for c in [COLS['mv'], 'ret_decimal'] if c in df.columns]
df_clean = df.dropna(subset=subset_cols).copy()

# ==============================================================================
# 4. EXECUTION: ASSIGN PORTFOLIOS (GLOBAL)
# ==============================================================================
print("Assigning Portfolios...")

# ### CHANGE: Group ONLY by date (Global sort)
group_keys = [COLS['date']]

unique_groups = df_clean[group_keys].drop_duplicates()
total_groups = len(unique_groups)
print(f"Total groups to process: {total_groups}")

df_tagged = df_clean.groupby(group_keys).apply(assign_daily_portfolios)
print("\nPortfolio Assignment Done.")

# ==============================================================================
# 5. EXECUTION: AGGREGATE RETURNS (GLOBAL)
# ==============================================================================
print("Calculating Base Portfolio Returns...")

df_tagged = df_tagged.reset_index(drop=True)
# ### CHANGE: Base keys are ONLY date
base_keys = [COLS['date']]

# 2. Size-OP
print("   ... Size-OP Sorts")
port_op = df_tagged.dropna(subset=['Size_Port', 'OP_Port'])\
    .groupby(base_keys + ['Size_Port', 'OP_Port'])\
    .apply(calc_vw_ret, include_groups=False)
port_op.name = 'Ret_OP'

# 3. Size-Inv
print("   ... Size-Inv Sorts")
port_inv = df_tagged.dropna(subset=['Size_Port', 'Inv_Port'])\
    .groupby(base_keys + ['Size_Port', 'Inv_Port'])\
    .apply(calc_vw_ret, include_groups=False)
port_inv.name = 'Ret_Inv'

# 4. Size-Mom
print("   ... Size-Mom Sorts")
port_mom = df_tagged.dropna(subset=['Size_Port', 'Mom_Port'])\
    .groupby(base_keys + ['Size_Port', 'Mom_Port'])\
    .apply(calc_vw_ret, include_groups=False)
port_mom.name = 'Ret_Mom'

print("Pivoting data...")
df_p_op = pivot_portfolios(port_op, 'OP_Port', base_keys)
df_p_inv = pivot_portfolios(port_inv, 'Inv_Port', base_keys)
df_p_mom = pivot_portfolios(port_mom, 'Mom_Port', base_keys)

# ==============================================================================
# 6. EXECUTION: FACTOR CONSTRUCTION
# ==============================================================================
print("Constructing Factors...")

factors = pd.DataFrame(index=df_p_op.index)
smb_components = pd.DataFrame(index=df_p_op.index)

# 2. RMW & SMB (OP)
if all(c in df_p_op.columns for c in ['S_R', 'B_R', 'S_W', 'B_W']):
    factors['RMW'] = 0.5 * (df_p_op['S_R'] + df_p_op['B_R']) - \
                     0.5 * (df_p_op['S_W'] + df_p_op['B_W'])
    smb_components['SMB_OP'] = (df_p_op['S_R'] + df_p_op['S_N'] + df_p_op['S_W']) / 3 - \
                               (df_p_op['B_R'] + df_p_op['B_N'] + df_p_op['B_W']) / 3
else:
    factors['RMW'] = np.nan
    smb_components['SMB_OP'] = np.nan

# 3. CMA & SMB (Inv)
if all(c in df_p_inv.columns for c in ['S_C', 'B_C', 'S_A', 'B_A']):
    factors['CMA'] = 0.5 * (df_p_inv['S_C'] + df_p_inv['B_C']) - \
                     0.5 * (df_p_inv['S_A'] + df_p_inv['B_A'])
    smb_components['SMB_INV'] = (df_p_inv['S_C'] + df_p_inv['S_N'] + df_p_inv['S_A']) / 3 - \
                                (df_p_inv['B_C'] + df_p_inv['B_N'] + df_p_inv['B_A']) / 3
else:
    factors['CMA'] = np.nan
    smb_components['SMB_INV'] = np.nan

# 4. MOM
if all(c in df_p_mom.columns for c in ['S_H', 'B_H', 'S_L', 'B_L']):
    factors['Mom'] = 0.5 * (df_p_mom['S_H'] + df_p_mom['B_H']) - \
                     0.5 * (df_p_mom['S_L'] + df_p_mom['B_L'])
else:
    factors['Mom'] = np.nan

# 5. CONSOLIDATE SMB
factors['SMB'] = smb_components[['SMB_OP', 'SMB_INV']].mean(axis=1)

# 6. MARKET RETURN (GLOBAL)
print("   ... Calculating Global Market Return")
mkt_info = df_tagged.groupby(base_keys).apply(calc_mkt_rf, include_groups=False)

factors = factors.join(mkt_info)
factors['Mkt-RF'] = factors['Mkt'] - factors['RF']

factors_final = factors.reset_index()

# ==============================================================================
# 7. OUTPUT & VERIFICATION
# ==============================================================================
print(f"Saving Final Factors to TXT: {OUTPUT_FILE}")
factors_final.to_csv(OUTPUT_FILE, sep='|', index=False)

print("-" * 80)
print("VERIFICATION CHECK (Global)")
print("-" * 80)

# ### CHANGE: Removed Country filter for verification
mask_ver = (
    (factors_final[COLS['date']] >= '2000-07-31') &
    (factors_final[COLS['date']] <= '2000-08-09')
)

verification_subset = factors_final[mask_ver]
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
print(verification_subset)

print("\nProcess Complete.")

Loading input file: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/FF_Benchmark_Factors_Merged_Clean.txt...
Calculating variables...
Calculating Momentum (Forced Recalculation)...


  result = getattr(ufunc, method)(*inputs, **kwargs)


Assigning Portfolios...
Total groups to process: 8465
Processing Group 8465 / 8465 (100.0%)

  df_tagged = df_clean.groupby(group_keys).apply(assign_daily_portfolios)



Portfolio Assignment Done.
Calculating Base Portfolio Returns...
   ... Size-OP Sorts
   ... Size-Inv Sorts
   ... Size-Mom Sorts
Pivoting data...
Constructing Factors...
   ... Calculating Global Market Return
Saving Final Factors to TXT: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/Factors_Daily_Global.txt
--------------------------------------------------------------------------------
VERIFICATION CHECK (Global)
--------------------------------------------------------------------------------
        DayDate       RMW       CMA       Mom       SMB       Mkt      RF    Mkt-RF
2061 2000-07-31 -0.007841 -0.006880  0.011728 -0.011425  0.007171  0.0002  0.006971
2062 2000-08-01  0.000808  0.012878 -0.009002  0.004667  0.003369  0.0002  0.003169
2063 2000-08-02  0.001155  0.006207 -0.008191  0.001787 -0.000906  0.0002 -0.001106
2064 2000-08-03  0.002361 -0.001242 -0.002767  0.000777 -0.001763  0.0002 -0.001963
2065 2000-08-04 -0.008170 -0.010783  0.007716 -0.005725  0.005312  

### Ohne B/M Replication

In [124]:
import numpy as np
import pandas as pd
import os
import sys
import warnings

# Suppress "All-NaN slice encountered" warnings
warnings.filterwarnings('ignore', r'All-NaN slice encountered')

# ==============================================================================
# 1. CONFIGURATION & SETUP
# ==============================================================================
INPUT_FILENAME = "FF_Benchmark_Factors_Merged_Clean_Replication.txt"  

# ### CHANGE: Filename for Replication version
OUTPUT_FILENAME = "Factors_Daily_Replication.txt"

try:
    path_base = f"{Temp_file_path_A}"
except NameError:
    path_base = "." 

INPUT_FILE = os.path.join(path_base, INPUT_FILENAME)
OUTPUT_FILE = os.path.join(path_base, OUTPUT_FILENAME)

# Column Mapping
COLS = {
    'date': 'DayDate', 'id': 'ID', 'country': 'Country',
    'pcur': 'PCUR', 'hist_curr': 'HistCurrency',
    'ret': 'ret_bps', 
    'mv': 'MV_USD', # Keeping MV_USD as requested previously
    'rf': 'rf',
    'be': 'be', 'op': 'op', 'inv': 'inv', 'mom': 'mom_signal',
    'rev': 'rev', 'cogs': 'cogs', 'sga': 'sga', 'int': 'int',
    'at': 'at', 'at_lag1': 'at_lag1'
}

group_counter = 0
total_groups = 0

# ==============================================================================
# 2. HELPER FUNCTIONS
# ==============================================================================
def assign_daily_portfolios(slice_df):
    global group_counter
    group_counter += 1
    if group_counter % 100 == 0 or group_counter == total_groups:
        sys.stdout.write(f"\rProcessing Group {group_counter} / {total_groups} ({(group_counter/total_groups)*100:.1f}%)")
        sys.stdout.flush()

    if len(slice_df) < 2:
        return slice_df 

    # 1. SIZE ASSIGNMENT (Global Sort)
    slice_df = slice_df.sort_values(COLS['mv'], ascending=False)
    cum_mv_pct = slice_df[COLS['mv']].cumsum() / slice_df[COLS['mv']].sum()
    
    is_big = cum_mv_pct <= 0.90
    slice_df['Size_Port'] = np.where(is_big, 'B', 'S')
    mask_b = is_big

    # 2. STYLE ASSIGNMENTS
    def assign_style(col_name, out_col_name, labels=['L', 'N', 'H']):
        vals = slice_df[col_name].values
        big_vals = vals[mask_b]
        
        if len(big_vals[~np.isnan(big_vals)]) == 0:
            p30, p70 = np.nanpercentile(vals, 30), np.nanpercentile(vals, 70)
        else:
            p30, p70 = np.nanpercentile(big_vals, 30), np.nanpercentile(big_vals, 70)
        
        conditions = [vals <= p30, (vals > p30) & (vals < p70), vals >= p70]
        slice_df[out_col_name] = np.select(conditions, labels, default=None)

    # (B/M Removed)
    if 'op' in slice_df.columns: 
        assign_style('op', 'OP_Port', ['W', 'N', 'R'])
    if 'inv' in slice_df.columns: 
        assign_style('inv', 'Inv_Port', ['C', 'N', 'A'])
    if 'mom_signal' in slice_df.columns: 
        assign_style('mom_signal', 'Mom_Port', ['L', 'N', 'H'])
        
    return slice_df

def calc_vw_ret(df_sub):
    if len(df_sub) == 0: return np.nan
    val = np.average(df_sub['ret_decimal'], weights=df_sub[COLS['mv']])
    return val

def pivot_portfolios(port_series, port_col_name, base_keys):
    df_p = port_series.reset_index()
    df_p['Port_Label'] = df_p['Size_Port'] + '_' + df_p[port_col_name]
    return df_p.pivot_table(index=base_keys, columns='Port_Label', values=port_series.name)

def calc_mkt_rf(df_sub):
    if len(df_sub) == 0 or df_sub[COLS['mv']].sum() == 0:
        return pd.Series({'Mkt': np.nan, 'RF': np.nan})
    mkt = np.average(df_sub['ret_decimal'], weights=df_sub[COLS['mv']])
    rf_col = COLS['rf']
    if rf_col in df_sub.columns:
        rf = df_sub[rf_col].iloc[0]
    else:
        rf = 0.0
    return pd.Series({'Mkt': mkt, 'RF': rf})

# ==============================================================================
# 3. EXECUTION: LOAD & PREP
# ==============================================================================
print(f"Loading input file: {INPUT_FILE}...")
df = pd.read_csv(INPUT_FILE, sep='|', dtype={'ID': str, 'Country': str, 'PCUR': str, 'HistCurrency': str})
df[COLS['date']] = pd.to_datetime(df[COLS['date']])

if COLS['ret'] in df.columns:
    df['ret_decimal'] = df[COLS['ret']] / 10000.0
else:
    raise ValueError(f"Could not find Return column '{COLS['ret']}'")

for c in [COLS['mv'], COLS['rf'], COLS['be']]:
    if c in df.columns: df[c] = pd.to_numeric(df[c], errors='coerce')

print("Calculating variables...")
# A) OP
if 'op' not in df.columns:
    mandatory_op_cols = [COLS['rev'], COLS['cogs'], COLS['sga'], COLS['be']]
    if all(c in df.columns for c in mandatory_op_cols):
        rev = df[COLS['rev']]
        cogs = df[COLS['cogs']]
        sga = df[COLS['sga']]
        be = df[COLS['be']]
        interest = df[COLS['int']].fillna(0) if COLS['int'] in df.columns else 0
        df['op'] = (rev - cogs - sga - interest) / be

# B) Inv
if 'inv' not in df.columns:
    if all(c in df.columns for c in [COLS['at'], COLS['at_lag1']]):
        df['inv'] = (df[COLS['at']] - df[COLS['at_lag1']]) / df[COLS['at_lag1']]

# D) Momentum 
print("Calculating Momentum (Forced Recalculation)...")
df = df.sort_values([COLS['id'], COLS['pcur'], COLS['date']])
df['log_ret'] = np.log1p(df['ret_decimal'].fillna(0))
df['mom_signal'] = df.groupby([COLS['id'], COLS['pcur']])['log_ret'].transform(
    lambda x: x.rolling(window=250, min_periods=100).sum().shift(21)
)
df['mom_signal'] = np.exp(df['mom_signal']) - 1

# Prep
subset_cols = [c for c in [COLS['mv'], 'ret_decimal'] if c in df.columns]
df_clean = df.dropna(subset=subset_cols).copy()

# ==============================================================================
# 4. EXECUTION: ASSIGN PORTFOLIOS (GLOBAL)
# ==============================================================================
print("Assigning Portfolios...")

# ### CHANGE: Group ONLY by date (Global sort)
group_keys = [COLS['date']]

unique_groups = df_clean[group_keys].drop_duplicates()
total_groups = len(unique_groups)
print(f"Total groups to process: {total_groups}")

df_tagged = df_clean.groupby(group_keys).apply(assign_daily_portfolios)
print("\nPortfolio Assignment Done.")

# ==============================================================================
# 5. EXECUTION: AGGREGATE RETURNS (GLOBAL)
# ==============================================================================
print("Calculating Base Portfolio Returns...")

df_tagged = df_tagged.reset_index(drop=True)
# ### CHANGE: Base keys are ONLY date
base_keys = [COLS['date']]

# 2. Size-OP
print("   ... Size-OP Sorts")
port_op = df_tagged.dropna(subset=['Size_Port', 'OP_Port'])\
    .groupby(base_keys + ['Size_Port', 'OP_Port'])\
    .apply(calc_vw_ret, include_groups=False)
port_op.name = 'Ret_OP'

# 3. Size-Inv
print("   ... Size-Inv Sorts")
port_inv = df_tagged.dropna(subset=['Size_Port', 'Inv_Port'])\
    .groupby(base_keys + ['Size_Port', 'Inv_Port'])\
    .apply(calc_vw_ret, include_groups=False)
port_inv.name = 'Ret_Inv'

# 4. Size-Mom
print("   ... Size-Mom Sorts")
port_mom = df_tagged.dropna(subset=['Size_Port', 'Mom_Port'])\
    .groupby(base_keys + ['Size_Port', 'Mom_Port'])\
    .apply(calc_vw_ret, include_groups=False)
port_mom.name = 'Ret_Mom'

print("Pivoting data...")
df_p_op = pivot_portfolios(port_op, 'OP_Port', base_keys)
df_p_inv = pivot_portfolios(port_inv, 'Inv_Port', base_keys)
df_p_mom = pivot_portfolios(port_mom, 'Mom_Port', base_keys)

# ==============================================================================
# 6. EXECUTION: FACTOR CONSTRUCTION
# ==============================================================================
print("Constructing Factors...")

factors = pd.DataFrame(index=df_p_op.index)
smb_components = pd.DataFrame(index=df_p_op.index)

# 2. RMW & SMB (OP)
if all(c in df_p_op.columns for c in ['S_R', 'B_R', 'S_W', 'B_W']):
    factors['RMW'] = 0.5 * (df_p_op['S_R'] + df_p_op['B_R']) - \
                     0.5 * (df_p_op['S_W'] + df_p_op['B_W'])
    smb_components['SMB_OP'] = (df_p_op['S_R'] + df_p_op['S_N'] + df_p_op['S_W']) / 3 - \
                               (df_p_op['B_R'] + df_p_op['B_N'] + df_p_op['B_W']) / 3
else:
    factors['RMW'] = np.nan
    smb_components['SMB_OP'] = np.nan

# 3. CMA & SMB (Inv)
if all(c in df_p_inv.columns for c in ['S_C', 'B_C', 'S_A', 'B_A']):
    factors['CMA'] = 0.5 * (df_p_inv['S_C'] + df_p_inv['B_C']) - \
                     0.5 * (df_p_inv['S_A'] + df_p_inv['B_A'])
    smb_components['SMB_INV'] = (df_p_inv['S_C'] + df_p_inv['S_N'] + df_p_inv['S_A']) / 3 - \
                                (df_p_inv['B_C'] + df_p_inv['B_N'] + df_p_inv['B_A']) / 3
else:
    factors['CMA'] = np.nan
    smb_components['SMB_INV'] = np.nan

# 4. MOM
if all(c in df_p_mom.columns for c in ['S_H', 'B_H', 'S_L', 'B_L']):
    factors['Mom'] = 0.5 * (df_p_mom['S_H'] + df_p_mom['B_H']) - \
                     0.5 * (df_p_mom['S_L'] + df_p_mom['B_L'])
else:
    factors['Mom'] = np.nan

# 5. CONSOLIDATE SMB
factors['SMB'] = smb_components[['SMB_OP', 'SMB_INV']].mean(axis=1)

# 6. MARKET RETURN (GLOBAL)
print("   ... Calculating Global Market Return")
mkt_info = df_tagged.groupby(base_keys).apply(calc_mkt_rf, include_groups=False)

factors = factors.join(mkt_info)
factors['Mkt-RF'] = factors['Mkt'] - factors['RF']

factors_final = factors.reset_index()

# ==============================================================================
# 7. OUTPUT & VERIFICATION
# ==============================================================================
print(f"Saving Final Factors to TXT: {OUTPUT_FILE}")
factors_final.to_csv(OUTPUT_FILE, sep='|', index=False)

print("-" * 80)
print("VERIFICATION CHECK (Global)")
print("-" * 80)

# ### CHANGE: Removed Country filter for verification
mask_ver = (
    (factors_final[COLS['date']] >= '2000-07-31') &
    (factors_final[COLS['date']] <= '2000-08-09')
)

verification_subset = factors_final[mask_ver]
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
print(verification_subset)

print("\nProcess Complete.")

Loading input file: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/FF_Benchmark_Factors_Merged_Clean_Replication.txt...
Calculating variables...
Calculating Momentum (Forced Recalculation)...


  result = getattr(ufunc, method)(*inputs, **kwargs)


Assigning Portfolios...
Total groups to process: 8465
Processing Group 8465 / 8465 (100.0%)

  df_tagged = df_clean.groupby(group_keys).apply(assign_daily_portfolios)



Portfolio Assignment Done.
Calculating Base Portfolio Returns...
   ... Size-OP Sorts
   ... Size-Inv Sorts
   ... Size-Mom Sorts
Pivoting data...
Constructing Factors...
   ... Calculating Global Market Return
Saving Final Factors to TXT: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/Factors_Daily_Replication.txt
--------------------------------------------------------------------------------
VERIFICATION CHECK (Global)
--------------------------------------------------------------------------------
        DayDate       RMW       CMA       Mom       SMB       Mkt      RF    Mkt-RF
2061 2000-07-31  0.005752 -0.014189  0.017705 -0.004491  0.010342  0.0002  0.010142
2062 2000-08-01  0.009035  0.011346 -0.020302 -0.002796  0.001457  0.0002  0.001257
2063 2000-08-02 -0.008886  0.003379 -0.012408  0.008025  0.002911  0.0002  0.002711
2064 2000-08-03  0.014573 -0.017572  0.011433 -0.012990  0.010520  0.0002  0.010320
2065 2000-08-04 -0.010248 -0.003808  0.013296 -0.006872  0.004

## Annual

### Ohne B/M Country

In [125]:
import numpy as np
import pandas as pd
import os
import sys
import warnings
import gc

# Suppress "All-NaN slice encountered" and other noisy warnings
warnings.filterwarnings('ignore')

# ==============================================================================
# 1. CONFIGURATION & SETUP
# ==============================================================================
INPUT_FILENAME = "FF_Benchmark_Factors_Merged_Clean.txt"  

# ### CHANGE: Updated Filename
OUTPUT_FILENAME = "Factors_Annual_Country.txt"

# Define Paths
try:
    path_base = f"{Temp_file_path_A}"
except NameError:
    path_base = "." 

INPUT_FILE = os.path.join(path_base, INPUT_FILENAME)
OUTPUT_FILE = os.path.join(path_base, OUTPUT_FILENAME)

# Column Mapping
COLS = {
    'date': 'DayDate', 'id': 'ID', 'country': 'Country',
    'pcur': 'PCUR', 'hist_curr': 'HistCurrency',
    'ret': 'ret_bps', 
    'mv': 'MV_USD', # ### CHANGE: Switched to MV_USD
    'rf': 'rf',
    'be': 'be', 'op': 'op', 'inv': 'inv', 'mom': 'mom_signal',
    # Ingredients
    'rev': 'rev', 'cogs': 'cogs', 'sga': 'sga', 'int': 'int',
    'at': 'at', 'at_lag1': 'at_lag1'
}

# ==============================================================================
# 2. HELPER FUNCTIONS
# ==============================================================================
def assign_portfolios_robust(slice_df):
    """
    Assigns Size and Style groups. Handles missing columns gracefully.
    """
    # ### CHANGE: Removed BM_Port from expected columns
    expected_cols = ['Size_Port', 'OP_Port', 'Inv_Port', 'Mom_Port']
    for c in expected_cols:
        if c not in slice_df.columns:
            slice_df[c] = np.nan

    if len(slice_df) < 2:
        return slice_df 

    # 1. SIZE ASSIGNMENT (Top 90% = Big)
    if slice_df[COLS['mv']].isna().all():
        return slice_df
        
    slice_df = slice_df.sort_values(COLS['mv'], ascending=False)
    cum_mv_pct = slice_df[COLS['mv']].cumsum() / slice_df[COLS['mv']].sum()
    
    is_big = cum_mv_pct <= 0.90
    slice_df['Size_Port'] = np.where(is_big, 'B', 'S')
    mask_b = is_big

    # 2. STYLE ASSIGNMENTS
    def assign_style(col_name, out_col_name, labels=['L', 'N', 'H']):
        vals = slice_df[col_name].values
        big_vals = vals[mask_b]
        
        valid_big = big_vals[~np.isnan(big_vals)]
        
        if len(valid_big) == 0:
             # Fallback
             valid_all = vals[~np.isnan(vals)]
             if len(valid_all) == 0:
                 slice_df[out_col_name] = None
                 return
             p30, p70 = np.percentile(valid_all, [30, 70])
        else:
             p30, p70 = np.percentile(valid_big, [30, 70])
        
        conditions = [vals <= p30, (vals > p30) & (vals < p70), vals >= p70]
        slice_df[out_col_name] = np.select(conditions, labels, default=None)

    # ### CHANGE: Commented out BM assignment
    # if 'bm' in slice_df.columns: assign_style('bm', 'BM_Port')
    
    if 'op' in slice_df.columns: assign_style('op', 'OP_Port', ['W', 'N', 'R'])
    if 'inv' in slice_df.columns: assign_style('inv', 'Inv_Port', ['C', 'N', 'A'])
    if 'mom_signal' in slice_df.columns: assign_style('mom_signal', 'Mom_Port')
        
    return slice_df

def calc_vw_ret(df_sub):
    if len(df_sub) == 0: return np.nan
    val = np.average(df_sub['ret_decimal'], weights=df_sub[COLS['mv']])
    return val

def pivot_portfolios(port_series, port_col_name, base_keys):
    if port_series.empty:
        return pd.DataFrame()

    df_p = port_series.reset_index()
    
    # CAST TO STRING TO PREVENT CRASH
    s_port = df_p['Size_Port'].astype(str)
    style_port = df_p[port_col_name].astype(str)
    
    df_p['Port_Label'] = s_port + '_' + style_port
    
    return df_p.pivot_table(index=base_keys, columns='Port_Label', values=port_series.name)

def calc_mkt_rf(df_sub):
    if len(df_sub) == 0 or df_sub[COLS['mv']].sum() == 0:
        return pd.Series({'Mkt': np.nan, 'RF': np.nan})
    mkt = np.average(df_sub['ret_decimal'], weights=df_sub[COLS['mv']])
    
    rf_col = COLS['rf']
    if rf_col in df_sub.columns:
        rf = df_sub[rf_col].iloc[0]
    else:
        rf = 0.0
    return pd.Series({'Mkt': mkt, 'RF': rf})

# ==============================================================================
# 3. EXECUTION: LOAD & PREP
# ==============================================================================
print(f"Loading input file: {INPUT_FILE}...")
df = pd.read_csv(INPUT_FILE, sep='|', dtype={'ID': str, 'Country': str, 'PCUR': str, 'HistCurrency': str})

# Standardize Dates
df[COLS['date']] = pd.to_datetime(df[COLS['date']])

# Convert Returns: BPS -> Decimal
if COLS['ret'] in df.columns:
    df['ret_decimal'] = df[COLS['ret']] / 10000.0
else:
    raise ValueError(f"Could not find Return column '{COLS['ret']}'. Check COLS mapping.")

# Prep numeric columns
for c in [COLS['mv'], COLS['rf'], COLS['be']]:
    if c in df.columns: df[c] = pd.to_numeric(df[c], errors='coerce')

# -----------------------------------------------------------------------------
# CALCULATE VARIABLES
# -----------------------------------------------------------------------------
print("Calculating variables...")

# A) Operating Profitability (OP)
if 'op' not in df.columns:
    mandatory_op_cols = [COLS['rev'], COLS['cogs'], COLS['sga'], COLS['be']]
    if all(c in df.columns for c in mandatory_op_cols):
        rev = df[COLS['rev']]
        cogs = df[COLS['cogs']]
        sga = df[COLS['sga']]
        be = df[COLS['be']]
        interest = df[COLS['int']].fillna(0) if COLS['int'] in df.columns else 0
        df['op'] = (rev - cogs - sga - interest) / be

# B) Investment (Inv)
if 'inv' not in df.columns:
    if all(c in df.columns for c in [COLS['at'], COLS['at_lag1']]):
        df['inv'] = (df[COLS['at']] - df[COLS['at_lag1']]) / df[COLS['at_lag1']]

# ### CHANGE: Commented out B/M Calculation
# # C) Book-to-Market (BM)
# if 'bm' not in df.columns:
#     if COLS['be'] in df.columns and COLS['mv'] in df.columns:
#         df['bm'] = df[COLS['be']] / df[COLS['mv']]

# D) Momentum (Forced Recalc)
print("Calculating Momentum...")
df = df.sort_values([COLS['id'], COLS['pcur'], COLS['date']])
df['log_ret'] = np.log1p(df['ret_decimal'].fillna(0))
df['mom_signal'] = df.groupby([COLS['id'], COLS['pcur']])['log_ret'].transform(
    lambda x: x.rolling(window=250, min_periods=100).sum().shift(21)
)
df['mom_signal'] = np.exp(df['mom_signal']) - 1

# ==============================================================================
# 4. EXECUTION: ANNUAL PORTFOLIO ASSIGNMENT
# ==============================================================================
print("\n=== STEP 4: ANNUAL REBALANCING LOGIC ===")

# 4A. DEFINE "PORTFOLIO YEAR"
print("Calculing Portfolio Year for Daily Data...")
df['Port_Year'] = np.where(df[COLS['date']].dt.month >= 7, 
                           df[COLS['date']].dt.year, 
                           df[COLS['date']].dt.year - 1).astype(int)

# 4B. FIND ANCHOR DATA
print("Identifying Anchor Dates (Best data near June 30th)...")
candidates = df[df[COLS['date']].dt.month.isin([6, 7])].copy()
candidates['Anchor_Year'] = candidates[COLS['date']].dt.year
candidates['Target_Date'] = pd.to_datetime(candidates['Anchor_Year'].astype(str) + "-06-30")
candidates['Diff'] = (candidates[COLS['date']] - candidates['Target_Date']).abs()
candidates = candidates.sort_values([COLS['id'], 'Anchor_Year', 'Diff'])
df_anchors = candidates.groupby([COLS['id'], 'Anchor_Year'], as_index=False).first()

print(f"Identified {len(df_anchors)} anchor rows.")
del candidates
gc.collect()

# 4C. ASSIGN PORTFOLIOS ON ANCHOR DATA
print("Assigning Portfolios on Anchor Dates...")

# ### CHANGE: Removed Currency columns from grouping keys.
# Now grouping only by Country + Year
group_keys = [COLS['country'], 'Anchor_Year']

df_anchors_clean = df_anchors.dropna(subset=[COLS['mv']]).copy()
print(f"Rows with valid Market Cap: {len(df_anchors_clean)}")

if df_anchors_clean.empty:
    raise ValueError("CRITICAL: No valid anchor data (Market Cap is NaN).")

# Run Assignment
df_tagged_anchors = df_anchors_clean.groupby(group_keys, group_keys=False).apply(assign_portfolios_robust)

print("Annual Assignment Done.")

# 4D. THE MERGE (ROLL FORWARD)
print("\nRolling Forward: Merging Anchors to Daily Data...")

# ### CHANGE: Removed BM_Port from cols_to_keep
cols_to_keep = [COLS['id'], 'Anchor_Year', 'Size_Port', 'OP_Port', 'Inv_Port', 'Mom_Port']
cols_to_keep = [c for c in cols_to_keep if c in df_tagged_anchors.columns]

df_merge_right = df_tagged_anchors[cols_to_keep].copy()
df_merge_right = df_merge_right.rename(columns={'Anchor_Year': 'Port_Year'})

# Perform Merge
df_tagged = pd.merge(df, df_merge_right, on=[COLS['id'], 'Port_Year'], how='left')

# Check results
n_total = len(df_tagged)
n_tagged = df_tagged['Size_Port'].notna().sum()
print(f"Merge Complete.")
print(f"  Total Daily Rows: {n_total}")
print(f"  Tagged Daily Rows: {n_tagged} ({(n_tagged/n_total)*100:.1f}%)")

df_tagged = df_tagged.dropna(subset=['Size_Port'])

if df_tagged.empty:
    raise ValueError("Zero tagged rows after merge.")

del df_merge_right, df_anchors, df_anchors_clean
gc.collect()

# ==============================================================================
# 5. EXECUTION: AGGREGATE RETURNS (DAILY)
# ==============================================================================
print("\nCalculating Base Portfolio Returns...")

# ### CHANGE: Removed PCUR and HistCurrency
base_keys = [COLS['date'], COLS['country']]

# ### CHANGE: Removed Size-BM Portfolios calculation
# print("   ... Size-BM Sorts")
# port_bm = ...

# 2. Size-OP Portfolios
print("   ... Size-OP Sorts")
port_op = df_tagged.dropna(subset=['Size_Port', 'OP_Port'])\
    .groupby(base_keys + ['Size_Port', 'OP_Port'])\
    .apply(calc_vw_ret, include_groups=False)
port_op.name = 'Ret_OP'

# 3. Size-Inv Portfolios
print("   ... Size-Inv Sorts")
port_inv = df_tagged.dropna(subset=['Size_Port', 'Inv_Port'])\
    .groupby(base_keys + ['Size_Port', 'Inv_Port'])\
    .apply(calc_vw_ret, include_groups=False)
port_inv.name = 'Ret_Inv'

# 4. Size-Mom Portfolios
print("   ... Size-Mom Sorts")
port_mom = df_tagged.dropna(subset=['Size_Port', 'Mom_Port'])\
    .groupby(base_keys + ['Size_Port', 'Mom_Port'])\
    .apply(calc_vw_ret, include_groups=False)
port_mom.name = 'Ret_Mom'

# Pivot Data
print("Pivoting data...")
# ### CHANGE: Removed BM pivot
# df_p_bm = pivot_portfolios(port_bm, 'BM_Port', base_keys)
df_p_op = pivot_portfolios(port_op, 'OP_Port', base_keys)
df_p_inv = pivot_portfolios(port_inv, 'Inv_Port', base_keys)
df_p_mom = pivot_portfolios(port_mom, 'Mom_Port', base_keys)

# ==============================================================================
# 6. EXECUTION: FACTOR CONSTRUCTION
# ==============================================================================
print("Constructing Factors...")

if df_p_op.empty: # ### CHANGE: Checked OP instead of BM
    print("WARNING: No Portfolio Returns calculated. Factors will be empty.")
    factors = pd.DataFrame()
else:
    # ### CHANGE: Use OP index instead of BM
    factors = pd.DataFrame(index=df_p_op.index)
    smb_components = pd.DataFrame(index=df_p_op.index)

    # ### CHANGE: Removed HML Construction
    # 1. HML & SMB (BM) ...

    # 2. RMW & SMB (OP)
    if all(c in df_p_op.columns for c in ['S_R', 'B_R', 'S_W', 'B_W']):
        factors['RMW'] = 0.5 * (df_p_op['S_R'] + df_p_op['B_R']) - \
                        0.5 * (df_p_op['S_W'] + df_p_op['B_W'])
        smb_components['SMB_OP'] = (df_p_op['S_R'] + df_p_op['S_N'] + df_p_op['S_W']) / 3 - \
                                (df_p_op['B_R'] + df_p_op['B_N'] + df_p_op['B_W']) / 3
    else:
        factors['RMW'] = np.nan
        smb_components['SMB_OP'] = np.nan

    # 3. CMA & SMB (Inv)
    if all(c in df_p_inv.columns for c in ['S_C', 'B_C', 'S_A', 'B_A']):
        factors['CMA'] = 0.5 * (df_p_inv['S_C'] + df_p_inv['B_C']) - \
                        0.5 * (df_p_inv['S_A'] + df_p_inv['B_A'])
        smb_components['SMB_INV'] = (df_p_inv['S_C'] + df_p_inv['S_N'] + df_p_inv['S_A']) / 3 - \
                                    (df_p_inv['B_C'] + df_p_inv['B_N'] + df_p_inv['B_A']) / 3
    else:
        factors['CMA'] = np.nan
        smb_components['SMB_INV'] = np.nan

    # 4. MOM
    if all(c in df_p_mom.columns for c in ['S_H', 'B_H', 'S_L', 'B_L']):
        factors['Mom'] = 0.5 * (df_p_mom['S_H'] + df_p_mom['B_H']) - \
                        0.5 * (df_p_mom['S_L'] + df_p_mom['B_L'])
    else:
        factors['Mom'] = np.nan

    # 5. CONSOLIDATE SMB
    # ### CHANGE: Removed SMB_BM from average
    factors['SMB'] = smb_components[['SMB_OP', 'SMB_INV']].mean(axis=1)

    # 6. MARKET RETURN
    print("   ... Calculating Market Return")
    mkt_info = df_tagged.groupby(base_keys).apply(calc_mkt_rf, include_groups=False)

    factors = factors.join(mkt_info)
    factors['Mkt-RF'] = factors['Mkt'] - factors['RF']

# ==============================================================================
# 7. OUTPUT & VERIFICATION
# ==============================================================================
if not factors.empty:
    factors_final = factors.reset_index()
    print(f"Saving Final Factors to TXT: {OUTPUT_FILE}")
    factors_final.to_csv(OUTPUT_FILE, sep='|', index=False)

    print("-" * 80)
    print("VERIFICATION CHECK (ANNUAL)")
    print("-" * 80)

    # ### CHANGE: Removed PCUR/HistCurrency filter check
    mask_ver = (
        (factors_final[COLS['date']] >= '2000-07-31') &
        (factors_final[COLS['date']] <= '2000-08-09') &
        (factors_final['Country'] == '280') 
    )

    verification_subset = factors_final[mask_ver]
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', 1000)
    print(verification_subset)
else:
    print("Factor DataFrame is empty. No output generated.")

print("\nProcess Complete.")

Loading input file: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/FF_Benchmark_Factors_Merged_Clean.txt...
Calculating variables...
Calculating Momentum...

=== STEP 4: ANNUAL REBALANCING LOGIC ===
Calculing Portfolio Year for Daily Data...
Identifying Anchor Dates (Best data near June 30th)...
Identified 488570 anchor rows.
Assigning Portfolios on Anchor Dates...
Rows with valid Market Cap: 488570
Annual Assignment Done.

Rolling Forward: Merging Anchors to Daily Data...
Merge Complete.
  Total Daily Rows: 128221039
  Tagged Daily Rows: 120175968 (93.7%)

Calculating Base Portfolio Returns...
   ... Size-OP Sorts
   ... Size-Inv Sorts
   ... Size-Mom Sorts
Pivoting data...
Constructing Factors...
   ... Calculating Market Return
Saving Final Factors to TXT: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/Factors_Annual_Country.txt
--------------------------------------------------------------------------------
VERIFICATION CHECK (ANNUAL)
---------------------------

### Ohne B/M Global

In [126]:
import numpy as np
import pandas as pd
import os
import sys
import warnings
import gc

# Suppress "All-NaN slice encountered" and other noisy warnings
warnings.filterwarnings('ignore')

# ==============================================================================
# 1. CONFIGURATION & SETUP
# ==============================================================================
INPUT_FILENAME = "FF_Benchmark_Factors_Merged_Clean.txt"  

# ### CHANGE: Updated Filename for Global
OUTPUT_FILENAME = "Factors_Annual_Global.txt"

# Define Paths
try:
    path_base = f"{Temp_file_path_A}"
except NameError:
    path_base = "." 

INPUT_FILE = os.path.join(path_base, INPUT_FILENAME)
OUTPUT_FILE = os.path.join(path_base, OUTPUT_FILENAME)

# Column Mapping
COLS = {
    'date': 'DayDate', 'id': 'ID', 'country': 'Country',
    'pcur': 'PCUR', 'hist_curr': 'HistCurrency',
    'ret': 'ret_bps', 
    'mv': 'MV_USD', # Keeping MV_USD
    'rf': 'rf',
    'be': 'be', 'op': 'op', 'inv': 'inv', 'mom': 'mom_signal',
    # Ingredients
    'rev': 'rev', 'cogs': 'cogs', 'sga': 'sga', 'int': 'int',
    'at': 'at', 'at_lag1': 'at_lag1'
}

# ==============================================================================
# 2. HELPER FUNCTIONS
# ==============================================================================
def assign_portfolios_robust(slice_df):
    """
    Assigns Size and Style groups. Handles missing columns gracefully.
    """
    expected_cols = ['Size_Port', 'OP_Port', 'Inv_Port', 'Mom_Port']
    for c in expected_cols:
        if c not in slice_df.columns:
            slice_df[c] = np.nan

    if len(slice_df) < 2:
        return slice_df 

    # 1. SIZE ASSIGNMENT (Global Sort)
    if slice_df[COLS['mv']].isna().all():
        return slice_df
        
    slice_df = slice_df.sort_values(COLS['mv'], ascending=False)
    cum_mv_pct = slice_df[COLS['mv']].cumsum() / slice_df[COLS['mv']].sum()
    
    is_big = cum_mv_pct <= 0.90
    slice_df['Size_Port'] = np.where(is_big, 'B', 'S')
    mask_b = is_big

    # 2. STYLE ASSIGNMENTS
    def assign_style(col_name, out_col_name, labels=['L', 'N', 'H']):
        vals = slice_df[col_name].values
        big_vals = vals[mask_b]
        
        valid_big = big_vals[~np.isnan(big_vals)]
        
        if len(valid_big) == 0:
             # Fallback
             valid_all = vals[~np.isnan(vals)]
             if len(valid_all) == 0:
                 slice_df[out_col_name] = None
                 return
             p30, p70 = np.percentile(valid_all, [30, 70])
        else:
             p30, p70 = np.percentile(valid_big, [30, 70])
        
        conditions = [vals <= p30, (vals > p30) & (vals < p70), vals >= p70]
        slice_df[out_col_name] = np.select(conditions, labels, default=None)

    # (BM Removed)
    if 'op' in slice_df.columns: assign_style('op', 'OP_Port', ['W', 'N', 'R'])
    if 'inv' in slice_df.columns: assign_style('inv', 'Inv_Port', ['C', 'N', 'A'])
    if 'mom_signal' in slice_df.columns: assign_style('mom_signal', 'Mom_Port')
        
    return slice_df

def calc_vw_ret(df_sub):
    if len(df_sub) == 0: return np.nan
    val = np.average(df_sub['ret_decimal'], weights=df_sub[COLS['mv']])
    return val

def pivot_portfolios(port_series, port_col_name, base_keys):
    if port_series.empty:
        return pd.DataFrame()

    df_p = port_series.reset_index()
    s_port = df_p['Size_Port'].astype(str)
    style_port = df_p[port_col_name].astype(str)
    df_p['Port_Label'] = s_port + '_' + style_port
    
    return df_p.pivot_table(index=base_keys, columns='Port_Label', values=port_series.name)

def calc_mkt_rf(df_sub):
    if len(df_sub) == 0 or df_sub[COLS['mv']].sum() == 0:
        return pd.Series({'Mkt': np.nan, 'RF': np.nan})
    mkt = np.average(df_sub['ret_decimal'], weights=df_sub[COLS['mv']])
    rf_col = COLS['rf']
    if rf_col in df_sub.columns:
        rf = df_sub[rf_col].iloc[0]
    else:
        rf = 0.0
    return pd.Series({'Mkt': mkt, 'RF': rf})

# ==============================================================================
# 3. EXECUTION: LOAD & PREP
# ==============================================================================
print(f"Loading input file: {INPUT_FILE}...")
df = pd.read_csv(INPUT_FILE, sep='|', dtype={'ID': str, 'Country': str, 'PCUR': str, 'HistCurrency': str})
df[COLS['date']] = pd.to_datetime(df[COLS['date']])

if COLS['ret'] in df.columns:
    df['ret_decimal'] = df[COLS['ret']] / 10000.0
else:
    raise ValueError(f"Could not find Return column '{COLS['ret']}'")

for c in [COLS['mv'], COLS['rf'], COLS['be']]:
    if c in df.columns: df[c] = pd.to_numeric(df[c], errors='coerce')

print("Calculating variables...")
# A) OP
if 'op' not in df.columns:
    mandatory_op_cols = [COLS['rev'], COLS['cogs'], COLS['sga'], COLS['be']]
    if all(c in df.columns for c in mandatory_op_cols):
        rev = df[COLS['rev']]
        cogs = df[COLS['cogs']]
        sga = df[COLS['sga']]
        be = df[COLS['be']]
        interest = df[COLS['int']].fillna(0) if COLS['int'] in df.columns else 0
        df['op'] = (rev - cogs - sga - interest) / be

# B) Inv
if 'inv' not in df.columns:
    if all(c in df.columns for c in [COLS['at'], COLS['at_lag1']]):
        df['inv'] = (df[COLS['at']] - df[COLS['at_lag1']]) / df[COLS['at_lag1']]

# (BM Removed)

# D) Momentum
print("Calculating Momentum...")
df = df.sort_values([COLS['id'], COLS['pcur'], COLS['date']])
df['log_ret'] = np.log1p(df['ret_decimal'].fillna(0))
df['mom_signal'] = df.groupby([COLS['id'], COLS['pcur']])['log_ret'].transform(
    lambda x: x.rolling(window=250, min_periods=100).sum().shift(21)
)
df['mom_signal'] = np.exp(df['mom_signal']) - 1

# ==============================================================================
# 4. EXECUTION: ANNUAL PORTFOLIO ASSIGNMENT
# ==============================================================================
print("\n=== STEP 4: ANNUAL REBALANCING LOGIC ===")

# 4A. DEFINE "PORTFOLIO YEAR"
print("Calculing Portfolio Year for Daily Data...")
df['Port_Year'] = np.where(df[COLS['date']].dt.month >= 7, 
                           df[COLS['date']].dt.year, 
                           df[COLS['date']].dt.year - 1).astype(int)

# 4B. FIND ANCHOR DATA
print("Identifying Anchor Dates (Best data near June 30th)...")
candidates = df[df[COLS['date']].dt.month.isin([6, 7])].copy()
candidates['Anchor_Year'] = candidates[COLS['date']].dt.year
candidates['Target_Date'] = pd.to_datetime(candidates['Anchor_Year'].astype(str) + "-06-30")
candidates['Diff'] = (candidates[COLS['date']] - candidates['Target_Date']).abs()
candidates = candidates.sort_values([COLS['id'], 'Anchor_Year', 'Diff'])
df_anchors = candidates.groupby([COLS['id'], 'Anchor_Year'], as_index=False).first()

print(f"Identified {len(df_anchors)} anchor rows.")
del candidates
gc.collect()

# 4C. ASSIGN PORTFOLIOS ON ANCHOR DATA
print("Assigning Portfolios on Anchor Dates...")

# ### CHANGE: Group ONLY by Anchor_Year (Global Sort)
group_keys = ['Anchor_Year']

df_anchors_clean = df_anchors.dropna(subset=[COLS['mv']]).copy()
print(f"Rows with valid Market Cap: {len(df_anchors_clean)}")

if df_anchors_clean.empty:
    raise ValueError("CRITICAL: No valid anchor data (Market Cap is NaN).")

# Run Assignment
df_tagged_anchors = df_anchors_clean.groupby(group_keys, group_keys=False).apply(assign_portfolios_robust)

print("Annual Assignment Done.")

# 4D. THE MERGE (ROLL FORWARD)
print("\nRolling Forward: Merging Anchors to Daily Data...")

cols_to_keep = [COLS['id'], 'Anchor_Year', 'Size_Port', 'OP_Port', 'Inv_Port', 'Mom_Port']
cols_to_keep = [c for c in cols_to_keep if c in df_tagged_anchors.columns]

df_merge_right = df_tagged_anchors[cols_to_keep].copy()
df_merge_right = df_merge_right.rename(columns={'Anchor_Year': 'Port_Year'})

# Perform Merge
df_tagged = pd.merge(df, df_merge_right, on=[COLS['id'], 'Port_Year'], how='left')

# Check results
n_total = len(df_tagged)
n_tagged = df_tagged['Size_Port'].notna().sum()
print(f"Merge Complete.")
print(f"  Total Daily Rows: {n_total}")
print(f"  Tagged Daily Rows: {n_tagged} ({(n_tagged/n_total)*100:.1f}%)")

df_tagged = df_tagged.dropna(subset=['Size_Port'])

if df_tagged.empty:
    raise ValueError("Zero tagged rows after merge.")

del df_merge_right, df_anchors, df_anchors_clean
gc.collect()

# ==============================================================================
# 5. EXECUTION: AGGREGATE RETURNS (DAILY)
# ==============================================================================
print("\nCalculating Base Portfolio Returns...")

# ### CHANGE: Group ONLY by Date (Global Aggregation)
base_keys = [COLS['date']]

# 2. Size-OP
print("   ... Size-OP Sorts")
port_op = df_tagged.dropna(subset=['Size_Port', 'OP_Port'])\
    .groupby(base_keys + ['Size_Port', 'OP_Port'])\
    .apply(calc_vw_ret, include_groups=False)
port_op.name = 'Ret_OP'

# 3. Size-Inv
print("   ... Size-Inv Sorts")
port_inv = df_tagged.dropna(subset=['Size_Port', 'Inv_Port'])\
    .groupby(base_keys + ['Size_Port', 'Inv_Port'])\
    .apply(calc_vw_ret, include_groups=False)
port_inv.name = 'Ret_Inv'

# 4. Size-Mom
print("   ... Size-Mom Sorts")
port_mom = df_tagged.dropna(subset=['Size_Port', 'Mom_Port'])\
    .groupby(base_keys + ['Size_Port', 'Mom_Port'])\
    .apply(calc_vw_ret, include_groups=False)
port_mom.name = 'Ret_Mom'

# Pivot Data
print("Pivoting data...")
df_p_op = pivot_portfolios(port_op, 'OP_Port', base_keys)
df_p_inv = pivot_portfolios(port_inv, 'Inv_Port', base_keys)
df_p_mom = pivot_portfolios(port_mom, 'Mom_Port', base_keys)

# ==============================================================================
# 6. EXECUTION: FACTOR CONSTRUCTION
# ==============================================================================
print("Constructing Factors...")

if df_p_op.empty:
    print("WARNING: No Portfolio Returns calculated. Factors will be empty.")
    factors = pd.DataFrame()
else:
    factors = pd.DataFrame(index=df_p_op.index)
    smb_components = pd.DataFrame(index=df_p_op.index)

    # (HML Removed)

    # 2. RMW & SMB (OP)
    if all(c in df_p_op.columns for c in ['S_R', 'B_R', 'S_W', 'B_W']):
        factors['RMW'] = 0.5 * (df_p_op['S_R'] + df_p_op['B_R']) - \
                        0.5 * (df_p_op['S_W'] + df_p_op['B_W'])
        smb_components['SMB_OP'] = (df_p_op['S_R'] + df_p_op['S_N'] + df_p_op['S_W']) / 3 - \
                                (df_p_op['B_R'] + df_p_op['B_N'] + df_p_op['B_W']) / 3
    else:
        factors['RMW'] = np.nan
        smb_components['SMB_OP'] = np.nan

    # 3. CMA & SMB (Inv)
    if all(c in df_p_inv.columns for c in ['S_C', 'B_C', 'S_A', 'B_A']):
        factors['CMA'] = 0.5 * (df_p_inv['S_C'] + df_p_inv['B_C']) - \
                        0.5 * (df_p_inv['S_A'] + df_p_inv['B_A'])
        smb_components['SMB_INV'] = (df_p_inv['S_C'] + df_p_inv['S_N'] + df_p_inv['S_A']) / 3 - \
                                    (df_p_inv['B_C'] + df_p_inv['B_N'] + df_p_inv['B_A']) / 3
    else:
        factors['CMA'] = np.nan
        smb_components['SMB_INV'] = np.nan

    # 4. MOM
    if all(c in df_p_mom.columns for c in ['S_H', 'B_H', 'S_L', 'B_L']):
        factors['Mom'] = 0.5 * (df_p_mom['S_H'] + df_p_mom['B_H']) - \
                        0.5 * (df_p_mom['S_L'] + df_p_mom['B_L'])
    else:
        factors['Mom'] = np.nan

    # 5. CONSOLIDATE SMB
    factors['SMB'] = smb_components[['SMB_OP', 'SMB_INV']].mean(axis=1)

    # 6. MARKET RETURN
    print("   ... Calculating Global Market Return")
    mkt_info = df_tagged.groupby(base_keys).apply(calc_mkt_rf, include_groups=False)

    factors = factors.join(mkt_info)
    factors['Mkt-RF'] = factors['Mkt'] - factors['RF']

# ==============================================================================
# 7. OUTPUT & VERIFICATION
# ==============================================================================
if not factors.empty:
    factors_final = factors.reset_index()
    print(f"Saving Final Factors to TXT: {OUTPUT_FILE}")
    factors_final.to_csv(OUTPUT_FILE, sep='|', index=False)

    print("-" * 80)
    print("VERIFICATION CHECK (ANNUAL - GLOBAL)")
    print("-" * 80)

    # ### CHANGE: Removed Country filter
    mask_ver = (
        (factors_final[COLS['date']] >= '2000-07-31') &
        (factors_final[COLS['date']] <= '2000-08-09') 
    )

    verification_subset = factors_final[mask_ver]
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', 1000)
    print(verification_subset)
else:
    print("Factor DataFrame is empty. No output generated.")

print("\nProcess Complete.")

Loading input file: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/FF_Benchmark_Factors_Merged_Clean.txt...
Calculating variables...
Calculating Momentum...

=== STEP 4: ANNUAL REBALANCING LOGIC ===
Calculing Portfolio Year for Daily Data...
Identifying Anchor Dates (Best data near June 30th)...
Identified 488570 anchor rows.
Assigning Portfolios on Anchor Dates...
Rows with valid Market Cap: 488570
Annual Assignment Done.

Rolling Forward: Merging Anchors to Daily Data...
Merge Complete.
  Total Daily Rows: 128221039
  Tagged Daily Rows: 120178335 (93.7%)

Calculating Base Portfolio Returns...
   ... Size-OP Sorts
   ... Size-Inv Sorts
   ... Size-Mom Sorts
Pivoting data...
Constructing Factors...
   ... Calculating Global Market Return
Saving Final Factors to TXT: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/Factors_Annual_Global.txt
--------------------------------------------------------------------------------
VERIFICATION CHECK (ANNUAL - GLOBAL)
------------

### Ohne B/M Replication

In [127]:
import numpy as np
import pandas as pd
import os
import sys
import warnings
import gc

# Suppress "All-NaN slice encountered" and other noisy warnings
warnings.filterwarnings('ignore')

# ==============================================================================
# 1. CONFIGURATION & SETUP
# ==============================================================================
INPUT_FILENAME = "FF_Benchmark_Factors_Merged_Clean_Replication.txt"  

# ### CHANGE: Updated Filename for Replication
OUTPUT_FILENAME = "Factors_Annual_Replication.txt"

# Define Paths
try:
    path_base = f"{Temp_file_path_A}"
except NameError:
    path_base = "." 

INPUT_FILE = os.path.join(path_base, INPUT_FILENAME)
OUTPUT_FILE = os.path.join(path_base, OUTPUT_FILENAME)

# Column Mapping
COLS = {
    'date': 'DayDate', 'id': 'ID', 'country': 'Country',
    'pcur': 'PCUR', 'hist_curr': 'HistCurrency',
    'ret': 'ret_bps', 
    'mv': 'MV_USD', # Keeping MV_USD
    'rf': 'rf',
    'be': 'be', 'op': 'op', 'inv': 'inv', 'mom': 'mom_signal',
    # Ingredients
    'rev': 'rev', 'cogs': 'cogs', 'sga': 'sga', 'int': 'int',
    'at': 'at', 'at_lag1': 'at_lag1'
}

# ==============================================================================
# 2. HELPER FUNCTIONS
# ==============================================================================
def assign_portfolios_robust(slice_df):
    """
    Assigns Size and Style groups. Handles missing columns gracefully.
    """
    expected_cols = ['Size_Port', 'OP_Port', 'Inv_Port', 'Mom_Port']
    for c in expected_cols:
        if c not in slice_df.columns:
            slice_df[c] = np.nan

    if len(slice_df) < 2:
        return slice_df 

    # 1. SIZE ASSIGNMENT (Global Sort)
    if slice_df[COLS['mv']].isna().all():
        return slice_df
        
    slice_df = slice_df.sort_values(COLS['mv'], ascending=False)
    cum_mv_pct = slice_df[COLS['mv']].cumsum() / slice_df[COLS['mv']].sum()
    
    is_big = cum_mv_pct <= 0.90
    slice_df['Size_Port'] = np.where(is_big, 'B', 'S')
    mask_b = is_big

    # 2. STYLE ASSIGNMENTS
    def assign_style(col_name, out_col_name, labels=['L', 'N', 'H']):
        vals = slice_df[col_name].values
        big_vals = vals[mask_b]
        
        valid_big = big_vals[~np.isnan(big_vals)]
        
        if len(valid_big) == 0:
             # Fallback
             valid_all = vals[~np.isnan(vals)]
             if len(valid_all) == 0:
                 slice_df[out_col_name] = None
                 return
             p30, p70 = np.percentile(valid_all, [30, 70])
        else:
             p30, p70 = np.percentile(valid_big, [30, 70])
        
        conditions = [vals <= p30, (vals > p30) & (vals < p70), vals >= p70]
        slice_df[out_col_name] = np.select(conditions, labels, default=None)

    # (BM Removed)
    if 'op' in slice_df.columns: assign_style('op', 'OP_Port', ['W', 'N', 'R'])
    if 'inv' in slice_df.columns: assign_style('inv', 'Inv_Port', ['C', 'N', 'A'])
    if 'mom_signal' in slice_df.columns: assign_style('mom_signal', 'Mom_Port')
        
    return slice_df

def calc_vw_ret(df_sub):
    if len(df_sub) == 0: return np.nan
    val = np.average(df_sub['ret_decimal'], weights=df_sub[COLS['mv']])
    return val

def pivot_portfolios(port_series, port_col_name, base_keys):
    if port_series.empty:
        return pd.DataFrame()

    df_p = port_series.reset_index()
    s_port = df_p['Size_Port'].astype(str)
    style_port = df_p[port_col_name].astype(str)
    df_p['Port_Label'] = s_port + '_' + style_port
    
    return df_p.pivot_table(index=base_keys, columns='Port_Label', values=port_series.name)

def calc_mkt_rf(df_sub):
    if len(df_sub) == 0 or df_sub[COLS['mv']].sum() == 0:
        return pd.Series({'Mkt': np.nan, 'RF': np.nan})
    mkt = np.average(df_sub['ret_decimal'], weights=df_sub[COLS['mv']])
    rf_col = COLS['rf']
    if rf_col in df_sub.columns:
        rf = df_sub[rf_col].iloc[0]
    else:
        rf = 0.0
    return pd.Series({'Mkt': mkt, 'RF': rf})

# ==============================================================================
# 3. EXECUTION: LOAD & PREP
# ==============================================================================
print(f"Loading input file: {INPUT_FILE}...")
df = pd.read_csv(INPUT_FILE, sep='|', dtype={'ID': str, 'Country': str, 'PCUR': str, 'HistCurrency': str})
df[COLS['date']] = pd.to_datetime(df[COLS['date']])

if COLS['ret'] in df.columns:
    df['ret_decimal'] = df[COLS['ret']] / 10000.0
else:
    raise ValueError(f"Could not find Return column '{COLS['ret']}'")

for c in [COLS['mv'], COLS['rf'], COLS['be']]:
    if c in df.columns: df[c] = pd.to_numeric(df[c], errors='coerce')

print("Calculating variables...")
# A) OP
if 'op' not in df.columns:
    mandatory_op_cols = [COLS['rev'], COLS['cogs'], COLS['sga'], COLS['be']]
    if all(c in df.columns for c in mandatory_op_cols):
        rev = df[COLS['rev']]
        cogs = df[COLS['cogs']]
        sga = df[COLS['sga']]
        be = df[COLS['be']]
        interest = df[COLS['int']].fillna(0) if COLS['int'] in df.columns else 0
        df['op'] = (rev - cogs - sga - interest) / be

# B) Inv
if 'inv' not in df.columns:
    if all(c in df.columns for c in [COLS['at'], COLS['at_lag1']]):
        df['inv'] = (df[COLS['at']] - df[COLS['at_lag1']]) / df[COLS['at_lag1']]

# (BM Removed)

# D) Momentum
print("Calculating Momentum...")
df = df.sort_values([COLS['id'], COLS['pcur'], COLS['date']])
df['log_ret'] = np.log1p(df['ret_decimal'].fillna(0))
df['mom_signal'] = df.groupby([COLS['id'], COLS['pcur']])['log_ret'].transform(
    lambda x: x.rolling(window=250, min_periods=100).sum().shift(21)
)
df['mom_signal'] = np.exp(df['mom_signal']) - 1

# ==============================================================================
# 4. EXECUTION: ANNUAL PORTFOLIO ASSIGNMENT
# ==============================================================================
print("\n=== STEP 4: ANNUAL REBALANCING LOGIC ===")

# 4A. DEFINE "PORTFOLIO YEAR"
print("Calculing Portfolio Year for Daily Data...")
df['Port_Year'] = np.where(df[COLS['date']].dt.month >= 7, 
                           df[COLS['date']].dt.year, 
                           df[COLS['date']].dt.year - 1).astype(int)

# 4B. FIND ANCHOR DATA
print("Identifying Anchor Dates (Best data near June 30th)...")
candidates = df[df[COLS['date']].dt.month.isin([6, 7])].copy()
candidates['Anchor_Year'] = candidates[COLS['date']].dt.year
candidates['Target_Date'] = pd.to_datetime(candidates['Anchor_Year'].astype(str) + "-06-30")
candidates['Diff'] = (candidates[COLS['date']] - candidates['Target_Date']).abs()
candidates = candidates.sort_values([COLS['id'], 'Anchor_Year', 'Diff'])
df_anchors = candidates.groupby([COLS['id'], 'Anchor_Year'], as_index=False).first()

print(f"Identified {len(df_anchors)} anchor rows.")
del candidates
gc.collect()

# 4C. ASSIGN PORTFOLIOS ON ANCHOR DATA
print("Assigning Portfolios on Anchor Dates...")

# ### CHANGE: Group ONLY by Anchor_Year (Global Sort)
group_keys = ['Anchor_Year']

df_anchors_clean = df_anchors.dropna(subset=[COLS['mv']]).copy()
print(f"Rows with valid Market Cap: {len(df_anchors_clean)}")

if df_anchors_clean.empty:
    raise ValueError("CRITICAL: No valid anchor data (Market Cap is NaN).")

# Run Assignment
df_tagged_anchors = df_anchors_clean.groupby(group_keys, group_keys=False).apply(assign_portfolios_robust)

print("Annual Assignment Done.")

# 4D. THE MERGE (ROLL FORWARD)
print("\nRolling Forward: Merging Anchors to Daily Data...")

cols_to_keep = [COLS['id'], 'Anchor_Year', 'Size_Port', 'OP_Port', 'Inv_Port', 'Mom_Port']
cols_to_keep = [c for c in cols_to_keep if c in df_tagged_anchors.columns]

df_merge_right = df_tagged_anchors[cols_to_keep].copy()
df_merge_right = df_merge_right.rename(columns={'Anchor_Year': 'Port_Year'})

# Perform Merge
df_tagged = pd.merge(df, df_merge_right, on=[COLS['id'], 'Port_Year'], how='left')

# Check results
n_total = len(df_tagged)
n_tagged = df_tagged['Size_Port'].notna().sum()
print(f"Merge Complete.")
print(f"  Total Daily Rows: {n_total}")
print(f"  Tagged Daily Rows: {n_tagged} ({(n_tagged/n_total)*100:.1f}%)")

df_tagged = df_tagged.dropna(subset=['Size_Port'])

if df_tagged.empty:
    raise ValueError("Zero tagged rows after merge.")

del df_merge_right, df_anchors, df_anchors_clean
gc.collect()

# ==============================================================================
# 5. EXECUTION: AGGREGATE RETURNS (DAILY)
# ==============================================================================
print("\nCalculating Base Portfolio Returns...")

# ### CHANGE: Group ONLY by Date (Global Aggregation)
base_keys = [COLS['date']]

# 2. Size-OP
print("   ... Size-OP Sorts")
port_op = df_tagged.dropna(subset=['Size_Port', 'OP_Port'])\
    .groupby(base_keys + ['Size_Port', 'OP_Port'])\
    .apply(calc_vw_ret, include_groups=False)
port_op.name = 'Ret_OP'

# 3. Size-Inv
print("   ... Size-Inv Sorts")
port_inv = df_tagged.dropna(subset=['Size_Port', 'Inv_Port'])\
    .groupby(base_keys + ['Size_Port', 'Inv_Port'])\
    .apply(calc_vw_ret, include_groups=False)
port_inv.name = 'Ret_Inv'

# 4. Size-Mom
print("   ... Size-Mom Sorts")
port_mom = df_tagged.dropna(subset=['Size_Port', 'Mom_Port'])\
    .groupby(base_keys + ['Size_Port', 'Mom_Port'])\
    .apply(calc_vw_ret, include_groups=False)
port_mom.name = 'Ret_Mom'

# Pivot Data
print("Pivoting data...")
df_p_op = pivot_portfolios(port_op, 'OP_Port', base_keys)
df_p_inv = pivot_portfolios(port_inv, 'Inv_Port', base_keys)
df_p_mom = pivot_portfolios(port_mom, 'Mom_Port', base_keys)

# ==============================================================================
# 6. EXECUTION: FACTOR CONSTRUCTION
# ==============================================================================
print("Constructing Factors...")

if df_p_op.empty:
    print("WARNING: No Portfolio Returns calculated. Factors will be empty.")
    factors = pd.DataFrame()
else:
    factors = pd.DataFrame(index=df_p_op.index)
    smb_components = pd.DataFrame(index=df_p_op.index)

    # (HML Removed)

    # 2. RMW & SMB (OP)
    if all(c in df_p_op.columns for c in ['S_R', 'B_R', 'S_W', 'B_W']):
        factors['RMW'] = 0.5 * (df_p_op['S_R'] + df_p_op['B_R']) - \
                        0.5 * (df_p_op['S_W'] + df_p_op['B_W'])
        smb_components['SMB_OP'] = (df_p_op['S_R'] + df_p_op['S_N'] + df_p_op['S_W']) / 3 - \
                                (df_p_op['B_R'] + df_p_op['B_N'] + df_p_op['B_W']) / 3
    else:
        factors['RMW'] = np.nan
        smb_components['SMB_OP'] = np.nan

    # 3. CMA & SMB (Inv)
    if all(c in df_p_inv.columns for c in ['S_C', 'B_C', 'S_A', 'B_A']):
        factors['CMA'] = 0.5 * (df_p_inv['S_C'] + df_p_inv['B_C']) - \
                        0.5 * (df_p_inv['S_A'] + df_p_inv['B_A'])
        smb_components['SMB_INV'] = (df_p_inv['S_C'] + df_p_inv['S_N'] + df_p_inv['S_A']) / 3 - \
                                    (df_p_inv['B_C'] + df_p_inv['B_N'] + df_p_inv['B_A']) / 3
    else:
        factors['CMA'] = np.nan
        smb_components['SMB_INV'] = np.nan

    # 4. MOM
    if all(c in df_p_mom.columns for c in ['S_H', 'B_H', 'S_L', 'B_L']):
        factors['Mom'] = 0.5 * (df_p_mom['S_H'] + df_p_mom['B_H']) - \
                        0.5 * (df_p_mom['S_L'] + df_p_mom['B_L'])
    else:
        factors['Mom'] = np.nan

    # 5. CONSOLIDATE SMB
    factors['SMB'] = smb_components[['SMB_OP', 'SMB_INV']].mean(axis=1)

    # 6. MARKET RETURN
    print("   ... Calculating Global Market Return")
    mkt_info = df_tagged.groupby(base_keys).apply(calc_mkt_rf, include_groups=False)

    factors = factors.join(mkt_info)
    factors['Mkt-RF'] = factors['Mkt'] - factors['RF']

# ==============================================================================
# 7. OUTPUT & VERIFICATION
# ==============================================================================
if not factors.empty:
    factors_final = factors.reset_index()
    print(f"Saving Final Factors to TXT: {OUTPUT_FILE}")
    factors_final.to_csv(OUTPUT_FILE, sep='|', index=False)

    print("-" * 80)
    print("VERIFICATION CHECK (ANNUAL - GLOBAL)")
    print("-" * 80)

    # ### CHANGE: Removed Country filter
    mask_ver = (
        (factors_final[COLS['date']] >= '2000-07-31') &
        (factors_final[COLS['date']] <= '2000-08-09') 
    )

    verification_subset = factors_final[mask_ver]
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', 1000)
    print(verification_subset)
else:
    print("Factor DataFrame is empty. No output generated.")

print("\nProcess Complete.")

Loading input file: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/FF_Benchmark_Factors_Merged_Clean_Replication.txt...
Calculating variables...
Calculating Momentum...

=== STEP 4: ANNUAL REBALANCING LOGIC ===
Calculing Portfolio Year for Daily Data...
Identifying Anchor Dates (Best data near June 30th)...
Identified 101256 anchor rows.
Assigning Portfolios on Anchor Dates...
Rows with valid Market Cap: 101256
Annual Assignment Done.

Rolling Forward: Merging Anchors to Daily Data...
Merge Complete.
  Total Daily Rows: 26598430
  Tagged Daily Rows: 24883669 (93.6%)

Calculating Base Portfolio Returns...
   ... Size-OP Sorts
   ... Size-Inv Sorts
   ... Size-Mom Sorts
Pivoting data...
Constructing Factors...
   ... Calculating Global Market Return
Saving Final Factors to TXT: /home/jovyan/work/hpool1/pseidel/test/Temp/TempAnomalies/Factors_Annual_Replication.txt
--------------------------------------------------------------------------------
VERIFICATION CHECK (ANNUAL - GLOBA