### Mount Google Drive, Import Libraries and Define Paths

In [255]:
# =============================================================================
# ENVIRONMENT SETUP + PATH CONFIGURATION (SERVER / COLAB COMPATIBLE)
# =============================================================================

import os

# -----------------------------------------------------------------------------
# 0) HARD SAFETY: cap native thread usage (prevents pthread_create EAGAIN)
#    MUST be set before importing numpy / scipy / pandas.
#    The caps are therefore applied directly after `import os` and before any
#    other import: the scipy import further down pulls in numpy, so setting
#    the variables after it would be too late.
#    NOTE(review): if numpy is already loaded in a running kernel, these
#    settings presumably have no effect until the kernel is restarted.
# -----------------------------------------------------------------------------
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["OPENBLAS_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["NUMEXPR_MAX_THREADS"] = "1"
os.environ["VECLIB_MAXIMUM_THREADS"] = "1"
os.environ["BLIS_NUM_THREADS"] = "1"

# Remaining imports come after the thread caps on purpose (hence noqa: E402).
import sys  # noqa: E402
import importlib  # noqa: E402
from pathlib import Path  # noqa: E402
import string  # noqa: E402
import re  # noqa: E402
import gc  # noqa: E402
from datetime import timedelta  # noqa: E402
from scipy.stats.mstats import winsorize  # noqa: E402

# -----------------------------------------------------------------------------
# 1) Detect environment
#    Colab is recognised by the "google.colab" package already being loaded.
# -----------------------------------------------------------------------------
IN_COLAB = "google.colab" in sys.modules

# -----------------------------------------------------------------------------
# 2) Choose the base directory (on Colab, Google Drive is mounted first)
# -----------------------------------------------------------------------------
if not IN_COLAB:
    # Server base path
    BASE_PATH = "/home/jovyan/work/hpool1/pseidel/test"
else:
    from google.colab import drive
    drive.mount("/content/drive")
    BASE_PATH = "/content/drive/MyDrive/Colab Notebooks"

print("IN_COLAB:", IN_COLAB)
print("BASE_PATH:", BASE_PATH)

# -----------------------------------------------------------------------------
# 3) Sanity checks: path exists + write permission
# -----------------------------------------------------------------------------
BASE = Path(BASE_PATH)
if not BASE.exists():
    raise FileNotFoundError(f"BASE_PATH does not exist: {BASE}")

# Quick write probe: create and delete a small file so a missing write
# permission is reported here instead of somewhere deep in the pipeline.
test_file = BASE / ".write_test_tmp"
try:
    test_file.write_text("ok", encoding="utf-8")
    test_file.unlink()
except Exception as e:
    # Chain explicitly so the traceback shows the underlying error as the cause.
    raise PermissionError(f"No write permission in {BASE}. Error: {e}") from e

# -----------------------------------------------------------------------------
# 4) Environment check: ensure required packages import cleanly
#    A missing package stops the cell at the first failing import.
# -----------------------------------------------------------------------------
required_packages = ["numpy", "scipy", "pandas", "linearmodels", "xlsxwriter"]

for pkg in required_packages:
    print(f"Importing {pkg} ...")
    importlib.import_module(pkg)
    print(f"{pkg} OK")

import numpy as np
import pandas as pd

# -----------------------------------------------------------------------------
# 5) Base paths and input/output locations
#    Everything below is a plain string derived from BASE; sub-paths are
#    joined with "/" exactly as before.
# -----------------------------------------------------------------------------
Input_file_path = str(BASE.joinpath("Input"))
Temp_file_path = str(BASE.joinpath("Temp"))
Output_file_path = str(BASE.joinpath("Output"))

# --- Raw input files ----------------------------------------------------------
Fundamentals_file_path = Input_file_path + "/WSFV_f_20250131.txt"
Current_file_path = Input_file_path + "/WSCurrent_f_20250131.txt"
Calendar_file_path = Input_file_path + "/WSCalendarPrd_f_20250131.txt"
Meta_file_path = Input_file_path + "/WSMetaData_f_20250131.txt"
Excel_file_path = Input_file_path + "/WS PIT Table Definitions V5 with start dates.xls"

# --- Further input locations --------------------------------------------------
MarketValues_file_path = Input_file_path + "/Daily MV USD"
MarketValues_file_path_LC = Input_file_path + "/Daily MV LC"
DailyTotalReturns_file_path = Input_file_path + "/Daily Returns USD"
DailyIndexReturns_file_path = Input_file_path + "/Daily Index Returns USD"
Constituents_file_path = Input_file_path + "/Constituents.01.csv"
UniversalMatching_file_path = Input_file_path + "/Universal Matching File"

# --- One temp folder per pipeline stage ---------------------------------------
Temp_file_path_GO = Temp_file_path + "/TempGeneralOverview"
Temp_file_path_EoC = Temp_file_path + "/TempExtractionofCharacteristics"
Temp_file_path_DP = Temp_file_path + "/TempDataPreparation"
Temp_file_path_A = Temp_file_path + "/TempAnomalies"
Temp_file_path_R = Temp_file_path + "/TempRegressionModel"

# --- Relevant-item lists (the "_A" path is the same file as the default) ------
Relevant_items_path = Input_file_path + "/RelevantItems.txt"
Relevant_items_path_A = Relevant_items_path
Relevant_items_path_B = Input_file_path + "/RelevantItemsB.txt"
Relevant_items_path_C = Input_file_path + "/RelevantItemsC.txt"
Relevant_items_path_D = Input_file_path + "/RelevantItemsD.txt"

# --- Cleaned files and subsets under the general-overview temp folder ---------
Subset_file_path = Temp_file_path_GO + "/Subsets"
Fundamentals_clean_file_path = Temp_file_path_GO + "/Fundamentals_clean.txt"
Current_clean_file_path = Temp_file_path_GO + "/Current_clean.txt"
Calendar_clean_file_path = Temp_file_path_GO + "/Input/Calendar_clean.txt"
Meta_clean_file_path = Temp_file_path_GO + "/Input/Meta_clean.txt"

# -----------------------------------------------------------------------------
# 6) Ensure required directories exist
#    Created in a fixed order; missing parents are created along the way and
#    already-existing directories are left untouched.
# -----------------------------------------------------------------------------
for _required_dir in (
    Path(Output_file_path),
    Path(Temp_file_path_GO),
    Path(Temp_file_path_EoC),
    Path(Temp_file_path_DP),
    Path(Temp_file_path_A),
    Path(Temp_file_path_R),
    Path(Subset_file_path),
    Path(Calendar_clean_file_path).parent,
):
    _required_dir.mkdir(parents=True, exist_ok=True)

# -----------------------------------------------------------------------------
# 7) Streaming / deduplication settings
# -----------------------------------------------------------------------------
CHUNK_SIZE = 2_000_000
DATE_COL = "PIT Date"
DEDUP_KEYS = ["ID", "ItemCode", DATE_COL]

print("Paths configured. Temp outputs ->", Temp_file_path_GO)
print("Example input path ->", Fundamentals_file_path)


IN_COLAB: False
BASE_PATH: /home/jovyan/work/hpool1/pseidel/test
Importing numpy ...
numpy OK
Importing scipy ...
scipy OK
Importing pandas ...
pandas OK
Importing linearmodels ...
linearmodels OK
Importing xlsxwriter ...
xlsxwriter OK
Paths configured. Temp outputs -> /home/jovyan/work/hpool1/pseidel/test/Temp/TempGeneralOverview
Example input path -> /home/jovyan/work/hpool1/pseidel/test/Input/WSFV_f_20250131.txt


### Import Data Files to DataFrames

In [256]:
# =====================================================================================
# SUMMARY
# =====================================================================================
# This cell:
#
#   1. Defines a helper function `import_file_to_dataframe` that reads a pipe-delimited
#      text file into a pandas DataFrame (all columns as string; returns None on error).
#   2. Imports a list of "input" files from Input_file_path into DataFrames
#      (RelevantItems, CountryCodes, ...), storing them in globals() by filename.
#   3. Imports a list of "temp" files from Temp_file_path_EoC into DataFrames
#      (ADR_clean, CompanyName_clean, CurrencyCodes_clean, FYE_clean, ID_clean,
#       UpdateCodes_clean, ValueCoding), also stored in globals().
#   4. Identifies which subset_*.txt files exist in Subset_file_path based on the IDs
#      listed in RelevantItems.txt, and records their names (without .txt) in
#      `successful_subset_names`.
#
# No actual subset data is loaded here; that is deferred to later steps to keep
# memory usage under control.


# Function to import a file and return a pandas DataFrame
def import_file_to_dataframe(file_path):
    """
    Read a pipe-delimited text file into a pandas DataFrame.

    Every column is read as string (dtype=str), which keeps leading zeros in
    code columns intact.

    Returns the DataFrame on success. If the file does not exist or any other
    error occurs while reading, a short message is printed and None is
    returned instead of raising.
    """
    frame = None
    try:
        frame = pd.read_csv(file_path, dtype=str, sep='|')
    except FileNotFoundError:
        print(f"File not found: {file_path}")
    except Exception as e:
        print(f"Error importing file {file_path}: {e}")
    return frame


# -------------------------------------------------------------------------
# Import lookup files into DataFrames
#   - input files come from Input_file_path
#   - temp files come from Temp_file_path_EoC
# Each DataFrame is stored in globals() under the file name without ".txt"
# (e.g. "ADR_clean.txt" -> ADR_clean). A file that cannot be read is stored
# as None and no preview is printed for it.
# -------------------------------------------------------------------------
input_files_to_import = ["RelevantItems.txt", "CountryCodes.txt"]

temp_files_to_import = [
    "ADR_clean.txt",
    "CompanyName_clean.txt",
    "CurrencyCodes_clean.txt",
    "FYE_clean.txt",
    "ID_clean.txt",
    "UpdateCodes_clean.txt",
    "ValueCoding.txt"
]

# Input files are processed first, then the temp files.
for _source_dir, _file_names in (
    (Input_file_path, input_files_to_import),
    (Temp_file_path_EoC, temp_files_to_import),
):
    for file_name in _file_names:
        file_path = os.path.join(_source_dir, file_name)
        var_name = file_name.replace(".txt", "")  # e.g. "RelevantItems"

        _loaded = import_file_to_dataframe(file_path)
        globals()[var_name] = _loaded

        if _loaded is None:
            # The loader has already printed why the file could not be read.
            continue

        print(f"\nImported {file_name} as DataFrame '{var_name}'")
        print(f"Preview of '{var_name}':")
        print(_loaded.head(), "\n")


# -------------------------------------------------------------------------
# Identify subset files that exist for the relevant items
# -------------------------------------------------------------------------
# Only the names are collected here ("subset_<id>", without ".txt"); the
# files themselves are loaded later, in batch-based steps, to manage memory
# usage.
successful_subset_names = []

if globals().get('RelevantItems') is None:
    print("RelevantItems DataFrame not found or is empty. Cannot identify subset files.")
else:
    # The first column of RelevantItems is taken as the item IDs that appear
    # in the subset file names.
    relevant_ids = RelevantItems.iloc[:, 0].astype(str).tolist()

    print("\nIdentifying subset files to process...")
    for item_id in relevant_ids:
        file_name = f"subset_{item_id}.txt"
        file_path = os.path.join(Subset_file_path, file_name)

        # Candidates without a file on disk are reported and skipped.
        if not os.path.exists(file_path):
            print(f"  File not found: {file_name}. Skipping.")
            continue

        successful_subset_names.append(f"subset_{item_id}")
        print(f"  Found {file_name}")

    print(f"\nIdentified {len(successful_subset_names)} subset files for processing.")



Imported RelevantItems.txt as DataFrame 'RelevantItems'
Preview of 'RelevantItems':
  ItemCode
0    01001
1    01051
2    01075
3    01101
4    01151 


Imported CountryCodes.txt as DataFrame 'CountryCodes'
Preview of 'CountryCodes':
  NatCo ImplCountry
0   012     Algeria
1   440   Lithuania
2   025   Argentina
3   442  Luxembourg
4   036   Australia 


Imported ADR_clean.txt as DataFrame 'ADR_clean'
Preview of 'ADR_clean':
          ID ADRIndicator
0  C036F63D0            N
1  C056879S0            X
2  C2461T100            N
3  C2504O500            N
4  C250C9180            N 


Imported CompanyName_clean.txt as DataFrame 'CompanyName_clean'
Preview of 'CompanyName_clean':
          ID                               CompanyName
0  C00948205             AGRIFORCE GROWING SYSTEMS LTD
1  C02500770            PEUGEOT CITROEN ARGENTINA S.A.
2  C02520200  ACINDAR INDUSTRIA ARGENTINA DE ACEROS SA
3  C02520220                       ALPARGATAS S.A.I.C.
4  C02520230               ALUAR ALUMINI

# 4.0. Extracting the most recent, annualized values per PIT Date (incl. Plausibility checks for the data)

## 4.1. Split according to source

In [257]:
# =====================================================================================
# SUMMARY
# =====================================================================================
# This cell processes a ValueCoding DataFrame and assigns a Category to each item
# (per sanitized item name), based on its data sources:
#
#   1. Validates that `ValueCoding` exists and is non-empty.
#   2. Sanitizes `ItemName` to a filesystem-safe `ItemName_Sanitized` (same rules as
#      used for filenames).
#   3. Normalizes the `Source` column (string type, trimmed).
#   4. Groups all distinct sources per `ItemName_Sanitized`.
#   5. Uses `decide_category` to map each sanitized name to a Category:
#        - Hardcoded overrides for certain items.
#        - Generic rules:
#             * presence of IS / Market -> "Mixed"
#             * presence of BS          -> "Annualized"
#             * presence of CFS         -> "Special"
#        - otherwise                    -> None
#   6. Attaches the Category back to each row based on `ItemName_Sanitized`.
#   7. Creates three unique-item DataFrames:
#        - `annualized_items`
#        - `mixed_items`
#        - `special_items`
#   8. Exposes the processed objects in `globals()` for use in later cells.
#   9. Shows a sample and prints counts of each category.
#
# If `ValueCoding` is not present or is empty, processing is skipped.

# CELL 1 — Process ValueCoding and assign Category per ItemName_Sanitized

def decide_category(name, sources: set):
    """
    Decide a category string ("Mixed", "Annualized", "Special", or None)
    for a given sanitized item name based on its set of sources.

    Rules, checked in this order (the first match wins):
      1. Item-specific overrides:
           'Depreciation_Depletion__Amortization' -> 'Mixed'
           'Minority_Interest'                    -> 'Annualized'
      2. 'IS' or 'Market' among the sources       -> 'Mixed'
      3. 'BS' among the sources                   -> 'Annualized'
      4. 'CFS' among the sources                  -> 'Special'
      5. anything else                            -> None

    NOTE(review): earlier comments described IS/Other -> Mixed and
    Market/BS -> Annualized. The rules above are what the code has always
    executed ('Market' counts as Mixed, 'Other' has no rule) — confirm that
    this is the intended mapping.

    Parameters
    ----------
    name : str
        Sanitized item name.
    sources : set
        Source labels collected for that name.

    Returns
    -------
    str or None
        The category, or None when no rule applies.
    """
    # Item-specific overrides (these take precedence over generic rules)
    if name == 'Depreciation_Depletion__Amortization':
        # Prefer 'IS' interpretation -> treat as Mixed
        return 'Mixed'
    if name == 'Minority_Interest':
        # Prefer 'BS' interpretation -> treat as Annualized
        return 'Annualized'

    # Generic rules; order matters because a name can have several sources.
    if 'IS' in sources or 'Market' in sources:
        return 'Mixed'
    if 'BS' in sources:
        return 'Annualized'
    if 'CFS' in sources:
        return 'Special'

    # If none of the rules matched, leave as None (no clear mapping)
    return None


if 'ValueCoding' in globals() and ValueCoding is not None and not ValueCoding.empty:
    # Inform that processing of ValueCoding is starting
    print("Processing ValueCoding DataFrame...")

    # Work on a copy to avoid mutating the original ValueCoding
    value_coding_processed = ValueCoding.copy()

    # --- Sanitize ItemName ---
    # Ensure ItemName is treated as string (avoid issues with numbers / NaNs)
    value_coding_processed['ItemName'] = value_coding_processed['ItemName'].astype(str)

    # First pass: replace spaces and certain filesystem-unsafe characters with underscores
    # Same rule set as used for building filenames elsewhere
    value_coding_processed['ItemName_Sanitized'] = value_coding_processed['ItemName'].str.replace(
        r'[ \-/\:\\*\?"<>|]', '_', regex=True
    )
    # Second pass: strip any remaining characters not in [word chars, dot, hyphen]
    value_coding_processed['ItemName_Sanitized'] = value_coding_processed['ItemName_Sanitized'].str.replace(
        r'[^\w.-]', '', regex=True
    )

    # --- Normalize Source ---
    # Convert Source to string and strip leading/trailing whitespace.
    # NOTE(review): after astype(str) a missing Source is presumably the text
    # 'nan' rather than a null, so the dropna() below would not remove it;
    # such a value matches no rule in decide_category.
    value_coding_processed['Source'] = (
        value_coding_processed['Source']
        .astype(str)
        .str.strip()
    )

    # ------------------------------------------------------------------
    # Group all sources per sanitized name
    # ------------------------------------------------------------------
    # For each ItemName_Sanitized, collect the set of all non-null sources
    sources_per_name = (
        value_coding_processed
        .groupby('ItemName_Sanitized')['Source']
        .apply(lambda s: set(s.dropna()))
        .to_dict()
    )

    # ------------------------------------------------------------------
    # Build category_map for all sanitized names
    # ------------------------------------------------------------------
    # Map each sanitized item name to a Category by applying decide_category
    category_map = {
        name: decide_category(name, srcs)
        for name, srcs in sources_per_name.items()
    }

    # Attach final Category back to each row, via ItemName_Sanitized
    value_coding_processed['Category'] = (
        value_coding_processed['ItemName_Sanitized'].map(category_map)
    )

    # ------------------------------------------------------------------
    # Derive annualized_items / mixed_items / special_items
    # as unique per sanitized name
    # ------------------------------------------------------------------
    annualized_items = (
        value_coding_processed[value_coding_processed['Category'] == 'Annualized']
        .drop_duplicates(subset=['ItemName_Sanitized'])
        .copy()
    )
    mixed_items = (
        value_coding_processed[value_coding_processed['Category'] == 'Mixed']
        .drop_duplicates(subset=['ItemName_Sanitized'])
        .copy()
    )
    special_items = (
        value_coding_processed[value_coding_processed['Category'] == 'Special']
        .drop_duplicates(subset=['ItemName_Sanitized'])
        .copy()
    )

    # ------------------------------------------------------------------
    # Export to globals for use in later cells
    # ------------------------------------------------------------------
    globals()['value_coding_processed'] = value_coding_processed
    globals()['annualized_items'] = annualized_items
    globals()['mixed_items'] = mixed_items
    globals()['special_items'] = special_items
    globals()['category_map'] = category_map

    # ------------------------------------------------------------------
    # Display sample and counts
    # ------------------------------------------------------------------
    print("\nProcessed ValueCoding DataFrame (sample):")
    display(value_coding_processed.head())

    print(f"\nNumber of Annualized items: {len(annualized_items)}")
    print(f"Number of Mixed items: {len(mixed_items)}")
    print(f"Number of Special items: {len(special_items)}")

else:
    # If ValueCoding is not available or has no rows, skip processing
    print("ValueCoding DataFrame not found or is empty. Skipping processing.")


Processing ValueCoding DataFrame...

Processed ValueCoding DataFrame (sample):


Unnamed: 0,ItemCode,ItemName,Source,ItemName_Sanitized,Category
0,5006,Market Price Current,Market,Market_Price_Current,Mixed
1,5007,Market Price YTD High Current,Market,Market_Price_YTD_High_Current,Mixed
2,5008,Market Price YTD Low Current,Market,Market_Price_YTD_Low_Current,Mixed
3,5009,Date of Current Price,Market,Date_of_Current_Price,Mixed
4,5091,Market Price 52 Week High Current,Market,Market_Price_52_Week_High_Current,Mixed



Number of Annualized items: 148
Number of Mixed items: 141
Number of Special items: 62


### Sort into correct bucket

In [258]:
# =====================================================================================
# SUMMARY
# =====================================================================================
# CELL 2 maps work_subset_*.txt files to categories ("Annualized", "Mixed", "Special")
# based on the ItemName_Sanitized that was derived in the previous cell.
#
# Steps:
#   1. Check that the required categorized DataFrames (annualized_items, mixed_items,
#      special_items) and the temporary directory path Temp_file_path_DP exist.
#   2. Build three sets of sanitized item names (annualized_names, mixed_names,
#      special_names) from those DataFrames.
#   3. List all files in Temp_file_path_DP and filter for those matching
#      "work_subset_*.txt".
#   4. For each work_subset file:
#        - Extract the sanitized item name from the filename.
#        - Determine whether it belongs to Mixed, Annualized, or Special based on
#          the sets created in step 2.
#        - Assign it a variable name (Mixed_n, Annualized_n, Special_n) and store
#          that mapping in dicts mixed_vars, annualized_vars, special_vars.
#   5. Store these dicts in globals() for use in later cells.
#   6. Print summary information and display the created dictionaries.
#   7. Perform garbage collection at the end.
#
# If any of the prerequisites are missing, it prints a message and skips the mapping.


# CELL 2 — Map work_subset files to categories using ItemName_Sanitized

# Prerequisites: the three categorized DataFrames must exist and not be None,
# and the data-preparation temp path must be defined.
_prerequisites_missing = (
    any(
        globals().get(_required_name) is None
        for _required_name in ('annualized_items', 'mixed_items', 'special_items')
    )
    or 'Temp_file_path_DP' not in globals()
)

if _prerequisites_missing:
    # If required DataFrames or directory path are missing, do not proceed
    print("Required DataFrames (annualized_items, mixed_items, special_items) or Temp_file_path_DP not found. Please run the categorization cell.")
else:
    print("Identifying work_subset files and creating variables based on categories...")

    # Sanitized item names belonging to each category
    annualized_names = set(annualized_items['ItemName_Sanitized'].dropna())
    mixed_names      = set(mixed_items['ItemName_Sanitized'].dropna())
    special_names    = set(special_items['ItemName_Sanitized'].dropna())

    # ------------------------------------------------------------------
    # Collect the work_subset_*.txt files from the temp directory
    # ------------------------------------------------------------------
    temp_files = os.listdir(Temp_file_path_DP)
    work_subset_files = [
        candidate for candidate in temp_files
        if candidate.startswith('work_subset_') and candidate.endswith('.txt')
    ]

    # Result mappings, e.g. {"Annualized_1": "SomeItemName", ...}
    annualized_vars, mixed_vars, special_vars = {}, {}, {}

    print(f"\nFound {len(work_subset_files)} work_subset files in Temp directory.")

    # Sorted so the numbering of the variable names is deterministic
    work_subset_files.sort()

    # Categories in priority order: Mixed -> Annualized -> Special.
    # The suffix of a new variable name is the number of entries already in
    # that category's dict plus one.
    _category_table = (
        ("Mixed", mixed_names, mixed_vars),
        ("Annualized", annualized_names, annualized_vars),
        ("Special", special_names, special_vars),
    )
    _work_subset_pattern = re.compile(r'work_subset_(.+)\.txt$')

    # ------------------------------------------------------------------
    # Assign every work_subset file to the first matching category
    # ------------------------------------------------------------------
    for file_name in work_subset_files:
        # Sanitized item name is the part between "work_subset_" and ".txt"
        match = _work_subset_pattern.match(file_name)
        if match is None:
            print(f"  Filename format not as expected for '{file_name}'. Skipping processing.")
            continue

        sanitized_item_name = match.group(1)

        for _label, _known_names, _target_vars in _category_table:
            if sanitized_item_name in _known_names:
                var_name = f"{_label}_{len(_target_vars) + 1}"
                _target_vars[var_name] = sanitized_item_name
                print(f"  '{file_name}' -> {_label} (variable '{var_name}').")
                break
        else:
            # No category mapping found for this sanitized name
            print(f"  '{file_name}' -> No matching Category (might be unmapped or ambiguous). Skipping.")

    # Per-category totals (same values the running counters used to hold)
    annualized_count = len(annualized_vars)
    mixed_count = len(mixed_vars)
    special_count = len(special_vars)

    # ------------------------------------------------------------------
    # Expose the mapping dictionaries globally for later use
    # ------------------------------------------------------------------
    globals()['annualized_vars'] = annualized_vars
    globals()['mixed_vars'] = mixed_vars
    globals()['special_vars'] = special_vars

    # ------------------------------------------------------------------
    # Summary output and inspection
    # ------------------------------------------------------------------
    print(f"\nVariable creation complete.")
    print(f"Created {len(annualized_vars)} Annualized variables.")
    print(f"Created {len(mixed_vars)} Mixed variables.")
    print(f"Created {len(special_vars)} Special variables.")

    print("\nAnnualized Variables:")
    display(annualized_vars)

    print("\nMixed Variables:")
    display(mixed_vars)

    print("\nSpecial Variables:")
    display(special_vars)

    # Run garbage collection after building mappings
    gc.collect()


Identifying work_subset files and creating variables based on categories...

Found 49 work_subset files in Temp directory.
  'work_subset_Accounts_Payable.txt' -> Annualized (variable 'Annualized_1').
  'work_subset_Capital_Expenditures_Addtns_to_Fixed_Assets.txt' -> Special (variable 'Special_1').
  'work_subset_Cash_Dividends_Paid___Total.txt' -> Special (variable 'Special_2').
  'work_subset_Cash__Short_Term_Investments.txt' -> Annualized (variable 'Annualized_2').
  'work_subset_Com_Pfd_Redeemed_Retired_Converted_Etc..txt' -> Special (variable 'Special_3').
  'work_subset_Common_Equity.txt' -> Annualized (variable 'Annualized_3').
  'work_subset_Cost_of_Goods_Sold_Excl_Depreciation.txt' -> Mixed (variable 'Mixed_1').
  'work_subset_Current_Assets___Total.txt' -> Annualized (variable 'Annualized_4').
  'work_subset_Current_Liabilities___Total.txt' -> Annualized (variable 'Annualized_5').
  'work_subset_Deferred_Taxes.txt' -> Annualized (variable 'Annualized_6').
  'work_subset_Depre

{'Annualized_1': 'Accounts_Payable',
 'Annualized_2': 'Cash__Short_Term_Investments',
 'Annualized_3': 'Common_Equity',
 'Annualized_4': 'Current_Assets___Total',
 'Annualized_5': 'Current_Liabilities___Total',
 'Annualized_6': 'Deferred_Taxes',
 'Annualized_7': 'Income_Taxes_Payable',
 'Annualized_8': 'Inventories___Total',
 'Annualized_9': 'Investments_in_Associated_Companies',
 'Annualized_10': 'Investments_in_Sales__Direct_Financing_Leases',
 'Annualized_11': 'Long_Term_Debt',
 'Annualized_12': 'Long_Term_Receivables',
 'Annualized_13': 'Minority_Interest',
 'Annualized_14': 'Other_Assets___Total',
 'Annualized_15': 'Other_Current_Assets',
 'Annualized_16': 'Other_Current_Liabilities',
 'Annualized_17': 'Other_Investments',
 'Annualized_18': 'Other_Liabilities',
 'Annualized_19': 'Preferred_Stock',
 'Annualized_20': 'Property_Plant__Equipment___Net',
 'Annualized_21': 'ReceivablesNet',
 'Annualized_22': 'Short_Term_Debt__Current_Portion_of_LT_Debt',
 'Annualized_23': 'Total_Assets'


Mixed Variables:


{'Mixed_1': 'Cost_of_Goods_Sold_Excl_Depreciation',
 'Mixed_2': 'Depreciation_Depletion__Amortization',
 'Mixed_3': 'Earnings_Per_Share_Fiscal_Year_End',
 'Mixed_4': 'Income_Taxes',
 'Mixed_5': 'Interest_Expense___Total',
 'Mixed_6': 'Net_Income_Before_Extra_Items_Preferred_Divs',
 'Mixed_7': 'Net_Income_Used_to_Calculate_Basic_EPS',
 'Mixed_8': 'Net_Sales_or_Revenues',
 'Mixed_9': 'Operating_Income',
 'Mixed_10': 'Sales_Per_Share',
 'Mixed_11': 'Selling_General__Administrative_Expenses'}


Special Variables:


{'Special_1': 'Capital_Expenditures_Addtns_to_Fixed_Assets',
 'Special_2': 'Cash_Dividends_Paid___Total',
 'Special_3': 'Com_Pfd_Redeemed_Retired_Converted_Etc.',
 'Special_4': 'Disposal_of_Fixed_Assets',
 'Special_5': 'Extraordinary_Items',
 'Special_6': 'Funds_From_For_Other_Operating_Activities',
 'Special_7': 'Funds_From_Operations',
 'Special_8': 'Long_Term_Borrowings',
 'Special_9': 'Net_Cash_Flow___Financing',
 'Special_10': 'Net_Cash_Flow___Investing',
 'Special_11': 'Net_Cash_Flow___Operating_Activities',
 'Special_12': 'Net_Proceeds_From_Sale_Issue_of_Com__Pfd',
 'Special_13': 'Reduction_in_Long_Term_Debt'}

## 4.3. Cash Flow Statement

### Special 1

#### Set Index

In [259]:
# =============================================================================
# SELECT A SINGLE SPECIAL_* ITEM AND PREPARE PATHS
# =============================================================================
# This cell:
#   1. Chooses which Special_* item (from special_vars) should be processed.
#   2. Validates that special_vars and Temp_file_path_DP are available.
#   3. Builds the input file path for the selected "work_subset_<item>.txt".
#   4. Sets a base_output_filename for downstream output files.
#   5. Ensures the data-preparation temp directory exists.
#
# Usage:
#   - Adjust `special_index` to run a different Special_* dataset (e.g., 2, 3, 10 ...).
#   - Assumes `special_vars` was created in the categorization step and
#     `Temp_file_path_DP` was defined in the environment setup.

# === Which Special_* item to process ===
special_index = 1  # Change this to run another dataset, e.g. 10

# special_vars maps variable names to item names,
# e.g. {'Special_1': 'SomeItem', 'Special_2': 'OtherItem', ...}
assert 'special_vars' in globals(), "special_vars dict not found in globals()."

# Look up the item name behind the chosen index; a missing key stops here.
item_key = "Special_{}".format(special_index)
target_item_name = special_vars.get(item_key)
assert target_item_name, f"{item_key} not found in special_vars."

print(f"Selected: {item_key}  ->  ItemName: '{target_item_name}'")

# === Paths (built from the globals defined during setup) ===
assert 'Temp_file_path_DP' in globals(), "Temp_file_path_DP not found."

# Input file for this item, located in the data-preparation temp directory
file_name = "work_subset_{}.txt".format(target_item_name)
file_path = os.path.join(Temp_file_path_DP, file_name)

# Base name for output files created by the "Special" pipeline
base_output_filename = "Special_{}_complete".format(target_item_name)

# Create the data-preparation temp directory (and parents) if it is missing
os.makedirs(Temp_file_path_DP, exist_ok=True)


Selected: Special_1  ->  ItemName: 'Capital_Expenditures_Addtns_to_Fixed_Assets'


#### Import relevant data



In [260]:
# =============================================================================
# LOAD THE FULL DATASET FOR THE SELECTED SPECIAL ITEM
# =============================================================================
# This cell:
#   1. Uses `target_item_name` and `file_path` (defined in the previous cell)
#      to load the corresponding work_subset file.
#   2. Imports the file using `import_file_to_dataframe`.
#   3. Performs safety checks for existence and emptiness.
#   4. Shows a preview of the loaded dataset.
#   5. Falls back to an empty DataFrame if loading fails.
#   6. Runs garbage collection afterwards.
# =============================================================================

print(f"\nImporting full dataset for Item: '{target_item_name}' ...")

# special_raw always ends up as a DataFrame: the loaded data on success,
# an empty DataFrame when the file is missing, unreadable or has no rows.
if not os.path.exists(file_path):
    print(f"File not found: {file_path}")
    special_raw = pd.DataFrame()
else:
    special_raw = import_file_to_dataframe(file_path)

    if special_raw is None or special_raw.empty:
        print("Dataset appears empty or could not be loaded.")
        special_raw = pd.DataFrame()
    else:
        print(f"Full dataset loaded successfully: {len(special_raw):,} rows total.")
        # Use the rich display when it works; otherwise print plain text.
        try:
            display(special_raw.head())
        except Exception:
            print(special_raw.head().to_string(index=False))

gc.collect()



Importing full dataset for Item: 'Capital_Expenditures_Addtns_to_Fixed_Assets' ...
Full dataset loaded successfully: 1,956,042 rows total.


Unnamed: 0,ID,CompanyName,ImplCountry,CurrentCurrency,HistCurrency,PIT Date,Frequency,UpdateCode,FiscalPeriod,FYE Month,ItemCode,Value
0,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1992,December,4601,24.714956
1,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1993,December,4601,63.338554
2,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1994,December,4601,85.214171
3,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1996-05-03,A,3,1995,December,4601,106.86508
4,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1998-07-03,A,3,1996,December,4601,53.811107


0

#### Encode Frequency Code (Check of output required!)

In [261]:
# =============================================================================
# SUMMARY
# =============================================================================
# (unchanged documentation)
# ...

def last2(n):
    """Return last two digits as zero-padded string, or None if NaN."""
    if pd.isna(n):
        return None
    return f"{int(n):04d}"[-2:]


def add_str_fiscalprd(df):
    """
    Add a ``Str_FiscalPrd`` label and rewrite ``FiscalPeriod`` as a year.

    The numeric ``FiscalPeriod`` code is decoded according to ``Frequency``
    (compared case-insensitively):

      * A, B       -> ``Y<yy>``      (code is the year itself)
      * C, Q, E, R -> ``Q<n>Y<yy>``  (n = code % 4 + 1, year = code // 4)
      * F, S       -> ``S<n>Y<yy>``  (n = code % 2 + 1, year = code // 2)
      * K, T, L, U -> ``T<n>Y<yy>``  (n = code % 3 + 1, year = code // 3)

    Rows with any other frequency keep a missing label.

    ``FiscalPeriod`` is then replaced by a four-digit year rebuilt from the
    two-digit part of the label: values >= 80 map to 19xx, all others to 20xx.
    NOTE(review): because of this pivot, years outside 1980-2079 are remapped
    (e.g. 1975 becomes 2075). For annual rows such cases are printed as
    discrepancies; confirm that this windowing is intended.

    Parameters
    ----------
    df : pd.DataFrame
        Needs the columns ``Frequency`` and ``FiscalPeriod``; ``ID`` is also
        required when discrepancies have to be shown.

    Returns
    -------
    pd.DataFrame
        A copy of ``df`` with upper-cased ``Frequency``, the new
        ``Str_FiscalPrd`` column and the rewritten ``FiscalPeriod``.
        The input frame is not modified.
    """
    df = df.copy()
    df["Frequency"] = df["Frequency"].str.upper().fillna("")
    df['Original_FiscalPeriod'] = df['FiscalPeriod']

    fp = pd.to_numeric(df["FiscalPeriod"], errors="coerce")

    m_AB = df["Frequency"].isin(["A", "B"])

    # Create the label column with object dtype from the start. A float NaN
    # column would have to be upcast on the first string assignment (pandas
    # warns about that) and would not support `.str` if no row is labelled.
    df["Str_FiscalPrd"] = pd.Series(np.nan, index=df.index, dtype=object)

    def _sub_annual_labels(prefix, periods_per_year, mask):
        # Period number within the year (1-based) and two-digit year.
        part = ((fp % periods_per_year) + 1).where(mask)
        year = (fp // periods_per_year).where(mask).apply(last2)
        return prefix + part.astype("Int64").astype(str) + "Y" + year.fillna('')

    sub_annual = (
        ("Q", 4, df["Frequency"].isin(["C", "Q", "E", "R"])),
        ("S", 2, df["Frequency"].isin(["F", "S"])),
        ("T", 3, df["Frequency"].isin(["K", "T", "L", "U"])),
    )

    # Quarterly labels first, then annual, then semi-annual and trimester
    # (the masks are disjoint, so the order does not affect the result).
    prefix, per_year, mask = sub_annual[0]
    df.loc[mask, "Str_FiscalPrd"] = _sub_annual_labels(prefix, per_year, mask)

    ab_year = fp.where(m_AB).apply(last2)
    df.loc[m_AB, "Str_FiscalPrd"] = "Y" + ab_year.fillna('')

    for prefix, per_year, mask in sub_annual[1:]:
        df.loc[mask, "Str_FiscalPrd"] = _sub_annual_labels(prefix, per_year, mask)

    # Rebuild a four-digit year from the two-digit part of the label.
    year_part = df['Str_FiscalPrd'].str.extract(r'Y(\d{2})', expand=False)
    year_numeric = pd.to_numeric(year_part, errors='coerce')
    # Vectorised century pivot: >= 80 -> 19xx, otherwise 20xx (NaN stays NaN).
    century = np.where(year_numeric >= 80, 1900, 2000)
    df['ImplFiscPer_Calculated'] = year_numeric + century

    # Cross-check annual rows: the rebuilt year must equal the original code
    # (two missing values count as equal).
    annual_rows_for_check = df[m_AB]
    original_year = pd.to_numeric(
        annual_rows_for_check['Original_FiscalPeriod'], errors='coerce'
    )
    rebuilt_year = annual_rows_for_check['ImplFiscPer_Calculated']
    agrees = (rebuilt_year == original_year) | (rebuilt_year.isna() & original_year.isna())
    discrepancy_rows = annual_rows_for_check[~agrees]

    if not discrepancy_rows.empty:
        print("\nDiscrepancies found between original FiscalPeriod and calculated ImplFiscPer for Annual (A, B) frequencies:")
        preview = discrepancy_rows[
            ['ID', 'Frequency', 'Original_FiscalPeriod', 'Str_FiscalPrd', 'ImplFiscPer_Calculated']
        ].head()
        try:
            display(preview)
        except NameError:
            # display() only exists inside IPython/Jupyter sessions.
            print(preview.to_string(index=False))
        print(f"Total discrepancies found for Annual frequencies: {len(discrepancy_rows)}")
    else:
        print("\nNo discrepancies found between original FiscalPeriod and calculated ImplFiscPer for Annual (A, B) frequencies.")

    df['FiscalPeriod'] = df['ImplFiscPer_Calculated']
    df.drop(columns=['Original_FiscalPeriod', 'ImplFiscPer_Calculated'], inplace=True)

    return df


# =============================================================================
# Driver: apply encoding to special_raw if present and non-empty
# =============================================================================
if 'special_raw' not in globals() or special_raw is None or special_raw.empty:
    # No usable input frame: record that explicitly with a None marker.
    print("special_raw not found or empty. Cannot perform encoding.")
    special_encoded = None
else:
    # Encode the loaded frame and preview the first rows.
    print(f"Applying encoding to Special dataset for '{target_item_name}' ...")
    special_encoded = add_str_fiscalprd(special_raw)
    display(special_encoded.head())


Applying encoding to Special dataset for 'Capital_Expenditures_Addtns_to_Fixed_Assets' ...


  df.loc[m_quarter, "Str_FiscalPrd"] = (



No discrepancies found between original FiscalPeriod and calculated ImplFiscPer for Annual (A, B) frequencies.


Unnamed: 0,ID,CompanyName,ImplCountry,CurrentCurrency,HistCurrency,PIT Date,Frequency,UpdateCode,FiscalPeriod,FYE Month,ItemCode,Value,Str_FiscalPrd
0,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1992,December,4601,24.714956,Y92
1,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1993,December,4601,63.338554,Y93
2,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1994,December,4601,85.214171,Y94
3,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1996-05-03,A,3,1995,December,4601,106.86508,Y95
4,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1998-07-03,A,3,1996,December,4601,53.811107,Y96


#### Annualize data with most recent information (Check of output required!)

In [262]:
# @title
# =============================================================================
# SUMMARY
# =============================================================================
# This script:
#   - Implements a fast "as-of join" between two DataFrames based on PIT dates
#     and key columns (asof_numpy).
#   - Provides helpers for percentile summaries and winsorized statistics.
#   - Builds annualized "AnnPITValue" values from:
#       * true annual data (A/B frequency) and
#       * sums of quarterly data (Q1..Q4) when available.
#   - Performs various quality checks (future-dated values, extreme percentages).
#   - Produces a processed "special_processed" DataFrame and saves:
#       * a full export and
#       * a subset export.
#   - Prints a row-accounting overview so drops and exclusions are transparent.
# =============================================================================


# ---------- Helper: fast as-of (right.PIT <= left.PIT) ----------
def _key(fr, cols):
    """
    Build a combined string key from multiple columns by concatenating them
    with '||' to use as a group key.
    """
    # Convert all key columns to string and join them row-wise with '||'
    return fr[cols].astype(str).agg('||'.join, axis=1)


def asof_numpy(left_df: pd.DataFrame, right_df: pd.DataFrame, by_cols: list[str]):
    """
    For each row in left_df, find the latest (as-of) Value from right_df
    with matching by_cols and right_df['PIT Date'] <= left_df['PIT Date'].

    Returns
    -------
    out_vals : np.ndarray
        Array of float values (same length as left_df) containing the matched
        values from right_df (or NaN if none found).
    out_dates : np.ndarray
        Array of datetime64 values containing the matched dates (or NaT).
    """
    # Initialize output arrays with NaN/NaT
    out_vals  = np.full(len(left_df), np.nan, dtype='float64')
    out_dates = np.full(len(left_df), 'NaT', dtype='datetime64[ns]')

    # Required columns in left/right for the as-of join
    left_req  = by_cols + ['PIT Date']
    right_req = by_cols + ['PIT Date', 'Value']

    # Mask to filter rows with all required fields present
    lmask = left_df[left_req].notna().all(axis=1)
    rmask = right_df[right_req].notna().all(axis=1)

    # If no valid rows on either side, return empty outputs
    if not lmask.any() or not rmask.any():
        return out_vals, out_dates

    # Work on copies of the filtered frames
    l = left_df.loc[lmask, left_req].copy()
    r = right_df.loc[rmask, right_req].copy()

    # Normalize PIT dates to daily granularity
    l['PIT Date'] = pd.to_datetime(l['PIT Date'], errors='coerce').dt.floor('D')
    r['PIT Date'] = pd.to_datetime(r['PIT Date'], errors='coerce').dt.floor('D')

    # Build grouping keys on both sides
    l['__k'] = _key(l, by_cols)
    r['__k'] = _key(r, by_cols)

    # Sort right side by key and PIT Date so we can binary-search later
    r = r.sort_values(['__k', 'PIT Date']).reset_index(drop=True)

    # Extract numpy arrays for fast vectorized operations
    rk   = r['__k'].to_numpy()
    rdt  = r['PIT Date'].to_numpy()
    rval = r['Value'].to_numpy()

    # Determine contiguous slices of rows for each unique key in right_df
    uniq, first = np.unique(rk, return_index=True)
    slices = {}
    for i, k in enumerate(uniq):
        s = first[i]                         # start index for this key
        e = first[i + 1] if i + 1 < len(first) else len(r)  # end index
        slices[k] = (rdt[s:e], rval[s:e])    # store date and value slices

    # Original indices of left rows (to write back results correctly)
    l_idx = l.index.to_numpy()
    # Keys and dates of left rows
    lk    = l['__k'].to_numpy()
    ldt   = l['PIT Date'].to_numpy()

    # Sort left rows by key (stable sort) for block processing
    order = np.argsort(lk, kind='mergesort')
    sk, sd, sp = lk[order], ldt[order], l_idx[order]

    # Iterate over blocks of the same key in left_df
    i = 0
    n = len(sk)
    while i < n:
        k = sk[i]  # current key
        j = i + 1
        # Find the end of this key's block
        while j < n and sk[j] == k:
            j += 1

        # Block of PIT dates and corresponding positions (indices) for this key
        block_dates = sd[i:j]
        block_pos   = sp[i:j]

        # Only process if the key exists in the right-hand slices
        if k in slices:
            r_dates, r_vals = slices[k]

            # For each left PIT date, find index of right PIT <= left PIT
            pos   = np.searchsorted(r_dates, block_dates, side='right') - 1
            valid = pos >= 0  # only those with at least one right date <= left date

            # Write results back to output arrays
            if np.any(valid):
                out_vals[block_pos[valid]]  = r_vals[pos[valid]]
                out_dates[block_pos[valid]] = r_dates[pos[valid]]

        # Move to the next block
        i = j

    return out_vals, out_dates


# ---------- Small helpers ----------
def pctile(s, q):
    """
    Return the ``q`` quantile of ``s`` using linear interpolation.

    Any exception raised while computing the quantile is swallowed and
    NaN is returned instead.
    """
    try:
        result = s.quantile(q, interpolation='linear')
    except Exception:
        result = np.nan
    return result


def summarize_pct(series: pd.Series):
    """
    Summarise a numeric series: count, mean, median, 1%-winsorized mean
    and the deciles p10..p90.

    Infinite and missing entries are ignored. An empty dict is returned
    when nothing finite remains.
    """
    finite = series.replace([np.inf, -np.inf], np.nan).dropna()
    if finite.empty:
        return {}

    # winsorize needs a writable array, hence the explicit copy.
    clipped = winsorize(finite.to_numpy().copy(), limits=[0.01, 0.01])

    summary = {
        "finite_rows": len(finite),
        "mean": finite.mean(),
        "median": finite.median(),
        "winsorized_mean_1pct": clipped.mean(),
    }
    # Deciles p10 .. p90, in ascending order.
    for decile in range(10, 100, 10):
        summary[f"p{decile}"] = pctile(finite, decile / 100)
    return summary


# ---------- Period prioritization ----------
# Priority ranking for period labels when deciding between multiple candidates
_PERIOD_PRIORITY = {
    'A': 100,  # Full annual has highest priority
    'Q4': 90,
    'T3': 80,
    'S2': 70,
    'Q3': 60,
    'T2': 50,
    'S1': 40,
    'Q2': 30,
    'T1': 20,
    'Q1': 10,
}


def _label_from_colname(colname: str) -> str:
    """
    Map column names to period labels used in _PERIOD_PRIORITY.
    Currently only special-cases 'A'.
    """
    return 'A' if colname == 'A' else colname


# ---------- Helpers for AnnPITValue using A + Q1..Q4 sum ----------
def full_year_from_quarters(row, pit, cutoff):
    """
    Derive a full-year figure by adding up the four quarters of a row.

    A result is only produced when every quarter Q1..Q4 has a value, a
    date and an OriginFP, and every quarter date lies inside the closed
    window [cutoff, pit]. Otherwise the "missing" triple is returned.

    Parameters
    ----------
    row : pd.Series or mapping with ``.get``
        Provides ``Q<i>``, ``Q<i>_Date`` and ``Q<i>_OriginFP`` for i = 1..4.
    pit : datetime-like
        Upper bound of the window (PIT Date of the row).
    cutoff : datetime-like
        Lower bound of the window.

    Returns
    -------
    (dt, val_sum, origin_fp) or (NaT, NaN, NaN)
        dt        : most recent of the four quarter dates
        val_sum   : Q1 + Q2 + Q3 + Q4
        origin_fp : largest OriginFP among the quarters (as int)
    """
    missing = (pd.NaT, np.nan, np.nan)
    quarter_values, quarter_dates, quarter_origins = [], [], []

    for number in range(1, 5):
        name = f'Q{number}'
        value = row.get(name, np.nan)
        raw_date = row.get(f'{name}_Date', pd.NaT)
        origin = row.get(f'{name}_OriginFP', np.nan)

        # One incomplete quarter invalidates the whole year.
        if any(pd.isna(item) for item in (value, raw_date, origin)):
            return missing

        # The quarter date must parse and fall inside [cutoff, pit].
        when = pd.to_datetime(raw_date, errors='coerce')
        if pd.isna(when) or not (cutoff <= when <= pit):
            return missing

        quarter_values.append(float(value))
        quarter_dates.append(when)
        quarter_origins.append(int(origin))

    return max(quarter_dates), float(np.nansum(quarter_values)), max(quarter_origins)


def pick_annpit_sum_with_origin(row):
    """
    Choose the AnnPITValue for one row from its annual value (A) and the
    sum of its four quarters (Q1..Q4).

    Both candidates must be dated within the 365 days up to the row's
    PIT Date. Selection order:
      1. A whose OriginFP equals the row's FiscalPeriod
      2. quarter sum with the same FiscalPeriod
      3. A from the previous FiscalPeriod
      4. quarter sum from the previous FiscalPeriod
      5. any remaining candidate, by (priority, date)

    Returns NaN when the PIT Date is missing or no candidate qualifies.
    A value of 0 is a legitimate result.
    """
    pit = row['PIT Date']
    if pd.isna(pit):
        return np.nan

    # Earliest acceptable candidate date.
    cutoff = pit - timedelta(days=365)

    # The row's fiscal period as int, or None when missing/unconvertible.
    fp = row.get('FiscalPeriod', np.nan)
    try:
        fp_int = None if pd.isna(fp) else int(fp)
    except Exception:
        fp_int = None

    # Each candidate: (label, period_priority, date, value, origin_fp)
    candidates = []

    # Annual value as reported.
    annual_value = row.get('A', np.nan)
    annual_date = row.get('A_Date', pd.NaT)
    annual_origin = row.get('A_OriginFP', np.nan)
    if pd.notna(annual_value) and pd.notna(annual_date) and not pd.isna(annual_origin):
        annual_date = pd.to_datetime(annual_date, errors='coerce')
        if pd.notna(annual_date) and (cutoff <= annual_date <= pit):
            candidates.append(
                ('A', _PERIOD_PRIORITY['A'], annual_date, float(annual_value), int(annual_origin))
            )

    # Full year rebuilt from the four quarters.
    quarters_date, quarters_sum, quarters_origin = full_year_from_quarters(row, pit, cutoff)
    if pd.notna(quarters_sum) and pd.notna(quarters_date) and not pd.isna(quarters_origin):
        candidates.append(
            ('Q4', _PERIOD_PRIORITY['Q4'], quarters_date, float(quarters_sum), int(quarters_origin))
        )

    if not candidates:
        return np.nan

    def relation(candidate):
        # How the candidate's origin year relates to the row's fiscal period.
        origin = candidate[4]
        if fp_int is None or origin is None:
            return 'unknown'
        if origin == fp_int:
            return 'same'
        if origin == fp_int - 1:
            return 'prior'
        return 'other'

    def by_date(candidate):
        return candidate[2]

    def by_priority_then_date(candidate):
        return (candidate[1], candidate[2])

    # Only candidates carrying an actual number take part (0 counts).
    usable = [c for c in candidates if not np.isnan(c[3])]

    # Tiers in strict order of preference.
    tiers = (
        ('A', 'same', by_date),
        ('Q4', 'same', by_priority_then_date),
        ('A', 'prior', by_date),
        ('Q4', 'prior', by_priority_then_date),
    )
    for label, wanted, sort_key in tiers:
        pool = [c for c in usable if c[0] == label and relation(c) == wanted]
        if pool:
            return max(pool, key=sort_key)[3]

    # Fallback: candidates with an 'other' or 'unknown' relation.
    if usable:
        return max(usable, key=by_priority_then_date)[3]

    # Defensive default, kept from the original implementation.
    return 0.0


# ============================ MAIN ============================
if 'special_encoded' in globals() and special_encoded is not None:
    # Driver of the annualization step. In order, it: filters frequencies,
    # attaches TrueValue and the as-of period values, derives AnnPITValue,
    # runs the QC checks, writes two export files and prints row accounting.

    # Remember the number of input rows for row-accounting
    input_rows = len(special_encoded)
    print(f"Input dataset contains {input_rows:,} rows before processing.\n")

    # Work on a copy so we do not mutate the original DataFrame
    working = special_encoded.copy()

    # Exclude certain frequencies (E, L, R, U) from further processing
    excl_mask = working['Frequency'].astype(str).str.upper().isin(['E', 'L', 'R', 'U'])
    excluded_rows = int(excl_mask.sum())
    working = working.loc[~excl_mask].copy()

    # Convert key columns to appropriate types
    # (errors='coerce': values that cannot be converted become NaT / NaN)
    working['PIT Date'] = pd.to_datetime(
        working['PIT Date'], format='%Y-%m-%d', errors='coerce'
    ).dt.floor('D')
    working['FiscalPeriod'] = pd.to_numeric(working['FiscalPeriod'], errors='coerce')
    working['Value']        = pd.to_numeric(working['Value'], errors='coerce')

    # Ensure some ID-like columns are strings.
    # Note: astype(str) also stringifies missing entries, so these columns
    # no longer contain NaN afterwards.
    for c in ['ID', 'HistCurrency', 'ItemCode', 'Frequency', 'Str_FiscalPrd']:
        if c in working.columns:
            working[c] = working[c].astype(str)

    # Parse Q/S/T numbers from Str_FiscalPrd (e.g. 'Q1Y23' -> QNUM=1);
    # labels that do not match the pattern yield NaN.
    working['QNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^Q([1-4])Y', expand=False),
        errors='coerce'
    )
    working['SNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^S([1-2])Y', expand=False),
        errors='coerce'
    )
    working['TNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^T([1-3])Y', expand=False),
        errors='coerce'
    )

    # Define all period/value and period/date column names
    # (Q1..Q4, S1..S2, T1..T3, A and the matching *_Date columns)
    period_vals = [f'Q{i}' for i in range(1, 5)] + \
                  [f'S{i}' for i in range(1, 3)] + \
                  [f'T{i}' for i in range(1, 4)] + ['A']
    period_dates = [f'{p}_Date' for p in [f'Q{i}' for i in range(1,5)] + \
                                       [f'S{i}' for i in range(1,3)] + \
                                       [f'T{i}' for i in range(1,4)]] + ['A_Date']

    # Ensure all period value columns exist (initialize if missing)
    for c in period_vals:
        if c not in working.columns:
            working[c] = np.nan

    # Ensure all period date columns exist (initialize if missing)
    for c in period_dates:
        if c not in working.columns:
            working[c] = pd.NaT

    # Keys used to identify time series in as-of joins.
    # FiscalPeriod is part of the key, so an as-of match always comes from
    # the row's own fiscal period.
    base_keys = ['ID', 'HistCurrency', 'ItemCode', 'FiscalPeriod']

    # -------------------------------------------------------------------------
    # 1) TrueValue from annuals: build reference "TrueValue" per ID/FiscalPeriod
    # -------------------------------------------------------------------------
    # For every (ID, FiscalPeriod, HistCurrency) keep the annual value with the
    # latest PIT Date (sort ascending, keep the last duplicate).
    mask_annual = working['Frequency'].isin(['A', 'B']) & working['Value'].notna()
    annual_src = (
        working.loc[mask_annual,
                    ['ID', 'FiscalPeriod', 'HistCurrency', 'PIT Date', 'Value']]
        .sort_values(['ID', 'FiscalPeriod', 'HistCurrency', 'PIT Date'])
        .drop_duplicates(['ID', 'FiscalPeriod', 'HistCurrency'], keep='last')
        .rename(columns={'Value': 'TrueValue', 'PIT Date': 'TrueValue_Date'})
    )
    # Merge TrueValue back on keys. annual_src holds at most one row per key,
    # so this left merge does not add rows.
    working = working.merge(
        annual_src,
        on=['ID', 'FiscalPeriod', 'HistCurrency'],
        how='left'
    )

    # -------------------------------------------------------------------------
    # 2) As-of mapping for each frequency (no prior-year / no forward-fill)
    # -------------------------------------------------------------------------

    # Annual (A/B) as-of
    src_A = working.loc[
        working['Frequency'].isin(['A', 'B']) & working['Value'].notna(),
        base_keys + ['PIT Date', 'Value']
    ].copy()
    vA, dA = asof_numpy(working, src_A, by_cols=base_keys)
    working['A'], working['A_Date'] = vA, dA
    # Origin fiscal period for A: the row's own FiscalPeriod wherever an
    # annual value was matched.
    working['A_OriginFP'] = np.where(
        working['A'].notna(), working['FiscalPeriod'], np.nan
    )

    # Quarterly (Q/C) as-of, by quarter number
    src_Q = working.loc[
        working['Frequency'].isin(['Q', 'C']) & working['QNUM'].notna(),
        base_keys + ['QNUM', 'PIT Date', 'Value']
    ].copy()
    for q in (1, 2, 3, 4):
        # Subset source to a specific quarter
        rv = src_Q[src_Q['QNUM'] == q].drop(columns=['QNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'Q{q}', f'Q{q}_Date'
        working[col], working[dcol] = v, d

        # Set OriginFP where we have a newly filled quarter
        # (only fills OriginFP entries that are still missing)
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        mask = working[col].notna() & working[ocol].isna()
        working.loc[mask, ocol] = working.loc[mask, 'FiscalPeriod']

    # Semiannual (S/F) as-of, by half-year number.
    # Note: S1/S2 end up in the export, but pick_annpit_sum_with_origin only
    # reads the A and Q1..Q4 columns.
    src_S = working.loc[
        working['Frequency'].isin(['S', 'F']) & working['SNUM'].notna(),
        base_keys + ['SNUM', 'PIT Date', 'Value']
    ].copy()
    for s in (1, 2):
        rv = src_S[src_S['SNUM'] == s].drop(columns=['SNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'S{s}', f'S{s}_Date'
        working[col], working[dcol] = v, d

        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        mask = working[col].notna() & working[ocol].isna()
        working.loc[mask, ocol] = working.loc[mask, 'FiscalPeriod']

    # Trimester (T/K) as-of, by term number.
    # Note: like S1/S2, the T1..T3 columns are exported but not used for
    # AnnPITValue.
    src_T = working.loc[
        working['Frequency'].isin(['T', 'K']) & working['TNUM'].notna(),
        base_keys + ['TNUM', 'PIT Date', 'Value']
    ].copy()
    for t in (1, 2, 3):
        rv = src_T[src_T['TNUM'] == t].drop(columns=['TNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'T{t}', f'T{t}_Date'
        working[col], working[dcol] = v, d

        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        mask = working[col].notna() & working[ocol].isna()
        working.loc[mask, ocol] = working.loc[mask, 'FiscalPeriod']

    # -------------------------------------------------------------------------
    # 3) Prepare labels and normalize dates (only as-of results, no ffill)
    # -------------------------------------------------------------------------
    working = working.sort_values(['ID', 'HistCurrency', 'FiscalPeriod', 'PIT Date'])

    value_labels  = period_vals
    date_labels   = period_dates
    # origin_labels is not referenced again in this cell.
    origin_labels = [f'{lbl}_OriginFP' for lbl in value_labels]

    # Ensure all date columns are valid datetimes at day precision
    for c in date_labels:
        if c in working.columns:
            working[c] = pd.to_datetime(working[c], errors='coerce').dt.floor('D')

    # -------------------------------------------------------------------------
    # 4) AnnPITValue with new logic (A + Q1..Q4 sum, zeros allowed)
    # -------------------------------------------------------------------------
    # Row-wise apply: pick_annpit_sum_with_origin is called once per row.
    working['AnnPITValue'] = working.apply(
        pick_annpit_sum_with_origin,
        axis=1
    )

    # -------------------------------------------------------------------------
    # 5) QC: Future-date check (period date > PIT Date)
    # -------------------------------------------------------------------------
    date_cols_all = [
        'A_Date',
        'Q1_Date', 'Q2_Date', 'Q3_Date', 'Q4_Date',
        'S1_Date', 'S2_Date',
        'T1_Date', 'T2_Date', 'T3_Date'
    ]
    # Only use date columns that actually exist
    present = [c for c in date_cols_all if c in working.columns]
    viol_counts = {}
    any_mask = pd.Series(False, index=working.index)

    # For each period date column, check if it's after PIT Date
    for c in present:
        m = (
            working[c].notna() &
            working['PIT Date'].notna() &
            (pd.to_datetime(working[c], errors='coerce') > working['PIT Date'])
        )
        # Count violations per column
        viol_counts[c] = int(m.sum())
        # Track rows with any violation across all period dates
        any_mask |= m

    total_future_viol = int(any_mask.sum())
    print("\n=== Future-date check (period dates > PIT Date) ===")
    print("Per-label violations:", viol_counts)
    print(f"Rows with ANY future-dated period value: {total_future_viol}")
    # Flag rows with at least one future-date violation (rows are flagged,
    # not dropped)
    working['HasFutureDateError'] = any_mask

    # -------------------------------------------------------------------------
    # 6) AnnPITValue_Pct and quality filter
    # -------------------------------------------------------------------------
    # Percentage of AnnPITValue relative to TrueValue (%);
    # NaN when either value is missing or TrueValue is 0.
    working['AnnPITValue_Pct'] = np.where(
        working['AnnPITValue'].notna() &
        working['TrueValue'].notna() &
        (working['TrueValue'] != 0),
        (working['AnnPITValue'] / working['TrueValue']) * 100,
        np.nan
    )

    # Summary BEFORE dropping outliers
    pre_stats = summarize_pct(working['AnnPITValue_Pct'])
    print("\n=== AnnPITValue_Pct summary — BEFORE quality drop ===")
    for k, v in pre_stats.items():
        print(f"{k:>20}: {v}")

    pct = working['AnnPITValue_Pct']
    # Flag infinities
    is_inf = np.isinf(pct)
    # Flag finite out-of-range values outside [25, 250]
    is_finite = np.isfinite(pct)
    out_of_range = is_finite & ((pct > 250) | (pct < 25))
    # Combined drop mask: infinities or out-of-range finite values.
    # Rows whose percentage is NaN match neither condition and are kept.
    to_drop_quality = is_inf | out_of_range

    # Count dropped rows due to quality rules
    dropped_quality_rows = int(to_drop_quality.sum())
    print(f"\nRows to drop due to AnnPITValue_Pct (±inf or >250 or <25): {dropped_quality_rows:,}")

    # Keep only rows that pass the quality filter
    working = working.loc[~to_drop_quality].copy()

    # Summary AFTER dropping outliers
    post_stats = summarize_pct(working['AnnPITValue_Pct'])
    print("\n=== AnnPITValue_Pct summary — AFTER quality drop ===")
    if post_stats:
        for k, v in post_stats.items():
            print(f"{k:>20}: {v}")
    else:
        print("No finite values remain after the quality drop.")

    # -------------------------------------------------------------------------
    # 7) Final columns and cleanup
    # -------------------------------------------------------------------------
    # Core columns that describe each row
    base_cols = [
        'ID', 'CompanyName', 'ImplCountry', 'CurrentCurrency', 'HistCurrency',
        'PIT Date', 'Frequency', 'UpdateCode', 'FiscalPeriod', 'FYE Month',
        'ItemCode', 'Value', 'Str_FiscalPrd'
    ]

    # Period-related columns (Dates and Values)
    freq_cols = []
    for i in range(1, 5):
        freq_cols += [f'Q{i}_Date', f'Q{i}']
    for i in range(1, 3):
        freq_cols += [f'S{i}_Date', f'S{i}']
    for i in range(1, 4):
        freq_cols += [f'T{i}_Date', f'T{i}']
    freq_cols += ['A_Date', 'A']

    # Columns we want to keep in the final output
    keep_cols = (
        [c for c in base_cols if c in working.columns] +
        ['TrueValue', 'AnnPITValue', 'AnnPITValue_Pct', 'HasFutureDateError'] +
        [c for c in freq_cols if c in working.columns]
    )

    # Drop helper columns such as OriginFP and intermediate numeric helpers
    drop_cols = [c for c in working.columns
                 if c.endswith('_OriginFP') or c in ['QNUM', 'SNUM', 'TNUM', 'TrueValue_Date']]
    working.drop(columns=drop_cols, inplace=True, errors='ignore')

    # Reorder to the final column set (any column not listed in keep_cols is
    # left out of special_processed)
    special_processed = working.reindex(columns=keep_cols)

    # -------------------------------------------------------------------------
    # 8) Save outputs (requires Temp_file_path_DP and base_output_filename)
    # -------------------------------------------------------------------------
    # Both variables must be defined in a previous setup cell
    assert 'Temp_file_path_DP' in globals(), "Temp_file_path_DP not found."
    assert 'base_output_filename' in globals(), "base_output_filename not found (set in Cell 0)."

    # Full export path and write to pipe-delimited text
    out_full = os.path.join(Temp_file_path_DP, f"{base_output_filename}.txt")
    special_processed.to_csv(out_full, sep='|', index=False)
    print("\nSaved full:", out_full)

    # Subset export with a small selection of columns
    subset_cols = ["ID", "PIT Date", "CompanyName", "HistCurrency", "FiscalPeriod", "AnnPITValue"]
    subset_cols_existing = [col for col in subset_cols if col in special_processed.columns]
    subset_df = special_processed[subset_cols_existing].copy()
    out_subset = os.path.join(Temp_file_path_DP, f"{base_output_filename}_subset.txt")
    subset_df.to_csv(out_subset, sep='|', index=False)
    print("Saved subset:", out_subset)
    del subset_df  # free some memory

    # -------------------------------------------------------------------------
    # 9) Row-accounting overview
    # -------------------------------------------------------------------------
    output_rows = len(special_processed)
    print("\n=== Row Accounting ===")
    print(f"Input rows:                     {input_rows:,}")
    print(f"Excluded by Frequency (E/L/R/U):{excluded_rows:,}")
    print(f"Dropped by quality (Pct rules): {dropped_quality_rows:,}")
    print(f"Output rows (final):            {output_rows:,}")

    # Sanity check: excluded + dropped + final should equal original
    check_total = excluded_rows + dropped_quality_rows + output_rows
    print(f"Check: excluded + dropped + output = {check_total:,}")
    if check_total == input_rows:
        print("Row counts reconcile exactly.")
    else:
        print(f"Mismatch of {input_rows - check_total:+,} rows.")

    # Trigger garbage collection (gc is imported in the environment-setup cell)
    gc.collect()

else:
    # Nothing to process: special_encoded is not defined or is None
    print("special_encoded not found or None; skipping.")

Input dataset contains 1,956,042 rows before processing.


=== Future-date check (period dates > PIT Date) ===
Per-label violations: {'A_Date': 0, 'Q1_Date': 0, 'Q2_Date': 0, 'Q3_Date': 0, 'Q4_Date': 0, 'S1_Date': 0, 'S2_Date': 0, 'T1_Date': 0, 'T2_Date': 0, 'T3_Date': 0}
Rows with ANY future-dated period value: 0

=== AnnPITValue_Pct summary — BEFORE quality drop ===
         finite_rows: 1338239
                mean: 23398.703863129445
              median: 100.0
winsorized_mean_1pct: 99.66793694815519
                 p10: 100.0
                 p20: 100.0
                 p30: 100.0
                 p40: 100.0
                 p50: 100.0
                 p60: 100.0
                 p70: 100.0
                 p80: 100.0
                 p90: 100.0

Rows to drop due to AnnPITValue_Pct (±inf or >250 or <25): 18,321

=== AnnPITValue_Pct summary — AFTER quality drop ===
         finite_rows: 1319918
                mean: 100.04664907447778
              median: 100.0
winsorized_mean_1p

### Special 2

#### Set Index

In [263]:
# =============================================================================
# SELECT A SINGLE SPECIAL_* ITEM AND PREPARE PATHS
# =============================================================================
# This cell:
#   1. Chooses which Special_* item (from special_vars) should be processed.
#   2. Validates that special_vars and Temp_file_path_DP are available.
#   3. Builds the input file path for the selected "work_subset_<item>.txt".
#   4. Sets a base_output_filename for downstream output files.
#   5. Ensures the data-preparation temp directory exists.
#
# Usage:
#   - Adjust `special_index` to run a different Special_* dataset (e.g., 2, 3, 10 ...).
#   - Assumes `special_vars` was created in the categorization step and
#     `Temp_file_path_DP` was defined in the environment setup.

# === Select which Special_* item to run ===
special_index = 2  # Change this to run another dataset, e.g. 10

# special_vars should look like: {'Special_1': 'SomeItem', 'Special_2': 'OtherItem', ...}
assert 'special_vars' in globals(), "special_vars dict not found in globals()."

# Build the key for the chosen index and find the corresponding item name.
# Assuming special_vars is a dict, .get() yields None for an unknown key, so
# the assert below rejects both a missing key and an empty item name.
item_key = f"Special_{special_index}"
target_item_name = special_vars.get(item_key)
assert target_item_name, f"{item_key} not found in special_vars."

print(f"Selected: {item_key}  ->  ItemName: '{target_item_name}'")

# === Paths (reusing your globals) ===
assert 'Temp_file_path_DP' in globals(), "Temp_file_path_DP not found."

# Input file for this item (produced by previous merging steps)
file_name = f"work_subset_{target_item_name}.txt"
file_path = os.path.join(Temp_file_path_DP, file_name)

# Base name for output files created by the "Special" pipeline
# (the annualization cell appends ".txt" and "_subset.txt" to it)
base_output_filename = f"Special_{target_item_name}_complete"

# Make sure the output directory exists (created with parents if necessary)
Path(Temp_file_path_DP).mkdir(parents=True, exist_ok=True)


Selected: Special_2  ->  ItemName: 'Cash_Dividends_Paid___Total'


#### Import relevant data



In [264]:
# =============================================================================
# LOAD THE FULL DATASET FOR THE SELECTED SPECIAL ITEM
# =============================================================================
# This cell:
#   1. Uses `target_item_name` and `file_path` (defined in the previous cell)
#      to load the corresponding work_subset file.
#   2. Imports the file using `import_file_to_dataframe`.
#   3. Performs safety checks for existence and emptiness.
#   4. Shows a preview of the loaded dataset.
#   5. Falls back to an empty DataFrame if loading fails.
#   6. Runs garbage collection afterwards.
# =============================================================================

print(f"\nImporting full dataset for Item: '{target_item_name}' ...")

if not os.path.exists(file_path):
    # Nothing on disk for this item: continue with an empty frame.
    print(f"File not found: {file_path}")
    special_raw = pd.DataFrame()
else:
    special_raw = import_file_to_dataframe(file_path)

    if special_raw is None or special_raw.empty:
        # The loader returned nothing usable; normalise to an empty frame.
        print("Dataset appears empty or could not be loaded.")
        special_raw = pd.DataFrame()
    else:
        print(f"Full dataset loaded successfully: {len(special_raw):,} rows total.")
        try:
            # Rich preview of the first rows.
            display(special_raw.head())
        except Exception:
            # Plain-text preview when display() is unavailable or fails.
            print(special_raw.head().to_string(index=False))

# Collect garbage left over from the import.
gc.collect()



Importing full dataset for Item: 'Cash_Dividends_Paid___Total' ...
Full dataset loaded successfully: 1,787,365 rows total.


Unnamed: 0,ID,CompanyName,ImplCountry,CurrentCurrency,HistCurrency,PIT Date,Frequency,UpdateCode,FiscalPeriod,FYE Month,ItemCode,Value
0,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1992,December,4551,39.696982
1,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1993,December,4551,9.838
2,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1994,December,4551,7.402716
3,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1996-05-03,A,3,1995,December,4551,0.0
4,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1998-07-03,A,3,1996,December,4551,0.0


0

#### Encode Frequency Code (Check of output required!)

In [265]:
# =============================================================================
# SUMMARY
# =============================================================================
# (unchanged documentation)
# ...

def last2(n):
    """Return the last two digits of ``n`` as a zero-padded string.

    ``n`` is truncated to an integer first (1992 -> "92", 5 -> "05").
    Missing values (anything ``pd.isna`` reports as missing) give ``None``.
    """
    if pd.isna(n):
        return None
    return f"{int(n):04d}"[-2:]


def add_str_fiscalprd(df):
    """Add a ``Str_FiscalPrd`` label and rewrite ``FiscalPeriod`` as a year.

    The numeric ``FiscalPeriod`` is decoded according to ``Frequency``
    (upper-cased first):

    * A, B          -> ``Y<yy>``            with year = FiscalPeriod
    * C, Q, E, R    -> ``Q<1-4>Y<yy>``      with year = FiscalPeriod // 4
    * F, S          -> ``S<1-2>Y<yy>``      with year = FiscalPeriod // 2
    * K, T, L, U    -> ``T<1-3>Y<yy>``      with year = FiscalPeriod // 3

    where the sub-period number is ``FiscalPeriod % k + 1`` and ``<yy>`` is
    the last two digits of the year.  Rows with any other frequency keep a
    missing label.

    A four-digit year is then rebuilt from ``<yy>`` (80-99 -> 19xx,
    00-79 -> 20xx) and written back to ``FiscalPeriod``.  For annual rows the
    rebuilt year is compared with the original value and differences are
    printed/displayed.

    NOTE(review): the two-digit round trip cannot represent years outside
    1980-2079.  Such annual rows are reported by the discrepancy check, but
    ``FiscalPeriod`` is still overwritten with the rebuilt year.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``Frequency`` and ``FiscalPeriod``; ``ID`` is needed
        only when discrepancies are displayed.  The input is not modified.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` with ``Str_FiscalPrd`` added and ``FiscalPeriod``
        replaced by the rebuilt year (missing where no label was produced).
    """
    df = df.copy()
    df["Frequency"] = df["Frequency"].str.upper().fillna("")
    df['Original_FiscalPeriod'] = df['FiscalPeriod']

    fp = pd.to_numeric(df["FiscalPeriod"], errors="coerce")

    # Start from an object column: the labels are strings, and a float64
    # placeholder would (a) trigger pandas' incompatible-dtype warning/error
    # on assignment and (b) break the `.str` accessor below whenever no row
    # matches any frequency group.
    df["Str_FiscalPrd"] = np.full(len(df), np.nan, dtype=object)

    # (label prefix, frequency codes, sub-periods per year)
    sub_annual = (
        ("Q", ["C", "Q", "E", "R"], 4),
        ("S", ["F", "S"], 2),
        ("T", ["K", "T", "L", "U"], 3),
    )
    for prefix, codes, per_year in sub_annual:
        mask = df["Frequency"].isin(codes)
        part = ((fp % per_year) + 1).where(mask)
        year = (fp // per_year).where(mask).apply(last2)
        df.loc[mask, "Str_FiscalPrd"] = (
            prefix + part.astype("Int64").astype(str) + "Y" + year.fillna('')
        )

    m_AB = df["Frequency"].isin(["A", "B"])
    ab_year = fp.where(m_AB).apply(last2)
    df.loc[m_AB, "Str_FiscalPrd"] = "Y" + ab_year.fillna('')

    # Rebuild a four-digit year from the two digits stored in the label.
    year_part = df['Str_FiscalPrd'].str.extract(r'Y(\d{2})', expand=False)
    year_numeric = pd.to_numeric(year_part, errors='coerce')

    # Pivot at 80: 80-99 -> 19xx, 00-79 -> 20xx.  Missing years compare as
    # False here and stay missing after the addition.
    century = np.where(year_numeric >= 80, 1900, 2000)
    df['ImplFiscPer_Calculated'] = year_numeric + century

    # Sanity check on annual rows: rebuilt year must equal the original
    # FiscalPeriod (or both must be missing).
    annual_rows_for_check = df[m_AB].copy()
    original_annual = pd.to_numeric(
        annual_rows_for_check['Original_FiscalPeriod'], errors='coerce'
    )
    calculated_annual = annual_rows_for_check['ImplFiscPer_Calculated']
    discrepancy_mask_annual = ~(
        (calculated_annual == original_annual) |
        (calculated_annual.isna() & original_annual.isna())
    )

    discrepancy_rows = annual_rows_for_check[discrepancy_mask_annual].copy()

    if not discrepancy_rows.empty:
        print("\nDiscrepancies found between original FiscalPeriod and calculated ImplFiscPer for Annual (A, B) frequencies:")
        display(
            discrepancy_rows[
                ['ID', 'Frequency', 'Original_FiscalPeriod', 'Str_FiscalPrd', 'ImplFiscPer_Calculated']
            ].head()
        )
        print(f"Total discrepancies found for Annual frequencies: {len(discrepancy_rows)}")
    else:
        print("\nNo discrepancies found between original FiscalPeriod and calculated ImplFiscPer for Annual (A, B) frequencies.")

    # FiscalPeriod now holds the calendar-style year for every frequency.
    df['FiscalPeriod'] = df['ImplFiscPer_Calculated']
    df.drop(columns=['Original_FiscalPeriod', 'ImplFiscPer_Calculated'], inplace=True)

    return df


# =============================================================================
# Driver: encode special_raw when it has been loaded and contains rows
# =============================================================================
if 'special_raw' not in globals() or special_raw is None or special_raw.empty:
    # Nothing to encode; later cells check special_encoded against None.
    print("special_raw not found or empty. Cannot perform encoding.")
    special_encoded = None
else:
    print(f"Applying encoding to Special dataset for '{target_item_name}' ...")
    special_encoded = add_str_fiscalprd(special_raw)
    display(special_encoded.head())


Applying encoding to Special dataset for 'Cash_Dividends_Paid___Total' ...


  df.loc[m_quarter, "Str_FiscalPrd"] = (



No discrepancies found between original FiscalPeriod and calculated ImplFiscPer for Annual (A, B) frequencies.


Unnamed: 0,ID,CompanyName,ImplCountry,CurrentCurrency,HistCurrency,PIT Date,Frequency,UpdateCode,FiscalPeriod,FYE Month,ItemCode,Value,Str_FiscalPrd
0,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1992,December,4551,39.696982,Y92
1,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1993,December,4551,9.838,Y93
2,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1994,December,4551,7.402716,Y94
3,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1996-05-03,A,3,1995,December,4551,0.0,Y95
4,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1998-07-03,A,3,1996,December,4551,0.0,Y96


#### Annualize data with most recent information (Check of output required!)

In [266]:
# @title
# =============================================================================
# SUMMARY
# =============================================================================
# This script:
#   - Implements a fast "as-of join" between two DataFrames based on PIT dates
#     and key columns (asof_numpy).
#   - Provides helpers for percentile summaries and winsorized statistics.
#   - Builds annualized "AnnPITValue" values from:
#       * true annual data (A/B frequency) and
#       * sums of quarterly data (Q1..Q4) when available.
#   - Performs various quality checks (future-dated values, extreme percentages).
#   - Produces a processed "special_processed" DataFrame and saves:
#       * a full export and
#       * a subset export.
#   - Prints a row-accounting overview so drops and exclusions are transparent.
# =============================================================================


# ---------- Helper: fast as-of (right.PIT <= left.PIT) ----------
def _key(fr, cols):
    """
    Build a combined string key from multiple columns by concatenating them
    with '||' to use as a group key.
    """
    # Convert all key columns to string and join them row-wise with '||'
    return fr[cols].astype(str).agg('||'.join, axis=1)


def asof_numpy(left_df: pd.DataFrame, right_df: pd.DataFrame, by_cols: list[str]):
    """
    For each row in left_df, find the latest (as-of) Value from right_df
    with matching by_cols and right_df['PIT Date'] <= left_df['PIT Date'].

    Returns
    -------
    out_vals : np.ndarray
        Array of float values (same length as left_df) containing the matched
        values from right_df (or NaN if none found).
    out_dates : np.ndarray
        Array of datetime64 values containing the matched dates (or NaT).
    """
    # Initialize output arrays with NaN/NaT
    out_vals  = np.full(len(left_df), np.nan, dtype='float64')
    out_dates = np.full(len(left_df), 'NaT', dtype='datetime64[ns]')

    # Required columns in left/right for the as-of join
    left_req  = by_cols + ['PIT Date']
    right_req = by_cols + ['PIT Date', 'Value']

    # Mask to filter rows with all required fields present
    lmask = left_df[left_req].notna().all(axis=1)
    rmask = right_df[right_req].notna().all(axis=1)

    # If no valid rows on either side, return empty outputs
    if not lmask.any() or not rmask.any():
        return out_vals, out_dates

    # Work on copies of the filtered frames
    l = left_df.loc[lmask, left_req].copy()
    r = right_df.loc[rmask, right_req].copy()

    # Normalize PIT dates to daily granularity
    l['PIT Date'] = pd.to_datetime(l['PIT Date'], errors='coerce').dt.floor('D')
    r['PIT Date'] = pd.to_datetime(r['PIT Date'], errors='coerce').dt.floor('D')

    # Build grouping keys on both sides
    l['__k'] = _key(l, by_cols)
    r['__k'] = _key(r, by_cols)

    # Sort right side by key and PIT Date so we can binary-search later
    r = r.sort_values(['__k', 'PIT Date']).reset_index(drop=True)

    # Extract numpy arrays for fast vectorized operations
    rk   = r['__k'].to_numpy()
    rdt  = r['PIT Date'].to_numpy()
    rval = r['Value'].to_numpy()

    # Determine contiguous slices of rows for each unique key in right_df
    uniq, first = np.unique(rk, return_index=True)
    slices = {}
    for i, k in enumerate(uniq):
        s = first[i]                         # start index for this key
        e = first[i + 1] if i + 1 < len(first) else len(r)  # end index
        slices[k] = (rdt[s:e], rval[s:e])    # store date and value slices

    # Original indices of left rows (to write back results correctly)
    l_idx = l.index.to_numpy()
    # Keys and dates of left rows
    lk    = l['__k'].to_numpy()
    ldt   = l['PIT Date'].to_numpy()

    # Sort left rows by key (stable sort) for block processing
    order = np.argsort(lk, kind='mergesort')
    sk, sd, sp = lk[order], ldt[order], l_idx[order]

    # Iterate over blocks of the same key in left_df
    i = 0
    n = len(sk)
    while i < n:
        k = sk[i]  # current key
        j = i + 1
        # Find the end of this key's block
        while j < n and sk[j] == k:
            j += 1

        # Block of PIT dates and corresponding positions (indices) for this key
        block_dates = sd[i:j]
        block_pos   = sp[i:j]

        # Only process if the key exists in the right-hand slices
        if k in slices:
            r_dates, r_vals = slices[k]

            # For each left PIT date, find index of right PIT <= left PIT
            pos   = np.searchsorted(r_dates, block_dates, side='right') - 1
            valid = pos >= 0  # only those with at least one right date <= left date

            # Write results back to output arrays
            if np.any(valid):
                out_vals[block_pos[valid]]  = r_vals[pos[valid]]
                out_dates[block_pos[valid]] = r_dates[pos[valid]]

        # Move to the next block
        i = j

    return out_vals, out_dates


# ---------- Small helpers ----------
def pctile(s, q):
    """
    Quantile ``q`` of ``s`` with linear interpolation.

    Any exception raised while computing the quantile is swallowed and NaN
    is returned instead, so callers can build summaries without guarding
    each call.
    """
    try:
        result = s.quantile(q, interpolation='linear')
    except Exception:
        result = np.nan
    return result


def summarize_pct(series: pd.Series):
    """
    Summarise a numeric series: count, mean, median, 1%-winsorized mean and
    the deciles p10..p90.

    Infinite and missing entries are ignored.  An empty dict is returned
    when nothing finite remains.
    """
    finite = series.replace([np.inf, -np.inf], np.nan).dropna()
    if finite.empty:
        return {}

    # winsorize() needs a writable array, hence the explicit copy.
    clipped = winsorize(finite.to_numpy().copy(), limits=[0.01, 0.01])

    stats = {
        "finite_rows": len(finite),
        "mean": finite.mean(),
        "median": finite.median(),
        "winsorized_mean_1pct": clipped.mean(),
    }
    # Deciles p10 .. p90, in ascending order.
    for decile in range(10, 100, 10):
        stats[f"p{decile}"] = pctile(finite, decile / 100)
    return stats


# ---------- Period prioritization ----------
# Priority ranking for period labels when deciding between multiple candidates
_PERIOD_PRIORITY = {
    'A': 100,  # Full annual has highest priority
    'Q4': 90,
    'T3': 80,
    'S2': 70,
    'Q3': 60,
    'T2': 50,
    'S1': 40,
    'Q2': 30,
    'T1': 20,
    'Q1': 10,
}


def _label_from_colname(colname: str) -> str:
    """
    Map column names to period labels used in _PERIOD_PRIORITY.
    Currently only special-cases 'A'.
    """
    return 'A' if colname == 'A' else colname


# ---------- Helpers for AnnPITValue using A + Q1..Q4 sum ----------
def full_year_from_quarters(row, pit, cutoff):
    """
    Combine Q1..Q4 of one row into a single full-year candidate.

    A candidate is produced only when every quarter has a value, a date and
    an OriginFP, and every quarter date lies inside [cutoff, pit].  As soon
    as one quarter fails, the "missing" triple is returned.

    Parameters
    ----------
    row : pd.Series
        Row providing 'Q<i>', 'Q<i>_Date' and 'Q<i>_OriginFP' via ``.get``.
    pit : datetime-like
        Upper bound for quarter dates (the row's PIT Date).
    cutoff : datetime-like
        Lower bound for quarter dates.

    Returns
    -------
    (dt, val_sum, origin_fp) or (NaT, NaN, NaN)
        dt        : most recent of the four quarter dates
        val_sum   : Q1 + Q2 + Q3 + Q4
        origin_fp : largest OriginFP of the four quarters
    """
    missing = (pd.NaT, np.nan, np.nan)
    quarters = []  # one (value, date, origin_fp) triple per accepted quarter

    for number in range(1, 5):
        name = f'Q{number}'
        value = row.get(name, np.nan)
        when = row.get(f'{name}_Date', pd.NaT)
        origin = row.get(f'{name}_OriginFP', np.nan)

        # All three pieces must be present for this quarter.
        if pd.isna(value) or pd.isna(when) or pd.isna(origin):
            return missing

        # The date must parse and fall inside the allowed window.
        when = pd.to_datetime(when, errors='coerce')
        if pd.isna(when) or not (cutoff <= when <= pit):
            return missing

        quarters.append((float(value), when, int(origin)))

    values, dates, origins = zip(*quarters)
    return max(dates), float(np.nansum(values)), max(origins)


def pick_annpit_sum_with_origin(row):
    """
    Choose the annualized value (AnnPITValue) for one row.

    Two candidates are considered, each only if its date lies within the
    365 days up to and including the row's PIT Date:

      * 'A'  - the annual value stored in the row's 'A' columns
      * 'Q4' - the sum of Q1..Q4 from full_year_from_quarters()

    Selection order: same-year A, same-year Q4 sum, prior-year A,
    prior-year Q4 sum, then any remaining candidate by (priority, date).
    "Same"/"prior" compare the candidate's OriginFP with the row's
    FiscalPeriod.  Returns NaN when PIT Date is missing or no candidate
    exists; a value of 0 is a legitimate result.
    """
    pit = row['PIT Date']
    if pd.isna(pit):
        return np.nan

    # Oldest date a candidate may carry.
    cutoff = pit - timedelta(days=365)

    # Row's fiscal year as int, or None when missing / not convertible.
    raw_fp = row.get('FiscalPeriod', np.nan)
    try:
        fiscal_year = None if pd.isna(raw_fp) else int(raw_fp)
    except Exception:
        fiscal_year = None

    # Candidate tuples: (label, period_priority, date, value, origin_fp)
    pool = []

    # Annual candidate.
    annual_val = row.get('A', np.nan)
    annual_dt = row.get('A_Date', pd.NaT)
    annual_fp = row.get('A_OriginFP', np.nan)
    if pd.notna(annual_val) and pd.notna(annual_dt) and not pd.isna(annual_fp):
        annual_dt = pd.to_datetime(annual_dt, errors='coerce')
        if pd.notna(annual_dt) and (cutoff <= annual_dt <= pit):
            pool.append(
                ('A', _PERIOD_PRIORITY['A'], annual_dt, float(annual_val), int(annual_fp))
            )

    # Full-year candidate built from the four quarters.
    quarters_dt, quarters_sum, quarters_fp = full_year_from_quarters(row, pit, cutoff)
    if pd.notna(quarters_sum) and pd.notna(quarters_dt) and not pd.isna(quarters_fp):
        pool.append(
            ('Q4', _PERIOD_PRIORITY['Q4'], quarters_dt, float(quarters_sum), int(quarters_fp))
        )

    if not pool:
        return np.nan

    def relation(candidate):
        """Classify the candidate's origin year against the row's year."""
        origin = candidate[4]
        if fiscal_year is None or origin is None:
            return 'unknown'
        offset = fiscal_year - origin
        if offset == 0:
            return 'same'
        if offset == 1:
            return 'prior'
        return 'other'

    def by_date(candidate):
        return candidate[2]

    def by_priority_then_date(candidate):
        return (candidate[1], candidate[2])

    # Preference tiers, checked top to bottom: (label, relation, ranking key)
    tiers = (
        ('A', 'same', by_date),
        ('Q4', 'same', by_priority_then_date),
        ('A', 'prior', by_date),
        ('Q4', 'prior', by_priority_then_date),
    )
    for label, wanted, ranking in tiers:
        matches = [
            c for c in pool
            if c[0] == label and relation(c) == wanted and not np.isnan(c[3])
        ]
        if matches:
            return max(matches, key=ranking)[3]

    # Fallback: whatever candidate is left (relation 'other' or 'unknown').
    usable = [c for c in pool if not np.isnan(c[3])]
    if usable:
        return max(usable, key=by_priority_then_date)[3]

    # Not expected to be reached: every stored candidate has a non-NaN value.
    return 0.0


# ============================ MAIN ============================
# Main pipeline for the selected Special item. It runs only when the encoding
# cell left a non-None `special_encoded`, and it ends by writing two
# pipe-delimited files and printing a row-accounting summary.
if 'special_encoded' in globals() and special_encoded is not None:
    # Remember the number of input rows for row-accounting
    input_rows = len(special_encoded)
    print(f"Input dataset contains {input_rows:,} rows before processing.\n")

    # Work on a copy so we do not mutate the original DataFrame
    working = special_encoded.copy()

    # Exclude certain frequencies (E, L, R, U) from further processing.
    # The number of removed rows is reported in the row accounting (step 9).
    excl_mask = working['Frequency'].astype(str).str.upper().isin(['E', 'L', 'R', 'U'])
    excluded_rows = int(excl_mask.sum())
    working = working.loc[~excl_mask].copy()

    # Convert key columns to appropriate types.
    # errors='coerce' turns anything that does not parse into NaT / NaN.
    working['PIT Date'] = pd.to_datetime(
        working['PIT Date'], format='%Y-%m-%d', errors='coerce'
    ).dt.floor('D')
    working['FiscalPeriod'] = pd.to_numeric(working['FiscalPeriod'], errors='coerce')
    working['Value']        = pd.to_numeric(working['Value'], errors='coerce')

    # Ensure some ID-like columns are strings.
    # NOTE(review): astype(str) typically turns missing entries into literal
    # text such as 'nan', so these columns no longer report missing values
    # via notna() afterwards - confirm this is intended.
    for c in ['ID', 'HistCurrency', 'ItemCode', 'Frequency', 'Str_FiscalPrd']:
        if c in working.columns:
            working[c] = working[c].astype(str)

    # Parse Q/S/T numbers from Str_FiscalPrd (e.g. 'Q1Y23' -> QNUM=1).
    # Labels that do not match the pattern give NaN.
    working['QNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^Q([1-4])Y', expand=False),
        errors='coerce'
    )
    working['SNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^S([1-2])Y', expand=False),
        errors='coerce'
    )
    working['TNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^T([1-3])Y', expand=False),
        errors='coerce'
    )

    # Define all period/value and period/date column names:
    #   period_vals  = Q1..Q4, S1..S2, T1..T3, A
    #   period_dates = the same labels with a '_Date' suffix
    period_vals = [f'Q{i}' for i in range(1, 5)] + \
                  [f'S{i}' for i in range(1, 3)] + \
                  [f'T{i}' for i in range(1, 4)] + ['A']
    period_dates = [f'{p}_Date' for p in [f'Q{i}' for i in range(1,5)] + \
                                       [f'S{i}' for i in range(1,3)] + \
                                       [f'T{i}' for i in range(1,4)]] + ['A_Date']

    # Ensure all period value columns exist (initialize if missing).
    # They are placeholders only; step 2 overwrites every one of them.
    for c in period_vals:
        if c not in working.columns:
            working[c] = np.nan

    # Ensure all period date columns exist (initialize if missing)
    for c in period_dates:
        if c not in working.columns:
            working[c] = pd.NaT

    # Keys used to identify time series in as-of joins.
    # FiscalPeriod is part of the key, so an as-of match always comes from
    # the row's own fiscal period.
    base_keys = ['ID', 'HistCurrency', 'ItemCode', 'FiscalPeriod']

    # -------------------------------------------------------------------------
    # 1) TrueValue from annuals: build reference "TrueValue" per ID/FiscalPeriod
    # -------------------------------------------------------------------------
    # TrueValue is the annual (A/B) value with the latest PIT Date for each
    # ID / FiscalPeriod / HistCurrency combination (sort, then keep='last').
    mask_annual = working['Frequency'].isin(['A', 'B']) & working['Value'].notna()
    annual_src = (
        working.loc[mask_annual,
                    ['ID', 'FiscalPeriod', 'HistCurrency', 'PIT Date', 'Value']]
        .sort_values(['ID', 'FiscalPeriod', 'HistCurrency', 'PIT Date'])
        .drop_duplicates(['ID', 'FiscalPeriod', 'HistCurrency'], keep='last')
        .rename(columns={'Value': 'TrueValue', 'PIT Date': 'TrueValue_Date'})
    )
    # Merge TrueValue back on keys. annual_src was de-duplicated on the same
    # three key columns before this left merge, which is what the row
    # accounting in step 9 relies on. The merged frame also gets a fresh
    # 0..n-1 index.
    working = working.merge(
        annual_src,
        on=['ID', 'FiscalPeriod', 'HistCurrency'],
        how='left'
    )

    # -------------------------------------------------------------------------
    # 2) As-of mapping for each frequency (no prior-year / no forward-fill)
    # -------------------------------------------------------------------------
    # asof_numpy returns plain NumPy arrays that are assigned positionally,
    # so the row order of `working` must not change inside this step.

    # Annual (A/B) as-of
    src_A = working.loc[
        working['Frequency'].isin(['A', 'B']) & working['Value'].notna(),
        base_keys + ['PIT Date', 'Value']
    ].copy()
    vA, dA = asof_numpy(working, src_A, by_cols=base_keys)
    working['A'], working['A_Date'] = vA, dA
    # Origin fiscal period for A: the row's own FiscalPeriod wherever an
    # annual value was found.
    working['A_OriginFP'] = np.where(
        working['A'].notna(), working['FiscalPeriod'], np.nan
    )

    # Quarterly (Q/C) as-of, by quarter number
    src_Q = working.loc[
        working['Frequency'].isin(['Q', 'C']) & working['QNUM'].notna(),
        base_keys + ['QNUM', 'PIT Date', 'Value']
    ].copy()
    for q in (1, 2, 3, 4):
        # Subset source to a specific quarter
        rv = src_Q[src_Q['QNUM'] == q].drop(columns=['QNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'Q{q}', f'Q{q}_Date'
        working[col], working[dcol] = v, d

        # Set OriginFP where we have a newly filled quarter
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        mask = working[col].notna() & working[ocol].isna()
        working.loc[mask, ocol] = working.loc[mask, 'FiscalPeriod']

    # Semiannual (S/F) as-of, by half-year number
    src_S = working.loc[
        working['Frequency'].isin(['S', 'F']) & working['SNUM'].notna(),
        base_keys + ['SNUM', 'PIT Date', 'Value']
    ].copy()
    for s in (1, 2):
        rv = src_S[src_S['SNUM'] == s].drop(columns=['SNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'S{s}', f'S{s}_Date'
        working[col], working[dcol] = v, d

        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        mask = working[col].notna() & working[ocol].isna()
        working.loc[mask, ocol] = working.loc[mask, 'FiscalPeriod']

    # Trimester (T/K) as-of, by term number
    src_T = working.loc[
        working['Frequency'].isin(['T', 'K']) & working['TNUM'].notna(),
        base_keys + ['TNUM', 'PIT Date', 'Value']
    ].copy()
    for t in (1, 2, 3):
        rv = src_T[src_T['TNUM'] == t].drop(columns=['TNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'T{t}', f'T{t}_Date'
        working[col], working[dcol] = v, d

        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        mask = working[col].notna() & working[ocol].isna()
        working.loc[mask, ocol] = working.loc[mask, 'FiscalPeriod']

    # -------------------------------------------------------------------------
    # 3) Prepare labels and normalize dates (only as-of results, no ffill)
    # -------------------------------------------------------------------------
    # Re-ordering rows is safe from here on: all positional assignments of
    # as-of results are done.
    working = working.sort_values(['ID', 'HistCurrency', 'FiscalPeriod', 'PIT Date'])

    value_labels  = period_vals
    date_labels   = period_dates
    origin_labels = [f'{lbl}_OriginFP' for lbl in value_labels]

    # Ensure all date columns are valid datetimes at day precision
    for c in date_labels:
        if c in working.columns:
            working[c] = pd.to_datetime(working[c], errors='coerce').dt.floor('D')

    # -------------------------------------------------------------------------
    # 4) AnnPITValue with new logic (A + Q1..Q4 sum, zeros allowed)
    # -------------------------------------------------------------------------
    # Row-wise apply: pick_annpit_sum_with_origin is called once per row.
    working['AnnPITValue'] = working.apply(
        pick_annpit_sum_with_origin,
        axis=1
    )

    # -------------------------------------------------------------------------
    # 5) QC: Future-date check (period date > PIT Date)
    # -------------------------------------------------------------------------
    date_cols_all = [
        'A_Date',
        'Q1_Date', 'Q2_Date', 'Q3_Date', 'Q4_Date',
        'S1_Date', 'S2_Date',
        'T1_Date', 'T2_Date', 'T3_Date'
    ]
    # Only use date columns that actually exist
    present = [c for c in date_cols_all if c in working.columns]
    viol_counts = {}
    any_mask = pd.Series(False, index=working.index)

    # For each period date column, check if it's after PIT Date
    for c in present:
        m = (
            working[c].notna() &
            working['PIT Date'].notna() &
            (pd.to_datetime(working[c], errors='coerce') > working['PIT Date'])
        )
        # Count violations per column
        viol_counts[c] = int(m.sum())
        # Track rows with any violation across all period dates
        any_mask |= m

    total_future_viol = int(any_mask.sum())
    print("\n=== Future-date check (period dates > PIT Date) ===")
    print("Per-label violations:", viol_counts)
    print(f"Rows with ANY future-dated period value: {total_future_viol}")
    # Flag rows with at least one future-date violation (rows are only
    # flagged here, not removed)
    working['HasFutureDateError'] = any_mask

    # -------------------------------------------------------------------------
    # 6) AnnPITValue_Pct and quality filter
    # -------------------------------------------------------------------------
    # Percentage of AnnPITValue relative to TrueValue (%).
    # Left as NaN when either value is missing or TrueValue is 0.
    working['AnnPITValue_Pct'] = np.where(
        working['AnnPITValue'].notna() &
        working['TrueValue'].notna() &
        (working['TrueValue'] != 0),
        (working['AnnPITValue'] / working['TrueValue']) * 100,
        np.nan
    )

    # Summary BEFORE dropping outliers
    pre_stats = summarize_pct(working['AnnPITValue_Pct'])
    print("\n=== AnnPITValue_Pct summary — BEFORE quality drop ===")
    for k, v in pre_stats.items():
        print(f"{k:>20}: {v}")

    pct = working['AnnPITValue_Pct']
    # Flag infinities
    is_inf = np.isinf(pct)
    # Flag finite out-of-range values outside [25, 250]
    is_finite = np.isfinite(pct)
    out_of_range = is_finite & ((pct > 250) | (pct < 25))
    # Combined drop mask: infinities or out-of-range finite values.
    # Rows whose percentage is NaN match neither condition and are kept.
    to_drop_quality = is_inf | out_of_range

    # Count dropped rows due to quality rules
    dropped_quality_rows = int(to_drop_quality.sum())
    print(f"\nRows to drop due to AnnPITValue_Pct (±inf or >250 or <25): {dropped_quality_rows:,}")

    # Keep only rows that pass the quality filter
    working = working.loc[~to_drop_quality].copy()

    # Summary AFTER dropping outliers
    post_stats = summarize_pct(working['AnnPITValue_Pct'])
    print("\n=== AnnPITValue_Pct summary — AFTER quality drop ===")
    if post_stats:
        for k, v in post_stats.items():
            print(f"{k:>20}: {v}")
    else:
        print("No finite values remain after the quality drop.")

    # -------------------------------------------------------------------------
    # 7) Final columns and cleanup
    # -------------------------------------------------------------------------
    # Core columns that describe each row
    base_cols = [
        'ID', 'CompanyName', 'ImplCountry', 'CurrentCurrency', 'HistCurrency',
        'PIT Date', 'Frequency', 'UpdateCode', 'FiscalPeriod', 'FYE Month',
        'ItemCode', 'Value', 'Str_FiscalPrd'
    ]

    # Period-related columns (Dates and Values)
    freq_cols = []
    for i in range(1, 5):
        freq_cols += [f'Q{i}_Date', f'Q{i}']
    for i in range(1, 3):
        freq_cols += [f'S{i}_Date', f'S{i}']
    for i in range(1, 4):
        freq_cols += [f'T{i}_Date', f'T{i}']
    freq_cols += ['A_Date', 'A']

    # Columns we want to keep in the final output, in output order
    keep_cols = (
        [c for c in base_cols if c in working.columns] +
        ['TrueValue', 'AnnPITValue', 'AnnPITValue_Pct', 'HasFutureDateError'] +
        [c for c in freq_cols if c in working.columns]
    )

    # Drop helper columns such as OriginFP and intermediate numeric helpers
    drop_cols = [c for c in working.columns
                 if c.endswith('_OriginFP') or c in ['QNUM', 'SNUM', 'TNUM', 'TrueValue_Date']]
    working.drop(columns=drop_cols, inplace=True, errors='ignore')

    # Reorder to the final column set (columns not in keep_cols are left out)
    special_processed = working.reindex(columns=keep_cols)

    # -------------------------------------------------------------------------
    # 8) Save outputs (requires Temp_file_path_DP and base_output_filename)
    # -------------------------------------------------------------------------
    # Both variables must be defined in a previous setup cell.
    # NOTE: these checks run only after all processing above has finished,
    # and assert statements are skipped entirely when Python runs with -O.
    assert 'Temp_file_path_DP' in globals(), "Temp_file_path_DP not found."
    assert 'base_output_filename' in globals(), "base_output_filename not found (set in Cell 0)."

    # Full export path and write to pipe-delimited text
    out_full = os.path.join(Temp_file_path_DP, f"{base_output_filename}.txt")
    special_processed.to_csv(out_full, sep='|', index=False)
    print("\nSaved full:", out_full)

    # Subset export with a small selection of columns
    subset_cols = ["ID", "PIT Date", "CompanyName", "HistCurrency", "FiscalPeriod", "AnnPITValue"]
    subset_cols_existing = [col for col in subset_cols if col in special_processed.columns]
    subset_df = special_processed[subset_cols_existing].copy()
    out_subset = os.path.join(Temp_file_path_DP, f"{base_output_filename}_subset.txt")
    subset_df.to_csv(out_subset, sep='|', index=False)
    print("Saved subset:", out_subset)
    del subset_df  # free some memory

    # -------------------------------------------------------------------------
    # 9) Row-accounting overview
    # -------------------------------------------------------------------------
    output_rows = len(special_processed)
    print("\n=== Row Accounting ===")
    print(f"Input rows:                     {input_rows:,}")
    print(f"Excluded by Frequency (E/L/R/U):{excluded_rows:,}")
    print(f"Dropped by quality (Pct rules): {dropped_quality_rows:,}")
    print(f"Output rows (final):            {output_rows:,}")

    # Sanity check: excluded + dropped + final should equal original.
    # A mismatch means rows were gained or lost somewhere other than the two
    # counted filters.
    check_total = excluded_rows + dropped_quality_rows + output_rows
    print(f"Check: excluded + dropped + output = {check_total:,}")
    if check_total == input_rows:
        print("Row counts reconcile exactly.")
    else:
        print(f"Mismatch of {input_rows - check_total:+,} rows.")

    # Optional: trigger garbage collection (gc is imported in the
    # environment-setup cell at the top of the notebook)
    gc.collect()

else:
    # Nothing to process when special_encoded is not defined or is None
    print("special_encoded not found or None; skipping.")

Input dataset contains 1,787,365 rows before processing.


=== Future-date check (period dates > PIT Date) ===
Per-label violations: {'A_Date': 0, 'Q1_Date': 0, 'Q2_Date': 0, 'Q3_Date': 0, 'Q4_Date': 0, 'S1_Date': 0, 'S2_Date': 0, 'T1_Date': 0, 'T2_Date': 0, 'T3_Date': 0}
Rows with ANY future-dated period value: 0

=== AnnPITValue_Pct summary — BEFORE quality drop ===
         finite_rows: 693677
                mean: 24674.33980832419
              median: 100.0
winsorized_mean_1pct: 97.72495743899829
                 p10: 100.0
                 p20: 100.0
                 p30: 100.0
                 p40: 100.0
                 p50: 100.0
                 p60: 100.0
                 p70: 100.0
                 p80: 100.0
                 p90: 100.0

Rows to drop due to AnnPITValue_Pct (±inf or >250 or <25): 16,187

=== AnnPITValue_Pct summary — AFTER quality drop ===
         finite_rows: 677490
                mean: 100.10068114999407
              median: 100.0
winsorized_mean_1pct:

### Special 3

#### Set Index

In [267]:
# =============================================================================
# SELECT A SINGLE SPECIAL_* ITEM AND PREPARE PATHS
# =============================================================================
# This cell:
#   1. Chooses which Special_* item (from special_vars) should be processed.
#   2. Validates that special_vars and Temp_file_path_DP are available.
#   3. Builds the input file path for the selected "work_subset_<item>.txt".
#   4. Sets a base_output_filename for downstream output files.
#   5. Ensures the data-preparation temp directory exists.
#
# Usage:
#   - Adjust `special_index` to run a different Special_* dataset (e.g., 2, 3, 10 ...).
#   - Assumes `special_vars` was created in the categorization step and
#     `Temp_file_path_DP` was defined in the environment setup.

# === Select which Special_* item to run ===
# Every name assigned in this cell is a notebook global that the following
# cells read (target_item_name, file_path, base_output_filename, ...).
special_index = 3  # Change this to run another dataset, e.g. 10

# special_vars should look like: {'Special_1': 'SomeItem', 'Special_2': 'OtherItem', ...}
# NOTE: the assert-based checks in this cell are skipped when Python runs
# with -O; in a normal notebook session they stop the cell with the message.
assert 'special_vars' in globals(), "special_vars dict not found in globals()."

# Build the key for the chosen index and find the corresponding item name.
# .get() returns None for an unknown key, which the assert below rejects
# (an empty item name is rejected as well).
item_key = f"Special_{special_index}"
target_item_name = special_vars.get(item_key)
assert target_item_name, f"{item_key} not found in special_vars."

print(f"Selected: {item_key}  ->  ItemName: '{target_item_name}'")

# === Paths (reusing your globals) ===
assert 'Temp_file_path_DP' in globals(), "Temp_file_path_DP not found."

# Input file for this item (produced by previous merging steps)
file_name = f"work_subset_{target_item_name}.txt"
file_path = os.path.join(Temp_file_path_DP, file_name)

# Base name for output files created by the "Special" pipeline
# (the annualization cell appends ".txt" and "_subset.txt" to it)
base_output_filename = f"Special_{target_item_name}_complete"

# Make sure the output directory exists (no error if it already does)
Path(Temp_file_path_DP).mkdir(parents=True, exist_ok=True)


Selected: Special_3  ->  ItemName: 'Com_Pfd_Redeemed_Retired_Converted_Etc.'


#### Import relevant data



In [268]:
# =============================================================================
# LOAD THE FULL DATASET FOR THE SELECTED SPECIAL ITEM
# =============================================================================
# This cell:
#   1. Uses `target_item_name` and `file_path` (defined in the previous cell)
#      to load the corresponding work_subset file.
#   2. Imports the file using `import_file_to_dataframe`.
#   3. Performs safety checks for existence and emptiness.
#   4. Shows a preview of the loaded dataset.
#   5. Falls back to an empty DataFrame if loading fails.
#   6. Runs garbage collection afterwards.
# =============================================================================

print(f"\nImporting full dataset for Item: '{target_item_name}' ...")

if not os.path.exists(file_path):
    # Nothing on disk for this item: continue with an empty frame so that
    # later cells can test `special_raw.empty` instead of failing.
    print(f"File not found: {file_path}")
    special_raw = pd.DataFrame()
else:
    special_raw = import_file_to_dataframe(file_path)

    if special_raw is None or special_raw.empty:
        # The importer returned nothing usable; normalise to an empty frame.
        print("Dataset appears empty or could not be loaded.")
        special_raw = pd.DataFrame()
    else:
        print(f"Full dataset loaded successfully: {len(special_raw):,} rows total.")
        try:
            # Rich preview when display() is usable in this environment.
            display(special_raw.head())
        except Exception:
            # Plain-text preview of the same rows as a fallback.
            print(special_raw.head().to_string(index=False))

# Last expression of the cell: frees memory and echoes the collected count.
gc.collect()



Importing full dataset for Item: 'Com_Pfd_Redeemed_Retired_Converted_Etc.' ...
Full dataset loaded successfully: 1,447,068 rows total.


Unnamed: 0,ID,CompanyName,ImplCountry,CurrentCurrency,HistCurrency,PIT Date,Frequency,UpdateCode,FiscalPeriod,FYE Month,ItemCode,Value
0,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3.0,1992,December,4751,0.0
1,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3.0,1993,December,4751,0.0
2,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3.0,1994,December,4751,0.0
3,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1996-05-31,A,,1995,December,4751,0.0
4,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1998-07-03,A,3.0,1996,December,4751,0.0


0

#### Encode Frequency Code (Check of output required!)

In [269]:
# =============================================================================
# SUMMARY
# =============================================================================
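# This cell defines:
#   - last2(): formats a number as its last two digits (None if missing).
#   - add_str_fiscalprd(): derives a 'Str_FiscalPrd' label (Y.., Q.Y.., S.Y..,
#     T.Y..) from the Frequency and FiscalPeriod columns.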

def last2(n):
    """Return last two digits as zero-padded string, or None if NaN."""
    if pd.isna(n):
        return None
    return f"{int(n):04d}"[-2:]


def add_str_fiscalprd(df):
    """
    Decode the numeric FiscalPeriod into a period label and a calendar year.

    Works on a copy of `df`; the input frame is not modified.

    Steps
    -----
    1. Upper-case `Frequency` (missing values become "").
    2. Build `Str_FiscalPrd` depending on the frequency code:
         * A/B      -> "Y<yy>"          (yy = last two digits of FiscalPeriod)
         * C/Q/E/R  -> "Q<p>Y<yy>"      (p = FiscalPeriod % 4 + 1, year = FiscalPeriod // 4)
         * F/S      -> "S<p>Y<yy>"      (p = FiscalPeriod % 2 + 1, year = FiscalPeriod // 2)
         * K/T/L/U  -> "T<p>Y<yy>"      (p = FiscalPeriod % 3 + 1, year = FiscalPeriod // 3)
       Rows with any other frequency keep NaN.
    3. Rebuild a four-digit year from the two digits in the label
       (yy >= 80 -> 19yy, otherwise 20yy) and overwrite `FiscalPeriod` with it.
    4. For annual rows (A/B), print/display rows where the rebuilt year differs
       from the original FiscalPeriod.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns 'Frequency' (string-like) and 'FiscalPeriod'.
        'ID' is additionally required only when discrepancies are displayed.

    Returns
    -------
    pd.DataFrame
        Copy of `df` with the added column 'Str_FiscalPrd' and the rewritten
        'FiscalPeriod'.
    """
    df = df.copy()
    df["Frequency"] = df["Frequency"].str.upper().fillna("")
    df['Original_FiscalPeriod'] = df['FiscalPeriod']

    fp = pd.to_numeric(df["FiscalPeriod"], errors="coerce")

    m_quarter = df["Frequency"].isin(["C", "Q", "E", "R"])
    m_AB      = df["Frequency"].isin(["A", "B"])
    m_FS      = df["Frequency"].isin(["F", "S"])
    m_KTLU    = df["Frequency"].isin(["K", "T", "L", "U"])

    # The column holds string labels, so it must be object dtype from the start.
    # Initialising it with a bare np.nan creates a float64 column; assigning
    # strings into that raises pandas' "incompatible dtype" FutureWarning and
    # leaves a float column (on which `.str` fails) when no frequency matches.
    df["Str_FiscalPrd"] = pd.Series(np.nan, index=df.index, dtype="object")

    # Annual rows: the fiscal period already is the year.
    ab_year = fp.where(m_AB).apply(last2)
    df.loc[m_AB, "Str_FiscalPrd"] = "Y" + ab_year.fillna('')

    # Sub-annual rows: FiscalPeriod = year * periods_per_year + (period - 1).
    sub_annual = (
        (m_quarter, "Q", 4),
        (m_FS,      "S", 2),
        (m_KTLU,    "T", 3),
    )
    for mask, prefix, periods_per_year in sub_annual:
        part = ((fp % periods_per_year) + 1).where(mask)
        year = (fp // periods_per_year).where(mask).apply(last2)
        # Only rows selected by `mask` are written; the other rows of the
        # right-hand side are ignored by the aligned .loc assignment.
        df.loc[mask, "Str_FiscalPrd"] = (
            prefix + part.astype("Int64").astype(str) + "Y" + year.fillna('')
        )

    year_part = df['Str_FiscalPrd'].str.extract(r'Y(\d{2})', expand=False)
    year_numeric = pd.to_numeric(year_part, errors='coerce')

    def _to_full_year(x):
        # Century pivot: 80..99 -> 1980..1999, 00..79 -> 2000..2079.
        if pd.isna(x):
            return np.nan
        return int(f"19{int(x):02d}") if x >= 80 else int(f"20{int(x):02d}")

    df['ImplFiscPer_Calculated'] = year_numeric.apply(_to_full_year)

    # Consistency check for annual rows: rebuilt year vs. original FiscalPeriod
    # (two missing values count as equal).
    annual_rows_for_check = df[m_AB].copy()
    original_annual = pd.to_numeric(
        annual_rows_for_check['Original_FiscalPeriod'], errors='coerce'
    )
    discrepancy_mask_annual = ~(
        (annual_rows_for_check['ImplFiscPer_Calculated'] == original_annual) |
        (annual_rows_for_check['ImplFiscPer_Calculated'].isna() & original_annual.isna())
    )

    discrepancy_rows = annual_rows_for_check[discrepancy_mask_annual].copy()

    if not discrepancy_rows.empty:
        print("\nDiscrepancies found between original FiscalPeriod and calculated ImplFiscPer for Annual (A, B) frequencies:")
        display(
            discrepancy_rows[
                ['ID', 'Frequency', 'Original_FiscalPeriod', 'Str_FiscalPrd', 'ImplFiscPer_Calculated']
            ].head()
        )
        print(f"Total discrepancies found for Annual frequencies: {len(discrepancy_rows)}")
    else:
        print("\nNo discrepancies found between original FiscalPeriod and calculated ImplFiscPer for Annual (A, B) frequencies.")

    df['FiscalPeriod'] = df['ImplFiscPer_Calculated']
    df.drop(columns=['Original_FiscalPeriod', 'ImplFiscPer_Calculated'], inplace=True)

    return df


# =============================================================================
# Driver: encode special_raw, or record that nothing could be encoded
# =============================================================================
if 'special_raw' not in globals() or special_raw is None or special_raw.empty:
    # Nothing to encode: leave an explicit None so later cells can skip.
    print("special_raw not found or empty. Cannot perform encoding.")
    special_encoded = None
else:
    print(f"Applying encoding to Special dataset for '{target_item_name}' ...")
    special_encoded = add_str_fiscalprd(special_raw)
    display(special_encoded.head())


Applying encoding to Special dataset for 'Com_Pfd_Redeemed_Retired_Converted_Etc.' ...


  df.loc[m_quarter, "Str_FiscalPrd"] = (



No discrepancies found between original FiscalPeriod and calculated ImplFiscPer for Annual (A, B) frequencies.


Unnamed: 0,ID,CompanyName,ImplCountry,CurrentCurrency,HistCurrency,PIT Date,Frequency,UpdateCode,FiscalPeriod,FYE Month,ItemCode,Value,Str_FiscalPrd
0,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3.0,1992,December,4751,0.0,Y92
1,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3.0,1993,December,4751,0.0,Y93
2,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3.0,1994,December,4751,0.0,Y94
3,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1996-05-31,A,,1995,December,4751,0.0,Y95
4,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1998-07-03,A,3.0,1996,December,4751,0.0,Y96


#### Annualize data with most recent information (Check of output required!)

In [270]:
# @title
# =============================================================================
# SUMMARY
# =============================================================================
# This script:
#   - Implements a fast "as-of join" between two DataFrames based on PIT dates
#     and key columns (asof_numpy).
#   - Provides helpers for percentile summaries and winsorized statistics.
#   - Builds annualized "AnnPITValue" values from:
#       * true annual data (A/B frequency) and
#       * sums of quarterly data (Q1..Q4) when available.
#   - Performs various quality checks (future-dated values, extreme percentages).
#   - Produces a processed "special_processed" DataFrame and saves:
#       * a full export and
#       * a subset export.
#   - Prints a row-accounting overview so drops and exclusions are transparent.
# =============================================================================


# ---------- Helper: fast as-of (right.PIT <= left.PIT) ----------
def _key(fr, cols):
    """
    Build one composite string key per row.

    The columns in `cols` are cast to str and joined row-wise with '||', so
    rows that agree on every key column receive the same key.
    """
    return fr[cols].astype(str).agg('||'.join, axis=1)


def asof_numpy(left_df: pd.DataFrame, right_df: pd.DataFrame, by_cols: list[str]):
    """
    For each row in left_df, find the latest (as-of) Value from right_df
    with matching by_cols and right_df['PIT Date'] <= left_df['PIT Date'].

    Rows with a missing key column, a missing/unparseable 'PIT Date' or (on the
    right side) a missing 'Value' do not take part in the match. Dates are
    compared at day precision.

    Results are written back by row *position*, so left_df may carry any index
    (filtered, non-unique, non-integer).

    Parameters
    ----------
    left_df : pd.DataFrame
        Rows to fill; needs the columns in `by_cols` and 'PIT Date'.
    right_df : pd.DataFrame
        Source rows; needs the columns in `by_cols`, 'PIT Date' and 'Value'.
    by_cols : list[str]
        Columns that must match exactly between both sides.

    Returns
    -------
    out_vals : np.ndarray
        Array of float values (same length and order as left_df) containing the
        matched values from right_df (or NaN if none found).
    out_dates : np.ndarray
        Array of datetime64[ns] values containing the matched dates (or NaT).
    """
    # Initialize output arrays with NaN/NaT
    n_left = len(left_df)
    out_vals  = np.full(n_left, np.nan, dtype='float64')
    out_dates = np.full(n_left, 'NaT', dtype='datetime64[ns]')

    # Required columns in left/right for the as-of join
    left_req  = by_cols + ['PIT Date']
    right_req = by_cols + ['PIT Date', 'Value']

    # Boolean masks (as plain arrays) of rows with all required fields present
    lmask = left_df[left_req].notna().all(axis=1).to_numpy()
    rmask = right_df[right_req].notna().all(axis=1).to_numpy()

    # If no valid rows on either side, return empty outputs
    if not lmask.any() or not rmask.any():
        return out_vals, out_dates

    # Work on copies of the filtered frames; remember the POSITION of every
    # kept left row so results never depend on left_df's index labels.
    l = left_df.loc[lmask, left_req].copy()
    r = right_df.loc[rmask, right_req].copy()
    l_pos = np.flatnonzero(lmask)

    # Normalize PIT dates to daily granularity
    l['PIT Date'] = pd.to_datetime(l['PIT Date'], errors='coerce').dt.floor('D')
    r['PIT Date'] = pd.to_datetime(r['PIT Date'], errors='coerce').dt.floor('D')

    # Dates that could not be parsed are NaT now; exclude them so they cannot
    # take part in the binary search below.
    l_ok = l['PIT Date'].notna().to_numpy()
    r_ok = r['PIT Date'].notna().to_numpy()
    l, l_pos = l.loc[l_ok].copy(), l_pos[l_ok]
    r = r.loc[r_ok].copy()
    if l.empty or r.empty:
        return out_vals, out_dates

    # Build grouping keys on both sides
    l['__k'] = _key(l, by_cols)
    r['__k'] = _key(r, by_cols)

    # Sort right side by key and PIT Date so we can binary-search later
    r = r.sort_values(['__k', 'PIT Date']).reset_index(drop=True)

    # Extract numpy arrays for fast vectorized operations
    rk   = r['__k'].to_numpy()
    rdt  = r['PIT Date'].to_numpy()
    rval = r['Value'].to_numpy()

    # Each key occupies one contiguous run in the sorted right side:
    # [first occurrence, first occurrence + count)
    uniq, first, counts = np.unique(rk, return_index=True, return_counts=True)
    slices = {
        k: (rdt[s:s + c], rval[s:s + c])
        for k, s, c in zip(uniq, first, counts)
    }

    # Keys and dates of left rows
    lk  = l['__k'].to_numpy()
    ldt = l['PIT Date'].to_numpy()

    # Sort left rows by key (stable sort) for block processing
    order = np.argsort(lk, kind='mergesort')
    sk, sd, sp = lk[order], ldt[order], l_pos[order]

    # Iterate over blocks of the same key in left_df
    i = 0
    n = len(sk)
    while i < n:
        k = sk[i]  # current key
        j = i + 1
        # Find the end of this key's block
        while j < n and sk[j] == k:
            j += 1

        # Only process if the key exists in the right-hand slices
        if k in slices:
            r_dates, r_vals = slices[k]
            block_dates = sd[i:j]
            block_pos   = sp[i:j]

            # For each left PIT date, index of the last right PIT <= left PIT
            pos   = np.searchsorted(r_dates, block_dates, side='right') - 1
            valid = pos >= 0  # at least one right date <= left date

            # Write results back to their positions in left_df
            if np.any(valid):
                out_vals[block_pos[valid]]  = r_vals[pos[valid]]
                out_dates[block_pos[valid]] = r_dates[pos[valid]]

        # Move to the next block
        i = j

    return out_vals, out_dates


# ---------- Small helpers ----------
def pctile(s, q):
    """
    Return the q-quantile of `s` (linear interpolation).

    Any exception raised by the quantile call is swallowed and NaN is
    returned instead.
    """
    try:
        value = s.quantile(q, interpolation='linear')
    except Exception:
        value = np.nan
    return value


def summarize_pct(series: pd.Series):
    """
    Summarise a numeric series: count, mean, median, 1%-winsorized mean and
    the deciles p10..p90.

    +/-inf and missing values are removed first. An empty dict is returned
    when nothing finite remains.
    """
    finite = series.replace([np.inf, -np.inf], np.nan).dropna()
    if finite.empty:
        return {}

    # winsorize needs a writable array, hence the explicit copy
    clipped = winsorize(finite.to_numpy().copy(), limits=[0.01, 0.01])

    stats = {
        "finite_rows": len(finite),
        "mean": finite.mean(),
        "median": finite.median(),
        "winsorized_mean_1pct": clipped.mean(),
    }
    # Deciles p10 .. p90, in ascending order
    for decile in range(10, 100, 10):
        stats[f"p{decile}"] = pctile(finite, decile / 100)
    return stats


# ---------- Period prioritization ----------
# Priority ranking for period labels when deciding between multiple candidates
# Period labels listed from highest to lowest priority; the full annual value
# ('A') ranks first with 100 and every following label ranks 10 lower.
_PERIOD_PRIORITY = {
    label: 100 - 10 * rank
    for rank, label in enumerate(
        ('A', 'Q4', 'T3', 'S2', 'Q3', 'T2', 'S1', 'Q2', 'T1', 'Q1')
    )
}


def _label_from_colname(colname: str) -> str:
    """
    Translate a column name into the period label used by _PERIOD_PRIORITY.

    Only 'A' is handled explicitly; every other name is returned unchanged.
    """
    if colname == 'A':
        return 'A'
    return colname


# ---------- Helpers for AnnPITValue using A + Q1..Q4 sum ----------
def full_year_from_quarters(row, pit, cutoff):
    """
    Assemble a full-year value from the four quarterly entries of `row`.

    Every quarter Q1..Q4 must provide a value (`Q<i>`), a date (`Q<i>_Date`)
    and an origin fiscal period (`Q<i>_OriginFP`), and each date must lie in
    the closed window [cutoff, pit]. As soon as one quarter fails a check the
    function gives up.

    Parameters
    ----------
    row : mapping with a `.get` method (e.g. pd.Series)
        Row holding the quarterly columns.
    pit : datetime-like
        Upper bound of the date window (the row's PIT Date).
    cutoff : datetime-like
        Lower bound of the date window.

    Returns
    -------
    tuple
        (latest quarter date, sum of the four values, largest OriginFP),
        or (NaT, NaN, NaN) when the quarters are incomplete or out of window.
    """
    no_result = (pd.NaT, np.nan, np.nan)
    quarter_values = []
    quarter_dates = []
    quarter_origins = []

    for quarter in (1, 2, 3, 4):
        name = f'Q{quarter}'
        value = row.get(name, np.nan)
        stamp = row.get(f'{name}_Date', pd.NaT)
        origin = row.get(f'{name}_OriginFP', np.nan)

        # All three pieces must be present for this quarter
        if pd.isna(value) or pd.isna(stamp) or pd.isna(origin):
            return no_result

        # The date must parse and fall inside [cutoff, pit]
        stamp = pd.to_datetime(stamp, errors='coerce')
        if pd.isna(stamp) or not (cutoff <= stamp <= pit):
            return no_result

        quarter_values.append(float(value))
        quarter_dates.append(stamp)
        quarter_origins.append(int(origin))

    return (
        max(quarter_dates),
        float(np.nansum(quarter_values)),
        max(quarter_origins),
    )


def pick_annpit_sum_with_origin(row):
    """
    Choose the annualized point-in-time value (AnnPITValue) for one row.

    Two candidates are considered, each only if its date lies within the
    365 days up to and including the row's PIT Date:
      * 'A'  : the annual value (`A`, `A_Date`, `A_OriginFP`)
      * 'Q4' : the sum of Q1..Q4 from `full_year_from_quarters`

    Preference order (OriginFP compared with the row's FiscalPeriod):
      1. 'A' of the same year      2. 'Q4' of the same year
      3. 'A' of the prior year     4. 'Q4' of the prior year
      5. any remaining candidate, by (priority, date)

    Returns NaN when PIT Date is missing or no candidate exists. A value of
    zero is a legitimate result.
    """
    pit = row['PIT Date']
    if pd.isna(pit):
        return np.nan

    # Candidates older than this are ignored
    cutoff = pit - timedelta(days=365)

    # Row's fiscal period as int; None when missing or not convertible
    raw_fp = row.get('FiscalPeriod', np.nan)
    try:
        fp_int = None if pd.isna(raw_fp) else int(raw_fp)
    except Exception:
        fp_int = None

    # Each candidate: (label, period_priority, date, value, origin_fp)
    candidates = []

    # --- annual candidate ---
    annual_val = row.get('A', np.nan)
    annual_dt = row.get('A_Date', pd.NaT)
    annual_ofp = row.get('A_OriginFP', np.nan)
    if not (pd.isna(annual_val) or pd.isna(annual_dt) or pd.isna(annual_ofp)):
        annual_dt = pd.to_datetime(annual_dt, errors='coerce')
        if pd.notna(annual_dt) and (cutoff <= annual_dt <= pit):
            candidates.append(
                ('A', _PERIOD_PRIORITY['A'], annual_dt, float(annual_val), int(annual_ofp))
            )

    # --- quarterly-sum candidate ---
    sum_dt, sum_val, sum_fp = full_year_from_quarters(row, pit, cutoff)
    if not (pd.isna(sum_val) or pd.isna(sum_dt) or pd.isna(sum_fp)):
        candidates.append(
            ('Q4', _PERIOD_PRIORITY['Q4'], sum_dt, float(sum_val), int(sum_fp))
        )

    if not candidates:
        return np.nan

    def relation(cand):
        # How the candidate's origin year relates to the row's fiscal period
        origin = cand[4]
        if fp_int is None or origin is None:
            return 'unknown'
        if origin == fp_int:
            return 'same'
        if origin == fp_int - 1:
            return 'prior'
        return 'other'

    def by_date(cand):
        return cand[2]

    def by_priority_then_date(cand):
        return (cand[1], cand[2])

    # Only candidates with a non-NaN value may be selected (0 is allowed)
    usable = [c for c in candidates if not np.isnan(c[3])]

    # (label, required relation, ranking key) in order of preference
    preference = (
        ('A',  'same',  by_date),
        ('Q4', 'same',  by_priority_then_date),
        ('A',  'prior', by_date),
        ('Q4', 'prior', by_priority_then_date),
    )
    for label, wanted, ranking in preference:
        pool = [c for c in usable if c[0] == label and relation(c) == wanted]
        if pool:
            return max(pool, key=ranking)[3]

    # Fallback: anything left (other/unknown relation)
    if usable:
        return max(usable, key=by_priority_then_date)[3]

    # Final fallback kept from the original implementation
    return 0.0


# ============================ MAIN ============================
if 'special_encoded' in globals() and special_encoded is not None:
    # Remember the number of input rows for row-accounting
    input_rows = len(special_encoded)
    print(f"Input dataset contains {input_rows:,} rows before processing.\n")

    # Work on a copy so we do not mutate the original DataFrame
    working = special_encoded.copy()

    # Exclude certain frequencies (E, L, R, U) from further processing
    excl_mask = working['Frequency'].astype(str).str.upper().isin(['E', 'L', 'R', 'U'])
    excluded_rows = int(excl_mask.sum())
    working = working.loc[~excl_mask].copy()

    # Convert key columns to appropriate types; values that cannot be
    # converted become NaT/NaN (errors='coerce')
    working['PIT Date'] = pd.to_datetime(
        working['PIT Date'], format='%Y-%m-%d', errors='coerce'
    ).dt.floor('D')
    working['FiscalPeriod'] = pd.to_numeric(working['FiscalPeriod'], errors='coerce')
    working['Value']        = pd.to_numeric(working['Value'], errors='coerce')

    # Ensure some ID-like columns are strings
    for c in ['ID', 'HistCurrency', 'ItemCode', 'Frequency', 'Str_FiscalPrd']:
        if c in working.columns:
            working[c] = working[c].astype(str)

    # Parse Q/S/T numbers from Str_FiscalPrd (e.g. 'Q1Y23' -> QNUM=1).
    # Labels that do not match the respective pattern yield NaN.
    working['QNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^Q([1-4])Y', expand=False),
        errors='coerce'
    )
    working['SNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^S([1-2])Y', expand=False),
        errors='coerce'
    )
    working['TNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^T([1-3])Y', expand=False),
        errors='coerce'
    )

    # Define all period/value and period/date column names:
    # Q1..Q4, S1..S2, T1..T3, A and the matching '<label>_Date' columns
    period_vals = [f'Q{i}' for i in range(1, 5)] + \
                  [f'S{i}' for i in range(1, 3)] + \
                  [f'T{i}' for i in range(1, 4)] + ['A']
    period_dates = [f'{p}_Date' for p in [f'Q{i}' for i in range(1,5)] + \
                                       [f'S{i}' for i in range(1,3)] + \
                                       [f'T{i}' for i in range(1,4)]] + ['A_Date']

    # Ensure all period value columns exist (initialize if missing)
    for c in period_vals:
        if c not in working.columns:
            working[c] = np.nan

    # Ensure all period date columns exist (initialize if missing)
    for c in period_dates:
        if c not in working.columns:
            working[c] = pd.NaT

    # Keys used to identify time series in as-of joins. FiscalPeriod is part
    # of the key, so a row is only ever matched with values of its own
    # fiscal period.
    base_keys = ['ID', 'HistCurrency', 'ItemCode', 'FiscalPeriod']

    # -------------------------------------------------------------------------
    # 1) TrueValue from annuals: for every (ID, FiscalPeriod, HistCurrency)
    #    take the annual (A/B) value with the latest PIT Date as reference
    # -------------------------------------------------------------------------
    mask_annual = working['Frequency'].isin(['A', 'B']) & working['Value'].notna()
    annual_src = (
        working.loc[mask_annual,
                    ['ID', 'FiscalPeriod', 'HistCurrency', 'PIT Date', 'Value']]
        .sort_values(['ID', 'FiscalPeriod', 'HistCurrency', 'PIT Date'])
        .drop_duplicates(['ID', 'FiscalPeriod', 'HistCurrency'], keep='last')
        .rename(columns={'Value': 'TrueValue', 'PIT Date': 'TrueValue_Date'})
    )
    # Merge TrueValue back on keys. annual_src was de-duplicated on exactly
    # these key columns before the left join.
    working = working.merge(
        annual_src,
        on=['ID', 'FiscalPeriod', 'HistCurrency'],
        how='left'
    )

    # -------------------------------------------------------------------------
    # 2) As-of mapping for each frequency (no prior-year / no forward-fill)
    #    For every row: latest value of the same key with PIT Date <= row's
    #    PIT Date, stored in <label> / <label>_Date.
    # -------------------------------------------------------------------------

    # Annual (A/B) as-of
    src_A = working.loc[
        working['Frequency'].isin(['A', 'B']) & working['Value'].notna(),
        base_keys + ['PIT Date', 'Value']
    ].copy()
    vA, dA = asof_numpy(working, src_A, by_cols=base_keys)
    working['A'], working['A_Date'] = vA, dA
    # Origin fiscal period for A: the row's own FiscalPeriod wherever an
    # annual value was found
    working['A_OriginFP'] = np.where(
        working['A'].notna(), working['FiscalPeriod'], np.nan
    )

    # Quarterly (Q/C) as-of, by quarter number
    src_Q = working.loc[
        working['Frequency'].isin(['Q', 'C']) & working['QNUM'].notna(),
        base_keys + ['QNUM', 'PIT Date', 'Value']
    ].copy()
    for q in (1, 2, 3, 4):
        # Subset source to a specific quarter
        rv = src_Q[src_Q['QNUM'] == q].drop(columns=['QNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'Q{q}', f'Q{q}_Date'
        working[col], working[dcol] = v, d

        # Set OriginFP (= row's FiscalPeriod) where the quarter has a value
        # and no OriginFP has been recorded yet
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        mask = working[col].notna() & working[ocol].isna()
        working.loc[mask, ocol] = working.loc[mask, 'FiscalPeriod']

    # Semiannual (S/F) as-of, by half-year number
    src_S = working.loc[
        working['Frequency'].isin(['S', 'F']) & working['SNUM'].notna(),
        base_keys + ['SNUM', 'PIT Date', 'Value']
    ].copy()
    for s in (1, 2):
        # Subset source to a specific half-year
        rv = src_S[src_S['SNUM'] == s].drop(columns=['SNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'S{s}', f'S{s}_Date'
        working[col], working[dcol] = v, d

        # Same OriginFP bookkeeping as for the quarters
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        mask = working[col].notna() & working[ocol].isna()
        working.loc[mask, ocol] = working.loc[mask, 'FiscalPeriod']

    # Trimester (T/K) as-of, by term number
    src_T = working.loc[
        working['Frequency'].isin(['T', 'K']) & working['TNUM'].notna(),
        base_keys + ['TNUM', 'PIT Date', 'Value']
    ].copy()
    for t in (1, 2, 3):
        # Subset source to a specific term
        rv = src_T[src_T['TNUM'] == t].drop(columns=['TNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'T{t}', f'T{t}_Date'
        working[col], working[dcol] = v, d

        # Same OriginFP bookkeeping as for the quarters
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        mask = working[col].notna() & working[ocol].isna()
        working.loc[mask, ocol] = working.loc[mask, 'FiscalPeriod']

    # -------------------------------------------------------------------------
    # 3) Prepare labels and normalize dates (only as-of results, no ffill)
    # -------------------------------------------------------------------------
    working = working.sort_values(['ID', 'HistCurrency', 'FiscalPeriod', 'PIT Date'])

    value_labels  = period_vals
    date_labels   = period_dates
    # NOTE: origin_labels is not referenced again within this cell
    origin_labels = [f'{lbl}_OriginFP' for lbl in value_labels]

    # Ensure all date columns are valid datetimes at day precision
    for c in date_labels:
        if c in working.columns:
            working[c] = pd.to_datetime(working[c], errors='coerce').dt.floor('D')

    # -------------------------------------------------------------------------
    # 4) AnnPITValue with new logic (A + Q1..Q4 sum, zeros allowed)
    #    Evaluated row by row via DataFrame.apply
    # -------------------------------------------------------------------------
    working['AnnPITValue'] = working.apply(
        pick_annpit_sum_with_origin,
        axis=1
    )

    # -------------------------------------------------------------------------
    # 5) QC: Future-date check (period date > PIT Date)
    # -------------------------------------------------------------------------
    date_cols_all = [
        'A_Date',
        'Q1_Date', 'Q2_Date', 'Q3_Date', 'Q4_Date',
        'S1_Date', 'S2_Date',
        'T1_Date', 'T2_Date', 'T3_Date'
    ]
    # Only use date columns that actually exist
    present = [c for c in date_cols_all if c in working.columns]
    viol_counts = {}
    any_mask = pd.Series(False, index=working.index)

    # For each period date column, check if it's after PIT Date
    for c in present:
        m = (
            working[c].notna() &
            working['PIT Date'].notna() &
            (pd.to_datetime(working[c], errors='coerce') > working['PIT Date'])
        )
        # Count violations per column
        viol_counts[c] = int(m.sum())
        # Track rows with any violation across all period dates
        any_mask |= m

    total_future_viol = int(any_mask.sum())
    print("\n=== Future-date check (period dates > PIT Date) ===")
    print("Per-label violations:", viol_counts)
    print(f"Rows with ANY future-dated period value: {total_future_viol}")
    # Flag rows with at least one future-date violation (rows are only
    # flagged here, not removed)
    working['HasFutureDateError'] = any_mask

    # -------------------------------------------------------------------------
    # 6) AnnPITValue_Pct and quality filter
    # -------------------------------------------------------------------------
    # Percentage of AnnPITValue relative to TrueValue (%); NaN when either
    # value is missing or TrueValue is zero
    working['AnnPITValue_Pct'] = np.where(
        working['AnnPITValue'].notna() &
        working['TrueValue'].notna() &
        (working['TrueValue'] != 0),
        (working['AnnPITValue'] / working['TrueValue']) * 100,
        np.nan
    )

    # Summary BEFORE dropping outliers
    pre_stats = summarize_pct(working['AnnPITValue_Pct'])
    print("\n=== AnnPITValue_Pct summary — BEFORE quality drop ===")
    for k, v in pre_stats.items():
        print(f"{k:>20}: {v}")

    pct = working['AnnPITValue_Pct']
    # Flag infinities
    is_inf = np.isinf(pct)
    # Flag finite out-of-range values outside [25, 250]
    is_finite = np.isfinite(pct)
    out_of_range = is_finite & ((pct > 250) | (pct < 25))
    # Combined drop mask: infinities or out-of-range finite values.
    # Rows whose percentage is NaN are neither infinite nor finite and are
    # therefore kept.
    to_drop_quality = is_inf | out_of_range

    # Count dropped rows due to quality rules
    dropped_quality_rows = int(to_drop_quality.sum())
    print(f"\nRows to drop due to AnnPITValue_Pct (±inf or >250 or <25): {dropped_quality_rows:,}")

    # Keep only rows that pass the quality filter
    working = working.loc[~to_drop_quality].copy()

    # Summary AFTER dropping outliers
    post_stats = summarize_pct(working['AnnPITValue_Pct'])
    print("\n=== AnnPITValue_Pct summary — AFTER quality drop ===")
    if post_stats:
        for k, v in post_stats.items():
            print(f"{k:>20}: {v}")
    else:
        print("No finite values remain after the quality drop.")

    # -------------------------------------------------------------------------
    # 7) Final columns and cleanup
    # -------------------------------------------------------------------------
    # Core columns that describe each row
    base_cols = [
        'ID', 'CompanyName', 'ImplCountry', 'CurrentCurrency', 'HistCurrency',
        'PIT Date', 'Frequency', 'UpdateCode', 'FiscalPeriod', 'FYE Month',
        'ItemCode', 'Value', 'Str_FiscalPrd'
    ]

    # Period-related columns (Dates and Values)
    freq_cols = []
    for i in range(1, 5):
        freq_cols += [f'Q{i}_Date', f'Q{i}']
    for i in range(1, 3):
        freq_cols += [f'S{i}_Date', f'S{i}']
    for i in range(1, 4):
        freq_cols += [f'T{i}_Date', f'T{i}']
    freq_cols += ['A_Date', 'A']

    # Columns we want to keep in the final output, in output order
    keep_cols = (
        [c for c in base_cols if c in working.columns] +
        ['TrueValue', 'AnnPITValue', 'AnnPITValue_Pct', 'HasFutureDateError'] +
        [c for c in freq_cols if c in working.columns]
    )

    # Drop helper columns such as OriginFP and intermediate numeric helpers
    drop_cols = [c for c in working.columns
                 if c.endswith('_OriginFP') or c in ['QNUM', 'SNUM', 'TNUM', 'TrueValue_Date']]
    working.drop(columns=drop_cols, inplace=True, errors='ignore')

    # Reorder to the final column set (columns not listed in keep_cols are
    # left out of the result)
    special_processed = working.reindex(columns=keep_cols)

    # -------------------------------------------------------------------------
    # 8) Save outputs (requires Temp_file_path_DP and base_output_filename)
    # -------------------------------------------------------------------------
    # Both variables must be defined in a previous setup cell
    assert 'Temp_file_path_DP' in globals(), "Temp_file_path_DP not found."
    assert 'base_output_filename' in globals(), "base_output_filename not found (set in Cell 0)."

    # Full export path and write to pipe-delimited text
    out_full = os.path.join(Temp_file_path_DP, f"{base_output_filename}.txt")
    special_processed.to_csv(out_full, sep='|', index=False)
    print("\nSaved full:", out_full)

    # Subset export with a small selection of columns
    subset_cols = ["ID", "PIT Date", "CompanyName", "HistCurrency", "FiscalPeriod", "AnnPITValue"]
    subset_cols_existing = [col for col in subset_cols if col in special_processed.columns]
    subset_df = special_processed[subset_cols_existing].copy()
    out_subset = os.path.join(Temp_file_path_DP, f"{base_output_filename}_subset.txt")
    subset_df.to_csv(out_subset, sep='|', index=False)
    print("Saved subset:", out_subset)
    del subset_df  # free some memory

    # -------------------------------------------------------------------------
    # 9) Row-accounting overview
    # -------------------------------------------------------------------------
    output_rows = len(special_processed)
    print("\n=== Row Accounting ===")
    print(f"Input rows:                     {input_rows:,}")
    print(f"Excluded by Frequency (E/L/R/U):{excluded_rows:,}")
    print(f"Dropped by quality (Pct rules): {dropped_quality_rows:,}")
    print(f"Output rows (final):            {output_rows:,}")

    # Sanity check: excluded + dropped + final should equal original
    check_total = excluded_rows + dropped_quality_rows + output_rows
    print(f"Check: excluded + dropped + output = {check_total:,}")
    if check_total == input_rows:
        print("Row counts reconcile exactly.")
    else:
        print(f"Mismatch of {input_rows - check_total:+,} rows.")

    # Optional: trigger garbage collection (gc is imported in the
    # environment-setup cell at the top of the notebook)
    gc.collect()

else:
    # Nothing to process: special_encoded is not defined or is None
    print("special_encoded not found or None; skipping.")

Input dataset contains 1,447,068 rows before processing.


=== Future-date check (period dates > PIT Date) ===
Per-label violations: {'A_Date': 0, 'Q1_Date': 0, 'Q2_Date': 0, 'Q3_Date': 0, 'Q4_Date': 0, 'S1_Date': 0, 'S2_Date': 0, 'T1_Date': 0, 'T2_Date': 0, 'T3_Date': 0}
Rows with ANY future-dated period value: 0

=== AnnPITValue_Pct summary — BEFORE quality drop ===
         finite_rows: 213288
                mean: 2619.227878871928
              median: 100.0
winsorized_mean_1pct: 96.70892479603569
                 p10: 100.0
                 p20: 100.0
                 p30: 100.0
                 p40: 100.0
                 p50: 100.0
                 p60: 100.0
                 p70: 100.0
                 p80: 100.0
                 p90: 100.0

Rows to drop due to AnnPITValue_Pct (±inf or >250 or <25): 7,290

=== AnnPITValue_Pct summary — AFTER quality drop ===
         finite_rows: 205998
                mean: 99.99207121506225
              median: 100.0
winsorized_mean_1pct: 9

### Special 4

#### Set Index

In [271]:
# =============================================================================
# SELECT A SINGLE SPECIAL_* ITEM AND PREPARE PATHS
# =============================================================================
# This cell:
#   1. Chooses which Special_* item (from special_vars) should be processed.
#   2. Validates that special_vars and Temp_file_path_DP are available.
#   3. Builds the input file path for the selected "work_subset_<item>.txt".
#   4. Sets a base_output_filename for downstream output files.
#   5. Ensures the data-preparation temp directory exists.
#
# Usage:
#   - Adjust `special_index` to run a different Special_* dataset (e.g., 2, 3, 10 ...).
#   - Assumes `special_vars` was created in the categorization step and
#     `Temp_file_path_DP` was defined in the environment setup.

# === Select which Special_* item to run ===
special_index = 4  # Change this to run another dataset, e.g. 10

# special_vars should look like: {'Special_1': 'SomeItem', 'Special_2': 'OtherItem', ...}
assert 'special_vars' in globals(), "special_vars dict not found in globals()."

# Build the key for the chosen index and find the corresponding item name.
# .get() returns None for an unknown key, so the assert below also catches
# an index that has no entry (or an entry with an empty name).
item_key = f"Special_{special_index}"
target_item_name = special_vars.get(item_key)
assert target_item_name, f"{item_key} not found in special_vars."

print(f"Selected: {item_key}  ->  ItemName: '{target_item_name}'")

# === Paths (reusing your globals) ===
assert 'Temp_file_path_DP' in globals(), "Temp_file_path_DP not found."

# Input file for this item (produced by previous merging steps):
# <Temp_file_path_DP>/work_subset_<item name>.txt
file_name = f"work_subset_{target_item_name}.txt"
file_path = os.path.join(Temp_file_path_DP, file_name)

# Base name for output files created by the "Special" pipeline; later cells
# append ".txt" and "_subset.txt" to it
base_output_filename = f"Special_{target_item_name}_complete"

# Make sure the output directory exists (no error if it already does)
Path(Temp_file_path_DP).mkdir(parents=True, exist_ok=True)


Selected: Special_4  ->  ItemName: 'Disposal_of_Fixed_Assets'


#### Import relevant data



In [272]:
# =============================================================================
# LOAD THE FULL DATASET FOR THE SELECTED SPECIAL ITEM
# =============================================================================
# This cell:
#   1. Uses `target_item_name` and `file_path` (defined in the previous cell)
#      to load the corresponding work_subset file.
#   2. Imports the file using `import_file_to_dataframe`.
#   3. Performs safety checks for existence and emptiness.
#   4. Shows a preview of the loaded dataset.
#   5. Falls back to an empty DataFrame if loading fails.
#   6. Runs garbage collection afterwards.
# =============================================================================

print(f"\nImporting full dataset for Item: '{target_item_name}' ...")

if not os.path.exists(file_path):
    # No input file for this item: downstream cells receive an empty frame.
    print(f"File not found: {file_path}")
    special_raw = pd.DataFrame()
else:
    special_raw = import_file_to_dataframe(file_path)

    if special_raw is None or special_raw.empty:
        # Loader returned nothing usable: normalise to an empty frame.
        print("Dataset appears empty or could not be loaded.")
        special_raw = pd.DataFrame()
    else:
        print(f"Full dataset loaded successfully: {len(special_raw):,} rows total.")
        try:
            # Rich preview when a notebook display hook is available ...
            display(special_raw.head())
        except Exception:
            # ... plain-text preview otherwise.
            print(special_raw.head().to_string(index=False))

# Last expression of the cell: its return value is echoed as the cell output.
gc.collect()



Importing full dataset for Item: 'Disposal_of_Fixed_Assets' ...
Full dataset loaded successfully: 1,484,067 rows total.


Unnamed: 0,ID,CompanyName,ImplCountry,CurrentCurrency,HistCurrency,PIT Date,Frequency,UpdateCode,FiscalPeriod,FYE Month,ItemCode,Value
0,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1992,December,4351,0.0
1,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1993,December,4351,0.0
2,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1994,December,4351,0.0
3,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1996-05-03,A,3,1995,December,4351,0.0
4,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1998-07-03,A,3,1996,December,4351,0.0


0

#### Encode Frequency Code (Check of output required!)

In [273]:
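# =============================================================================
# SUMMARY
# =============================================================================
# This cell:
#   - Defines `last2`, which returns the last two digits of a number as a
#     zero-padded string (None for missing input).
#   - Defines `add_str_fiscalprd`, which decodes the numeric FiscalPeriod of
#     each row into a label stored in `Str_FiscalPrd`, depending on Frequency:
#       * A/B      -> "Y<yy>"
#       * C/Q/E/R  -> "Q<1-4>Y<yy>"
#       * F/S      -> "S<1-2>Y<yy>"
#       * K/T/L/U  -> "T<1-3>Y<yy>"
#     and then rebuilds a four-digit year from that label
#     (yy >= 80 -> 19yy, otherwise 20yy).
# =============================================================================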

def last2(n):
    """Return the last two digits of ``n`` as a zero-padded string, or None if NaN."""
    if pd.isna(n):
        return None
    return f"{int(n):04d}"[-2:]


def add_str_fiscalprd(df):
    """
    Add a period label column and rebuild ``FiscalPeriod`` as a four-digit year.

    The input is not modified; a copy is returned with:

      * ``Frequency`` upper-cased (missing values become '').
      * ``Str_FiscalPrd`` set per frequency group, where ``fp`` is the numeric
        ``FiscalPeriod``:
          - C/Q/E/R : 'Q<fp % 4 + 1>Y<last two digits of fp // 4>'
          - A/B     : 'Y<last two digits of fp>'
          - F/S     : 'S<fp % 2 + 1>Y<last two digits of fp // 2>'
          - K/T/L/U : 'T<fp % 3 + 1>Y<last two digits of fp // 3>'
        Rows with any other frequency keep NaN.
      * ``FiscalPeriod`` replaced by the year rebuilt from the two digits in
        the label (>= 80 -> 19xx, otherwise 20xx).

    For annual (A/B) rows the rebuilt year is compared with the original
    ``FiscalPeriod``; mismatches are printed and previewed via ``display``.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain 'Frequency' and 'FiscalPeriod'; 'ID' is additionally
        needed when discrepancies are displayed.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` with 'Str_FiscalPrd' added and 'FiscalPeriod' rewritten.
    """
    df = df.copy()
    df["Frequency"] = df["Frequency"].str.upper().fillna("")
    df['Original_FiscalPeriod'] = df['FiscalPeriod']

    fp = pd.to_numeric(df["FiscalPeriod"], errors="coerce")

    m_quarter = df["Frequency"].isin(["C", "Q", "E", "R"])
    m_AB      = df["Frequency"].isin(["A", "B"])
    m_FS      = df["Frequency"].isin(["F", "S"])
    m_KTLU    = df["Frequency"].isin(["K", "T", "L", "U"])

    # Create the label column with object dtype. A bare `np.nan` assignment
    # yields a float64 column, and writing strings into it triggers pandas'
    # "incompatible dtype" warning (an error in future versions). It also
    # breaks the `.str` accessor below when no row matches any group.
    df["Str_FiscalPrd"] = pd.Series(np.nan, index=df.index, dtype="object")

    # The right-hand sides below are full-length Series; `.loc` aligns on the
    # index, so only the rows selected by each mask are written.
    q_part = ((fp % 4) + 1).where(m_quarter)
    q_year = (fp // 4).where(m_quarter).apply(last2)
    df.loc[m_quarter, "Str_FiscalPrd"] = (
        "Q" + q_part.astype("Int64").astype(str) + "Y" + q_year.fillna('')
    )

    ab_year = fp.where(m_AB).apply(last2)
    df.loc[m_AB, "Str_FiscalPrd"] = "Y" + ab_year.fillna('')

    fs_sem  = ((fp % 2) + 1).where(m_FS)
    fs_year = (fp // 2).where(m_FS).apply(last2)
    df.loc[m_FS, "Str_FiscalPrd"] = (
        "S" + fs_sem.astype("Int64").astype(str) + "Y" + fs_year.fillna('')
    )

    t_term  = ((fp % 3) + 1).where(m_KTLU)
    t_year  = (fp // 3).where(m_KTLU).apply(last2)
    df.loc[m_KTLU, "Str_FiscalPrd"] = (
        "T" + t_term.astype("Int64").astype(str) + "Y" + t_year.fillna('')
    )

    # Rebuild a four-digit year from the two digits stored in the label.
    year_part = df['Str_FiscalPrd'].str.extract(r'Y(\d{2})', expand=False)
    year_numeric = pd.to_numeric(year_part, errors='coerce')

    def _expand_year(yy):
        # NOTE: fixed century pivot - 80..99 -> 19xx, 00..79 -> 20xx.
        if pd.isna(yy):
            return np.nan
        century = "19" if yy >= 80 else "20"
        return int(f"{century}{int(yy):02d}")

    df['ImplFiscPer_Calculated'] = year_numeric.apply(_expand_year)

    # Sanity check on annual rows: the rebuilt year should equal the original
    # FiscalPeriod (two missing values also count as agreement).
    annual_rows = df[m_AB]
    calculated = annual_rows['ImplFiscPer_Calculated']
    original = fp[m_AB]
    agrees = (calculated == original) | (calculated.isna() & original.isna())
    discrepancy_rows = annual_rows[~agrees].copy()

    if not discrepancy_rows.empty:
        print("\nDiscrepancies found between original FiscalPeriod and calculated ImplFiscPer for Annual (A, B) frequencies:")
        display(
            discrepancy_rows[
                ['ID', 'Frequency', 'Original_FiscalPeriod', 'Str_FiscalPrd', 'ImplFiscPer_Calculated']
            ].head()
        )
        print(f"Total discrepancies found for Annual frequencies: {len(discrepancy_rows)}")
    else:
        print("\nNo discrepancies found between original FiscalPeriod and calculated ImplFiscPer for Annual (A, B) frequencies.")

    # From here on FiscalPeriod holds the rebuilt year for every frequency.
    df['FiscalPeriod'] = df['ImplFiscPer_Calculated']
    df.drop(columns=['Original_FiscalPeriod', 'ImplFiscPer_Calculated'], inplace=True)

    return df


# =============================================================================
# Driver: apply encoding to special_raw if present and non-empty
# =============================================================================
if 'special_raw' not in globals() or special_raw is None or special_raw.empty:
    # Nothing to encode: leave an explicit None so later cells can skip.
    print("special_raw not found or empty. Cannot perform encoding.")
    special_encoded = None
else:
    print(f"Applying encoding to Special dataset for '{target_item_name}' ...")
    special_encoded = add_str_fiscalprd(special_raw)
    display(special_encoded.head())


Applying encoding to Special dataset for 'Disposal_of_Fixed_Assets' ...


  df.loc[m_quarter, "Str_FiscalPrd"] = (



No discrepancies found between original FiscalPeriod and calculated ImplFiscPer for Annual (A, B) frequencies.


Unnamed: 0,ID,CompanyName,ImplCountry,CurrentCurrency,HistCurrency,PIT Date,Frequency,UpdateCode,FiscalPeriod,FYE Month,ItemCode,Value,Str_FiscalPrd
0,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1992,December,4351,0.0,Y92
1,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1993,December,4351,0.0,Y93
2,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1994,December,4351,0.0,Y94
3,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1996-05-03,A,3,1995,December,4351,0.0,Y95
4,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1998-07-03,A,3,1996,December,4351,0.0,Y96


#### Annualize data with most recent information (Check of output required!)

In [274]:
# @title
# =============================================================================
# SUMMARY
# =============================================================================
# This script:
#   - Implements a fast "as-of join" between two DataFrames based on PIT dates
#     and key columns (asof_numpy).
#   - Provides helpers for percentile summaries and winsorized statistics.
#   - Builds annualized "AnnPITValue" values from:
#       * true annual data (A/B frequency) and
#       * sums of quarterly data (Q1..Q4) when available.
#   - Performs various quality checks (future-dated values, extreme percentages).
#   - Produces a processed "special_processed" DataFrame and saves:
#       * a full export and
#       * a subset export.
#   - Prints a row-accounting overview so drops and exclusions are transparent.
# =============================================================================


# ---------- Helper: fast as-of (right.PIT <= left.PIT) ----------
def _key(fr, cols):
    """
    Build one string key per row by casting the given columns to str and
    joining them with '||'. The result keeps the index of ``fr``.
    """
    return fr[cols].astype(str).agg('||'.join, axis=1)


def asof_numpy(left_df: pd.DataFrame, right_df: pd.DataFrame, by_cols: list[str]):
    """
    For each row in left_df, find the latest (as-of) Value from right_df
    with matching by_cols and right_df['PIT Date'] <= left_df['PIT Date'].

    Results are written by row position, so left_df may carry any index
    (gaps, non-integer or duplicate labels). Rows that lack a key or date,
    or whose 'PIT Date' cannot be parsed, are ignored on the right side and
    yield NaN / NaT on the left side.

    Parameters
    ----------
    left_df : pd.DataFrame
        Must contain by_cols and 'PIT Date'.
    right_df : pd.DataFrame
        Must contain by_cols, 'PIT Date' and 'Value'.
    by_cols : list[str]
        Columns that must match exactly between the two frames.

    Returns
    -------
    out_vals : np.ndarray
        float64 array (same length and order as left_df) with the matched
        values from right_df (or NaN if none found).
    out_dates : np.ndarray
        datetime64[ns] array with the matched right-hand dates (or NaT).
    """
    n_left = len(left_df)
    out_vals  = np.full(n_left, np.nan, dtype='float64')
    out_dates = np.full(n_left, 'NaT', dtype='datetime64[ns]')

    left_req  = by_cols + ['PIT Date']
    right_req = by_cols + ['PIT Date', 'Value']

    # Rows with every required field present
    lmask = left_df[left_req].notna().all(axis=1).to_numpy()
    rmask = right_df[right_req].notna().all(axis=1).to_numpy()
    if not lmask.any() or not rmask.any():
        return out_vals, out_dates

    # Positions (not index labels) of the usable left rows: the output arrays
    # are positional, so labels must never be used to address them.
    l_pos = np.flatnonzero(lmask)
    l = left_df.loc[lmask, left_req].reset_index(drop=True)
    r = right_df.loc[rmask, right_req].copy()

    # Normalize PIT dates to daily granularity
    l['PIT Date'] = pd.to_datetime(l['PIT Date'], errors='coerce').dt.floor('D')
    r['PIT Date'] = pd.to_datetime(r['PIT Date'], errors='coerce').dt.floor('D')

    # Unparseable dates became NaT above. NaT sorts after every real date, so
    # leaving it in would match such left rows to the newest right value.
    l_ok = l['PIT Date'].notna().to_numpy()
    if not l_ok.all():
        l = l.loc[l_ok].reset_index(drop=True)
        l_pos = l_pos[l_ok]
    r_ok = r['PIT Date'].notna().to_numpy()
    if not r_ok.all():
        r = r.loc[r_ok].copy()
    if l.empty or r.empty:
        return out_vals, out_dates

    # Right side: sort by (key, date) so each key is one contiguous,
    # date-ordered slice that can be binary-searched.
    r['__k'] = _key(r, by_cols)
    r = r.sort_values(['__k', 'PIT Date']).reset_index(drop=True)
    rk   = r['__k'].to_numpy()
    rdt  = r['PIT Date'].to_numpy()
    rval = r['Value'].to_numpy()

    uniq, first = np.unique(rk, return_index=True)
    bounds = np.append(first, len(rk))
    slices = {
        k: (rdt[bounds[i]:bounds[i + 1]], rval[bounds[i]:bounds[i + 1]])
        for i, k in enumerate(uniq)
    }

    # Left side: stable sort by key so rows of one key form a block
    lk  = _key(l, by_cols).to_numpy()
    ldt = l['PIT Date'].to_numpy()
    order = np.argsort(lk, kind='mergesort')
    sk, sd, sp = lk[order], ldt[order], l_pos[order]

    # Block boundaries: positions where the sorted key changes
    starts = np.flatnonzero(np.r_[True, sk[1:] != sk[:-1]])
    ends = np.r_[starts[1:], len(sk)]

    for s, e in zip(starts, ends):
        hit = slices.get(sk[s])
        if hit is None:
            continue  # key never occurs on the right side
        r_dates, r_vals = hit

        block_dates = sd[s:e]
        block_pos   = sp[s:e]

        # Index of the last right date <= left date (-1 when there is none)
        pos   = np.searchsorted(r_dates, block_dates, side='right') - 1
        valid = pos >= 0

        if np.any(valid):
            out_vals[block_pos[valid]]  = r_vals[pos[valid]]
            out_dates[block_pos[valid]] = r_dates[pos[valid]]

    return out_vals, out_dates


# ---------- Small helpers ----------
def pctile(s, q):
    """
    Return the q-quantile of ``s`` (linear interpolation).

    Any exception raised while computing it is swallowed and NaN is
    returned instead.
    """
    result = np.nan
    try:
        result = s.quantile(q, interpolation='linear')
    except Exception:
        pass
    return result


def summarize_pct(series: pd.Series):
    """
    Summarise a numeric series after discarding +/-inf and missing values.

    Returns an empty dict when nothing finite remains; otherwise a dict with
    the row count, mean, median, 1%-winsorized mean and the deciles
    p10 .. p90 (in that key order).
    """
    finite = series.replace([np.inf, -np.inf], np.nan).dropna()
    if finite.empty:
        return {}

    # winsorize needs a writable array, hence the explicit copy.
    clipped = winsorize(finite.to_numpy().copy(), limits=[0.01, 0.01])

    stats = {
        "finite_rows": len(finite),
        "mean": finite.mean(),
        "median": finite.median(),
        "winsorized_mean_1pct": clipped.mean(),
    }
    # Deciles p10, p20, ..., p90
    for decile in range(10, 100, 10):
        stats[f"p{decile}"] = pctile(finite, decile / 100)
    return stats


# ---------- Period prioritization ----------
# Priority ranking for period labels when deciding between multiple candidates
# Higher number wins; listed from highest to lowest priority.
_PERIOD_PRIORITY = dict(
    A=100,   # full annual
    Q4=90,
    T3=80,
    S2=70,
    Q3=60,
    T2=50,
    S1=40,
    Q2=30,
    T1=20,
    Q1=10,
)


def _label_from_colname(colname: str) -> str:
    """
    Translate a column name into the period label used as a key in
    _PERIOD_PRIORITY. Only 'A' is handled explicitly; every other name is
    returned as given.
    """
    if colname == 'A':
        return 'A'
    return colname


# ---------- Helpers for AnnPITValue using A + Q1..Q4 sum ----------
def full_year_from_quarters(row, pit, cutoff):
    """
    Combine Q1..Q4 of one row into a single full-year candidate.

    A candidate exists only if every quarter has a value, a date and an
    OriginFP, and every quarter date lies in the closed window
    [cutoff, pit]. If any quarter fails, the whole candidate is rejected.

    Parameters
    ----------
    row : pd.Series
        Row providing 'Q<n>', 'Q<n>_Date' and 'Q<n>_OriginFP' for n = 1..4
        (read via ``row.get``).
    pit : datetime-like
        Upper bound of the window (the row's PIT Date).
    cutoff : datetime-like
        Lower bound of the window.

    Returns
    -------
    tuple
        (latest quarter date, sum of the four values, largest OriginFP),
        or (NaT, NaN, NaN) when no candidate can be built.
    """
    rejected = (pd.NaT, np.nan, np.nan)
    amounts, stamps, origins = [], [], []

    for n in (1, 2, 3, 4):
        name = f'Q{n}'
        amount = row.get(name, np.nan)
        stamp = row.get(f'{name}_Date', pd.NaT)
        origin = row.get(f'{name}_OriginFP', np.nan)

        # All three pieces must be present for this quarter
        if pd.isna(amount) or pd.isna(stamp) or pd.isna(origin):
            return rejected

        # The date must parse and fall inside the window
        stamp = pd.to_datetime(stamp, errors='coerce')
        if pd.isna(stamp):
            return rejected
        if not (cutoff <= stamp <= pit):
            return rejected

        amounts.append(float(amount))
        stamps.append(stamp)
        origins.append(int(origin))

    return max(stamps), float(np.nansum(amounts)), max(origins)


def pick_annpit_sum_with_origin(row):
    """
    Choose the annualized value (AnnPITValue) for one row.

    Two candidates are considered, each only if dated within the 365 days
    up to and including the row's PIT Date:
      * 'A'  - the annual value from columns A / A_Date / A_OriginFP
      * 'Q4' - the Q1..Q4 sum returned by full_year_from_quarters

    Selection order (first non-empty tier wins):
      1. 'A'  whose OriginFP equals the row's FiscalPeriod
      2. 'Q4' whose OriginFP equals the row's FiscalPeriod
      3. 'A'  whose OriginFP is FiscalPeriod - 1
      4. 'Q4' whose OriginFP is FiscalPeriod - 1
      5. any remaining candidate, by (priority, date)

    Returns NaN when PIT Date is missing or no candidate qualifies.
    A value of 0 is a legitimate result.
    """
    pit_date = row['PIT Date']
    if pd.isna(pit_date):
        return np.nan

    # Earliest acceptable candidate date
    window_start = pit_date - timedelta(days=365)

    # The row's own fiscal period as int, or None when missing/unusable
    raw_fp = row.get('FiscalPeriod', np.nan)
    try:
        row_fp = None if pd.isna(raw_fp) else int(raw_fp)
    except Exception:
        row_fp = None

    # Candidate layout: (label, period_priority, date, value, origin_fp)
    pool = []

    # Annual candidate
    annual_val = row.get('A', np.nan)
    annual_dt = row.get('A_Date', pd.NaT)
    annual_fp = row.get('A_OriginFP', np.nan)
    if pd.notna(annual_val) and pd.notna(annual_dt) and pd.notna(annual_fp):
        annual_dt = pd.to_datetime(annual_dt, errors='coerce')
        if pd.notna(annual_dt) and (window_start <= annual_dt <= pit_date):
            pool.append(
                ('A', _PERIOD_PRIORITY['A'], annual_dt, float(annual_val), int(annual_fp))
            )

    # Quarterly-sum candidate
    sum_dt, sum_val, sum_fp = full_year_from_quarters(row, pit_date, window_start)
    if pd.notna(sum_val) and pd.notna(sum_dt) and pd.notna(sum_fp):
        pool.append(
            ('Q4', _PERIOD_PRIORITY['Q4'], sum_dt, float(sum_val), int(sum_fp))
        )

    if not pool:
        return np.nan

    def relation(cand):
        # How the candidate's origin year relates to the row's fiscal period
        origin = cand[4]
        if row_fp is None or origin is None:
            return 'unknown'
        if origin == row_fp:
            return 'same'
        if origin == row_fp - 1:
            return 'prior'
        return 'other'

    def newest(cand):
        return cand[2]

    def priority_then_newest(cand):
        return (cand[1], cand[2])

    # (label, required relation, ranking key) in order of precedence
    tiers = (
        ('A', 'same', newest),
        ('Q4', 'same', priority_then_newest),
        ('A', 'prior', newest),
        ('Q4', 'prior', priority_then_newest),
    )
    for label, wanted, rank in tiers:
        matches = [
            c for c in pool
            if c[0] == label and relation(c) == wanted and not np.isnan(c[3])
        ]
        if matches:
            return max(matches, key=rank)[3]

    # Fallback: whatever is left (relation 'other' or 'unknown')
    usable = [c for c in pool if not np.isnan(c[3])]
    if usable:
        return max(usable, key=priority_then_newest)[3]

    # Only reachable if every candidate value turned out to be NaN
    return 0.0


# ============================ MAIN ============================
# Pipeline driver. Runs only when the encoding cell produced `special_encoded`;
# builds `special_processed`, writes the full and subset exports, and prints a
# row-accounting summary.
if 'special_encoded' in globals() and special_encoded is not None:
    # Remember the number of input rows for row-accounting
    input_rows = len(special_encoded)
    print(f"Input dataset contains {input_rows:,} rows before processing.\n")

    # Work on a copy so we do not mutate the original DataFrame
    working = special_encoded.copy()

    # Exclude certain frequencies (E, L, R, U) from further processing
    excl_mask = working['Frequency'].astype(str).str.upper().isin(['E', 'L', 'R', 'U'])
    excluded_rows = int(excl_mask.sum())
    working = working.loc[~excl_mask].copy()

    # Convert key columns to appropriate types
    # (values that do not parse become NaT / NaN because of errors='coerce')
    working['PIT Date'] = pd.to_datetime(
        working['PIT Date'], format='%Y-%m-%d', errors='coerce'
    ).dt.floor('D')
    working['FiscalPeriod'] = pd.to_numeric(working['FiscalPeriod'], errors='coerce')
    working['Value']        = pd.to_numeric(working['Value'], errors='coerce')

    # Ensure some ID-like columns are strings
    for c in ['ID', 'HistCurrency', 'ItemCode', 'Frequency', 'Str_FiscalPrd']:
        if c in working.columns:
            working[c] = working[c].astype(str)

    # Parse Q/S/T numbers from Str_FiscalPrd (e.g. 'Q1Y23' -> QNUM=1)
    # Labels that do not start with the respective letter yield NaN.
    working['QNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^Q([1-4])Y', expand=False),
        errors='coerce'
    )
    working['SNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^S([1-2])Y', expand=False),
        errors='coerce'
    )
    working['TNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^T([1-3])Y', expand=False),
        errors='coerce'
    )

    # Define all period/value and period/date column names
    # period_vals:  Q1..Q4, S1..S2, T1..T3, A
    # period_dates: the same labels with a '_Date' suffix
    period_vals = [f'Q{i}' for i in range(1, 5)] + \
                  [f'S{i}' for i in range(1, 3)] + \
                  [f'T{i}' for i in range(1, 4)] + ['A']
    period_dates = [f'{p}_Date' for p in [f'Q{i}' for i in range(1,5)] + \
                                       [f'S{i}' for i in range(1,3)] + \
                                       [f'T{i}' for i in range(1,4)]] + ['A_Date']

    # Ensure all period value columns exist (initialize if missing)
    for c in period_vals:
        if c not in working.columns:
            working[c] = np.nan

    # Ensure all period date columns exist (initialize if missing)
    for c in period_dates:
        if c not in working.columns:
            working[c] = pd.NaT

    # Keys used to identify time series in as-of joins
    # (FiscalPeriod is part of the key, so matches never cross fiscal years)
    base_keys = ['ID', 'HistCurrency', 'ItemCode', 'FiscalPeriod']

    # -------------------------------------------------------------------------
    # 1) TrueValue from annuals: build reference "TrueValue" per ID/FiscalPeriod
    # -------------------------------------------------------------------------
    # TrueValue = the annual (A/B) value with the latest PIT Date for each
    # (ID, FiscalPeriod, HistCurrency); sort + keep='last' selects it.
    mask_annual = working['Frequency'].isin(['A', 'B']) & working['Value'].notna()
    annual_src = (
        working.loc[mask_annual,
                    ['ID', 'FiscalPeriod', 'HistCurrency', 'PIT Date', 'Value']]
        .sort_values(['ID', 'FiscalPeriod', 'HistCurrency', 'PIT Date'])
        .drop_duplicates(['ID', 'FiscalPeriod', 'HistCurrency'], keep='last')
        .rename(columns={'Value': 'TrueValue', 'PIT Date': 'TrueValue_Date'})
    )
    # Merge TrueValue back on keys
    # (annual_src was de-duplicated on exactly these keys above)
    working = working.merge(
        annual_src,
        on=['ID', 'FiscalPeriod', 'HistCurrency'],
        how='left'
    )

    # -------------------------------------------------------------------------
    # 2) As-of mapping for each frequency (no prior-year / no forward-fill)
    # -------------------------------------------------------------------------

    # Annual (A/B) as-of
    src_A = working.loc[
        working['Frequency'].isin(['A', 'B']) & working['Value'].notna(),
        base_keys + ['PIT Date', 'Value']
    ].copy()
    vA, dA = asof_numpy(working, src_A, by_cols=base_keys)
    working['A'], working['A_Date'] = vA, dA
    # Origin fiscal period for A: the row's own FiscalPeriod wherever an
    # annual value was matched (the as-of key includes FiscalPeriod).
    working['A_OriginFP'] = np.where(
        working['A'].notna(), working['FiscalPeriod'], np.nan
    )

    # Quarterly (Q/C) as-of, by quarter number
    src_Q = working.loc[
        working['Frequency'].isin(['Q', 'C']) & working['QNUM'].notna(),
        base_keys + ['QNUM', 'PIT Date', 'Value']
    ].copy()
    for q in (1, 2, 3, 4):
        # Subset source to a specific quarter
        rv = src_Q[src_Q['QNUM'] == q].drop(columns=['QNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'Q{q}', f'Q{q}_Date'
        working[col], working[dcol] = v, d

        # Set OriginFP where we have a newly filled quarter
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        mask = working[col].notna() & working[ocol].isna()
        working.loc[mask, ocol] = working.loc[mask, 'FiscalPeriod']

    # Semiannual (S/F) as-of, by half-year number
    # NOTE: the S* columns are exported below, but the AnnPITValue step
    # (pick_annpit_sum_with_origin) reads only the A and Q1..Q4 columns.
    src_S = working.loc[
        working['Frequency'].isin(['S', 'F']) & working['SNUM'].notna(),
        base_keys + ['SNUM', 'PIT Date', 'Value']
    ].copy()
    for s in (1, 2):
        rv = src_S[src_S['SNUM'] == s].drop(columns=['SNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'S{s}', f'S{s}_Date'
        working[col], working[dcol] = v, d

        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        mask = working[col].notna() & working[ocol].isna()
        working.loc[mask, ocol] = working.loc[mask, 'FiscalPeriod']

    # Trimester (T/K) as-of, by term number
    # NOTE: as with S*, the T* columns are exported but not used for AnnPITValue.
    src_T = working.loc[
        working['Frequency'].isin(['T', 'K']) & working['TNUM'].notna(),
        base_keys + ['TNUM', 'PIT Date', 'Value']
    ].copy()
    for t in (1, 2, 3):
        rv = src_T[src_T['TNUM'] == t].drop(columns=['TNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'T{t}', f'T{t}_Date'
        working[col], working[dcol] = v, d

        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        mask = working[col].notna() & working[ocol].isna()
        working.loc[mask, ocol] = working.loc[mask, 'FiscalPeriod']

    # -------------------------------------------------------------------------
    # 3) Prepare labels and normalize dates (only as-of results, no ffill)
    # -------------------------------------------------------------------------
    working = working.sort_values(['ID', 'HistCurrency', 'FiscalPeriod', 'PIT Date'])

    value_labels  = period_vals
    date_labels   = period_dates
    origin_labels = [f'{lbl}_OriginFP' for lbl in value_labels]

    # Ensure all date columns are valid datetimes at day precision
    for c in date_labels:
        if c in working.columns:
            working[c] = pd.to_datetime(working[c], errors='coerce').dt.floor('D')

    # -------------------------------------------------------------------------
    # 4) AnnPITValue with new logic (A + Q1..Q4 sum, zeros allowed)
    # -------------------------------------------------------------------------
    # Row-wise apply: pick_annpit_sum_with_origin is called once per row.
    working['AnnPITValue'] = working.apply(
        pick_annpit_sum_with_origin,
        axis=1
    )

    # -------------------------------------------------------------------------
    # 5) QC: Future-date check (period date > PIT Date)
    # -------------------------------------------------------------------------
    date_cols_all = [
        'A_Date',
        'Q1_Date', 'Q2_Date', 'Q3_Date', 'Q4_Date',
        'S1_Date', 'S2_Date',
        'T1_Date', 'T2_Date', 'T3_Date'
    ]
    # Only use date columns that actually exist
    present = [c for c in date_cols_all if c in working.columns]
    viol_counts = {}
    any_mask = pd.Series(False, index=working.index)

    # For each period date column, check if it's after PIT Date
    for c in present:
        m = (
            working[c].notna() &
            working['PIT Date'].notna() &
            (pd.to_datetime(working[c], errors='coerce') > working['PIT Date'])
        )
        # Count violations per column
        viol_counts[c] = int(m.sum())
        # Track rows with any violation across all period dates
        any_mask |= m

    total_future_viol = int(any_mask.sum())
    print("\n=== Future-date check (period dates > PIT Date) ===")
    print("Per-label violations:", viol_counts)
    print(f"Rows with ANY future-dated period value: {total_future_viol}")
    # Flag rows with at least one future-date violation
    # (rows are only flagged here, not removed)
    working['HasFutureDateError'] = any_mask

    # -------------------------------------------------------------------------
    # 6) AnnPITValue_Pct and quality filter
    # -------------------------------------------------------------------------
    # Percentage of AnnPITValue relative to TrueValue (%)
    # Left as NaN when either value is missing or TrueValue is 0.
    working['AnnPITValue_Pct'] = np.where(
        working['AnnPITValue'].notna() &
        working['TrueValue'].notna() &
        (working['TrueValue'] != 0),
        (working['AnnPITValue'] / working['TrueValue']) * 100,
        np.nan
    )

    # Summary BEFORE dropping outliers
    pre_stats = summarize_pct(working['AnnPITValue_Pct'])
    print("\n=== AnnPITValue_Pct summary — BEFORE quality drop ===")
    for k, v in pre_stats.items():
        print(f"{k:>20}: {v}")

    pct = working['AnnPITValue_Pct']
    # Flag infinities
    is_inf = np.isinf(pct)
    # Flag finite out-of-range values outside [25, 250]
    is_finite = np.isfinite(pct)
    out_of_range = is_finite & ((pct > 250) | (pct < 25))
    # Combined drop mask: infinities or out-of-range finite values
    # (rows whose percentage is NaN match neither condition and are kept)
    to_drop_quality = is_inf | out_of_range

    # Count dropped rows due to quality rules
    dropped_quality_rows = int(to_drop_quality.sum())
    print(f"\nRows to drop due to AnnPITValue_Pct (±inf or >250 or <25): {dropped_quality_rows:,}")

    # Keep only rows that pass the quality filter
    working = working.loc[~to_drop_quality].copy()

    # Summary AFTER dropping outliers
    post_stats = summarize_pct(working['AnnPITValue_Pct'])
    print("\n=== AnnPITValue_Pct summary — AFTER quality drop ===")
    if post_stats:
        for k, v in post_stats.items():
            print(f"{k:>20}: {v}")
    else:
        print("No finite values remain after the quality drop.")

    # -------------------------------------------------------------------------
    # 7) Final columns and cleanup
    # -------------------------------------------------------------------------
    # Core columns that describe each row
    base_cols = [
        'ID', 'CompanyName', 'ImplCountry', 'CurrentCurrency', 'HistCurrency',
        'PIT Date', 'Frequency', 'UpdateCode', 'FiscalPeriod', 'FYE Month',
        'ItemCode', 'Value', 'Str_FiscalPrd'
    ]

    # Period-related columns (Dates and Values)
    freq_cols = []
    for i in range(1, 5):
        freq_cols += [f'Q{i}_Date', f'Q{i}']
    for i in range(1, 3):
        freq_cols += [f'S{i}_Date', f'S{i}']
    for i in range(1, 4):
        freq_cols += [f'T{i}_Date', f'T{i}']
    freq_cols += ['A_Date', 'A']

    # Columns we want to keep in the final output
    keep_cols = (
        [c for c in base_cols if c in working.columns] +
        ['TrueValue', 'AnnPITValue', 'AnnPITValue_Pct', 'HasFutureDateError'] +
        [c for c in freq_cols if c in working.columns]
    )

    # Drop helper columns such as OriginFP and intermediate numeric helpers
    drop_cols = [c for c in working.columns
                 if c.endswith('_OriginFP') or c in ['QNUM', 'SNUM', 'TNUM', 'TrueValue_Date']]
    working.drop(columns=drop_cols, inplace=True, errors='ignore')

    # Reorder to the final column set
    # (reindex also discards any column that is not listed in keep_cols)
    special_processed = working.reindex(columns=keep_cols)

    # -------------------------------------------------------------------------
    # 8) Save outputs (requires Temp_file_path_DP and base_output_filename)
    # -------------------------------------------------------------------------
    # Both variables must be defined in a previous setup cell
    assert 'Temp_file_path_DP' in globals(), "Temp_file_path_DP not found."
    assert 'base_output_filename' in globals(), "base_output_filename not found (set in Cell 0)."

    # Full export path and write to pipe-delimited text
    out_full = os.path.join(Temp_file_path_DP, f"{base_output_filename}.txt")
    special_processed.to_csv(out_full, sep='|', index=False)
    print("\nSaved full:", out_full)

    # Subset export with a small selection of columns
    subset_cols = ["ID", "PIT Date", "CompanyName", "HistCurrency", "FiscalPeriod", "AnnPITValue"]
    subset_cols_existing = [col for col in subset_cols if col in special_processed.columns]
    subset_df = special_processed[subset_cols_existing].copy()
    out_subset = os.path.join(Temp_file_path_DP, f"{base_output_filename}_subset.txt")
    subset_df.to_csv(out_subset, sep='|', index=False)
    print("Saved subset:", out_subset)
    del subset_df  # free some memory

    # -------------------------------------------------------------------------
    # 9) Row-accounting overview
    # -------------------------------------------------------------------------
    output_rows = len(special_processed)
    print("\n=== Row Accounting ===")
    print(f"Input rows:                     {input_rows:,}")
    print(f"Excluded by Frequency (E/L/R/U):{excluded_rows:,}")
    print(f"Dropped by quality (Pct rules): {dropped_quality_rows:,}")
    print(f"Output rows (final):            {output_rows:,}")

    # Sanity check: excluded + dropped + final should equal original
    check_total = excluded_rows + dropped_quality_rows + output_rows
    print(f"Check: excluded + dropped + output = {check_total:,}")
    if check_total == input_rows:
        print("Row counts reconcile exactly.")
    else:
        print(f"Mismatch of {input_rows - check_total:+,} rows.")

    # Optional: trigger garbage collection (gc is imported in the environment setup cell)
    gc.collect()

else:
    # Nothing to do when special_encoded is not defined or is None
    print("special_encoded not found or None; skipping.")

Input dataset contains 1,484,067 rows before processing.


=== Future-date check (period dates > PIT Date) ===
Per-label violations: {'A_Date': 0, 'Q1_Date': 0, 'Q2_Date': 0, 'Q3_Date': 0, 'Q4_Date': 0, 'S1_Date': 0, 'S2_Date': 0, 'T1_Date': 0, 'T2_Date': 0, 'T3_Date': 0}
Rows with ANY future-dated period value: 0

=== AnnPITValue_Pct summary — BEFORE quality drop ===
         finite_rows: 741801
                mean: 26126.07958369098
              median: 100.0
winsorized_mean_1pct: 97.76615146707655
                 p10: 100.0
                 p20: 100.0
                 p30: 100.0
                 p40: 100.0
                 p50: 100.0
                 p60: 100.0
                 p70: 100.0
                 p80: 100.0
                 p90: 100.0

Rows to drop due to AnnPITValue_Pct (±inf or >250 or <25): 26,353

=== AnnPITValue_Pct summary — AFTER quality drop ===
         finite_rows: 715448
                mean: 99.8558697048265
              median: 100.0
winsorized_mean_1pct: 9

### Special 5

#### Set Index

In [275]:
# =============================================================================
# SELECT A SINGLE SPECIAL_* ITEM AND PREPARE PATHS
# =============================================================================
# This cell:
#   1. Chooses which Special_* item (from special_vars) should be processed.
#   2. Validates that special_vars and Temp_file_path_DP are available.
#   3. Builds the input file path for the selected "work_subset_<item>.txt".
#   4. Sets a base_output_filename for downstream output files.
#   5. Ensures the data-preparation temp directory exists.
#
# Usage:
#   - Adjust `special_index` to run a different Special_* dataset (e.g., 2, 3, 10 ...).
#   - Assumes `special_vars` was created in the categorization step and
#     `Temp_file_path_DP` was defined in the environment setup.

# === Select which Special_* item to run ===
# The only value meant to be edited by hand in this cell.
special_index = 5  # Change this to run another dataset, e.g. 10

# special_vars should look like: {'Special_1': 'SomeItem', 'Special_2': 'OtherItem', ...}
assert 'special_vars' in globals(), "special_vars dict not found in globals()."

# Build the key for the chosen index and find the corresponding item name
item_key = f"Special_{special_index}"
target_item_name = special_vars.get(item_key)
# Fails for a missing key and also for an empty item name (both are falsy)
assert target_item_name, f"{item_key} not found in special_vars."

print(f"Selected: {item_key}  ->  ItemName: '{target_item_name}'")

# === Paths (reusing your globals) ===
assert 'Temp_file_path_DP' in globals(), "Temp_file_path_DP not found."

# Input file for this item (produced by previous merging steps)
file_name = f"work_subset_{target_item_name}.txt"
file_path = os.path.join(Temp_file_path_DP, file_name)

# Base name for output files created by the "Special" pipeline;
# later cells append ".txt" and "_subset.txt" to it.
base_output_filename = f"Special_{target_item_name}_complete"

# Make sure the output directory exists (no error if it already does)
Path(Temp_file_path_DP).mkdir(parents=True, exist_ok=True)


Selected: Special_5  ->  ItemName: 'Extraordinary_Items'


#### Import relevant data



In [276]:
# =============================================================================
# LOAD THE FULL DATASET FOR THE SELECTED SPECIAL ITEM
# =============================================================================
# This cell:
#   1. Uses `target_item_name` and `file_path` (defined in the previous cell)
#      to load the corresponding work_subset file.
#   2. Imports the file using `import_file_to_dataframe`.
#   3. Performs safety checks for existence and emptiness.
#   4. Shows a preview of the loaded dataset.
#   5. Falls back to an empty DataFrame if loading fails.
#   6. Runs garbage collection afterwards.
# =============================================================================

print(f"\nImporting full dataset for Item: '{target_item_name}' ...")

if not os.path.exists(file_path):
    # Nothing on disk for this item: continue with an empty frame so that
    # later cells can test `special_raw.empty` instead of failing.
    print(f"File not found: {file_path}")
    special_raw = pd.DataFrame()
else:
    special_raw = import_file_to_dataframe(file_path)

    if special_raw is None or special_raw.empty:
        # The importer returned nothing usable; normalise to an empty frame.
        print("Dataset appears empty or could not be loaded.")
        special_raw = pd.DataFrame()
    else:
        print(f"Full dataset loaded successfully: {len(special_raw):,} rows total.")
        try:
            # Rich preview when an IPython display hook is available ...
            display(special_raw.head())
        except Exception:
            # ... plain-text preview otherwise.
            print(special_raw.head().to_string(index=False))

# Last expression of the cell: the notebook echoes its return value.
gc.collect()



Importing full dataset for Item: 'Extraordinary_Items' ...
Full dataset loaded successfully: 1,503,226 rows total.


Unnamed: 0,ID,CompanyName,ImplCountry,CurrentCurrency,HistCurrency,PIT Date,Frequency,UpdateCode,FiscalPeriod,FYE Month,ItemCode,Value
0,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1992,December,4225,0.0
1,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1993,December,4225,-8.619558
2,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1994,December,4225,-25.858138
3,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1996-05-03,A,3,1995,December,4225,-83.558991
4,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1998-07-03,A,3,1996,December,4225,-13.904


0

#### Encode Frequency Code (Check of output required!)

In [277]:
# =============================================================================
# SUMMARY
# =============================================================================
# (unchanged documentation)
# ...

def last2(n):
    """
    Format the final two digits of ``n`` as a two-character string.

    ``n`` is converted with ``int()`` and zero-padded to at least four digits
    before the last two characters are taken. Values that ``pd.isna``
    considers missing yield ``None``.
    """
    if not pd.isna(n):
        padded = f"{int(n):04d}"
        return padded[-2:]
    return None


def add_str_fiscalprd(df):
    """
    Decode the numeric ``FiscalPeriod`` into a period label and a 4-digit year.

    The encoding depends on ``Frequency`` (upper-cased, missing -> ''):

      * C/Q/E/R : label ``Q<n>Y<yy>``, n = FiscalPeriod % 4 + 1, year = FiscalPeriod // 4
      * A/B     : label ``Y<yy>``,     year = FiscalPeriod
      * F/S     : label ``S<n>Y<yy>``, n = FiscalPeriod % 2 + 1, year = FiscalPeriod // 2
      * K/T/L/U : label ``T<n>Y<yy>``, n = FiscalPeriod % 3 + 1, year = FiscalPeriod // 3

    ``<yy>`` is the last two digits of the decoded year. A four-digit year is
    rebuilt from it (yy >= 80 -> 19yy, otherwise 20yy) and written back to
    ``FiscalPeriod``. Rows whose Frequency is in none of the groups keep a
    missing ``Str_FiscalPrd`` and get a missing ``FiscalPeriod``.

    For annual (A/B) rows the rebuilt year is compared with the original
    ``FiscalPeriod``; mismatches are printed (and previewed via ``display``).

    Parameters
    ----------
    df : pd.DataFrame
        Needs the columns ``Frequency`` and ``FiscalPeriod``; ``ID`` is also
        read when discrepancies are displayed. The input is not modified.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` with the new ``Str_FiscalPrd`` column and the
        overwritten ``FiscalPeriod``.
    """
    df = df.copy()
    df["Frequency"] = df["Frequency"].str.upper().fillna("")
    # Keep the raw period so the annual consistency check can compare against it
    df['Original_FiscalPeriod'] = df['FiscalPeriod']

    fp = pd.to_numeric(df["FiscalPeriod"], errors="coerce")

    m_quarter = df["Frequency"].isin(["C", "Q", "E", "R"])
    m_AB      = df["Frequency"].isin(["A", "B"])
    m_FS      = df["Frequency"].isin(["F", "S"])
    m_KTLU    = df["Frequency"].isin(["K", "T", "L", "U"])

    # Create the label column with object dtype. A bare `np.nan` assignment
    # would create a float64 column, and writing strings into it via .loc
    # raises pandas' "incompatible dtype" FutureWarning (an error in newer
    # pandas). Object dtype also keeps `.str` usable when no row matches.
    df["Str_FiscalPrd"] = pd.Series(np.nan, index=df.index, dtype="object")

    # Quarterly: four periods per year
    q_part = ((fp % 4) + 1).where(m_quarter)
    q_year = (fp // 4).where(m_quarter).apply(last2)
    df.loc[m_quarter, "Str_FiscalPrd"] = (
        "Q" + q_part.astype("Int64").astype(str) + "Y" + q_year.fillna('')
    )

    # Annual: FiscalPeriod already is the year
    ab_year = fp.where(m_AB).apply(last2)
    df.loc[m_AB, "Str_FiscalPrd"] = "Y" + ab_year.fillna('')

    # Semiannual: two periods per year
    fs_sem  = ((fp % 2) + 1).where(m_FS)
    fs_year = (fp // 2).where(m_FS).apply(last2)
    df.loc[m_FS, "Str_FiscalPrd"] = (
        "S" + fs_sem.astype("Int64").astype(str) + "Y" + fs_year.fillna('')
    )

    # Trimester: three periods per year
    t_term  = ((fp % 3) + 1).where(m_KTLU)
    t_year  = (fp // 3).where(m_KTLU).apply(last2)
    df.loc[m_KTLU, "Str_FiscalPrd"] = (
        "T" + t_term.astype("Int64").astype(str) + "Y" + t_year.fillna('')
    )

    # Rebuild a four-digit year from the two-digit year inside the label
    year_part = df['Str_FiscalPrd'].str.extract(r'Y(\d{2})', expand=False)
    year_numeric = pd.to_numeric(year_part, errors='coerce')

    # Century pivot: 80..99 -> 19xx, 00..79 -> 20xx
    df['ImplFiscPer_Calculated'] = year_numeric.apply(
        lambda x: int(f"19{int(x):02d}") if pd.notna(x) and x >= 80
        else (int(f"20{int(x):02d}") if pd.notna(x) else np.nan)
    )

    # Consistency check on annual rows: rebuilt year must equal the original
    # FiscalPeriod, or both must be missing.
    annual_rows_for_check = df[m_AB].copy()
    discrepancy_mask_annual = ~(
        (annual_rows_for_check['ImplFiscPer_Calculated'] ==
         pd.to_numeric(annual_rows_for_check['Original_FiscalPeriod'], errors='coerce')) |
        (annual_rows_for_check['ImplFiscPer_Calculated'].isna() &
         pd.to_numeric(annual_rows_for_check['Original_FiscalPeriod'], errors='coerce').isna())
    )

    discrepancy_rows = annual_rows_for_check[discrepancy_mask_annual].copy()

    if not discrepancy_rows.empty:
        print("\nDiscrepancies found between original FiscalPeriod and calculated ImplFiscPer for Annual (A, B) frequencies:")
        display(
            discrepancy_rows[
                ['ID', 'Frequency', 'Original_FiscalPeriod', 'Str_FiscalPrd', 'ImplFiscPer_Calculated']
            ].head()
        )
        print(f"Total discrepancies found for Annual frequencies: {len(discrepancy_rows)}")
    else:
        print("\nNo discrepancies found between original FiscalPeriod and calculated ImplFiscPer for Annual (A, B) frequencies.")

    # Replace the encoded period by the calendar year and drop the helpers
    df['FiscalPeriod'] = df['ImplFiscPer_Calculated']
    df.drop(columns=['Original_FiscalPeriod', 'ImplFiscPer_Calculated'], inplace=True)

    return df


# =============================================================================
# Driver: encode special_raw (only when it exists and actually holds rows)
# =============================================================================
if 'special_raw' not in globals() or special_raw is None or special_raw.empty:
    # Nothing to encode; downstream cells test `special_encoded is not None`
    print("special_raw not found or empty. Cannot perform encoding.")
    special_encoded = None
else:
    print(f"Applying encoding to Special dataset for '{target_item_name}' ...")
    special_encoded = add_str_fiscalprd(special_raw)
    display(special_encoded.head())


Applying encoding to Special dataset for 'Extraordinary_Items' ...


  df.loc[m_quarter, "Str_FiscalPrd"] = (



No discrepancies found between original FiscalPeriod and calculated ImplFiscPer for Annual (A, B) frequencies.


Unnamed: 0,ID,CompanyName,ImplCountry,CurrentCurrency,HistCurrency,PIT Date,Frequency,UpdateCode,FiscalPeriod,FYE Month,ItemCode,Value,Str_FiscalPrd
0,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1992,December,4225,0.0,Y92
1,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1993,December,4225,-8.619558,Y93
2,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1994,December,4225,-25.858138,Y94
3,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1996-05-03,A,3,1995,December,4225,-83.558991,Y95
4,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1998-07-03,A,3,1996,December,4225,-13.904,Y96


#### Annualize data with most recent information (Check of output required!)

In [278]:
# @title
# =============================================================================
# SUMMARY
# =============================================================================
# This script:
#   - Implements a fast "as-of join" between two DataFrames based on PIT dates
#     and key columns (asof_numpy).
#   - Provides helpers for percentile summaries and winsorized statistics.
#   - Builds annualized "AnnPITValue" values from:
#       * true annual data (A/B frequency) and
#       * sums of quarterly data (Q1..Q4) when available.
#   - Performs various quality checks (future-dated values, extreme percentages).
#   - Produces a processed "special_processed" DataFrame and saves:
#       * a full export and
#       * a subset export.
#   - Prints a row-accounting overview so drops and exclusions are transparent.
# =============================================================================


# ---------- Helper: fast as-of (right.PIT <= left.PIT) ----------
def _key(fr, cols):
    """
    Build a combined string key from multiple columns by concatenating them
    with '||' to use as a group key.
    """
    # Convert all key columns to string and join them row-wise with '||'
    return fr[cols].astype(str).agg('||'.join, axis=1)


def asof_numpy(left_df: pd.DataFrame, right_df: pd.DataFrame, by_cols: list[str]):
    """
    For each row in left_df, find the latest (as-of) Value from right_df
    with matching by_cols and right_df['PIT Date'] <= left_df['PIT Date'].

    Rows with a missing key column, a missing 'PIT Date' (including dates that
    cannot be parsed) or, on the right side, a missing 'Value' take no part in
    the match. Dates are compared at day granularity. When several right rows
    share the same key and day, the one that comes last in right_df wins.

    Results are aligned to left_df by row POSITION, so left_df may carry any
    index (non-contiguous, unsorted or duplicated labels).

    Parameters
    ----------
    left_df : pd.DataFrame
        Needs the columns in ``by_cols`` plus 'PIT Date'.
    right_df : pd.DataFrame
        Needs the columns in ``by_cols`` plus 'PIT Date' and 'Value'.
    by_cols : list[str]
        Columns that must match exactly between both frames.

    Returns
    -------
    out_vals : np.ndarray
        Array of float values (same length as left_df) containing the matched
        values from right_df (or NaN if none found).
    out_dates : np.ndarray
        Array of datetime64 values containing the matched dates (or NaT).
    """
    # Initialize output arrays with NaN/NaT
    out_vals  = np.full(len(left_df), np.nan, dtype='float64')
    out_dates = np.full(len(left_df), 'NaT', dtype='datetime64[ns]')

    # Required columns in left/right for the as-of join
    left_req  = by_cols + ['PIT Date']
    right_req = by_cols + ['PIT Date', 'Value']

    # Boolean masks (plain numpy, so no index alignment is involved)
    lmask = left_df[left_req].notna().all(axis=1).to_numpy()
    rmask = right_df[right_req].notna().all(axis=1).to_numpy()

    # If no valid rows on either side, return empty outputs
    if not lmask.any() or not rmask.any():
        return out_vals, out_dates

    # Work on copies of the filtered frames
    l = left_df.loc[lmask, left_req].copy()
    r = right_df.loc[rmask, right_req].copy()

    # Row positions of the kept left rows inside left_df / the output arrays.
    # Index labels must NOT be used here: they only coincide with positions
    # when left_df has a default 0..n-1 index.
    l_pos = np.flatnonzero(lmask)

    # Normalize PIT dates to daily granularity
    l['PIT Date'] = pd.to_datetime(l['PIT Date'], errors='coerce').dt.floor('D')
    r['PIT Date'] = pd.to_datetime(r['PIT Date'], errors='coerce').dt.floor('D')

    # Values that failed to parse are NaT now. NaT sorts after every real
    # date, so a left NaT would otherwise "match" the newest right row.
    l_ok = l['PIT Date'].notna().to_numpy()
    if not l_ok.all():
        l = l.loc[l_ok].copy()
        l_pos = l_pos[l_ok]
    r_ok = r['PIT Date'].notna().to_numpy()
    if not r_ok.all():
        r = r.loc[r_ok].copy()
    if l.empty or r.empty:
        return out_vals, out_dates

    # Build grouping keys on both sides
    l['__k'] = _key(l, by_cols)
    r['__k'] = _key(r, by_cols)

    # Sort right side by key and PIT Date so we can binary-search later
    r = r.sort_values(['__k', 'PIT Date']).reset_index(drop=True)

    # Extract numpy arrays for fast vectorized operations
    rk   = r['__k'].to_numpy()
    rdt  = r['PIT Date'].to_numpy()
    rval = r['Value'].to_numpy()

    # Determine contiguous slices of rows for each unique key in right_df
    uniq, first = np.unique(rk, return_index=True)
    slices = {}
    for i, k in enumerate(uniq):
        s = first[i]                         # start index for this key
        e = first[i + 1] if i + 1 < len(first) else len(r)  # end index
        slices[k] = (rdt[s:e], rval[s:e])    # store date and value slices

    # Keys and dates of left rows
    lk    = l['__k'].to_numpy()
    ldt   = l['PIT Date'].to_numpy()

    # Sort left rows by key (stable sort) for block processing
    order = np.argsort(lk, kind='mergesort')
    sk, sd, sp = lk[order], ldt[order], l_pos[order]

    # Iterate over blocks of the same key in left_df
    i = 0
    n = len(sk)
    while i < n:
        k = sk[i]  # current key
        j = i + 1
        # Find the end of this key's block
        while j < n and sk[j] == k:
            j += 1

        # Block of PIT dates and corresponding output positions for this key
        block_dates = sd[i:j]
        block_pos   = sp[i:j]

        # Only process if the key exists in the right-hand slices
        if k in slices:
            r_dates, r_vals = slices[k]

            # For each left PIT date, find index of right PIT <= left PIT
            pos   = np.searchsorted(r_dates, block_dates, side='right') - 1
            valid = pos >= 0  # only those with at least one right date <= left date

            # Write results back to output arrays
            if np.any(valid):
                out_vals[block_pos[valid]]  = r_vals[pos[valid]]
                out_dates[block_pos[valid]] = r_dates[pos[valid]]

        # Move to the next block
        i = j

    return out_vals, out_dates


# ---------- Small helpers ----------
def pctile(s, q):
    """
    Return the ``q``-quantile of ``s`` using linear interpolation.

    Any exception raised while computing the quantile is swallowed and NaN
    is returned instead.
    """
    try:
        result = s.quantile(q, interpolation='linear')
    except Exception:
        result = np.nan
    return result


def summarize_pct(series: pd.Series):
    """
    Summarise the finite values of a numeric series.

    +/-inf and missing entries are discarded first. If nothing is left an
    empty dict is returned; otherwise the dict holds the number of finite
    rows, mean, median, the mean after 1% two-sided winsorization and the
    deciles p10..p90.
    """
    finite = series.replace([np.inf, -np.inf], np.nan).dropna()
    if finite.empty:
        return {}

    # winsorize needs a writable array, hence the explicit copy
    writable = finite.to_numpy().copy()
    trimmed_mean = winsorize(writable, limits=[0.01, 0.01]).mean()

    stats = {
        "finite_rows": len(finite),
        "mean": finite.mean(),
        "median": finite.median(),
        "winsorized_mean_1pct": trimmed_mean,
    }
    # Deciles p10 .. p90, in ascending order
    for decile in range(10, 100, 10):
        stats[f"p{decile}"] = pctile(finite, decile / 100)
    return stats


# ---------- Period prioritization ----------
# Priority ranking for period labels when deciding between multiple candidates
_PERIOD_PRIORITY = {
    'A': 100,  # Full annual has highest priority
    'Q4': 90,
    'T3': 80,
    'S2': 70,
    'Q3': 60,
    'T2': 50,
    'S1': 40,
    'Q2': 30,
    'T1': 20,
    'Q1': 10,
}


def _label_from_colname(colname: str) -> str:
    """
    Map column names to period labels used in _PERIOD_PRIORITY.
    Currently only special-cases 'A'.
    """
    return 'A' if colname == 'A' else colname


# ---------- Helpers for AnnPITValue using A + Q1..Q4 sum ----------
def full_year_from_quarters(row, pit, cutoff):
    """
    Assemble a full-year figure by adding up the four quarters of ``row``.

    A result is produced only if every quarter Q1..Q4 has a value, a date and
    an OriginFP, and every quarter date lies inside [cutoff, pit]. As soon as
    one quarter fails a check the function gives up.

    Parameters
    ----------
    row : pd.Series
        Row from the working DataFrame; read via ``row.get`` for the keys
        'Q<n>', 'Q<n>_Date' and 'Q<n>_OriginFP'.
    pit : datetime-like
        Upper bound for the quarter dates (the row's PIT Date).
    cutoff : datetime-like
        Lower bound for the quarter dates.

    Returns
    -------
    (dt, val_sum, origin_fp) or (NaT, NaN, NaN)
        dt        : most recent of the four quarter dates
        val_sum   : sum of the four quarter values
        origin_fp : largest OriginFP among the four quarters
    """
    no_result = (pd.NaT, np.nan, np.nan)
    amounts, stamps, origins = [], [], []

    for quarter in range(1, 5):
        name = f'Q{quarter}'
        amount = row.get(name, np.nan)
        stamp = row.get(f'{name}_Date', pd.NaT)
        origin = row.get(f'{name}_OriginFP', np.nan)

        # Every quarter needs value, date and origin period
        if pd.isna(amount) or pd.isna(stamp) or pd.isna(origin):
            return no_result

        # The date must parse and fall inside the accepted window
        stamp = pd.to_datetime(stamp, errors='coerce')
        inside_window = pd.notna(stamp) and cutoff <= stamp <= pit
        if not inside_window:
            return no_result

        amounts.append(float(amount))
        stamps.append(stamp)
        origins.append(int(origin))

    # All four quarters passed: aggregate them
    return max(stamps), float(np.nansum(amounts)), max(origins)


def pick_annpit_sum_with_origin(row):
    """
    Choose the annualized value (AnnPITValue) for a single row.

    Two candidates are considered, each only if its date lies within the
    365 days up to and including the row's PIT Date:

      * 'A'  : the annual value (columns A / A_Date / A_OriginFP)
      * 'Q4' : the sum of Q1..Q4 from ``full_year_from_quarters``

    Candidates are then tried in this order, comparing each candidate's
    OriginFP with the row's FiscalPeriod:

      1. A from the same fiscal year
      2. Q4 sum from the same fiscal year
      3. A from the prior fiscal year
      4. Q4 sum from the prior fiscal year
      5. any remaining candidate, highest period priority first, then latest date

    Returns NaN when the PIT Date is missing or no candidate qualifies. A
    value of 0 is a legitimate result.
    """
    pit = row['PIT Date']
    if pd.isna(pit):
        # Without a PIT Date there is no window to evaluate
        return np.nan

    # Oldest date still accepted for a candidate
    cutoff = pit - timedelta(days=365)

    # Row fiscal period as int, or None if missing / not convertible
    raw_fp = row.get('FiscalPeriod', np.nan)
    try:
        fp_int = None if pd.isna(raw_fp) else int(raw_fp)
    except Exception:
        fp_int = None

    # Candidate layout: (label, period_priority, date, value, origin_fp)
    candidates = []

    # Annual candidate
    annual_val = row.get('A', np.nan)
    annual_dt = row.get('A_Date', pd.NaT)
    annual_fp = row.get('A_OriginFP', np.nan)
    if not (pd.isna(annual_val) or pd.isna(annual_dt) or pd.isna(annual_fp)):
        annual_dt = pd.to_datetime(annual_dt, errors='coerce')
        if pd.notna(annual_dt) and cutoff <= annual_dt <= pit:
            candidates.append(
                ('A', _PERIOD_PRIORITY['A'], annual_dt, float(annual_val), int(annual_fp))
            )

    # Quarter-sum candidate
    sum_dt, sum_val, sum_fp = full_year_from_quarters(row, pit, cutoff)
    if not (pd.isna(sum_val) or pd.isna(sum_dt) or pd.isna(sum_fp)):
        candidates.append(
            ('Q4', _PERIOD_PRIORITY['Q4'], sum_dt, float(sum_val), int(sum_fp))
        )

    if not candidates:
        return np.nan

    def relation(cand):
        """Classify the candidate's origin year relative to the row's year."""
        origin = cand[4]
        if fp_int is None or origin is None:
            return 'unknown'
        if origin == fp_int:
            return 'same'
        if origin == fp_int - 1:
            return 'prior'
        return 'other'

    def by_date(cand):
        return cand[2]

    def by_priority_then_date(cand):
        return (cand[1], cand[2])

    # Preference tiers 1-4: (label, required relation, ranking key)
    tiers = (
        ('A', 'same', by_date),
        ('Q4', 'same', by_priority_then_date),
        ('A', 'prior', by_date),
        ('Q4', 'prior', by_priority_then_date),
    )
    for label, wanted, ranking in tiers:
        pool = [
            c for c in candidates
            if c[0] == label and relation(c) == wanted and not np.isnan(c[3])
        ]
        if pool:
            return max(pool, key=ranking)[3]

    # Tier 5: whatever is left (relation 'other' or 'unknown')
    leftovers = [c for c in candidates if not np.isnan(c[3])]
    if leftovers:
        return max(leftovers, key=by_priority_then_date)[3]

    # Not expected to be reached: every candidate carries a non-NaN value
    return 0.0


# ============================ MAIN ============================
# Pipeline overview (runs only when `special_encoded` exists and is not None):
#   input   : special_encoded (output of the encoding cell)
#   needs   : Temp_file_path_DP and base_output_filename as notebook globals
#   produces: special_processed (DataFrame) plus two pipe-delimited text files
#             "<base_output_filename>.txt" and "<base_output_filename>_subset.txt"
#   rows    : frequencies E/L/R/U are excluded up front; rows whose
#             AnnPITValue_Pct is +/-inf or outside [25, 250] are dropped later;
#             rows with a missing AnnPITValue_Pct are kept.
if 'special_encoded' in globals() and special_encoded is not None:
    # Remember the number of input rows for row-accounting
    input_rows = len(special_encoded)
    print(f"Input dataset contains {input_rows:,} rows before processing.\n")

    # Work on a copy so we do not mutate the original DataFrame
    working = special_encoded.copy()

    # Exclude certain frequencies (E, L, R, U) from further processing
    excl_mask = working['Frequency'].astype(str).str.upper().isin(['E', 'L', 'R', 'U'])
    excluded_rows = int(excl_mask.sum())
    working = working.loc[~excl_mask].copy()

    # Convert key columns to appropriate types
    # (values that do not fit the expected format become NaT / NaN)
    working['PIT Date'] = pd.to_datetime(
        working['PIT Date'], format='%Y-%m-%d', errors='coerce'
    ).dt.floor('D')
    working['FiscalPeriod'] = pd.to_numeric(working['FiscalPeriod'], errors='coerce')
    working['Value']        = pd.to_numeric(working['Value'], errors='coerce')

    # Ensure some ID-like columns are strings
    # NOTE: astype(str) turns missing entries into literal strings such as
    # 'nan', so these columns no longer count as missing afterwards.
    for c in ['ID', 'HistCurrency', 'ItemCode', 'Frequency', 'Str_FiscalPrd']:
        if c in working.columns:
            working[c] = working[c].astype(str)

    # Parse Q/S/T numbers from Str_FiscalPrd (e.g. 'Q1Y23' -> QNUM=1)
    working['QNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^Q([1-4])Y', expand=False),
        errors='coerce'
    )
    working['SNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^S([1-2])Y', expand=False),
        errors='coerce'
    )
    working['TNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^T([1-3])Y', expand=False),
        errors='coerce'
    )

    # Define all period/value and period/date column names
    period_vals = [f'Q{i}' for i in range(1, 5)] + \
                  [f'S{i}' for i in range(1, 3)] + \
                  [f'T{i}' for i in range(1, 4)] + ['A']
    period_dates = [f'{p}_Date' for p in [f'Q{i}' for i in range(1,5)] + \
                                       [f'S{i}' for i in range(1,3)] + \
                                       [f'T{i}' for i in range(1,4)]] + ['A_Date']

    # Ensure all period value columns exist (initialize if missing)
    for c in period_vals:
        if c not in working.columns:
            working[c] = np.nan

    # Ensure all period date columns exist (initialize if missing)
    for c in period_dates:
        if c not in working.columns:
            working[c] = pd.NaT

    # Keys used to identify time series in as-of joins.
    # FiscalPeriod is part of the key, so an as-of match always comes from a
    # source row with the same FiscalPeriod as the target row.
    base_keys = ['ID', 'HistCurrency', 'ItemCode', 'FiscalPeriod']

    # -------------------------------------------------------------------------
    # 1) TrueValue from annuals: build reference "TrueValue" per ID/FiscalPeriod
    # -------------------------------------------------------------------------
    # For each (ID, FiscalPeriod, HistCurrency) keep the annual value with the
    # latest PIT Date (sort ascending, keep='last').
    mask_annual = working['Frequency'].isin(['A', 'B']) & working['Value'].notna()
    annual_src = (
        working.loc[mask_annual,
                    ['ID', 'FiscalPeriod', 'HistCurrency', 'PIT Date', 'Value']]
        .sort_values(['ID', 'FiscalPeriod', 'HistCurrency', 'PIT Date'])
        .drop_duplicates(['ID', 'FiscalPeriod', 'HistCurrency'], keep='last')
        .rename(columns={'Value': 'TrueValue', 'PIT Date': 'TrueValue_Date'})
    )
    # Merge TrueValue back on keys (annual_src is unique per key, so the left
    # merge does not multiply rows)
    working = working.merge(
        annual_src,
        on=['ID', 'FiscalPeriod', 'HistCurrency'],
        how='left'
    )

    # -------------------------------------------------------------------------
    # 2) As-of mapping for each frequency (no prior-year / no forward-fill)
    # -------------------------------------------------------------------------

    # Annual (A/B) as-of
    src_A = working.loc[
        working['Frequency'].isin(['A', 'B']) & working['Value'].notna(),
        base_keys + ['PIT Date', 'Value']
    ].copy()
    vA, dA = asof_numpy(working, src_A, by_cols=base_keys)
    working['A'], working['A_Date'] = vA, dA
    # Origin fiscal period for A: the row's own FiscalPeriod wherever an
    # annual value was matched
    working['A_OriginFP'] = np.where(
        working['A'].notna(), working['FiscalPeriod'], np.nan
    )

    # Quarterly (Q/C) as-of, by quarter number
    src_Q = working.loc[
        working['Frequency'].isin(['Q', 'C']) & working['QNUM'].notna(),
        base_keys + ['QNUM', 'PIT Date', 'Value']
    ].copy()
    for q in (1, 2, 3, 4):
        # Subset source to a specific quarter
        rv = src_Q[src_Q['QNUM'] == q].drop(columns=['QNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'Q{q}', f'Q{q}_Date'
        working[col], working[dcol] = v, d

        # Set OriginFP where we have a newly filled quarter
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        mask = working[col].notna() & working[ocol].isna()
        working.loc[mask, ocol] = working.loc[mask, 'FiscalPeriod']

    # Semiannual (S/F) as-of, by half-year number
    src_S = working.loc[
        working['Frequency'].isin(['S', 'F']) & working['SNUM'].notna(),
        base_keys + ['SNUM', 'PIT Date', 'Value']
    ].copy()
    for s in (1, 2):
        # Subset source to a specific half-year
        rv = src_S[src_S['SNUM'] == s].drop(columns=['SNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'S{s}', f'S{s}_Date'
        working[col], working[dcol] = v, d

        # Same OriginFP bookkeeping as for the quarters
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        mask = working[col].notna() & working[ocol].isna()
        working.loc[mask, ocol] = working.loc[mask, 'FiscalPeriod']

    # Trimester (T/K) as-of, by term number
    src_T = working.loc[
        working['Frequency'].isin(['T', 'K']) & working['TNUM'].notna(),
        base_keys + ['TNUM', 'PIT Date', 'Value']
    ].copy()
    for t in (1, 2, 3):
        # Subset source to a specific term
        rv = src_T[src_T['TNUM'] == t].drop(columns=['TNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'T{t}', f'T{t}_Date'
        working[col], working[dcol] = v, d

        # Same OriginFP bookkeeping as for the quarters
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        mask = working[col].notna() & working[ocol].isna()
        working.loc[mask, ocol] = working.loc[mask, 'FiscalPeriod']

    # -------------------------------------------------------------------------
    # 3) Prepare labels and normalize dates (only as-of results, no ffill)
    # -------------------------------------------------------------------------
    working = working.sort_values(['ID', 'HistCurrency', 'FiscalPeriod', 'PIT Date'])

    value_labels  = period_vals
    date_labels   = period_dates
    origin_labels = [f'{lbl}_OriginFP' for lbl in value_labels]

    # Ensure all date columns are valid datetimes at day precision
    for c in date_labels:
        if c in working.columns:
            working[c] = pd.to_datetime(working[c], errors='coerce').dt.floor('D')

    # -------------------------------------------------------------------------
    # 4) AnnPITValue with new logic (A + Q1..Q4 sum, zeros allowed)
    # -------------------------------------------------------------------------
    # Row-wise apply: only the A and Q1..Q4 columns feed AnnPITValue; the S*
    # and T* columns are carried along for the export.
    working['AnnPITValue'] = working.apply(
        pick_annpit_sum_with_origin,
        axis=1
    )

    # -------------------------------------------------------------------------
    # 5) QC: Future-date check (period date > PIT Date)
    # -------------------------------------------------------------------------
    date_cols_all = [
        'A_Date',
        'Q1_Date', 'Q2_Date', 'Q3_Date', 'Q4_Date',
        'S1_Date', 'S2_Date',
        'T1_Date', 'T2_Date', 'T3_Date'
    ]
    # Only use date columns that actually exist
    present = [c for c in date_cols_all if c in working.columns]
    viol_counts = {}
    any_mask = pd.Series(False, index=working.index)

    # For each period date column, check if it's after PIT Date
    for c in present:
        m = (
            working[c].notna() &
            working['PIT Date'].notna() &
            (pd.to_datetime(working[c], errors='coerce') > working['PIT Date'])
        )
        # Count violations per column
        viol_counts[c] = int(m.sum())
        # Track rows with any violation across all period dates
        any_mask |= m

    total_future_viol = int(any_mask.sum())
    print("\n=== Future-date check (period dates > PIT Date) ===")
    print("Per-label violations:", viol_counts)
    print(f"Rows with ANY future-dated period value: {total_future_viol}")
    # Flag rows with at least one future-date violation (flag only, no drop)
    working['HasFutureDateError'] = any_mask

    # -------------------------------------------------------------------------
    # 6) AnnPITValue_Pct and quality filter
    # -------------------------------------------------------------------------
    # Percentage of AnnPITValue relative to TrueValue (%); NaN when either
    # value is missing or TrueValue is 0
    working['AnnPITValue_Pct'] = np.where(
        working['AnnPITValue'].notna() &
        working['TrueValue'].notna() &
        (working['TrueValue'] != 0),
        (working['AnnPITValue'] / working['TrueValue']) * 100,
        np.nan
    )

    # Summary BEFORE dropping outliers
    pre_stats = summarize_pct(working['AnnPITValue_Pct'])
    print("\n=== AnnPITValue_Pct summary — BEFORE quality drop ===")
    for k, v in pre_stats.items():
        print(f"{k:>20}: {v}")

    pct = working['AnnPITValue_Pct']
    # Flag infinities
    is_inf = np.isinf(pct)
    # Flag finite out-of-range values outside [25, 250]
    is_finite = np.isfinite(pct)
    out_of_range = is_finite & ((pct > 250) | (pct < 25))
    # Combined drop mask: infinities or out-of-range finite values.
    # NaN percentages are neither infinite nor finite, so those rows survive.
    to_drop_quality = is_inf | out_of_range

    # Count dropped rows due to quality rules
    dropped_quality_rows = int(to_drop_quality.sum())
    print(f"\nRows to drop due to AnnPITValue_Pct (±inf or >250 or <25): {dropped_quality_rows:,}")

    # Keep only rows that pass the quality filter
    working = working.loc[~to_drop_quality].copy()

    # Summary AFTER dropping outliers
    post_stats = summarize_pct(working['AnnPITValue_Pct'])
    print("\n=== AnnPITValue_Pct summary — AFTER quality drop ===")
    if post_stats:
        for k, v in post_stats.items():
            print(f"{k:>20}: {v}")
    else:
        print("No finite values remain after the quality drop.")

    # -------------------------------------------------------------------------
    # 7) Final columns and cleanup
    # -------------------------------------------------------------------------
    # Core columns that describe each row
    base_cols = [
        'ID', 'CompanyName', 'ImplCountry', 'CurrentCurrency', 'HistCurrency',
        'PIT Date', 'Frequency', 'UpdateCode', 'FiscalPeriod', 'FYE Month',
        'ItemCode', 'Value', 'Str_FiscalPrd'
    ]

    # Period-related columns (Dates and Values)
    freq_cols = []
    for i in range(1, 5):
        freq_cols += [f'Q{i}_Date', f'Q{i}']
    for i in range(1, 3):
        freq_cols += [f'S{i}_Date', f'S{i}']
    for i in range(1, 4):
        freq_cols += [f'T{i}_Date', f'T{i}']
    freq_cols += ['A_Date', 'A']

    # Columns we want to keep in the final output
    keep_cols = (
        [c for c in base_cols if c in working.columns] +
        ['TrueValue', 'AnnPITValue', 'AnnPITValue_Pct', 'HasFutureDateError'] +
        [c for c in freq_cols if c in working.columns]
    )

    # Drop helper columns such as OriginFP and intermediate numeric helpers
    drop_cols = [c for c in working.columns
                 if c.endswith('_OriginFP') or c in ['QNUM', 'SNUM', 'TNUM', 'TrueValue_Date']]
    working.drop(columns=drop_cols, inplace=True, errors='ignore')

    # Reorder to the final column set (columns not in keep_cols are left out)
    special_processed = working.reindex(columns=keep_cols)

    # -------------------------------------------------------------------------
    # 8) Save outputs (requires Temp_file_path_DP and base_output_filename)
    # -------------------------------------------------------------------------
    # Both variables must be defined in a previous setup cell
    # NOTE(review): these checks only run after all processing above has
    # finished; consider validating them at the top of the cell.
    assert 'Temp_file_path_DP' in globals(), "Temp_file_path_DP not found."
    assert 'base_output_filename' in globals(), "base_output_filename not found (set in Cell 0)."

    # Full export path and write to pipe-delimited text
    out_full = os.path.join(Temp_file_path_DP, f"{base_output_filename}.txt")
    special_processed.to_csv(out_full, sep='|', index=False)
    print("\nSaved full:", out_full)

    # Subset export with a small selection of columns
    subset_cols = ["ID", "PIT Date", "CompanyName", "HistCurrency", "FiscalPeriod", "AnnPITValue"]
    subset_cols_existing = [col for col in subset_cols if col in special_processed.columns]
    subset_df = special_processed[subset_cols_existing].copy()
    out_subset = os.path.join(Temp_file_path_DP, f"{base_output_filename}_subset.txt")
    subset_df.to_csv(out_subset, sep='|', index=False)
    print("Saved subset:", out_subset)
    del subset_df  # free some memory

    # -------------------------------------------------------------------------
    # 9) Row-accounting overview
    # -------------------------------------------------------------------------
    output_rows = len(special_processed)
    print("\n=== Row Accounting ===")
    print(f"Input rows:                     {input_rows:,}")
    print(f"Excluded by Frequency (E/L/R/U):{excluded_rows:,}")
    print(f"Dropped by quality (Pct rules): {dropped_quality_rows:,}")
    print(f"Output rows (final):            {output_rows:,}")

    # Sanity check: excluded + dropped + final should equal original
    check_total = excluded_rows + dropped_quality_rows + output_rows
    print(f"Check: excluded + dropped + output = {check_total:,}")
    if check_total == input_rows:
        print("Row counts reconcile exactly.")
    else:
        print(f"Mismatch of {input_rows - check_total:+,} rows.")

    # Optional: trigger garbage collection (gc is imported in the setup cell)
    gc.collect()

else:
    # Nothing to do if special_encoded is not defined or is None
    print("special_encoded not found or None; skipping.")

Input dataset contains 1,503,226 rows before processing.


=== Future-date check (period dates > PIT Date) ===
Per-label violations: {'A_Date': 0, 'Q1_Date': 0, 'Q2_Date': 0, 'Q3_Date': 0, 'Q4_Date': 0, 'S1_Date': 0, 'S2_Date': 0, 'T1_Date': 0, 'T2_Date': 0, 'T3_Date': 0}
Rows with ANY future-dated period value: 0

=== AnnPITValue_Pct summary — BEFORE quality drop ===
         finite_rows: 5663
                mean: 91651.86503264453
              median: 100.0
winsorized_mean_1pct: 96.70751525972483
                 p10: 100.0
                 p20: 100.0
                 p30: 100.0
                 p40: 100.0
                 p50: 100.0
                 p60: 100.0
                 p70: 100.0
                 p80: 100.0
                 p90: 100.0

Rows to drop due to AnnPITValue_Pct (±inf or >250 or <25): 194

=== AnnPITValue_Pct summary — AFTER quality drop ===
         finite_rows: 5469
                mean: 100.03079414107599
              median: 100.0
winsorized_mean_1pct: 100.0


### Special 6

#### Set Index

In [279]:
# =============================================================================
# SELECT A SINGLE SPECIAL_* ITEM AND PREPARE PATHS
# =============================================================================
# This cell:
#   1. Chooses which Special_* item (from special_vars) should be processed.
#   2. Validates that special_vars and Temp_file_path_DP are available.
#   3. Builds the input file path for the selected "work_subset_<item>.txt".
#   4. Sets a base_output_filename for downstream output files.
#   5. Ensures the data-preparation temp directory exists.
#
# Notebook globals set here: special_index, item_key, target_item_name,
# file_name, file_path, base_output_filename.
#
# Usage:
#   - Adjust `special_index` to run a different Special_* dataset (e.g., 2, 3, 10 ...).
#   - Assumes `special_vars` was created in the categorization step and
#     `Temp_file_path_DP` was defined in the environment setup.

# === Select which Special_* item to run ===
special_index = 6  # Change this to run another dataset, e.g. 10

# special_vars should look like: {'Special_1': 'SomeItem', 'Special_2': 'OtherItem', ...}
assert 'special_vars' in globals(), "special_vars dict not found in globals()."

# Build the key for the chosen index and find the corresponding item name
item_key = f"Special_{special_index}"
target_item_name = special_vars.get(item_key)
# Fails for a missing key as well as for an empty / falsy item name
assert target_item_name, f"{item_key} not found in special_vars."

print(f"Selected: {item_key}  ->  ItemName: '{target_item_name}'")

# === Paths (reusing your globals) ===
assert 'Temp_file_path_DP' in globals(), "Temp_file_path_DP not found."

# Input file for this item (produced by previous merging steps)
file_name = f"work_subset_{target_item_name}.txt"
file_path = os.path.join(Temp_file_path_DP, file_name)

# Base name for output files created by the "Special" pipeline
base_output_filename = f"Special_{target_item_name}_complete"

# Make sure the output directory exists (creates missing parents, no error
# if it is already there)
Path(Temp_file_path_DP).mkdir(parents=True, exist_ok=True)


Selected: Special_6  ->  ItemName: 'Funds_From_For_Other_Operating_Activities'


#### Import relevant data



In [280]:
# =============================================================================
# LOAD THE FULL DATASET FOR THE SELECTED SPECIAL ITEM
# =============================================================================
# Inputs : `target_item_name` and `file_path` from the selection cell.
# Output : `special_raw`
#            - the DataFrame returned by `import_file_to_dataframe`, or
#            - an empty DataFrame when the file is missing or the loader
#              returned None / an empty frame.
# A five-row preview is shown after a successful load, and garbage
# collection runs at the end of the cell.
# =============================================================================

print(f"\nImporting full dataset for Item: '{target_item_name}' ...")

# Guard first: without the input file there is nothing to import
if not os.path.exists(file_path):
    print(f"File not found: {file_path}")
    special_raw = pd.DataFrame()
else:
    special_raw = import_file_to_dataframe(file_path)

    if special_raw is None or special_raw.empty:
        # Loader returned nothing usable -> normalise to an empty DataFrame
        print("Dataset appears empty or could not be loaded.")
        special_raw = pd.DataFrame()
    else:
        print(f"Full dataset loaded successfully: {len(special_raw):,} rows total.")
        # Rich preview when `display` works, plain-text preview otherwise
        try:
            display(special_raw.head())
        except Exception:
            print(special_raw.head().to_string(index=False))

# Last expression of the cell: its return value is what the notebook echoes
gc.collect()



Importing full dataset for Item: 'Funds_From_For_Other_Operating_Activities' ...
Full dataset loaded successfully: 1,694,729 rows total.


Unnamed: 0,ID,CompanyName,ImplCountry,CurrentCurrency,HistCurrency,PIT Date,Frequency,UpdateCode,FiscalPeriod,FYE Month,ItemCode,Value
0,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1992,December,4831,-87.320244
1,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1993,December,4831,200.312706
2,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1994,December,4831,47.756416
3,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1996-05-03,A,3,1995,December,4831,361.917728
4,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1998-07-03,A,3,1996,December,4831,246.214426


0

#### Encode Frequency Code (Check of output required!)

In [281]:
# =============================================================================
# SUMMARY
# =============================================================================
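# This cell:
#   1. Defines `last2`, which formats the last two digits of a number as a
#      zero-padded string.
#   2. Defines `add_str_fiscalprd`, which decodes the numeric FiscalPeriod
#      into a label column `Str_FiscalPrd` depending on Frequency:
#        - C/Q/E/R -> "Q<n>Y<yy>"  (n = FiscalPeriod % 4 + 1, year = FiscalPeriod // 4)
#        - A/B     -> "Y<yy>"      (year = FiscalPeriod)
#        - F/S     -> "S<n>Y<yy>"  (n = FiscalPeriod % 2 + 1, year = FiscalPeriod // 2)
#        - K/T/L/U -> "T<n>Y<yy>"  (n = FiscalPeriod % 3 + 1, year = FiscalPeriod // 3)
#      and derives a four-digit year from <yy> (yy >= 80 -> 19yy, otherwise 20yy).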

def last2(n):
    """
    Format the final two digits of ``n`` as a zero-padded string.

    ``n`` is truncated to an integer first. Missing input (anything
    ``pd.isna`` reports as missing) yields ``None``.
    """
    if not pd.isna(n):
        # Pad to four digits so short numbers still yield two characters.
        return format(int(n), "04d")[-2:]
    return None


def add_str_fiscalprd(df):
    """
    Add a period label column ``Str_FiscalPrd`` and rewrite ``FiscalPeriod``
    as a four-digit year.

    The label depends on the (upper-cased) ``Frequency`` code:

      * C, Q, E, R -> ``Q<n>Y<yy>`` with n = FiscalPeriod % 4 + 1 and
        year = FiscalPeriod // 4
      * A, B       -> ``Y<yy>`` with year = FiscalPeriod
      * F, S       -> ``S<n>Y<yy>`` with n = FiscalPeriod % 2 + 1 and
        year = FiscalPeriod // 2
      * K, T, L, U -> ``T<n>Y<yy>`` with n = FiscalPeriod % 3 + 1 and
        year = FiscalPeriod // 3

    ``<yy>`` is the last two digits of the year. ``FiscalPeriod`` is then
    rebuilt from ``<yy>`` using a fixed pivot (yy >= 80 -> 19yy, otherwise
    20yy). For annual (A/B) rows the rebuilt year is compared with the
    original ``FiscalPeriod`` and any differences are printed/displayed.

    NOTE: because of the two-digit pivot, a year before 1980 is rebuilt as
    20yy; for annual rows this shows up in the discrepancy report.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``Frequency`` and ``FiscalPeriod``; ``ID`` is read only
        when discrepancies are displayed.

    Returns
    -------
    pd.DataFrame
        A copy of ``df`` with ``Str_FiscalPrd`` added, ``Frequency``
        upper-cased (missing -> ""), and ``FiscalPeriod`` replaced by the
        rebuilt four-digit year. The input frame is not modified.
    """
    df = df.copy()
    df["Frequency"] = df["Frequency"].str.upper().fillna("")
    df['Original_FiscalPeriod'] = df['FiscalPeriod']

    fp = pd.to_numeric(df["FiscalPeriod"], errors="coerce")

    m_quarter = df["Frequency"].isin(["C", "Q", "E", "R"])
    m_AB      = df["Frequency"].isin(["A", "B"])
    m_FS      = df["Frequency"].isin(["F", "S"])
    m_KTLU    = df["Frequency"].isin(["K", "T", "L", "U"])

    # Start as an object column: the labels are strings, and writing strings
    # into a float (NaN) column triggers pandas' incompatible-dtype warning.
    df["Str_FiscalPrd"] = np.full(len(df), np.nan, dtype=object)

    # Quarterly-style codes: FiscalPeriod encodes year * 4 + (quarter - 1).
    q_part = ((fp % 4) + 1).where(m_quarter)
    q_year = (fp // 4).where(m_quarter).apply(last2)
    df.loc[m_quarter, "Str_FiscalPrd"] = (
        "Q" + q_part.astype("Int64").astype(str) + "Y" + q_year.fillna('')
    )

    # Annual codes: FiscalPeriod is the year itself.
    ab_year = fp.where(m_AB).apply(last2)
    df.loc[m_AB, "Str_FiscalPrd"] = "Y" + ab_year.fillna('')

    # Semi-annual codes: FiscalPeriod encodes year * 2 + (half - 1).
    fs_sem  = ((fp % 2) + 1).where(m_FS)
    fs_year = (fp // 2).where(m_FS).apply(last2)
    df.loc[m_FS, "Str_FiscalPrd"] = (
        "S" + fs_sem.astype("Int64").astype(str) + "Y" + fs_year.fillna('')
    )

    # Trimester codes: FiscalPeriod encodes year * 3 + (term - 1).
    t_term  = ((fp % 3) + 1).where(m_KTLU)
    t_year  = (fp // 3).where(m_KTLU).apply(last2)
    df.loc[m_KTLU, "Str_FiscalPrd"] = (
        "T" + t_term.astype("Int64").astype(str) + "Y" + t_year.fillna('')
    )

    # Rebuild a four-digit year from the two-digit part of the label.
    year_part = df['Str_FiscalPrd'].str.extract(r'Y(\d{2})', expand=False)
    year_numeric = pd.to_numeric(year_part, errors='coerce')

    df['ImplFiscPer_Calculated'] = year_numeric.apply(
        lambda x: int(f"19{int(x):02d}") if pd.notna(x) and x >= 80
        else (int(f"20{int(x):02d}") if pd.notna(x) else np.nan)
    )

    # Consistency check on annual rows: rebuilt year must equal the original
    # FiscalPeriod (two missing values also count as equal).
    annual_rows_for_check = df[m_AB].copy()
    original_fp_annual = pd.to_numeric(
        annual_rows_for_check['Original_FiscalPeriod'], errors='coerce'
    )
    discrepancy_mask_annual = ~(
        (annual_rows_for_check['ImplFiscPer_Calculated'] == original_fp_annual) |
        (annual_rows_for_check['ImplFiscPer_Calculated'].isna() &
         original_fp_annual.isna())
    )

    discrepancy_rows = annual_rows_for_check[discrepancy_mask_annual].copy()

    if not discrepancy_rows.empty:
        print("\nDiscrepancies found between original FiscalPeriod and calculated ImplFiscPer for Annual (A, B) frequencies:")
        display(
            discrepancy_rows[
                ['ID', 'Frequency', 'Original_FiscalPeriod', 'Str_FiscalPrd', 'ImplFiscPer_Calculated']
            ].head()
        )
        print(f"Total discrepancies found for Annual frequencies: {len(discrepancy_rows)}")
    else:
        print("\nNo discrepancies found between original FiscalPeriod and calculated ImplFiscPer for Annual (A, B) frequencies.")

    # From here on FiscalPeriod holds the rebuilt year; drop the helpers.
    df['FiscalPeriod'] = df['ImplFiscPer_Calculated']
    df.drop(columns=['Original_FiscalPeriod', 'ImplFiscPer_Calculated'], inplace=True)

    return df


# =============================================================================
# Driver: apply encoding to special_raw if present and non-empty
# =============================================================================
if 'special_raw' not in globals() or special_raw is None or special_raw.empty:
    # Nothing usable was loaded by the import cell: record that explicitly.
    print("special_raw not found or empty. Cannot perform encoding.")
    special_encoded = None
else:
    print(f"Applying encoding to Special dataset for '{target_item_name}' ...")
    special_encoded = add_str_fiscalprd(special_raw)
    display(special_encoded.head())


Applying encoding to Special dataset for 'Funds_From_For_Other_Operating_Activities' ...


  df.loc[m_quarter, "Str_FiscalPrd"] = (



No discrepancies found between original FiscalPeriod and calculated ImplFiscPer for Annual (A, B) frequencies.


Unnamed: 0,ID,CompanyName,ImplCountry,CurrentCurrency,HistCurrency,PIT Date,Frequency,UpdateCode,FiscalPeriod,FYE Month,ItemCode,Value,Str_FiscalPrd
0,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1992,December,4831,-87.320244,Y92
1,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1993,December,4831,200.312706,Y93
2,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1994,December,4831,47.756416,Y94
3,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1996-05-03,A,3,1995,December,4831,361.917728,Y95
4,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1998-07-03,A,3,1996,December,4831,246.214426,Y96


#### Annualize data with most recent information (Check of output required!)

In [282]:
# @title
# =============================================================================
# SUMMARY
# =============================================================================
# This script:
#   - Implements a fast "as-of join" between two DataFrames based on PIT dates
#     and key columns (asof_numpy).
#   - Provides helpers for percentile summaries and winsorized statistics.
#   - Builds annualized "AnnPITValue" values from:
#       * true annual data (A/B frequency) and
#       * sums of quarterly data (Q1..Q4) when available.
#   - Performs various quality checks (future-dated values, extreme percentages).
#   - Produces a processed "special_processed" DataFrame and saves:
#       * a full export and
#       * a subset export.
#   - Prints a row-accounting overview so drops and exclusions are transparent.
# =============================================================================


# ---------- Helper: fast as-of (right.PIT <= left.PIT) ----------
def _key(fr, cols):
    """
    Build one string key per row by converting ``cols`` to strings and
    joining them with '||'. Returns a Series aligned with ``fr``.
    """
    return fr[cols].astype(str).agg('||'.join, axis=1)


def _run_bounds(sorted_keys):
    """
    Return (starts, ends) index arrays delimiting each run of equal adjacent
    entries in ``sorted_keys`` (a non-empty 1-D array).
    """
    starts = np.flatnonzero(np.r_[True, sorted_keys[1:] != sorted_keys[:-1]])
    ends = np.r_[starts[1:], len(sorted_keys)]
    return starts, ends


def asof_numpy(left_df: pd.DataFrame, right_df: pd.DataFrame, by_cols: list[str]):
    """
    For each row in left_df, find the latest (as-of) Value from right_df
    with matching by_cols and right_df['PIT Date'] <= left_df['PIT Date'].

    Rows with a missing key, date or (on the right) Value are ignored, as are
    rows whose 'PIT Date' cannot be parsed. Dates are compared at day
    granularity. When several right rows share the matched date, one of them
    is used.

    Results are written by row *position*, so left_df may carry any index
    (non-default, non-unique, filtered).

    Parameters
    ----------
    left_df : pd.DataFrame
        Must contain ``by_cols`` and 'PIT Date'.
    right_df : pd.DataFrame
        Must contain ``by_cols``, 'PIT Date' and 'Value'.
    by_cols : list[str]
        Columns that must match exactly between both frames.

    Returns
    -------
    out_vals : np.ndarray
        Array of float values (same length as left_df) containing the matched
        values from right_df (or NaN if none found).
    out_dates : np.ndarray
        Array of datetime64 values containing the matched dates (or NaT).
    """
    # Initialize output arrays with NaN/NaT
    out_vals  = np.full(len(left_df), np.nan, dtype='float64')
    out_dates = np.full(len(left_df), 'NaT', dtype='datetime64[ns]')

    # Required columns in left/right for the as-of join
    left_req  = by_cols + ['PIT Date']
    right_req = by_cols + ['PIT Date', 'Value']

    # Rows with all required fields present
    lmask = left_df[left_req].notna().all(axis=1).to_numpy()
    rmask = right_df[right_req].notna().all(axis=1).to_numpy()

    # If no valid rows on either side, return empty outputs
    if not lmask.any() or not rmask.any():
        return out_vals, out_dates

    # Positions (not labels) of the usable left rows inside left_df; these
    # are what the output arrays are indexed by.
    l_pos = np.flatnonzero(lmask)

    # Work on copies of the filtered frames
    l = left_df.loc[lmask, left_req].copy()
    r = right_df.loc[rmask, right_req].copy()

    # Normalize PIT dates to daily granularity
    l['PIT Date'] = pd.to_datetime(l['PIT Date'], errors='coerce').dt.floor('D')
    r['PIT Date'] = pd.to_datetime(r['PIT Date'], errors='coerce').dt.floor('D')

    # Values that were present but unparseable are NaT now. NaT sorts last,
    # so leaving it in would match the newest right-hand row by accident.
    l_ok = l['PIT Date'].notna().to_numpy()
    l = l.loc[l_ok].copy()
    l_pos = l_pos[l_ok]
    r = r.loc[r['PIT Date'].notna().to_numpy()].copy()
    if l.empty or r.empty:
        return out_vals, out_dates

    # Sort right side by key and PIT Date so each key forms one contiguous,
    # date-ordered run that can be binary-searched.
    r['__k'] = _key(r, by_cols)
    r = r.sort_values(['__k', 'PIT Date']).reset_index(drop=True)

    rk   = r['__k'].to_numpy()
    rdt  = r['PIT Date'].to_numpy()
    rval = r['Value'].to_numpy()

    # key -> (dates, values) views for that key's run
    r_starts, r_ends = _run_bounds(rk)
    slices = {rk[s]: (rdt[s:e], rval[s:e]) for s, e in zip(r_starts, r_ends)}

    # Keys and dates of left rows
    lk  = _key(l, by_cols).to_numpy()
    ldt = l['PIT Date'].to_numpy()

    # Sort left rows by key (stable sort) for block processing
    order = np.argsort(lk, kind='mergesort')
    sk, sd, sp = lk[order], ldt[order], l_pos[order]

    # One iteration per distinct left key
    l_starts, l_ends = _run_bounds(sk)
    for s, e in zip(l_starts, l_ends):
        hit = slices.get(sk[s])
        if hit is None:
            continue  # key never appears on the right
        r_dates, r_vals = hit

        # For each left PIT date, index of the last right PIT <= left PIT
        pos   = np.searchsorted(r_dates, sd[s:e], side='right') - 1
        valid = pos >= 0  # at least one right date <= left date

        if valid.any():
            target = sp[s:e][valid]
            out_vals[target]  = r_vals[pos[valid]]
            out_dates[target] = r_dates[pos[valid]]

    return out_vals, out_dates


# ---------- Small helpers ----------
def pctile(s, q):
    """
    Return the ``q``-quantile of ``s`` using linear interpolation.

    Any exception raised while computing it is swallowed and NaN is
    returned instead.
    """
    try:
        result = s.quantile(q, interpolation='linear')
    except Exception:
        result = np.nan
    return result


def summarize_pct(series: pd.Series):
    """
    Summarise the finite values of ``series``.

    Infinite and missing entries are discarded first. Returns an empty dict
    when nothing remains; otherwise a dict with the row count, mean, median,
    a mean winsorized at 1% on each tail, and the deciles p10..p90.
    """
    finite = series.replace([np.inf, -np.inf], np.nan).dropna()
    if finite.empty:
        return {}

    # Hand winsorize its own copy of the data (the original note says it
    # needs a writable array).
    clipped = winsorize(finite.to_numpy().copy(), limits=[0.01, 0.01])

    stats = {
        "finite_rows": len(finite),
        "mean": finite.mean(),
        "median": finite.median(),
        "winsorized_mean_1pct": clipped.mean(),
    }
    # Deciles p10 .. p90, in ascending order
    for decile in range(10, 100, 10):
        stats[f"p{decile}"] = pctile(finite, decile / 100)
    return stats


# ---------- Period prioritization ----------
# Ranking used when several period labels compete: the larger number wins.
# Labels are listed from most to least preferred and are spaced 10 apart,
# i.e. 'A' (full annual) = 100 down to 'Q1' = 10.
_PERIOD_PRIORITY = {
    label: 100 - 10 * position
    for position, label in enumerate(
        ('A', 'Q4', 'T3', 'S2', 'Q3', 'T2', 'S1', 'Q2', 'T1', 'Q1')
    )
}


def _label_from_colname(colname: str) -> str:
    """
    Translate a column name into the period label used as a key in
    _PERIOD_PRIORITY. Only 'A' is handled explicitly; every other name is
    returned unchanged.
    """
    if colname == 'A':
        return 'A'
    return colname


# ---------- Helpers for AnnPITValue using A + Q1..Q4 sum ----------
def full_year_from_quarters(row, pit, cutoff):
    """
    Combine Q1..Q4 of ``row`` into a single full-year candidate.

    Every quarter must provide a non-missing value (``Q<i>``), date
    (``Q<i>_Date``) and origin fiscal period (``Q<i>_OriginFP``), and its
    date must lie within [cutoff, pit]. If any quarter fails, no candidate
    is produced.

    Parameters
    ----------
    row : pd.Series
        Row of the working DataFrame (anything with a dict-style ``get``).
    pit : datetime-like
        PIT Date of the row; upper bound for quarter dates.
    cutoff : datetime-like
        Lower bound for quarter dates.

    Returns
    -------
    (dt, val_sum, origin_fp) or (NaT, NaN, NaN)
        dt        : most recent of the four quarter dates
        val_sum   : sum of the four quarter values
        origin_fp : largest OriginFP among the four quarters
    """
    no_candidate = (pd.NaT, np.nan, np.nan)
    quarter_vals, quarter_dates, quarter_fps = [], [], []

    for q in range(1, 5):
        name = f'Q{q}'
        raw_val = row.get(name, np.nan)
        raw_date = row.get(f'{name}_Date', pd.NaT)
        raw_fp = row.get(f'{name}_OriginFP', np.nan)

        # A single missing piece disqualifies the whole year.
        if any(pd.isna(piece) for piece in (raw_val, raw_date, raw_fp)):
            return no_candidate

        # The date must parse and fall inside the allowed window.
        when = pd.to_datetime(raw_date, errors='coerce')
        if pd.isna(when):
            return no_candidate
        if not (cutoff <= when <= pit):
            return no_candidate

        quarter_vals.append(float(raw_val))
        quarter_dates.append(when)
        quarter_fps.append(int(raw_fp))

    # All four quarters qualified.
    return max(quarter_dates), float(np.nansum(quarter_vals)), max(quarter_fps)


def pick_annpit_sum_with_origin(row):
    """
    Choose the annualized value ("AnnPITValue") for one row.

    Two candidates are considered, each only if its date lies within the
    365 days up to and including the row's PIT Date:

      * 'A'  : the annual value (A, A_Date, A_OriginFP)
      * 'Q4' : the sum of Q1..Q4 from full_year_from_quarters

    Preference order: same-year A, same-year Q4 sum, prior-year A,
    prior-year Q4 sum, then any remaining candidate by (priority, date).
    "Same"/"prior" compare the candidate's OriginFP with the row's
    FiscalPeriod. Returns NaN when the PIT Date is missing or no candidate
    qualifies. Zero is a legitimate value.
    """
    pit = row['PIT Date']
    if pd.isna(pit):
        return np.nan

    # Oldest date a candidate may carry
    cutoff = pit - timedelta(days=365)

    # Row's fiscal period as int, or None when missing / not convertible
    raw_fp = row.get('FiscalPeriod', np.nan)
    try:
        fiscal_year = None if pd.isna(raw_fp) else int(raw_fp)
    except Exception:
        fiscal_year = None

    # Candidate tuples: (label, period_priority, date, value, origin_fp)
    pool = []

    # --- Annual candidate ---
    annual_val = row.get('A', np.nan)
    annual_dt = row.get('A_Date', pd.NaT)
    annual_fp = row.get('A_OriginFP', np.nan)
    if not (pd.isna(annual_val) or pd.isna(annual_dt) or pd.isna(annual_fp)):
        annual_dt = pd.to_datetime(annual_dt, errors='coerce')
        if pd.notna(annual_dt) and (cutoff <= annual_dt <= pit):
            pool.append(
                ('A', _PERIOD_PRIORITY['A'], annual_dt, float(annual_val), int(annual_fp))
            )

    # --- Quarter-sum candidate ---
    sum_dt, sum_val, sum_fp = full_year_from_quarters(row, pit, cutoff)
    if pd.notna(sum_val) and pd.notna(sum_dt) and not pd.isna(sum_fp):
        pool.append(
            ('Q4', _PERIOD_PRIORITY['Q4'], sum_dt, float(sum_val), int(sum_fp))
        )

    if not pool:
        return np.nan

    # Only candidates with a real (non-NaN) value take part in the ranking.
    usable = [cand for cand in pool if not np.isnan(cand[3])]

    def relation(origin_fp):
        """Classify a candidate's OriginFP relative to the row's FiscalPeriod."""
        if fiscal_year is None or origin_fp is None:
            return 'unknown'
        if origin_fp == fiscal_year:
            return 'same'
        if origin_fp == fiscal_year - 1:
            return 'prior'
        return 'other'

    by_date = lambda cand: cand[2]
    by_priority_then_date = lambda cand: (cand[1], cand[2])

    # Tiers in decreasing preference: (label, relation, tie-break key)
    tiers = (
        ('A', 'same', by_date),
        ('Q4', 'same', by_priority_then_date),
        ('A', 'prior', by_date),
        ('Q4', 'prior', by_priority_then_date),
    )
    for label, wanted, tie_break in tiers:
        matches = [
            cand for cand in usable
            if cand[0] == label and relation(cand[4]) == wanted
        ]
        if matches:
            return max(matches, key=tie_break)[3]

    # Fallback: whatever is left (relation 'other' / 'unknown')
    if usable:
        return max(usable, key=by_priority_then_date)[3]

    # Not expected to be reached; kept as the historical last resort.
    return 0.0


# ============================ MAIN ============================
# Pipeline driver. Runs only when the encoding cell produced `special_encoded`.
# Steps: filter frequencies -> attach TrueValue -> as-of columns per period ->
# AnnPITValue -> QC checks -> quality drop -> column selection -> export ->
# row accounting. Relies on names from earlier cells: asof_numpy,
# pick_annpit_sum_with_origin, summarize_pct, Temp_file_path_DP,
# base_output_filename.
if 'special_encoded' in globals() and special_encoded is not None:
    # Remember the number of input rows for row-accounting
    input_rows = len(special_encoded)
    print(f"Input dataset contains {input_rows:,} rows before processing.\n")

    # Work on a copy so we do not mutate the original DataFrame
    working = special_encoded.copy()

    # Exclude certain frequencies (E, L, R, U) from further processing
    excl_mask = working['Frequency'].astype(str).str.upper().isin(['E', 'L', 'R', 'U'])
    excluded_rows = int(excl_mask.sum())
    working = working.loc[~excl_mask].copy()

    # Convert key columns to appropriate types (unparseable entries become NaT/NaN)
    working['PIT Date'] = pd.to_datetime(
        working['PIT Date'], format='%Y-%m-%d', errors='coerce'
    ).dt.floor('D')
    working['FiscalPeriod'] = pd.to_numeric(working['FiscalPeriod'], errors='coerce')
    working['Value']        = pd.to_numeric(working['Value'], errors='coerce')

    # Ensure some ID-like columns are strings
    # NOTE: astype(str) turns missing entries into a literal string (e.g. 'nan').
    for c in ['ID', 'HistCurrency', 'ItemCode', 'Frequency', 'Str_FiscalPrd']:
        if c in working.columns:
            working[c] = working[c].astype(str)

    # Parse Q/S/T numbers from Str_FiscalPrd (e.g. 'Q1Y23' -> QNUM=1);
    # labels that do not match the pattern yield NaN.
    working['QNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^Q([1-4])Y', expand=False),
        errors='coerce'
    )
    working['SNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^S([1-2])Y', expand=False),
        errors='coerce'
    )
    working['TNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^T([1-3])Y', expand=False),
        errors='coerce'
    )

    # Define all period/value and period/date column names
    # (Q1..Q4, S1..S2, T1..T3, A and the matching *_Date names)
    period_vals = [f'Q{i}' for i in range(1, 5)] + \
                  [f'S{i}' for i in range(1, 3)] + \
                  [f'T{i}' for i in range(1, 4)] + ['A']
    period_dates = [f'{p}_Date' for p in [f'Q{i}' for i in range(1,5)] + \
                                       [f'S{i}' for i in range(1,3)] + \
                                       [f'T{i}' for i in range(1,4)]] + ['A_Date']

    # Ensure all period value columns exist (initialize if missing)
    for c in period_vals:
        if c not in working.columns:
            working[c] = np.nan

    # Ensure all period date columns exist (initialize if missing)
    for c in period_dates:
        if c not in working.columns:
            working[c] = pd.NaT

    # Keys used to identify time series in as-of joins.
    # FiscalPeriod is part of the key, so an as-of match always comes from
    # the row's own fiscal period.
    base_keys = ['ID', 'HistCurrency', 'ItemCode', 'FiscalPeriod']

    # -------------------------------------------------------------------------
    # 1) TrueValue from annuals: build reference "TrueValue" per ID/FiscalPeriod
    # -------------------------------------------------------------------------
    # For each (ID, FiscalPeriod, HistCurrency) keep the annual row with the
    # latest PIT Date; its Value becomes TrueValue.
    mask_annual = working['Frequency'].isin(['A', 'B']) & working['Value'].notna()
    annual_src = (
        working.loc[mask_annual,
                    ['ID', 'FiscalPeriod', 'HistCurrency', 'PIT Date', 'Value']]
        .sort_values(['ID', 'FiscalPeriod', 'HistCurrency', 'PIT Date'])
        .drop_duplicates(['ID', 'FiscalPeriod', 'HistCurrency'], keep='last')
        .rename(columns={'Value': 'TrueValue', 'PIT Date': 'TrueValue_Date'})
    )
    # Merge TrueValue back on keys. annual_src is unique per key, so the left
    # merge does not add rows.
    working = working.merge(
        annual_src,
        on=['ID', 'FiscalPeriod', 'HistCurrency'],
        how='left'
    )

    # -------------------------------------------------------------------------
    # 2) As-of mapping for each frequency (no prior-year / no forward-fill)
    # -------------------------------------------------------------------------

    # Annual (A/B) as-of
    src_A = working.loc[
        working['Frequency'].isin(['A', 'B']) & working['Value'].notna(),
        base_keys + ['PIT Date', 'Value']
    ].copy()
    vA, dA = asof_numpy(working, src_A, by_cols=base_keys)
    working['A'], working['A_Date'] = vA, dA
    # Origin fiscal period for A: the row's own FiscalPeriod wherever an
    # annual value was matched.
    working['A_OriginFP'] = np.where(
        working['A'].notna(), working['FiscalPeriod'], np.nan
    )

    # Quarterly (Q/C) as-of, by quarter number
    src_Q = working.loc[
        working['Frequency'].isin(['Q', 'C']) & working['QNUM'].notna(),
        base_keys + ['QNUM', 'PIT Date', 'Value']
    ].copy()
    for q in (1, 2, 3, 4):
        # Subset source to a specific quarter
        rv = src_Q[src_Q['QNUM'] == q].drop(columns=['QNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'Q{q}', f'Q{q}_Date'
        working[col], working[dcol] = v, d

        # Set OriginFP where the quarter has a value and no origin yet
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        mask = working[col].notna() & working[ocol].isna()
        working.loc[mask, ocol] = working.loc[mask, 'FiscalPeriod']

    # Semiannual (S/F) as-of, by half-year number (same pattern as quarters)
    src_S = working.loc[
        working['Frequency'].isin(['S', 'F']) & working['SNUM'].notna(),
        base_keys + ['SNUM', 'PIT Date', 'Value']
    ].copy()
    for s in (1, 2):
        rv = src_S[src_S['SNUM'] == s].drop(columns=['SNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'S{s}', f'S{s}_Date'
        working[col], working[dcol] = v, d

        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        mask = working[col].notna() & working[ocol].isna()
        working.loc[mask, ocol] = working.loc[mask, 'FiscalPeriod']

    # Trimester (T/K) as-of, by term number (same pattern as quarters)
    src_T = working.loc[
        working['Frequency'].isin(['T', 'K']) & working['TNUM'].notna(),
        base_keys + ['TNUM', 'PIT Date', 'Value']
    ].copy()
    for t in (1, 2, 3):
        rv = src_T[src_T['TNUM'] == t].drop(columns=['TNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'T{t}', f'T{t}_Date'
        working[col], working[dcol] = v, d

        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        mask = working[col].notna() & working[ocol].isna()
        working.loc[mask, ocol] = working.loc[mask, 'FiscalPeriod']

    # -------------------------------------------------------------------------
    # 3) Prepare labels and normalize dates (only as-of results, no ffill)
    # -------------------------------------------------------------------------
    working = working.sort_values(['ID', 'HistCurrency', 'FiscalPeriod', 'PIT Date'])

    # NOTE(review): origin_labels is built but not used below in this cell.
    value_labels  = period_vals
    date_labels   = period_dates
    origin_labels = [f'{lbl}_OriginFP' for lbl in value_labels]

    # Ensure all date columns are valid datetimes at day precision
    for c in date_labels:
        if c in working.columns:
            working[c] = pd.to_datetime(working[c], errors='coerce').dt.floor('D')

    # -------------------------------------------------------------------------
    # 4) AnnPITValue with new logic (A + Q1..Q4 sum, zeros allowed)
    # -------------------------------------------------------------------------
    # Row-wise apply: one Python call per row.
    working['AnnPITValue'] = working.apply(
        pick_annpit_sum_with_origin,
        axis=1
    )

    # -------------------------------------------------------------------------
    # 5) QC: Future-date check (period date > PIT Date)
    # -------------------------------------------------------------------------
    date_cols_all = [
        'A_Date',
        'Q1_Date', 'Q2_Date', 'Q3_Date', 'Q4_Date',
        'S1_Date', 'S2_Date',
        'T1_Date', 'T2_Date', 'T3_Date'
    ]
    # Only use date columns that actually exist
    present = [c for c in date_cols_all if c in working.columns]
    viol_counts = {}
    any_mask = pd.Series(False, index=working.index)

    # For each period date column, check if it's after PIT Date
    for c in present:
        m = (
            working[c].notna() &
            working['PIT Date'].notna() &
            (pd.to_datetime(working[c], errors='coerce') > working['PIT Date'])
        )
        # Count violations per column
        viol_counts[c] = int(m.sum())
        # Track rows with any violation across all period dates
        any_mask |= m

    total_future_viol = int(any_mask.sum())
    print("\n=== Future-date check (period dates > PIT Date) ===")
    print("Per-label violations:", viol_counts)
    print(f"Rows with ANY future-dated period value: {total_future_viol}")
    # Flag rows with at least one future-date violation (rows are flagged,
    # not removed)
    working['HasFutureDateError'] = any_mask

    # -------------------------------------------------------------------------
    # 6) AnnPITValue_Pct and quality filter
    # -------------------------------------------------------------------------
    # Percentage of AnnPITValue relative to TrueValue (%); NaN when either
    # side is missing or TrueValue is zero.
    working['AnnPITValue_Pct'] = np.where(
        working['AnnPITValue'].notna() &
        working['TrueValue'].notna() &
        (working['TrueValue'] != 0),
        (working['AnnPITValue'] / working['TrueValue']) * 100,
        np.nan
    )

    # Summary BEFORE dropping outliers
    pre_stats = summarize_pct(working['AnnPITValue_Pct'])
    print("\n=== AnnPITValue_Pct summary — BEFORE quality drop ===")
    for k, v in pre_stats.items():
        print(f"{k:>20}: {v}")

    pct = working['AnnPITValue_Pct']
    # Flag infinities
    is_inf = np.isinf(pct)
    # Flag finite out-of-range values outside [25, 250]
    is_finite = np.isfinite(pct)
    out_of_range = is_finite & ((pct > 250) | (pct < 25))
    # Combined drop mask: infinities or out-of-range finite values.
    # Rows with a NaN percentage are neither, so they are kept.
    to_drop_quality = is_inf | out_of_range

    # Count dropped rows due to quality rules
    dropped_quality_rows = int(to_drop_quality.sum())
    print(f"\nRows to drop due to AnnPITValue_Pct (±inf or >250 or <25): {dropped_quality_rows:,}")

    # Keep only rows that pass the quality filter
    working = working.loc[~to_drop_quality].copy()

    # Summary AFTER dropping outliers
    post_stats = summarize_pct(working['AnnPITValue_Pct'])
    print("\n=== AnnPITValue_Pct summary — AFTER quality drop ===")
    if post_stats:
        for k, v in post_stats.items():
            print(f"{k:>20}: {v}")
    else:
        print("No finite values remain after the quality drop.")

    # -------------------------------------------------------------------------
    # 7) Final columns and cleanup
    # -------------------------------------------------------------------------
    # Core columns that describe each row
    base_cols = [
        'ID', 'CompanyName', 'ImplCountry', 'CurrentCurrency', 'HistCurrency',
        'PIT Date', 'Frequency', 'UpdateCode', 'FiscalPeriod', 'FYE Month',
        'ItemCode', 'Value', 'Str_FiscalPrd'
    ]

    # Period-related columns (Dates and Values), date before value per period
    freq_cols = []
    for i in range(1, 5):
        freq_cols += [f'Q{i}_Date', f'Q{i}']
    for i in range(1, 3):
        freq_cols += [f'S{i}_Date', f'S{i}']
    for i in range(1, 4):
        freq_cols += [f'T{i}_Date', f'T{i}']
    freq_cols += ['A_Date', 'A']

    # Columns we want to keep in the final output, in output order
    keep_cols = (
        [c for c in base_cols if c in working.columns] +
        ['TrueValue', 'AnnPITValue', 'AnnPITValue_Pct', 'HasFutureDateError'] +
        [c for c in freq_cols if c in working.columns]
    )

    # Drop helper columns such as OriginFP and intermediate numeric helpers
    drop_cols = [c for c in working.columns
                 if c.endswith('_OriginFP') or c in ['QNUM', 'SNUM', 'TNUM', 'TrueValue_Date']]
    working.drop(columns=drop_cols, inplace=True, errors='ignore')

    # Select and order the final column set
    special_processed = working.reindex(columns=keep_cols)

    # -------------------------------------------------------------------------
    # 8) Save outputs (requires Temp_file_path_DP and base_output_filename)
    # -------------------------------------------------------------------------
    # Both variables must be defined in a previous setup cell.
    # NOTE(review): these checks run only after all processing above; a
    # missing variable is therefore detected late.
    assert 'Temp_file_path_DP' in globals(), "Temp_file_path_DP not found."
    assert 'base_output_filename' in globals(), "base_output_filename not found (set in Cell 0)."

    # Full export path and write to pipe-delimited text
    out_full = os.path.join(Temp_file_path_DP, f"{base_output_filename}.txt")
    special_processed.to_csv(out_full, sep='|', index=False)
    print("\nSaved full:", out_full)

    # Subset export with a small selection of columns
    subset_cols = ["ID", "PIT Date", "CompanyName", "HistCurrency", "FiscalPeriod", "AnnPITValue"]
    subset_cols_existing = [col for col in subset_cols if col in special_processed.columns]
    subset_df = special_processed[subset_cols_existing].copy()
    out_subset = os.path.join(Temp_file_path_DP, f"{base_output_filename}_subset.txt")
    subset_df.to_csv(out_subset, sep='|', index=False)
    print("Saved subset:", out_subset)
    del subset_df  # release the temporary subset frame

    # -------------------------------------------------------------------------
    # 9) Row-accounting overview
    # -------------------------------------------------------------------------
    output_rows = len(special_processed)
    print("\n=== Row Accounting ===")
    print(f"Input rows:                     {input_rows:,}")
    print(f"Excluded by Frequency (E/L/R/U):{excluded_rows:,}")
    print(f"Dropped by quality (Pct rules): {dropped_quality_rows:,}")
    print(f"Output rows (final):            {output_rows:,}")

    # Sanity check: excluded + dropped + final should equal original
    check_total = excluded_rows + dropped_quality_rows + output_rows
    print(f"Check: excluded + dropped + output = {check_total:,}")
    if check_total == input_rows:
        print("Row counts reconcile exactly.")
    else:
        print(f"Mismatch of {input_rows - check_total:+,} rows.")

    # Optional: trigger garbage collection (gc is imported in the setup cell)
    gc.collect()

else:
    # Nothing to process: special_encoded is not defined or is None
    print("special_encoded not found or None; skipping.")

Input dataset contains 1,694,729 rows before processing.


=== Future-date check (period dates > PIT Date) ===
Per-label violations: {'A_Date': 0, 'Q1_Date': 0, 'Q2_Date': 0, 'Q3_Date': 0, 'Q4_Date': 0, 'S1_Date': 0, 'S2_Date': 0, 'T1_Date': 0, 'T2_Date': 0, 'T3_Date': 0}
Rows with ANY future-dated period value: 0

=== AnnPITValue_Pct summary — BEFORE quality drop ===
         finite_rows: 1218296
                mean: 18445.996633235434
              median: 100.0
winsorized_mean_1pct: 98.49069658802186
                 p10: 100.0
                 p20: 100.0
                 p30: 100.0
                 p40: 100.0
                 p50: 100.0
                 p60: 100.0
                 p70: 100.0
                 p80: 100.0
                 p90: 100.0

Rows to drop due to AnnPITValue_Pct (±inf or >250 or <25): 40,085

=== AnnPITValue_Pct summary — AFTER quality drop ===
         finite_rows: 1178211
                mean: 100.29888366758064
              median: 100.0
winsorized_mean_1p

### Special 7

#### Set Index

In [283]:
# =============================================================================
# SELECT A SINGLE SPECIAL_* ITEM AND PREPARE PATHS
# =============================================================================
# This cell:
#   1. Chooses which Special_* item (from special_vars) should be processed.
#   2. Validates that special_vars and Temp_file_path_DP are available.
#   3. Builds the input file path for the selected "work_subset_<item>.txt".
#   4. Sets a base_output_filename for downstream output files.
#   5. Ensures the data-preparation temp directory exists.
#
# Usage:
#   - Adjust `special_index` to run a different Special_* dataset (e.g., 2, 3, 10 ...).
#   - Assumes `special_vars` was created in the categorization step and
#     `Temp_file_path_DP` was defined in the environment setup.

# --- Choose the Special_* dataset for this run ---
special_index = 7  # edit to process a different dataset, e.g. 10

# The lookup table, e.g. {'Special_1': 'SomeItem', 'Special_2': 'OtherItem', ...},
# must already exist in the notebook namespace.
assert 'special_vars' in globals(), "special_vars dict not found in globals()."

# Resolve the selected index to its item name; a missing or empty entry is an error.
item_key = f"Special_{special_index}"
target_item_name = special_vars.get(item_key)
assert target_item_name, f"{item_key} not found in special_vars."

print(f"Selected: {item_key}  ->  ItemName: '{target_item_name}'")

# --- Paths (built from the shared temp directory) ---
assert 'Temp_file_path_DP' in globals(), "Temp_file_path_DP not found."

# Input: the work_subset file written for this item by earlier steps
file_name = f"work_subset_{target_item_name}.txt"
file_path = os.path.join(Temp_file_path_DP, file_name)

# Stem shared by all files the "Special" pipeline writes for this item
base_output_filename = f"Special_{target_item_name}_complete"

# Create the temp directory (including parents) if it is not there yet
os.makedirs(Temp_file_path_DP, exist_ok=True)


Selected: Special_7  ->  ItemName: 'Funds_From_Operations'


#### Import relevant data



In [284]:
# =============================================================================
# LOAD THE FULL DATASET FOR THE SELECTED SPECIAL ITEM
# =============================================================================
# This cell:
#   1. Uses `target_item_name` and `file_path` (defined in the previous cell)
#      to load the corresponding work_subset file.
#   2. Imports the file using `import_file_to_dataframe`.
#   3. Performs safety checks for existence and emptiness.
#   4. Shows a preview of the loaded dataset.
#   5. Falls back to an empty DataFrame if loading fails.
#   6. Runs garbage collection afterwards.
# =============================================================================

print(f"\nImporting full dataset for Item: '{target_item_name}' ...")

if not os.path.exists(file_path):
    # No input file for this item: continue with an empty frame.
    print(f"File not found: {file_path}")
    special_raw = pd.DataFrame()
else:
    special_raw = import_file_to_dataframe(file_path)

    if special_raw is None or special_raw.empty:
        # The importer returned nothing usable: normalise to an empty frame.
        print("Dataset appears empty or could not be loaded.")
        special_raw = pd.DataFrame()
    else:
        print(f"Full dataset loaded successfully: {len(special_raw):,} rows total.")
        # Prefer the rich notebook preview; fall back to plain text if
        # display() is unavailable or fails.
        try:
            display(special_raw.head())
        except Exception:
            print(special_raw.head().to_string(index=False))

gc.collect()



Importing full dataset for Item: 'Funds_From_Operations' ...
Full dataset loaded successfully: 2,433,080 rows total.


Unnamed: 0,ID,CompanyName,ImplCountry,CurrentCurrency,HistCurrency,PIT Date,Frequency,UpdateCode,FiscalPeriod,FYE Month,ItemCode,Value
0,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1992,December,4201,70.253522
1,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1993,December,4201,240.925386
2,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1994,December,4201,317.659199
3,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1996-05-03,A,3,1995,December,4201,112.426119
4,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1998-07-03,A,3,1996,December,4201,129.039


0

#### Encode Frequency Code (Check of output required!)

In [285]:
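# =============================================================================
# SUMMARY
# =============================================================================
# This cell:
#   - Defines last2(), which returns the last two digits of a number as a
#     zero-padded string (None for missing input).
#   - Defines add_str_fiscalprd(), which derives a period label
#     "Str_FiscalPrd" from Frequency and FiscalPeriod:
#       * C/Q/E/R -> "Q<n>Y<yy>"  (n = FiscalPeriod % 4 + 1, year = FiscalPeriod // 4)
#       * A/B     -> "Y<yy>"      (year = FiscalPeriod)
#       * F/S     -> "S<n>Y<yy>"  (n = FiscalPeriod % 2 + 1, year = FiscalPeriod // 2)
#       * K/T/L/U -> "T<n>Y<yy>"  (n = FiscalPeriod % 3 + 1, year = FiscalPeriod // 3)
#   - Rebuilds a four-digit year from <yy> (yy >= 80 -> 19yy, otherwise 20yy)
#     and checks it against the original FiscalPeriod for annual (A/B) rows.
# =============================================================================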

def last2(n):
    """
    Give the final two digits of ``n`` as text.

    The number is truncated to an integer and left-padded with zeros to at
    least four characters before the last two characters are taken, so
    ``7`` becomes ``'07'`` and ``1992`` becomes ``'92'``.  Missing input
    (NaN / None / NaT) yields ``None``.
    """
    if pd.isna(n):
        return None
    padded = str(int(n)).zfill(4)
    return padded[-2:]


def add_str_fiscalprd(df):
    """
    Decode the numeric FiscalPeriod into a period label and a four-digit year.

    The decoding depends on the (upper-cased) ``Frequency`` code:

      - C, Q, E, R : sub-period = FiscalPeriod % 4 + 1, year = FiscalPeriod // 4
                     -> label 'Q<n>Y<yy>'
      - F, S       : sub-period = FiscalPeriod % 2 + 1, year = FiscalPeriod // 2
                     -> label 'S<n>Y<yy>'
      - K, T, L, U : sub-period = FiscalPeriod % 3 + 1, year = FiscalPeriod // 3
                     -> label 'T<n>Y<yy>'
      - A, B       : FiscalPeriod is the year itself -> label 'Y<yy>'

    Rows with any other frequency keep a missing ``Str_FiscalPrd``.

    ``FiscalPeriod`` is then overwritten with a four-digit year rebuilt from
    the two-digit ``<yy>`` of the label: ``yy >= 80`` maps to ``19yy``, every
    other value to ``20yy``.  Only the last two digits survive, so years
    outside 1980-2079 cannot be reproduced; for annual (A/B) rows the
    function prints every case where the rebuilt year differs from the
    original FiscalPeriod.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns 'Frequency' and 'FiscalPeriod'.  'ID' is
        additionally needed when discrepancies are reported.

    Returns
    -------
    pd.DataFrame
        A copy of ``df`` with the new column 'Str_FiscalPrd' and with
        'FiscalPeriod' replaced by the derived four-digit year.  The input
        frame is not modified.
    """
    df = df.copy()
    df["Frequency"] = df["Frequency"].str.upper().fillna("")
    df["Original_FiscalPeriod"] = df["FiscalPeriod"]

    fp = pd.to_numeric(df["FiscalPeriod"], errors="coerce")

    m_quarter = df["Frequency"].isin(["C", "Q", "E", "R"])
    m_AB      = df["Frequency"].isin(["A", "B"])
    m_FS      = df["Frequency"].isin(["F", "S"])
    m_KTLU    = df["Frequency"].isin(["K", "T", "L", "U"])

    # Create the label column with object dtype from the start.  A float64
    # NaN column would have to be silently upcast when the string labels are
    # written (pandas warns about that and plans to forbid it), and it would
    # make the `.str` accessor below fail if no row matched any frequency.
    df["Str_FiscalPrd"] = np.full(len(df), np.nan, dtype=object)

    # Sub-annual encodings share one scheme: (mask, label prefix, periods/year).
    # The masks are mutually exclusive, so the order of the writes is irrelevant.
    sub_annual = (
        (m_quarter, "Q", 4),
        (m_FS,      "S", 2),
        (m_KTLU,    "T", 3),
    )
    for mask, prefix, periods_per_year in sub_annual:
        part = ((fp % periods_per_year) + 1).where(mask)
        year = (fp // periods_per_year).where(mask).apply(last2)
        df.loc[mask, "Str_FiscalPrd"] = (
            prefix + part.astype("Int64").astype(str) + "Y" + year.fillna('')
        )

    # Annual rows: FiscalPeriod already is the year.
    ab_year = fp.where(m_AB).apply(last2)
    df.loc[m_AB, "Str_FiscalPrd"] = "Y" + ab_year.fillna('')

    # Rebuild a four-digit year from the two-digit year inside the label.
    year_part = df['Str_FiscalPrd'].str.extract(r'Y(\d{2})', expand=False)
    year_numeric = pd.to_numeric(year_part, errors='coerce')

    # Century pivot: 80..99 -> 19xx, 00..79 -> 20xx.  Missing years compare
    # False, get 2000 added and remain NaN.
    df['ImplFiscPer_Calculated'] = year_numeric + np.where(year_numeric >= 80, 1900, 2000)

    # Consistency check on annual rows: the rebuilt year must equal the
    # original FiscalPeriod (two missing values also count as a match).
    annual_rows_for_check = df.loc[m_AB]
    calculated = annual_rows_for_check['ImplFiscPer_Calculated']
    original = pd.to_numeric(annual_rows_for_check['Original_FiscalPeriod'], errors='coerce')
    consistent = (calculated == original) | (calculated.isna() & original.isna())
    discrepancy_rows = annual_rows_for_check.loc[~consistent]

    if not discrepancy_rows.empty:
        print("\nDiscrepancies found between original FiscalPeriod and calculated ImplFiscPer for Annual (A, B) frequencies:")
        preview = discrepancy_rows[
            ['ID', 'Frequency', 'Original_FiscalPeriod', 'Str_FiscalPrd', 'ImplFiscPer_Calculated']
        ].head()
        # `display` only exists inside IPython/Jupyter; fall back to plain text.
        try:
            display(preview)
        except NameError:
            print(preview.to_string(index=False))
        print(f"Total discrepancies found for Annual frequencies: {len(discrepancy_rows)}")
    else:
        print("\nNo discrepancies found between original FiscalPeriod and calculated ImplFiscPer for Annual (A, B) frequencies.")

    # Publish the derived year as FiscalPeriod and remove the helper columns.
    df['FiscalPeriod'] = df['ImplFiscPer_Calculated']
    df = df.drop(columns=['Original_FiscalPeriod', 'ImplFiscPer_Calculated'])

    return df


# =============================================================================
# Driver: encode special_raw (only when it exists and holds at least one row)
# =============================================================================
if 'special_raw' not in globals() or special_raw is None or special_raw.empty:
    # Nothing to encode; downstream cells test for None.
    print("special_raw not found or empty. Cannot perform encoding.")
    special_encoded = None
else:
    print(f"Applying encoding to Special dataset for '{target_item_name}' ...")
    special_encoded = add_str_fiscalprd(special_raw)
    display(special_encoded.head())


Applying encoding to Special dataset for 'Funds_From_Operations' ...


  df.loc[m_quarter, "Str_FiscalPrd"] = (



No discrepancies found between original FiscalPeriod and calculated ImplFiscPer for Annual (A, B) frequencies.


Unnamed: 0,ID,CompanyName,ImplCountry,CurrentCurrency,HistCurrency,PIT Date,Frequency,UpdateCode,FiscalPeriod,FYE Month,ItemCode,Value,Str_FiscalPrd
0,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1992,December,4201,70.253522,Y92
1,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1993,December,4201,240.925386,Y93
2,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1994,December,4201,317.659199,Y94
3,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1996-05-03,A,3,1995,December,4201,112.426119,Y95
4,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1998-07-03,A,3,1996,December,4201,129.039,Y96


#### Annualize data with most recent information (Check of output required!)

In [286]:
# @title
# =============================================================================
# SUMMARY
# =============================================================================
# This script:
#   - Implements a fast "as-of join" between two DataFrames based on PIT dates
#     and key columns (asof_numpy).
#   - Provides helpers for percentile summaries and winsorized statistics.
#   - Builds annualized "AnnPITValue" values from:
#       * true annual data (A/B frequency) and
#       * sums of quarterly data (Q1..Q4) when available.
#   - Performs various quality checks (future-dated values, extreme percentages).
#   - Produces a processed "special_processed" DataFrame and saves:
#       * a full export and
#       * a subset export.
#   - Prints a row-accounting overview so drops and exclusions are transparent.
# =============================================================================


# ---------- Helper: fast as-of (right.PIT <= left.PIT) ----------
def _key(fr, cols):
    """
    Build one combined string key per row from several columns.

    Every column in ``cols`` is converted to ``str`` and the pieces are
    joined with '||' in the order given (missing values therefore appear
    as the text produced by ``astype(str)``, e.g. 'nan').

    Parameters
    ----------
    fr : pd.DataFrame
        Frame holding the key columns.
    cols : list[str]
        Names of the columns that make up the key.

    Returns
    -------
    pd.Series
        Unnamed Series of strings, indexed like ``fr``.
    """
    cols = list(cols)
    if not cols:
        # Joining zero parts yields the empty string for every row.
        return pd.Series('', index=fr.index, dtype=object)

    # Concatenate column by column instead of joining row by row: the result
    # is the same, but the work is done once per column rather than through
    # a Python-level call for each of (potentially) millions of rows.
    key = fr[cols[0]].astype(str)
    for col in cols[1:]:
        key = key + '||' + fr[col].astype(str)

    key.name = None
    return key


def asof_numpy(left_df: pd.DataFrame, right_df: pd.DataFrame, by_cols: list[str]):
    """
    As-of lookup: for each row of ``left_df`` return the most recent
    ``right_df`` value with identical ``by_cols`` and
    ``right_df['PIT Date'] <= left_df['PIT Date']``.

    PIT dates on both sides are parsed with ``errors='coerce'`` and floored
    to whole days before comparison.  Rows with a missing key column, a
    missing/unparsable PIT date or (right side only) a missing Value take no
    part in the lookup.

    Results are written by row *position*, so ``left_df`` may carry any
    index (non-contiguous, unsorted or duplicated labels).

    Parameters
    ----------
    left_df : pd.DataFrame
        Rows to look values up for; needs ``by_cols`` and 'PIT Date'.
    right_df : pd.DataFrame
        Source rows; needs ``by_cols``, 'PIT Date' and 'Value'.
    by_cols : list[str]
        Columns that must match exactly between both frames.

    Returns
    -------
    out_vals : np.ndarray
        float64 array of length ``len(left_df)`` with the matched values
        (NaN where no match exists).
    out_dates : np.ndarray
        datetime64[ns] array of length ``len(left_df)`` with the PIT dates
        of the matched source rows (NaT where no match exists).
    """
    # Outputs default to "no match".
    out_vals  = np.full(len(left_df), np.nan, dtype='float64')
    out_dates = np.full(len(left_df), 'NaT', dtype='datetime64[ns]')

    by_cols   = list(by_cols)
    left_req  = by_cols + ['PIT Date']
    right_req = by_cols + ['PIT Date', 'Value']

    l = left_df[left_req].copy()
    r = right_df[right_req].copy()

    # Normalise the dates BEFORE building the validity masks, so that values
    # which fail to parse (-> NaT) are excluded instead of reaching the
    # sorted search below.
    l['PIT Date'] = pd.to_datetime(l['PIT Date'], errors='coerce').dt.floor('D')
    r['PIT Date'] = pd.to_datetime(r['PIT Date'], errors='coerce').dt.floor('D')

    lmask = l.notna().all(axis=1).to_numpy()
    rmask = r.notna().all(axis=1).to_numpy()

    # Nothing usable on one of the sides -> all outputs stay NaN/NaT.
    if not lmask.any() or not rmask.any():
        return out_vals, out_dates

    # Row positions (not index labels) of the usable left rows; these are the
    # slots of out_vals / out_dates that may be filled.
    l_pos = np.flatnonzero(lmask)
    l = l.loc[lmask]
    r = r.loc[rmask].copy()

    # Right side: sort by key and date so every key occupies one contiguous,
    # date-ordered run that can be binary-searched.
    r['__k'] = _key(r, by_cols)
    r = r.sort_values(['__k', 'PIT Date']).reset_index(drop=True)

    rk   = r['__k'].to_numpy()
    rdt  = r['PIT Date'].to_numpy()
    rval = r['Value'].to_numpy()

    # Start offset of every key's run; the next start (or the array end)
    # closes the run.
    uniq, first = np.unique(rk, return_index=True)
    bounds = np.append(first, len(rk))
    slices = {
        k: (rdt[bounds[i]:bounds[i + 1]], rval[bounds[i]:bounds[i + 1]])
        for i, k in enumerate(uniq)
    }

    # Left side: stable sort by key so equal keys form contiguous blocks.
    lk    = _key(l, by_cols).to_numpy()
    ldt   = l['PIT Date'].to_numpy()
    order = np.argsort(lk, kind='mergesort')
    sk, sd, sp = lk[order], ldt[order], l_pos[order]

    # Block boundaries: positions where the sorted key changes.
    starts = np.flatnonzero(np.r_[True, sk[1:] != sk[:-1]])
    stops  = np.r_[starts[1:], len(sk)]

    for s, e in zip(starts, stops):
        hit = slices.get(sk[s])
        if hit is None:
            # Key never occurs on the right side.
            continue
        r_dates, r_vals = hit

        # Index of the last right date <= left date (-1 when there is none).
        pos   = np.searchsorted(r_dates, sd[s:e], side='right') - 1
        valid = pos >= 0

        if valid.any():
            target = sp[s:e][valid]
            out_vals[target]  = r_vals[pos[valid]]
            out_dates[target] = r_dates[pos[valid]]

    return out_vals, out_dates


# ---------- Small helpers ----------
def pctile(s, q):
    """
    Quantile ``q`` of ``s`` using linear interpolation.

    Any exception raised while computing the quantile is swallowed and
    NaN is returned instead.
    """
    try:
        result = s.quantile(q, interpolation='linear')
    except Exception:
        result = np.nan
    return result


def summarize_pct(series: pd.Series):
    """
    Summarise the finite values of a numeric series.

    +/-inf and missing values are discarded first.  Returns an empty dict
    when nothing remains; otherwise a dict with the number of finite rows,
    mean, median, the mean after 1% winsorization on each tail, and the
    deciles p10 ... p90.
    """
    finite = series.replace([np.inf, -np.inf], np.nan).dropna()
    if finite.empty:
        return {}

    # winsorize needs a writable array, hence the explicit copy.
    clipped = winsorize(finite.to_numpy().copy(), limits=[0.01, 0.01])

    summary = {
        "finite_rows": len(finite),
        "mean": finite.mean(),
        "median": finite.median(),
        "winsorized_mean_1pct": clipped.mean(),
    }
    # Deciles p10 .. p90, in ascending order.
    for decile in range(1, 10):
        summary[f"p{decile}0"] = pctile(finite, decile / 10)
    return summary


# ---------- Period prioritization ----------
# Priority ranking for period labels when deciding between multiple candidates
# Labels listed from highest to lowest priority; the full annual value 'A'
# ranks first with 100 and each following label ranks 10 lower, down to
# 'Q1' with 10.
_PERIOD_PRIORITY = {
    label: 100 - 10 * position
    for position, label in enumerate(
        ('A', 'Q4', 'T3', 'S2', 'Q3', 'T2', 'S1', 'Q2', 'T1', 'Q1')
    )
}


def _label_from_colname(colname: str) -> str:
    """
    Translate a column name into its _PERIOD_PRIORITY label.

    'A' is handled explicitly; every other name is passed through as is.
    """
    if colname == 'A':
        return 'A'
    return colname


# ---------- Helpers for AnnPITValue using A + Q1..Q4 sum ----------
def full_year_from_quarters(row, pit, cutoff):
    """
    Assemble a full-year figure by adding up Q1, Q2, Q3 and Q4 of one row.

    The candidate is only produced when every quarter supplies
      * a value      (row['Q<i>']),
      * a date       (row['Q<i>_Date']) that parses and lies in [cutoff, pit],
      * an OriginFP  (row['Q<i>_OriginFP']).
    As soon as one quarter fails a check the function gives up.

    Parameters
    ----------
    row : pd.Series or mapping with ``.get``
        Row holding the quarter columns.
    pit : datetime-like
        Upper bound (inclusive) for the quarter dates.
    cutoff : datetime-like
        Lower bound (inclusive) for the quarter dates.

    Returns
    -------
    tuple
        ``(latest quarter date, sum of the four values, largest OriginFP)``
        on success, otherwise ``(pd.NaT, np.nan, np.nan)``.
    """
    no_candidate = (pd.NaT, np.nan, np.nan)
    amounts, stamps, origins = [], [], []

    for quarter in range(1, 5):
        name = f'Q{quarter}'
        amount = row.get(name, np.nan)
        stamp = row.get(f'{name}_Date', pd.NaT)
        origin = row.get(f'{name}_OriginFP', np.nan)

        # All three pieces must be present for this quarter.
        if pd.isna(amount) or pd.isna(stamp) or pd.isna(origin):
            return no_candidate

        # The date has to parse and fall inside the allowed window.
        stamp = pd.to_datetime(stamp, errors='coerce')
        if pd.isna(stamp):
            return no_candidate
        if stamp < cutoff or stamp > pit:
            return no_candidate

        amounts.append(float(amount))
        stamps.append(stamp)
        origins.append(int(origin))

    # Every quarter passed: aggregate.
    return max(stamps), float(np.nansum(amounts)), max(origins)


def pick_annpit_sum_with_origin(row):
    """
    Choose the AnnPITValue for one row from the annual value (A) and the
    sum of the four quarters (Q1..Q4).

    A candidate is only considered when its date lies within the 365 days
    up to and including the row's PIT Date.  Candidates are then tried in
    this order and the first one available is returned:

      1. A whose OriginFP equals the row's FiscalPeriod
      2. Q1..Q4 sum whose OriginFP equals the row's FiscalPeriod
      3. A whose OriginFP is FiscalPeriod - 1
      4. Q1..Q4 sum whose OriginFP is FiscalPeriod - 1
      5. any remaining candidate, preferring higher period priority and
         then the later date

    Returns NaN when the PIT Date is missing or no candidate exists.
    A value of 0 is a legitimate result.
    """
    pit_date = row['PIT Date']
    if pd.isna(pit_date):
        return np.nan

    # Earliest acceptable candidate date.
    window_start = pit_date - timedelta(days=365)

    # Row's fiscal period as int, or None when missing / not convertible.
    raw_fp = row.get('FiscalPeriod', np.nan)
    try:
        row_fp = None if pd.isna(raw_fp) else int(raw_fp)
    except Exception:
        row_fp = None

    # Each candidate: (label, period_priority, date, value, origin_fp)
    pool = []

    # Annual candidate.
    annual_val = row.get('A', np.nan)
    annual_dt = row.get('A_Date', pd.NaT)
    annual_fp = row.get('A_OriginFP', np.nan)
    if not (pd.isna(annual_val) or pd.isna(annual_dt) or pd.isna(annual_fp)):
        annual_dt = pd.to_datetime(annual_dt, errors='coerce')
        if pd.notna(annual_dt) and (window_start <= annual_dt <= pit_date):
            pool.append(
                ('A', _PERIOD_PRIORITY['A'], annual_dt, float(annual_val), int(annual_fp))
            )

    # Quarter-sum candidate.
    sum_dt, sum_val, sum_fp = full_year_from_quarters(row, pit_date, window_start)
    if not (pd.isna(sum_val) or pd.isna(sum_dt) or pd.isna(sum_fp)):
        pool.append(
            ('Q4', _PERIOD_PRIORITY['Q4'], sum_dt, float(sum_val), int(sum_fp))
        )

    if not pool:
        return np.nan

    def relation(cand):
        """Classify the candidate's OriginFP relative to the row's FiscalPeriod."""
        origin = cand[4]
        if row_fp is None or origin is None:
            return 'unknown'
        if origin == row_fp:
            return 'same'
        if origin == row_fp - 1:
            return 'prior'
        return 'other'

    def by_date(cand):
        return cand[2]

    def by_priority_then_date(cand):
        return (cand[1], cand[2])

    # Tiers 1-4: (label, required relation, ranking key), tried in order.
    tiers = (
        ('A',  'same',  by_date),
        ('Q4', 'same',  by_priority_then_date),
        ('A',  'prior', by_date),
        ('Q4', 'prior', by_priority_then_date),
    )
    for label, wanted, ranking in tiers:
        matches = [
            c for c in pool
            if c[0] == label and relation(c) == wanted and not np.isnan(c[3])
        ]
        if matches:
            return max(matches, key=ranking)[3]

    # Tier 5: whatever is left (relation 'other' or 'unknown').
    remaining = [c for c in pool if not np.isnan(c[3])]
    if remaining:
        return max(remaining, key=by_priority_then_date)[3]

    # Not expected to be reached: every candidate in the pool has a value.
    return 0.0


# ============================ MAIN ============================
# Pipeline on `special_encoded` (skipped entirely when it is missing/None):
#   0. drop frequencies E/L/R/U, normalise dtypes, parse Q/S/T numbers
#   1. attach the latest annual value per ID/FiscalPeriod/HistCurrency as TrueValue
#   2. as-of join A, Q1..Q4, S1..S2, T1..T3 values and dates onto every row
#   3. sort and normalise the period date columns
#   4. compute AnnPITValue row by row
#   5. flag period dates that lie after the PIT Date
#   6. compute AnnPITValue_Pct and drop rows outside [25, 250] (or +/-inf)
#   7. select / order the final columns -> `special_processed`
#   8. write the full and the subset export
#   9. print the row accounting
if 'special_encoded' in globals() and special_encoded is not None:
    # Remember the number of input rows for row-accounting
    input_rows = len(special_encoded)
    print(f"Input dataset contains {input_rows:,} rows before processing.\n")

    # Work on a copy so we do not mutate the original DataFrame
    working = special_encoded.copy()

    # Exclude certain frequencies (E, L, R, U) from further processing
    excl_mask = working['Frequency'].astype(str).str.upper().isin(['E', 'L', 'R', 'U'])
    excluded_rows = int(excl_mask.sum())
    working = working.loc[~excl_mask].copy()

    # Convert key columns to appropriate types
    # (values that do not match the format / are not numeric become NaT / NaN)
    working['PIT Date'] = pd.to_datetime(
        working['PIT Date'], format='%Y-%m-%d', errors='coerce'
    ).dt.floor('D')
    working['FiscalPeriod'] = pd.to_numeric(working['FiscalPeriod'], errors='coerce')
    working['Value']        = pd.to_numeric(working['Value'], errors='coerce')

    # Ensure some ID-like columns are strings
    # NOTE: astype(str) also turns missing entries into text (e.g. 'nan'),
    # so these columns no longer contain NaN afterwards.
    for c in ['ID', 'HistCurrency', 'ItemCode', 'Frequency', 'Str_FiscalPrd']:
        if c in working.columns:
            working[c] = working[c].astype(str)

    # Parse Q/S/T numbers from Str_FiscalPrd (e.g. 'Q1Y23' -> QNUM=1)
    # Labels that do not match the pattern yield NaN.
    working['QNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^Q([1-4])Y', expand=False),
        errors='coerce'
    )
    working['SNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^S([1-2])Y', expand=False),
        errors='coerce'
    )
    working['TNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^T([1-3])Y', expand=False),
        errors='coerce'
    )

    # Define all period/value and period/date column names
    period_vals = [f'Q{i}' for i in range(1, 5)] + \
                  [f'S{i}' for i in range(1, 3)] + \
                  [f'T{i}' for i in range(1, 4)] + ['A']
    period_dates = [f'{p}_Date' for p in [f'Q{i}' for i in range(1,5)] + \
                                       [f'S{i}' for i in range(1,3)] + \
                                       [f'T{i}' for i in range(1,4)]] + ['A_Date']

    # Ensure all period value columns exist (initialize if missing)
    for c in period_vals:
        if c not in working.columns:
            working[c] = np.nan

    # Ensure all period date columns exist (initialize if missing)
    for c in period_dates:
        if c not in working.columns:
            working[c] = pd.NaT

    # Keys used to identify time series in as-of joins
    base_keys = ['ID', 'HistCurrency', 'ItemCode', 'FiscalPeriod']

    # -------------------------------------------------------------------------
    # 1) TrueValue from annuals: build reference "TrueValue" per ID/FiscalPeriod
    # -------------------------------------------------------------------------
    # Per (ID, FiscalPeriod, HistCurrency) keep the annual row with the latest
    # PIT Date: sort ascending, then keep the last row of each group.
    mask_annual = working['Frequency'].isin(['A', 'B']) & working['Value'].notna()
    annual_src = (
        working.loc[mask_annual,
                    ['ID', 'FiscalPeriod', 'HistCurrency', 'PIT Date', 'Value']]
        .sort_values(['ID', 'FiscalPeriod', 'HistCurrency', 'PIT Date'])
        .drop_duplicates(['ID', 'FiscalPeriod', 'HistCurrency'], keep='last')
        .rename(columns={'Value': 'TrueValue', 'PIT Date': 'TrueValue_Date'})
    )
    # Merge TrueValue back on keys
    # (annual_src was de-duplicated on exactly these keys above)
    working = working.merge(
        annual_src,
        on=['ID', 'FiscalPeriod', 'HistCurrency'],
        how='left'
    )

    # -------------------------------------------------------------------------
    # 2) As-of mapping for each frequency (no prior-year / no forward-fill)
    # -------------------------------------------------------------------------
    # Every lookup below is keyed on base_keys, which include FiscalPeriod, so
    # a row only ever receives values reported for its own FiscalPeriod.

    # Annual (A/B) as-of
    src_A = working.loc[
        working['Frequency'].isin(['A', 'B']) & working['Value'].notna(),
        base_keys + ['PIT Date', 'Value']
    ].copy()
    vA, dA = asof_numpy(working, src_A, by_cols=base_keys)
    working['A'], working['A_Date'] = vA, dA
    # Origin fiscal period for A
    # (the row's own FiscalPeriod wherever an A value was found)
    working['A_OriginFP'] = np.where(
        working['A'].notna(), working['FiscalPeriod'], np.nan
    )

    # Quarterly (Q/C) as-of, by quarter number
    src_Q = working.loc[
        working['Frequency'].isin(['Q', 'C']) & working['QNUM'].notna(),
        base_keys + ['QNUM', 'PIT Date', 'Value']
    ].copy()
    for q in (1, 2, 3, 4):
        # Subset source to a specific quarter
        rv = src_Q[src_Q['QNUM'] == q].drop(columns=['QNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'Q{q}', f'Q{q}_Date'
        working[col], working[dcol] = v, d

        # Set OriginFP where we have a newly filled quarter
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        mask = working[col].notna() & working[ocol].isna()
        working.loc[mask, ocol] = working.loc[mask, 'FiscalPeriod']

    # Semiannual (S/F) as-of, by half-year number
    src_S = working.loc[
        working['Frequency'].isin(['S', 'F']) & working['SNUM'].notna(),
        base_keys + ['SNUM', 'PIT Date', 'Value']
    ].copy()
    for s in (1, 2):
        rv = src_S[src_S['SNUM'] == s].drop(columns=['SNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'S{s}', f'S{s}_Date'
        working[col], working[dcol] = v, d

        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        mask = working[col].notna() & working[ocol].isna()
        working.loc[mask, ocol] = working.loc[mask, 'FiscalPeriod']

    # Trimester (T/K) as-of, by term number
    src_T = working.loc[
        working['Frequency'].isin(['T', 'K']) & working['TNUM'].notna(),
        base_keys + ['TNUM', 'PIT Date', 'Value']
    ].copy()
    for t in (1, 2, 3):
        rv = src_T[src_T['TNUM'] == t].drop(columns=['TNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'T{t}', f'T{t}_Date'
        working[col], working[dcol] = v, d

        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        mask = working[col].notna() & working[ocol].isna()
        working.loc[mask, ocol] = working.loc[mask, 'FiscalPeriod']

    # -------------------------------------------------------------------------
    # 3) Prepare labels and normalize dates (only as-of results, no ffill)
    # -------------------------------------------------------------------------
    working = working.sort_values(['ID', 'HistCurrency', 'FiscalPeriod', 'PIT Date'])

    value_labels  = period_vals
    date_labels   = period_dates
    origin_labels = [f'{lbl}_OriginFP' for lbl in value_labels]

    # Ensure all date columns are valid datetimes at day precision
    for c in date_labels:
        if c in working.columns:
            working[c] = pd.to_datetime(working[c], errors='coerce').dt.floor('D')

    # -------------------------------------------------------------------------
    # 4) AnnPITValue with new logic (A + Q1..Q4 sum, zeros allowed)
    # -------------------------------------------------------------------------
    # Row-wise apply: pick_annpit_sum_with_origin is called once per row.
    working['AnnPITValue'] = working.apply(
        pick_annpit_sum_with_origin,
        axis=1
    )

    # -------------------------------------------------------------------------
    # 5) QC: Future-date check (period date > PIT Date)
    # -------------------------------------------------------------------------
    date_cols_all = [
        'A_Date',
        'Q1_Date', 'Q2_Date', 'Q3_Date', 'Q4_Date',
        'S1_Date', 'S2_Date',
        'T1_Date', 'T2_Date', 'T3_Date'
    ]
    # Only use date columns that actually exist
    present = [c for c in date_cols_all if c in working.columns]
    viol_counts = {}
    any_mask = pd.Series(False, index=working.index)

    # For each period date column, check if it's after PIT Date
    for c in present:
        m = (
            working[c].notna() &
            working['PIT Date'].notna() &
            (pd.to_datetime(working[c], errors='coerce') > working['PIT Date'])
        )
        # Count violations per column
        viol_counts[c] = int(m.sum())
        # Track rows with any violation across all period dates
        any_mask |= m

    total_future_viol = int(any_mask.sum())
    print("\n=== Future-date check (period dates > PIT Date) ===")
    print("Per-label violations:", viol_counts)
    print(f"Rows with ANY future-dated period value: {total_future_viol}")
    # Flag rows with at least one future-date violation
    # (rows are only flagged here, not removed)
    working['HasFutureDateError'] = any_mask

    # -------------------------------------------------------------------------
    # 6) AnnPITValue_Pct and quality filter
    # -------------------------------------------------------------------------
    # Percentage of AnnPITValue relative to TrueValue (%)
    # Left as NaN when either value is missing or TrueValue is 0.
    working['AnnPITValue_Pct'] = np.where(
        working['AnnPITValue'].notna() &
        working['TrueValue'].notna() &
        (working['TrueValue'] != 0),
        (working['AnnPITValue'] / working['TrueValue']) * 100,
        np.nan
    )

    # Summary BEFORE dropping outliers
    pre_stats = summarize_pct(working['AnnPITValue_Pct'])
    print("\n=== AnnPITValue_Pct summary — BEFORE quality drop ===")
    for k, v in pre_stats.items():
        print(f"{k:>20}: {v}")

    pct = working['AnnPITValue_Pct']
    # Flag infinities
    is_inf = np.isinf(pct)
    # Flag finite out-of-range values outside [25, 250]
    is_finite = np.isfinite(pct)
    out_of_range = is_finite & ((pct > 250) | (pct < 25))
    # Combined drop mask: infinities or out-of-range finite values
    # (NaN percentages are neither infinite nor finite, so those rows are kept)
    to_drop_quality = is_inf | out_of_range

    # Count dropped rows due to quality rules
    dropped_quality_rows = int(to_drop_quality.sum())
    print(f"\nRows to drop due to AnnPITValue_Pct (±inf or >250 or <25): {dropped_quality_rows:,}")

    # Keep only rows that pass the quality filter
    working = working.loc[~to_drop_quality].copy()

    # Summary AFTER dropping outliers
    post_stats = summarize_pct(working['AnnPITValue_Pct'])
    print("\n=== AnnPITValue_Pct summary — AFTER quality drop ===")
    if post_stats:
        for k, v in post_stats.items():
            print(f"{k:>20}: {v}")
    else:
        print("No finite values remain after the quality drop.")

    # -------------------------------------------------------------------------
    # 7) Final columns and cleanup
    # -------------------------------------------------------------------------
    # Core columns that describe each row
    base_cols = [
        'ID', 'CompanyName', 'ImplCountry', 'CurrentCurrency', 'HistCurrency',
        'PIT Date', 'Frequency', 'UpdateCode', 'FiscalPeriod', 'FYE Month',
        'ItemCode', 'Value', 'Str_FiscalPrd'
    ]

    # Period-related columns (Dates and Values)
    freq_cols = []
    for i in range(1, 5):
        freq_cols += [f'Q{i}_Date', f'Q{i}']
    for i in range(1, 3):
        freq_cols += [f'S{i}_Date', f'S{i}']
    for i in range(1, 4):
        freq_cols += [f'T{i}_Date', f'T{i}']
    freq_cols += ['A_Date', 'A']

    # Columns we want to keep in the final output
    keep_cols = (
        [c for c in base_cols if c in working.columns] +
        ['TrueValue', 'AnnPITValue', 'AnnPITValue_Pct', 'HasFutureDateError'] +
        [c for c in freq_cols if c in working.columns]
    )

    # Drop helper columns such as OriginFP and intermediate numeric helpers
    drop_cols = [c for c in working.columns
                 if c.endswith('_OriginFP') or c in ['QNUM', 'SNUM', 'TNUM', 'TrueValue_Date']]
    working.drop(columns=drop_cols, inplace=True, errors='ignore')

    # Reorder to the final column set
    # (reindex also discards every column that is not listed in keep_cols)
    special_processed = working.reindex(columns=keep_cols)

    # -------------------------------------------------------------------------
    # 8) Save outputs (requires Temp_file_path_DP and base_output_filename)
    # -------------------------------------------------------------------------
    # Both variables must be defined in a previous setup cell
    assert 'Temp_file_path_DP' in globals(), "Temp_file_path_DP not found."
    assert 'base_output_filename' in globals(), "base_output_filename not found (set in Cell 0)."

    # Full export path and write to pipe-delimited text
    out_full = os.path.join(Temp_file_path_DP, f"{base_output_filename}.txt")
    special_processed.to_csv(out_full, sep='|', index=False)
    print("\nSaved full:", out_full)

    # Subset export with a small selection of columns
    subset_cols = ["ID", "PIT Date", "CompanyName", "HistCurrency", "FiscalPeriod", "AnnPITValue"]
    subset_cols_existing = [col for col in subset_cols if col in special_processed.columns]
    subset_df = special_processed[subset_cols_existing].copy()
    out_subset = os.path.join(Temp_file_path_DP, f"{base_output_filename}_subset.txt")
    subset_df.to_csv(out_subset, sep='|', index=False)
    print("Saved subset:", out_subset)
    del subset_df  # free some memory

    # -------------------------------------------------------------------------
    # 9) Row-accounting overview
    # -------------------------------------------------------------------------
    output_rows = len(special_processed)
    print("\n=== Row Accounting ===")
    print(f"Input rows:                     {input_rows:,}")
    print(f"Excluded by Frequency (E/L/R/U):{excluded_rows:,}")
    print(f"Dropped by quality (Pct rules): {dropped_quality_rows:,}")
    print(f"Output rows (final):            {output_rows:,}")

    # Sanity check: excluded + dropped + final should equal original
    check_total = excluded_rows + dropped_quality_rows + output_rows
    print(f"Check: excluded + dropped + output = {check_total:,}")
    if check_total == input_rows:
        print("Row counts reconcile exactly.")
    else:
        print(f"Mismatch of {input_rows - check_total:+,} rows.")

    # Trigger garbage collection (gc is imported in the environment setup cell)
    gc.collect()

else:
    # Nothing to process: special_encoded is not defined or is None
    print("special_encoded not found or None; skipping.")

Input dataset contains 2,433,080 rows before processing.


=== Future-date check (period dates > PIT Date) ===
Per-label violations: {'A_Date': 0, 'Q1_Date': 0, 'Q2_Date': 0, 'Q3_Date': 0, 'Q4_Date': 0, 'S1_Date': 0, 'S2_Date': 0, 'T1_Date': 0, 'T2_Date': 0, 'T3_Date': 0}
Rows with ANY future-dated period value: 0

=== AnnPITValue_Pct summary — BEFORE quality drop ===
         finite_rows: 1778807
                mean: 18071.225460238547
              median: 100.0
winsorized_mean_1pct: 98.8033106519789
                 p10: 92.66924215631829
                 p20: 100.0
                 p30: 100.0
                 p40: 100.0
                 p50: 100.0
                 p60: 100.0
                 p70: 100.0
                 p80: 100.0
                 p90: 101.17604413937995

Rows to drop due to AnnPITValue_Pct (±inf or >250 or <25): 65,076

=== AnnPITValue_Pct summary — AFTER quality drop ===
         finite_rows: 1713731
                mean: 100.09592602971993
              median: 

### Special 8

#### Set Index

In [287]:
# =============================================================================
# SELECT A SINGLE SPECIAL_* ITEM AND PREPARE PATHS
# =============================================================================
# This cell:
#   1. Chooses which Special_* item (from special_vars) should be processed.
#   2. Validates that special_vars and Temp_file_path_DP are available.
#   3. Builds the input file path for the selected "work_subset_<item>.txt".
#   4. Sets a base_output_filename for downstream output files.
#   5. Ensures the data-preparation temp directory exists.
#
# Usage:
#   - Adjust `special_index` to run a different Special_* dataset (e.g., 2, 3, 10 ...).
#   - Assumes `special_vars` was created in the categorization step and
#     `Temp_file_path_DP` was defined in the environment setup.

# --- Which Special_* entry should this run process? ---
special_index = 8  # e.g. set to 10 to process Special_10 instead

# Expected shape of special_vars: {'Special_1': 'SomeItem', 'Special_2': 'OtherItem', ...}
assert 'special_vars' in globals(), "special_vars dict not found in globals()."

# Resolve the item name that belongs to the chosen index; a missing or empty
# entry stops the cell here.
item_key = f"Special_{special_index}"
target_item_name = special_vars.get(item_key)
assert target_item_name, f"{item_key} not found in special_vars."

print(f"Selected: {item_key}  ->  ItemName: '{target_item_name}'")

# --- Paths (built from globals defined in earlier cells) ---
assert 'Temp_file_path_DP' in globals(), "Temp_file_path_DP not found."

# Input: the per-item work_subset file inside the temp directory
file_name = f"work_subset_{target_item_name}.txt"
file_path = os.path.join(Temp_file_path_DP, file_name)

# Stem shared by all output files of the "Special" pipeline
base_output_filename = f"Special_{target_item_name}_complete"

# Create the temp directory (including parents) if it is not there yet
os.makedirs(Temp_file_path_DP, exist_ok=True)


Selected: Special_8  ->  ItemName: 'Long_Term_Borrowings'


#### Import relevant data



In [288]:
# =============================================================================
# LOAD THE FULL DATASET FOR THE SELECTED SPECIAL ITEM
# =============================================================================
# This cell:
#   1. Uses `target_item_name` and `file_path` (defined in the previous cell)
#      to load the corresponding work_subset file.
#   2. Imports the file using `import_file_to_dataframe`.
#   3. Performs safety checks for existence and emptiness.
#   4. Shows a preview of the loaded dataset.
#   5. Falls back to an empty DataFrame if loading fails.
#   6. Runs garbage collection afterwards.
# =============================================================================

print(f"\nImporting full dataset for Item: '{target_item_name}' ...")

# Handle the failure cases first; every path leaves `special_raw` bound to a
# DataFrame (empty when nothing usable could be loaded).
if not os.path.exists(file_path):
    print(f"File not found: {file_path}")
    special_raw = pd.DataFrame()
else:
    special_raw = import_file_to_dataframe(file_path)

    if special_raw is None or special_raw.empty:
        print("Dataset appears empty or could not be loaded.")
        special_raw = pd.DataFrame()
    else:
        print(f"Full dataset loaded successfully: {len(special_raw):,} rows total.")
        # Rich preview when `display` works, plain-text preview otherwise.
        try:
            display(special_raw.head())
        except Exception:
            print(special_raw.head().to_string(index=False))

# Release memory held by temporaries created during the import.
gc.collect()



Importing full dataset for Item: 'Long_Term_Borrowings' ...
Full dataset loaded successfully: 1,381,622 rows total.


Unnamed: 0,ID,CompanyName,ImplCountry,CurrentCurrency,HistCurrency,PIT Date,Frequency,UpdateCode,FiscalPeriod,FYE Month,ItemCode,Value
0,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1992,December,4401,0.0
1,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1993,December,4401,90.276215
2,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1994,December,4401,0.0
3,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1996-05-03,A,3,1995,December,4401,0.0
4,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1998-07-03,A,3,1996,December,4401,0.0


0

#### Encode Frequency Code (Check of output required!)

In [289]:
# =============================================================================
# SUMMARY
# =============================================================================
# (unchanged documentation)
# ...

def last2(n):
    """Return the last two digits of ``n`` as a zero-padded string.

    ``n`` is truncated with ``int()`` first. Missing input (anything
    ``pd.isna`` reports as NA) yields ``None``.
    """
    if pd.isna(n):
        return None
    return f"{int(n):04d}"[-2:]


def add_str_fiscalprd(df):
    """Add a ``Str_FiscalPrd`` label and rewrite ``FiscalPeriod`` as a year.

    The function works on a copy; the caller's frame is not modified.

    Label construction, by upper-cased ``Frequency``:
      * A, B       -> ``Y<yy>``      (yy = last two digits of FiscalPeriod)
      * C, Q, E, R -> ``Q<n>Y<yy>``  (n = FiscalPeriod % 4 + 1, year = FiscalPeriod // 4)
      * F, S       -> ``S<n>Y<yy>``  (n = FiscalPeriod % 2 + 1, year = FiscalPeriod // 2)
      * K, T, L, U -> ``T<n>Y<yy>``  (n = FiscalPeriod % 3 + 1, year = FiscalPeriod // 3)
    Rows with any other frequency keep a missing ``Str_FiscalPrd``.

    ``FiscalPeriod`` is then replaced by a four-digit year rebuilt from the
    two-digit year of the label (yy >= 80 -> 19yy, otherwise 20yy). For annual
    rows the rebuilt year is compared with the original value and every
    mismatch is reported, so years outside 1980-2079 show up in that report.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``Frequency`` and ``FiscalPeriod``; ``ID`` is needed only
        when discrepancies have to be listed.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` with ``Str_FiscalPrd`` added and ``FiscalPeriod``
        overwritten.
    """
    df = df.copy()
    df["Frequency"] = df["Frequency"].str.upper().fillna("")
    df['Original_FiscalPeriod'] = df['FiscalPeriod']

    fp = pd.to_numeric(df["FiscalPeriod"], errors="coerce")

    m_quarter = df["Frequency"].isin(["C", "Q", "E", "R"])
    m_AB      = df["Frequency"].isin(["A", "B"])
    m_FS      = df["Frequency"].isin(["F", "S"])
    m_KTLU    = df["Frequency"].isin(["K", "T", "L", "U"])

    # Start from an object-dtype column: a float64 NaN column would trigger
    # pandas' incompatible-dtype warning as soon as strings are written into
    # it, and would make the .str accessor below fail when no row is labelled.
    df["Str_FiscalPrd"] = np.full(len(df), np.nan, dtype=object)

    # Annual rows: the label is just the two-digit year
    ab_year = fp.where(m_AB).apply(last2)
    df.loc[m_AB, "Str_FiscalPrd"] = "Y" + ab_year.fillna('')

    # Sub-annual rows: FiscalPeriod encodes year * periods_per_year + (period - 1)
    sub_annual = (
        ("Q", m_quarter, 4),
        ("S", m_FS, 2),
        ("T", m_KTLU, 3),
    )
    for prefix, mask, periods_per_year in sub_annual:
        period_no = ((fp % periods_per_year) + 1).where(mask)
        year_2d = (fp // periods_per_year).where(mask).apply(last2)
        df.loc[mask, "Str_FiscalPrd"] = (
            prefix + period_no.astype("Int64").astype(str) + "Y" + year_2d.fillna('')
        )

    # Rebuild a four-digit year from the two-digit year inside the label
    year_part = df['Str_FiscalPrd'].str.extract(r'Y(\d{2})', expand=False)
    year_numeric = pd.to_numeric(year_part, errors='coerce')

    # NOTE: century pivot at 80 -> 1980..1999 / 2000..2079
    df['ImplFiscPer_Calculated'] = year_numeric.apply(
        lambda x: int(f"19{int(x):02d}") if pd.notna(x) and x >= 80
        else (int(f"20{int(x):02d}") if pd.notna(x) else np.nan)
    )

    # Consistency check on annual rows: rebuilt year vs. original FiscalPeriod
    annual_rows_for_check = df[m_AB].copy()
    original_numeric = pd.to_numeric(
        annual_rows_for_check['Original_FiscalPeriod'], errors='coerce'
    )
    calculated = annual_rows_for_check['ImplFiscPer_Calculated']
    discrepancy_mask_annual = ~(
        (calculated == original_numeric) |
        (calculated.isna() & original_numeric.isna())
    )

    discrepancy_rows = annual_rows_for_check[discrepancy_mask_annual].copy()

    if not discrepancy_rows.empty:
        print("\nDiscrepancies found between original FiscalPeriod and calculated ImplFiscPer for Annual (A, B) frequencies:")
        preview = discrepancy_rows[
            ['ID', 'Frequency', 'Original_FiscalPeriod', 'Str_FiscalPrd', 'ImplFiscPer_Calculated']
        ].head()
        try:
            display(preview)
        except NameError:
            # display() only exists inside IPython/Jupyter; use plain text elsewhere
            print(preview.to_string(index=False))
        print(f"Total discrepancies found for Annual frequencies: {len(discrepancy_rows)}")
    else:
        print("\nNo discrepancies found between original FiscalPeriod and calculated ImplFiscPer for Annual (A, B) frequencies.")

    df['FiscalPeriod'] = df['ImplFiscPer_Calculated']
    df.drop(columns=['Original_FiscalPeriod', 'ImplFiscPer_Calculated'], inplace=True)

    return df


# =============================================================================
# Driver: encode special_raw when it exists and actually holds rows
# =============================================================================
if 'special_raw' not in globals() or special_raw is None or special_raw.empty:
    # Nothing to encode; downstream cells test special_encoded for None
    print("special_raw not found or empty. Cannot perform encoding.")
    special_encoded = None
else:
    print(f"Applying encoding to Special dataset for '{target_item_name}' ...")
    special_encoded = add_str_fiscalprd(special_raw)
    display(special_encoded.head())


Applying encoding to Special dataset for 'Long_Term_Borrowings' ...


  df.loc[m_quarter, "Str_FiscalPrd"] = (



No discrepancies found between original FiscalPeriod and calculated ImplFiscPer for Annual (A, B) frequencies.


Unnamed: 0,ID,CompanyName,ImplCountry,CurrentCurrency,HistCurrency,PIT Date,Frequency,UpdateCode,FiscalPeriod,FYE Month,ItemCode,Value,Str_FiscalPrd
0,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1992,December,4401,0.0,Y92
1,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1993,December,4401,90.276215,Y93
2,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1994,December,4401,0.0,Y94
3,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1996-05-03,A,3,1995,December,4401,0.0,Y95
4,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1998-07-03,A,3,1996,December,4401,0.0,Y96


#### Annualize data with most recent information (Check of output required!)

In [290]:
# @title
# =============================================================================
# SUMMARY
# =============================================================================
# This script:
#   - Implements a fast "as-of join" between two DataFrames based on PIT dates
#     and key columns (asof_numpy).
#   - Provides helpers for percentile summaries and winsorized statistics.
#   - Builds annualized "AnnPITValue" values from:
#       * true annual data (A/B frequency) and
#       * sums of quarterly data (Q1..Q4) when available.
#   - Performs various quality checks (future-dated values, extreme percentages).
#   - Produces a processed "special_processed" DataFrame and saves:
#       * a full export and
#       * a subset export.
#   - Prints a row-accounting overview so drops and exclusions are transparent.
# =============================================================================


# ---------- Helper: fast as-of (right.PIT <= left.PIT) ----------
def _key(fr, cols):
    """
    Build one combined string key per row by joining the string form of the
    columns in `cols` with '||'.

    The concatenation is done column-wise (vectorized) instead of row by row,
    which matters on frames with millions of rows, and it returns an empty
    Series for an empty frame.
    """
    parts = fr[cols].astype(str)
    combined = parts[cols[0]]
    for col in cols[1:]:
        combined = combined + '||' + parts[col]
    combined.name = None
    return combined


def _run_bounds(sorted_keys):
    """
    Return (starts, ends) index arrays delimiting each run of equal values in
    a 1-D array whose equal values are already contiguous.
    """
    n = len(sorted_keys)
    if n == 0:
        empty = np.array([], dtype=np.intp)
        return empty, empty
    # Positions where the key differs from its predecessor open a new run
    change = np.flatnonzero(sorted_keys[1:] != sorted_keys[:-1]) + 1
    starts = np.concatenate(([0], change))
    ends = np.concatenate((change, [n]))
    return starts, ends


def asof_numpy(left_df: pd.DataFrame, right_df: pd.DataFrame, by_cols: list[str]):
    """
    For each row in left_df, find the latest (as-of) Value from right_df
    with matching by_cols and right_df['PIT Date'] <= left_df['PIT Date'].

    Rows with a missing key, date or (right side) Value are ignored, as are
    rows whose 'PIT Date' cannot be parsed. Dates are compared at day
    granularity. Results are written by ROW POSITION of left_df, so the
    function works for any index on left_df (not only a default RangeIndex).

    Parameters
    ----------
    left_df : pd.DataFrame
        Needs the columns in by_cols plus 'PIT Date'.
    right_df : pd.DataFrame
        Needs the columns in by_cols plus 'PIT Date' and 'Value'.
    by_cols : list[str]
        Columns that must match exactly between both frames.

    Returns
    -------
    out_vals : np.ndarray
        Array of float values (same length as left_df) containing the matched
        values from right_df (or NaN if none found).
    out_dates : np.ndarray
        Array of datetime64 values containing the matched dates (or NaT).
    """
    # Initialize output arrays with NaN/NaT
    out_vals  = np.full(len(left_df), np.nan, dtype='float64')
    out_dates = np.full(len(left_df), 'NaT', dtype='datetime64[ns]')

    # Required columns in left/right for the as-of join
    left_req  = by_cols + ['PIT Date']
    right_req = by_cols + ['PIT Date', 'Value']

    # Masks of rows with all required fields present
    lmask = left_df[left_req].notna().all(axis=1).to_numpy()
    rmask = right_df[right_req].notna().all(axis=1).to_numpy()

    # If no valid rows on either side, return the all-missing outputs
    if not lmask.any() or not rmask.any():
        return out_vals, out_dates

    # Work on fresh frames; remember the POSITION of each kept left row so the
    # results can be written back independently of left_df's index labels
    l = left_df.loc[lmask, left_req].reset_index(drop=True)
    r = right_df.loc[rmask, right_req].reset_index(drop=True)
    l_pos = np.flatnonzero(lmask)

    # Normalize PIT dates to daily granularity
    l['PIT Date'] = pd.to_datetime(l['PIT Date'], errors='coerce').dt.floor('D')
    r['PIT Date'] = pd.to_datetime(r['PIT Date'], errors='coerce').dt.floor('D')

    # Dates that failed to parse became NaT; NaT sorts last and would distort
    # the binary search below, so such rows are excluded on both sides
    l_ok = l['PIT Date'].notna().to_numpy()
    r_ok = r['PIT Date'].notna().to_numpy()
    if not l_ok.any() or not r_ok.any():
        return out_vals, out_dates
    if not l_ok.all():
        l, l_pos = l.loc[l_ok].reset_index(drop=True), l_pos[l_ok]
    if not r_ok.all():
        r = r.loc[r_ok].reset_index(drop=True)

    # Sort right side by key and PIT Date so each key forms one contiguous,
    # date-ordered run that can be binary-searched
    r['__k'] = _key(r, by_cols)
    r = r.sort_values(['__k', 'PIT Date']).reset_index(drop=True)

    rk   = r['__k'].to_numpy()
    rdt  = r['PIT Date'].to_numpy()
    rval = r['Value'].to_numpy()

    # Per-key views into the sorted right-hand dates and values
    r_starts, r_ends = _run_bounds(rk)
    slices = {rk[s]: (rdt[s:e], rval[s:e]) for s, e in zip(r_starts, r_ends)}

    # Sort left rows by key (stable sort) for block processing
    lk    = _key(l, by_cols).to_numpy()
    ldt   = l['PIT Date'].to_numpy()
    order = np.argsort(lk, kind='mergesort')
    sk, sd, sp = lk[order], ldt[order], l_pos[order]

    # Process one block of identical left keys at a time
    for s, e in zip(*_run_bounds(sk)):
        hit = slices.get(sk[s])
        if hit is None:
            continue  # key does not occur on the right side
        r_dates, r_vals = hit

        # Index of the last right date <= left date (-1 if there is none)
        pos   = np.searchsorted(r_dates, sd[s:e], side='right') - 1
        valid = pos >= 0

        if valid.any():
            targets = sp[s:e][valid]
            out_vals[targets]  = r_vals[pos[valid]]
            out_dates[targets] = r_dates[pos[valid]]

    return out_vals, out_dates


# ---------- Small helpers ----------
def pctile(s, q):
    """
    Return the q-quantile of `s` (linear interpolation); any failure while
    computing it is turned into NaN instead of being raised.
    """
    try:
        result = s.quantile(q, interpolation='linear')
    except Exception:
        result = np.nan
    return result


def summarize_pct(series: pd.Series):
    """
    Summarize the finite values of a numeric series.

    Returns an empty dict when no finite value is left; otherwise a dict with
    the row count, mean, median, 1%-winsorized mean and the deciles p10..p90.
    """
    # Treat +/-inf like missing values and keep only what remains
    finite = series.replace([np.inf, -np.inf], np.nan).dropna()
    if finite.empty:
        return {}

    # Hand winsorize its own copy of the data so it always gets a writable array
    clipped = winsorize(finite.to_numpy().copy(), limits=[0.01, 0.01])

    summary = {
        "finite_rows": len(finite),
        "mean": finite.mean(),
        "median": finite.median(),
        "winsorized_mean_1pct": clipped.mean(),
    }
    # Deciles p10 .. p90, appended in ascending order
    for decile in range(10, 100, 10):
        summary[f"p{decile}"] = pctile(finite, decile / 100)
    return summary


# ---------- Period prioritization ----------
# Rank of each period label when several candidates compete: labels are listed
# from most to least preferred and receive 100, 90, ..., 10 in that order
# (full annual 'A' first, 'Q1' last).
_PERIOD_PRIORITY = dict(zip(
    ('A', 'Q4', 'T3', 'S2', 'Q3', 'T2', 'S1', 'Q2', 'T1', 'Q1'),
    range(100, 0, -10),
))


def _label_from_colname(colname: str) -> str:
    """
    Translate a column name into the period label used as key of
    _PERIOD_PRIORITY. Only 'A' is special-cased; every other name is
    returned as given.
    """
    if colname == 'A':
        return 'A'
    return colname


# ---------- Helpers for AnnPITValue using A + Q1..Q4 sum ----------
def full_year_from_quarters(row, pit, cutoff):
    """
    Assemble a full-year figure by adding up the four quarter entries of a row.

    A result is produced only when every one of Q1..Q4 provides
      * a non-missing value            (row['Qn']),
      * a non-missing, parseable date  (row['Qn_Date']) inside [cutoff, pit],
      * a non-missing origin period    (row['Qn_OriginFP']).
    The first quarter that fails any of these checks ends the evaluation.

    Parameters
    ----------
    row : pd.Series
        Row from the working DataFrame (anything offering .get works).
    pit : datetime-like
        Upper bound for the quarter dates.
    cutoff : datetime-like
        Lower bound for the quarter dates.

    Returns
    -------
    (dt, val_sum, origin_fp)
        dt        : most recent of the four quarter dates
        val_sum   : sum of the four quarter values (float)
        origin_fp : largest origin fiscal period among the quarters (int)
    or (NaT, NaN, NaN) when the quarters are incomplete or out of range.
    """
    no_result = (pd.NaT, np.nan, np.nan)
    amounts, stamps, origins = [], [], []

    for quarter in ('Q1', 'Q2', 'Q3', 'Q4'):
        amount = row.get(quarter, np.nan)
        stamp = row.get(quarter + '_Date', pd.NaT)
        origin = row.get(quarter + '_OriginFP', np.nan)

        # All three pieces of information are mandatory
        if pd.isna(amount) or pd.isna(stamp) or pd.isna(origin):
            return no_result

        # The date has to parse and to lie inside the accepted window
        stamp = pd.to_datetime(stamp, errors='coerce')
        if pd.isna(stamp):
            return no_result
        inside_window = cutoff <= stamp <= pit
        if not inside_window:
            return no_result

        amounts.append(float(amount))
        stamps.append(stamp)
        origins.append(int(origin))

    # Every quarter qualified: newest date, total value, newest origin period
    return max(stamps), float(np.nansum(amounts)), max(origins)


def pick_annpit_sum_with_origin(row):
    """
    Compute AnnPITValue using annual (A) and quarterly (Q1..Q4) data.

    Two candidates are considered, each only if its date lies within the
    365 days up to and including the row's PIT Date:
      * 'A'  : the annual value of the row (row['A'], row['A_Date'],
               row['A_OriginFP']),
      * 'Q4' : the sum of Q1..Q4 as returned by full_year_from_quarters.

    Selection order (first match wins), based on how the candidate's origin
    fiscal period relates to the row's FiscalPeriod:
      1. annual value of the same fiscal period
      2. quarter sum of the same fiscal period
      3. annual value of the prior fiscal period
      4. quarter sum of the prior fiscal period
      5. otherwise the candidate with the highest (priority, date)

    Parameters
    ----------
    row : pd.Series
        Row of the working DataFrame; must contain 'PIT Date'.

    Returns
    -------
    float
        The selected value (0 is a legitimate value), or NaN when the PIT Date
        is missing or no usable candidate exists. A missing result is always
        NaN; the function never substitutes 0.0 for "nothing found".
    """
    pit = row['PIT Date']
    # If PIT Date is missing, no AnnPITValue can be computed
    if pd.isna(pit):
        return np.nan

    # Lower bound for acceptable dates (365 days before PIT)
    cutoff = pit - timedelta(days=365)

    # Fiscal period of the row as integer, or None when missing/unusable
    fp = row.get('FiscalPeriod', np.nan)
    try:
        fp_int = int(fp) if not pd.isna(fp) else None
    except (TypeError, ValueError, OverflowError):
        fp_int = None

    # Candidate tuples: (label, period_priority, date, value, origin_fp)
    candidates = []

    # --- A: actual annual (0 is allowed) ---
    A_val = row.get('A', np.nan)
    A_dt  = row.get('A_Date', pd.NaT)
    A_ofp = row.get('A_OriginFP', np.nan)
    if pd.notna(A_val) and pd.notna(A_dt) and not pd.isna(A_ofp):
        A_dt = pd.to_datetime(A_dt, errors='coerce')
        if pd.notna(A_dt) and (cutoff <= A_dt <= pit):
            candidates.append(('A', _PERIOD_PRIORITY['A'], A_dt, float(A_val), int(A_ofp)))

    # --- Q4 candidate: sum of Q1..Q4 (0 is allowed) ---
    q4_dt, q4_val, q4_fp = full_year_from_quarters(row, pit, cutoff)
    if pd.notna(q4_val) and pd.notna(q4_dt) and not pd.isna(q4_fp):
        candidates.append(('Q4', _PERIOD_PRIORITY['Q4'], q4_dt, float(q4_val), int(q4_fp)))

    # Keep only candidates whose value is a real number (0 is allowed)
    candidates = [c for c in candidates if not np.isnan(c[3])]
    if not candidates:
        return np.nan

    def relation(origin_fp):
        """Classify a candidate's origin period relative to the row's period."""
        if fp_int is None:
            return 'unknown'
        if origin_fp == fp_int:
            return 'same'
        if origin_fp == fp_int - 1:
            return 'prior'
        return 'other'

    # Tiers 1-4: same period before prior period, annual before quarter sum;
    # within a tier the most recent date wins
    for wanted in ('same', 'prior'):
        for label in ('A', 'Q4'):
            tier = [c for c in candidates if c[0] == label and relation(c[4]) == wanted]
            if tier:
                return max(tier, key=lambda c: c[2])[3]

    # Tier 5: anything left (other/unknown relation) by (priority, date)
    return max(candidates, key=lambda c: (c[1], c[2]))[3]


# ============================ MAIN ============================
# Turns `special_encoded` into `special_processed`:
#   frequency exclusion -> type normalisation -> as-of mapping per period ->
#   AnnPITValue -> QC checks -> quality filter -> export -> row accounting.
if 'special_encoded' in globals() and special_encoded is not None:
    # Fail fast: the exports at the end need these two globals, so verify them
    # before the expensive processing starts rather than after it has finished.
    assert 'Temp_file_path_DP' in globals(), "Temp_file_path_DP not found."
    assert 'base_output_filename' in globals(), "base_output_filename not found (set in Cell 0)."

    # Remember the number of input rows for row-accounting
    input_rows = len(special_encoded)
    print(f"Input dataset contains {input_rows:,} rows before processing.\n")

    # Work on a copy so we do not mutate the original DataFrame
    working = special_encoded.copy()

    # Exclude certain frequencies (E, L, R, U) from further processing
    excl_mask = working['Frequency'].astype(str).str.upper().isin(['E', 'L', 'R', 'U'])
    excluded_rows = int(excl_mask.sum())
    working = working.loc[~excl_mask].copy()

    # Convert key columns to appropriate types
    working['PIT Date'] = pd.to_datetime(
        working['PIT Date'], format='%Y-%m-%d', errors='coerce'
    ).dt.floor('D')
    working['FiscalPeriod'] = pd.to_numeric(working['FiscalPeriod'], errors='coerce')
    working['Value']        = pd.to_numeric(working['Value'], errors='coerce')

    # Ensure some ID-like columns are strings
    for c in ['ID', 'HistCurrency', 'ItemCode', 'Frequency', 'Str_FiscalPrd']:
        if c in working.columns:
            working[c] = working[c].astype(str)

    # Sub-annual period families, described once and reused below:
    # (label prefix, helper column, source frequencies, period numbers)
    sub_periods = (
        ('Q', 'QNUM', ['Q', 'C'], (1, 2, 3, 4)),
        ('S', 'SNUM', ['S', 'F'], (1, 2)),
        ('T', 'TNUM', ['T', 'K'], (1, 2, 3)),
    )

    # Parse Q/S/T numbers from Str_FiscalPrd (e.g. 'Q1Y23' -> QNUM=1)
    for prefix, num_col, _, numbers in sub_periods:
        pattern = rf'^{prefix}([{numbers[0]}-{numbers[-1]}])Y'
        working[num_col] = pd.to_numeric(
            working['Str_FiscalPrd'].str.extract(pattern, expand=False),
            errors='coerce'
        )

    # Period value columns (Q1..Q4, S1..S2, T1..T3, A) and their date columns
    period_vals = [
        f'{prefix}{number}'
        for prefix, _, _, numbers in sub_periods
        for number in numbers
    ] + ['A']
    period_dates = [f'{p}_Date' for p in period_vals]

    # Ensure all period value columns exist (initialize if missing)
    for c in period_vals:
        if c not in working.columns:
            working[c] = np.nan

    # Ensure all period date columns exist (initialize if missing)
    for c in period_dates:
        if c not in working.columns:
            working[c] = pd.NaT

    # Keys used to identify time series in as-of joins
    base_keys = ['ID', 'HistCurrency', 'ItemCode', 'FiscalPeriod']

    # -------------------------------------------------------------------------
    # 1) TrueValue from annuals: build reference "TrueValue" per ID/FiscalPeriod
    # -------------------------------------------------------------------------
    # The annual value with the latest PIT Date per key is the reference value
    mask_annual = working['Frequency'].isin(['A', 'B']) & working['Value'].notna()
    annual_src = (
        working.loc[mask_annual,
                    ['ID', 'FiscalPeriod', 'HistCurrency', 'PIT Date', 'Value']]
        .sort_values(['ID', 'FiscalPeriod', 'HistCurrency', 'PIT Date'])
        .drop_duplicates(['ID', 'FiscalPeriod', 'HistCurrency'], keep='last')
        .rename(columns={'Value': 'TrueValue', 'PIT Date': 'TrueValue_Date'})
    )
    # Merge TrueValue back on keys (annual_src is unique per key, so the row
    # count of `working` does not change)
    working = working.merge(
        annual_src,
        on=['ID', 'FiscalPeriod', 'HistCurrency'],
        how='left'
    )

    # -------------------------------------------------------------------------
    # 2) As-of mapping for each frequency (no prior-year / no forward-fill)
    # -------------------------------------------------------------------------

    # Annual (A/B) as-of
    src_A = working.loc[
        working['Frequency'].isin(['A', 'B']) & working['Value'].notna(),
        base_keys + ['PIT Date', 'Value']
    ].copy()
    vA, dA = asof_numpy(working, src_A, by_cols=base_keys)
    working['A'], working['A_Date'] = vA, dA
    # Origin fiscal period for A
    working['A_OriginFP'] = np.where(
        working['A'].notna(), working['FiscalPeriod'], np.nan
    )

    # Quarterly (Q/C), semiannual (S/F) and trimester (T/K) as-of, one pass
    # per period number within each family
    for prefix, num_col, freqs, numbers in sub_periods:
        src = working.loc[
            working['Frequency'].isin(freqs) & working[num_col].notna(),
            base_keys + [num_col, 'PIT Date', 'Value']
        ].copy()
        for number in numbers:
            # Subset source to one specific period number
            rv = src[src[num_col] == number].drop(columns=[num_col])
            v, d = asof_numpy(working, rv, by_cols=base_keys)
            col, dcol = f'{prefix}{number}', f'{prefix}{number}_Date'
            working[col], working[dcol] = v, d

            # Set OriginFP where we have a newly filled period value
            ocol = f'{col}_OriginFP'
            if ocol not in working.columns:
                working[ocol] = np.nan
            mask = working[col].notna() & working[ocol].isna()
            working.loc[mask, ocol] = working.loc[mask, 'FiscalPeriod']

    # -------------------------------------------------------------------------
    # 3) Prepare labels and normalize dates (only as-of results, no ffill)
    # -------------------------------------------------------------------------
    working = working.sort_values(['ID', 'HistCurrency', 'FiscalPeriod', 'PIT Date'])

    value_labels  = period_vals
    date_labels   = period_dates
    origin_labels = [f'{lbl}_OriginFP' for lbl in value_labels]

    # Ensure all date columns are valid datetimes at day precision
    for c in date_labels:
        if c in working.columns:
            working[c] = pd.to_datetime(working[c], errors='coerce').dt.floor('D')

    # -------------------------------------------------------------------------
    # 4) AnnPITValue with new logic (A + Q1..Q4 sum, zeros allowed)
    # -------------------------------------------------------------------------
    working['AnnPITValue'] = working.apply(
        pick_annpit_sum_with_origin,
        axis=1
    )

    # -------------------------------------------------------------------------
    # 5) QC: Future-date check (period date > PIT Date)
    # -------------------------------------------------------------------------
    # Order matters here: it is the order in which the counts are printed
    date_cols_all = [
        'A_Date',
        'Q1_Date', 'Q2_Date', 'Q3_Date', 'Q4_Date',
        'S1_Date', 'S2_Date',
        'T1_Date', 'T2_Date', 'T3_Date'
    ]
    # Only use date columns that actually exist
    present = [c for c in date_cols_all if c in working.columns]
    viol_counts = {}
    any_mask = pd.Series(False, index=working.index)

    # For each period date column, check if it's after PIT Date
    for c in present:
        m = (
            working[c].notna() &
            working['PIT Date'].notna() &
            (pd.to_datetime(working[c], errors='coerce') > working['PIT Date'])
        )
        # Count violations per column
        viol_counts[c] = int(m.sum())
        # Track rows with any violation across all period dates
        any_mask |= m

    total_future_viol = int(any_mask.sum())
    print("\n=== Future-date check (period dates > PIT Date) ===")
    print("Per-label violations:", viol_counts)
    print(f"Rows with ANY future-dated period value: {total_future_viol}")
    # Flag rows with at least one future-date violation
    working['HasFutureDateError'] = any_mask

    # -------------------------------------------------------------------------
    # 6) AnnPITValue_Pct and quality filter
    # -------------------------------------------------------------------------
    # Percentage of AnnPITValue relative to TrueValue (%); left missing when
    # either value is missing or TrueValue is zero
    working['AnnPITValue_Pct'] = np.where(
        working['AnnPITValue'].notna() &
        working['TrueValue'].notna() &
        (working['TrueValue'] != 0),
        (working['AnnPITValue'] / working['TrueValue']) * 100,
        np.nan
    )

    # Summary BEFORE dropping outliers
    pre_stats = summarize_pct(working['AnnPITValue_Pct'])
    print("\n=== AnnPITValue_Pct summary — BEFORE quality drop ===")
    for k, v in pre_stats.items():
        print(f"{k:>20}: {v}")

    pct = working['AnnPITValue_Pct']
    # Flag infinities
    is_inf = np.isinf(pct)
    # Flag finite out-of-range values outside [25, 250]
    is_finite = np.isfinite(pct)
    out_of_range = is_finite & ((pct > 250) | (pct < 25))
    # Combined drop mask: infinities or out-of-range finite values
    # (rows with a missing percentage are kept)
    to_drop_quality = is_inf | out_of_range

    # Count dropped rows due to quality rules
    dropped_quality_rows = int(to_drop_quality.sum())
    print(f"\nRows to drop due to AnnPITValue_Pct (±inf or >250 or <25): {dropped_quality_rows:,}")

    # Keep only rows that pass the quality filter
    working = working.loc[~to_drop_quality].copy()

    # Summary AFTER dropping outliers
    post_stats = summarize_pct(working['AnnPITValue_Pct'])
    print("\n=== AnnPITValue_Pct summary — AFTER quality drop ===")
    if post_stats:
        for k, v in post_stats.items():
            print(f"{k:>20}: {v}")
    else:
        print("No finite values remain after the quality drop.")

    # -------------------------------------------------------------------------
    # 7) Final columns and cleanup
    # -------------------------------------------------------------------------
    # Core columns that describe each row
    base_cols = [
        'ID', 'CompanyName', 'ImplCountry', 'CurrentCurrency', 'HistCurrency',
        'PIT Date', 'Frequency', 'UpdateCode', 'FiscalPeriod', 'FYE Month',
        'ItemCode', 'Value', 'Str_FiscalPrd'
    ]

    # Period-related columns: each date column followed by its value column
    freq_cols = []
    for p in period_vals:
        freq_cols += [f'{p}_Date', p]

    # Columns we want to keep in the final output
    keep_cols = (
        [c for c in base_cols if c in working.columns] +
        ['TrueValue', 'AnnPITValue', 'AnnPITValue_Pct', 'HasFutureDateError'] +
        [c for c in freq_cols if c in working.columns]
    )

    # Drop helper columns such as OriginFP and intermediate numeric helpers
    drop_cols = [c for c in working.columns
                 if c.endswith('_OriginFP') or c in ['QNUM', 'SNUM', 'TNUM', 'TrueValue_Date']]
    working.drop(columns=drop_cols, inplace=True, errors='ignore')

    # Reorder to the final column set
    special_processed = working.reindex(columns=keep_cols)

    # -------------------------------------------------------------------------
    # 8) Save outputs (Temp_file_path_DP / base_output_filename checked above)
    # -------------------------------------------------------------------------
    # Full export path and write to pipe-delimited text
    out_full = os.path.join(Temp_file_path_DP, f"{base_output_filename}.txt")
    special_processed.to_csv(out_full, sep='|', index=False)
    print("\nSaved full:", out_full)

    # Subset export with a small selection of columns
    subset_cols = ["ID", "PIT Date", "CompanyName", "HistCurrency", "FiscalPeriod", "AnnPITValue"]
    subset_cols_existing = [col for col in subset_cols if col in special_processed.columns]
    subset_df = special_processed[subset_cols_existing].copy()
    out_subset = os.path.join(Temp_file_path_DP, f"{base_output_filename}_subset.txt")
    subset_df.to_csv(out_subset, sep='|', index=False)
    print("Saved subset:", out_subset)
    del subset_df  # free some memory

    # -------------------------------------------------------------------------
    # 9) Row-accounting overview
    # -------------------------------------------------------------------------
    output_rows = len(special_processed)
    print("\n=== Row Accounting ===")
    print(f"Input rows:                     {input_rows:,}")
    print(f"Excluded by Frequency (E/L/R/U):{excluded_rows:,}")
    print(f"Dropped by quality (Pct rules): {dropped_quality_rows:,}")
    print(f"Output rows (final):            {output_rows:,}")

    # Sanity check: excluded + dropped + final should equal original
    check_total = excluded_rows + dropped_quality_rows + output_rows
    print(f"Check: excluded + dropped + output = {check_total:,}")
    if check_total == input_rows:
        print("Row counts reconcile exactly.")
    else:
        print(f"Mismatch of {input_rows - check_total:+,} rows.")

    # Release memory held by intermediates that are no longer referenced
    gc.collect()

else:
    # Nothing to do if special_encoded is not defined or is None
    print("special_encoded not found or None; skipping.")

Input dataset contains 1,381,622 rows before processing.


=== Future-date check (period dates > PIT Date) ===
Per-label violations: {'A_Date': 0, 'Q1_Date': 0, 'Q2_Date': 0, 'Q3_Date': 0, 'Q4_Date': 0, 'S1_Date': 0, 'S2_Date': 0, 'T1_Date': 0, 'T2_Date': 0, 'T3_Date': 0}
Rows with ANY future-dated period value: 0

=== AnnPITValue_Pct summary — BEFORE quality drop ===
         finite_rows: 615196
                mean: 19679.64721642216
              median: 100.0
winsorized_mean_1pct: 97.67438845014577
                 p10: 100.0
                 p20: 100.0
                 p30: 100.0
                 p40: 100.0
                 p50: 100.0
                 p60: 100.0
                 p70: 100.0
                 p80: 100.0
                 p90: 100.0

Rows to drop due to AnnPITValue_Pct (±inf or >250 or <25): 16,579

=== AnnPITValue_Pct summary — AFTER quality drop ===
         finite_rows: 598617
                mean: 99.88085057765244
              median: 100.0
winsorized_mean_1pct: 

### Special 9

#### Set Index

In [291]:
# =============================================================================
# SELECT A SINGLE SPECIAL_* ITEM AND PREPARE PATHS
# =============================================================================
# This cell:
#   1. Chooses which Special_* item (from special_vars) should be processed.
#   2. Validates that special_vars and Temp_file_path_DP are available.
#   3. Builds the input file path for the selected "work_subset_<item>.txt".
#   4. Sets a base_output_filename for downstream output files.
#   5. Ensures the data-preparation temp directory exists.
#
# Usage:
#   - Adjust `special_index` to run a different Special_* dataset (e.g., 2, 3, 10 ...).
#   - Assumes `special_vars` was created in the categorization step and
#     `Temp_file_path_DP` was defined in the environment setup.

# === Choose the Special_* entry to process ===
special_index = 9  # Change this to run another dataset, e.g. 10

# special_vars is expected to map keys like 'Special_1' to item names
assert 'special_vars' in globals(), "special_vars dict not found in globals()."

# Resolve the dictionary key for the chosen index and look up its item name
item_key = "Special_{}".format(special_index)
target_item_name = special_vars.get(item_key)
assert target_item_name, f"{item_key} not found in special_vars."

print(f"Selected: {item_key}  ->  ItemName: '{target_item_name}'")

# === Paths (built from globals defined in earlier cells) ===
assert 'Temp_file_path_DP' in globals(), "Temp_file_path_DP not found."

# Input file of the selected item inside the data-preparation temp directory
file_name = "work_subset_{}.txt".format(target_item_name)
file_path = os.path.join(Temp_file_path_DP, file_name)

# Stem shared by the output files that the "Special" pipeline writes
base_output_filename = "Special_{}_complete".format(target_item_name)

# Create the temp directory (including parents) if it is not there yet
Path(Temp_file_path_DP).mkdir(parents=True, exist_ok=True)


Selected: Special_9  ->  ItemName: 'Net_Cash_Flow___Financing'


#### Import relevant data



In [292]:
# =============================================================================
# LOAD THE FULL DATASET FOR THE SELECTED SPECIAL ITEM
# =============================================================================
# This cell:
#   1. Uses `target_item_name` and `file_path` (defined in the previous cell)
#      to load the corresponding work_subset file.
#   2. Imports the file using `import_file_to_dataframe`.
#   3. Performs safety checks for existence and emptiness.
#   4. Shows a preview of the loaded dataset.
#   5. Falls back to an empty DataFrame if loading fails.
#   6. Runs garbage collection afterwards.
# =============================================================================

print(f"\nImporting full dataset for Item: '{target_item_name}' ...")

if not os.path.exists(file_path):
    # No input file on disk: continue with an empty placeholder frame
    print(f"File not found: {file_path}")
    special_raw = pd.DataFrame()
else:
    special_raw = import_file_to_dataframe(file_path)

    if special_raw is None or special_raw.empty:
        # The loader returned nothing usable; normalise to an empty frame
        print("Dataset appears empty or could not be loaded.")
        special_raw = pd.DataFrame()
    else:
        print(f"Full dataset loaded successfully: {len(special_raw):,} rows total.")
        # Rich preview when display() works, plain-text preview otherwise
        try:
            display(special_raw.head())
        except Exception:
            print(special_raw.head().to_string(index=False))

# Last expression of the cell: the notebook echoes the collector's return value
gc.collect()



Importing full dataset for Item: 'Net_Cash_Flow___Financing' ...
Full dataset loaded successfully: 1,975,430 rows total.


Unnamed: 0,ID,CompanyName,ImplCountry,CurrentCurrency,HistCurrency,PIT Date,Frequency,UpdateCode,FiscalPeriod,FYE Month,ItemCode,Value
0,C02520200,ACINDAR INDUSTRIA ARGENTINA DE ACEROS SA,Argentina,Ars,Ars,1996-05-03,A,3,1994,June,4890,40.829043
1,C02520200,ACINDAR INDUSTRIA ARGENTINA DE ACEROS SA,Argentina,Ars,Ars,1996-05-03,A,3,1995,June,4890,-45.73678
2,C02520200,ACINDAR INDUSTRIA ARGENTINA DE ACEROS SA,Argentina,Ars,Ars,1996-11-01,A,3,1996,June,4890,-12.83298
3,C02520200,ACINDAR INDUSTRIA ARGENTINA DE ACEROS SA,Argentina,Ars,Ars,1997-10-31,A,3,1997,June,4890,72.094518
4,C02520200,ACINDAR INDUSTRIA ARGENTINA DE ACEROS SA,Argentina,Ars,Ars,1998-12-04,A,3,1998,September,4890,-55.529309


0

#### Encode Frequency Code (Check of output required!)

In [293]:
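# =============================================================================
# SUMMARY
# =============================================================================
# This cell:
#   - Defines `last2`, which returns the last two digits of a number as a
#     zero-padded string (None for missing input).
#   - Defines `add_str_fiscalprd`, which derives a `Str_FiscalPrd` label from
#     `Frequency` and `FiscalPeriod` (annual 'Y92', quarterly 'Q2Y00',
#     semiannual 'S1Y00', trimester 'T3Y00') and rebuilds a four-digit year
#     from the two-digit year in that label.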

def last2(n):
    """Return the last two digits of ``n`` as a zero-padded string.

    Returns ``None`` when ``n`` is missing (``pd.isna``).
    """
    if pd.isna(n):
        return None
    return f"{int(n):04d}"[-2:]


def add_str_fiscalprd(df):
    """Add a ``Str_FiscalPrd`` label and rewrite ``FiscalPeriod`` as a year.

    The input frame is not modified; a copy is returned.

    Encoding, by upper-cased ``Frequency``:
      * ``A``/``B``            -> ``"Y<yy>"``      (FiscalPeriod is the year)
      * ``C``/``Q``/``E``/``R`` -> ``"Q<n>Y<yy>"``  (n = FiscalPeriod % 4 + 1,
                                                   year = FiscalPeriod // 4)
      * ``F``/``S``            -> ``"S<n>Y<yy>"``  (modulo / floor-divide by 2)
      * ``K``/``T``/``L``/``U`` -> ``"T<n>Y<yy>"``  (modulo / floor-divide by 3)
    Rows with any other frequency keep ``Str_FiscalPrd`` missing.

    ``FiscalPeriod`` is then replaced by the four-digit year recovered from
    the two-digit label: ``yy >= 80`` maps to ``19yy``, anything else to
    ``20yy``. Annual rows whose recovered year differs from the original
    ``FiscalPeriod`` (e.g. years outside that 1980-2079 window) are printed
    as discrepancies.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``Frequency`` and ``FiscalPeriod``. ``ID`` is only
        needed when discrepancies have to be shown.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` with ``Str_FiscalPrd`` added and ``FiscalPeriod``
        replaced by the recovered year.
    """
    df = df.copy()
    df["Frequency"] = df["Frequency"].str.upper().fillna("")
    df['Original_FiscalPeriod'] = df['FiscalPeriod']

    fp = pd.to_numeric(df["FiscalPeriod"], errors="coerce")

    m_quarter = df["Frequency"].isin(["C", "Q", "E", "R"])
    m_AB      = df["Frequency"].isin(["A", "B"])
    m_FS      = df["Frequency"].isin(["F", "S"])
    m_KTLU    = df["Frequency"].isin(["K", "T", "L", "U"])

    # The label column must be object-typed from the start: writing strings
    # into a float64 NaN column raises pandas' "incompatible dtype"
    # FutureWarning, and an all-NaN float column has no ``.str`` accessor.
    df["Str_FiscalPrd"] = np.full(len(df), np.nan, dtype=object)

    def _encode_subperiods(mask, prefix, periods_per_year):
        """Write '<prefix><n>Y<yy>' labels for the rows selected by mask."""
        part = ((fp % periods_per_year) + 1).where(mask)
        year = (fp // periods_per_year).where(mask).apply(last2)
        df.loc[mask, "Str_FiscalPrd"] = (
            prefix + part.astype("Int64").astype(str) + "Y" + year.fillna('')
        )

    _encode_subperiods(m_quarter, "Q", 4)

    ab_year = fp.where(m_AB).apply(last2)
    df.loc[m_AB, "Str_FiscalPrd"] = "Y" + ab_year.fillna('')

    _encode_subperiods(m_FS, "S", 2)
    _encode_subperiods(m_KTLU, "T", 3)

    # Recover a four-digit year from the two-digit suffix of the label.
    year_part = df['Str_FiscalPrd'].str.extract(r'Y(\d{2})', expand=False)
    year_numeric = pd.to_numeric(year_part, errors='coerce')

    def _four_digit_year(two_digit):
        """Map yy >= 80 to 19yy and yy < 80 to 20yy; keep missing as NaN."""
        if pd.isna(two_digit):
            return np.nan
        return (1900 if two_digit >= 80 else 2000) + int(two_digit)

    df['ImplFiscPer_Calculated'] = year_numeric.apply(_four_digit_year)

    # Consistency check on annual rows: the recovered year must equal the
    # original FiscalPeriod (both missing also counts as a match).
    annual_rows = df[m_AB]
    original_year = pd.to_numeric(annual_rows['Original_FiscalPeriod'], errors='coerce')
    recovered_year = annual_rows['ImplFiscPer_Calculated']
    matches = (recovered_year == original_year) | (
        recovered_year.isna() & original_year.isna()
    )
    discrepancy_rows = annual_rows[~matches]

    if not discrepancy_rows.empty:
        print("\nDiscrepancies found between original FiscalPeriod and calculated ImplFiscPer for Annual (A, B) frequencies:")
        preview = discrepancy_rows[
            ['ID', 'Frequency', 'Original_FiscalPeriod', 'Str_FiscalPrd', 'ImplFiscPer_Calculated']
        ].head()
        try:
            display(preview)
        except NameError:
            # display() only exists inside IPython/Jupyter sessions.
            print(preview.to_string(index=False))
        print(f"Total discrepancies found for Annual frequencies: {len(discrepancy_rows)}")
    else:
        print("\nNo discrepancies found between original FiscalPeriod and calculated ImplFiscPer for Annual (A, B) frequencies.")

    df['FiscalPeriod'] = df['ImplFiscPer_Calculated']
    df.drop(columns=['Original_FiscalPeriod', 'ImplFiscPer_Calculated'], inplace=True)

    return df


# =============================================================================
# Driver: encode special_raw when it is available and holds data
# =============================================================================
if 'special_raw' not in globals() or special_raw is None or special_raw.empty:
    # Nothing to encode; downstream cells test special_encoded for None.
    print("special_raw not found or empty. Cannot perform encoding.")
    special_encoded = None
else:
    print(f"Applying encoding to Special dataset for '{target_item_name}' ...")
    special_encoded = add_str_fiscalprd(special_raw)
    display(special_encoded.head())


Applying encoding to Special dataset for 'Net_Cash_Flow___Financing' ...


  df.loc[m_quarter, "Str_FiscalPrd"] = (



No discrepancies found between original FiscalPeriod and calculated ImplFiscPer for Annual (A, B) frequencies.


Unnamed: 0,ID,CompanyName,ImplCountry,CurrentCurrency,HistCurrency,PIT Date,Frequency,UpdateCode,FiscalPeriod,FYE Month,ItemCode,Value,Str_FiscalPrd
0,C02520200,ACINDAR INDUSTRIA ARGENTINA DE ACEROS SA,Argentina,Ars,Ars,1996-05-03,A,3,1994,June,4890,40.829043,Y94
1,C02520200,ACINDAR INDUSTRIA ARGENTINA DE ACEROS SA,Argentina,Ars,Ars,1996-05-03,A,3,1995,June,4890,-45.73678,Y95
2,C02520200,ACINDAR INDUSTRIA ARGENTINA DE ACEROS SA,Argentina,Ars,Ars,1996-11-01,A,3,1996,June,4890,-12.83298,Y96
3,C02520200,ACINDAR INDUSTRIA ARGENTINA DE ACEROS SA,Argentina,Ars,Ars,1997-10-31,A,3,1997,June,4890,72.094518,Y97
4,C02520200,ACINDAR INDUSTRIA ARGENTINA DE ACEROS SA,Argentina,Ars,Ars,1998-12-04,A,3,1998,September,4890,-55.529309,Y98


#### Annualize data with most recent information (Check of output required!)

In [294]:
# @title
# =============================================================================
# SUMMARY
# =============================================================================
# This script:
#   - Implements a fast "as-of join" between two DataFrames based on PIT dates
#     and key columns (asof_numpy).
#   - Provides helpers for percentile summaries and winsorized statistics.
#   - Builds annualized "AnnPITValue" values from:
#       * true annual data (A/B frequency) and
#       * sums of quarterly data (Q1..Q4) when available.
#   - Performs various quality checks (future-dated values, extreme percentages).
#   - Produces a processed "special_processed" DataFrame and saves:
#       * a full export and
#       * a subset export.
#   - Prints a row-accounting overview so drops and exclusions are transparent.
# =============================================================================


# ---------- Helper: fast as-of (right.PIT <= left.PIT) ----------
def _key(fr, cols):
    """
    Build one string key per row by casting the given columns to str and
    joining them with '||'. Used as the group key for the as-of lookup.
    """
    # Convert all key columns to string and join them row-wise with '||'
    return fr[cols].astype(str).agg('||'.join, axis=1)


def asof_numpy(left_df: pd.DataFrame, right_df: pd.DataFrame, by_cols: list[str]):
    """
    For each row in left_df, find the latest (as-of) Value from right_df
    with matching by_cols and right_df['PIT Date'] <= left_df['PIT Date'].

    Rows of either frame with a missing key column, 'PIT Date' or (right
    side only) 'Value' are ignored. PIT dates are floored to whole days
    before comparison.

    Results are written by row *position*, so left_df may carry any index
    (filtered, sorted, duplicated); element i of each output always belongs
    to the i-th row of left_df.

    Parameters
    ----------
    left_df : pd.DataFrame
        Rows to look values up for; needs by_cols and 'PIT Date'.
    right_df : pd.DataFrame
        Source rows; needs by_cols, 'PIT Date' and 'Value'.
    by_cols : list[str]
        Columns that must match exactly between both frames.

    Returns
    -------
    out_vals : np.ndarray
        Array of float values (same length as left_df) containing the matched
        values from right_df (or NaN if none found).
    out_dates : np.ndarray
        Array of datetime64 values containing the matched dates (or NaT).
    """
    by_cols = list(by_cols)

    # Initialize output arrays with NaN/NaT
    out_vals  = np.full(len(left_df), np.nan, dtype='float64')
    out_dates = np.full(len(left_df), 'NaT', dtype='datetime64[ns]')

    # Required columns in left/right for the as-of join
    left_req  = by_cols + ['PIT Date']
    right_req = by_cols + ['PIT Date', 'Value']

    # Boolean arrays marking rows with all required fields present
    lmask = left_df[left_req].notna().all(axis=1).to_numpy()
    rmask = right_df[right_req].notna().all(axis=1).to_numpy()

    # If no valid rows on either side, return the all-missing outputs
    if not lmask.any() or not rmask.any():
        return out_vals, out_dates

    # Work on copies of the filtered frames
    l = left_df.loc[lmask, left_req].copy()
    r = right_df.loc[rmask, right_req].copy()

    # Normalize PIT dates to daily granularity
    l['PIT Date'] = pd.to_datetime(l['PIT Date'], errors='coerce').dt.floor('D')
    r['PIT Date'] = pd.to_datetime(r['PIT Date'], errors='coerce').dt.floor('D')

    # Build grouping keys on both sides
    l['__k'] = _key(l, by_cols)
    r['__k'] = _key(r, by_cols)

    # Sort right side by key and PIT Date so we can binary-search later
    r = r.sort_values(['__k', 'PIT Date']).reset_index(drop=True)

    # Extract numpy arrays for fast vectorized operations
    rk   = r['__k'].to_numpy()
    rdt  = r['PIT Date'].to_numpy()
    rval = r['Value'].to_numpy()

    # Determine contiguous slices of rows for each unique key in right_df
    uniq, first = np.unique(rk, return_index=True)
    slices = {}
    for i, k in enumerate(uniq):
        s = first[i]                         # start index for this key
        e = first[i + 1] if i + 1 < len(first) else len(r)  # end index
        slices[k] = (rdt[s:e], rval[s:e])    # store date and value slices

    # Positions (not index labels) of the usable left rows inside left_df.
    # The output arrays are positional, so labels must never be used here.
    l_pos = np.flatnonzero(lmask)
    # Keys and dates of left rows
    lk    = l['__k'].to_numpy()
    ldt   = l['PIT Date'].to_numpy()

    # Sort left rows by key (stable sort) for block processing
    order = np.argsort(lk, kind='mergesort')
    sk, sd, sp = lk[order], ldt[order], l_pos[order]

    # Iterate over blocks of the same key in left_df
    i = 0
    n = len(sk)
    while i < n:
        k = sk[i]  # current key
        j = i + 1
        # Find the end of this key's block
        while j < n and sk[j] == k:
            j += 1

        # Block of PIT dates and corresponding output positions for this key
        block_dates = sd[i:j]
        block_pos   = sp[i:j]

        # Only process if the key exists in the right-hand slices
        if k in slices:
            r_dates, r_vals = slices[k]

            # For each left PIT date, find index of right PIT <= left PIT
            pos   = np.searchsorted(r_dates, block_dates, side='right') - 1
            valid = pos >= 0  # only those with at least one right date <= left date

            # Write results back to output arrays
            if np.any(valid):
                out_vals[block_pos[valid]]  = r_vals[pos[valid]]
                out_dates[block_pos[valid]] = r_dates[pos[valid]]

        # Move to the next block
        i = j

    return out_vals, out_dates


# ---------- Small helpers ----------
def pctile(s, q):
    """
    Return the q-quantile of s (linear interpolation); NaN if the call fails.
    """
    try:
        value = s.quantile(q, interpolation='linear')
    except Exception:
        # Best-effort helper: any failure is reported as "no value".
        value = np.nan
    return value


def summarize_pct(series: pd.Series):
    """
    Summarize a numeric series: count, mean, median, 1%-winsorized mean and
    the deciles p10..p90. Infinite and missing entries are ignored; an empty
    dict is returned when nothing finite remains.
    """
    # Treat +/-inf like missing values and drop both
    finite = series.replace([np.inf, -np.inf], np.nan).dropna()
    if finite.empty:
        return {}

    # winsorize is handed its own writable copy of the data
    clipped = winsorize(finite.to_numpy().copy(), limits=[0.01, 0.01])

    summary = {
        "finite_rows": len(finite),
        "mean": finite.mean(),
        "median": finite.median(),
        "winsorized_mean_1pct": clipped.mean(),
    }
    # Deciles p10, p20, ..., p90
    for decile in range(10, 100, 10):
        summary[f"p{decile}"] = pctile(finite, decile / 100)
    return summary


# ---------- Period prioritization ----------
# Priority ranking for period labels when deciding between multiple candidates
_PERIOD_PRIORITY = {
    'A': 100,  # Full annual has highest priority
    'Q4': 90,
    'T3': 80,
    'S2': 70,
    'Q3': 60,
    'T2': 50,
    'S1': 40,
    'Q2': 30,
    'T1': 20,
    'Q1': 10,
}


def _label_from_colname(colname: str) -> str:
    """
    Map column names to period labels used in _PERIOD_PRIORITY.
    Currently only special-cases 'A'.
    """
    return 'A' if colname == 'A' else colname


# ---------- Helpers for AnnPITValue using A + Q1..Q4 sum ----------
def full_year_from_quarters(row, pit, cutoff):
    """
    Combine Q1..Q4 of a row into one full-year candidate.

    Every quarter must provide a value, a date and an OriginFP, and its date
    must lie inside [cutoff, pit]. As soon as one quarter fails, the function
    gives up and returns the "missing" triple.

    Parameters
    ----------
    row : pd.Series
        Row from the working DataFrame (accessed via ``row.get``).
    pit : datetime-like
        PIT Date of the row; upper bound for the quarter dates.
    cutoff : datetime-like
        Lower bound for the quarter dates.

    Returns
    -------
    (dt, val_sum, origin_fp) or (NaT, NaN, NaN)
        dt        : latest date among Q1..Q4
        val_sum   : sum of the four quarter values
        origin_fp : largest OriginFP among Q1..Q4
    """
    missing = (pd.NaT, np.nan, np.nan)
    collected = []

    for quarter in (1, 2, 3, 4):
        name = f'Q{quarter}'
        value = row.get(name, np.nan)
        raw_date = row.get(f'{name}_Date', pd.NaT)
        origin = row.get(f'{name}_OriginFP', np.nan)

        # All three pieces must be present for this quarter
        if pd.isna(value) or pd.isna(raw_date) or pd.isna(origin):
            return missing

        # The date must parse and fall inside the [cutoff, pit] window
        stamp = pd.to_datetime(raw_date, errors='coerce')
        if pd.isna(stamp) or not (cutoff <= stamp <= pit):
            return missing

        collected.append((float(value), stamp, int(origin)))

    # All four quarters qualified: aggregate them
    values, stamps, origins = zip(*collected)
    return max(stamps), float(np.nansum(values)), max(origins)


def pick_annpit_sum_with_origin(row):
    """
    Choose the AnnPITValue for one row from two possible candidates:
    the annual value (A) and the sum of the four quarters (label 'Q4').

    A candidate only qualifies when its value, date and OriginFP are present
    and its date lies within the 365 days up to and including the row's
    PIT Date. Preference order:
      1. A    with OriginFP equal to the row's FiscalPeriod
      2. Q4   with OriginFP equal to the row's FiscalPeriod
      3. A    with OriginFP one year before the row's FiscalPeriod
      4. Q4   with OriginFP one year before the row's FiscalPeriod
      5. any remaining candidate, ranked by (period priority, date)

    Returns NaN when the PIT Date is missing or no candidate qualifies.
    """
    pit_date = row['PIT Date']
    if pd.isna(pit_date):
        return np.nan

    # Oldest date a candidate may carry
    window_start = pit_date - timedelta(days=365)

    # Row's fiscal period as int, or None when missing / not convertible
    raw_fp = row.get('FiscalPeriod', np.nan)
    try:
        row_fp = None if pd.isna(raw_fp) else int(raw_fp)
    except Exception:
        row_fp = None

    # Candidate tuples: (label, period_priority, date, value, origin_fp)
    pool = []

    # Annual candidate (a value of 0 is allowed)
    annual_val = row.get('A', np.nan)
    annual_dt = row.get('A_Date', pd.NaT)
    annual_fp = row.get('A_OriginFP', np.nan)
    if not (pd.isna(annual_val) or pd.isna(annual_dt) or pd.isna(annual_fp)):
        annual_dt = pd.to_datetime(annual_dt, errors='coerce')
        if pd.notna(annual_dt) and (window_start <= annual_dt <= pit_date):
            pool.append(
                ('A', _PERIOD_PRIORITY['A'], annual_dt, float(annual_val), int(annual_fp))
            )

    # Quarter-sum candidate (a sum of 0 is allowed)
    qsum_dt, qsum_val, qsum_fp = full_year_from_quarters(row, pit_date, window_start)
    if not (pd.isna(qsum_val) or pd.isna(qsum_dt) or pd.isna(qsum_fp)):
        pool.append(
            ('Q4', _PERIOD_PRIORITY['Q4'], qsum_dt, float(qsum_val), int(qsum_fp))
        )

    if not pool:
        return np.nan

    # Only candidates with a non-NaN value may be chosen
    usable = [cand for cand in pool if not np.isnan(cand[3])]

    def relation(cand):
        """Classify the candidate's OriginFP relative to the row's FiscalPeriod."""
        origin = cand[4]
        if row_fp is None or origin is None:
            return 'unknown'
        if origin == row_fp:
            return 'same'
        if origin == row_fp - 1:
            return 'prior'
        return 'other'

    def by_date(cand):
        return cand[2]

    def by_priority_then_date(cand):
        return (cand[1], cand[2])

    # Steps 1-4: walk the preference list and return the first hit
    preference = (
        ('same', 'A', by_date),
        ('same', 'Q4', by_priority_then_date),
        ('prior', 'A', by_date),
        ('prior', 'Q4', by_priority_then_date),
    )
    for wanted_relation, wanted_label, ranking in preference:
        hits = [
            cand for cand in usable
            if cand[0] == wanted_label and relation(cand) == wanted_relation
        ]
        if hits:
            return max(hits, key=ranking)[3]

    # Step 5: anything left (other/unknown relation)
    if usable:
        return max(usable, key=by_priority_then_date)[3]

    # Last resort when every candidate value was NaN
    return 0.0


# ============================ MAIN ============================
# Pipeline driver for the annualization step. Runs only when the global
# `special_encoded` exists and is not None; otherwise a skip message is
# printed. Reads the globals `Temp_file_path_DP` and `base_output_filename`,
# creates the global `special_processed`, and writes two pipe-delimited files.
if 'special_encoded' in globals() and special_encoded is not None:
    # Remember the number of input rows for row-accounting
    input_rows = len(special_encoded)
    print(f"Input dataset contains {input_rows:,} rows before processing.\n")

    # Work on a copy so we do not mutate the original DataFrame
    working = special_encoded.copy()

    # Exclude certain frequencies (E, L, R, U) from further processing
    excl_mask = working['Frequency'].astype(str).str.upper().isin(['E', 'L', 'R', 'U'])
    excluded_rows = int(excl_mask.sum())
    working = working.loc[~excl_mask].copy()

    # Convert key columns to appropriate types; values that cannot be
    # parsed become NaT/NaN because of errors='coerce'
    working['PIT Date'] = pd.to_datetime(
        working['PIT Date'], format='%Y-%m-%d', errors='coerce'
    ).dt.floor('D')
    working['FiscalPeriod'] = pd.to_numeric(working['FiscalPeriod'], errors='coerce')
    working['Value']        = pd.to_numeric(working['Value'], errors='coerce')

    # Ensure some ID-like columns are strings
    # NOTE(review): astype(str) also stringifies missing entries, so these
    # columns no longer register as missing in later notna() checks — confirm
    # this is intended.
    for c in ['ID', 'HistCurrency', 'ItemCode', 'Frequency', 'Str_FiscalPrd']:
        if c in working.columns:
            working[c] = working[c].astype(str)

    # Parse Q/S/T numbers from Str_FiscalPrd (e.g. 'Q1Y23' -> QNUM=1);
    # labels that do not match the pattern yield NaN
    working['QNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^Q([1-4])Y', expand=False),
        errors='coerce'
    )
    working['SNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^S([1-2])Y', expand=False),
        errors='coerce'
    )
    working['TNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^T([1-3])Y', expand=False),
        errors='coerce'
    )

    # Define all period/value and period/date column names:
    # Q1..Q4, S1..S2, T1..T3 and A, plus the matching '<label>_Date' names
    period_vals = [f'Q{i}' for i in range(1, 5)] + \
                  [f'S{i}' for i in range(1, 3)] + \
                  [f'T{i}' for i in range(1, 4)] + ['A']
    period_dates = [f'{p}_Date' for p in [f'Q{i}' for i in range(1,5)] + \
                                       [f'S{i}' for i in range(1,3)] + \
                                       [f'T{i}' for i in range(1,4)]] + ['A_Date']

    # Ensure all period value columns exist (initialize if missing);
    # step 2 below overwrites every one of them with as-of results
    for c in period_vals:
        if c not in working.columns:
            working[c] = np.nan

    # Ensure all period date columns exist (initialize if missing)
    for c in period_dates:
        if c not in working.columns:
            working[c] = pd.NaT

    # Keys used to identify time series in as-of joins
    base_keys = ['ID', 'HistCurrency', 'ItemCode', 'FiscalPeriod']

    # -------------------------------------------------------------------------
    # 1) TrueValue from annuals: build reference "TrueValue" per ID/FiscalPeriod
    # -------------------------------------------------------------------------
    # For every (ID, FiscalPeriod, HistCurrency) keep the annual (A/B) row
    # with the latest PIT Date; its Value becomes TrueValue.
    mask_annual = working['Frequency'].isin(['A', 'B']) & working['Value'].notna()
    annual_src = (
        working.loc[mask_annual,
                    ['ID', 'FiscalPeriod', 'HistCurrency', 'PIT Date', 'Value']]
        .sort_values(['ID', 'FiscalPeriod', 'HistCurrency', 'PIT Date'])
        .drop_duplicates(['ID', 'FiscalPeriod', 'HistCurrency'], keep='last')
        .rename(columns={'Value': 'TrueValue', 'PIT Date': 'TrueValue_Date'})
    )
    # Merge TrueValue back on keys (left join: every working row is kept)
    working = working.merge(
        annual_src,
        on=['ID', 'FiscalPeriod', 'HistCurrency'],
        how='left'
    )

    # -------------------------------------------------------------------------
    # 2) As-of mapping for each frequency (no prior-year / no forward-fill)
    # -------------------------------------------------------------------------

    # Annual (A/B) as-of
    src_A = working.loc[
        working['Frequency'].isin(['A', 'B']) & working['Value'].notna(),
        base_keys + ['PIT Date', 'Value']
    ].copy()
    vA, dA = asof_numpy(working, src_A, by_cols=base_keys)
    working['A'], working['A_Date'] = vA, dA
    # Origin fiscal period for A. FiscalPeriod is one of the as-of keys, so a
    # matched annual value always belongs to the row's own FiscalPeriod.
    working['A_OriginFP'] = np.where(
        working['A'].notna(), working['FiscalPeriod'], np.nan
    )

    # Quarterly (Q/C) as-of, by quarter number
    src_Q = working.loc[
        working['Frequency'].isin(['Q', 'C']) & working['QNUM'].notna(),
        base_keys + ['QNUM', 'PIT Date', 'Value']
    ].copy()
    for q in (1, 2, 3, 4):
        # Subset source to a specific quarter
        rv = src_Q[src_Q['QNUM'] == q].drop(columns=['QNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'Q{q}', f'Q{q}_Date'
        working[col], working[dcol] = v, d

        # Set OriginFP where we have a newly filled quarter
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        mask = working[col].notna() & working[ocol].isna()
        working.loc[mask, ocol] = working.loc[mask, 'FiscalPeriod']

    # Semiannual (S/F) as-of, by half-year number
    src_S = working.loc[
        working['Frequency'].isin(['S', 'F']) & working['SNUM'].notna(),
        base_keys + ['SNUM', 'PIT Date', 'Value']
    ].copy()
    for s in (1, 2):
        # Subset source to a specific half-year
        rv = src_S[src_S['SNUM'] == s].drop(columns=['SNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'S{s}', f'S{s}_Date'
        working[col], working[dcol] = v, d

        # Same OriginFP bookkeeping as for the quarters
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        mask = working[col].notna() & working[ocol].isna()
        working.loc[mask, ocol] = working.loc[mask, 'FiscalPeriod']

    # Trimester (T/K) as-of, by term number
    src_T = working.loc[
        working['Frequency'].isin(['T', 'K']) & working['TNUM'].notna(),
        base_keys + ['TNUM', 'PIT Date', 'Value']
    ].copy()
    for t in (1, 2, 3):
        # Subset source to a specific term
        rv = src_T[src_T['TNUM'] == t].drop(columns=['TNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'T{t}', f'T{t}_Date'
        working[col], working[dcol] = v, d

        # Same OriginFP bookkeeping as for the quarters
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        mask = working[col].notna() & working[ocol].isna()
        working.loc[mask, ocol] = working.loc[mask, 'FiscalPeriod']

    # -------------------------------------------------------------------------
    # 3) Prepare labels and normalize dates (only as-of results, no ffill)
    # -------------------------------------------------------------------------
    working = working.sort_values(['ID', 'HistCurrency', 'FiscalPeriod', 'PIT Date'])

    value_labels  = period_vals
    date_labels   = period_dates
    # NOTE: origin_labels is not referenced again in this cell
    origin_labels = [f'{lbl}_OriginFP' for lbl in value_labels]

    # Ensure all date columns are valid datetimes at day precision
    for c in date_labels:
        if c in working.columns:
            working[c] = pd.to_datetime(working[c], errors='coerce').dt.floor('D')

    # -------------------------------------------------------------------------
    # 4) AnnPITValue with new logic (A + Q1..Q4 sum, zeros allowed)
    # -------------------------------------------------------------------------
    # Row-wise apply: pick_annpit_sum_with_origin is called once per row
    working['AnnPITValue'] = working.apply(
        pick_annpit_sum_with_origin,
        axis=1
    )

    # -------------------------------------------------------------------------
    # 5) QC: Future-date check (period date > PIT Date)
    # -------------------------------------------------------------------------
    date_cols_all = [
        'A_Date',
        'Q1_Date', 'Q2_Date', 'Q3_Date', 'Q4_Date',
        'S1_Date', 'S2_Date',
        'T1_Date', 'T2_Date', 'T3_Date'
    ]
    # Only use date columns that actually exist
    present = [c for c in date_cols_all if c in working.columns]
    viol_counts = {}
    any_mask = pd.Series(False, index=working.index)

    # For each period date column, check if it's after PIT Date
    for c in present:
        m = (
            working[c].notna() &
            working['PIT Date'].notna() &
            (pd.to_datetime(working[c], errors='coerce') > working['PIT Date'])
        )
        # Count violations per column
        viol_counts[c] = int(m.sum())
        # Track rows with any violation across all period dates
        any_mask |= m

    total_future_viol = int(any_mask.sum())
    print("\n=== Future-date check (period dates > PIT Date) ===")
    print("Per-label violations:", viol_counts)
    print(f"Rows with ANY future-dated period value: {total_future_viol}")
    # Flag rows with at least one future-date violation (rows are only
    # flagged here, not removed)
    working['HasFutureDateError'] = any_mask

    # -------------------------------------------------------------------------
    # 6) AnnPITValue_Pct and quality filter
    # -------------------------------------------------------------------------
    # Percentage of AnnPITValue relative to TrueValue (%); NaN when either
    # value is missing or TrueValue is 0
    working['AnnPITValue_Pct'] = np.where(
        working['AnnPITValue'].notna() &
        working['TrueValue'].notna() &
        (working['TrueValue'] != 0),
        (working['AnnPITValue'] / working['TrueValue']) * 100,
        np.nan
    )

    # Summary BEFORE dropping outliers
    pre_stats = summarize_pct(working['AnnPITValue_Pct'])
    print("\n=== AnnPITValue_Pct summary — BEFORE quality drop ===")
    for k, v in pre_stats.items():
        print(f"{k:>20}: {v}")

    pct = working['AnnPITValue_Pct']
    # Flag infinities
    is_inf = np.isinf(pct)
    # Flag finite out-of-range values outside [25, 250]
    is_finite = np.isfinite(pct)
    out_of_range = is_finite & ((pct > 250) | (pct < 25))
    # Combined drop mask: infinities or out-of-range finite values.
    # Rows whose percentage is NaN match neither condition and are kept.
    to_drop_quality = is_inf | out_of_range

    # Count dropped rows due to quality rules
    dropped_quality_rows = int(to_drop_quality.sum())
    print(f"\nRows to drop due to AnnPITValue_Pct (±inf or >250 or <25): {dropped_quality_rows:,}")

    # Keep only rows that pass the quality filter
    working = working.loc[~to_drop_quality].copy()

    # Summary AFTER dropping outliers
    post_stats = summarize_pct(working['AnnPITValue_Pct'])
    print("\n=== AnnPITValue_Pct summary — AFTER quality drop ===")
    if post_stats:
        for k, v in post_stats.items():
            print(f"{k:>20}: {v}")
    else:
        print("No finite values remain after the quality drop.")

    # -------------------------------------------------------------------------
    # 7) Final columns and cleanup
    # -------------------------------------------------------------------------
    # Core columns that describe each row
    base_cols = [
        'ID', 'CompanyName', 'ImplCountry', 'CurrentCurrency', 'HistCurrency',
        'PIT Date', 'Frequency', 'UpdateCode', 'FiscalPeriod', 'FYE Month',
        'ItemCode', 'Value', 'Str_FiscalPrd'
    ]

    # Period-related columns (Dates and Values)
    freq_cols = []
    for i in range(1, 5):
        freq_cols += [f'Q{i}_Date', f'Q{i}']
    for i in range(1, 3):
        freq_cols += [f'S{i}_Date', f'S{i}']
    for i in range(1, 4):
        freq_cols += [f'T{i}_Date', f'T{i}']
    freq_cols += ['A_Date', 'A']

    # Columns we want to keep in the final output, in output order
    keep_cols = (
        [c for c in base_cols if c in working.columns] +
        ['TrueValue', 'AnnPITValue', 'AnnPITValue_Pct', 'HasFutureDateError'] +
        [c for c in freq_cols if c in working.columns]
    )

    # Drop helper columns such as OriginFP and intermediate numeric helpers
    drop_cols = [c for c in working.columns
                 if c.endswith('_OriginFP') or c in ['QNUM', 'SNUM', 'TNUM', 'TrueValue_Date']]
    working.drop(columns=drop_cols, inplace=True, errors='ignore')

    # Reorder to the final column set (columns not listed in keep_cols are
    # left out of special_processed)
    special_processed = working.reindex(columns=keep_cols)

    # -------------------------------------------------------------------------
    # 8) Save outputs (requires Temp_file_path_DP and base_output_filename)
    # -------------------------------------------------------------------------
    # Both variables must be defined in a previous setup cell.
    # NOTE: assert statements are skipped when Python runs with -O.
    assert 'Temp_file_path_DP' in globals(), "Temp_file_path_DP not found."
    assert 'base_output_filename' in globals(), "base_output_filename not found (set in Cell 0)."

    # Full export path and write to pipe-delimited text
    out_full = os.path.join(Temp_file_path_DP, f"{base_output_filename}.txt")
    special_processed.to_csv(out_full, sep='|', index=False)
    print("\nSaved full:", out_full)

    # Subset export with a small selection of columns
    subset_cols = ["ID", "PIT Date", "CompanyName", "HistCurrency", "FiscalPeriod", "AnnPITValue"]
    subset_cols_existing = [col for col in subset_cols if col in special_processed.columns]
    subset_df = special_processed[subset_cols_existing].copy()
    out_subset = os.path.join(Temp_file_path_DP, f"{base_output_filename}_subset.txt")
    subset_df.to_csv(out_subset, sep='|', index=False)
    print("Saved subset:", out_subset)
    del subset_df  # free some memory

    # -------------------------------------------------------------------------
    # 9) Row-accounting overview
    # -------------------------------------------------------------------------
    output_rows = len(special_processed)
    print("\n=== Row Accounting ===")
    print(f"Input rows:                     {input_rows:,}")
    print(f"Excluded by Frequency (E/L/R/U):{excluded_rows:,}")
    print(f"Dropped by quality (Pct rules): {dropped_quality_rows:,}")
    print(f"Output rows (final):            {output_rows:,}")

    # Sanity check: excluded + dropped + final should equal original
    check_total = excluded_rows + dropped_quality_rows + output_rows
    print(f"Check: excluded + dropped + output = {check_total:,}")
    if check_total == input_rows:
        print("Row counts reconcile exactly.")
    else:
        print(f"Mismatch of {input_rows - check_total:+,} rows.")

    # Optional: trigger garbage collection (gc is imported in the
    # environment-setup cell at the top of the file)
    gc.collect()

else:
    # Nothing to process: special_encoded is not defined or is None
    print("special_encoded not found or None; skipping.")

Input dataset contains 1,975,430 rows before processing.


=== Future-date check (period dates > PIT Date) ===
Per-label violations: {'A_Date': 0, 'Q1_Date': 0, 'Q2_Date': 0, 'Q3_Date': 0, 'Q4_Date': 0, 'S1_Date': 0, 'S2_Date': 0, 'T1_Date': 0, 'T2_Date': 0, 'T3_Date': 0}
Rows with ANY future-dated period value: 0

=== AnnPITValue_Pct summary — BEFORE quality drop ===
         finite_rows: 1334356
                mean: 22951.9780547885
              median: 100.0
winsorized_mean_1pct: 98.81137799681824
                 p10: 100.0
                 p20: 100.0
                 p30: 100.0
                 p40: 100.0
                 p50: 100.0
                 p60: 100.0
                 p70: 100.0
                 p80: 100.0
                 p90: 100.0

Rows to drop due to AnnPITValue_Pct (±inf or >250 or <25): 25,573

=== AnnPITValue_Pct summary — AFTER quality drop ===
         finite_rows: 1308783
                mean: 100.08163361124764
              median: 100.0
winsorized_mean_1pct

### Special 10

#### Set Index

In [295]:
# =============================================================================
# SELECT A SINGLE SPECIAL_* ITEM AND PREPARE PATHS
# =============================================================================
# This cell:
#   1. Chooses which Special_* item (from special_vars) should be processed.
#   2. Validates that special_vars and Temp_file_path_DP are available.
#   3. Builds the input file path for the selected "work_subset_<item>.txt".
#   4. Sets a base_output_filename for downstream output files.
#   5. Ensures the data-preparation temp directory exists.
#
# Usage:
#   - Adjust `special_index` to run a different Special_* dataset (e.g., 2, 3, 10 ...).
#   - Assumes `special_vars` was created in the categorization step and
#     `Temp_file_path_DP` was defined in the environment setup.

# === Select which Special_* item to run ===
special_index = 10  # Index N of the "Special_N" entry to process; change it to run another dataset (e.g. 2 or 3)

# special_vars should look like: {'Special_1': 'SomeItem', 'Special_2': 'OtherItem', ...}
# NOTE: the asserts in this cell are skipped when Python runs with -O.
assert 'special_vars' in globals(), "special_vars dict not found in globals()."

# Build the key for the chosen index and find the corresponding item name.
# The assert fails both when the key is absent (.get returns None) and when
# the stored item name is empty.
item_key = f"Special_{special_index}"
target_item_name = special_vars.get(item_key)
assert target_item_name, f"{item_key} not found in special_vars."

print(f"Selected: {item_key}  ->  ItemName: '{target_item_name}'")

# === Paths (reusing your globals) ===
assert 'Temp_file_path_DP' in globals(), "Temp_file_path_DP not found."

# Input file for this item (produced by previous merging steps).
# Only the path is built here; whether the file exists is checked by the
# loading cell that follows.
file_name = f"work_subset_{target_item_name}.txt"
file_path = os.path.join(Temp_file_path_DP, file_name)

# Base name for output files created by the "Special" pipeline
base_output_filename = f"Special_{target_item_name}_complete"

# Make sure the output directory exists (no error if it already does)
Path(Temp_file_path_DP).mkdir(parents=True, exist_ok=True)


Selected: Special_10  ->  ItemName: 'Net_Cash_Flow___Investing'


#### Import relevant data



In [296]:
# =============================================================================
# LOAD THE FULL DATASET FOR THE SELECTED SPECIAL ITEM
# =============================================================================
# This cell:
#   1. Uses `target_item_name` and `file_path` (defined in the previous cell)
#      to load the corresponding work_subset file.
#   2. Imports the file using `import_file_to_dataframe`.
#   3. Performs safety checks for existence and emptiness.
#   4. Shows a preview of the loaded dataset.
#   5. Falls back to an empty DataFrame if loading fails.
#   6. Runs garbage collection afterwards.
# =============================================================================

# Announce which item is about to be loaded.
print(f"\nImporting full dataset for Item: '{target_item_name}' ...")

if not os.path.exists(file_path):
    # Missing input file: report it and continue with an empty frame.
    print(f"File not found: {file_path}")
    special_raw = pd.DataFrame()
else:
    special_raw = import_file_to_dataframe(file_path)

    if special_raw is None or special_raw.empty:
        # The importer returned nothing usable; normalise to an empty frame.
        print("Dataset appears empty or could not be loaded.")
        special_raw = pd.DataFrame()
    else:
        print(f"Full dataset loaded successfully: {len(special_raw):,} rows total.")
        try:
            # Rich preview when display() is available.
            display(special_raw.head())
        except Exception:
            # Plain-text preview otherwise.
            print(special_raw.head().to_string(index=False))

gc.collect()



Importing full dataset for Item: 'Net_Cash_Flow___Investing' ...
Full dataset loaded successfully: 1,979,139 rows total.


Unnamed: 0,ID,CompanyName,ImplCountry,CurrentCurrency,HistCurrency,PIT Date,Frequency,UpdateCode,FiscalPeriod,FYE Month,ItemCode,Value
0,C02520200,ACINDAR INDUSTRIA ARGENTINA DE ACEROS SA,Argentina,Ars,Ars,1996-05-03,A,3,1994,June,4870,47.224268
1,C02520200,ACINDAR INDUSTRIA ARGENTINA DE ACEROS SA,Argentina,Ars,Ars,1996-05-03,A,3,1995,June,4870,28.910173
2,C02520200,ACINDAR INDUSTRIA ARGENTINA DE ACEROS SA,Argentina,Ars,Ars,1996-11-01,A,3,1996,June,4870,33.23571
3,C02520200,ACINDAR INDUSTRIA ARGENTINA DE ACEROS SA,Argentina,Ars,Ars,1997-10-31,A,3,1997,June,4870,38.539571
4,C02520200,ACINDAR INDUSTRIA ARGENTINA DE ACEROS SA,Argentina,Ars,Ars,1998-12-04,A,3,1998,September,4870,41.47087


0

#### Encode Frequency Code (Check of output required!)

In [297]:
# =============================================================================
# SUMMARY
# =============================================================================
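# This cell:
#   1. Defines last2(), which returns the last two digits of a number as a
#      zero-padded string.
#   2. Defines add_str_fiscalprd(), which derives a period label
#      ("Y..", "Q.Y..", "S.Y..", "T.Y..") from Frequency and FiscalPeriod
#      and stores it in Str_FiscalPrd.
#   3. Compares, for annual (A/B) rows, the year recovered from that label
#      with the original FiscalPeriod.
# =============================================================================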

def last2(n):
    """Return last two digits as zero-padded string, or None if NaN."""
    if pd.isna(n):
        return None
    return f"{int(n):04d}"[-2:]


def add_str_fiscalprd(df):
    """
    Add a period label column and normalise ``FiscalPeriod`` to a 4-digit year.

    The input frame is not modified; a copy is returned.

    Label rules (``fp`` is the numeric ``FiscalPeriod``):
      - Frequency C/Q/E/R : ``Q<(fp % 4) + 1>Y<last two digits of fp // 4>``
      - Frequency A/B     : ``Y<last two digits of fp>``
      - Frequency F/S     : ``S<(fp % 2) + 1>Y<last two digits of fp // 2>``
      - Frequency K/T/L/U : ``T<(fp % 3) + 1>Y<last two digits of fp // 3>``
    Rows with any other frequency keep NaN in ``Str_FiscalPrd``.

    ``FiscalPeriod`` is then overwritten with the year rebuilt from the
    two-digit label: values 80..99 become 19xx, values 00..79 become 20xx.
    For annual rows the rebuilt year is compared with the original value and
    any mismatch is printed (first rows shown via ``display``).

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns 'Frequency' and 'FiscalPeriod'; 'ID' is
        additionally read when discrepancies are displayed.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` with upper-cased 'Frequency', the new column
        'Str_FiscalPrd' and the rewritten 'FiscalPeriod'.
    """
    df = df.copy()
    df["Frequency"] = df["Frequency"].str.upper().fillna("")
    df['Original_FiscalPeriod'] = df['FiscalPeriod']

    fp = pd.to_numeric(df["FiscalPeriod"], errors="coerce")

    m_quarter = df["Frequency"].isin(["C", "Q", "E", "R"])
    m_AB      = df["Frequency"].isin(["A", "B"])
    m_FS      = df["Frequency"].isin(["F", "S"])
    m_KTLU    = df["Frequency"].isin(["K", "T", "L", "U"])

    # Object dtype from the start: the column receives string labels below.
    # A float64 NaN column would be silently upcast on the first assignment
    # (deprecated in pandas) and would break `.str` when no row is labelled.
    df["Str_FiscalPrd"] = np.full(len(df), np.nan, dtype=object)

    # The right-hand sides below are full-length Series; .loc aligns them on
    # the index, so only the masked rows are actually written.
    q_part = ((fp % 4) + 1).where(m_quarter)
    q_year = (fp // 4).where(m_quarter).apply(last2)
    df.loc[m_quarter, "Str_FiscalPrd"] = (
        "Q" + q_part.astype("Int64").astype(str) + "Y" + q_year.fillna('')
    )

    ab_year = fp.where(m_AB).apply(last2)
    df.loc[m_AB, "Str_FiscalPrd"] = "Y" + ab_year.fillna('')

    fs_sem  = ((fp % 2) + 1).where(m_FS)
    fs_year = (fp // 2).where(m_FS).apply(last2)
    df.loc[m_FS, "Str_FiscalPrd"] = (
        "S" + fs_sem.astype("Int64").astype(str) + "Y" + fs_year.fillna('')
    )

    t_term  = ((fp % 3) + 1).where(m_KTLU)
    t_year  = (fp // 3).where(m_KTLU).apply(last2)
    df.loc[m_KTLU, "Str_FiscalPrd"] = (
        "T" + t_term.astype("Int64").astype(str) + "Y" + t_year.fillna('')
    )

    # Rebuild a four-digit year from the two-digit part of the label
    year_part = df['Str_FiscalPrd'].str.extract(r'Y(\d{2})', expand=False)
    year_numeric = pd.to_numeric(year_part, errors='coerce')

    # Century pivot: 80..99 -> 19xx, 00..79 -> 20xx
    df['ImplFiscPer_Calculated'] = year_numeric.apply(
        lambda x: int(f"19{int(x):02d}") if pd.notna(x) and x >= 80
        else (int(f"20{int(x):02d}") if pd.notna(x) else np.nan)
    )

    # Sanity check on annual rows: rebuilt year must equal the original
    # FiscalPeriod (two missing values also count as a match).
    annual_rows_for_check = df[m_AB].copy()
    calculated_year = annual_rows_for_check['ImplFiscPer_Calculated']
    original_year = pd.to_numeric(
        annual_rows_for_check['Original_FiscalPeriod'], errors='coerce'
    )
    discrepancy_mask_annual = ~(
        (calculated_year == original_year) |
        (calculated_year.isna() & original_year.isna())
    )

    discrepancy_rows = annual_rows_for_check[discrepancy_mask_annual].copy()

    if not discrepancy_rows.empty:
        print("\nDiscrepancies found between original FiscalPeriod and calculated ImplFiscPer for Annual (A, B) frequencies:")
        display(
            discrepancy_rows[
                ['ID', 'Frequency', 'Original_FiscalPeriod', 'Str_FiscalPrd', 'ImplFiscPer_Calculated']
            ].head()
        )
        print(f"Total discrepancies found for Annual frequencies: {len(discrepancy_rows)}")
    else:
        print("\nNo discrepancies found between original FiscalPeriod and calculated ImplFiscPer for Annual (A, B) frequencies.")

    # The rebuilt year replaces FiscalPeriod; helper columns are removed
    df['FiscalPeriod'] = df['ImplFiscPer_Calculated']
    df.drop(columns=['Original_FiscalPeriod', 'ImplFiscPer_Calculated'], inplace=True)

    return df


# =============================================================================
# Driver: apply encoding to special_raw if present and non-empty
# =============================================================================
# Guard first: nothing to encode when special_raw is missing, None or empty
if 'special_raw' not in globals() or special_raw is None or special_raw.empty:
    print("special_raw not found or empty. Cannot perform encoding.")
    special_encoded = None
else:
    print(f"Applying encoding to Special dataset for '{target_item_name}' ...")
    special_encoded = add_str_fiscalprd(special_raw)
    # Preview of the encoded frame for the manual output check
    display(special_encoded.head())


Applying encoding to Special dataset for 'Net_Cash_Flow___Investing' ...


  df.loc[m_quarter, "Str_FiscalPrd"] = (



No discrepancies found between original FiscalPeriod and calculated ImplFiscPer for Annual (A, B) frequencies.


Unnamed: 0,ID,CompanyName,ImplCountry,CurrentCurrency,HistCurrency,PIT Date,Frequency,UpdateCode,FiscalPeriod,FYE Month,ItemCode,Value,Str_FiscalPrd
0,C02520200,ACINDAR INDUSTRIA ARGENTINA DE ACEROS SA,Argentina,Ars,Ars,1996-05-03,A,3,1994,June,4870,47.224268,Y94
1,C02520200,ACINDAR INDUSTRIA ARGENTINA DE ACEROS SA,Argentina,Ars,Ars,1996-05-03,A,3,1995,June,4870,28.910173,Y95
2,C02520200,ACINDAR INDUSTRIA ARGENTINA DE ACEROS SA,Argentina,Ars,Ars,1996-11-01,A,3,1996,June,4870,33.23571,Y96
3,C02520200,ACINDAR INDUSTRIA ARGENTINA DE ACEROS SA,Argentina,Ars,Ars,1997-10-31,A,3,1997,June,4870,38.539571,Y97
4,C02520200,ACINDAR INDUSTRIA ARGENTINA DE ACEROS SA,Argentina,Ars,Ars,1998-12-04,A,3,1998,September,4870,41.47087,Y98


#### Annualize data with most recent information (Check of output required!)

In [298]:
# @title
# =============================================================================
# SUMMARY
# =============================================================================
# This script:
#   - Implements a fast "as-of join" between two DataFrames based on PIT dates
#     and key columns (asof_numpy).
#   - Provides helpers for percentile summaries and winsorized statistics.
#   - Builds annualized "AnnPITValue" values from:
#       * true annual data (A/B frequency) and
#       * sums of quarterly data (Q1..Q4) when available.
#   - Performs various quality checks (future-dated values, extreme percentages).
#   - Produces a processed "special_processed" DataFrame and saves:
#       * a full export and
#       * a subset export.
#   - Prints a row-accounting overview so drops and exclusions are transparent.
# =============================================================================


# ---------- Helper: fast as-of (right.PIT <= left.PIT) ----------
def _key(fr, cols):
    """
    Build a combined string key from multiple columns by concatenating their
    string representations with '||', for use as a group key.

    Returns a Series aligned with ``fr`` (same index, no name).
    """
    as_text = fr[cols].astype(str)
    n_cols = as_text.shape[1]
    if n_cols < 2:
        # Nothing to concatenate column-wise; keep the simple row-wise join
        return as_text.agg('||'.join, axis=1)

    # Column-wise concatenation instead of a Python-level join per row.
    # The remaining columns are passed as plain arrays so no index alignment
    # is attempted.
    tail = [as_text.iloc[:, i].to_numpy() for i in range(1, n_cols)]
    joined = as_text.iloc[:, 0].str.cat(tail, sep='||')
    joined.name = None
    return joined


def asof_numpy(left_df: pd.DataFrame, right_df: pd.DataFrame, by_cols: list[str]):
    """
    For each row in left_df, find the latest (as-of) Value from right_df
    with matching by_cols and right_df['PIT Date'] <= left_df['PIT Date'].

    Rows of either frame with a missing key, 'PIT Date' (or 'Value' on the
    right side), or with a 'PIT Date' that cannot be parsed, take no part in
    the match. Dates are compared at day granularity.

    Results are written by row POSITION in left_df, so left_df may carry any
    index (filtered, non-unique, non-integer).

    Parameters
    ----------
    left_df : pd.DataFrame
        Must contain by_cols and 'PIT Date'.
    right_df : pd.DataFrame
        Must contain by_cols, 'PIT Date' and 'Value'.
    by_cols : list[str]
        Key columns that must match exactly (compared via their string form).

    Returns
    -------
    out_vals : np.ndarray
        Array of float values (same length as left_df) containing the matched
        values from right_df (or NaN if none found).
    out_dates : np.ndarray
        Array of datetime64 values containing the matched dates (or NaT).
    """
    # Initialize output arrays with NaN/NaT
    out_vals  = np.full(len(left_df), np.nan, dtype='float64')
    out_dates = np.full(len(left_df), 'NaT', dtype='datetime64[ns]')

    # Required columns in left/right for the as-of join
    left_req  = by_cols + ['PIT Date']
    right_req = by_cols + ['PIT Date', 'Value']

    # Masks of rows with all required fields present
    lmask = left_df[left_req].notna().all(axis=1).to_numpy()
    rmask = right_df[right_req].notna().all(axis=1).to_numpy()

    # If no valid rows on either side, nothing can match
    if not lmask.any() or not rmask.any():
        return out_vals, out_dates

    # Positions (not index labels) of the usable left rows in the outputs
    l_pos = np.flatnonzero(lmask)

    # Work on copies of the filtered frames
    l = left_df.loc[lmask, left_req].copy()
    r = right_df.loc[rmask, right_req].copy()

    # Normalize PIT dates to daily granularity
    l['PIT Date'] = pd.to_datetime(l['PIT Date'], errors='coerce').dt.floor('D')
    r['PIT Date'] = pd.to_datetime(r['PIT Date'], errors='coerce').dt.floor('D')

    # Drop rows whose date failed to parse. NaT sorts last in searchsorted,
    # so an unparsable left date would otherwise pick up the latest value.
    l_ok = l['PIT Date'].notna().to_numpy()
    if not l_ok.all():
        l = l.loc[l_ok].copy()
        l_pos = l_pos[l_ok]
    r_ok = r['PIT Date'].notna().to_numpy()
    if not r_ok.all():
        r = r.loc[r_ok].copy()
    if l.empty or r.empty:
        return out_vals, out_dates

    # Build grouping keys on both sides
    l['__k'] = _key(l, by_cols)
    r['__k'] = _key(r, by_cols)

    # Sort right side by key and PIT Date so we can binary-search later
    r = r.sort_values(['__k', 'PIT Date']).reset_index(drop=True)

    rk   = r['__k'].to_numpy()
    rdt  = r['PIT Date'].to_numpy()
    rval = r['Value'].to_numpy()

    # Contiguous (dates, values) slice per unique right-hand key
    uniq, first = np.unique(rk, return_index=True)
    slices = {}
    for i, k in enumerate(uniq):
        s = first[i]
        e = first[i + 1] if i + 1 < len(first) else len(r)
        slices[k] = (rdt[s:e], rval[s:e])

    # Sort left rows by key (stable) so equal keys form contiguous blocks
    lk  = l['__k'].to_numpy()
    ldt = l['PIT Date'].to_numpy()
    order = np.argsort(lk, kind='mergesort')
    sk, sd, sp = lk[order], ldt[order], l_pos[order]

    # Block boundaries: positions where the sorted key changes
    starts = np.concatenate(([0], np.flatnonzero(sk[1:] != sk[:-1]) + 1))
    ends = np.append(starts[1:], len(sk))

    for i, j in zip(starts, ends):
        hit = slices.get(sk[i])
        if hit is None:
            # Key does not occur on the right side: leave NaN/NaT
            continue
        r_dates, r_vals = hit

        # Index of the last right date <= each left date (-1 if none)
        pos   = np.searchsorted(r_dates, sd[i:j], side='right') - 1
        valid = pos >= 0

        if valid.any():
            targets = sp[i:j][valid]
            out_vals[targets]  = r_vals[pos[valid]]
            out_dates[targets] = r_dates[pos[valid]]

    return out_vals, out_dates


# ---------- Small helpers ----------
def pctile(s, q):
    """
    Return the q-quantile of ``s`` (linear interpolation).

    Any error raised while computing the quantile is swallowed and NaN is
    returned instead.
    """
    try:
        result = s.quantile(q, interpolation='linear')
    except Exception:
        result = np.nan
    return result


def summarize_pct(series: pd.Series):
    """
    Summarise the finite values of a numeric series.

    +/-inf and missing values are discarded first. Returns an empty dict when
    nothing is left; otherwise a dict with the row count, mean, median, the
    1%/1% winsorized mean and the deciles p10..p90 (in that key order).
    """
    finite = series.replace([np.inf, -np.inf], np.nan).dropna()
    if finite.empty:
        return {}

    # winsorize needs a writable array, hence the explicit copy
    clipped_mean = winsorize(finite.to_numpy().copy(), limits=[0.01, 0.01]).mean()

    stats = {
        "finite_rows": len(finite),
        "mean": finite.mean(),
        "median": finite.median(),
        "winsorized_mean_1pct": clipped_mean,
    }
    # Deciles p10 .. p90
    for decile in range(10, 100, 10):
        stats[f"p{decile}"] = pctile(finite, decile / 100)
    return stats


# ---------- Period prioritization ----------
# Priority ranking for period labels when deciding between multiple candidates
# Maps a period label to a numeric rank; a larger number wins when candidate
# tuples are compared by (priority, date).
# Within this cell only the 'A' and 'Q4' entries are looked up (by
# pick_annpit_sum_with_origin); the S*/T*/Q1-Q3 ranks are defined but not read
# by the code shown here.
_PERIOD_PRIORITY = {
    'A': 100,  # Full annual has highest priority
    'Q4': 90,  # used for the "sum of Q1..Q4" full-year candidate
    'T3': 80,
    'S2': 70,
    'Q3': 60,
    'T2': 50,
    'S1': 40,
    'Q2': 30,
    'T1': 20,
    'Q1': 10,  # lowest rank
}


def _label_from_colname(colname: str) -> str:
    """
    Translate a column name into the period label used as a key of
    _PERIOD_PRIORITY.

    'A' is the only special-cased name; every other name is returned as-is.
    """
    if colname == 'A':
        return 'A'
    return colname


# ---------- Helpers for AnnPITValue using A + Q1..Q4 sum ----------
def full_year_from_quarters(row, pit, cutoff):
    """
    Assemble a full-year candidate from the four quarter columns of a row.

    Every quarter Q1..Q4 must provide a non-missing value ('Q<i>'), date
    ('Q<i>_Date') and origin fiscal period ('Q<i>_OriginFP'), and each date
    must parse and lie within [cutoff, pit]. The first quarter that fails
    any of these checks aborts the whole candidate.

    Parameters
    ----------
    row : pd.Series
        Row from the working DataFrame (anything offering ``.get``).
    pit : datetime-like
        PIT Date of the row (upper bound for quarter dates).
    cutoff : datetime-like
        Lower bound for valid quarter dates.

    Returns
    -------
    (dt, val_sum, origin_fp) or (NaT, NaN, NaN)
        dt        : latest quarter date among Q1..Q4
        val_sum   : sum of Q1..Q4 values
        origin_fp : max OriginFP among Q1..Q4
    """
    no_candidate = (pd.NaT, np.nan, np.nan)

    quarter_values = []
    quarter_dates = []
    quarter_origins = []

    for quarter in range(1, 5):
        name = f'Q{quarter}'
        value = row.get(name, np.nan)
        raw_date = row.get(f'{name}_Date', pd.NaT)
        origin = row.get(f'{name}_OriginFP', np.nan)

        # All three pieces must be present for this quarter
        if pd.isna(value) or pd.isna(raw_date) or pd.isna(origin):
            return no_candidate

        # The date must parse and fall inside the [cutoff, pit] window
        stamp = pd.to_datetime(raw_date, errors='coerce')
        if pd.isna(stamp) or not (cutoff <= stamp <= pit):
            return no_candidate

        quarter_values.append(float(value))
        quarter_dates.append(stamp)
        quarter_origins.append(int(origin))

    # All four quarters qualified: sum, latest date, newest origin period
    return max(quarter_dates), float(np.nansum(quarter_values)), max(quarter_origins)


def pick_annpit_sum_with_origin(row):
    """
    Compute AnnPITValue for one row from the annual value (A) and the sum of
    the four quarters (Q1..Q4).

    Up to two candidates are built, each only if its date lies within
    [PIT Date - 365 days, PIT Date]:
      * 'A'  : the row's A / A_Date / A_OriginFP
      * 'Q4' : the full-year sum from full_year_from_quarters()

    Selection order (first hit wins):
      1. A  whose origin period equals the row's FiscalPeriod
      2. Q4 whose origin period equals the row's FiscalPeriod
      3. A  whose origin period is FiscalPeriod - 1
      4. Q4 whose origin period is FiscalPeriod - 1
      5. any remaining candidate, ranked by (priority, date)

    Returns NaN when PIT Date is missing or no candidate exists.
    """
    pit = row['PIT Date']
    if pd.isna(pit):
        # Without a PIT Date no window can be defined
        return np.nan

    # Oldest acceptable candidate date
    cutoff = pit - timedelta(days=365)

    # Row's fiscal period as int, or None when missing / not convertible
    raw_fp = row.get('FiscalPeriod', np.nan)
    try:
        fp_int = None if pd.isna(raw_fp) else int(raw_fp)
    except Exception:
        fp_int = None

    # Candidate tuples: (label, period_priority, date, value, origin_fp)
    candidates = []

    # --- Annual candidate (a value of 0 is allowed) ---
    annual_val = row.get('A', np.nan)
    annual_dt = row.get('A_Date', pd.NaT)
    annual_fp = row.get('A_OriginFP', np.nan)
    if not (pd.isna(annual_val) or pd.isna(annual_dt) or pd.isna(annual_fp)):
        annual_dt = pd.to_datetime(annual_dt, errors='coerce')
        if pd.notna(annual_dt) and (cutoff <= annual_dt <= pit):
            candidates.append(
                ('A', _PERIOD_PRIORITY['A'], annual_dt, float(annual_val), int(annual_fp))
            )

    # --- Quarter-sum candidate (a value of 0 is allowed) ---
    quarters_dt, quarters_val, quarters_fp = full_year_from_quarters(row, pit, cutoff)
    if not (pd.isna(quarters_val) or pd.isna(quarters_dt) or pd.isna(quarters_fp)):
        candidates.append(
            ('Q4', _PERIOD_PRIORITY['Q4'], quarters_dt, float(quarters_val), int(quarters_fp))
        )

    if not candidates:
        return np.nan

    def usable(seq):
        """Keep candidates whose value is not NaN (0 is allowed)."""
        return [cand for cand in seq if not np.isnan(cand[3])]

    def relation(cand):
        """Classify the candidate's origin period relative to the row's."""
        origin = cand[4]
        if fp_int is None or origin is None:
            return 'unknown'
        if origin == fp_int:
            return 'same'
        if origin == fp_int - 1:
            return 'prior'
        return 'other'

    def by_date(cand):
        return cand[2]

    def by_priority_then_date(cand):
        return (cand[1], cand[2])

    # A candidates are ranked by date only, Q4 candidates by (priority, date)
    ranking = {'A': by_date, 'Q4': by_priority_then_date}

    # Tiers 1-4: same-year A, same-year Q4, prior-year A, prior-year Q4
    for wanted in ('same', 'prior'):
        for label in ('A', 'Q4'):
            pool = usable(
                cand for cand in candidates
                if cand[0] == label and relation(cand) == wanted
            )
            if pool:
                return max(pool, key=ranking[label])[3]

    # Tier 5: whatever is left (other/unknown relation)
    leftovers = usable(candidates)
    if leftovers:
        return max(leftovers, key=by_priority_then_date)[3]

    # Final fallback of the original logic
    return 0.0


# ============================ MAIN ============================
# Pipeline overview (runs only when special_encoded exists and is not None):
#   filter frequencies -> type conversion -> TrueValue from annual rows ->
#   as-of columns per period (A, Q1..Q4, S1..S2, T1..T3) -> AnnPITValue ->
#   future-date QC -> percentage-based quality filter -> export -> row accounting.
# Reads globals: special_encoded, Temp_file_path_DP, base_output_filename.
# Creates globals: working, special_processed (plus intermediate helpers).
if 'special_encoded' in globals() and special_encoded is not None:
    # Remember the number of input rows for row-accounting
    input_rows = len(special_encoded)
    print(f"Input dataset contains {input_rows:,} rows before processing.\n")

    # Work on a copy so we do not mutate the original DataFrame
    working = special_encoded.copy()

    # Exclude certain frequencies (E, L, R, U) from further processing
    excl_mask = working['Frequency'].astype(str).str.upper().isin(['E', 'L', 'R', 'U'])
    excluded_rows = int(excl_mask.sum())
    working = working.loc[~excl_mask].copy()

    # Convert key columns to appropriate types
    # (values that cannot be parsed become NaT / NaN because of errors='coerce')
    working['PIT Date'] = pd.to_datetime(
        working['PIT Date'], format='%Y-%m-%d', errors='coerce'
    ).dt.floor('D')
    working['FiscalPeriod'] = pd.to_numeric(working['FiscalPeriod'], errors='coerce')
    working['Value']        = pd.to_numeric(working['Value'], errors='coerce')

    # Ensure some ID-like columns are strings
    for c in ['ID', 'HistCurrency', 'ItemCode', 'Frequency', 'Str_FiscalPrd']:
        if c in working.columns:
            working[c] = working[c].astype(str)

    # Parse Q/S/T numbers from Str_FiscalPrd (e.g. 'Q1Y23' -> QNUM=1)
    # Labels that do not match the respective pattern yield NaN.
    working['QNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^Q([1-4])Y', expand=False),
        errors='coerce'
    )
    working['SNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^S([1-2])Y', expand=False),
        errors='coerce'
    )
    working['TNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^T([1-3])Y', expand=False),
        errors='coerce'
    )

    # Define all period/value and period/date column names
    # (Q1..Q4, S1..S2, T1..T3, A and the matching '<label>_Date' names)
    period_vals = [f'Q{i}' for i in range(1, 5)] + \
                  [f'S{i}' for i in range(1, 3)] + \
                  [f'T{i}' for i in range(1, 4)] + ['A']
    period_dates = [f'{p}_Date' for p in [f'Q{i}' for i in range(1,5)] + \
                                       [f'S{i}' for i in range(1,3)] + \
                                       [f'T{i}' for i in range(1,4)]] + ['A_Date']

    # Ensure all period value columns exist (initialize if missing)
    for c in period_vals:
        if c not in working.columns:
            working[c] = np.nan

    # Ensure all period date columns exist (initialize if missing)
    for c in period_dates:
        if c not in working.columns:
            working[c] = pd.NaT

    # Keys used to identify time series in as-of joins
    # (FiscalPeriod is part of the key, so matches never cross fiscal periods)
    base_keys = ['ID', 'HistCurrency', 'ItemCode', 'FiscalPeriod']

    # -------------------------------------------------------------------------
    # 1) TrueValue from annuals: build reference "TrueValue" per ID/FiscalPeriod
    # -------------------------------------------------------------------------
    # For each (ID, FiscalPeriod, HistCurrency) the annual row with the latest
    # PIT Date is kept (sort ascending, keep='last').
    mask_annual = working['Frequency'].isin(['A', 'B']) & working['Value'].notna()
    annual_src = (
        working.loc[mask_annual,
                    ['ID', 'FiscalPeriod', 'HistCurrency', 'PIT Date', 'Value']]
        .sort_values(['ID', 'FiscalPeriod', 'HistCurrency', 'PIT Date'])
        .drop_duplicates(['ID', 'FiscalPeriod', 'HistCurrency'], keep='last')
        .rename(columns={'Value': 'TrueValue', 'PIT Date': 'TrueValue_Date'})
    )
    # Merge TrueValue back on keys
    # (annual_src is unique per key, so the left merge does not add rows)
    working = working.merge(
        annual_src,
        on=['ID', 'FiscalPeriod', 'HistCurrency'],
        how='left'
    )

    # -------------------------------------------------------------------------
    # 2) As-of mapping for each frequency (no prior-year / no forward-fill)
    # -------------------------------------------------------------------------

    # Annual (A/B) as-of
    src_A = working.loc[
        working['Frequency'].isin(['A', 'B']) & working['Value'].notna(),
        base_keys + ['PIT Date', 'Value']
    ].copy()
    vA, dA = asof_numpy(working, src_A, by_cols=base_keys)
    working['A'], working['A_Date'] = vA, dA
    # Origin fiscal period for A
    # (equals the row's own FiscalPeriod because FiscalPeriod is a join key)
    working['A_OriginFP'] = np.where(
        working['A'].notna(), working['FiscalPeriod'], np.nan
    )

    # Quarterly (Q/C) as-of, by quarter number
    src_Q = working.loc[
        working['Frequency'].isin(['Q', 'C']) & working['QNUM'].notna(),
        base_keys + ['QNUM', 'PIT Date', 'Value']
    ].copy()
    for q in (1, 2, 3, 4):
        # Subset source to a specific quarter
        rv = src_Q[src_Q['QNUM'] == q].drop(columns=['QNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'Q{q}', f'Q{q}_Date'
        working[col], working[dcol] = v, d

        # Set OriginFP where we have a newly filled quarter
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        mask = working[col].notna() & working[ocol].isna()
        working.loc[mask, ocol] = working.loc[mask, 'FiscalPeriod']

    # Semiannual (S/F) as-of, by half-year number
    # NOTE: S1/S2 are exported but not read by pick_annpit_sum_with_origin.
    src_S = working.loc[
        working['Frequency'].isin(['S', 'F']) & working['SNUM'].notna(),
        base_keys + ['SNUM', 'PIT Date', 'Value']
    ].copy()
    for s in (1, 2):
        rv = src_S[src_S['SNUM'] == s].drop(columns=['SNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'S{s}', f'S{s}_Date'
        working[col], working[dcol] = v, d

        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        mask = working[col].notna() & working[ocol].isna()
        working.loc[mask, ocol] = working.loc[mask, 'FiscalPeriod']

    # Trimester (T/K) as-of, by term number
    # NOTE: T1..T3 are exported but not read by pick_annpit_sum_with_origin.
    src_T = working.loc[
        working['Frequency'].isin(['T', 'K']) & working['TNUM'].notna(),
        base_keys + ['TNUM', 'PIT Date', 'Value']
    ].copy()
    for t in (1, 2, 3):
        rv = src_T[src_T['TNUM'] == t].drop(columns=['TNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'T{t}', f'T{t}_Date'
        working[col], working[dcol] = v, d

        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        mask = working[col].notna() & working[ocol].isna()
        working.loc[mask, ocol] = working.loc[mask, 'FiscalPeriod']

    # -------------------------------------------------------------------------
    # 3) Prepare labels and normalize dates (only as-of results, no ffill)
    # -------------------------------------------------------------------------
    working = working.sort_values(['ID', 'HistCurrency', 'FiscalPeriod', 'PIT Date'])

    value_labels  = period_vals
    date_labels   = period_dates
    origin_labels = [f'{lbl}_OriginFP' for lbl in value_labels]

    # Ensure all date columns are valid datetimes at day precision
    for c in date_labels:
        if c in working.columns:
            working[c] = pd.to_datetime(working[c], errors='coerce').dt.floor('D')

    # -------------------------------------------------------------------------
    # 4) AnnPITValue with new logic (A + Q1..Q4 sum, zeros allowed)
    # -------------------------------------------------------------------------
    # Row-wise Python call: one invocation of pick_annpit_sum_with_origin per row.
    working['AnnPITValue'] = working.apply(
        pick_annpit_sum_with_origin,
        axis=1
    )

    # -------------------------------------------------------------------------
    # 5) QC: Future-date check (period date > PIT Date)
    # -------------------------------------------------------------------------
    date_cols_all = [
        'A_Date',
        'Q1_Date', 'Q2_Date', 'Q3_Date', 'Q4_Date',
        'S1_Date', 'S2_Date',
        'T1_Date', 'T2_Date', 'T3_Date'
    ]
    # Only use date columns that actually exist
    present = [c for c in date_cols_all if c in working.columns]
    viol_counts = {}
    any_mask = pd.Series(False, index=working.index)

    # For each period date column, check if it's after PIT Date
    for c in present:
        m = (
            working[c].notna() &
            working['PIT Date'].notna() &
            (pd.to_datetime(working[c], errors='coerce') > working['PIT Date'])
        )
        # Count violations per column
        viol_counts[c] = int(m.sum())
        # Track rows with any violation across all period dates
        any_mask |= m

    total_future_viol = int(any_mask.sum())
    print("\n=== Future-date check (period dates > PIT Date) ===")
    print("Per-label violations:", viol_counts)
    print(f"Rows with ANY future-dated period value: {total_future_viol}")
    # Flag rows with at least one future-date violation
    # (rows are only flagged here, not removed)
    working['HasFutureDateError'] = any_mask

    # -------------------------------------------------------------------------
    # 6) AnnPITValue_Pct and quality filter
    # -------------------------------------------------------------------------
    # Percentage of AnnPITValue relative to TrueValue (%)
    # NaN when either value is missing or TrueValue is 0.
    working['AnnPITValue_Pct'] = np.where(
        working['AnnPITValue'].notna() &
        working['TrueValue'].notna() &
        (working['TrueValue'] != 0),
        (working['AnnPITValue'] / working['TrueValue']) * 100,
        np.nan
    )

    # Summary BEFORE dropping outliers
    pre_stats = summarize_pct(working['AnnPITValue_Pct'])
    print("\n=== AnnPITValue_Pct summary — BEFORE quality drop ===")
    for k, v in pre_stats.items():
        print(f"{k:>20}: {v}")

    pct = working['AnnPITValue_Pct']
    # Flag infinities
    is_inf = np.isinf(pct)
    # Flag finite out-of-range values outside [25, 250]
    is_finite = np.isfinite(pct)
    out_of_range = is_finite & ((pct > 250) | (pct < 25))
    # Combined drop mask: infinities or out-of-range finite values
    # (rows with a NaN percentage match neither condition and are kept)
    to_drop_quality = is_inf | out_of_range

    # Count dropped rows due to quality rules
    dropped_quality_rows = int(to_drop_quality.sum())
    print(f"\nRows to drop due to AnnPITValue_Pct (±inf or >250 or <25): {dropped_quality_rows:,}")

    # Keep only rows that pass the quality filter
    working = working.loc[~to_drop_quality].copy()

    # Summary AFTER dropping outliers
    post_stats = summarize_pct(working['AnnPITValue_Pct'])
    print("\n=== AnnPITValue_Pct summary — AFTER quality drop ===")
    if post_stats:
        for k, v in post_stats.items():
            print(f"{k:>20}: {v}")
    else:
        print("No finite values remain after the quality drop.")

    # -------------------------------------------------------------------------
    # 7) Final columns and cleanup
    # -------------------------------------------------------------------------
    # Core columns that describe each row
    base_cols = [
        'ID', 'CompanyName', 'ImplCountry', 'CurrentCurrency', 'HistCurrency',
        'PIT Date', 'Frequency', 'UpdateCode', 'FiscalPeriod', 'FYE Month',
        'ItemCode', 'Value', 'Str_FiscalPrd'
    ]

    # Period-related columns (Dates and Values)
    freq_cols = []
    for i in range(1, 5):
        freq_cols += [f'Q{i}_Date', f'Q{i}']
    for i in range(1, 3):
        freq_cols += [f'S{i}_Date', f'S{i}']
    for i in range(1, 4):
        freq_cols += [f'T{i}_Date', f'T{i}']
    freq_cols += ['A_Date', 'A']

    # Columns we want to keep in the final output
    keep_cols = (
        [c for c in base_cols if c in working.columns] +
        ['TrueValue', 'AnnPITValue', 'AnnPITValue_Pct', 'HasFutureDateError'] +
        [c for c in freq_cols if c in working.columns]
    )

    # Drop helper columns such as OriginFP and intermediate numeric helpers
    drop_cols = [c for c in working.columns
                 if c.endswith('_OriginFP') or c in ['QNUM', 'SNUM', 'TNUM', 'TrueValue_Date']]
    working.drop(columns=drop_cols, inplace=True, errors='ignore')

    # Reorder to the final column set
    # (reindex also drops every column that is not listed in keep_cols)
    special_processed = working.reindex(columns=keep_cols)

    # -------------------------------------------------------------------------
    # 8) Save outputs (requires Temp_file_path_DP and base_output_filename)
    # -------------------------------------------------------------------------
    # Both variables must be defined in a previous setup cell
    assert 'Temp_file_path_DP' in globals(), "Temp_file_path_DP not found."
    assert 'base_output_filename' in globals(), "base_output_filename not found (set in Cell 0)."

    # Full export path and write to pipe-delimited text
    out_full = os.path.join(Temp_file_path_DP, f"{base_output_filename}.txt")
    special_processed.to_csv(out_full, sep='|', index=False)
    print("\nSaved full:", out_full)

    # Subset export with a small selection of columns
    subset_cols = ["ID", "PIT Date", "CompanyName", "HistCurrency", "FiscalPeriod", "AnnPITValue"]
    subset_cols_existing = [col for col in subset_cols if col in special_processed.columns]
    subset_df = special_processed[subset_cols_existing].copy()
    out_subset = os.path.join(Temp_file_path_DP, f"{base_output_filename}_subset.txt")
    subset_df.to_csv(out_subset, sep='|', index=False)
    print("Saved subset:", out_subset)
    del subset_df  # free some memory

    # -------------------------------------------------------------------------
    # 9) Row-accounting overview
    # -------------------------------------------------------------------------
    output_rows = len(special_processed)
    print("\n=== Row Accounting ===")
    print(f"Input rows:                     {input_rows:,}")
    print(f"Excluded by Frequency (E/L/R/U):{excluded_rows:,}")
    print(f"Dropped by quality (Pct rules): {dropped_quality_rows:,}")
    print(f"Output rows (final):            {output_rows:,}")

    # Sanity check: excluded + dropped + final should equal original
    check_total = excluded_rows + dropped_quality_rows + output_rows
    print(f"Check: excluded + dropped + output = {check_total:,}")
    if check_total == input_rows:
        print("Row counts reconcile exactly.")
    else:
        print(f"Mismatch of {input_rows - check_total:+,} rows.")

    # Optional: trigger garbage collection (gc is imported in the setup cell)
    gc.collect()

else:
    # Nothing to process when special_encoded is not defined or is None
    print("special_encoded not found or None; skipping.")

Input dataset contains 1,979,139 rows before processing.


=== Future-date check (period dates > PIT Date) ===
Per-label violations: {'A_Date': 0, 'Q1_Date': 0, 'Q2_Date': 0, 'Q3_Date': 0, 'Q4_Date': 0, 'S1_Date': 0, 'S2_Date': 0, 'T1_Date': 0, 'T2_Date': 0, 'T3_Date': 0}
Rows with ANY future-dated period value: 0

=== AnnPITValue_Pct summary — BEFORE quality drop ===
         finite_rows: 1339411
                mean: 21564.019765097186
              median: 100.0
winsorized_mean_1pct: 99.09259777790926
                 p10: 100.0
                 p20: 100.0
                 p30: 100.0
                 p40: 100.0
                 p50: 100.0
                 p60: 100.0
                 p70: 100.0
                 p80: 100.0
                 p90: 100.0

Rows to drop due to AnnPITValue_Pct (±inf or >250 or <25): 22,262

=== AnnPITValue_Pct summary — AFTER quality drop ===
         finite_rows: 1317149
                mean: 100.06684577111258
              median: 100.0
winsorized_mean_1p

### Special 11

#### Set Index

In [299]:
# =============================================================================
# SELECT A SINGLE SPECIAL_* ITEM AND PREPARE PATHS
# =============================================================================
# This cell:
#   1. Chooses which Special_* item (from special_vars) should be processed.
#   2. Validates that special_vars and Temp_file_path_DP are available.
#   3. Builds the input file path for the selected "work_subset_<item>.txt".
#   4. Sets a base_output_filename for downstream output files.
#   5. Ensures the data-preparation temp directory exists.
#
# Usage:
#   - Adjust `special_index` to run a different Special_* dataset (e.g., 2, 3, 10 ...).
#   - Assumes `special_vars` was created in the categorization step and
#     `Temp_file_path_DP` was defined in the environment setup.

# === Select which Special_* item to run ===
# All names assigned in this cell are notebook globals read by later cells.
special_index = 11  # Change this to run another dataset, e.g. 10

# special_vars should look like: {'Special_1': 'SomeItem', 'Special_2': 'OtherItem', ...}
assert 'special_vars' in globals(), "special_vars dict not found in globals()."

# Build the key for the chosen index and find the corresponding item name
item_key = f"Special_{special_index}"
target_item_name = special_vars.get(item_key)
# Fails both when the key is absent and when its value is empty/None
assert target_item_name, f"{item_key} not found in special_vars."

print(f"Selected: {item_key}  ->  ItemName: '{target_item_name}'")

# === Paths (reusing the globals from the setup cell) ===
assert 'Temp_file_path_DP' in globals(), "Temp_file_path_DP not found."

# Input file for this item (produced by previous merging steps)
file_name = f"work_subset_{target_item_name}.txt"
file_path = os.path.join(Temp_file_path_DP, file_name)

# Base name for output files created by the "Special" pipeline
# (the annualization cell appends ".txt" and "_subset.txt" to it)
base_output_filename = f"Special_{target_item_name}_complete"

# Make sure the output directory exists (no error if it already does)
Path(Temp_file_path_DP).mkdir(parents=True, exist_ok=True)


Selected: Special_11  ->  ItemName: 'Net_Cash_Flow___Operating_Activities'


#### Import relevant data



In [300]:
# =============================================================================
# LOAD THE FULL DATASET FOR THE SELECTED SPECIAL ITEM
# =============================================================================
# This cell:
#   1. Uses `target_item_name` and `file_path` (defined in the previous cell)
#      to load the corresponding work_subset file.
#   2. Imports the file using `import_file_to_dataframe`.
#   3. Performs safety checks for existence and emptiness.
#   4. Shows a preview of the loaded dataset.
#   5. Falls back to an empty DataFrame if loading fails.
#   6. Runs garbage collection afterwards.
# =============================================================================

# Announce which item is about to be imported
print(f"\nImporting full dataset for Item: '{target_item_name}' ...")

if not os.path.exists(file_path):
    # Input file is missing: report it and continue with an empty frame
    print(f"File not found: {file_path}")
    special_raw = pd.DataFrame()

else:
    special_raw = import_file_to_dataframe(file_path)

    if special_raw is None or special_raw.empty:
        # Loader returned nothing usable: normalise to an empty frame
        print("Dataset appears empty or could not be loaded.")
        special_raw = pd.DataFrame()
    else:
        print(f"Full dataset loaded successfully: {len(special_raw):,} rows total.")
        # Rich preview when display() works, plain-text preview otherwise
        try:
            display(special_raw.head())
        except Exception:
            print(special_raw.head().to_string(index=False))

# Last expression of the cell: its return value is what the notebook echoes
gc.collect()



Importing full dataset for Item: 'Net_Cash_Flow___Operating_Activities' ...
Full dataset loaded successfully: 2,052,576 rows total.


Unnamed: 0,ID,CompanyName,ImplCountry,CurrentCurrency,HistCurrency,PIT Date,Frequency,UpdateCode,FiscalPeriod,FYE Month,ItemCode,Value
0,C02520200,ACINDAR INDUSTRIA ARGENTINA DE ACEROS SA,Argentina,Ars,Ars,1996-05-03,A,3,1994,June,4860,3.574132
1,C02520200,ACINDAR INDUSTRIA ARGENTINA DE ACEROS SA,Argentina,Ars,Ars,1996-05-03,A,3,1995,June,4860,66.404052
2,C02520200,ACINDAR INDUSTRIA ARGENTINA DE ACEROS SA,Argentina,Ars,Ars,1996-11-01,A,3,1996,June,4860,41.129327
3,C02520200,ACINDAR INDUSTRIA ARGENTINA DE ACEROS SA,Argentina,Ars,Ars,1997-10-31,A,3,1997,June,4860,-15.875458
4,C02520200,ACINDAR INDUSTRIA ARGENTINA DE ACEROS SA,Argentina,Ars,Ars,1998-12-04,A,3,1998,September,4860,111.195883


0

#### Encode Frequency Code (Check of output required!)

In [301]:
# =============================================================================
# SUMMARY
# =============================================================================
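# This cell:
#   - Defines last2(), which returns the last two digits of a number as a
#     zero-padded string (None for NaN).
#   - Defines add_str_fiscalprd(), which derives a period label column
#     "Str_FiscalPrd" (e.g. Y94, Q2Y94, S1Y94, T3Y94) from Frequency and
#     FiscalPeriod and rebuilds a four-digit fiscal year from that label.
#   - Compares the rebuilt year of annual (A/B) rows with the original
#     FiscalPeriod.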

def last2(n):
    """Return the last two digits of ``n`` as a zero-padded string.

    Missing input (NaN / None / NA) yields ``None``.
    """
    if pd.isna(n):
        return None
    return f"{int(n):04d}"[-2:]


def add_str_fiscalprd(df):
    """Add a ``Str_FiscalPrd`` label and rebuild ``FiscalPeriod`` as a year.

    The numeric ``FiscalPeriod`` code is decoded according to ``Frequency``
    (compared case-insensitively):

      * C/Q/E/R -> ``"Q<n>Y<yy>"`` with n = code % 4 + 1, year = code // 4
      * A/B     -> ``"Y<yy>"``     (the code is the year itself)
      * F/S     -> ``"S<n>Y<yy>"`` with n = code % 2 + 1, year = code // 2
      * K/T/L/U -> ``"T<n>Y<yy>"`` with n = code % 3 + 1, year = code // 3

    ``FiscalPeriod`` is then replaced by a four-digit year rebuilt from the
    two-digit suffix (yy >= 80 -> 19yy, otherwise 20yy).  For annual rows the
    rebuilt year is compared with the original value and any differences are
    printed, so years outside 1980-2079 show up in that report.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``Frequency`` and ``FiscalPeriod``; ``ID`` is additionally
        needed only when discrepancies have to be shown.

    Returns
    -------
    pd.DataFrame
        A copy of ``df`` with upper-cased ``Frequency``, the rebuilt
        ``FiscalPeriod`` and the new ``Str_FiscalPrd`` column.  Rows whose
        frequency is in none of the groups keep a missing label and get a
        missing ``FiscalPeriod``.
    """
    df = df.copy()
    df["Frequency"] = df["Frequency"].str.upper().fillna("")
    # Kept temporarily so the discrepancy report can show the original value.
    df["Original_FiscalPeriod"] = df["FiscalPeriod"]

    fp = pd.to_numeric(df["FiscalPeriod"], errors="coerce")

    m_quarter = df["Frequency"].isin(["C", "Q", "E", "R"])
    m_AB      = df["Frequency"].isin(["A", "B"])
    m_FS      = df["Frequency"].isin(["F", "S"])
    m_KTLU    = df["Frequency"].isin(["K", "T", "L", "U"])

    # Object dtype from the start: writing strings into a float64 NaN column
    # triggers pandas' incompatible-dtype FutureWarning, and a column that
    # stays float64 (no frequency matched) would break the `.str` call below.
    df["Str_FiscalPrd"] = pd.Series(np.nan, index=df.index, dtype=object)

    # Annual rows: the code is the year itself.
    if m_AB.any():
        df.loc[m_AB, "Str_FiscalPrd"] = (
            "Y" + fp[m_AB].apply(last2).fillna("")
        ).to_numpy()

    # Sub-annual rows: code = year * periods_per_year + (period_number - 1).
    # Only the rows of each group are decoded, so last2() is not run over the
    # whole frame once per group.  A non-numeric FiscalPeriod produces a label
    # without year digits, which becomes a missing FiscalPeriod further down.
    for prefix, mask, per_year in (("Q", m_quarter, 4),
                                   ("S", m_FS, 2),
                                   ("T", m_KTLU, 3)):
        if not mask.any():
            continue
        codes = fp[mask]
        part = ((codes % per_year) + 1).astype("Int64").astype(str)
        year = (codes // per_year).apply(last2).fillna("")
        df.loc[mask, "Str_FiscalPrd"] = (prefix + part + "Y" + year).to_numpy()

    # Rebuild the four-digit year from the two-digit suffix of the label.
    year_part = df["Str_FiscalPrd"].str.extract(r"Y(\d{2})", expand=False)
    year_numeric = pd.to_numeric(year_part, errors="coerce")
    # Century pivot: 80..99 -> 19xx, 00..79 -> 20xx.  NaN compares False and
    # stays NaN after the addition.
    century = np.where(year_numeric >= 80, 1900, 2000)
    df["ImplFiscPer_Calculated"] = year_numeric + century

    # Sanity check on annual rows: rebuilt year must equal the original one
    # (two missing values also count as equal).
    annual_rows = df[m_AB]
    calculated = annual_rows["ImplFiscPer_Calculated"]
    original = pd.to_numeric(annual_rows["Original_FiscalPeriod"], errors="coerce")
    agrees = (calculated == original) | (calculated.isna() & original.isna())
    discrepancy_rows = annual_rows[~agrees]

    if not discrepancy_rows.empty:
        print("\nDiscrepancies found between original FiscalPeriod and calculated ImplFiscPer for Annual (A, B) frequencies:")
        preview = discrepancy_rows[
            ['ID', 'Frequency', 'Original_FiscalPeriod', 'Str_FiscalPrd', 'ImplFiscPer_Calculated']
        ].head()
        try:
            display(preview)
        except NameError:
            # `display` only exists inside IPython/Jupyter sessions.
            print(preview.to_string(index=False))
        print(f"Total discrepancies found for Annual frequencies: {len(discrepancy_rows)}")
    else:
        print("\nNo discrepancies found between original FiscalPeriod and calculated ImplFiscPer for Annual (A, B) frequencies.")

    df["FiscalPeriod"] = df["ImplFiscPer_Calculated"]
    df = df.drop(columns=["Original_FiscalPeriod", "ImplFiscPer_Calculated"])

    return df


# =============================================================================
# Driver: encode special_raw (if it was loaded) into special_encoded
# =============================================================================
# The checks short-circuit left to right, so `special_raw` is only touched
# once it is known to exist and to be non-None.
if 'special_raw' not in globals() or special_raw is None or special_raw.empty:
    print("special_raw not found or empty. Cannot perform encoding.")
    special_encoded = None
else:
    print(f"Applying encoding to Special dataset for '{target_item_name}' ...")
    special_encoded = add_str_fiscalprd(special_raw)
    display(special_encoded.head())


Applying encoding to Special dataset for 'Net_Cash_Flow___Operating_Activities' ...


  df.loc[m_quarter, "Str_FiscalPrd"] = (



No discrepancies found between original FiscalPeriod and calculated ImplFiscPer for Annual (A, B) frequencies.


Unnamed: 0,ID,CompanyName,ImplCountry,CurrentCurrency,HistCurrency,PIT Date,Frequency,UpdateCode,FiscalPeriod,FYE Month,ItemCode,Value,Str_FiscalPrd
0,C02520200,ACINDAR INDUSTRIA ARGENTINA DE ACEROS SA,Argentina,Ars,Ars,1996-05-03,A,3,1994,June,4860,3.574132,Y94
1,C02520200,ACINDAR INDUSTRIA ARGENTINA DE ACEROS SA,Argentina,Ars,Ars,1996-05-03,A,3,1995,June,4860,66.404052,Y95
2,C02520200,ACINDAR INDUSTRIA ARGENTINA DE ACEROS SA,Argentina,Ars,Ars,1996-11-01,A,3,1996,June,4860,41.129327,Y96
3,C02520200,ACINDAR INDUSTRIA ARGENTINA DE ACEROS SA,Argentina,Ars,Ars,1997-10-31,A,3,1997,June,4860,-15.875458,Y97
4,C02520200,ACINDAR INDUSTRIA ARGENTINA DE ACEROS SA,Argentina,Ars,Ars,1998-12-04,A,3,1998,September,4860,111.195883,Y98


#### Annualize data with most recent information (Check of output required!)

In [302]:
# @title
# =============================================================================
# SUMMARY
# =============================================================================
# This script:
#   - Implements a fast "as-of join" between two DataFrames based on PIT dates
#     and key columns (asof_numpy).
#   - Provides helpers for percentile summaries and winsorized statistics.
#   - Builds annualized "AnnPITValue" values from:
#       * true annual data (A/B frequency) and
#       * sums of quarterly data (Q1..Q4) when available.
#   - Performs various quality checks (future-dated values, extreme percentages).
#   - Produces a processed "special_processed" DataFrame and saves:
#       * a full export and
#       * a subset export.
#   - Prints a row-accounting overview so drops and exclusions are transparent.
# =============================================================================


# ---------- Helper: fast as-of (right.PIT <= left.PIT) ----------
def _key(fr, cols):
    """
    Build a combined string key from multiple columns by concatenating their
    string representations row-wise with '||'.
    """
    # Convert all key columns to string and join them row-wise with '||'
    return fr[cols].astype(str).agg('||'.join, axis=1)


def asof_numpy(left_df: pd.DataFrame, right_df: pd.DataFrame, by_cols: list[str]):
    """
    For each row in left_df, find the latest (as-of) Value from right_df
    with matching by_cols and right_df['PIT Date'] <= left_df['PIT Date'].

    Rows that miss any of by_cols / 'PIT Date' (and, on the right side,
    'Value'), or whose 'PIT Date' cannot be parsed, never match.  Dates are
    compared at day granularity.

    Results are written by ROW POSITION of left_df, so the function works for
    any index on left_df (filtered, shuffled, non-integer or duplicated).

    Parameters
    ----------
    left_df : pd.DataFrame
        Rows to look values up for; needs by_cols and 'PIT Date'.
    right_df : pd.DataFrame
        Source rows; needs by_cols, 'PIT Date' and 'Value'.
    by_cols : list[str]
        Columns that must match exactly (compared via their string form).

    Returns
    -------
    out_vals : np.ndarray
        Array of float values (same length as left_df) containing the matched
        values from right_df (or NaN if none found).
    out_dates : np.ndarray
        Array of datetime64 values containing the matched dates (or NaT).
    """
    # Initialize output arrays with NaN/NaT
    out_vals  = np.full(len(left_df), np.nan, dtype='float64')
    out_dates = np.full(len(left_df), 'NaT', dtype='datetime64[ns]')

    # Required columns in left/right for the as-of join
    by_cols   = list(by_cols)
    left_req  = by_cols + ['PIT Date']
    right_req = by_cols + ['PIT Date', 'Value']

    # Masks of rows with all required fields present
    lmask = left_df[left_req].notna().all(axis=1).to_numpy()
    rmask = right_df[right_req].notna().all(axis=1).to_numpy()

    # If no valid rows on either side, return the all-missing outputs
    if not lmask.any() or not rmask.any():
        return out_vals, out_dates

    # Work on copies of the filtered frames.  `left_pos` holds the POSITION of
    # every kept left row inside left_df; index labels are not positions once
    # the frame has been filtered or re-indexed.
    left     = left_df.loc[lmask, left_req].copy()
    right    = right_df.loc[rmask, right_req].copy()
    left_pos = np.flatnonzero(lmask)

    # Normalize PIT dates to daily granularity
    left['PIT Date']  = pd.to_datetime(left['PIT Date'], errors='coerce').dt.floor('D')
    right['PIT Date'] = pd.to_datetime(right['PIT Date'], errors='coerce').dt.floor('D')

    # Dates that failed to parse became NaT above.  NaT sorts after every real
    # date, so leaving it in would make such rows pick up the last source value.
    left_ok = left['PIT Date'].notna().to_numpy()
    if not left_ok.all():
        left, left_pos = left.loc[left_ok], left_pos[left_ok]
    right_ok = right['PIT Date'].notna().to_numpy()
    if not right_ok.all():
        right = right.loc[right_ok].copy()
    if left.empty or right.empty:
        return out_vals, out_dates

    # Sort right side by key and PIT Date so each key forms one contiguous,
    # date-ordered slice that can be binary-searched
    right['__k'] = _key(right, by_cols)
    right = right.sort_values(['__k', 'PIT Date']).reset_index(drop=True)

    rk   = right['__k'].to_numpy()
    rdt  = right['PIT Date'].to_numpy()
    rval = right['Value'].to_numpy()

    # Contiguous [start, end) slice of the right side for each unique key
    uniq, first = np.unique(rk, return_index=True)
    bounds = np.append(first, len(rk))
    slices = {
        k: (rdt[bounds[i]:bounds[i + 1]], rval[bounds[i]:bounds[i + 1]])
        for i, k in enumerate(uniq)
    }

    # Keys and dates of the left rows, grouped by key via a stable sort
    lk    = _key(left, by_cols).to_numpy()
    ldt   = left['PIT Date'].to_numpy()
    order = np.argsort(lk, kind='mergesort')
    sk, sd, sp = lk[order], ldt[order], left_pos[order]

    # Block boundaries: positions where the sorted key changes
    n = len(sk)
    change = np.flatnonzero(sk[1:] != sk[:-1]) + 1
    starts = np.concatenate(([0], change))
    ends   = np.concatenate((change, [n]))

    for s, e in zip(starts, ends):
        hit = slices.get(sk[s])
        if hit is None:
            continue  # key has no source rows on the right side
        r_dates, r_vals = hit

        # side='right' minus one -> last right date that is <= the left date
        pos   = np.searchsorted(r_dates, sd[s:e], side='right') - 1
        valid = pos >= 0  # at least one right date <= left date

        if valid.any():
            target = sp[s:e][valid]
            out_vals[target]  = r_vals[pos[valid]]
            out_dates[target] = r_dates[pos[valid]]

    return out_vals, out_dates


# ---------- Small statistics helpers ----------
def pctile(s, q):
    """
    Quantile ``q`` of ``s`` with linear interpolation; NaN if the call fails
    for any reason.
    """
    try:
        result = s.quantile(q, interpolation='linear')
    except Exception:
        result = np.nan
    return result


def summarize_pct(series: pd.Series):
    """
    Summarize the finite values of a numeric series.

    Returns an empty dict when no finite value is left; otherwise a dict with
    the count, mean, median, 1%-winsorized mean and the deciles p10..p90.
    """
    # Treat +/-inf like missing values, then keep only what is left
    finite = series.replace([np.inf, -np.inf], np.nan).dropna()
    if finite.empty:
        return {}

    # winsorize needs a writable array, hence the explicit copy
    clipped = winsorize(finite.to_numpy().copy(), limits=[0.01, 0.01])

    stats = {
        "finite_rows": len(finite),
        "mean": finite.mean(),
        "median": finite.median(),
        "winsorized_mean_1pct": clipped.mean(),
    }
    # Deciles p10 .. p90, in ascending order
    for decile in range(10, 100, 10):
        stats[f"p{decile}"] = pctile(finite, decile / 100)
    return stats


# ---------- Period prioritization ----------
# Priority ranking for period labels when deciding between multiple candidates
_PERIOD_PRIORITY = {
    'A': 100,  # Full annual has highest priority
    'Q4': 90,
    'T3': 80,
    'S2': 70,
    'Q3': 60,
    'T2': 50,
    'S1': 40,
    'Q2': 30,
    'T1': 20,
    'Q1': 10,
}


def _label_from_colname(colname: str) -> str:
    """
    Map column names to period labels used in _PERIOD_PRIORITY.
    Currently only special-cases 'A'.
    """
    return 'A' if colname == 'A' else colname


# ---------- Helpers for AnnPITValue using A + Q1..Q4 sum ----------
def full_year_from_quarters(row, pit, cutoff):
    """
    Try to assemble a full-year figure from the four quarter columns of a row.

    A result is only produced when every one of Q1..Q4 has a value, a date
    and an OriginFP, and every quarter date lies inside [cutoff, pit].

    Parameters
    ----------
    row : pd.Series
        Row providing 'Q<i>', 'Q<i>_Date' and 'Q<i>_OriginFP' for i = 1..4
        (read via ``row.get``; absent entries count as missing).
    pit : datetime-like
        Upper bound for the quarter dates (the row's PIT Date).
    cutoff : datetime-like
        Lower bound for the quarter dates.

    Returns
    -------
    (dt, val_sum, origin_fp)
        dt        : latest of the four quarter dates
        val_sum   : sum of the four quarter values
        origin_fp : largest OriginFP among the quarters (as int)
        or (NaT, NaN, NaN) as soon as any quarter fails a check.
    """
    incomplete = (pd.NaT, np.nan, np.nan)
    quarter_values, quarter_dates, origin_years = [], [], []

    for quarter in range(1, 5):
        name = f'Q{quarter}'
        value = row.get(name, np.nan)
        raw_date = row.get(f'{name}_Date', pd.NaT)
        origin = row.get(f'{name}_OriginFP', np.nan)

        # Every quarter needs value, date and origin fiscal period
        if pd.isna(value) or pd.isna(raw_date) or pd.isna(origin):
            return incomplete

        # The date must parse and fall inside the [cutoff, pit] window
        when = pd.to_datetime(raw_date, errors='coerce')
        if pd.isna(when):
            return incomplete
        inside_window = cutoff <= when <= pit
        if not inside_window:
            return incomplete

        quarter_values.append(float(value))
        quarter_dates.append(when)
        origin_years.append(int(origin))

    # All four quarters passed: aggregate them
    return (
        max(quarter_dates),
        float(np.nansum(quarter_values)),
        max(origin_years),
    )


def pick_annpit_sum_with_origin(row):
    """
    Compute AnnPITValue for one row from annual (A) and quarterly (Q1..Q4) data.

    Two candidates are considered, each only if its date lies within the
    365 days up to and including the row's PIT Date:

      * 'A'  : the as-of annual value (needs A, A_Date, A_OriginFP)
      * 'Q4' : the sum of Q1..Q4 as built by full_year_from_quarters()

    Selection order, based on how a candidate's OriginFP relates to the row's
    FiscalPeriod:

      1. A  with OriginFP == FiscalPeriod
      2. Q4 with OriginFP == FiscalPeriod
      3. A  with OriginFP == FiscalPeriod - 1
      4. Q4 with OriginFP == FiscalPeriod - 1
      5. otherwise the remaining candidate with the highest period priority
         (then the latest date)

    Parameters
    ----------
    row : pd.Series
        Row of the working frame; must contain 'PIT Date'.

    Returns
    -------
    float
        The selected value (0 is a legitimate value), or NaN when PIT Date is
        missing or no candidate qualifies.  A value is never invented.
    """
    pit = row['PIT Date']
    # Without a PIT Date there is no window to test candidates against
    if pd.isna(pit):
        return np.nan

    # Lower bound of the acceptance window (365 days before PIT)
    cutoff = pit - timedelta(days=365)

    # Fiscal period of the row as int; None when missing or not convertible
    fp = row.get('FiscalPeriod', np.nan)
    try:
        fp_int = None if pd.isna(fp) else int(fp)
    except (TypeError, ValueError, OverflowError):
        fp_int = None

    # label -> (date, value, origin_fp); at most one entry per label
    candidates = {}

    def consider(label, when, value, origin_fp):
        """Register a candidate unless its value turns out to be NaN."""
        value = float(value)
        if not np.isnan(value):
            candidates[label] = (when, value, int(origin_fp))

    # --- A: actual annual (0 is allowed) ---
    a_val = row.get('A', np.nan)
    a_dt  = row.get('A_Date', pd.NaT)
    a_ofp = row.get('A_OriginFP', np.nan)
    if pd.notna(a_val) and pd.notna(a_dt) and pd.notna(a_ofp):
        a_dt = pd.to_datetime(a_dt, errors='coerce')
        if pd.notna(a_dt) and (cutoff <= a_dt <= pit):
            consider('A', a_dt, a_val, a_ofp)

    # --- Q4 candidate: sum of Q1..Q4 (0 is allowed) ---
    q_dt, q_val, q_ofp = full_year_from_quarters(row, pit, cutoff)
    if pd.notna(q_val) and pd.notna(q_dt) and pd.notna(q_ofp):
        consider('Q4', q_dt, q_val, q_ofp)

    if not candidates:
        return np.nan

    def relation(origin_fp):
        """Relate a candidate's origin year to the row's FiscalPeriod."""
        if fp_int is None:
            return 'unknown'
        if origin_fp == fp_int:
            return 'same'
        if origin_fp == fp_int - 1:
            return 'prior'
        return 'other'

    # Steps 1-4: same-year candidates beat prior-year ones; within a year the
    # annual figure beats the quarterly sum.
    for wanted in ('same', 'prior'):
        for label in ('A', 'Q4'):
            candidate = candidates.get(label)
            if candidate is not None and relation(candidate[2]) == wanted:
                return candidate[1]

    # Step 5: origin year is 'other'/'unknown' -> rank by priority, then date
    best = max(candidates, key=lambda lbl: (_PERIOD_PRIORITY[lbl], candidates[lbl][0]))
    return candidates[best][1]


# ============================ MAIN ============================
# Pipeline driver for one Special_* item. Reads `special_encoded`, attaches
# as-of period values (A, Q1..Q4, S1..S2, T1..T3), derives AnnPITValue,
# runs the quality checks, writes the full and the subset export and prints
# a row-accounting overview. Skipped when `special_encoded` is missing/None.
if 'special_encoded' in globals() and special_encoded is not None:
    # Remember the number of input rows for row-accounting
    input_rows = len(special_encoded)
    print(f"Input dataset contains {input_rows:,} rows before processing.\n")

    # Work on a copy so we do not mutate the original DataFrame
    working = special_encoded.copy()

    # Exclude certain frequencies (E, L, R, U) from further processing
    excl_mask = working['Frequency'].astype(str).str.upper().isin(['E', 'L', 'R', 'U'])
    excluded_rows = int(excl_mask.sum())
    working = working.loc[~excl_mask].copy()

    # Convert key columns to appropriate types
    # (values that do not match the expected format become NaT / NaN)
    working['PIT Date'] = pd.to_datetime(
        working['PIT Date'], format='%Y-%m-%d', errors='coerce'
    ).dt.floor('D')
    working['FiscalPeriod'] = pd.to_numeric(working['FiscalPeriod'], errors='coerce')
    working['Value']        = pd.to_numeric(working['Value'], errors='coerce')

    # Ensure some ID-like columns are strings
    for c in ['ID', 'HistCurrency', 'ItemCode', 'Frequency', 'Str_FiscalPrd']:
        if c in working.columns:
            working[c] = working[c].astype(str)

    # Parse Q/S/T numbers from Str_FiscalPrd (e.g. 'Q1Y23' -> QNUM=1)
    # Labels that do not start with the respective prefix yield NaN.
    working['QNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^Q([1-4])Y', expand=False),
        errors='coerce'
    )
    working['SNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^S([1-2])Y', expand=False),
        errors='coerce'
    )
    working['TNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^T([1-3])Y', expand=False),
        errors='coerce'
    )

    # Define all period/value and period/date column names
    period_vals = [f'Q{i}' for i in range(1, 5)] + \
                  [f'S{i}' for i in range(1, 3)] + \
                  [f'T{i}' for i in range(1, 4)] + ['A']
    period_dates = [f'{p}_Date' for p in [f'Q{i}' for i in range(1,5)] + \
                                       [f'S{i}' for i in range(1,3)] + \
                                       [f'T{i}' for i in range(1,4)]] + ['A_Date']

    # Ensure all period value columns exist (initialize if missing)
    for c in period_vals:
        if c not in working.columns:
            working[c] = np.nan

    # Ensure all period date columns exist (initialize if missing)
    for c in period_dates:
        if c not in working.columns:
            working[c] = pd.NaT

    # Keys used to identify time series in as-of joins
    # (FiscalPeriod is part of the key, so values are only matched within
    # the same fiscal period)
    base_keys = ['ID', 'HistCurrency', 'ItemCode', 'FiscalPeriod']

    # -------------------------------------------------------------------------
    # 1) TrueValue from annuals: build reference "TrueValue" per ID/FiscalPeriod
    # -------------------------------------------------------------------------
    # For every (ID, FiscalPeriod, HistCurrency) keep the annual value with the
    # latest PIT Date (sort ascending, then keep='last').
    mask_annual = working['Frequency'].isin(['A', 'B']) & working['Value'].notna()
    annual_src = (
        working.loc[mask_annual,
                    ['ID', 'FiscalPeriod', 'HistCurrency', 'PIT Date', 'Value']]
        .sort_values(['ID', 'FiscalPeriod', 'HistCurrency', 'PIT Date'])
        .drop_duplicates(['ID', 'FiscalPeriod', 'HistCurrency'], keep='last')
        .rename(columns={'Value': 'TrueValue', 'PIT Date': 'TrueValue_Date'})
    )
    # Merge TrueValue back on keys
    # Left merge: annual_src holds one row per key after drop_duplicates, so
    # the row count of `working` is unchanged by this step.
    working = working.merge(
        annual_src,
        on=['ID', 'FiscalPeriod', 'HistCurrency'],
        how='left'
    )

    # -------------------------------------------------------------------------
    # 2) As-of mapping for each frequency (no prior-year / no forward-fill)
    # -------------------------------------------------------------------------

    # Annual (A/B) as-of
    src_A = working.loc[
        working['Frequency'].isin(['A', 'B']) & working['Value'].notna(),
        base_keys + ['PIT Date', 'Value']
    ].copy()
    vA, dA = asof_numpy(working, src_A, by_cols=base_keys)
    working['A'], working['A_Date'] = vA, dA
    # Origin fiscal period for A
    # (equals the row's own FiscalPeriod because FiscalPeriod is an as-of key)
    working['A_OriginFP'] = np.where(
        working['A'].notna(), working['FiscalPeriod'], np.nan
    )

    # Quarterly (Q/C) as-of, by quarter number
    src_Q = working.loc[
        working['Frequency'].isin(['Q', 'C']) & working['QNUM'].notna(),
        base_keys + ['QNUM', 'PIT Date', 'Value']
    ].copy()
    for q in (1, 2, 3, 4):
        # Subset source to a specific quarter
        rv = src_Q[src_Q['QNUM'] == q].drop(columns=['QNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'Q{q}', f'Q{q}_Date'
        working[col], working[dcol] = v, d

        # Set OriginFP where we have a newly filled quarter
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        mask = working[col].notna() & working[ocol].isna()
        working.loc[mask, ocol] = working.loc[mask, 'FiscalPeriod']

    # Semiannual (S/F) as-of, by half-year number
    # (same pattern as the quarterly block above)
    src_S = working.loc[
        working['Frequency'].isin(['S', 'F']) & working['SNUM'].notna(),
        base_keys + ['SNUM', 'PIT Date', 'Value']
    ].copy()
    for s in (1, 2):
        rv = src_S[src_S['SNUM'] == s].drop(columns=['SNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'S{s}', f'S{s}_Date'
        working[col], working[dcol] = v, d

        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        mask = working[col].notna() & working[ocol].isna()
        working.loc[mask, ocol] = working.loc[mask, 'FiscalPeriod']

    # Trimester (T/K) as-of, by term number
    # (same pattern as the quarterly block above)
    src_T = working.loc[
        working['Frequency'].isin(['T', 'K']) & working['TNUM'].notna(),
        base_keys + ['TNUM', 'PIT Date', 'Value']
    ].copy()
    for t in (1, 2, 3):
        rv = src_T[src_T['TNUM'] == t].drop(columns=['TNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'T{t}', f'T{t}_Date'
        working[col], working[dcol] = v, d

        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        mask = working[col].notna() & working[ocol].isna()
        working.loc[mask, ocol] = working.loc[mask, 'FiscalPeriod']

    # -------------------------------------------------------------------------
    # 3) Prepare labels and normalize dates (only as-of results, no ffill)
    # -------------------------------------------------------------------------
    # Sorting happens only after all as-of results have been assigned.
    working = working.sort_values(['ID', 'HistCurrency', 'FiscalPeriod', 'PIT Date'])

    value_labels  = period_vals
    date_labels   = period_dates
    origin_labels = [f'{lbl}_OriginFP' for lbl in value_labels]

    # Ensure all date columns are valid datetimes at day precision
    for c in date_labels:
        if c in working.columns:
            working[c] = pd.to_datetime(working[c], errors='coerce').dt.floor('D')

    # -------------------------------------------------------------------------
    # 4) AnnPITValue with new logic (A + Q1..Q4 sum, zeros allowed)
    # -------------------------------------------------------------------------
    # Row-wise Python call; only the A and Q1..Q4 columns feed AnnPITValue
    # (S* and T* columns are exported but not used by the picker).
    working['AnnPITValue'] = working.apply(
        pick_annpit_sum_with_origin,
        axis=1
    )

    # -------------------------------------------------------------------------
    # 5) QC: Future-date check (period date > PIT Date)
    # -------------------------------------------------------------------------
    date_cols_all = [
        'A_Date',
        'Q1_Date', 'Q2_Date', 'Q3_Date', 'Q4_Date',
        'S1_Date', 'S2_Date',
        'T1_Date', 'T2_Date', 'T3_Date'
    ]
    # Only use date columns that actually exist
    present = [c for c in date_cols_all if c in working.columns]
    viol_counts = {}
    any_mask = pd.Series(False, index=working.index)

    # For each period date column, check if it's after PIT Date
    for c in present:
        m = (
            working[c].notna() &
            working['PIT Date'].notna() &
            (pd.to_datetime(working[c], errors='coerce') > working['PIT Date'])
        )
        # Count violations per column
        viol_counts[c] = int(m.sum())
        # Track rows with any violation across all period dates
        any_mask |= m

    total_future_viol = int(any_mask.sum())
    print("\n=== Future-date check (period dates > PIT Date) ===")
    print("Per-label violations:", viol_counts)
    print(f"Rows with ANY future-dated period value: {total_future_viol}")
    # Flag rows with at least one future-date violation
    # (rows are only flagged here, not removed)
    working['HasFutureDateError'] = any_mask

    # -------------------------------------------------------------------------
    # 6) AnnPITValue_Pct and quality filter
    # -------------------------------------------------------------------------
    # Percentage of AnnPITValue relative to TrueValue (%)
    # NaN when either value is missing or TrueValue is 0.
    working['AnnPITValue_Pct'] = np.where(
        working['AnnPITValue'].notna() &
        working['TrueValue'].notna() &
        (working['TrueValue'] != 0),
        (working['AnnPITValue'] / working['TrueValue']) * 100,
        np.nan
    )

    # Summary BEFORE dropping outliers
    pre_stats = summarize_pct(working['AnnPITValue_Pct'])
    print("\n=== AnnPITValue_Pct summary — BEFORE quality drop ===")
    for k, v in pre_stats.items():
        print(f"{k:>20}: {v}")

    pct = working['AnnPITValue_Pct']
    # Flag infinities
    is_inf = np.isinf(pct)
    # Flag finite out-of-range values outside [25, 250]
    is_finite = np.isfinite(pct)
    out_of_range = is_finite & ((pct > 250) | (pct < 25))
    # Combined drop mask: infinities or out-of-range finite values
    # (rows whose percentage is NaN are neither infinite nor finite and
    # therefore stay in the data)
    to_drop_quality = is_inf | out_of_range

    # Count dropped rows due to quality rules
    dropped_quality_rows = int(to_drop_quality.sum())
    print(f"\nRows to drop due to AnnPITValue_Pct (±inf or >250 or <25): {dropped_quality_rows:,}")

    # Keep only rows that pass the quality filter
    working = working.loc[~to_drop_quality].copy()

    # Summary AFTER dropping outliers
    post_stats = summarize_pct(working['AnnPITValue_Pct'])
    print("\n=== AnnPITValue_Pct summary — AFTER quality drop ===")
    if post_stats:
        for k, v in post_stats.items():
            print(f"{k:>20}: {v}")
    else:
        print("No finite values remain after the quality drop.")

    # -------------------------------------------------------------------------
    # 7) Final columns and cleanup
    # -------------------------------------------------------------------------
    # Core columns that describe each row
    base_cols = [
        'ID', 'CompanyName', 'ImplCountry', 'CurrentCurrency', 'HistCurrency',
        'PIT Date', 'Frequency', 'UpdateCode', 'FiscalPeriod', 'FYE Month',
        'ItemCode', 'Value', 'Str_FiscalPrd'
    ]

    # Period-related columns (Dates and Values)
    freq_cols = []
    for i in range(1, 5):
        freq_cols += [f'Q{i}_Date', f'Q{i}']
    for i in range(1, 3):
        freq_cols += [f'S{i}_Date', f'S{i}']
    for i in range(1, 4):
        freq_cols += [f'T{i}_Date', f'T{i}']
    freq_cols += ['A_Date', 'A']

    # Columns we want to keep in the final output
    # (computed before the helper columns are dropped below)
    keep_cols = (
        [c for c in base_cols if c in working.columns] +
        ['TrueValue', 'AnnPITValue', 'AnnPITValue_Pct', 'HasFutureDateError'] +
        [c for c in freq_cols if c in working.columns]
    )

    # Drop helper columns such as OriginFP and intermediate numeric helpers
    drop_cols = [c for c in working.columns
                 if c.endswith('_OriginFP') or c in ['QNUM', 'SNUM', 'TNUM', 'TrueValue_Date']]
    working.drop(columns=drop_cols, inplace=True, errors='ignore')

    # Reorder to the final column set
    special_processed = working.reindex(columns=keep_cols)

    # -------------------------------------------------------------------------
    # 8) Save outputs (requires Temp_file_path_DP and base_output_filename)
    # -------------------------------------------------------------------------
    # Both variables must be defined in a previous setup cell
    # NOTE(review): these checks only run after all the processing above; a
    # missing variable is therefore detected late.
    assert 'Temp_file_path_DP' in globals(), "Temp_file_path_DP not found."
    assert 'base_output_filename' in globals(), "base_output_filename not found (set in Cell 0)."

    # Full export path and write to pipe-delimited text
    out_full = os.path.join(Temp_file_path_DP, f"{base_output_filename}.txt")
    special_processed.to_csv(out_full, sep='|', index=False)
    print("\nSaved full:", out_full)

    # Subset export with a small selection of columns
    subset_cols = ["ID", "PIT Date", "CompanyName", "HistCurrency", "FiscalPeriod", "AnnPITValue"]
    subset_cols_existing = [col for col in subset_cols if col in special_processed.columns]
    subset_df = special_processed[subset_cols_existing].copy()
    out_subset = os.path.join(Temp_file_path_DP, f"{base_output_filename}_subset.txt")
    subset_df.to_csv(out_subset, sep='|', index=False)
    print("Saved subset:", out_subset)
    del subset_df  # free some memory

    # -------------------------------------------------------------------------
    # 9) Row-accounting overview
    # -------------------------------------------------------------------------
    output_rows = len(special_processed)
    print("\n=== Row Accounting ===")
    print(f"Input rows:                     {input_rows:,}")
    print(f"Excluded by Frequency (E/L/R/U):{excluded_rows:,}")
    print(f"Dropped by quality (Pct rules): {dropped_quality_rows:,}")
    print(f"Output rows (final):            {output_rows:,}")

    # Sanity check: excluded + dropped + final should equal original
    check_total = excluded_rows + dropped_quality_rows + output_rows
    print(f"Check: excluded + dropped + output = {check_total:,}")
    if check_total == input_rows:
        print("Row counts reconcile exactly.")
    else:
        print(f"Mismatch of {input_rows - check_total:+,} rows.")

    # Optional: trigger garbage collection (gc is imported in the setup cell)
    gc.collect()

else:
    # Nothing to do if special_encoded is not defined or is None
    print("special_encoded not found or None; skipping.")

Input dataset contains 2,052,576 rows before processing.


=== Future-date check (period dates > PIT Date) ===
Per-label violations: {'A_Date': 0, 'Q1_Date': 0, 'Q2_Date': 0, 'Q3_Date': 0, 'Q4_Date': 0, 'S1_Date': 0, 'S2_Date': 0, 'T1_Date': 0, 'T2_Date': 0, 'T3_Date': 0}
Rows with ANY future-dated period value: 0

=== AnnPITValue_Pct summary — BEFORE quality drop ===
         finite_rows: 1411558
                mean: 20985.548626357453
              median: 100.0
winsorized_mean_1pct: 99.251528170489
                 p10: 100.0
                 p20: 100.0
                 p30: 100.0
                 p40: 100.0
                 p50: 100.0
                 p60: 100.0
                 p70: 100.0
                 p80: 100.0
                 p90: 100.0

Rows to drop due to AnnPITValue_Pct (±inf or >250 or <25): 23,166

=== AnnPITValue_Pct summary — AFTER quality drop ===
         finite_rows: 1388392
                mean: 100.13877387747178
              median: 100.0
winsorized_mean_1pct

### Special 12

#### Set Index

In [303]:
# =============================================================================
# SELECT A SINGLE SPECIAL_* ITEM AND PREPARE PATHS
# =============================================================================
# This cell:
#   1. Chooses which Special_* item (from special_vars) should be processed.
#   2. Validates that special_vars and Temp_file_path_DP are available.
#   3. Builds the input file path for the selected "work_subset_<item>.txt".
#   4. Sets a base_output_filename for downstream output files.
#   5. Ensures the data-preparation temp directory exists.
#
# Usage:
#   - Adjust `special_index` to run a different Special_* dataset (e.g., 2, 3, 10 ...).
#   - Assumes `special_vars` was created in the categorization step and
#     `Temp_file_path_DP` was defined in the environment setup.

# === Select which Special_* item to run ===
# The only value meant to be edited by hand in this cell.
special_index = 12  # Change this to run another dataset, e.g. 10

# special_vars should look like: {'Special_1': 'SomeItem', 'Special_2': 'OtherItem', ...}
# It is not created here; fail early with a clear message if it is missing.
assert 'special_vars' in globals(), "special_vars dict not found in globals()."

# Build the key for the chosen index and find the corresponding item name
# (.get returns None for an unknown key, so the assert below rejects both a
# missing key and an empty item name)
item_key = f"Special_{special_index}"
target_item_name = special_vars.get(item_key)
assert target_item_name, f"{item_key} not found in special_vars."

print(f"Selected: {item_key}  ->  ItemName: '{target_item_name}'")

# === Paths (reusing your globals) ===
assert 'Temp_file_path_DP' in globals(), "Temp_file_path_DP not found."

# Input file for this item (produced by previous merging steps)
# file_path is what the import cell checks for and loads.
file_name = f"work_subset_{target_item_name}.txt"
file_path = os.path.join(Temp_file_path_DP, file_name)

# Base name for output files created by the "Special" pipeline
# (the annualization cell appends ".txt" and "_subset.txt" to it)
base_output_filename = f"Special_{target_item_name}_complete"

# Make sure the output directory exists (no error if it already does)
Path(Temp_file_path_DP).mkdir(parents=True, exist_ok=True)


Selected: Special_12  ->  ItemName: 'Net_Proceeds_From_Sale_Issue_of_Com__Pfd'


#### Import relevant data



In [304]:
# =============================================================================
# LOAD THE FULL DATASET FOR THE SELECTED SPECIAL ITEM
# =============================================================================
# This cell:
#   1. Uses `target_item_name` and `file_path` (defined in the previous cell)
#      to load the corresponding work_subset file.
#   2. Imports the file using `import_file_to_dataframe`.
#   3. Performs safety checks for existence and emptiness.
#   4. Shows a preview of the loaded dataset.
#   5. Falls back to an empty DataFrame if loading fails.
#   6. Runs garbage collection afterwards.
# =============================================================================

print(f"\nImporting full dataset for Item: '{target_item_name}' ...")

# Guard first: without the input file there is nothing to import, so fall
# back to an empty frame that downstream cells can detect via `.empty`.
if not os.path.exists(file_path):
    print(f"File not found: {file_path}")
    special_raw = pd.DataFrame()
else:
    special_raw = import_file_to_dataframe(file_path)

    if special_raw is None or special_raw.empty:
        # The importer returned nothing usable -> normalise to an empty frame.
        print("Dataset appears empty or could not be loaded.")
        special_raw = pd.DataFrame()
    else:
        print(f"Full dataset loaded successfully: {len(special_raw):,} rows total.")
        # Rich preview when `display` works; plain-text preview otherwise.
        try:
            display(special_raw.head())
        except Exception:
            print(special_raw.head().to_string(index=False))

# Last expression of the cell: its return value is what the notebook echoes.
gc.collect()



Importing full dataset for Item: 'Net_Proceeds_From_Sale_Issue_of_Com__Pfd' ...
Full dataset loaded successfully: 1,611,038 rows total.


Unnamed: 0,ID,CompanyName,ImplCountry,CurrentCurrency,HistCurrency,PIT Date,Frequency,UpdateCode,FiscalPeriod,FYE Month,ItemCode,Value
0,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1992,December,4251,0.0
1,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1993,December,4251,0.0
2,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1994,December,4251,0.0
3,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1996-05-03,A,3,1995,December,4251,0.0
4,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1998-07-03,A,3,1996,December,4251,0.0


0

#### Encode Frequency Code (Check of output required!)

In [305]:
# =============================================================================
# SUMMARY
# =============================================================================
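# This cell:
#   - Defines last2(), which formats the last two digits of a number as a
#     zero-padded string.
#   - Defines add_str_fiscalprd(), which derives a "Str_FiscalPrd" label from
#     Frequency and the numeric FiscalPeriod code:
#       * C/Q/E/R -> "Q<n>Y<yy>"  (n = code % 4 + 1, year = code // 4)
#       * A/B     -> "Y<yy>"      (code is the year itself)
#       * F/S     -> "S<n>Y<yy>"  (n = code % 2 + 1, year = code // 2)
#       * K/T/L/U -> "T<n>Y<yy>"  (n = code % 3 + 1, year = code // 3)
#     and rebuilds a four-digit year from the two-digit suffix
#     (yy >= 80 -> 19yy, otherwise 20yy).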

def last2(n):
    """Return the last two digits of ``n`` as a zero-padded string.

    Parameters
    ----------
    n : number or NaN-like
        Value whose integer part supplies the digits (e.g. a four-digit year).

    Returns
    -------
    str or None
        Two-character string such as ``"92"`` or ``"05"``; ``None`` when
        ``n`` is missing according to ``pd.isna``.
    """
    if pd.isna(n):
        return None
    return f"{int(n):04d}"[-2:]


def add_str_fiscalprd(df):
    """Add a ``Str_FiscalPrd`` label and rewrite ``FiscalPeriod`` as a year.

    The label is built from ``Frequency`` (upper-cased) and the numeric
    ``FiscalPeriod``:

      * A, B        -> ``'Y' + yy``                      (yy from FiscalPeriod)
      * C, Q, E, R  -> ``'Q' + (fp % 4 + 1) + 'Y' + yy`` (yy from fp // 4)
      * F, S        -> ``'S' + (fp % 2 + 1) + 'Y' + yy`` (yy from fp // 2)
      * K, T, L, U  -> ``'T' + (fp % 3 + 1) + 'Y' + yy`` (yy from fp // 3)

    Rows with any other frequency keep a missing label.

    ``FiscalPeriod`` is then replaced by the four-digit year implied by the
    two-digit ``yy``: values >= 80 map to 19yy, all others to 20yy. For
    annual rows (A/B) the implied year is compared with the original
    ``FiscalPeriod`` and mismatches are printed (for example, an original
    year below 1980 is mapped into the 2000s and therefore reported).

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``Frequency`` and ``FiscalPeriod``; ``ID`` is required
        only when discrepancies have to be displayed.

    Returns
    -------
    pd.DataFrame
        A copy of ``df`` with ``Str_FiscalPrd`` added and ``FiscalPeriod``
        overwritten. The input frame is not modified.
    """
    df = df.copy()
    df["Frequency"] = df["Frequency"].str.upper().fillna("")
    df['Original_FiscalPeriod'] = df['FiscalPeriod']

    fp = pd.to_numeric(df["FiscalPeriod"], errors="coerce")

    m_quarter = df["Frequency"].isin(["C", "Q", "E", "R"])
    m_AB      = df["Frequency"].isin(["A", "B"])
    m_FS      = df["Frequency"].isin(["F", "S"])
    m_KTLU    = df["Frequency"].isin(["K", "T", "L", "U"])

    # Create the label column with object dtype up front. Starting from a
    # float64 NaN column forces pandas to upcast on the first string
    # assignment (FutureWarning) and breaks `.str` when no label is written.
    df["Str_FiscalPrd"] = pd.Series(np.nan, index=df.index, dtype="object")

    # Annual rows: the fiscal period already is the year.
    ab_year = fp.where(m_AB).apply(last2)
    df.loc[m_AB, "Str_FiscalPrd"] = "Y" + ab_year.fillna('')

    # Sub-annual rows: FiscalPeriod encodes year * periods_per_year + index.
    sub_annual = (
        ("Q", 4, m_quarter),
        ("S", 2, m_FS),
        ("T", 3, m_KTLU),
    )
    for prefix, per_year, mask in sub_annual:
        part = ((fp % per_year) + 1).where(mask)
        year = (fp // per_year).where(mask).apply(last2)
        df.loc[mask, "Str_FiscalPrd"] = (
            prefix + part.astype("Int64").astype(str) + "Y" + year.fillna('')
        )

    year_part = df['Str_FiscalPrd'].str.extract(r'Y(\d{2})', expand=False)
    year_numeric = pd.to_numeric(year_part, errors='coerce')

    # Two-digit pivot: 80..99 -> 1980..1999, 00..79 -> 2000..2079.
    # Missing years stay NaN (NaN >= 80 is False, NaN + 2000 is NaN).
    century = np.where(year_numeric >= 80, 1900, 2000)
    df['ImplFiscPer_Calculated'] = year_numeric + century

    # Sanity check on annual rows: implied year must equal the original one
    # (two missing values also count as agreement).
    annual_rows_for_check = df[m_AB].copy()
    calculated = annual_rows_for_check['ImplFiscPer_Calculated']
    original = fp[m_AB]
    agrees = (calculated == original) | (calculated.isna() & original.isna())
    discrepancy_rows = annual_rows_for_check[~agrees].copy()

    if not discrepancy_rows.empty:
        print("\nDiscrepancies found between original FiscalPeriod and calculated ImplFiscPer for Annual (A, B) frequencies:")
        preview = discrepancy_rows[
            ['ID', 'Frequency', 'Original_FiscalPeriod', 'Str_FiscalPrd', 'ImplFiscPer_Calculated']
        ].head()
        try:
            display(preview)
        except NameError:
            # `display` only exists under IPython/Jupyter; fall back to text.
            print(preview.to_string(index=False))
        print(f"Total discrepancies found for Annual frequencies: {len(discrepancy_rows)}")
    else:
        print("\nNo discrepancies found between original FiscalPeriod and calculated ImplFiscPer for Annual (A, B) frequencies.")

    df['FiscalPeriod'] = df['ImplFiscPer_Calculated']
    df.drop(columns=['Original_FiscalPeriod', 'ImplFiscPer_Calculated'], inplace=True)

    return df


# =============================================================================
# Driver: apply encoding to special_raw if present and non-empty
# =============================================================================
# Encode only when a non-empty `special_raw` is available; otherwise record
# the absence by setting `special_encoded` to None.
if 'special_raw' not in globals() or special_raw is None or special_raw.empty:
    print("special_raw not found or empty. Cannot perform encoding.")
    special_encoded = None
else:
    print(f"Applying encoding to Special dataset for '{target_item_name}' ...")
    special_encoded = add_str_fiscalprd(special_raw)
    display(special_encoded.head())


Applying encoding to Special dataset for 'Net_Proceeds_From_Sale_Issue_of_Com__Pfd' ...


  df.loc[m_quarter, "Str_FiscalPrd"] = (



No discrepancies found between original FiscalPeriod and calculated ImplFiscPer for Annual (A, B) frequencies.


Unnamed: 0,ID,CompanyName,ImplCountry,CurrentCurrency,HistCurrency,PIT Date,Frequency,UpdateCode,FiscalPeriod,FYE Month,ItemCode,Value,Str_FiscalPrd
0,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1992,December,4251,0.0,Y92
1,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1993,December,4251,0.0,Y93
2,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1994,December,4251,0.0,Y94
3,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1996-05-03,A,3,1995,December,4251,0.0,Y95
4,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1998-07-03,A,3,1996,December,4251,0.0,Y96


#### Annualize data with most recent information (Check of output required!)

In [306]:
# @title
# =============================================================================
# SUMMARY
# =============================================================================
# This script:
#   - Implements a fast "as-of join" between two DataFrames based on PIT dates
#     and key columns (asof_numpy).
#   - Provides helpers for percentile summaries and winsorized statistics.
#   - Builds annualized "AnnPITValue" values from:
#       * true annual data (A/B frequency) and
#       * sums of quarterly data (Q1..Q4) when available.
#   - Performs various quality checks (future-dated values, extreme percentages).
#   - Produces a processed "special_processed" DataFrame and saves:
#       * a full export and
#       * a subset export.
#   - Prints a row-accounting overview so drops and exclusions are transparent.
# =============================================================================


# ---------- Helper: fast as-of (right.PIT <= left.PIT) ----------
def _key(fr, cols):
    """
    Build one combined string key per row from several columns.

    Every column in ``cols`` is cast to ``str`` and the pieces are joined
    with '||' (e.g. 'C1' and 1992.0 become 'C1||1992.0').

    Returns a Series of strings that shares ``fr``'s index.
    """
    parts = fr[cols].astype(str)
    # Column-wise concatenation gives the same text as a row-wise
    # '||'.join but avoids one Python call per row.
    key = parts.iloc[:, 0]
    for pos in range(1, parts.shape[1]):
        key = key + '||' + parts.iloc[:, pos]
    return key.rename(None)


def _block_bounds(sorted_keys):
    """Return (starts, ends) of each run of equal values in a sorted array."""
    change = np.flatnonzero(sorted_keys[1:] != sorted_keys[:-1]) + 1
    starts = np.concatenate(([0], change))
    ends = np.concatenate((change, [len(sorted_keys)]))
    return starts, ends


def asof_numpy(left_df: pd.DataFrame, right_df: pd.DataFrame, by_cols: list[str]):
    """
    For each row in left_df, find the latest (as-of) Value from right_df
    with matching by_cols and right_df['PIT Date'] <= left_df['PIT Date'].

    Rows are addressed by position, so ``left_df`` may carry any index
    (filtered, duplicated, non-integer). Rows with a missing key, a missing
    or unparsable 'PIT Date', or (on the right) a missing 'Value' never
    take part in a match.

    Parameters
    ----------
    left_df : pd.DataFrame
        Must contain ``by_cols`` and 'PIT Date'.
    right_df : pd.DataFrame
        Must contain ``by_cols``, 'PIT Date' and 'Value'.
    by_cols : list[str]
        Columns that must match exactly between both frames.

    Returns
    -------
    out_vals : np.ndarray
        Array of float values (same length and order as left_df) containing
        the matched values from right_df (or NaN if none found).
    out_dates : np.ndarray
        Array of datetime64[ns] values containing the matched dates (or NaT).
    """
    # Initialize output arrays with NaN/NaT
    out_vals  = np.full(len(left_df), np.nan, dtype='float64')
    out_dates = np.full(len(left_df), 'NaT', dtype='datetime64[ns]')

    # Required columns in left/right for the as-of join
    left_req  = by_cols + ['PIT Date']
    right_req = by_cols + ['PIT Date', 'Value']

    # Rows with all required fields present
    lmask = left_df[left_req].notna().all(axis=1).to_numpy()
    rmask = right_df[right_req].notna().all(axis=1).to_numpy()
    if not lmask.any() or not rmask.any():
        return out_vals, out_dates

    # Positions (not index labels) of the usable left rows: the outputs are
    # plain arrays, so results must be written back positionally.
    l_pos = np.flatnonzero(lmask)
    l = left_df.loc[lmask, left_req].copy()
    r = right_df.loc[rmask, right_req].copy()

    # Normalize PIT dates to daily granularity
    l['PIT Date'] = pd.to_datetime(l['PIT Date'], errors='coerce').dt.floor('D')
    r['PIT Date'] = pd.to_datetime(r['PIT Date'], errors='coerce').dt.floor('D')

    # Coercion can create NaT. NaT sorts after every real date, so a NaT on
    # the left would otherwise match the newest right-hand row.
    l_ok = l['PIT Date'].notna().to_numpy()
    r_ok = r['PIT Date'].notna().to_numpy()
    if not l_ok.any() or not r_ok.any():
        return out_vals, out_dates
    l, l_pos = l.loc[l_ok].copy(), l_pos[l_ok]
    r = r.loc[r_ok].copy()

    # Build grouping keys on both sides
    l['__k'] = _key(l, by_cols)
    r['__k'] = _key(r, by_cols)

    # Sort right side by key and PIT Date so each key forms one contiguous,
    # date-ordered slice that can be binary-searched.
    r = r.sort_values(['__k', 'PIT Date']).reset_index(drop=True)
    rk   = r['__k'].to_numpy()
    rdt  = r['PIT Date'].to_numpy()
    rval = r['Value'].to_numpy()

    r_starts, r_ends = _block_bounds(rk)
    slices = {rk[s]: (rdt[s:e], rval[s:e]) for s, e in zip(r_starts, r_ends)}

    # Sort left rows by key (stable) and process one key block at a time
    lk    = l['__k'].to_numpy()
    ldt   = l['PIT Date'].to_numpy()
    order = np.argsort(lk, kind='mergesort')
    sk, sd, sp = lk[order], ldt[order], l_pos[order]

    l_starts, l_ends = _block_bounds(sk)
    for i, j in zip(l_starts, l_ends):
        match = slices.get(sk[i])
        if match is None:
            # Key never occurs on the right: leave NaN/NaT in place.
            continue
        r_dates, r_vals = match
        block_dates = sd[i:j]
        block_pos   = sp[i:j]

        # Index of the last right date <= each left date (-1 if none)
        pos   = np.searchsorted(r_dates, block_dates, side='right') - 1
        valid = pos >= 0

        if valid.any():
            out_vals[block_pos[valid]]  = r_vals[pos[valid]]
            out_dates[block_pos[valid]] = r_dates[pos[valid]]

    return out_vals, out_dates


# ---------- Small helpers ----------
def pctile(s, q):
    """
    Return the q-quantile of ``s`` (linear interpolation).

    Any exception raised while computing the quantile is swallowed and NaN
    is returned instead.
    """
    try:
        result = s.quantile(q, interpolation='linear')
    except Exception:
        result = np.nan
    return result


def summarize_pct(series: pd.Series):
    """
    Summarise the finite values of a numeric series.

    +/-inf and missing values are discarded first. Returns an empty dict
    when nothing remains; otherwise a dict with the number of finite rows,
    mean, median, the mean after 1%/1% winsorization, and the deciles
    p10..p90.
    """
    finite = series.replace([np.inf, -np.inf], np.nan).dropna()
    if finite.empty:
        return {}

    # Hand winsorize a private, writable copy of the data.
    clipped = winsorize(finite.to_numpy().copy(), limits=[0.01, 0.01])

    summary = {
        "finite_rows": len(finite),
        "mean": finite.mean(),
        "median": finite.median(),
        "winsorized_mean_1pct": clipped.mean(),
    }
    # Deciles p10 .. p90, in ascending order.
    summary.update({f"p{d}0": pctile(finite, d / 10) for d in range(1, 10)})
    return summary


# ---------- Period prioritization ----------
# Priority ranking for period labels when deciding between multiple candidates
# Rank of each period label; a larger number wins when several candidates
# compete. The full annual figure outranks every partial-year label.
_PERIOD_PRIORITY = dict(
    A=100,
    Q4=90,
    T3=80,
    S2=70,
    Q3=60,
    T2=50,
    S1=40,
    Q2=30,
    T1=20,
    Q1=10,
)


def _label_from_colname(colname: str) -> str:
    """
    Translate a column name into the period label used by _PERIOD_PRIORITY.

    Only 'A' is special-cased; every other name is returned unchanged.
    """
    if colname == 'A':
        return 'A'
    return colname


# ---------- Helpers for AnnPITValue using A + Q1..Q4 sum ----------
def full_year_from_quarters(row, pit, cutoff):
    """
    Assemble a full-year figure from the four quarterly columns of ``row``.

    For each of Q1..Q4 the value (``Qn``), its date (``Qn_Date``) and its
    origin fiscal period (``Qn_OriginFP``) must all be present, and the date
    must lie inside the closed window [cutoff, pit]. The first quarter that
    fails any of these checks aborts the whole computation.

    Parameters
    ----------
    row : mapping with ``.get`` (e.g. pd.Series)
        Source of the quarterly columns.
    pit : datetime-like
        Upper bound of the accepted date window.
    cutoff : datetime-like
        Lower bound of the accepted date window.

    Returns
    -------
    (latest_date, total, origin_fp)
        Latest of the four quarter dates, sum of the four values, and the
        largest origin fiscal period; ``(NaT, NaN, NaN)`` when any quarter
        is unusable.
    """
    failure = (pd.NaT, np.nan, np.nan)
    amounts, stamps, origins = [], [], []

    for quarter in range(1, 5):
        name = f'Q{quarter}'
        amount = row.get(name, np.nan)
        raw_date = row.get(f'{name}_Date', pd.NaT)
        origin = row.get(f'{name}_OriginFP', np.nan)

        # All three pieces must be present for this quarter.
        if pd.isna(amount) or pd.isna(raw_date) or pd.isna(origin):
            return failure

        # The date must parse and fall inside [cutoff, pit].
        stamp = pd.to_datetime(raw_date, errors='coerce')
        if pd.isna(stamp) or not (cutoff <= stamp <= pit):
            return failure

        amounts.append(float(amount))
        stamps.append(stamp)
        origins.append(int(origin))

    return max(stamps), float(np.nansum(amounts)), max(origins)


def pick_annpit_sum_with_origin(row):
    """
    Choose the annualized value (AnnPITValue) for one row.

    Two candidates are considered, each only if its date lies within the
    365 days up to and including the row's 'PIT Date':
      * 'A'  - the annual value in column 'A';
      * 'Q4' - the sum of Q1..Q4 from ``full_year_from_quarters``.

    Selection order (first hit wins):
      1. 'A'  whose origin fiscal period equals the row's FiscalPeriod
      2. 'Q4' whose origin fiscal period equals the row's FiscalPeriod
      3. 'A'  from the preceding fiscal period
      4. 'Q4' from the preceding fiscal period
      5. any remaining candidate, by (priority, date)

    Returns NaN when 'PIT Date' is missing or no candidate exists. A value
    of 0 is a legitimate result.
    """
    pit = row['PIT Date']
    if pd.isna(pit):
        return np.nan

    # Oldest date still accepted for a candidate.
    cutoff = pit - timedelta(days=365)

    # Row's fiscal period as int, or None when missing / not convertible.
    raw_fp = row.get('FiscalPeriod', np.nan)
    try:
        row_year = None if pd.isna(raw_fp) else int(raw_fp)
    except Exception:
        row_year = None

    # Each candidate: (label, priority, date, value, origin_fp)
    pool = []

    annual_val = row.get('A', np.nan)
    annual_dt = row.get('A_Date', pd.NaT)
    annual_fp = row.get('A_OriginFP', np.nan)
    if pd.notna(annual_val) and pd.notna(annual_dt) and not pd.isna(annual_fp):
        annual_dt = pd.to_datetime(annual_dt, errors='coerce')
        if pd.notna(annual_dt) and (cutoff <= annual_dt <= pit):
            pool.append(
                ('A', _PERIOD_PRIORITY['A'], annual_dt, float(annual_val), int(annual_fp))
            )

    sum_dt, sum_val, sum_fp = full_year_from_quarters(row, pit, cutoff)
    if pd.notna(sum_val) and pd.notna(sum_dt) and not pd.isna(sum_fp):
        pool.append(
            ('Q4', _PERIOD_PRIORITY['Q4'], sum_dt, float(sum_val), int(sum_fp))
        )

    if not pool:
        return np.nan

    def relation(origin_fp):
        """Classify a candidate's origin year relative to the row's year."""
        if row_year is None or origin_fp is None:
            return 'unknown'
        if origin_fp == row_year:
            return 'same'
        if origin_fp == row_year - 1:
            return 'prior'
        return 'other'

    def rank(candidate):
        return (candidate[1], candidate[2])

    # Candidates with a usable (non-NaN) value; zeros are kept.
    usable = [c for c in pool if not np.isnan(c[3])]

    # Tiers 1-4: within a tier the label (and thus the priority) is fixed,
    # so ranking by (priority, date) picks the latest date.
    tiers = (('A', 'same'), ('Q4', 'same'), ('A', 'prior'), ('Q4', 'prior'))
    for label, wanted in tiers:
        tier = [c for c in usable if c[0] == label and relation(c[4]) == wanted]
        if tier:
            return max(tier, key=rank)[3]

    # Tier 5: anything left ('other' / 'unknown' origin year).
    if usable:
        return max(usable, key=rank)[3]

    # Reached only if no candidate carried a usable value.
    return 0.0


# ============================ MAIN ============================
# Run the annualization pipeline only when the encoding cell produced a frame.
# The result is stored in `special_processed` and written to two text files.
if 'special_encoded' in globals() and special_encoded is not None:
    # Remember the number of input rows for row-accounting
    input_rows = len(special_encoded)
    print(f"Input dataset contains {input_rows:,} rows before processing.\n")

    # Work on a copy so we do not mutate the original DataFrame
    working = special_encoded.copy()

    # Exclude certain frequencies (E, L, R, U) from further processing
    excl_mask = working['Frequency'].astype(str).str.upper().isin(['E', 'L', 'R', 'U'])
    excluded_rows = int(excl_mask.sum())
    working = working.loc[~excl_mask].copy()

    # Convert key columns to appropriate types
    # (values that cannot be parsed become NaT / NaN via errors='coerce')
    working['PIT Date'] = pd.to_datetime(
        working['PIT Date'], format='%Y-%m-%d', errors='coerce'
    ).dt.floor('D')
    working['FiscalPeriod'] = pd.to_numeric(working['FiscalPeriod'], errors='coerce')
    working['Value']        = pd.to_numeric(working['Value'], errors='coerce')

    # Ensure some ID-like columns are strings
    for c in ['ID', 'HistCurrency', 'ItemCode', 'Frequency', 'Str_FiscalPrd']:
        if c in working.columns:
            working[c] = working[c].astype(str)

    # Parse Q/S/T numbers from Str_FiscalPrd (e.g. 'Q1Y23' -> QNUM=1)
    # Labels that do not match the pattern yield NaN in the helper column.
    working['QNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^Q([1-4])Y', expand=False),
        errors='coerce'
    )
    working['SNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^S([1-2])Y', expand=False),
        errors='coerce'
    )
    working['TNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^T([1-3])Y', expand=False),
        errors='coerce'
    )

    # Define all period/value and period/date column names
    period_vals = [f'Q{i}' for i in range(1, 5)] + \
                  [f'S{i}' for i in range(1, 3)] + \
                  [f'T{i}' for i in range(1, 4)] + ['A']
    period_dates = [f'{p}_Date' for p in [f'Q{i}' for i in range(1,5)] + \
                                       [f'S{i}' for i in range(1,3)] + \
                                       [f'T{i}' for i in range(1,4)]] + ['A_Date']

    # Ensure all period value columns exist (initialize if missing)
    for c in period_vals:
        if c not in working.columns:
            working[c] = np.nan

    # Ensure all period date columns exist (initialize if missing)
    for c in period_dates:
        if c not in working.columns:
            working[c] = pd.NaT

    # Keys used to identify time series in as-of joins
    base_keys = ['ID', 'HistCurrency', 'ItemCode', 'FiscalPeriod']

    # -------------------------------------------------------------------------
    # 1) TrueValue from annuals: build reference "TrueValue" per ID/FiscalPeriod
    # -------------------------------------------------------------------------
    # For each (ID, FiscalPeriod, HistCurrency) keep the annual row with the
    # latest PIT Date (sort ascending, then keep='last').
    mask_annual = working['Frequency'].isin(['A', 'B']) & working['Value'].notna()
    annual_src = (
        working.loc[mask_annual,
                    ['ID', 'FiscalPeriod', 'HistCurrency', 'PIT Date', 'Value']]
        .sort_values(['ID', 'FiscalPeriod', 'HistCurrency', 'PIT Date'])
        .drop_duplicates(['ID', 'FiscalPeriod', 'HistCurrency'], keep='last')
        .rename(columns={'Value': 'TrueValue', 'PIT Date': 'TrueValue_Date'})
    )
    # Merge TrueValue back on keys (left join: the merge does not drop rows)
    working = working.merge(
        annual_src,
        on=['ID', 'FiscalPeriod', 'HistCurrency'],
        how='left'
    )

    # -------------------------------------------------------------------------
    # 2) As-of mapping for each frequency (no prior-year / no forward-fill)
    # -------------------------------------------------------------------------
    # Every join below matches on base_keys, which include FiscalPeriod, so a
    # row only ever receives values reported for its own fiscal period.

    # Annual (A/B) as-of
    src_A = working.loc[
        working['Frequency'].isin(['A', 'B']) & working['Value'].notna(),
        base_keys + ['PIT Date', 'Value']
    ].copy()
    vA, dA = asof_numpy(working, src_A, by_cols=base_keys)
    working['A'], working['A_Date'] = vA, dA
    # Origin fiscal period for A: the row's own FiscalPeriod wherever an
    # annual value was matched, NaN otherwise.
    working['A_OriginFP'] = np.where(
        working['A'].notna(), working['FiscalPeriod'], np.nan
    )

    # Quarterly (Q/C) as-of, by quarter number
    src_Q = working.loc[
        working['Frequency'].isin(['Q', 'C']) & working['QNUM'].notna(),
        base_keys + ['QNUM', 'PIT Date', 'Value']
    ].copy()
    for q in (1, 2, 3, 4):
        # Subset source to a specific quarter
        rv = src_Q[src_Q['QNUM'] == q].drop(columns=['QNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'Q{q}', f'Q{q}_Date'
        working[col], working[dcol] = v, d

        # Set OriginFP where we have a newly filled quarter
        # (rows that already carry an OriginFP are left untouched)
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        mask = working[col].notna() & working[ocol].isna()
        working.loc[mask, ocol] = working.loc[mask, 'FiscalPeriod']

    # Semiannual (S/F) as-of, by half-year number
    src_S = working.loc[
        working['Frequency'].isin(['S', 'F']) & working['SNUM'].notna(),
        base_keys + ['SNUM', 'PIT Date', 'Value']
    ].copy()
    for s in (1, 2):
        # Subset source to a specific half-year
        rv = src_S[src_S['SNUM'] == s].drop(columns=['SNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'S{s}', f'S{s}_Date'
        working[col], working[dcol] = v, d

        # Same OriginFP bookkeeping as for the quarters
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        mask = working[col].notna() & working[ocol].isna()
        working.loc[mask, ocol] = working.loc[mask, 'FiscalPeriod']

    # Trimester (T/K) as-of, by term number
    src_T = working.loc[
        working['Frequency'].isin(['T', 'K']) & working['TNUM'].notna(),
        base_keys + ['TNUM', 'PIT Date', 'Value']
    ].copy()
    for t in (1, 2, 3):
        # Subset source to a specific term
        rv = src_T[src_T['TNUM'] == t].drop(columns=['TNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'T{t}', f'T{t}_Date'
        working[col], working[dcol] = v, d

        # Same OriginFP bookkeeping as for the quarters
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        mask = working[col].notna() & working[ocol].isna()
        working.loc[mask, ocol] = working.loc[mask, 'FiscalPeriod']

    # -------------------------------------------------------------------------
    # 3) Prepare labels and normalize dates (only as-of results, no ffill)
    # -------------------------------------------------------------------------
    working = working.sort_values(['ID', 'HistCurrency', 'FiscalPeriod', 'PIT Date'])

    # NOTE(review): origin_labels is built here but not referenced again in
    # this block.
    value_labels  = period_vals
    date_labels   = period_dates
    origin_labels = [f'{lbl}_OriginFP' for lbl in value_labels]

    # Ensure all date columns are valid datetimes at day precision
    for c in date_labels:
        if c in working.columns:
            working[c] = pd.to_datetime(working[c], errors='coerce').dt.floor('D')

    # -------------------------------------------------------------------------
    # 4) AnnPITValue with new logic (A + Q1..Q4 sum, zeros allowed)
    # -------------------------------------------------------------------------
    # Row-wise apply: pick_annpit_sum_with_origin is called once per row.
    working['AnnPITValue'] = working.apply(
        pick_annpit_sum_with_origin,
        axis=1
    )

    # -------------------------------------------------------------------------
    # 5) QC: Future-date check (period date > PIT Date)
    # -------------------------------------------------------------------------
    date_cols_all = [
        'A_Date',
        'Q1_Date', 'Q2_Date', 'Q3_Date', 'Q4_Date',
        'S1_Date', 'S2_Date',
        'T1_Date', 'T2_Date', 'T3_Date'
    ]
    # Only use date columns that actually exist
    present = [c for c in date_cols_all if c in working.columns]
    viol_counts = {}
    any_mask = pd.Series(False, index=working.index)

    # For each period date column, check if it's after PIT Date
    for c in present:
        m = (
            working[c].notna() &
            working['PIT Date'].notna() &
            (pd.to_datetime(working[c], errors='coerce') > working['PIT Date'])
        )
        # Count violations per column
        viol_counts[c] = int(m.sum())
        # Track rows with any violation across all period dates
        any_mask |= m

    total_future_viol = int(any_mask.sum())
    print("\n=== Future-date check (period dates > PIT Date) ===")
    print("Per-label violations:", viol_counts)
    print(f"Rows with ANY future-dated period value: {total_future_viol}")
    # Flag rows with at least one future-date violation (rows are only
    # flagged here, not removed)
    working['HasFutureDateError'] = any_mask

    # -------------------------------------------------------------------------
    # 6) AnnPITValue_Pct and quality filter
    # -------------------------------------------------------------------------
    # Percentage of AnnPITValue relative to TrueValue (%); NaN when either
    # value is missing or TrueValue is 0
    working['AnnPITValue_Pct'] = np.where(
        working['AnnPITValue'].notna() &
        working['TrueValue'].notna() &
        (working['TrueValue'] != 0),
        (working['AnnPITValue'] / working['TrueValue']) * 100,
        np.nan
    )

    # Summary BEFORE dropping outliers
    pre_stats = summarize_pct(working['AnnPITValue_Pct'])
    print("\n=== AnnPITValue_Pct summary — BEFORE quality drop ===")
    for k, v in pre_stats.items():
        print(f"{k:>20}: {v}")

    pct = working['AnnPITValue_Pct']
    # Flag infinities
    is_inf = np.isinf(pct)
    # Flag finite out-of-range values outside [25, 250]
    is_finite = np.isfinite(pct)
    out_of_range = is_finite & ((pct > 250) | (pct < 25))
    # Combined drop mask: infinities or out-of-range finite values.
    # Rows whose percentage is NaN match neither condition and are kept.
    to_drop_quality = is_inf | out_of_range

    # Count dropped rows due to quality rules
    dropped_quality_rows = int(to_drop_quality.sum())
    print(f"\nRows to drop due to AnnPITValue_Pct (±inf or >250 or <25): {dropped_quality_rows:,}")

    # Keep only rows that pass the quality filter
    working = working.loc[~to_drop_quality].copy()

    # Summary AFTER dropping outliers
    post_stats = summarize_pct(working['AnnPITValue_Pct'])
    print("\n=== AnnPITValue_Pct summary — AFTER quality drop ===")
    if post_stats:
        for k, v in post_stats.items():
            print(f"{k:>20}: {v}")
    else:
        print("No finite values remain after the quality drop.")

    # -------------------------------------------------------------------------
    # 7) Final columns and cleanup
    # -------------------------------------------------------------------------
    # Core columns that describe each row
    base_cols = [
        'ID', 'CompanyName', 'ImplCountry', 'CurrentCurrency', 'HistCurrency',
        'PIT Date', 'Frequency', 'UpdateCode', 'FiscalPeriod', 'FYE Month',
        'ItemCode', 'Value', 'Str_FiscalPrd'
    ]

    # Period-related columns (Dates and Values)
    freq_cols = []
    for i in range(1, 5):
        freq_cols += [f'Q{i}_Date', f'Q{i}']
    for i in range(1, 3):
        freq_cols += [f'S{i}_Date', f'S{i}']
    for i in range(1, 4):
        freq_cols += [f'T{i}_Date', f'T{i}']
    freq_cols += ['A_Date', 'A']

    # Columns we want to keep in the final output, in output order
    keep_cols = (
        [c for c in base_cols if c in working.columns] +
        ['TrueValue', 'AnnPITValue', 'AnnPITValue_Pct', 'HasFutureDateError'] +
        [c for c in freq_cols if c in working.columns]
    )

    # Drop helper columns such as OriginFP and intermediate numeric helpers
    drop_cols = [c for c in working.columns
                 if c.endswith('_OriginFP') or c in ['QNUM', 'SNUM', 'TNUM', 'TrueValue_Date']]
    working.drop(columns=drop_cols, inplace=True, errors='ignore')

    # Reorder to the final column set; any other column is dropped by reindex
    special_processed = working.reindex(columns=keep_cols)

    # -------------------------------------------------------------------------
    # 8) Save outputs (requires Temp_file_path_DP and base_output_filename)
    # -------------------------------------------------------------------------
    # Both variables must be defined in a previous setup cell.
    # NOTE: these checks run only after all processing above has finished.
    assert 'Temp_file_path_DP' in globals(), "Temp_file_path_DP not found."
    assert 'base_output_filename' in globals(), "base_output_filename not found (set in Cell 0)."

    # Full export path and write to pipe-delimited text
    out_full = os.path.join(Temp_file_path_DP, f"{base_output_filename}.txt")
    special_processed.to_csv(out_full, sep='|', index=False)
    print("\nSaved full:", out_full)

    # Subset export with a small selection of columns
    subset_cols = ["ID", "PIT Date", "CompanyName", "HistCurrency", "FiscalPeriod", "AnnPITValue"]
    subset_cols_existing = [col for col in subset_cols if col in special_processed.columns]
    subset_df = special_processed[subset_cols_existing].copy()
    out_subset = os.path.join(Temp_file_path_DP, f"{base_output_filename}_subset.txt")
    subset_df.to_csv(out_subset, sep='|', index=False)
    print("Saved subset:", out_subset)
    del subset_df  # free some memory

    # -------------------------------------------------------------------------
    # 9) Row-accounting overview
    # -------------------------------------------------------------------------
    output_rows = len(special_processed)
    print("\n=== Row Accounting ===")
    print(f"Input rows:                     {input_rows:,}")
    print(f"Excluded by Frequency (E/L/R/U):{excluded_rows:,}")
    print(f"Dropped by quality (Pct rules): {dropped_quality_rows:,}")
    print(f"Output rows (final):            {output_rows:,}")

    # Sanity check: excluded + dropped + final should equal original
    check_total = excluded_rows + dropped_quality_rows + output_rows
    print(f"Check: excluded + dropped + output = {check_total:,}")
    if check_total == input_rows:
        print("Row counts reconcile exactly.")
    else:
        print(f"Mismatch of {input_rows - check_total:+,} rows.")

    # Optional: trigger garbage collection (gc is imported in the environment
    # setup cell at the top of the notebook)
    gc.collect()

else:
    # Nothing to do if special_encoded is not defined or is None
    print("special_encoded not found or None; skipping.")

Input dataset contains 1,611,038 rows before processing.


=== Future-date check (period dates > PIT Date) ===
Per-label violations: {'A_Date': 0, 'Q1_Date': 0, 'Q2_Date': 0, 'Q3_Date': 0, 'Q4_Date': 0, 'S1_Date': 0, 'S2_Date': 0, 'T1_Date': 0, 'T2_Date': 0, 'T3_Date': 0}
Rows with ANY future-dated period value: 0

=== AnnPITValue_Pct summary — BEFORE quality drop ===
         finite_rows: 507219
                mean: 5393.4765422721775
              median: 100.0
winsorized_mean_1pct: 96.46907483123637
                 p10: 100.0
                 p20: 100.0
                 p30: 100.0
                 p40: 100.0
                 p50: 100.0
                 p60: 100.0
                 p70: 100.0
                 p80: 100.0
                 p90: 100.0

Rows to drop due to AnnPITValue_Pct (±inf or >250 or <25): 18,973

=== AnnPITValue_Pct summary — AFTER quality drop ===
         finite_rows: 488246
                mean: 100.00050179781469
              median: 100.0
winsorized_mean_1pct

### Special 13

#### Set Index

In [307]:
# =============================================================================
# SELECT A SINGLE SPECIAL_* ITEM AND PREPARE PATHS
# =============================================================================
# This cell:
#   1. Chooses which Special_* item (from special_vars) should be processed.
#   2. Validates that special_vars and Temp_file_path_DP are available.
#   3. Builds the input file path for the selected "work_subset_<item>.txt".
#   4. Sets a base_output_filename for downstream output files.
#   5. Ensures the data-preparation temp directory exists.
#
# Usage:
#   - Adjust `special_index` to run a different Special_* dataset (e.g., 2, 3, 10 ...).
#   - Assumes `special_vars` was created in the categorization step and
#     `Temp_file_path_DP` was defined in the environment setup.

# === Select which Special_* item to run ===
# This index is the only value meant to be edited in this cell.
special_index = 13  # Change this to run another dataset, e.g. 10

# special_vars should look like: {'Special_1': 'SomeItem', 'Special_2': 'OtherItem', ...}
assert 'special_vars' in globals(), "special_vars dict not found in globals()."

# Build the key for the chosen index and find the corresponding item name.
# The assert also fails when the key exists but maps to an empty/falsy name.
item_key = f"Special_{special_index}"
target_item_name = special_vars.get(item_key)
assert target_item_name, f"{item_key} not found in special_vars."

print(f"Selected: {item_key}  ->  ItemName: '{target_item_name}'")

# === Paths (reusing your globals) ===
assert 'Temp_file_path_DP' in globals(), "Temp_file_path_DP not found."

# Input file for this item (produced by previous merging steps);
# only the path is built here, the file is not opened in this cell.
file_name = f"work_subset_{target_item_name}.txt"
file_path = os.path.join(Temp_file_path_DP, file_name)

# Base name for output files created by the "Special" pipeline
# (no extension and no directory part)
base_output_filename = f"Special_{target_item_name}_complete"

# Make sure the output directory exists (created with parents if missing)
Path(Temp_file_path_DP).mkdir(parents=True, exist_ok=True)


Selected: Special_13  ->  ItemName: 'Reduction_in_Long_Term_Debt'


#### Import relevant data



In [308]:
# =============================================================================
# LOAD THE FULL DATASET FOR THE SELECTED SPECIAL ITEM
# =============================================================================
# This cell:
#   1. Uses `target_item_name` and `file_path` (defined in the previous cell)
#      to load the corresponding work_subset file.
#   2. Imports the file using `import_file_to_dataframe`.
#   3. Performs safety checks for existence and emptiness.
#   4. Shows a preview of the loaded dataset.
#   5. Falls back to an empty DataFrame if loading fails.
#   6. Runs garbage collection afterwards.
# =============================================================================

print(f"\nImporting full dataset for Item: '{target_item_name}' ...")

if not os.path.exists(file_path):
    # Nothing on disk for this item: continue with an empty frame.
    print(f"File not found: {file_path}")
    special_raw = pd.DataFrame()
else:
    special_raw = import_file_to_dataframe(file_path)

    if special_raw is None or special_raw.empty:
        # Loader returned nothing usable: normalise to an empty frame.
        print("Dataset appears empty or could not be loaded.")
        special_raw = pd.DataFrame()
    else:
        print(f"Full dataset loaded successfully: {len(special_raw):,} rows total.")
        # Rich preview when available, plain-text preview otherwise.
        try:
            display(special_raw.head())
        except Exception:
            print(special_raw.head().to_string(index=False))

gc.collect()



Importing full dataset for Item: 'Reduction_in_Long_Term_Debt' ...
Full dataset loaded successfully: 1,478,266 rows total.


Unnamed: 0,ID,CompanyName,ImplCountry,CurrentCurrency,HistCurrency,PIT Date,Frequency,UpdateCode,FiscalPeriod,FYE Month,ItemCode,Value
0,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1992,December,4701,0.615867
1,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1993,December,4701,0.0
2,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1994,December,4701,0.0
3,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1996-05-03,A,3,1995,December,4701,0.0
4,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1998-07-03,A,3,1996,December,4701,0.0


0

#### Encode Frequency Code (Check of output required!)

In [309]:
# =============================================================================
# SUMMARY
# =============================================================================
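# This cell defines:
#   - `last2`: last two digits of a number as a zero-padded string.
#   - `add_str_fiscalprd`: derives the period label column `Str_FiscalPrd`
#     (e.g. 'Y92', 'Q3Y92') from `Frequency` and `FiscalPeriod`.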

def last2(n):
    """
    Return the final two digits of ``n`` as a string, or None when ``n`` is missing.

    The value is truncated to an integer and left-padded with zeros to a width
    of four before the last two characters are taken (5 -> "05", 1992 -> "92").
    """
    if pd.isna(n):
        return None
    padded = str(int(n)).zfill(4)
    return padded[-2:]


def add_str_fiscalprd(df):
    """
    Add a period label column and normalise ``FiscalPeriod`` to a four-digit year.

    Works on a copy of ``df``; the input frame is not modified.

    Label rules (``fp`` is ``FiscalPeriod`` parsed as a number, ``yy`` the last
    two digits produced by ``last2``):

      * Frequency A, B       -> ``Y{yy}``      with yy from ``fp``
      * Frequency C, Q, E, R -> ``Q{n}Y{yy}``  with n = fp % 4 + 1, yy from fp // 4
      * Frequency F, S       -> ``S{n}Y{yy}``  with n = fp % 2 + 1, yy from fp // 2
      * Frequency K, T, L, U -> ``T{n}Y{yy}``  with n = fp % 3 + 1, yy from fp // 3

    Rows with any other frequency keep a missing label. ``FiscalPeriod`` is then
    overwritten with the year implied by the label: ``19yy`` when yy >= 80,
    otherwise ``20yy`` (missing when the label has no two-digit year).

    For A/B rows the function prints whether the recomputed year differs from
    the original ``FiscalPeriod`` and displays the first offending rows.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``Frequency`` and ``FiscalPeriod``; ``ID`` is additionally
        required when discrepancies are reported.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` with ``Str_FiscalPrd`` added and ``FiscalPeriod`` replaced.
    """
    df = df.copy()
    df["Frequency"] = df["Frequency"].str.upper().fillna("")
    df['Original_FiscalPeriod'] = df['FiscalPeriod']

    fp = pd.to_numeric(df["FiscalPeriod"], errors="coerce")

    m_quarter = df["Frequency"].isin(["C", "Q", "E", "R"])
    m_AB      = df["Frequency"].isin(["A", "B"])
    m_FS      = df["Frequency"].isin(["F", "S"])
    m_KTLU    = df["Frequency"].isin(["K", "T", "L", "U"])

    # The label column holds strings, so create it with object dtype. A float64
    # NaN column triggers pandas' incompatible-dtype FutureWarning on the first
    # string assignment and makes `.str` raise when no row receives a label.
    df["Str_FiscalPrd"] = pd.Series(np.nan, index=df.index, dtype="object")

    # Annual rows: the fiscal period already is the year.
    ab_year = fp.where(m_AB).apply(last2)
    df.loc[m_AB, "Str_FiscalPrd"] = "Y" + ab_year.fillna('')

    # Sub-annual rows: fp encodes year * periods_per_year + (sub_period - 1).
    for mask, prefix, periods_per_year in (
        (m_quarter, "Q", 4),
        (m_FS, "S", 2),
        (m_KTLU, "T", 3),
    ):
        sub_period = ((fp % periods_per_year) + 1).where(mask)
        year_2d = (fp // periods_per_year).where(mask).apply(last2)
        df.loc[mask, "Str_FiscalPrd"] = (
            prefix + sub_period.astype("Int64").astype(str) + "Y" + year_2d.fillna('')
        )

    year_part = df['Str_FiscalPrd'].str.extract(r'Y(\d{2})', expand=False)
    year_numeric = pd.to_numeric(year_part, errors='coerce')

    # Two-digit year -> four-digit year (>= 80 means 19xx, otherwise 20xx).
    # NaN compares False and stays NaN after the addition.
    century = np.where(year_numeric >= 80, 1900, 2000)
    df['ImplFiscPer_Calculated'] = year_numeric + century

    # Consistency check for annual rows: the recomputed year must equal the
    # original numeric FiscalPeriod (or both must be missing).
    annual_rows_for_check = df[m_AB]
    calculated = annual_rows_for_check['ImplFiscPer_Calculated']
    original = fp[m_AB]
    discrepancy_mask_annual = ~(
        (calculated == original) | (calculated.isna() & original.isna())
    )

    discrepancy_rows = annual_rows_for_check[discrepancy_mask_annual].copy()

    if not discrepancy_rows.empty:
        print("\nDiscrepancies found between original FiscalPeriod and calculated ImplFiscPer for Annual (A, B) frequencies:")
        display(
            discrepancy_rows[
                ['ID', 'Frequency', 'Original_FiscalPeriod', 'Str_FiscalPrd', 'ImplFiscPer_Calculated']
            ].head()
        )
        print(f"Total discrepancies found for Annual frequencies: {len(discrepancy_rows)}")
    else:
        print("\nNo discrepancies found between original FiscalPeriod and calculated ImplFiscPer for Annual (A, B) frequencies.")

    df['FiscalPeriod'] = df['ImplFiscPer_Calculated']
    df = df.drop(columns=['Original_FiscalPeriod', 'ImplFiscPer_Calculated'])

    return df


# =============================================================================
# Driver: apply encoding to special_raw if present and non-empty
# =============================================================================
# Guard first: without a non-empty `special_raw` there is nothing to encode,
# and `special_encoded` is set to None so later cells can detect that.
if 'special_raw' not in globals() or special_raw is None or special_raw.empty:
    print("special_raw not found or empty. Cannot perform encoding.")
    special_encoded = None
else:
    print(f"Applying encoding to Special dataset for '{target_item_name}' ...")
    special_encoded = add_str_fiscalprd(special_raw)
    display(special_encoded.head())


Applying encoding to Special dataset for 'Reduction_in_Long_Term_Debt' ...


  df.loc[m_quarter, "Str_FiscalPrd"] = (



No discrepancies found between original FiscalPeriod and calculated ImplFiscPer for Annual (A, B) frequencies.


Unnamed: 0,ID,CompanyName,ImplCountry,CurrentCurrency,HistCurrency,PIT Date,Frequency,UpdateCode,FiscalPeriod,FYE Month,ItemCode,Value,Str_FiscalPrd
0,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1992,December,4701,0.615867,Y92
1,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1993,December,4701,0.0,Y93
2,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1994,December,4701,0.0,Y94
3,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1996-05-03,A,3,1995,December,4701,0.0,Y95
4,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1998-07-03,A,3,1996,December,4701,0.0,Y96


#### Annualize data with most recent information (Check of output required!)

In [310]:
# @title
# =============================================================================
# SUMMARY
# =============================================================================
# This script:
#   - Implements a fast "as-of join" between two DataFrames based on PIT dates
#     and key columns (asof_numpy).
#   - Provides helpers for percentile summaries and winsorized statistics.
#   - Builds annualized "AnnPITValue" values from:
#       * true annual data (A/B frequency) and
#       * sums of quarterly data (Q1..Q4) when available.
#   - Performs various quality checks (future-dated values, extreme percentages).
#   - Produces a processed "special_processed" DataFrame and saves:
#       * a full export and
#       * a subset export.
#   - Prints a row-accounting overview so drops and exclusions are transparent.
# =============================================================================


# ---------- Helper: fast as-of (right.PIT <= left.PIT) ----------
def _key(fr, cols):
    """
    Build one string key per row by joining the string form of ``cols`` with '||'.

    The concatenation is done column-wise, which avoids calling a Python
    function once per row. The returned Series shares ``fr``'s index and has
    no name.
    """
    parts = [fr[c].astype(str) for c in cols]
    if not parts:
        # No key columns: every row belongs to the same (empty) group.
        return pd.Series('', index=fr.index, dtype=object)

    key = parts[0]
    for part in parts[1:]:
        key = key + '||' + part
    key.name = None
    return key


def asof_numpy(left_df: pd.DataFrame, right_df: pd.DataFrame, by_cols: list[str]):
    """
    For each row in left_df, find the latest (as-of) Value from right_df
    with matching by_cols and right_df['PIT Date'] <= left_df['PIT Date'].

    Rows with a missing value in any key column, 'PIT Date' (or 'Value' on the
    right side) take no part in the match. Dates are compared at day precision.

    Results are written by row *position*, so ``left_df`` may carry any index
    (filtered, non-contiguous or non-integer); output element ``i`` always
    belongs to ``left_df.iloc[i]``.

    Parameters
    ----------
    left_df : pd.DataFrame
        Frame to look values up for; needs ``by_cols`` and 'PIT Date'.
    right_df : pd.DataFrame
        Source frame; needs ``by_cols``, 'PIT Date' and 'Value'.
    by_cols : list[str]
        Columns that must match exactly between the two frames.

    Returns
    -------
    out_vals : np.ndarray
        Array of float values (same length as left_df) containing the matched
        values from right_df (or NaN if none found).
    out_dates : np.ndarray
        Array of datetime64 values containing the matched dates (or NaT).
    """
    # Initialize output arrays with NaN/NaT
    out_vals  = np.full(len(left_df), np.nan, dtype='float64')
    out_dates = np.full(len(left_df), 'NaT', dtype='datetime64[ns]')

    # Required columns in left/right for the as-of join
    left_req  = list(by_cols) + ['PIT Date']
    right_req = list(by_cols) + ['PIT Date', 'Value']

    # Boolean arrays marking rows with all required fields present
    lmask = left_df[left_req].notna().all(axis=1).to_numpy()
    rmask = right_df[right_req].notna().all(axis=1).to_numpy()

    # If no valid rows on either side, return the all-missing outputs
    if not lmask.any() or not rmask.any():
        return out_vals, out_dates

    # Work on copies of the filtered frames
    l = left_df.loc[lmask, left_req].copy()
    r = right_df.loc[rmask, right_req].copy()

    # Normalize PIT dates to daily granularity
    l['PIT Date'] = pd.to_datetime(l['PIT Date'], errors='coerce').dt.floor('D')
    r['PIT Date'] = pd.to_datetime(r['PIT Date'], errors='coerce').dt.floor('D')

    # Build grouping keys on both sides
    l['__k'] = _key(l, by_cols)
    r['__k'] = _key(r, by_cols)

    # Sort right side by key and PIT Date so we can binary-search later
    r = r.sort_values(['__k', 'PIT Date']).reset_index(drop=True)

    # Extract numpy arrays for fast vectorized operations
    rk   = r['__k'].to_numpy()
    rdt  = r['PIT Date'].to_numpy()
    rval = r['Value'].to_numpy()

    # Determine contiguous slices of rows for each unique key in right_df
    uniq, first = np.unique(rk, return_index=True)
    slices = {}
    for i, k in enumerate(uniq):
        s = first[i]                                        # start index for this key
        e = first[i + 1] if i + 1 < len(first) else len(r)  # end index
        slices[k] = (rdt[s:e], rval[s:e])                   # date and value slices

    # Positions (not index labels) of the usable left rows inside left_df;
    # the output arrays are positional, so labels must not be used here.
    l_pos = np.flatnonzero(lmask)
    # Keys and dates of left rows
    lk    = l['__k'].to_numpy()
    ldt   = l['PIT Date'].to_numpy()

    # Sort left rows by key (stable sort) for block processing
    order = np.argsort(lk, kind='mergesort')
    sk, sd, sp = lk[order], ldt[order], l_pos[order]

    # Iterate over blocks of the same key in left_df
    i = 0
    n = len(sk)
    while i < n:
        k = sk[i]  # current key
        j = i + 1
        # Find the end of this key's block
        while j < n and sk[j] == k:
            j += 1

        # Block of PIT dates and corresponding output positions for this key
        block_dates = sd[i:j]
        block_pos   = sp[i:j]

        # Only process if the key exists in the right-hand slices
        if k in slices:
            r_dates, r_vals = slices[k]

            # For each left PIT date, find index of the last right PIT <= left PIT
            pos   = np.searchsorted(r_dates, block_dates, side='right') - 1
            valid = pos >= 0  # only those with at least one right date <= left date

            # Write results back to output arrays
            if np.any(valid):
                out_vals[block_pos[valid]]  = r_vals[pos[valid]]
                out_dates[block_pos[valid]] = r_dates[pos[valid]]

        # Move to the next block
        i = j

    return out_vals, out_dates


# ---------- Small helpers ----------
def pctile(s, q):
    """
    Return the ``q`` quantile of ``s`` (linear interpolation).

    Any exception raised by the quantile call is swallowed and NaN is
    returned instead.
    """
    try:
        result = s.quantile(q, interpolation='linear')
    except Exception:
        result = np.nan
    return result


def summarize_pct(series: pd.Series):
    """
    Summarise a numeric series: count, mean, median, 1%-winsorized mean and
    the deciles p10..p90.

    Infinite and missing entries are discarded first. An empty dict is
    returned when nothing remains.
    """
    finite = series.replace([np.inf, -np.inf], np.nan).dropna()
    if finite.empty:
        return {}

    # winsorize is given its own writable copy of the data.
    writable = finite.to_numpy().copy()
    wins_mean = winsorize(writable, limits=[0.01, 0.01]).mean()

    summary = {
        "finite_rows": len(finite),
        "mean": finite.mean(),
        "median": finite.median(),
        "winsorized_mean_1pct": wins_mean,
    }
    # Deciles p10 .. p90, in ascending order.
    for decile in range(10, 100, 10):
        summary[f"p{decile}"] = pctile(finite, decile / 100)
    return summary


# ---------- Period prioritization ----------
# Priority ranking for period labels when deciding between multiple candidates
# Labels are listed from most to least preferred; ranks run from 100 (full
# annual 'A') down to 10 ('Q1') in steps of 10.
_PERIOD_PRIORITY = dict(zip(
    ('A', 'Q4', 'T3', 'S2', 'Q3', 'T2', 'S1', 'Q2', 'T1', 'Q1'),
    range(100, 0, -10),
))


def _label_from_colname(colname: str) -> str:
    """
    Translate a column name into the period label used by _PERIOD_PRIORITY.

    Only 'A' is handled explicitly; every other name is returned unchanged.
    """
    if colname == 'A':
        return 'A'
    return colname


# ---------- Helpers for AnnPITValue using A + Q1..Q4 sum ----------
def full_year_from_quarters(row, pit, cutoff):
    """
    Assemble a full-year figure by adding up Q1..Q4 of one row.

    The candidate is only produced when every quarter has a value, a date and
    an OriginFP, and every quarter date lies inside [cutoff, pit]. As soon as
    one quarter fails a check the function gives up.

    Parameters
    ----------
    row : pd.Series
        Row providing 'Q{i}', 'Q{i}_Date' and 'Q{i}_OriginFP' for i = 1..4
        (looked up with ``row.get``).
    pit : datetime-like
        Upper bound for the quarter dates.
    cutoff : datetime-like
        Lower bound for the quarter dates.

    Returns
    -------
    tuple
        (latest quarter date, sum of the four values, largest OriginFP) on
        success, otherwise (NaT, NaN, NaN).
    """
    no_result = (pd.NaT, np.nan, np.nan)
    quarter_values = []
    quarter_dates = []
    quarter_fps = []

    for q in (1, 2, 3, 4):
        name = f'Q{q}'
        value = row.get(name, np.nan)
        raw_date = row.get(f'{name}_Date', pd.NaT)
        origin = row.get(f'{name}_OriginFP', np.nan)

        # All three pieces must be present for this quarter.
        if pd.isna(value) or pd.isna(raw_date) or pd.isna(origin):
            return no_result

        # The date must parse and fall inside the allowed window.
        when = pd.to_datetime(raw_date, errors='coerce')
        if pd.isna(when) or not (cutoff <= when <= pit):
            return no_result

        quarter_values.append(float(value))
        quarter_dates.append(when)
        quarter_fps.append(int(origin))

    return max(quarter_dates), float(np.nansum(quarter_values)), max(quarter_fps)


def pick_annpit_sum_with_origin(row):
    """
    Choose the annualized value (AnnPITValue) for one row.

    Two candidates are considered, both restricted to dates within the 365
    days up to and including the row's PIT Date:

      * 'A'  - the annual value stored in A / A_Date / A_OriginFP
      * 'Q4' - the Q1..Q4 sum returned by ``full_year_from_quarters``

    Preference order: same-year A, same-year Q4 sum, prior-year A, prior-year
    Q4 sum, then any remaining candidate by (priority, date). "Same" and
    "prior" compare the candidate's OriginFP with the row's FiscalPeriod.
    Zero is a legitimate value. NaN is returned when the PIT Date is missing
    or no candidate exists.
    """
    pit_date = row['PIT Date']
    if pd.isna(pit_date):
        return np.nan

    # Earliest acceptable candidate date.
    window_start = pit_date - timedelta(days=365)

    # Row fiscal period as int, or None when missing / not convertible.
    raw_fp = row.get('FiscalPeriod', np.nan)
    try:
        row_fp = None if pd.isna(raw_fp) else int(raw_fp)
    except Exception:
        row_fp = None

    # Candidate layout: (label, period_priority, date, value, origin_fp)
    pool = []

    annual_val = row.get('A', np.nan)
    annual_dt = row.get('A_Date', pd.NaT)
    annual_fp = row.get('A_OriginFP', np.nan)
    if pd.notna(annual_val) and pd.notna(annual_dt) and not pd.isna(annual_fp):
        annual_dt = pd.to_datetime(annual_dt, errors='coerce')
        if pd.notna(annual_dt) and (window_start <= annual_dt <= pit_date):
            pool.append(
                ('A', _PERIOD_PRIORITY['A'], annual_dt, float(annual_val), int(annual_fp))
            )

    quarters_dt, quarters_sum, quarters_fp = full_year_from_quarters(row, pit_date, window_start)
    if pd.notna(quarters_sum) and pd.notna(quarters_dt) and not pd.isna(quarters_fp):
        pool.append(
            ('Q4', _PERIOD_PRIORITY['Q4'], quarters_dt, float(quarters_sum), int(quarters_fp))
        )

    if not pool:
        return np.nan

    def year_relation(cand):
        """Classify a candidate's OriginFP relative to the row's fiscal period."""
        origin = cand[4]
        if row_fp is None or origin is None:
            return 'unknown'
        if origin == row_fp:
            return 'same'
        if origin == row_fp - 1:
            return 'prior'
        return 'other'

    def by_date(cand):
        return cand[2]

    def by_priority_then_date(cand):
        return (cand[1], cand[2])

    # (label, required year relation, ranking key), tried top to bottom.
    tiers = (
        ('A', 'same', by_date),
        ('Q4', 'same', by_priority_then_date),
        ('A', 'prior', by_date),
        ('Q4', 'prior', by_priority_then_date),
    )
    for label, relation, ranking in tiers:
        matches = [
            cand for cand in pool
            if cand[0] == label and year_relation(cand) == relation
            and not np.isnan(cand[3])
        ]
        if matches:
            return max(matches, key=ranking)[3]

    # Fallback: whatever is left (relation 'other' or 'unknown').
    remaining = [cand for cand in pool if not np.isnan(cand[3])]
    if remaining:
        return max(remaining, key=by_priority_then_date)[3]

    # Not expected to be reached: every candidate in the pool has a value.
    return 0.0


# ============================ MAIN ============================
# Main pipeline of this cell: take `special_encoded`, attach per-period as-of
# values, derive AnnPITValue, run the QC checks, export two pipe-delimited files
# and print a row-accounting summary. Skipped when `special_encoded` is missing/None.
if 'special_encoded' in globals() and special_encoded is not None:
    # Remember the number of input rows for row-accounting
    input_rows = len(special_encoded)
    print(f"Input dataset contains {input_rows:,} rows before processing.\n")

    # Work on a copy so we do not mutate the original DataFrame
    working = special_encoded.copy()

    # Exclude certain frequencies (E, L, R, U) from further processing
    excl_mask = working['Frequency'].astype(str).str.upper().isin(['E', 'L', 'R', 'U'])
    excluded_rows = int(excl_mask.sum())
    working = working.loc[~excl_mask].copy()

    # Convert key columns to appropriate types
    # (values that cannot be parsed become NaT / NaN because of errors='coerce')
    working['PIT Date'] = pd.to_datetime(
        working['PIT Date'], format='%Y-%m-%d', errors='coerce'
    ).dt.floor('D')
    working['FiscalPeriod'] = pd.to_numeric(working['FiscalPeriod'], errors='coerce')
    working['Value']        = pd.to_numeric(working['Value'], errors='coerce')

    # Ensure some ID-like columns are strings
    # (note: astype(str) also turns missing entries into literal strings)
    for c in ['ID', 'HistCurrency', 'ItemCode', 'Frequency', 'Str_FiscalPrd']:
        if c in working.columns:
            working[c] = working[c].astype(str)

    # Parse Q/S/T numbers from Str_FiscalPrd (e.g. 'Q1Y23' -> QNUM=1);
    # labels that do not match the pattern yield NaN
    working['QNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^Q([1-4])Y', expand=False),
        errors='coerce'
    )
    working['SNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^S([1-2])Y', expand=False),
        errors='coerce'
    )
    working['TNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^T([1-3])Y', expand=False),
        errors='coerce'
    )

    # Define all period/value and period/date column names
    # (Q1..Q4, S1..S2, T1..T3, A and the matching '<label>_Date' columns)
    period_vals = [f'Q{i}' for i in range(1, 5)] + \
                  [f'S{i}' for i in range(1, 3)] + \
                  [f'T{i}' for i in range(1, 4)] + ['A']
    period_dates = [f'{p}_Date' for p in [f'Q{i}' for i in range(1,5)] + \
                                       [f'S{i}' for i in range(1,3)] + \
                                       [f'T{i}' for i in range(1,4)]] + ['A_Date']

    # Ensure all period value columns exist (initialize if missing)
    for c in period_vals:
        if c not in working.columns:
            working[c] = np.nan

    # Ensure all period date columns exist (initialize if missing)
    for c in period_dates:
        if c not in working.columns:
            working[c] = pd.NaT

    # Keys used to identify time series in as-of joins
    base_keys = ['ID', 'HistCurrency', 'ItemCode', 'FiscalPeriod']

    # -------------------------------------------------------------------------
    # 1) TrueValue from annuals: build reference "TrueValue" per ID/FiscalPeriod
    # -------------------------------------------------------------------------
    # For every (ID, FiscalPeriod, HistCurrency) keep the annual row with the
    # latest PIT Date; its Value becomes TrueValue.
    mask_annual = working['Frequency'].isin(['A', 'B']) & working['Value'].notna()
    annual_src = (
        working.loc[mask_annual,
                    ['ID', 'FiscalPeriod', 'HistCurrency', 'PIT Date', 'Value']]
        .sort_values(['ID', 'FiscalPeriod', 'HistCurrency', 'PIT Date'])
        .drop_duplicates(['ID', 'FiscalPeriod', 'HistCurrency'], keep='last')
        .rename(columns={'Value': 'TrueValue', 'PIT Date': 'TrueValue_Date'})
    )
    # Merge TrueValue back on keys. annual_src is de-duplicated on exactly these
    # keys, so the left merge does not multiply rows.
    working = working.merge(
        annual_src,
        on=['ID', 'FiscalPeriod', 'HistCurrency'],
        how='left'
    )

    # -------------------------------------------------------------------------
    # 2) As-of mapping for each frequency (no prior-year / no forward-fill)
    # -------------------------------------------------------------------------
    # FiscalPeriod is part of base_keys, so a row is only matched with source
    # rows of its own fiscal period.

    # Annual (A/B) as-of
    src_A = working.loc[
        working['Frequency'].isin(['A', 'B']) & working['Value'].notna(),
        base_keys + ['PIT Date', 'Value']
    ].copy()
    vA, dA = asof_numpy(working, src_A, by_cols=base_keys)
    working['A'], working['A_Date'] = vA, dA
    # Origin fiscal period for A: the row's own FiscalPeriod wherever A was filled
    working['A_OriginFP'] = np.where(
        working['A'].notna(), working['FiscalPeriod'], np.nan
    )

    # Quarterly (Q/C) as-of, by quarter number
    src_Q = working.loc[
        working['Frequency'].isin(['Q', 'C']) & working['QNUM'].notna(),
        base_keys + ['QNUM', 'PIT Date', 'Value']
    ].copy()
    for q in (1, 2, 3, 4):
        # Subset source to a specific quarter
        rv = src_Q[src_Q['QNUM'] == q].drop(columns=['QNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'Q{q}', f'Q{q}_Date'
        working[col], working[dcol] = v, d

        # Record the row's FiscalPeriod as OriginFP wherever this quarter has a
        # value and no OriginFP has been set yet
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        mask = working[col].notna() & working[ocol].isna()
        working.loc[mask, ocol] = working.loc[mask, 'FiscalPeriod']

    # Semiannual (S/F) as-of, by half-year number
    src_S = working.loc[
        working['Frequency'].isin(['S', 'F']) & working['SNUM'].notna(),
        base_keys + ['SNUM', 'PIT Date', 'Value']
    ].copy()
    for s in (1, 2):
        # Subset source to a specific half-year
        rv = src_S[src_S['SNUM'] == s].drop(columns=['SNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'S{s}', f'S{s}_Date'
        working[col], working[dcol] = v, d

        # Same OriginFP bookkeeping as for the quarters
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        mask = working[col].notna() & working[ocol].isna()
        working.loc[mask, ocol] = working.loc[mask, 'FiscalPeriod']

    # Trimester (T/K) as-of, by term number
    src_T = working.loc[
        working['Frequency'].isin(['T', 'K']) & working['TNUM'].notna(),
        base_keys + ['TNUM', 'PIT Date', 'Value']
    ].copy()
    for t in (1, 2, 3):
        # Subset source to a specific term
        rv = src_T[src_T['TNUM'] == t].drop(columns=['TNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'T{t}', f'T{t}_Date'
        working[col], working[dcol] = v, d

        # Same OriginFP bookkeeping as for the quarters
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        mask = working[col].notna() & working[ocol].isna()
        working.loc[mask, ocol] = working.loc[mask, 'FiscalPeriod']

    # -------------------------------------------------------------------------
    # 3) Prepare labels and normalize dates (only as-of results, no ffill)
    # -------------------------------------------------------------------------
    working = working.sort_values(['ID', 'HistCurrency', 'FiscalPeriod', 'PIT Date'])

    # Convenience aliases; origin_labels is not referenced again in this cell
    value_labels  = period_vals
    date_labels   = period_dates
    origin_labels = [f'{lbl}_OriginFP' for lbl in value_labels]

    # Ensure all date columns are valid datetimes at day precision
    for c in date_labels:
        if c in working.columns:
            working[c] = pd.to_datetime(working[c], errors='coerce').dt.floor('D')

    # -------------------------------------------------------------------------
    # 4) AnnPITValue (A + Q1..Q4 sum, zeros allowed)
    # -------------------------------------------------------------------------
    # Row-wise evaluation; S* and T* columns are not used by the picker
    working['AnnPITValue'] = working.apply(
        pick_annpit_sum_with_origin,
        axis=1
    )

    # -------------------------------------------------------------------------
    # 5) QC: Future-date check (period date > PIT Date)
    # -------------------------------------------------------------------------
    date_cols_all = [
        'A_Date',
        'Q1_Date', 'Q2_Date', 'Q3_Date', 'Q4_Date',
        'S1_Date', 'S2_Date',
        'T1_Date', 'T2_Date', 'T3_Date'
    ]
    # Only use date columns that actually exist
    present = [c for c in date_cols_all if c in working.columns]
    viol_counts = {}
    any_mask = pd.Series(False, index=working.index)

    # For each period date column, check if it's after PIT Date
    for c in present:
        m = (
            working[c].notna() &
            working['PIT Date'].notna() &
            (pd.to_datetime(working[c], errors='coerce') > working['PIT Date'])
        )
        # Count violations per column
        viol_counts[c] = int(m.sum())
        # Track rows with any violation across all period dates
        any_mask |= m

    total_future_viol = int(any_mask.sum())
    print("\n=== Future-date check (period dates > PIT Date) ===")
    print("Per-label violations:", viol_counts)
    print(f"Rows with ANY future-dated period value: {total_future_viol}")
    # Flag rows with at least one future-date violation (rows are flagged, not dropped)
    working['HasFutureDateError'] = any_mask

    # -------------------------------------------------------------------------
    # 6) AnnPITValue_Pct and quality filter
    # -------------------------------------------------------------------------
    # Percentage of AnnPITValue relative to TrueValue (%);
    # NaN when either value is missing or TrueValue is 0
    working['AnnPITValue_Pct'] = np.where(
        working['AnnPITValue'].notna() &
        working['TrueValue'].notna() &
        (working['TrueValue'] != 0),
        (working['AnnPITValue'] / working['TrueValue']) * 100,
        np.nan
    )

    # Summary BEFORE dropping outliers
    pre_stats = summarize_pct(working['AnnPITValue_Pct'])
    print("\n=== AnnPITValue_Pct summary — BEFORE quality drop ===")
    for k, v in pre_stats.items():
        print(f"{k:>20}: {v}")

    pct = working['AnnPITValue_Pct']
    # Flag infinities
    is_inf = np.isinf(pct)
    # Flag finite out-of-range values outside [25, 250]
    is_finite = np.isfinite(pct)
    out_of_range = is_finite & ((pct > 250) | (pct < 25))
    # Combined drop mask: infinities or out-of-range finite values.
    # Rows whose percentage is NaN match neither condition and are kept.
    to_drop_quality = is_inf | out_of_range

    # Count dropped rows due to quality rules
    dropped_quality_rows = int(to_drop_quality.sum())
    print(f"\nRows to drop due to AnnPITValue_Pct (±inf or >250 or <25): {dropped_quality_rows:,}")

    # Keep only rows that pass the quality filter
    working = working.loc[~to_drop_quality].copy()

    # Summary AFTER dropping outliers
    post_stats = summarize_pct(working['AnnPITValue_Pct'])
    print("\n=== AnnPITValue_Pct summary — AFTER quality drop ===")
    if post_stats:
        for k, v in post_stats.items():
            print(f"{k:>20}: {v}")
    else:
        print("No finite values remain after the quality drop.")

    # -------------------------------------------------------------------------
    # 7) Final columns and cleanup
    # -------------------------------------------------------------------------
    # Core columns that describe each row
    base_cols = [
        'ID', 'CompanyName', 'ImplCountry', 'CurrentCurrency', 'HistCurrency',
        'PIT Date', 'Frequency', 'UpdateCode', 'FiscalPeriod', 'FYE Month',
        'ItemCode', 'Value', 'Str_FiscalPrd'
    ]

    # Period-related columns (Dates and Values)
    freq_cols = []
    for i in range(1, 5):
        freq_cols += [f'Q{i}_Date', f'Q{i}']
    for i in range(1, 3):
        freq_cols += [f'S{i}_Date', f'S{i}']
    for i in range(1, 4):
        freq_cols += [f'T{i}_Date', f'T{i}']
    freq_cols += ['A_Date', 'A']

    # Columns we want to keep in the final output, in output order
    keep_cols = (
        [c for c in base_cols if c in working.columns] +
        ['TrueValue', 'AnnPITValue', 'AnnPITValue_Pct', 'HasFutureDateError'] +
        [c for c in freq_cols if c in working.columns]
    )

    # Drop helper columns such as OriginFP and intermediate numeric helpers
    drop_cols = [c for c in working.columns
                 if c.endswith('_OriginFP') or c in ['QNUM', 'SNUM', 'TNUM', 'TrueValue_Date']]
    working.drop(columns=drop_cols, inplace=True, errors='ignore')

    # Reorder to the final column set (columns not listed in keep_cols are dropped)
    special_processed = working.reindex(columns=keep_cols)

    # -------------------------------------------------------------------------
    # 8) Save outputs (requires Temp_file_path_DP and base_output_filename)
    # -------------------------------------------------------------------------
    # Both variables must be defined in a previous setup cell.
    # NOTE(review): these checks run only after all processing above has finished.
    assert 'Temp_file_path_DP' in globals(), "Temp_file_path_DP not found."
    assert 'base_output_filename' in globals(), "base_output_filename not found (set in Cell 0)."

    # Full export path and write to pipe-delimited text
    out_full = os.path.join(Temp_file_path_DP, f"{base_output_filename}.txt")
    special_processed.to_csv(out_full, sep='|', index=False)
    print("\nSaved full:", out_full)

    # Subset export with a small selection of columns
    subset_cols = ["ID", "PIT Date", "CompanyName", "HistCurrency", "FiscalPeriod", "AnnPITValue"]
    subset_cols_existing = [col for col in subset_cols if col in special_processed.columns]
    subset_df = special_processed[subset_cols_existing].copy()
    out_subset = os.path.join(Temp_file_path_DP, f"{base_output_filename}_subset.txt")
    subset_df.to_csv(out_subset, sep='|', index=False)
    print("Saved subset:", out_subset)
    del subset_df  # free some memory

    # -------------------------------------------------------------------------
    # 9) Row-accounting overview
    # -------------------------------------------------------------------------
    output_rows = len(special_processed)
    print("\n=== Row Accounting ===")
    print(f"Input rows:                     {input_rows:,}")
    print(f"Excluded by Frequency (E/L/R/U):{excluded_rows:,}")
    print(f"Dropped by quality (Pct rules): {dropped_quality_rows:,}")
    print(f"Output rows (final):            {output_rows:,}")

    # Sanity check: excluded + dropped + final should equal original.
    # The frequency exclusion and the quality filter are the only steps above
    # that remove rows.
    check_total = excluded_rows + dropped_quality_rows + output_rows
    print(f"Check: excluded + dropped + output = {check_total:,}")
    if check_total == input_rows:
        print("Row counts reconcile exactly.")
    else:
        print(f"Mismatch of {input_rows - check_total:+,} rows.")

    # Optional: trigger garbage collection (gc is imported in the setup cell)
    gc.collect()

else:
    # Nothing to process: special_encoded is not defined or is None
    print("special_encoded not found or None; skipping.")

Input dataset contains 1,478,266 rows before processing.


=== Future-date check (period dates > PIT Date) ===
Per-label violations: {'A_Date': 0, 'Q1_Date': 0, 'Q2_Date': 0, 'Q3_Date': 0, 'Q4_Date': 0, 'S1_Date': 0, 'S2_Date': 0, 'T1_Date': 0, 'T2_Date': 0, 'T3_Date': 0}
Rows with ANY future-dated period value: 0

=== AnnPITValue_Pct summary — BEFORE quality drop ===
         finite_rows: 833668
                mean: 15650.415999260122
              median: 100.0
winsorized_mean_1pct: 97.73801315381671
                 p10: 100.0
                 p20: 100.0
                 p30: 100.0
                 p40: 100.0
                 p50: 100.0
                 p60: 100.0
                 p70: 100.0
                 p80: 100.0
                 p90: 100.0

Rows to drop due to AnnPITValue_Pct (±inf or >250 or <25): 25,406

=== AnnPITValue_Pct summary — AFTER quality drop ===
         finite_rows: 808262
                mean: 99.7696399239103
              median: 100.0
winsorized_mean_1pct: 