### Mount Google Drive, Import Libraries and Define Paths

In [160]:
# =============================================================================
# ENVIRONMENT SETUP + PATH CONFIGURATION (SERVER / COLAB COMPATIBLE)
# =============================================================================

import os

# -----------------------------------------------------------------------------
# 0) HARD SAFETY: cap native thread usage (prevents pthread_create EAGAIN)
#    MUST be set before importing numpy / scipy / pandas: the native BLAS /
#    OpenMP runtimes read these variables when they are first loaded, so the
#    caps have to be in the environment before pandas (which imports numpy)
#    is imported below.
#    NOTE: if numpy was already imported earlier in the same kernel session,
#    restart the kernel for the caps to take effect.
# -----------------------------------------------------------------------------
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["OPENBLAS_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["NUMEXPR_MAX_THREADS"] = "1"
os.environ["VECLIB_MAXIMUM_THREADS"] = "1"
os.environ["BLIS_NUM_THREADS"] = "1"

import sys
import importlib
from pathlib import Path
import string
import re
import gc
import pandas as pd  # noqa: E402 - deliberately imported after the thread caps

# -----------------------------------------------------------------------------
# 1) Detect environment
# -----------------------------------------------------------------------------
# Colab is detected by checking whether the google.colab package is already
# loaded in this interpreter.
IN_COLAB = "google.colab" in sys.modules

# -----------------------------------------------------------------------------
# 2) (Colab only) Mount Google Drive
# -----------------------------------------------------------------------------
if IN_COLAB:
    from google.colab import drive
    drive.mount("/content/drive")
    BASE_PATH = "/content/drive/MyDrive/Colab Notebooks"
else:
    # Server base path (your target)
    BASE_PATH = "/home/jovyan/work/hpool1/pseidel/test"

print("IN_COLAB:", IN_COLAB)
print("BASE_PATH:", BASE_PATH)

# -----------------------------------------------------------------------------
# 3) Sanity checks: path exists + write permission
# -----------------------------------------------------------------------------
BASE = Path(BASE_PATH)
if not BASE.exists():
    raise FileNotFoundError(f"BASE_PATH does not exist: {BASE}")

# Quick write test (fails fast if you don't have permissions): create a small
# marker file in BASE and remove it again.
test_file = BASE / ".write_test_tmp"
try:
    test_file.write_text("ok", encoding="utf-8")
    test_file.unlink()
except OSError as e:
    # Chain the original error so the traceback shows the real OS-level cause.
    raise PermissionError(f"No write permission in {BASE}. Error: {e}") from e

# -----------------------------------------------------------------------------
# 4) Environment check: ensure required packages import cleanly
# -----------------------------------------------------------------------------
# Packages the rest of the notebook relies on. They are imported one at a time
# so that, if one fails, the last "Importing ..." line printed names the culprit.
required_packages = [
    "numpy",
    "scipy",
    "pandas",
    "linearmodels",
    "xlsxwriter",
]

for pkg in required_packages:
    print(f"Importing {pkg} ...")
    # Any import failure propagates and stops the cell here.
    importlib.import_module(pkg)
    print(f"{pkg} OK")

# -----------------------------------------------------------------------------
# 5) Base paths and input/output locations
# -----------------------------------------------------------------------------
# Top-level working directories below BASE (kept as plain strings).
Input_file_path = str(BASE / "Input")
Temp_file_path = str(BASE / "Temp")
Output_file_path = str(BASE / "Output")

# --- Raw input files ----------------------------------------------------------
Fundamentals_file_path = f"{Input_file_path}/WSFV_f_20250131.txt"
Current_file_path = f"{Input_file_path}/WSCurrent_f_20250131.txt"
Calendar_file_path = f"{Input_file_path}/WSCalendarPrd_f_20250131.txt"
Meta_file_path = f"{Input_file_path}/WSMetaData_f_20250131.txt"
Excel_file_path = f"{Input_file_path}/WS PIT Table Definitions V5 with start dates.xls"

# --- Further input locations (paths without a file extension and two files) ---
MarketValues_file_path = f"{Input_file_path}/Daily MV USD"
MarketValues_file_path_LC = f"{Input_file_path}/Daily MV LC"
DailyTotalReturns_file_path = f"{Input_file_path}/Daily Returns USD"
DailyIndexReturns_file_path = f"{Input_file_path}/Daily Index Returns USD"
Constituents_file_path = f"{Input_file_path}/Constituents.01.csv"
UniversalMatching_file_path = f"{Input_file_path}/Universal Matching File"

# --- Temp sub-directories, one per pipeline stage -----------------------------
Temp_file_path_GO = f"{Temp_file_path}/TempGeneralOverview"
Temp_file_path_EoC = f"{Temp_file_path}/TempExtractionofCharacteristics"
Temp_file_path_DP = f"{Temp_file_path}/TempDataPreparation"
Temp_file_path_A = f"{Temp_file_path}/TempAnomalies"
Temp_file_path_R = f"{Temp_file_path}/TempRegressionModel"

# --- Relevant-item lists ------------------------------------------------------
# Relevant_items_path and Relevant_items_path_A point at the same file.
Relevant_items_path = f"{Input_file_path}/RelevantItems.txt"
Relevant_items_path_A = f"{Input_file_path}/RelevantItems.txt"
Relevant_items_path_B = f"{Input_file_path}/RelevantItemsB.txt"
Relevant_items_path_C = f"{Input_file_path}/RelevantItemsC.txt"
Relevant_items_path_D = f"{Input_file_path}/RelevantItemsD.txt"

# --- Cleaned outputs of the general-overview stage ----------------------------
Subset_file_path = f"{Temp_file_path_GO}/Subsets"
Fundamentals_clean_file_path = f"{Temp_file_path_GO}/Fundamentals_clean.txt"
Current_clean_file_path = f"{Temp_file_path_GO}/Current_clean.txt"
# NOTE(review): Calendar/Meta live in an extra "Input" sub-folder while the two
# files above do not - confirm this asymmetry is intended.
Calendar_clean_file_path = f"{Temp_file_path_GO}/Input/Calendar_clean.txt"
Meta_clean_file_path = f"{Temp_file_path_GO}/Input/Meta_clean.txt"

# -----------------------------------------------------------------------------
# 6) Ensure required directories exist
# -----------------------------------------------------------------------------
for _required_dir in (
    Output_file_path,
    Temp_file_path_GO,
    Temp_file_path_EoC,
    Temp_file_path_DP,
    Temp_file_path_A,
    Temp_file_path_R,
    Subset_file_path,
    Path(Calendar_clean_file_path).parent,
):
    Path(_required_dir).mkdir(parents=True, exist_ok=True)

# -----------------------------------------------------------------------------
# 7) Streaming / deduplication settings
# -----------------------------------------------------------------------------
CHUNK_SIZE = 2_000_000
DATE_COL = "PIT Date"
DEDUP_KEYS = ["ID", "ItemCode", DATE_COL]

print("Paths configured. Temp outputs ->", Temp_file_path_GO)
print("Example input path ->", Fundamentals_file_path)


IN_COLAB: False
BASE_PATH: /home/jovyan/work/hpool1/pseidel/test
Importing numpy ...
numpy OK
Importing scipy ...
scipy OK
Importing pandas ...
pandas OK
Importing linearmodels ...
linearmodels OK
Importing xlsxwriter ...
xlsxwriter OK
Paths configured. Temp outputs -> /home/jovyan/work/hpool1/pseidel/test/Temp/TempGeneralOverview
Example input path -> /home/jovyan/work/hpool1/pseidel/test/Input/WSFV_f_20250131.txt


In [161]:
!free -h

               total        used        free      shared  buff/cache   available
Mem:           754Gi       241Gi       167Gi        55Mi       354Gi       512Gi
Swap:             0B          0B          0B


### Import Data Files to DataFrames

In [162]:
# =====================================================================================
# SUMMARY
# =====================================================================================
# This cell:
#
#   1. Defines a helper function `import_file_to_dataframe` that reads a pipe-delimited
#      text file into a pandas DataFrame (all columns as string; returns None on error).
#   2. Imports a list of "input" files from Input_file_path into DataFrames
#      (RelevantItems, CountryCodes, ...), storing them in globals() by filename.
#   3. Imports a list of "temp" files from Temp_file_path_EoC into DataFrames
#      (ADR_clean, CompanyName_clean, CurrencyCodes_clean, FYE_clean, ID_clean,
#       UpdateCodes_clean, ValueCoding), also stored in globals().
#   4. Identifies which subset_*.txt files exist in Subset_file_path based on the IDs
#      listed in RelevantItems.txt, and records their names (without .txt) in
#      `successful_subset_names`.
#
# No actual subset data is loaded here; that is deferred to later steps to keep
# memory usage under control.


# Function to import a file and return a pandas DataFrame
def import_file_to_dataframe(file_path):
    """
    Read a pipe-delimited text file into a pandas DataFrame.

    Every column is read as a string (dtype=str), so codes keep their leading
    zeros. Errors are not raised to the caller: a short message is printed and
    None is returned instead.

    Parameters:
        file_path: Path of the file to read.

    Returns:
        The loaded DataFrame, or None if the file is missing or cannot be read.
    """
    try:
        frame = pd.read_csv(file_path, sep='|', dtype=str)
    except FileNotFoundError:
        print(f"File not found: {file_path}")
    except Exception as exc:
        # Best-effort loader: report any other read/parse problem and carry on.
        print(f"Error importing file {file_path}: {exc}")
    else:
        return frame
    return None


# -------------------------------------------------------------------------
# Import files from Input directory
# -------------------------------------------------------------------------
# Files read from the Input directory
input_files_to_import = ["RelevantItems.txt", "CountryCodes.txt"]

# Files read from the Temp directory of the extraction-of-characteristics stage
# (Temp_file_path_EoC)
temp_files_to_import = [
    "ADR_clean.txt",
    "CompanyName_clean.txt",
    "CurrencyCodes_clean.txt",
    "FYE_clean.txt",
    "ID_clean.txt",
    "UpdateCodes_clean.txt",
    "ValueCoding.txt"
]

# -------------------------------------------------------------------------
# Load every listed file and publish it as a global variable named after the
# file (".txt" removed). Input files are processed first, then temp files.
# -------------------------------------------------------------------------
for source_dir, file_names in (
    (Input_file_path, input_files_to_import),
    (Temp_file_path_EoC, temp_files_to_import),
):
    for file_name in file_names:
        file_path = os.path.join(source_dir, file_name)
        var_name = file_name.replace(".txt", "")  # e.g. "RelevantItems", "ADR_clean"

        loaded = import_file_to_dataframe(file_path)
        # The global is set even when loading failed (its value is then None).
        globals()[var_name] = loaded
        if loaded is None:
            continue

        print(f"\nImported {file_name} as DataFrame '{var_name}'")
        print(f"Preview of '{var_name}':")
        print(loaded.head(), "\n")


# -------------------------------------------------------------------------
# Identify subset files that exist for the relevant items
# -------------------------------------------------------------------------
# Names (without ".txt") of the subset files that exist on disk
successful_subset_names = []

if globals().get('RelevantItems') is None:
    # RelevantItems was never loaded, or loading it failed (None)
    print("RelevantItems DataFrame not found or is empty. Cannot identify subset files.")
else:
    # Assume first column of RelevantItems holds the item IDs used in subset filenames
    relevant_ids = RelevantItems.iloc[:, 0].astype(str).tolist()

    print("\nIdentifying subset files to process...")
    for item_id in relevant_ids:
        subset_name = f"subset_{item_id}"
        file_name = f"{subset_name}.txt"
        file_path = os.path.join(Subset_file_path, file_name)

        # Only existence is checked here; nothing is read from the file
        if not os.path.exists(file_path):
            print(f"  File not found: {file_name}. Skipping.")
            continue

        successful_subset_names.append(subset_name)
        print(f"  Found {file_name}")

    print(f"\nIdentified {len(successful_subset_names)} subset files for processing.")

# Note: actual loading and processing of subset files happens later, in
# batch-based steps, to manage memory usage.



Imported RelevantItems.txt as DataFrame 'RelevantItems'
Preview of 'RelevantItems':
  ItemCode
0    01001
1    01051
2    01075
3    01101
4    01151 


Imported CountryCodes.txt as DataFrame 'CountryCodes'
Preview of 'CountryCodes':
  NatCo ImplCountry
0   012     Algeria
1   440   Lithuania
2   025   Argentina
3   442  Luxembourg
4   036   Australia 


Imported ADR_clean.txt as DataFrame 'ADR_clean'
Preview of 'ADR_clean':
          ID ADRIndicator
0  C036F63D0            N
1  C056879S0            X
2  C2461T100            N
3  C2504O500            N
4  C250C9180            N 


Imported CompanyName_clean.txt as DataFrame 'CompanyName_clean'
Preview of 'CompanyName_clean':
          ID                               CompanyName
0  C00948205             AGRIFORCE GROWING SYSTEMS LTD
1  C02500770            PEUGEOT CITROEN ARGENTINA S.A.
2  C02520200  ACINDAR INDUSTRIA ARGENTINA DE ACEROS SA
3  C02520220                       ALPARGATAS S.A.I.C.
4  C02520230               ALUAR ALUMINI

# 4.0. Extracting the most recent, annualized values per PIT Date (incl. Plausibility checks for the data)

## 4.1. Split according to source

In [163]:
# =====================================================================================
# SUMMARY
# =====================================================================================
# This cell processes a ValueCoding DataFrame and assigns a Category to each item
# (per sanitized item name), based on its data sources:
#
#   1. Validates that `ValueCoding` exists and is non-empty.
#   2. Sanitizes `ItemName` to a filesystem-safe `ItemName_Sanitized` (same rules as
#      used for filenames).
#   3. Normalizes the `Source` column (string type, trimmed).
#   4. Groups all distinct sources per `ItemName_Sanitized`.
#   5. Uses `decide_category` to map each sanitized name to a Category:
#        - Hardcoded overrides for certain items.
#        - Generic rules:
#             * presence of IS / Market      -> "Mixed"
#             * else presence of BS          -> "Annualized"
#             * else presence of CFS         -> "Special"
#        - otherwise                    -> None
#   6. Attaches the Category back to each row based on `ItemName_Sanitized`.
#   7. Creates three unique-item DataFrames:
#        - `annualized_items`
#        - `mixed_items`
#        - `special_items`
#   8. Exposes the processed objects in `globals()` for use in later cells.
#   9. Shows a sample and prints counts of each category.
#
# If `ValueCoding` is not present or is empty, processing is skipped.

# CELL 1 — Process ValueCoding and assign Category per ItemName_Sanitized

if globals().get('ValueCoding') is None or ValueCoding.empty:
    # ValueCoding is missing, failed to load (None), or has no rows
    print("ValueCoding DataFrame not found or is empty. Skipping processing.")
else:
    print("Processing ValueCoding DataFrame...")

    # Operate on a copy so the original ValueCoding stays untouched
    value_coding_processed = ValueCoding.copy()

    # --- Sanitize ItemName ---
    # Cast to string first, then apply the two regex passes in sequence:
    #   pass 1: spaces and filesystem-unsafe characters become underscores
    #   pass 2: anything that is not a word character, dot or hyphen is removed
    item_names = value_coding_processed['ItemName'].astype(str)
    value_coding_processed['ItemName'] = item_names
    value_coding_processed['ItemName_Sanitized'] = (
        item_names
        .str.replace(r'[ \-/\:\\*\?"<>|]', '_', regex=True)
        .str.replace(r'[^\w.-]', '', regex=True)
    )

    # --- Normalize Source ---
    # String-typed and trimmed.
    # NOTE(review): after .astype(str) a missing Source is presumably the
    # literal text 'nan', so the dropna() below likely never removes anything.
    value_coding_processed['Source'] = value_coding_processed['Source'].astype(str).str.strip()

    # ------------------------------------------------------------------
    # Collect the set of distinct sources for every sanitized item name
    # ------------------------------------------------------------------
    sources_per_name = (
        value_coding_processed
        .groupby('ItemName_Sanitized')['Source']
        .apply(lambda s: set(s.dropna()))
        .to_dict()
    )

    # ------------------------------------------------------------------
    # Category decision for one sanitized name
    # ------------------------------------------------------------------
    def decide_category(name, sources: set):
        """
        Return the category for one sanitized item name.

        Fixed per-item overrides are checked first. Otherwise the first
        matching rule wins, in this order:
            'IS' or 'Market' among the sources -> "Mixed"
            'BS' among the sources             -> "Annualized"
            'CFS' among the sources            -> "Special"
        If nothing matches, None is returned.
        """
        # Item-specific overrides take precedence over the generic rules
        overrides = {
            'Depreciation_Depletion__Amortization': 'Mixed',   # prefer 'IS' reading
            'Minority_Interest': 'Annualized',                 # prefer 'BS' reading
        }
        if name in overrides:
            return overrides[name]

        # Generic rules, evaluated in priority order
        if not {'IS', 'Market'}.isdisjoint(sources):
            return 'Mixed'
        if 'BS' in sources:
            return 'Annualized'
        if 'CFS' in sources:
            return 'Special'

        # No rule matched: no clear mapping for this item
        return None

    # ------------------------------------------------------------------
    # Category per sanitized name, then attached to every row
    # ------------------------------------------------------------------
    category_map = {
        name: decide_category(name, srcs)
        for name, srcs in sources_per_name.items()
    }
    value_coding_processed['Category'] = (
        value_coding_processed['ItemName_Sanitized'].map(category_map)
    )

    # ------------------------------------------------------------------
    # One row per sanitized name for each of the three categories
    # ------------------------------------------------------------------
    def _unique_items_in(category):
        """Rows of the given Category, de-duplicated on ItemName_Sanitized."""
        in_category = value_coding_processed['Category'] == category
        return (
            value_coding_processed[in_category]
            .drop_duplicates(subset=['ItemName_Sanitized'])
            .copy()
        )

    annualized_items = _unique_items_in('Annualized')
    mixed_items = _unique_items_in('Mixed')
    special_items = _unique_items_in('Special')

    # ------------------------------------------------------------------
    # Publish results in globals() for later cells
    # ------------------------------------------------------------------
    globals().update({
        'value_coding_processed': value_coding_processed,
        'annualized_items': annualized_items,
        'mixed_items': mixed_items,
        'special_items': special_items,
        'category_map': category_map,
    })

    # ------------------------------------------------------------------
    # Sample and per-category counts
    # ------------------------------------------------------------------
    print("\nProcessed ValueCoding DataFrame (sample):")
    display(value_coding_processed.head())

    print(f"\nNumber of Annualized items: {len(annualized_items)}")
    print(f"Number of Mixed items: {len(mixed_items)}")
    print(f"Number of Special items: {len(special_items)}")


Processing ValueCoding DataFrame...

Processed ValueCoding DataFrame (sample):


Unnamed: 0,ItemCode,ItemName,Source,ItemName_Sanitized,Category
0,5006,Market Price Current,Market,Market_Price_Current,Mixed
1,5007,Market Price YTD High Current,Market,Market_Price_YTD_High_Current,Mixed
2,5008,Market Price YTD Low Current,Market,Market_Price_YTD_Low_Current,Mixed
3,5009,Date of Current Price,Market,Date_of_Current_Price,Mixed
4,5091,Market Price 52 Week High Current,Market,Market_Price_52_Week_High_Current,Mixed



Number of Annualized items: 148
Number of Mixed items: 141
Number of Special items: 62


### Sort into correct bucket

In [164]:
# =====================================================================================
# SUMMARY
# =====================================================================================
# CELL 2 maps work_subset_*.txt files to categories ("Annualized", "Mixed", "Special")
# based on the ItemName_Sanitized that was derived in the previous cell.
#
# Steps:
#   1. Check that the required categorized DataFrames (annualized_items, mixed_items,
#      special_items) and the temporary directory path Temp_file_path_DP exist.
#   2. Build three sets of sanitized item names (annualized_names, mixed_names,
#      special_names) from those DataFrames.
#   3. List all files in Temp_file_path_DP and filter for those matching
#      "work_subset_*.txt".
#   4. For each work_subset file:
#        - Extract the sanitized item name from the filename.
#        - Determine whether it belongs to Mixed, Annualized, or Special based on
#          the sets created in step 2.
#        - Assign it a variable name (Mixed_n, Annualized_n, Special_n) and store
#          that mapping in dicts mixed_vars, annualized_vars, special_vars.
#   5. Store these dicts in globals() for use in later cells.
#   6. Print summary information and display the created dictionaries.
#   7. Perform garbage collection at the end.
#
# If any of the prerequisites are missing, it prints a message and skips the mapping.


# CELL 2 — Map work_subset files to categories using ItemName_Sanitized

_required_frames = ('annualized_items', 'mixed_items', 'special_items')

if (any(globals().get(_frame) is None for _frame in _required_frames)
        or 'Temp_file_path_DP' not in globals()):
    # A required DataFrame or the directory path is missing: do nothing
    print("Required DataFrames (annualized_items, mixed_items, special_items) or Temp_file_path_DP not found. Please run the categorization cell.")
else:
    print("Identifying work_subset files and creating variables based on categories...")

    # Sanitized item names belonging to each category
    annualized_names = set(annualized_items['ItemName_Sanitized'].dropna())
    mixed_names      = set(mixed_items['ItemName_Sanitized'].dropna())
    special_names    = set(special_items['ItemName_Sanitized'].dropna())

    # ------------------------------------------------------------------
    # Candidate files: "work_subset_*.txt" in the data-preparation temp
    # directory, sorted so variable numbering is deterministic
    # ------------------------------------------------------------------
    temp_files = os.listdir(Temp_file_path_DP)
    work_subset_files = sorted(
        f for f in temp_files
        if f.startswith('work_subset_') and f.endswith('.txt')
    )

    print(f"\nFound {len(work_subset_files)} work_subset files in Temp directory.")

    # Result mappings, e.g. "Annualized_1" -> "SomeItemName"
    annualized_vars = {}
    mixed_vars = {}
    special_vars = {}

    # (label, names in that category, mapping to fill); the tuple order is the
    # matching priority: Mixed, then Annualized, then Special.
    buckets = (
        ("Mixed", mixed_names, mixed_vars),
        ("Annualized", annualized_names, annualized_vars),
        ("Special", special_names, special_vars),
    )

    name_pattern = re.compile(r'work_subset_(.+)\.txt$')

    # ------------------------------------------------------------------
    # Assign every work_subset file to the first bucket containing its name
    # ------------------------------------------------------------------
    for file_name in work_subset_files:
        match = name_pattern.match(file_name)
        if match is None:
            print(f"  Filename format not as expected for '{file_name}'. Skipping processing.")
            continue

        sanitized_item_name = match.group(1)

        for label, names, mapping in buckets:
            if sanitized_item_name in names:
                # Numbering is 1-based and continues from what the bucket holds
                var_name = f"{label}_{len(mapping) + 1}"
                mapping[var_name] = sanitized_item_name
                print(f"  '{file_name}' -> {label} (variable '{var_name}').")
                break
        else:
            # The name is in none of the three category sets
            print(f"  '{file_name}' -> No matching Category (might be unmapped or ambiguous). Skipping.")

    # Per-category totals (one entry was added per matched file)
    annualized_count = len(annualized_vars)
    mixed_count = len(mixed_vars)
    special_count = len(special_vars)

    # ------------------------------------------------------------------
    # Publish the mappings in globals() for later cells
    # ------------------------------------------------------------------
    globals().update({
        'annualized_vars': annualized_vars,
        'mixed_vars': mixed_vars,
        'special_vars': special_vars,
    })

    # ------------------------------------------------------------------
    # Summary output and inspection
    # ------------------------------------------------------------------
    print(f"\nVariable creation complete.")
    print(f"Created {len(annualized_vars)} Annualized variables.")
    print(f"Created {len(mixed_vars)} Mixed variables.")
    print(f"Created {len(special_vars)} Special variables.")

    print("\nAnnualized Variables:")
    display(annualized_vars)

    print("\nMixed Variables:")
    display(mixed_vars)

    print("\nSpecial Variables:")
    display(special_vars)

    # Run garbage collection after building mappings
    gc.collect()


Identifying work_subset files and creating variables based on categories...

Found 49 work_subset files in Temp directory.
  'work_subset_Accounts_Payable.txt' -> Annualized (variable 'Annualized_1').
  'work_subset_Capital_Expenditures_Addtns_to_Fixed_Assets.txt' -> Special (variable 'Special_1').
  'work_subset_Cash_Dividends_Paid___Total.txt' -> Special (variable 'Special_2').
  'work_subset_Cash__Short_Term_Investments.txt' -> Annualized (variable 'Annualized_2').
  'work_subset_Com_Pfd_Redeemed_Retired_Converted_Etc..txt' -> Special (variable 'Special_3').
  'work_subset_Common_Equity.txt' -> Annualized (variable 'Annualized_3').
  'work_subset_Cost_of_Goods_Sold_Excl_Depreciation.txt' -> Mixed (variable 'Mixed_1').
  'work_subset_Current_Assets___Total.txt' -> Annualized (variable 'Annualized_4').
  'work_subset_Current_Liabilities___Total.txt' -> Annualized (variable 'Annualized_5').
  'work_subset_Deferred_Taxes.txt' -> Annualized (variable 'Annualized_6').
  'work_subset_Depre

{'Annualized_1': 'Accounts_Payable',
 'Annualized_2': 'Cash__Short_Term_Investments',
 'Annualized_3': 'Common_Equity',
 'Annualized_4': 'Current_Assets___Total',
 'Annualized_5': 'Current_Liabilities___Total',
 'Annualized_6': 'Deferred_Taxes',
 'Annualized_7': 'Income_Taxes_Payable',
 'Annualized_8': 'Inventories___Total',
 'Annualized_9': 'Investments_in_Associated_Companies',
 'Annualized_10': 'Investments_in_Sales__Direct_Financing_Leases',
 'Annualized_11': 'Long_Term_Debt',
 'Annualized_12': 'Long_Term_Receivables',
 'Annualized_13': 'Minority_Interest',
 'Annualized_14': 'Other_Assets___Total',
 'Annualized_15': 'Other_Current_Assets',
 'Annualized_16': 'Other_Current_Liabilities',
 'Annualized_17': 'Other_Investments',
 'Annualized_18': 'Other_Liabilities',
 'Annualized_19': 'Preferred_Stock',
 'Annualized_20': 'Property_Plant__Equipment___Net',
 'Annualized_21': 'ReceivablesNet',
 'Annualized_22': 'Short_Term_Debt__Current_Portion_of_LT_Debt',
 'Annualized_23': 'Total_Assets'


Mixed Variables:


{'Mixed_1': 'Cost_of_Goods_Sold_Excl_Depreciation',
 'Mixed_2': 'Depreciation_Depletion__Amortization',
 'Mixed_3': 'Earnings_Per_Share_Fiscal_Year_End',
 'Mixed_4': 'Income_Taxes',
 'Mixed_5': 'Interest_Expense___Total',
 'Mixed_6': 'Net_Income_Before_Extra_Items_Preferred_Divs',
 'Mixed_7': 'Net_Income_Used_to_Calculate_Basic_EPS',
 'Mixed_8': 'Net_Sales_or_Revenues',
 'Mixed_9': 'Operating_Income',
 'Mixed_10': 'Sales_Per_Share',
 'Mixed_11': 'Selling_General__Administrative_Expenses'}


Special Variables:


{'Special_1': 'Capital_Expenditures_Addtns_to_Fixed_Assets',
 'Special_2': 'Cash_Dividends_Paid___Total',
 'Special_3': 'Com_Pfd_Redeemed_Retired_Converted_Etc.',
 'Special_4': 'Disposal_of_Fixed_Assets',
 'Special_5': 'Extraordinary_Items',
 'Special_6': 'Funds_From_For_Other_Operating_Activities',
 'Special_7': 'Funds_From_Operations',
 'Special_8': 'Long_Term_Borrowings',
 'Special_9': 'Net_Cash_Flow___Financing',
 'Special_10': 'Net_Cash_Flow___Investing',
 'Special_11': 'Net_Cash_Flow___Operating_Activities',
 'Special_12': 'Net_Proceeds_From_Sale_Issue_of_Com__Pfd',
 'Special_13': 'Reduction_in_Long_Term_Debt'}

## 4.4. Balance Sheet (1-9)

### Annualized 1

#### Set Index

In [165]:
# =============================================================================
# SELECT A SINGLE ANNUALIZED_* ITEM AND PREPARE PATHS
# =============================================================================
# This cell:
#   1. Selects which Annualized_* item (from annualized_vars) should be processed.
#   2. Validates that annualized_vars and Temp_file_path_DP are available.
#   3. Builds the input file path for the selected "work_subset_<item>.txt".
#   4. Defines a base_output_filename used later when saving processed results.
#   5. Ensures the data-preparation temp directory exists.
#
# Usage:
#   - Change `annualized_index` to process a different Annualized_* dataset
#     (e.g. 1, 2, 10, ...).
#   - Assumes `annualized_vars` was created earlier (mapping "Annualized_n" to
#     sanitized item names) and `Temp_file_path_DP` was set in your environment
#     setup cell.
# =============================================================================

import os
from pathlib import Path

# --- Selection ----------------------------------------------------------------
# Index n of the "Annualized_n" entry to process; edit and re-run this cell to
# switch to a different dataset.
annualized_index = 1

# The mapping {'Annualized_1': 'SomeItem', ...} has to exist from an earlier cell.
assert 'annualized_vars' in globals(), "annualized_vars dict not found in globals()."

# Resolve the dictionary key and the sanitized item name stored under it.
item_key = "Annualized_{}".format(annualized_index)
target_item_name = annualized_vars.get(item_key)
assert target_item_name, f"{item_key} not found in annualized_vars."

print(f"Selected: {item_key}  ->  ItemName: '{target_item_name}'")

# --- Paths --------------------------------------------------------------------
# The temp directory variable has to exist from the environment setup.
assert 'Temp_file_path_DP' in globals(), "Temp_file_path_DP not found."

# Input: the "work_subset_<item>.txt" file for the selected item.
file_name = "work_subset_{}.txt".format(target_item_name)
file_path = os.path.join(Temp_file_path_DP, file_name)

# Common stem used for the files written by the annualized pipeline.
base_output_filename = "Annualized_{}_complete".format(target_item_name)

# Create the temp directory (including parents) if it is not there yet.
Path(Temp_file_path_DP).mkdir(parents=True, exist_ok=True)


Selected: Annualized_1  ->  ItemName: 'Accounts_Payable'


#### Import relevant data



In [166]:
# =============================================================================
# LOAD THE FULL DATASET FOR THE SELECTED ANNUALIZED ITEM
# =============================================================================
# This cell:
#   1. Uses `target_item_name` and `file_path` (defined in the previous cell)
#      to load the corresponding work_subset file.
#   2. Imports the file using `import_file_to_dataframe`.
#   3. Performs safety checks for existence and emptiness.
#   4. Shows a preview of the loaded dataset.
#   5. Falls back to an empty DataFrame if loading fails.
#   6. Runs garbage collection afterwards.
# =============================================================================

print(f"\nImporting full annualized dataset for Item: '{target_item_name}' ...")

if not os.path.exists(file_path):
    # Nothing to load: continue with an empty frame so later cells can test for it.
    print(f"File not found: {file_path}")
    annualized_raw = pd.DataFrame()
else:
    annualized_raw = import_file_to_dataframe(file_path)

    if annualized_raw is None or annualized_raw.empty:
        # Loader returned nothing usable; normalise to an empty frame.
        print("Annualized dataset appears empty or could not be loaded.")
        annualized_raw = pd.DataFrame()
    else:
        print(f"Full annualized dataset loaded successfully: {len(annualized_raw):,} rows total.")
        # Rich preview when display() works, plain-text preview otherwise.
        try:
            display(annualized_raw.head())
        except Exception:
            print(annualized_raw.head().to_string(index=False))

# Run a garbage collection pass after the load.
gc.collect()



Importing full annualized dataset for Item: 'Accounts_Payable' ...
Full annualized dataset loaded successfully: 3,058,863 rows total.


Unnamed: 0,ID,CompanyName,ImplCountry,CurrentCurrency,HistCurrency,PIT Date,Frequency,UpdateCode,FiscalPeriod,FYE Month,ItemCode,Value
0,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1992,December,3040,380.421036
1,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1993,December,3040,369.24053
2,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1994,December,3040,401.581395
3,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1996-05-03,A,3,1995,December,3040,228.545754
4,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1998-07-03,A,3,1996,December,3040,264.087772


0

#### Encode Frequency Code (Check of output required!)

In [167]:
# =============================================================================
# FISCAL PERIOD ENCODING FOR ANNUALIZED DATASET
# =============================================================================
# This cell:
#   1. Defines helper functions:
#        - last2: extract last two digits of a number as a zero-padded string.
#        - add_str_fiscalprd: create Str_FiscalPrd from numeric FiscalPeriod
#          and Frequency, derive an implied full-year FiscalPeriod, and check
#          for inconsistencies on annual rows.
#   2. Applies this encoding to `annualized_raw` (if available) and stores
#      the result in `annualized_encoded`.
#   3. Shows a preview of the encoded DataFrame.
#
# Assumptions:
#   - `annualized_raw` has already been loaded in a previous cell.
#   - `target_item_name` is defined and is just used for printing context.
#   - DataFrame contains at least the columns: 'Frequency', 'FiscalPeriod'.
# =============================================================================

import numpy as np
import pandas as pd
from IPython.display import display


def last2(n):
    """
    Give the final two digits of a number as a two-character string.

    The number is truncated to an integer, rendered with at least four
    zero-padded digits, and the last two characters are returned.

    Examples:
        2023 -> "23"
        85   -> "85"
        7    -> "07"
        NaN  -> None
    """
    if not pd.isna(n):
        padded = format(int(n), "04d")  # e.g. 23 -> "0023"
        return padded[-2:]
    return None


def add_str_fiscalprd(df: pd.DataFrame) -> pd.DataFrame:
    """
    Build 'Str_FiscalPrd' and overwrite 'FiscalPeriod' with an implied full year.

    Logic:
      1) Normalize Frequency to an uppercase string (missing -> "").
      2) For each row with a numeric FiscalPeriod, derive a string label
         depending on Frequency:
           - C, Q, E, R: "Q{(fp % 4) + 1}Y{yy}" with yy = last2(fp // 4)
           - A, B:       "Y{yy}"                with yy = last2(fp)
           - F, S:       "S{(fp % 2) + 1}Y{yy}" with yy = last2(fp // 2)
           - K, T, L, U: "T{(fp % 3) + 1}Y{yy}" with yy = last2(fp // 3)
         Rows with any other frequency, or with a missing / non-numeric
         FiscalPeriod, keep Str_FiscalPrd = NaN (no partial labels such as
         "Q<NA>Y" are produced).
      3) Extract the two-digit "yy" following "Y" and map it to a full year:
           yy >= 80 -> 19yy,  yy < 80 -> 20yy.
      4) For annual rows (A, B) compare the implied year to the original
         FiscalPeriod and print a short discrepancy summary.
      5) Overwrite 'FiscalPeriod' with the implied year and drop the helper
         columns used for the check.

    Parameters:
      df: DataFrame with at least 'Frequency' and 'FiscalPeriod'. The 'ID'
          column is additionally required only if discrepancies are shown.

    Returns:
      A new DataFrame (the input is not modified) with:
        - 'Str_FiscalPrd'
        - updated 'FiscalPeriod' (implied full year, NaN where unknown)
    """
    df = df.copy()

    # Normalize frequency codes for consistent logic
    df["Frequency"] = df["Frequency"].str.upper().fillna("")

    # Keep the original FiscalPeriod for the validation step below
    df["Original_FiscalPeriod"] = df["FiscalPeriod"]

    # Numeric version of FiscalPeriod for the modular arithmetic
    fp = pd.to_numeric(df["FiscalPeriod"], errors="coerce")
    has_fp = fp.notna()

    # Frequency masks
    m_quarter = df["Frequency"].isin(["C", "Q", "E", "R"])
    m_AB      = df["Frequency"].isin(["A", "B"])
    m_FS      = df["Frequency"].isin(["F", "S"])
    m_KTLU    = df["Frequency"].isin(["K", "T", "L", "U"])

    # Object dtype from the start: the column will hold strings. A float NaN
    # column triggers pandas' incompatible-dtype warning on assignment and
    # makes the later `.str` access fail when no row receives a label.
    df["Str_FiscalPrd"] = pd.Series(np.nan, index=df.index, dtype="object")

    # --- Annual (A, B): label is just the two-digit year ---
    annual_rows = m_AB & has_fp
    if annual_rows.any():
        annual_labels = "Y" + fp[annual_rows].apply(last2)
        df.loc[annual_rows, "Str_FiscalPrd"] = annual_labels.to_numpy()

    # --- Sub-annual: (mask, label prefix, periods per year) ---
    sub_annual_specs = (
        (m_quarter, "Q", 4),   # C, Q, E, R -> Q1..Q4
        (m_FS,      "S", 2),   # F, S       -> S1..S2
        (m_KTLU,    "T", 3),   # K, T, L, U -> T1..T3
    )
    for freq_mask, prefix, per_year in sub_annual_specs:
        rows = freq_mask & has_fp
        if not rows.any():
            continue
        # Work on the relevant rows only; last2 is a per-element Python call.
        sub_fp = fp[rows]
        period_no = ((sub_fp % per_year) + 1).astype("Int64").astype(str)
        year_yy = (sub_fp // per_year).apply(last2)
        labels = prefix + period_no + "Y" + year_yy
        df.loc[rows, "Str_FiscalPrd"] = labels.to_numpy()

    # --- Derive implied full-year FiscalPeriod from Str_FiscalPrd ---
    # Extract the "yy" part following "Y" in labels like "Q1Y23", "Y21", etc.
    year_part = df["Str_FiscalPrd"].str.extract(r"Y(\d{2})", expand=False)
    year_numeric = pd.to_numeric(year_part, errors="coerce")

    # Map yy to 19yy (yy >= 80) or 20yy (yy < 80); NaN stays NaN.
    century = np.where(year_numeric >= 80, 1900, 2000)
    df["ImplFiscPer_Calculated"] = year_numeric + century

    # --- Discrepancy check for annual rows (A, B only) ---
    annual_rows_for_check = df[m_AB].copy()
    original_numeric = pd.to_numeric(
        annual_rows_for_check["Original_FiscalPeriod"], errors="coerce"
    )
    implied = annual_rows_for_check["ImplFiscPer_Calculated"]
    # A row agrees if both values are equal, or if both are missing.
    agrees = (implied == original_numeric) | (implied.isna() & original_numeric.isna())
    discrepancy_rows = annual_rows_for_check[~agrees].copy()

    if not discrepancy_rows.empty:
        print(
            "\nDiscrepancies between original FiscalPeriod and calculated "
            "ImplFiscPer for annual (A, B) rows:"
        )
        display(
            discrepancy_rows[
                ["ID", "Frequency", "Original_FiscalPeriod",
                 "Str_FiscalPrd", "ImplFiscPer_Calculated"]
            ].head()
        )
        print(f"Total discrepancies for annual frequencies: {len(discrepancy_rows)}")
    else:
        print(
            "\nNo discrepancies between original FiscalPeriod and calculated "
            "ImplFiscPer for annual (A, B) rows."
        )

    # Overwrite FiscalPeriod with the implied year
    df["FiscalPeriod"] = df["ImplFiscPer_Calculated"]

    # Remove helper columns that are no longer needed
    df.drop(columns=["Original_FiscalPeriod", "ImplFiscPer_Calculated"], inplace=True)

    return df


# -----------------------------------------------------------------------------
# Run the fiscal-period encoding on the Annualized dataset
# -----------------------------------------------------------------------------
# Encode only when a non-empty `annualized_raw` exists from the loading cell;
# otherwise leave `annualized_encoded` as None.
if "annualized_raw" not in globals() or annualized_raw is None or annualized_raw.empty:
    print("annualized_raw not found or empty. Cannot perform encoding.")
    annualized_encoded = None
else:
    print(f"Applying fiscal period encoding to Annualized dataset for '{target_item_name}' ...")
    annualized_encoded = add_str_fiscalprd(annualized_raw)
    display(annualized_encoded.head())


Applying fiscal period encoding to Annualized dataset for 'Accounts_Payable' ...


  df.loc[m_quarter, "Str_FiscalPrd"] = (



No discrepancies between original FiscalPeriod and calculated ImplFiscPer for annual (A, B) rows.


Unnamed: 0,ID,CompanyName,ImplCountry,CurrentCurrency,HistCurrency,PIT Date,Frequency,UpdateCode,FiscalPeriod,FYE Month,ItemCode,Value,Str_FiscalPrd
0,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1992,December,3040,380.421036,Y92
1,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1993,December,3040,369.24053,Y93
2,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1994,December,3040,401.581395,Y94
3,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1996-05-03,A,3,1995,December,3040,228.545754,Y95
4,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1998-07-03,A,3,1996,December,3040,264.087772,Y96


#### Annualize data with most recent information (Check of output required!)

In [168]:
# =============================================================================
# ANNUALIZED PIPELINE: BUILD AnnPITValue FROM A/Q/S/T, QC, AND SAVE OUTPUT
# =============================================================================
# High-level overview:
#
#   1. Helper utilities
#      - _key, asof_numpy:
#          * Implement a fast, vectorized “as-of” join:
#              For each row in a left DataFrame, find the latest value in a
#              right DataFrame with the same keys and PIT Date <= left PIT Date.
#      - pctile, summarize_pct:
#          * Compute robust summary statistics for quality control, including
#            winsorized mean and decile percentiles.
#      - pick_latest_nonzero_within_year:
#          * For a given PIT Date and FiscalPeriod, evaluate all available
#            period values (A, Q1..Q4, S1..S2, T1..T3) with known origin
#            fiscal periods.
#          * Choose a single value as AnnPITValue based on:
#               - same-year vs prior-year vs other origin
#               - period priority (A > Q4 > T3 > S2 > Q3 > ... > Q1)
#               - the latest date within a one-year window before PIT.
#          * NEW: returns both the value and the period label from which it
#                 was chosen (AnnPITValue_Period).
#
#   2. Main pipeline for annualized_encoded:
#      - Filter out unsupported frequencies (E/L/R/U).
#      - Normalize types for PIT Date, FiscalPeriod, Value, and key columns.
#      - Derive QNUM, SNUM, TNUM indices from Str_FiscalPrd.
#      - Ensure all period- and date-columns (A/Q/S/T) exist.
#      - Build TrueValue from annual rows (A,B) as the last observed annual
#        value per (ID, FiscalPeriod, HistCurrency).
#      - Use asof_numpy to populate:
#           A, A_Date, A_OriginFP
#           Q1..Q4, S1..S2, T1..T3 and their dates + OriginFP (from origin FP).
#      - For each row, compute:
#           * AnnPITValue        = chosen value
#           * AnnPITValue_Period = 'A', 'Q4', 'S1', 'T3', etc.
#      - Check for any period dates that are after PIT Date (future-date errors).
#      - Compute AnnPITValue_Pct = AnnPITValue / TrueValue * 100 and drop rows
#        outside [50, 200] or with infinite ratios.
#      - Build a final, lean set of columns and save:
#           * full file:   <base_output_filename>.txt
#           * subset file: <base_output_filename>_subset.txt
#        NEW: AnnPITValue_Period is placed directly before AnnPITValue and
#             included in both full and subset outputs.
#      - Print a row-accounting overview for reconciliation.
# =============================================================================

import os
import gc
from datetime import timedelta

import numpy as np
import pandas as pd
from scipy.stats.mstats import winsorize

pd.options.mode.copy_on_write = True


# -----------------------------------------------------------------------------
# Helper: build a single key column from multiple columns
# -----------------------------------------------------------------------------
def _key(fr: pd.DataFrame, cols):
    """
    Build a composite string key by concatenating several columns with '||'.

    This is used to group records by (ID, HistCurrency, ItemCode, FiscalPeriod)
    as a single vectorizable key for the as-of join.

    Example:
        _key(df, ['ID', 'HistCurrency']) -> "123||USD"
    """
    return fr[cols].astype(str).agg('||'.join, axis=1)


# -----------------------------------------------------------------------------
# Helper: fast as-of join (right.PIT <= left.PIT)
# -----------------------------------------------------------------------------
def asof_numpy(left_df: pd.DataFrame, right_df: pd.DataFrame, by_cols: list[str]):
    """
    For each row in left_df, find the latest (as-of) Value from right_df such that:

        1) by_cols are equal on both sides (e.g. ID, HistCurrency, ItemCode, FiscalPeriod)
        2) right_df['PIT Date'] <= left_df['PIT Date']

    Implementation notes:
      - Rows with a missing key column, PIT Date (or Value on the right) are
        ignored; so are rows whose PIT Date cannot be parsed as a date.
      - PIT Dates on both sides are converted to datetime and floored to days.
      - A composite string key is built from by_cols via `_key`.
      - The right side is sorted by key and PIT Date and cut into one
        (dates, values) slice per key.
      - Left rows are grouped by key; within each group np.searchsorted
        (side='right', minus 1) locates the last right date <= each left date.
      - Results are written at the *positional* row number of each left row,
        so the output is aligned with left_df regardless of its index
        (non-contiguous or duplicated labels are fine).

    Parameters
    ----------
    left_df : pd.DataFrame
        Must contain by_cols and 'PIT Date'.
    right_df : pd.DataFrame
        Must contain by_cols, 'PIT Date' and 'Value'.
    by_cols : list[str]
        Columns that must match exactly between both sides.

    Returns
    -------
    out_vals : np.ndarray
        Array of matched values (float64), length len(left_df), NaN where no match.
    out_dates : np.ndarray
        Array of matched dates (datetime64[ns]), length len(left_df), NaT where no match.
    """
    # Initialize output arrays with NaN / NaT
    n_left = len(left_df)
    out_vals  = np.full(n_left, np.nan, dtype='float64')
    out_dates = np.full(n_left, 'NaT', dtype='datetime64[ns]')

    # Required columns: keys plus PIT Date (and Value on the right)
    by_cols   = list(by_cols)
    left_req  = by_cols + ['PIT Date']
    right_req = by_cols + ['PIT Date', 'Value']

    # Usable rows: nothing missing among the required columns
    lmask = left_df[left_req].notna().all(axis=1).to_numpy(dtype=bool)
    rmask = right_df[right_req].notna().all(axis=1).to_numpy(dtype=bool)
    if not lmask.any() or not rmask.any():
        return out_vals, out_dates

    # Positional row numbers of the usable left rows. These (not the index
    # labels) address out_vals / out_dates.
    l_pos = np.flatnonzero(lmask)
    l = left_df.loc[lmask, left_req].copy()
    r = right_df.loc[rmask, right_req].copy()

    # Normalize PIT Date columns to datetime, day precision
    l['PIT Date'] = pd.to_datetime(l['PIT Date'], errors='coerce').dt.floor('D')
    r['PIT Date'] = pd.to_datetime(r['PIT Date'], errors='coerce').dt.floor('D')

    # Values that failed to parse became NaT only now; drop them so they can
    # neither match nor be matched.
    l_ok = l['PIT Date'].notna().to_numpy()
    if not l_ok.all():
        l = l.loc[l_ok].copy()
        l_pos = l_pos[l_ok]
    r_ok = r['PIT Date'].notna().to_numpy()
    if not r_ok.all():
        r = r.loc[r_ok].copy()
    if l.empty or r.empty:
        return out_vals, out_dates

    # Sort right by key and PIT Date so each key is one contiguous,
    # date-ordered block that can be binary-searched.
    r['__k'] = _key(r, by_cols)
    r = r.sort_values(['__k', 'PIT Date']).reset_index(drop=True)

    rk   = r['__k'].to_numpy()
    rdt  = r['PIT Date'].to_numpy()
    rval = r['Value'].to_numpy()

    # Start offset of every key block on the right, and the matching end offsets
    r_keys, r_start = np.unique(rk, return_index=True)
    r_end = np.append(r_start[1:], len(rk))
    slices = {
        k: (rdt[s:e], rval[s:e])
        for k, s, e in zip(r_keys, r_start, r_end)
    }

    # Left side: sort by key (stable) so identical keys form contiguous blocks
    lk    = _key(l, by_cols).to_numpy()
    ldt   = l['PIT Date'].to_numpy()
    order = np.argsort(lk, kind='mergesort')
    sk, sd, sp = lk[order], ldt[order], l_pos[order]

    l_keys, l_start = np.unique(sk, return_index=True)
    l_end = np.append(l_start[1:], len(sk))

    for k, s, e in zip(l_keys, l_start, l_end):
        hit = slices.get(k)
        if hit is None:
            # No right-hand rows for this key
            continue
        r_dates, r_vals = hit

        # side='right' gives the index of the first right date > left date;
        # minus 1 is therefore the last right date <= left date (-1: none).
        pos   = np.searchsorted(r_dates, sd[s:e], side='right') - 1
        valid = pos >= 0
        if valid.any():
            targets = sp[s:e][valid]
            out_vals[targets]  = r_vals[pos[valid]]
            out_dates[targets] = r_dates[pos[valid]]

    return out_vals, out_dates


# -----------------------------------------------------------------------------
# Small helpers for QC statistics
# -----------------------------------------------------------------------------
def pctile(s: pd.Series, q: float):
    """
    Return the q-quantile of `s` (linear interpolation), or NaN if the
    computation raises for any reason.
    """
    try:
        result = s.quantile(q, interpolation='linear')
    except Exception:
        # Deliberate best-effort: QC statistics must not abort the pipeline.
        result = np.nan
    return result


def summarize_pct(series: pd.Series):
    """
    Summarise a numeric series after discarding +/-inf and NaN entries.

    Returns an empty dict when nothing finite remains. Otherwise the dict
    holds, in this order:
      - finite_rows: count of remaining observations
      - mean, median
      - winsorized_mean_1pct: mean after winsorizing 1% on each tail
      - p10, p20, ..., p90: deciles obtained through `pctile`
    """
    finite = series.replace([np.inf, -np.inf], np.nan).dropna()
    if finite.empty:
        return {}

    stats = {
        "finite_rows": len(finite),
        "mean": finite.mean(),
        "median": finite.median(),
        # An explicit copy is handed over so winsorize gets a writable array.
        "winsorized_mean_1pct": winsorize(
            finite.to_numpy().copy(), limits=[0.01, 0.01]
        ).mean(),
    }
    # Deciles p10 .. p90
    for decile in range(10, 100, 10):
        stats[f"p{decile}"] = pctile(finite, decile / 100)
    return stats


# -----------------------------------------------------------------------------
# Period prioritization and label helper
# -----------------------------------------------------------------------------
_PERIOD_PRIORITY = {
    'A': 100,   # annual
    'Q4': 90,
    'T3': 80,
    'S2': 70,
    'Q3': 60,
    'T2': 50,
    'S1': 40,
    'Q2': 30,
    'T1': 20,
    'Q1': 10,
}


def _label_from_colname(colname: str) -> str:
    """
    Map a value column name to a period label.

    Currently this is a thin wrapper:
      - 'A' stays 'A'
      - 'Q1'..'Q4', 'S1'.., 'T1'.. remain unchanged.
    """
    return 'A' if colname == 'A' else colname


# -----------------------------------------------------------------------------
# AnnPITValue selection using OriginFP and priority rules
# -----------------------------------------------------------------------------
def pick_latest_nonzero_within_year(
    row,
    value_cols,
    date_cols,
    pit_col='PIT Date',
    fp_col='FiscalPeriod'
):
    """
    Select a single annualized value (AnnPITValue) and its period label for
    one row.

    Parameters
    ----------
    row :
        Mapping-like record (supports `in`, `[]` and `.get`), e.g. a
        DataFrame row.
    value_cols, date_cols :
        Parallel sequences: value_cols[i] names a period value column
        ('A', 'Q1', ...) and date_cols[i] the column holding its date.
        Pairs where either column is absent from `row` are skipped.
    pit_col :
        Name of the point-in-time date field of the row.
    fp_col :
        Name of the fiscal-period field of the row.

    The logic:
      1) A (value, date) pair becomes a candidate only if its date
           - is not missing,
           - is not after the PIT date,
           - (floored to the day) is not earlier than PIT date - 365 days.
      2) Each candidate gets a year relation between its origin fiscal
         period, read from '<label>_OriginFP', and the row's FiscalPeriod (FP):
           - same    : OriginFP == FP
           - prior   : OriginFP == FP - 1
           - other   : anything else
           - unknown : the row's FP is missing or not convertible to int
         A missing (or unconvertible) OriginFP falls back to the row's FP,
         so such a candidate is treated as 'same' whenever FP is usable.
      3) NaN and 0.0 values are ignored in the tiers below.
      4) Selection tiers, first non-empty tier wins:
           a. same-year annual ('A', 'same')        -> latest date
           b. same-year partials (non-'A', 'same')  -> highest priority
              from _PERIOD_PRIORITY, then latest date
           c. prior-year annual ('A', 'prior')      -> latest date
           d. any remaining non-zero candidate      -> highest priority,
              then latest date
      5) If only zero-valued candidates exist, the zero with the highest
         (priority, date) is returned together with its label.
         If there are no candidates, or only NaN-valued ones, the result is
         (NaN, NaN). A missing PIT date also yields (NaN, NaN).

    Returns
    -------
    (value, label)
      value : float or NaN
      label : str or NaN (e.g. 'A', 'Q4', 'S1', 'T3')
    """
    pit = row[pit_col]
    if pd.isna(pit):
        return (np.nan, np.nan)

    # Oldest date that still counts as "within one year" of the PIT date
    cutoff = pit - timedelta(days=365)

    # Current row's fiscal period, used to interpret origin fiscal periods;
    # None when it is missing or cannot be converted to int.
    fp = row.get(fp_col, np.nan)
    try:
        fp_int = int(fp) if not pd.isna(fp) else None
    except Exception:
        fp_int = None

    # Each candidate is a tuple: (label, priority, date, value, year_rel)
    candidates = []
    for vcol, dcol in zip(value_cols, date_cols):
        if vcol not in row or dcol not in row:
            continue

        val = row[vcol]
        dt  = row[dcol]

        # Ignore missing or future dates (compared before normalisation)
        if pd.isna(dt) or dt > pit:
            continue

        dt = pd.to_datetime(dt, errors='coerce')
        if pd.isna(dt):
            continue

        dt = dt.floor('D')
        if dt < cutoff:
            # older than 1 year before PIT
            continue

        # Map column name to period label (A, Q1..Q4, etc.) and priority;
        # labels not listed in _PERIOD_PRIORITY get priority -1.
        label = _label_from_colname(vcol)
        prio  = _PERIOD_PRIORITY.get(label, -1)

        # Convert value to float for numeric comparisons (missing -> NaN)
        vnum  = float(val) if pd.notna(val) else np.nan

        # Determine origin fiscal period
        origin_col = f'{label}_OriginFP'
        origin_fp = row.get(origin_col, np.nan)
        if pd.isna(origin_fp):
            # fallback to current FP if origin not explicitly stored
            origin_fp = fp_int
        try:
            if origin_fp is not None and not pd.isna(origin_fp):
                origin_fp = int(origin_fp)
            else:
                origin_fp = None
        except Exception:
            # Unconvertible origin: treat it like the row's own FP
            origin_fp = fp_int

        # Compute relationship between origin fiscal period and current FP
        if fp_int is not None and origin_fp is not None:
            if origin_fp == fp_int:
                year_rel = 'same'
            elif origin_fp == fp_int - 1:
                year_rel = 'prior'
            else:
                year_rel = 'other'
        else:
            year_rel = 'unknown'

        candidates.append((label, prio, dt, vnum, year_rel))

    if not candidates:
        return (np.nan, np.nan)

    # Only non-NaN, non-zero values are considered as strong candidates
    def valid(seq):
        return [c for c in seq if not np.isnan(c[3]) and c[3] != 0.0]

    # 1) Same-year Annual A: prefer the latest annual that matches the row's FP
    same_year_annuals = valid(c for c in candidates if c[0] == 'A' and c[4] == 'same')
    if same_year_annuals:
        best = max(same_year_annuals, key=lambda x: x[2])  # latest date
        return (best[3], best[0])

    # 2) Same-year partial periods (Q, S, T) if no same-year A is available
    same_year_partials = valid(c for c in candidates if c[0] != 'A' and c[4] == 'same')
    if same_year_partials:
        # choose best by (priority, date)
        best = max(same_year_partials, key=lambda x: (x[1], x[2]))
        return (best[3], best[0])

    # 3) Prior-year annual push-forward: last annual from previous FP
    prior_year_annuals = valid(c for c in candidates if c[0] == 'A' and c[4] == 'prior')
    if prior_year_annuals:
        best = max(prior_year_annuals, key=lambda x: x[2])
        return (best[3], best[0])

    # 4) Fallback: any non-zero candidate (including 'other' / 'unknown'
    #    year relations) by (priority, date)
    others = valid(candidates)
    if others:
        best = max(others, key=lambda x: (x[1], x[2]))
        return (best[3], best[0])

    # If we get here, no non-zero value exists. Return 0.0 explicitly and keep
    # the label of the best zero-valued candidate, if there is one.
    zeros = [c for c in candidates if not np.isnan(c[3]) and c[3] == 0.0]
    if zeros:
        best_zero = max(zeros, key=lambda x: (x[1], x[2]))
        return (best_zero[3], best_zero[0])

    # Only NaN-valued candidates were present
    return (np.nan, np.nan)


# =============================================================================
# MAIN: annualized_encoded -> annualized_processed
# =============================================================================
if 'annualized_encoded' in globals() and annualized_encoded is not None:
    input_rows = len(annualized_encoded)
    print(f"Input dataset contains {input_rows:,} rows before processing.\n")

    # Work on a copy to avoid mutating the original DataFrame
    working = annualized_encoded.copy()

    # -------------------------------------------------------------------------
    # 1) Exclude frequencies that are not supported by this pipeline (E/L/R/U)
    # -------------------------------------------------------------------------
    excl_mask = working['Frequency'].astype(str).str.upper().isin(['E', 'L', 'R', 'U'])
    excluded_rows = int(excl_mask.sum())
    working = working.loc[~excl_mask].copy()

    # -------------------------------------------------------------------------
    # 2) Basic type normalization
    # -------------------------------------------------------------------------
    # PIT Date as datetime (day precision)
    working['PIT Date'] = pd.to_datetime(
        working['PIT Date'], format='%Y-%m-%d', errors='coerce'
    ).dt.floor('D')

    # FiscalPeriod and Value as numeric
    working['FiscalPeriod'] = pd.to_numeric(working['FiscalPeriod'], errors='coerce')
    working['Value']        = pd.to_numeric(working['Value'], errors='coerce')

    # Key-like columns as string (consistent joins and as-of keys)
    for c in ['ID', 'HistCurrency', 'ItemCode', 'Frequency', 'Str_FiscalPrd']:
        if c in working.columns:
            working[c] = working[c].astype(str)

    # -------------------------------------------------------------------------
    # 3) Parse Q/S/T sequence numbers from Str_FiscalPrd
    # -------------------------------------------------------------------------
    # Extract quarter index 1..4 from strings like "Q1Y23"
    working['QNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^Q([1-4])Y', expand=False),
        errors='coerce'
    )
    # Extract semiannual index 1..2 from "S1Y23"
    working['SNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^S([1-2])Y', expand=False),
        errors='coerce'
    )
    # Extract trimester index 1..3 from "T1Y23"
    working['TNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^T([1-3])Y', expand=False),
        errors='coerce'
    )

    # -------------------------------------------------------------------------
    # 4) Ensure A/Q/S/T value and date columns exist
    # -------------------------------------------------------------------------
    period_vals = [f'Q{i}' for i in range(1, 5)] + \
                  [f'S{i}' for i in range(1, 3)] + \
                  [f'T{i}' for i in range(1, 4)] + ['A']

    period_dates = [f'{p}_Date' for p in [f'Q{i}' for i in range(1, 5)] +
                                       [f'S{i}' for i in range(1, 3)] +
                                       [f'T{i}' for i in range(1, 4)]] + ['A_Date']

    # Create missing value/date columns initialized to NaN / NaT
    for c in period_vals:
        if c not in working.columns:
            working[c] = np.nan
    for c in period_dates:
        if c not in working.columns:
            working[c] = pd.NaT

    base_keys = ['ID', 'HistCurrency', 'ItemCode', 'FiscalPeriod']

    # -------------------------------------------------------------------------
    # 5) Build TrueValue from annual (A,B) rows
    # -------------------------------------------------------------------------
    # TrueValue is the last known annual value per (ID, FiscalPeriod, HistCurrency)
    mask_annual = working['Frequency'].isin(['A', 'B']) & working['Value'].notna()
    annual_src = (
        working.loc[mask_annual,
                    ['ID', 'FiscalPeriod', 'HistCurrency', 'PIT Date', 'Value']]
        .sort_values(['ID', 'FiscalPeriod', 'HistCurrency', 'PIT Date'])
        .drop_duplicates(['ID', 'FiscalPeriod', 'HistCurrency'], keep='last')
        .rename(columns={'Value': 'TrueValue', 'PIT Date': 'TrueValue_Date'})
    )
    working = working.merge(
        annual_src,
        on=['ID', 'FiscalPeriod', 'HistCurrency'],
        how='left'
    )

    # -------------------------------------------------------------------------
    # 6) As-of mapping for each frequency (same FiscalPeriod only)
    # -------------------------------------------------------------------------
    # 6.1 Annual as-of (A, B)
    src_A = working.loc[
        working['Frequency'].isin(['A', 'B']) & working['Value'].notna(),
        base_keys + ['PIT Date', 'Value']
    ].copy()
    vA, dA = asof_numpy(working, src_A, by_cols=base_keys)
    working['A'], working['A_Date'] = vA, dA
    working['A_OriginFP'] = np.where(
        working['A'].notna(), working['FiscalPeriod'], np.nan
    )

    # 6.2 Quarterly as-of (Q, C)
    src_Q = working.loc[
        working['Frequency'].isin(['Q', 'C']) & working['QNUM'].notna(),
        base_keys + ['QNUM', 'PIT Date', 'Value']
    ].copy()
    for q in (1, 2, 3, 4):
        rv = src_Q[src_Q['QNUM'] == q].drop(columns=['QNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'Q{q}', f'Q{q}_Date'
        working[col], working[dcol] = v, d
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        mask = working[col].notna() & working[ocol].isna()
        working.loc[mask, ocol] = working.loc[mask, 'FiscalPeriod']

    # 6.3 Semiannual as-of (S, F)
    src_S = working.loc[
        working['Frequency'].isin(['S', 'F']) & working['SNUM'].notna(),
        base_keys + ['SNUM', 'PIT Date', 'Value']
    ].copy()
    for s in (1, 2):
        rv = src_S[src_S['SNUM'] == s].drop(columns=['SNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'S{s}', f'S{s}_Date'
        working[col], working[dcol] = v, d
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        mask = working[col].notna() & working[ocol].isna()
        working.loc[mask, ocol] = working.loc[mask, 'FiscalPeriod']

    # 6.4 Trimester as-of (T, K)
    src_T = working.loc[
        working['Frequency'].isin(['T', 'K']) & working['TNUM'].notna(),
        base_keys + ['TNUM', 'PIT Date', 'Value']
    ].copy()
    for t in (1, 2, 3):
        rv = src_T[src_T['TNUM'] == t].drop(columns=['TNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'T{t}', f'T{t}_Date'
        working[col], working[dcol] = v, d
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        mask = working[col].notna() & working[ocol].isna()
        working.loc[mask, ocol] = working.loc[mask, 'FiscalPeriod']

    # -------------------------------------------------------------------------
    # 7) Normalize date columns (no forward-fill; only directly mapped values)
    # -------------------------------------------------------------------------
    working = working.sort_values(['ID', 'HistCurrency', 'FiscalPeriod', 'PIT Date'])

    value_labels  = period_vals
    date_labels   = period_dates
    origin_labels = [f'{lbl}_OriginFP' for lbl in value_labels]

    for c in date_labels:
        if c in working.columns:
            working[c] = pd.to_datetime(working[c], errors='coerce').dt.floor('D')

    # -------------------------------------------------------------------------
    # 8) Compute AnnPITValue and AnnPITValue_Period
    # -------------------------------------------------------------------------
    # NEW: we compute both the chosen annualized value and the period label
    # it came from (e.g., 'A', 'Q4', 'S1', 'T3') and store the label in
    # AnnPITValue_Period.
    ann_res = working.apply(
        lambda r: pd.Series(
            pick_latest_nonzero_within_year(
                r,
                value_cols=value_labels,
                date_cols=date_labels,
                pit_col='PIT Date',
                fp_col='FiscalPeriod'
            ),
            index=['AnnPITValue', 'AnnPITValue_Period']
        ),
        axis=1
    )
    working = pd.concat([working, ann_res], axis=1)

    # -------------------------------------------------------------------------
    # 9) Future-date QC check (period dates > PIT Date)
    # -------------------------------------------------------------------------
    date_cols_all = [
        'A_Date',
        'Q1_Date', 'Q2_Date', 'Q3_Date', 'Q4_Date',
        'S1_Date', 'S2_Date',
        'T1_Date', 'T2_Date', 'T3_Date'
    ]
    present = [c for c in date_cols_all if c in working.columns]
    viol_counts = {}
    any_mask = pd.Series(False, index=working.index)

    for c in present:
        m = (
            working[c].notna() &
            working['PIT Date'].notna() &
            (pd.to_datetime(working[c], errors='coerce') > working['PIT Date'])
        )
        viol_counts[c] = int(m.sum())
        any_mask |= m

    total_future_viol = int(any_mask.sum())
    print("\n=== Future-date check (period dates > PIT Date) ===")
    print("Per-label violations:", viol_counts)
    print(f"Rows with ANY future-dated period value: {total_future_viol}")
    working['HasFutureDateError'] = any_mask

    # -------------------------------------------------------------------------
    # 10) Compute AnnPITValue_Pct vs TrueValue and apply quality filter
    # -------------------------------------------------------------------------
    working['AnnPITValue_Pct'] = np.where(
        working['AnnPITValue'].notna() &
        working['TrueValue'].notna() &
        (working['TrueValue'] != 0),
        (working['AnnPITValue'] / working['TrueValue']) * 100,
        np.nan
    )

    pre_stats = summarize_pct(working['AnnPITValue_Pct'])
    print("\n=== AnnPITValue_Pct summary — BEFORE quality drop ===")
    for k, v in pre_stats.items():
        print(f"{k:>20}: {v}")

    pct = working['AnnPITValue_Pct']
    is_inf = np.isinf(pct)
    is_finite = np.isfinite(pct)
    out_of_range = is_finite & ((pct > 200) | (pct < 50))
    to_drop_quality = is_inf | out_of_range

    dropped_quality_rows = int(to_drop_quality.sum())
    print(f"\nRows to drop due to AnnPITValue_Pct (±inf or >200 or <50): {dropped_quality_rows:,}")

    working = working.loc[~to_drop_quality].copy()

    post_stats = summarize_pct(working['AnnPITValue_Pct'])
    print("\n=== AnnPITValue_Pct summary — AFTER quality drop ===")
    if post_stats:
        for k, v in post_stats.items():
            print(f"{k:>20}: {v}")
    else:
        print("No finite values remain after the quality drop.")

    # -------------------------------------------------------------------------
    # 11) Final columns and export
    # -------------------------------------------------------------------------
    base_cols = [
        'ID', 'CompanyName', 'ImplCountry', 'CurrentCurrency', 'HistCurrency',
        'PIT Date', 'Frequency', 'UpdateCode', 'FiscalPeriod', 'FYE Month',
        'ItemCode', 'Value', 'Str_FiscalPrd'
    ]

    freq_cols = []
    for i in range(1, 5):
        freq_cols += [f'Q{i}_Date', f'Q{i}']
    for i in range(1, 3):
        freq_cols += [f'S{i}_Date', f'S{i}']
    for i in range(1, 4):
        freq_cols += [f'T{i}_Date', f'T{i}']
    freq_cols += ['A_Date', 'A']

    # NEW: AnnPITValue_Period is included and explicitly placed
    # directly before AnnPITValue in the final column order.
    keep_cols = (
        [c for c in base_cols if c in working.columns] +
        ['TrueValue', 'AnnPITValue_Period', 'AnnPITValue',
         'AnnPITValue_Pct', 'HasFutureDateError'] +
        [c for c in freq_cols if c in working.columns]
    )

    # Drop helper columns that are only needed for intermediate computations
    drop_cols = [
        c for c in working.columns
        if c.endswith('_OriginFP') or c in ['QNUM', 'SNUM', 'TNUM', 'TrueValue_Date']
    ]
    working.drop(columns=drop_cols, inplace=True, errors='ignore')

    annualized_processed = working.reindex(columns=keep_cols)

    # -------------------------------------------------------------------------
    # 12) Save full and subset outputs
    # -------------------------------------------------------------------------
    assert 'Temp_file_path_DP' in globals(), "Temp_file_path_DP not found."
    assert 'base_output_filename' in globals(), "base_output_filename not found (set earlier)."

    out_full = os.path.join(Temp_file_path_DP, f"{base_output_filename}.txt")
    annualized_processed.to_csv(out_full, sep='|', index=False)
    print("\nSaved full:", out_full)

    # NEW: AnnPITValue_Period is included in the subset and appears before
    # AnnPITValue.
    subset_cols = [
        "ID", "PIT Date", "CompanyName", "HistCurrency",
        "FiscalPeriod", "AnnPITValue_Period", "AnnPITValue"
    ]
    subset_cols_existing = [col for col in subset_cols if col in annualized_processed.columns]
    subset_df = annualized_processed[subset_cols_existing].copy()
    out_subset = os.path.join(Temp_file_path_DP, f"{base_output_filename}_subset.txt")
    subset_df.to_csv(out_subset, sep='|', index=False)
    print("Saved subset:", out_subset)
    del subset_df

    # -------------------------------------------------------------------------
    # 13) Row-accounting overview
    # -------------------------------------------------------------------------
    output_rows = len(annualized_processed)
    print("\n=== Row Accounting ===")
    print(f"Input rows:                     {input_rows:,}")
    print(f"Excluded by Frequency (E/L/R/U):{excluded_rows:,}")
    print(f"Dropped by quality (Pct rules): {dropped_quality_rows:,}")
    print(f"Output rows (final):            {output_rows:,}")
    check_total = excluded_rows + dropped_quality_rows + output_rows
    print(f"Check: excluded + dropped + output = {check_total:,}")
    if check_total == input_rows:
        print("Row counts reconcile exactly.")
    else:
        print(f"Mismatch of {input_rows - check_total:+,} rows. Please investigate.")

    gc.collect()

else:
    print("annualized_encoded not found or None; skipping.")

Input dataset contains 3,058,863 rows before processing.


=== Future-date check (period dates > PIT Date) ===
Per-label violations: {'A_Date': 0, 'Q1_Date': 0, 'Q2_Date': 0, 'Q3_Date': 0, 'Q4_Date': 0, 'S1_Date': 0, 'S2_Date': 0, 'T1_Date': 0, 'T2_Date': 0, 'T3_Date': 0}
Rows with ANY future-dated period value: 0

=== AnnPITValue_Pct summary — BEFORE quality drop ===
         finite_rows: 2953426
                mean: 5107362.038584086
              median: 100.0
winsorized_mean_1pct: 115.44289714446015
                 p10: 79.59068207901444
                 p20: 99.07344279761138
                 p30: 100.0
                 p40: 100.0
                 p50: 100.0
                 p60: 100.0
                 p70: 100.0
                 p80: 102.59893249811991
                 p90: 137.964828798095

Rows to drop due to AnnPITValue_Pct (±inf or >200 or <50): 249,448

=== AnnPITValue_Pct summary — AFTER quality drop ===
         finite_rows: 2703978
                mean: 101.528507115408

### Annualized 2

#### Set Index

In [169]:
# =============================================================================
# SELECT A SINGLE ANNUALIZED_* ITEM AND PREPARE PATHS
# =============================================================================
# This cell:
#   1. Selects which Annualized_* item (from annualized_vars) should be processed.
#   2. Validates that annualized_vars and Temp_file_path_DP are available.
#   3. Builds the input file path for the selected "work_subset_<item>.txt".
#   4. Defines a base_output_filename used later when saving processed results.
#   5. Ensures the data-preparation temp directory exists.
#
# Usage:
#   - Change `annualized_index` to process a different Annualized_* dataset
#     (e.g. 1, 2, 10, ...).
#   - Assumes `annualized_vars` was created earlier (mapping "Annualized_n" to
#     sanitized item names) and `Temp_file_path_DP` was set in your environment
#     setup cell.
# =============================================================================

import os
from pathlib import Path

# --- Step 1: pick the Annualized_* entry to process --------------------------
annualized_index = 2  # edit this number to re-run the pipeline for another dataset

# Expected shape of annualized_vars: {'Annualized_1': 'SomeItem', ...}
assert "annualized_vars" in globals(), "annualized_vars dict not found in globals()."

# Look up the item name registered under the selected key.
# A missing key and an empty name are both rejected by the assert below.
item_key = f"Annualized_{annualized_index}"
target_item_name = annualized_vars.get(item_key)
assert target_item_name, f"{item_key} not found in annualized_vars."

print(f"Selected: {item_key}  ->  ItemName: '{target_item_name}'")

# --- Step 2: derive the input path and the output base name -------------------
assert "Temp_file_path_DP" in globals(), "Temp_file_path_DP not found."

# Prefix shared by every file the annualized pipeline writes for this item
base_output_filename = f"Annualized_{target_item_name}_complete"

# Location of the work_subset file for this item inside the temp directory
file_name = f"work_subset_{target_item_name}.txt"
file_path = os.path.join(Temp_file_path_DP, file_name)

# --- Step 3: create the temp directory (and parents) if it is missing ---------
Path(Temp_file_path_DP).mkdir(parents=True, exist_ok=True)


Selected: Annualized_2  ->  ItemName: 'Cash__Short_Term_Investments'


#### Import relevant data



In [170]:
# =============================================================================
# LOAD THE FULL DATASET FOR THE SELECTED SPECIAL ITEM (ANNUALIZED VERSION)
# =============================================================================
# This cell:
#   1. Uses `target_item_name` and `file_path` (defined in the previous cell)
#      to load the corresponding work_subset file.
#   2. Imports the file using `import_file_to_dataframe`.
#   3. Performs safety checks for existence and emptiness.
#   4. Shows a preview of the loaded dataset.
#   5. Falls back to an empty DataFrame if loading fails.
#   6. Runs garbage collection afterwards.
# =============================================================================

print(f"\nImporting full annualized dataset for Item: '{target_item_name}' ...")

if not os.path.exists(file_path):
    # Nothing on disk for this item: continue with an empty frame.
    print(f"File not found: {file_path}")
    annualized_raw = pd.DataFrame()
else:
    annualized_raw = import_file_to_dataframe(file_path)

    if annualized_raw is None or annualized_raw.empty:
        # The importer returned nothing usable: normalize to an empty frame.
        print("Annualized dataset appears empty or could not be loaded.")
        annualized_raw = pd.DataFrame()
    else:
        print(f"Full annualized dataset loaded successfully: {len(annualized_raw):,} rows total.")
        # Rich preview when display() works; plain-text preview otherwise.
        try:
            display(annualized_raw.head())
        except Exception:
            print(annualized_raw.head().to_string(index=False))

# Kept as the last expression of the cell, as in the original.
gc.collect()



Importing full annualized dataset for Item: 'Cash__Short_Term_Investments' ...
Full annualized dataset loaded successfully: 3,624,152 rows total.


Unnamed: 0,ID,CompanyName,ImplCountry,CurrentCurrency,HistCurrency,PIT Date,Frequency,UpdateCode,FiscalPeriod,FYE Month,ItemCode,Value
0,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1992,December,2001,167.849993
1,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1993,December,2001,320.816779
2,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1994,December,2001,167.728265
3,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1996-05-03,A,3,1995,December,2001,110.705844
4,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1998-07-03,A,3,1996,December,2001,70.212302


0

#### Encode Frequency Code (Check of output required!)

In [171]:
# =============================================================================
# FISCAL PERIOD ENCODING FOR ANNUALIZED DATASET
# =============================================================================
# This cell:
#   1. Defines helper functions:
#        - last2: extract last two digits of a number as a zero-padded string.
#        - add_str_fiscalprd: create Str_FiscalPrd from numeric FiscalPeriod
#          and Frequency, derive an implied full-year FiscalPeriod, and check
#          for inconsistencies on annual rows.
#   2. Applies this encoding to `annualized_raw` (if available) and stores
#      the result in `annualized_encoded`.
#   3. Shows a preview of the encoded DataFrame.
#
# Assumptions:
#   - `annualized_raw` has already been loaded in a previous cell.
#   - `target_item_name` is defined and is just used for printing context.
#   - DataFrame contains at least the columns: 'Frequency', 'FiscalPeriod'.
# =============================================================================

import numpy as np
import pandas as pd
from IPython.display import display


def last2(n):
    """
    Give the final two digits of ``n`` as a two-character string.

    Missing input (anything ``pd.isna`` reports as missing) yields ``None``.

    Examples:
        2023 -> "23"
        85   -> "85"
        5    -> "05"
        NaN  -> None
    """
    if not pd.isna(n):
        # Left-pad the integer text with zeros to width 4 ("23" -> "0023"),
        # then keep the trailing two characters.
        return str(int(n)).zfill(4)[-2:]
    return None


def add_str_fiscalprd(df: pd.DataFrame) -> pd.DataFrame:
    """
    Build 'Str_FiscalPrd' and overwrite 'FiscalPeriod' with an implied full year.

    Logic:
      1) Normalize Frequency to an uppercase string (missing -> "").
      2) Interpret the numeric FiscalPeriod depending on Frequency and create
         the string label Str_FiscalPrd:
           - C, Q, E, R: quarter-based  -> "Q{1-4}Y{yy}"  (fp % 4 + 1, fp // 4)
           - A, B:       annual         -> "Y{yy}"
           - F, S:       semiannual     -> "S{1-2}Y{yy}"  (fp % 2 + 1, fp // 2)
           - K, T, L, U: trimester-like -> "T{1-3}Y{yy}"  (fp % 3 + 1, fp // 3)
         Rows with any other Frequency, or with a FiscalPeriod that is missing
         or not numeric, keep Str_FiscalPrd = NaN.
      3) Extract the "yy" part from Str_FiscalPrd and map it to a full year:
           yy >= 80 -> 19yy (e.g. "85" -> 1985)
           yy <  80 -> 20yy (e.g. "23" -> 2023)
      4) For annual rows (A, B), compare that implied year with the original
         numeric FiscalPeriod and print a short discrepancy summary.
      5) Overwrite 'FiscalPeriod' with the implied year and drop the helper
         columns used for the check.

    Changes relative to the previous implementation:
      - 'Str_FiscalPrd' is created with object dtype. It used to be created as
        a float NaN column and then assigned strings, which pandas reports as
        an incompatible-dtype assignment.
      - Rows whose FiscalPeriod is missing no longer receive placeholder labels
        such as "Q<NA>Y" or "Y"; they stay NaN.
      - Labels are computed only on the rows of each frequency family instead
        of on the whole frame, and the yy -> year mapping is vectorized.

    Parameters:
      df: DataFrame with at least 'Frequency' and 'FiscalPeriod'; the
          discrepancy report additionally reads 'ID'.

    Returns:
      A new DataFrame (the input is not modified) with:
        - 'Str_FiscalPrd'
        - updated 'FiscalPeriod' (implied full year)
    """
    df = df.copy()

    # Normalize frequency codes for consistent logic
    df["Frequency"] = df["Frequency"].str.upper().fillna("")

    # Keep the untouched FiscalPeriod so the discrepancy report can show it
    df["Original_FiscalPeriod"] = df["FiscalPeriod"]

    # Numeric version of FiscalPeriod for modular arithmetic
    fp = pd.to_numeric(df["FiscalPeriod"], errors="coerce")
    has_fp = fp.notna()

    # Object dtype up front: the column is going to hold strings.
    df["Str_FiscalPrd"] = pd.Series(np.nan, index=df.index, dtype=object)

    # --- Annual (A, B): label is just "Y{yy}" ---
    m_AB = df["Frequency"].isin(["A", "B"])
    annual_rows = m_AB & has_fp
    if annual_rows.any():
        # .to_numpy() assigns positionally; row order matches the mask order.
        df.loc[annual_rows, "Str_FiscalPrd"] = (
            "Y" + fp[annual_rows].apply(last2)
        ).to_numpy()

    # --- Sub-annual families: (label prefix, periods per year, codes) ---
    sub_annual_specs = [
        ("Q", 4, ["C", "Q", "E", "R"]),
        ("S", 2, ["F", "S"]),
        ("T", 3, ["K", "T", "L", "U"]),
    ]
    for prefix, per_year, codes in sub_annual_specs:
        rows = df["Frequency"].isin(codes) & has_fp
        if not rows.any():
            continue
        encoded = fp[rows]
        # Period index within the year: 1..per_year
        number = ((encoded % per_year) + 1).astype("Int64").astype(str)
        # Year component, reduced to its last two digits
        yy = (encoded // per_year).apply(last2)
        df.loc[rows, "Str_FiscalPrd"] = (prefix + number + "Y" + yy).to_numpy()

    # --- Derive implied full-year FiscalPeriod from Str_FiscalPrd ---
    # Extract the "yy" part following "Y" in labels like "Q1Y23", "Y21", etc.
    year_part = df["Str_FiscalPrd"].str.extract(r"Y(\d{2})", expand=False)
    year_numeric = pd.to_numeric(year_part, errors="coerce")

    # yy >= 80 -> 19yy, otherwise 20yy; NaN stays NaN (NaN >= 80 is False and
    # 2000 + NaN is NaN).
    df["ImplFiscPer_Calculated"] = year_numeric + np.where(
        year_numeric >= 80, 1900, 2000
    )

    # --- Discrepancy check for annual rows (A, B only) ---
    implied = df["ImplFiscPer_Calculated"]
    # Agreement: numerically equal, or both sides missing.
    agrees = (implied == fp) | (implied.isna() & fp.isna())
    mismatch = m_AB & ~agrees
    n_mismatch = int(mismatch.sum())

    if n_mismatch:
        print(
            "\nDiscrepancies between original FiscalPeriod and calculated "
            "ImplFiscPer for annual (A, B) rows:"
        )
        display(
            df.loc[
                mismatch,
                ["ID", "Frequency", "Original_FiscalPeriod",
                 "Str_FiscalPrd", "ImplFiscPer_Calculated"]
            ].head()
        )
        print(f"Total discrepancies for annual frequencies: {n_mismatch}")
    else:
        print(
            "\nNo discrepancies between original FiscalPeriod and calculated "
            "ImplFiscPer for annual (A, B) rows."
        )

    # Overwrite FiscalPeriod with the implied year
    df["FiscalPeriod"] = df["ImplFiscPer_Calculated"]

    # Remove helper columns that are no longer needed
    df.drop(columns=["Original_FiscalPeriod", "ImplFiscPer_Calculated"], inplace=True)

    return df


# -----------------------------------------------------------------------------
# Run the fiscal-period encoding on the loaded Annualized dataset
# -----------------------------------------------------------------------------
if "annualized_raw" not in globals() or annualized_raw is None or annualized_raw.empty:
    # No usable input: leave annualized_encoded as None.
    print("annualized_raw not found or empty. Cannot perform encoding.")
    annualized_encoded = None
else:
    print(f"Applying fiscal period encoding to Annualized dataset for '{target_item_name}' ...")
    annualized_encoded = add_str_fiscalprd(annualized_raw)
    # Preview the first rows of the encoded frame
    display(annualized_encoded.head())


Applying fiscal period encoding to Annualized dataset for 'Cash__Short_Term_Investments' ...


  df.loc[m_quarter, "Str_FiscalPrd"] = (



No discrepancies between original FiscalPeriod and calculated ImplFiscPer for annual (A, B) rows.


Unnamed: 0,ID,CompanyName,ImplCountry,CurrentCurrency,HistCurrency,PIT Date,Frequency,UpdateCode,FiscalPeriod,FYE Month,ItemCode,Value,Str_FiscalPrd
0,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1992,December,2001,167.849993,Y92
1,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1993,December,2001,320.816779,Y93
2,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1994,December,2001,167.728265,Y94
3,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1996-05-03,A,3,1995,December,2001,110.705844,Y95
4,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1998-07-03,A,3,1996,December,2001,70.212302,Y96


#### Annualize data with most recent information (Check of output required!)

In [172]:
# =============================================================================
# ANNUALIZED PIPELINE: BUILD AnnPITValue FROM A/Q/S/T, QC, AND SAVE OUTPUT
# =============================================================================
# High-level overview:
#
#   1. Helper utilities
#      - _key, asof_numpy:
#          * Implement a fast, vectorized “as-of” join:
#              For each row in a left DataFrame, find the latest value in a
#              right DataFrame with the same keys and PIT Date <= left PIT Date.
#      - pctile, summarize_pct:
#          * Compute robust summary statistics for quality control, including
#            winsorized mean and decile percentiles.
#      - pick_latest_nonzero_within_year:
#          * For a given PIT Date and FiscalPeriod, evaluate all available
#            period values (A, Q1..Q4, S1..S2, T1..T3) with known origin
#            fiscal periods.
#          * Choose a single value as AnnPITValue based on:
#               - same-year vs prior-year vs other origin
#               - period priority (A > Q4 > T3 > S2 > Q3 > ... > Q1)
#               - the latest date within a one-year window before PIT.
#          * NEW: returns both the value and the period label from which it
#                 was chosen (AnnPITValue_Period).
#
#   2. Main pipeline for annualized_encoded:
#      - Filter out unsupported frequencies (E/L/R/U).
#      - Normalize types for PIT Date, FiscalPeriod, Value, and key columns.
#      - Derive QNUM, SNUM, TNUM indices from Str_FiscalPrd.
#      - Ensure all period- and date-columns (A/Q/S/T) exist.
#      - Build TrueValue from annual rows (A,B) as the last observed annual
#        value per (ID, FiscalPeriod, HistCurrency).
#      - Use asof_numpy to populate:
#           A, A_Date, A_OriginFP
#           Q1..Q4, S1..S2, T1..T3, their dates, and the matching *_OriginFP
#           columns (the FiscalPeriod of the row each value was mapped onto).
#      - For each row, compute:
#           * AnnPITValue        = chosen value
#           * AnnPITValue_Period = 'A', 'Q4', 'S1', 'T3', etc.
#      - Check for any period dates that are after PIT Date (future-date errors).
#      - Compute AnnPITValue_Pct = AnnPITValue / TrueValue * 100 and drop rows
#        outside [50, 200] or with infinite ratios.
#      - Build a final, lean set of columns and save:
#           * full file:   <base_output_filename>.txt
#           * subset file: <base_output_filename>_subset.txt
#        NEW: AnnPITValue_Period is placed directly before AnnPITValue and
#             included in both full and subset outputs.
#      - Print a row-accounting overview for reconciliation.
# =============================================================================

import os
import gc
from datetime import timedelta

import numpy as np
import pandas as pd
from scipy.stats.mstats import winsorize

pd.options.mode.copy_on_write = True


# -----------------------------------------------------------------------------
# Helper: build a single key column from multiple columns
# -----------------------------------------------------------------------------
def _key(fr: pd.DataFrame, cols):
    """
    Build a composite string key by concatenating several columns with '||'.

    This is used to group records by (ID, HistCurrency, ItemCode, FiscalPeriod)
    as a single vectorizable key for the as-of join.

    Example:
        _key(df, ['ID', 'HistCurrency']) -> "123||USD"
    """
    return fr[cols].astype(str).agg('||'.join, axis=1)


# -----------------------------------------------------------------------------
# Helper: fast as-of join (right.PIT <= left.PIT)
# -----------------------------------------------------------------------------
def asof_numpy(left_df: pd.DataFrame, right_df: pd.DataFrame, by_cols: list[str]):
    """
    For each row in left_df, find the latest (as-of) Value from right_df such that:

        1) by_cols are equal on both sides (e.g. ID, HistCurrency, ItemCode, FiscalPeriod)
        2) right_df['PIT Date'] <= left_df['PIT Date']

    Implementation notes:
      - Rows with a missing key column or PIT Date (and, on the right, a
        missing Value) are ignored.
      - Both PIT Date columns are converted to datetime and floored to days.
      - A composite string key is built from by_cols on both sides via _key.
      - The right side is sorted by key and PIT Date; per key we keep a slice
        of its date and value arrays.
      - Left rows are grouped by key (stable sort); for each group the right
        dates are binary-searched with np.searchsorted to find the last
        right date <= each left date.
      - Results are written at the *row position* of each left row, so the
        outputs line up with left_df row order whatever its index looks like.
        (Previously index labels were used as positions, which is only
        correct for a default 0..n-1 index.)

    Parameters
    ----------
    left_df : pd.DataFrame
        Rows to enrich; needs by_cols and 'PIT Date'.
    right_df : pd.DataFrame
        Candidate observations; needs by_cols, 'PIT Date' and 'Value'.
    by_cols : list[str]
        Columns that must match exactly between both sides.

    Returns
    -------
    out_vals : np.ndarray
        Array of matched values (float64), default NaN where no match.
    out_dates : np.ndarray
        Array of matched dates (datetime64[ns]), default NaT where no match.
    """
    by_cols = list(by_cols)

    # Initialize output arrays with NaN / NaT, one slot per left row
    out_vals  = np.full(len(left_df), np.nan, dtype='float64')
    out_dates = np.full(len(left_df), 'NaT', dtype='datetime64[ns]')

    # Required columns: keys plus PIT Date (and Value on the right)
    left_req  = by_cols + ['PIT Date']
    right_req = by_cols + ['PIT Date', 'Value']

    # Keep only rows where every required column is present
    lmask = left_df[left_req].notna().all(axis=1)
    rmask = right_df[right_req].notna().all(axis=1)
    if not lmask.any() or not rmask.any():
        return out_vals, out_dates

    l = left_df.loc[lmask, left_req].copy()
    r = right_df.loc[rmask, right_req].copy()

    # Normalize PIT Date columns to datetime, day precision
    l['PIT Date'] = pd.to_datetime(l['PIT Date'], errors='coerce').dt.floor('D')
    r['PIT Date'] = pd.to_datetime(r['PIT Date'], errors='coerce').dt.floor('D')

    # Composite keys for grouping
    l['__k'] = _key(l, by_cols)
    r['__k'] = _key(r, by_cols)

    # Sort right by key and PIT Date so each key is one contiguous,
    # date-ordered block that can be binary-searched.
    r = r.sort_values(['__k', 'PIT Date']).reset_index(drop=True)
    rk   = r['__k'].to_numpy()
    rdt  = r['PIT Date'].to_numpy()
    rval = r['Value'].to_numpy()

    # Start offset of every key block in the sorted right arrays; the extra
    # trailing entry closes the last block.
    uniq, first = np.unique(rk, return_index=True)
    bounds = np.append(first, len(r))
    slices = {
        k: (rdt[bounds[i]:bounds[i + 1]], rval[bounds[i]:bounds[i + 1]])
        for i, k in enumerate(uniq)
    }

    # Row positions (not index labels) of the surviving left rows
    l_pos = np.flatnonzero(lmask.to_numpy())
    lk    = l['__k'].to_numpy()
    ldt   = l['PIT Date'].to_numpy()

    # Stable sort so that identical left keys form contiguous blocks
    order = np.argsort(lk, kind='mergesort')
    sk, sd, sp = lk[order], ldt[order], l_pos[order]

    # Block boundaries: positions where the sorted key changes
    change = np.flatnonzero(sk[1:] != sk[:-1]) + 1
    starts = np.concatenate(([0], change))
    ends   = np.append(change, len(sk))

    for s, e in zip(starts, ends):
        hit = slices.get(sk[s])
        if hit is None:
            # No right-hand observations for this key
            continue
        r_dates, r_vals = hit

        # side='right' gives the index of the first right date > left date;
        # minus 1 is therefore the last right date <= left date (-1 if none).
        pos   = np.searchsorted(r_dates, sd[s:e], side='right') - 1
        valid = pos >= 0
        if np.any(valid):
            target = sp[s:e][valid]
            out_vals[target]  = r_vals[pos[valid]]
            out_dates[target] = r_dates[pos[valid]]

    return out_vals, out_dates


# -----------------------------------------------------------------------------
# Small helpers for QC statistics
# -----------------------------------------------------------------------------
def pctile(s: pd.Series, q: float):
    """
    Return the ``q`` quantile of ``s`` (linear interpolation).

    Any exception raised while computing the quantile is swallowed and NaN is
    returned instead.
    """
    try:
        value = s.quantile(q, interpolation='linear')
    except Exception:
        value = np.nan
    return value


def summarize_pct(series: pd.Series):
    """
    Summarize the finite observations of a numeric series.

    +inf / -inf are treated like missing values and removed together with
    NaN before any statistic is computed. If nothing finite remains, an
    empty dict is returned.

    Keys of the returned dictionary (in this order):
      - finite_rows          : count of finite observations
      - mean, median         : plain mean / median of the finite values
      - winsorized_mean_1pct : mean after winsorizing with limits [0.01, 0.01]
      - p10, p20, ..., p90   : deciles obtained through `pctile`

    Note: `winsorize` is not defined in this cell; it is presumably
    scipy.stats.mstats.winsorize imported elsewhere in the notebook.
    """
    finite = series.replace([np.inf, -np.inf], np.nan).dropna()
    if finite.empty:
        return {}

    # Hand winsorize its own copy of the values so it always receives a
    # writable array.
    clipped = winsorize(finite.to_numpy().copy(), limits=[0.01, 0.01])

    stats = {
        "finite_rows": len(finite),
        "mean": finite.mean(),
        "median": finite.median(),
        "winsorized_mean_1pct": clipped.mean(),
    }
    # Deciles 10%..90%; decile / 100 yields exactly the floats 0.1 .. 0.9.
    for decile in range(10, 100, 10):
        stats[f"p{decile}"] = pctile(finite, decile / 100)
    return stats


# -----------------------------------------------------------------------------
# Period prioritization and label helper
# -----------------------------------------------------------------------------
_PERIOD_PRIORITY = {
    'A': 100,   # annual
    'Q4': 90,
    'T3': 80,
    'S2': 70,
    'Q3': 60,
    'T2': 50,
    'S1': 40,
    'Q2': 30,
    'T1': 20,
    'Q1': 10,
}


def _label_from_colname(colname: str) -> str:
    """
    Map a value column name to a period label.

    Currently this is a thin wrapper:
      - 'A' stays 'A'
      - 'Q1'..'Q4', 'S1'.., 'T1'.. remain unchanged.
    """
    return 'A' if colname == 'A' else colname


# -----------------------------------------------------------------------------
# AnnPITValue selection using OriginFP and priority rules
# -----------------------------------------------------------------------------
def pick_latest_nonzero_within_year(
    row,
    value_cols,
    date_cols,
    pit_col='PIT Date',
    fp_col='FiscalPeriod'
):
    """
    Select a single annualized value (AnnPITValue) for one row.

    Parameters
    ----------
    row : mapping-like (pandas Series or dict)
        Must support ``row[key]``, ``key in row`` and ``row.get(key, default)``.
    value_cols, date_cols : sequences of column names, paired positionally
        (e.g. 'Q1' with 'Q1_Date'). Pairs whose columns are absent from
        `row` are skipped.
    pit_col : name of the point-in-time date column.
    fp_col : name of the fiscal-period (year) column.

    Logic
    -----
      1) A (value, date) pair becomes a candidate only if its date, after
         coercion to a timestamp and flooring to day precision, satisfies
             PIT Date - 365 days <= date <= PIT Date.
         The date is normalized BEFORE it is compared, so string dates and
         intraday timestamps are judged at day precision like everything else.
      2) Each candidate gets a year relation from its origin fiscal period
         (column '<label>_OriginFP') versus the row's FiscalPeriod (FP):
           - same    : OriginFP == FP
           - prior   : OriginFP == FP - 1
           - other   : anything else
           - unknown : FP is missing or not convertible to int
         A missing or unparsable OriginFP falls back to FP, so with FP
         present such a candidate counts as 'same'.
      3) NaN and 0.0 values are not eligible in steps 4a-4d.
      4) Selection priority (first non-empty group wins):
           a) same-year annual ('A')      -> latest date
           b) same-year partials (Q/S/T)  -> highest priority, then latest date
           c) prior-year annual ('A')     -> latest date (push-forward)
           d) any remaining non-zero      -> highest priority, then latest date
           e) only zeros available        -> 0.0 with the label of the
                                             highest-priority / latest zero
         If no candidate carries a usable value, (NaN, NaN) is returned.

    Returns
    -------
    (value, label)
      value : float or NaN
      label : str or NaN (e.g. 'A', 'Q4', 'S1', 'T3')
    """
    pit = row[pit_col]
    if pd.isna(pit):
        return (np.nan, np.nan)

    # Oldest date that still counts as "within one year" of the PIT date.
    cutoff = pit - timedelta(days=365)

    # Current row's fiscal period, used to interpret origin fiscal periods.
    fp = row.get(fp_col, np.nan)
    try:
        fp_int = int(fp) if not pd.isna(fp) else None
    except Exception:
        # Best effort: an unparsable FP is treated as missing.
        fp_int = None

    # Each candidate is a tuple: (label, priority, date, value, year_rel)
    candidates = []
    for vcol, dcol in zip(value_cols, date_cols):
        if vcol not in row or dcol not in row:
            continue

        # Normalize the date first (timestamp, day precision). Comparing the
        # raw cell against PIT would raise for string dates and would reject
        # intraday timestamps that fall on the PIT day itself.
        dt = pd.to_datetime(row[dcol], errors='coerce')
        if pd.isna(dt):
            continue
        dt = dt.floor('D')

        # Keep only dates inside the window [PIT - 365 days, PIT].
        if dt > pit or dt < cutoff:
            continue

        # Map column name to period label (A, Q1..Q4, etc.) and priority;
        # labels without a configured priority sort below all known ones.
        label = _label_from_colname(vcol)
        prio  = _PERIOD_PRIORITY.get(label, -1)

        # Convert value to float for numeric comparisons
        # (assumes the value columns are numeric or missing).
        val  = row[vcol]
        vnum = float(val) if pd.notna(val) else np.nan

        # Determine origin fiscal period
        origin_col = f'{label}_OriginFP'
        origin_fp = row.get(origin_col, np.nan)
        if pd.isna(origin_fp):
            # fallback to current FP if origin not explicitly stored
            origin_fp = fp_int
        try:
            if origin_fp is not None and not pd.isna(origin_fp):
                origin_fp = int(origin_fp)
            else:
                origin_fp = None
        except Exception:
            # Unparsable origin: treat it like the row's own FP.
            origin_fp = fp_int

        # Compute relationship between origin fiscal period and current FP
        if fp_int is not None and origin_fp is not None:
            if origin_fp == fp_int:
                year_rel = 'same'
            elif origin_fp == fp_int - 1:
                year_rel = 'prior'
            else:
                year_rel = 'other'
        else:
            year_rel = 'unknown'

        candidates.append((label, prio, dt, vnum, year_rel))

    if not candidates:
        return (np.nan, np.nan)

    # Only non-NaN, non-zero values are considered as strong candidates
    def valid(seq):
        return [c for c in seq if not np.isnan(c[3]) and c[3] != 0.0]

    # 1) Same-year Annual A: prefer the latest annual that matches the row's FP
    same_year_annuals = valid(c for c in candidates if c[0] == 'A' and c[4] == 'same')
    if same_year_annuals:
        best = max(same_year_annuals, key=lambda x: x[2])  # latest date
        return (best[3], best[0])

    # 2) Same-year partial periods (Q, S, T) if no same-year A is available
    same_year_partials = valid(c for c in candidates if c[0] != 'A' and c[4] == 'same')
    if same_year_partials:
        # choose best by (priority, date)
        best = max(same_year_partials, key=lambda x: (x[1], x[2]))
        return (best[3], best[0])

    # 3) Prior-year annual push-forward: last annual from previous FP
    prior_year_annuals = valid(c for c in candidates if c[0] == 'A' and c[4] == 'prior')
    if prior_year_annuals:
        best = max(prior_year_annuals, key=lambda x: x[2])
        return (best[3], best[0])

    # 4) Fallback: any non-zero candidate by (priority, date), whatever its
    #    year relation
    others = valid(candidates)
    if others:
        best = max(others, key=lambda x: (x[1], x[2]))
        return (best[3], best[0])

    # If we get here, no non-zero value exists. Return 0.0 explicitly and keep
    # the label of the best zero-valued candidate.
    zeros = [c for c in candidates if not np.isnan(c[3]) and c[3] == 0.0]
    if zeros:
        best_zero = max(zeros, key=lambda x: (x[1], x[2]))
        return (best_zero[3], best_zero[0])

    # All in-window candidates carried NaN values.
    return (np.nan, np.nan)


# =============================================================================
# MAIN: annualized_encoded -> annualized_processed
# =============================================================================
# Pipeline overview (runs only when `annualized_encoded` exists and is not None):
#   - drops unsupported frequencies, normalizes types, parses Q/S/T indices,
#   - attaches TrueValue (latest annual value per ID/FiscalPeriod/HistCurrency),
#   - maps as-of A/Q/S/T values and dates onto every row via asof_numpy,
#   - picks AnnPITValue / AnnPITValue_Period per row,
#   - runs a future-date QC and a percentage-vs-TrueValue quality filter,
#   - writes a full and a subset pipe-delimited file and prints row accounting.
# Relies on names defined elsewhere in the notebook: asof_numpy,
# pick_latest_nonzero_within_year, summarize_pct, Temp_file_path_DP and
# base_output_filename. Produces the global `annualized_processed`.
if 'annualized_encoded' in globals() and annualized_encoded is not None:
    input_rows = len(annualized_encoded)
    print(f"Input dataset contains {input_rows:,} rows before processing.\n")

    # Work on a copy to avoid mutating the original DataFrame
    working = annualized_encoded.copy()

    # -------------------------------------------------------------------------
    # 1) Exclude frequencies that are not supported by this pipeline (E/L/R/U)
    # -------------------------------------------------------------------------
    excl_mask = working['Frequency'].astype(str).str.upper().isin(['E', 'L', 'R', 'U'])
    excluded_rows = int(excl_mask.sum())
    working = working.loc[~excl_mask].copy()

    # -------------------------------------------------------------------------
    # 2) Basic type normalization
    # -------------------------------------------------------------------------
    # PIT Date as datetime (day precision); values not matching %Y-%m-%d
    # become NaT because of errors='coerce'
    working['PIT Date'] = pd.to_datetime(
        working['PIT Date'], format='%Y-%m-%d', errors='coerce'
    ).dt.floor('D')

    # FiscalPeriod and Value as numeric
    working['FiscalPeriod'] = pd.to_numeric(working['FiscalPeriod'], errors='coerce')
    working['Value']        = pd.to_numeric(working['Value'], errors='coerce')

    # Key-like columns as string (consistent joins and as-of keys)
    for c in ['ID', 'HistCurrency', 'ItemCode', 'Frequency', 'Str_FiscalPrd']:
        if c in working.columns:
            working[c] = working[c].astype(str)

    # -------------------------------------------------------------------------
    # 3) Parse Q/S/T sequence numbers from Str_FiscalPrd
    # -------------------------------------------------------------------------
    # Extract quarter index 1..4 from strings like "Q1Y23"
    working['QNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^Q([1-4])Y', expand=False),
        errors='coerce'
    )
    # Extract semiannual index 1..2 from "S1Y23"
    working['SNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^S([1-2])Y', expand=False),
        errors='coerce'
    )
    # Extract trimester index 1..3 from "T1Y23"
    working['TNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^T([1-3])Y', expand=False),
        errors='coerce'
    )

    # -------------------------------------------------------------------------
    # 4) Ensure A/Q/S/T value and date columns exist
    # -------------------------------------------------------------------------
    # Value labels in the order Q1..Q4, S1..S2, T1..T3, A
    period_vals = [f'Q{i}' for i in range(1, 5)] + \
                  [f'S{i}' for i in range(1, 3)] + \
                  [f'T{i}' for i in range(1, 4)] + ['A']

    # Matching '<label>_Date' names in the same order, so the two lists can be
    # zipped pairwise later on
    period_dates = [f'{p}_Date' for p in [f'Q{i}' for i in range(1, 5)] +
                                       [f'S{i}' for i in range(1, 3)] +
                                       [f'T{i}' for i in range(1, 4)]] + ['A_Date']

    # Create missing value/date columns initialized to NaN / NaT
    for c in period_vals:
        if c not in working.columns:
            working[c] = np.nan
    for c in period_dates:
        if c not in working.columns:
            working[c] = pd.NaT

    base_keys = ['ID', 'HistCurrency', 'ItemCode', 'FiscalPeriod']

    # -------------------------------------------------------------------------
    # 5) Build TrueValue from annual (A,B) rows
    # -------------------------------------------------------------------------
    # TrueValue is the last known annual value per (ID, FiscalPeriod, HistCurrency)
    mask_annual = working['Frequency'].isin(['A', 'B']) & working['Value'].notna()
    annual_src = (
        working.loc[mask_annual,
                    ['ID', 'FiscalPeriod', 'HistCurrency', 'PIT Date', 'Value']]
        .sort_values(['ID', 'FiscalPeriod', 'HistCurrency', 'PIT Date'])
        .drop_duplicates(['ID', 'FiscalPeriod', 'HistCurrency'], keep='last')
        .rename(columns={'Value': 'TrueValue', 'PIT Date': 'TrueValue_Date'})
    )
    # annual_src is unique per key, so this left merge cannot multiply rows.
    # A column-on-column merge also yields a fresh 0..n-1 index.
    # NOTE(review): asof_numpy appears to use the left frame's index labels as
    # array positions - if so, step 6 depends on this index reset; confirm.
    working = working.merge(
        annual_src,
        on=['ID', 'FiscalPeriod', 'HistCurrency'],
        how='left'
    )

    # -------------------------------------------------------------------------
    # 6) As-of mapping for each frequency (same FiscalPeriod only)
    # -------------------------------------------------------------------------
    # 6.1 Annual as-of (A, B)
    src_A = working.loc[
        working['Frequency'].isin(['A', 'B']) & working['Value'].notna(),
        base_keys + ['PIT Date', 'Value']
    ].copy()
    vA, dA = asof_numpy(working, src_A, by_cols=base_keys)
    working['A'], working['A_Date'] = vA, dA
    # Origin fiscal period of a mapped annual value is the row's own FiscalPeriod
    working['A_OriginFP'] = np.where(
        working['A'].notna(), working['FiscalPeriod'], np.nan
    )

    # 6.2 Quarterly as-of (Q, C)
    src_Q = working.loc[
        working['Frequency'].isin(['Q', 'C']) & working['QNUM'].notna(),
        base_keys + ['QNUM', 'PIT Date', 'Value']
    ].copy()
    for q in (1, 2, 3, 4):
        rv = src_Q[src_Q['QNUM'] == q].drop(columns=['QNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'Q{q}', f'Q{q}_Date'
        working[col], working[dcol] = v, d
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        # Fill the origin FP only where a value was mapped and no origin is set yet
        mask = working[col].notna() & working[ocol].isna()
        working.loc[mask, ocol] = working.loc[mask, 'FiscalPeriod']

    # 6.3 Semiannual as-of (S, F)
    src_S = working.loc[
        working['Frequency'].isin(['S', 'F']) & working['SNUM'].notna(),
        base_keys + ['SNUM', 'PIT Date', 'Value']
    ].copy()
    for s in (1, 2):
        rv = src_S[src_S['SNUM'] == s].drop(columns=['SNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'S{s}', f'S{s}_Date'
        working[col], working[dcol] = v, d
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        mask = working[col].notna() & working[ocol].isna()
        working.loc[mask, ocol] = working.loc[mask, 'FiscalPeriod']

    # 6.4 Trimester as-of (T, K)
    src_T = working.loc[
        working['Frequency'].isin(['T', 'K']) & working['TNUM'].notna(),
        base_keys + ['TNUM', 'PIT Date', 'Value']
    ].copy()
    for t in (1, 2, 3):
        rv = src_T[src_T['TNUM'] == t].drop(columns=['TNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'T{t}', f'T{t}_Date'
        working[col], working[dcol] = v, d
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        mask = working[col].notna() & working[ocol].isna()
        working.loc[mask, ocol] = working.loc[mask, 'FiscalPeriod']

    # -------------------------------------------------------------------------
    # 7) Normalize date columns (no forward-fill; only directly mapped values)
    # -------------------------------------------------------------------------
    working = working.sort_values(['ID', 'HistCurrency', 'FiscalPeriod', 'PIT Date'])

    value_labels  = period_vals
    date_labels   = period_dates
    # NOTE: origin_labels is not referenced again within this block
    origin_labels = [f'{lbl}_OriginFP' for lbl in value_labels]

    for c in date_labels:
        if c in working.columns:
            working[c] = pd.to_datetime(working[c], errors='coerce').dt.floor('D')

    # -------------------------------------------------------------------------
    # 8) Compute AnnPITValue and AnnPITValue_Period
    # -------------------------------------------------------------------------
    # NEW: we compute both the chosen annualized value and the period label
    # it came from (e.g., 'A', 'Q4', 'S1', 'T3') and store the label in
    # AnnPITValue_Period.
    # The selection function is called once per row (apply with axis=1) and
    # builds one small Series per row, so this step scales with the row count.
    ann_res = working.apply(
        lambda r: pd.Series(
            pick_latest_nonzero_within_year(
                r,
                value_cols=value_labels,
                date_cols=date_labels,
                pit_col='PIT Date',
                fp_col='FiscalPeriod'
            ),
            index=['AnnPITValue', 'AnnPITValue_Period']
        ),
        axis=1
    )
    # concat along axis=1 aligns on the index, so the sort in step 7 is harmless
    working = pd.concat([working, ann_res], axis=1)

    # -------------------------------------------------------------------------
    # 9) Future-date QC check (period dates > PIT Date)
    # -------------------------------------------------------------------------
    date_cols_all = [
        'A_Date',
        'Q1_Date', 'Q2_Date', 'Q3_Date', 'Q4_Date',
        'S1_Date', 'S2_Date',
        'T1_Date', 'T2_Date', 'T3_Date'
    ]
    present = [c for c in date_cols_all if c in working.columns]
    viol_counts = {}
    any_mask = pd.Series(False, index=working.index)

    # Count, per date column, the rows whose mapped period date lies after the
    # row's PIT Date; any_mask accumulates rows violating in at least one column
    for c in present:
        m = (
            working[c].notna() &
            working['PIT Date'].notna() &
            (pd.to_datetime(working[c], errors='coerce') > working['PIT Date'])
        )
        viol_counts[c] = int(m.sum())
        any_mask |= m

    total_future_viol = int(any_mask.sum())
    print("\n=== Future-date check (period dates > PIT Date) ===")
    print("Per-label violations:", viol_counts)
    print(f"Rows with ANY future-dated period value: {total_future_viol}")
    # Violations are only flagged here, not removed
    working['HasFutureDateError'] = any_mask

    # -------------------------------------------------------------------------
    # 10) Compute AnnPITValue_Pct vs TrueValue and apply quality filter
    # -------------------------------------------------------------------------
    # Percentage of the chosen value relative to TrueValue; NaN whenever either
    # side is missing or TrueValue is zero
    working['AnnPITValue_Pct'] = np.where(
        working['AnnPITValue'].notna() &
        working['TrueValue'].notna() &
        (working['TrueValue'] != 0),
        (working['AnnPITValue'] / working['TrueValue']) * 100,
        np.nan
    )

    pre_stats = summarize_pct(working['AnnPITValue_Pct'])
    print("\n=== AnnPITValue_Pct summary — BEFORE quality drop ===")
    for k, v in pre_stats.items():
        print(f"{k:>20}: {v}")

    # Drop rows whose percentage is infinite or outside [50, 200].
    # Rows with a NaN percentage are neither infinite nor finite, so they are KEPT.
    pct = working['AnnPITValue_Pct']
    is_inf = np.isinf(pct)
    is_finite = np.isfinite(pct)
    out_of_range = is_finite & ((pct > 200) | (pct < 50))
    to_drop_quality = is_inf | out_of_range

    dropped_quality_rows = int(to_drop_quality.sum())
    print(f"\nRows to drop due to AnnPITValue_Pct (±inf or >200 or <50): {dropped_quality_rows:,}")

    working = working.loc[~to_drop_quality].copy()

    post_stats = summarize_pct(working['AnnPITValue_Pct'])
    print("\n=== AnnPITValue_Pct summary — AFTER quality drop ===")
    if post_stats:
        for k, v in post_stats.items():
            print(f"{k:>20}: {v}")
    else:
        print("No finite values remain after the quality drop.")

    # -------------------------------------------------------------------------
    # 11) Final columns and export
    # -------------------------------------------------------------------------
    base_cols = [
        'ID', 'CompanyName', 'ImplCountry', 'CurrentCurrency', 'HistCurrency',
        'PIT Date', 'Frequency', 'UpdateCode', 'FiscalPeriod', 'FYE Month',
        'ItemCode', 'Value', 'Str_FiscalPrd'
    ]

    # Per-period output columns, date first then value: Q1..Q4, S1..S2, T1..T3, A
    freq_cols = []
    for i in range(1, 5):
        freq_cols += [f'Q{i}_Date', f'Q{i}']
    for i in range(1, 3):
        freq_cols += [f'S{i}_Date', f'S{i}']
    for i in range(1, 4):
        freq_cols += [f'T{i}_Date', f'T{i}']
    freq_cols += ['A_Date', 'A']

    # NEW: AnnPITValue_Period is included and explicitly placed
    # directly before AnnPITValue in the final column order.
    keep_cols = (
        [c for c in base_cols if c in working.columns] +
        ['TrueValue', 'AnnPITValue_Period', 'AnnPITValue',
         'AnnPITValue_Pct', 'HasFutureDateError'] +
        [c for c in freq_cols if c in working.columns]
    )

    # Drop helper columns that are only needed for intermediate computations
    drop_cols = [
        c for c in working.columns
        if c.endswith('_OriginFP') or c in ['QNUM', 'SNUM', 'TNUM', 'TrueValue_Date']
    ]
    working.drop(columns=drop_cols, inplace=True, errors='ignore')

    # reindex fixes the column order and drops every column not in keep_cols
    annualized_processed = working.reindex(columns=keep_cols)

    # -------------------------------------------------------------------------
    # 12) Save full and subset outputs
    # -------------------------------------------------------------------------
    # NOTE(review): these checks run only after all processing above has
    # finished; a missing global is therefore detected late.
    assert 'Temp_file_path_DP' in globals(), "Temp_file_path_DP not found."
    assert 'base_output_filename' in globals(), "base_output_filename not found (set earlier)."

    # Full result as a pipe-delimited text file without the index
    out_full = os.path.join(Temp_file_path_DP, f"{base_output_filename}.txt")
    annualized_processed.to_csv(out_full, sep='|', index=False)
    print("\nSaved full:", out_full)

    # NEW: AnnPITValue_Period is included in the subset and appears before
    # AnnPITValue.
    subset_cols = [
        "ID", "PIT Date", "CompanyName", "HistCurrency",
        "FiscalPeriod", "AnnPITValue_Period", "AnnPITValue"
    ]
    subset_cols_existing = [col for col in subset_cols if col in annualized_processed.columns]
    subset_df = annualized_processed[subset_cols_existing].copy()
    out_subset = os.path.join(Temp_file_path_DP, f"{base_output_filename}_subset.txt")
    subset_df.to_csv(out_subset, sep='|', index=False)
    print("Saved subset:", out_subset)
    # Release the temporary subset copy
    del subset_df

    # -------------------------------------------------------------------------
    # 13) Row-accounting overview
    # -------------------------------------------------------------------------
    # Every input row should be either excluded by frequency, dropped by the
    # quality filter, or present in the output; anything else is reported below.
    output_rows = len(annualized_processed)
    print("\n=== Row Accounting ===")
    print(f"Input rows:                     {input_rows:,}")
    print(f"Excluded by Frequency (E/L/R/U):{excluded_rows:,}")
    print(f"Dropped by quality (Pct rules): {dropped_quality_rows:,}")
    print(f"Output rows (final):            {output_rows:,}")
    check_total = excluded_rows + dropped_quality_rows + output_rows
    print(f"Check: excluded + dropped + output = {check_total:,}")
    if check_total == input_rows:
        print("Row counts reconcile exactly.")
    else:
        print(f"Mismatch of {input_rows - check_total:+,} rows. Please investigate.")

    gc.collect()

else:
    print("annualized_encoded not found or None; skipping.")

Input dataset contains 3,624,152 rows before processing.


=== Future-date check (period dates > PIT Date) ===
Per-label violations: {'A_Date': 0, 'Q1_Date': 0, 'Q2_Date': 0, 'Q3_Date': 0, 'Q4_Date': 0, 'S1_Date': 0, 'S2_Date': 0, 'T1_Date': 0, 'T2_Date': 0, 'T3_Date': 0}
Rows with ANY future-dated period value: 0

=== AnnPITValue_Pct summary — BEFORE quality drop ===
         finite_rows: 3504849
                mean: 23366.10316848247
              median: 100.0
winsorized_mean_1pct: 104.68099707984614
                 p10: 70.41967537889334
                 p20: 93.21114357569256
                 p30: 100.0
                 p40: 100.0
                 p50: 100.0
                 p60: 100.0
                 p70: 100.0
                 p80: 100.0003420353733
                 p90: 122.0449509555434

Rows to drop due to AnnPITValue_Pct (±inf or >200 or <50): 306,344

=== AnnPITValue_Pct summary — AFTER quality drop ===
         finite_rows: 3198505
                mean: 100.384050389136

### Annualized 3

#### Set Index

In [173]:
# =============================================================================
# SELECT A SINGLE ANNUALIZED_* ITEM AND PREPARE PATHS
# =============================================================================
# This cell:
#   1. Selects which Annualized_* item (from annualized_vars) should be processed.
#   2. Validates that annualized_vars and Temp_file_path_DP are available.
#   3. Builds the input file path for the selected "work_subset_<item>.txt".
#   4. Defines a base_output_filename used later when saving processed results.
#   5. Ensures the data-preparation temp directory exists.
#
# Usage:
#   - Change `annualized_index` to process a different Annualized_* dataset
#     (e.g. 1, 2, 10, ...).
#   - Assumes `annualized_vars` was created earlier (mapping "Annualized_n" to
#     sanitized item names) and `Temp_file_path_DP` was set in your environment
#     setup cell.
# =============================================================================

import os
from pathlib import Path

# 1) Choose which Annualized_* item to run
annualized_index = 3  # change this to re-run a different dataset

# annualized_vars should look like: {'Annualized_1': 'SomeItem', ...}
# It is created by an earlier cell; fail fast if that cell was not run.
assert 'annualized_vars' in globals(), "annualized_vars dict not found in globals()."

# Build the key and fetch the corresponding sanitized item name
item_key = f"Annualized_{annualized_index}"
target_item_name = annualized_vars.get(item_key)
# Truthiness check: trips for a missing key (None) and also for an empty name
assert target_item_name, f"{item_key} not found in annualized_vars."

print(f"Selected: {item_key}  ->  ItemName: '{target_item_name}'")

# 2) Construct file paths based on the selected item
assert 'Temp_file_path_DP' in globals(), "Temp_file_path_DP not found."

# Input file produced by earlier steps (merging characteristics etc.)
file_name = f"work_subset_{target_item_name}.txt"
file_path = os.path.join(Temp_file_path_DP, file_name)

# Base name for all output files created in the annualized pipeline
# (later cells append ".txt" / "_subset.txt" to it)
base_output_filename = f"Annualized_{target_item_name}_complete"

# 3) Ensure the output directory exists
# Creates the directory and any missing parents; a no-op if it already exists.
Path(Temp_file_path_DP).mkdir(parents=True, exist_ok=True)


Selected: Annualized_3  ->  ItemName: 'Common_Equity'


#### Import relevant data



In [174]:
# =============================================================================
# LOAD THE FULL DATASET FOR THE SELECTED SPECIAL ITEM (ANNUALIZED VERSION)
# =============================================================================
# This cell:
#   1. Uses `target_item_name` and `file_path` (defined in the previous cell)
#      to load the corresponding work_subset file.
#   2. Imports the file using `import_file_to_dataframe`.
#   3. Performs safety checks for existence and emptiness.
#   4. Shows a preview of the loaded dataset.
#   5. Falls back to an empty DataFrame if loading fails.
#   6. Runs garbage collection afterwards.
# =============================================================================

print(f"\nImporting full annualized dataset for Item: '{target_item_name}' ...")

if not os.path.exists(file_path):
    # Nothing on disk for this item: continue with an empty frame.
    print(f"File not found: {file_path}")
    annualized_raw = pd.DataFrame()
else:
    annualized_raw = import_file_to_dataframe(file_path)

    if annualized_raw is None or annualized_raw.empty:
        # Loader returned nothing usable: normalize to an empty frame.
        print("Annualized dataset appears empty or could not be loaded.")
        annualized_raw = pd.DataFrame()
    else:
        print(f"Full annualized dataset loaded successfully: {len(annualized_raw):,} rows total.")
        # Rich preview when available, plain-text preview otherwise.
        try:
            display(annualized_raw.head())
        except Exception:
            print(annualized_raw.head().to_string(index=False))

# Kept as the final expression of the cell, as in the original.
gc.collect()



Importing full annualized dataset for Item: 'Common_Equity' ...
Full annualized dataset loaded successfully: 3,633,273 rows total.


Unnamed: 0,ID,CompanyName,ImplCountry,CurrentCurrency,HistCurrency,PIT Date,Frequency,UpdateCode,FiscalPeriod,FYE Month,ItemCode,Value
0,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1992,December,3501,378.411011
1,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1993,December,3501,434.756779
2,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1994,December,3501,518.204722
3,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1996-05-03,A,3,1995,December,3501,386.14278
4,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1998-07-03,A,3,1996,December,3501,387.109014


0

#### Encode Frequency Code (Check of output required!)

In [175]:
# =============================================================================
# FISCAL PERIOD ENCODING FOR ANNUALIZED DATASET
# =============================================================================
# This cell:
#   1. Defines helper functions:
#        - last2: extract last two digits of a number as a zero-padded string.
#        - add_str_fiscalprd: create Str_FiscalPrd from numeric FiscalPeriod
#          and Frequency, derive an implied full-year FiscalPeriod, and check
#          for inconsistencies on annual rows.
#   2. Applies this encoding to `annualized_raw` (if available) and stores
#      the result in `annualized_encoded`.
#   3. Shows a preview of the encoded DataFrame.
#
# Assumptions:
#   - `annualized_raw` has already been loaded in a previous cell.
#   - `target_item_name` is defined and is just used for printing context.
#   - DataFrame contains at least the columns: 'Frequency', 'FiscalPeriod'.
# =============================================================================

import numpy as np
import pandas as pd
from IPython.display import display


def last2(n):
    """
    Give the final two digits of a number as a two-character string.

    The value is truncated to an integer, left-padded with zeros to a width
    of four characters, and the last two characters are returned.

    Examples:
        n = 2023 -> "23"
        n = 85   -> "85"
        n = 5    -> "05"
        n = NaN  -> None
    """
    if not pd.isna(n):
        # zfill pads after any sign, e.g. 23 -> "0023", so the tail is always digits.
        return str(int(n)).zfill(4)[-2:]
    return None


def add_str_fiscalprd(df: pd.DataFrame) -> pd.DataFrame:
    """
    Build 'Str_FiscalPrd' and overwrite 'FiscalPeriod' with an implied full year.

    Logic:
      1) Normalize Frequency to uppercase string.
      2) For each row, interpret numeric FiscalPeriod depending on Frequency
         and create a string fiscal-period label Str_FiscalPrd:
           - C, Q, E, R: quarter-based  -> "Q{1-4}Y{yy}"
           - A, B:       annual         -> "Y{yy}"
           - F, S:       semiannual     -> "S{1-2}Y{yy}"
           - K, T, L, U: trimester-like -> "T{1-3}Y{yy}"
         The numbers (1..n) come from arithmetic on the numeric FiscalPeriod.
      3) Extract the "yy" part from Str_FiscalPrd and map to a full year:
           yy >= 80 -> 19yy (e.g. "85" -> 1985)
           yy <  80 -> 20yy (e.g. "23" -> 2023)
         This becomes ImplFiscPer_Calculated.
      4) For rows with annual frequency (A,B), compare ImplFiscPer_Calculated
         to the original FiscalPeriod and print a short discrepancy summary.
      5) Overwrite 'FiscalPeriod' with ImplFiscPer_Calculated and drop the
         helper columns used for the check.

    Returns:
      A new DataFrame with:
        - 'Str_FiscalPrd'
        - updated 'FiscalPeriod' (full-year integer)
    """
    df = df.copy()

    # Normalize frequency codes for consistent logic
    df["Frequency"] = df["Frequency"].str.upper().fillna("")

    # Store original FiscalPeriod for validation later
    df["Original_FiscalPeriod"] = df["FiscalPeriod"]

    # Numeric version of FiscalPeriod for modular arithmetic
    fp = pd.to_numeric(df["FiscalPeriod"], errors="coerce")

    # Frequency masks
    m_quarter = df["Frequency"].isin(["C", "Q", "E", "R"])
    m_AB      = df["Frequency"].isin(["A", "B"])
    m_FS      = df["Frequency"].isin(["F", "S"])
    m_KTLU    = df["Frequency"].isin(["K", "T", "L", "U"])

    # Initialize column for string fiscal period
    df["Str_FiscalPrd"] = np.nan

    # --- Quarter-based (C, Q, E, R) ---
    # Quarter number: 1..4
    q_part = ((fp % 4) + 1).where(m_quarter)
    # Year component (integer), then reduced to last 2 digits
    q_year = (fp // 4).where(m_quarter).apply(last2)
    df.loc[m_quarter, "Str_FiscalPrd"] = (
        "Q" + q_part.astype("Int64").astype(str) + "Y" + q_year.fillna("")
    )

    # --- Annual (A, B) ---
    ab_year = fp.where(m_AB).apply(last2)
    df.loc[m_AB, "Str_FiscalPrd"] = "Y" + ab_year.fillna("")

    # --- Semiannual (F, S) ---
    fs_sem  = ((fp % 2) + 1).where(m_FS)     # semester index 1 or 2
    fs_year = (fp // 2).where(m_FS).apply(last2)
    df.loc[m_FS, "Str_FiscalPrd"] = (
        "S" + fs_sem.astype("Int64").astype(str) + "Y" + fs_year.fillna("")
    )

    # --- Trimester-like (K, T, L, U) ---
    t_term = ((fp % 3) + 1).where(m_KTLU)    # term index 1..3
    t_year = (fp // 3).where(m_KTLU).apply(last2)
    df.loc[m_KTLU, "Str_FiscalPrd"] = (
        "T" + t_term.astype("Int64").astype(str) + "Y" + t_year.fillna("")
    )

    # --- Derive implied full-year FiscalPeriod from Str_FiscalPrd ---
    # Extract the "yy" part following "Y" in labels like "Q1Y23", "Y21", etc.
    year_part = df["Str_FiscalPrd"].str.extract(r"Y(\d{2})", expand=False)
    year_numeric = pd.to_numeric(year_part, errors="coerce")

    # Map yy to either 19yy or 20yy, depending on cutoff at 80
    df["ImplFiscPer_Calculated"] = year_numeric.apply(
        lambda x: int(f"19{int(x):02d}") if pd.notna(x) and x >= 80
        else (int(f"20{int(x):02d}") if pd.notna(x) else np.nan)
    )

    # --- Discrepancy check for annual rows (A,B only) ---
    annual_rows_for_check = df[m_AB].copy()
    discrepancy_mask_annual = ~(
        # Case 1: numeric equality
        (
            annual_rows_for_check["ImplFiscPer_Calculated"]
            == pd.to_numeric(
                annual_rows_for_check["Original_FiscalPeriod"],
                errors="coerce"
            )
        )
        |
        # Case 2: both fields are NaN
        (
            annual_rows_for_check["ImplFiscPer_Calculated"].isna()
            & pd.to_numeric(
                annual_rows_for_check["Original_FiscalPeriod"],
                errors="coerce"
            ).isna()
        )
    )
    discrepancy_rows = annual_rows_for_check[discrepancy_mask_annual].copy()

    if not discrepancy_rows.empty:
        print(
            "\nDiscrepancies between original FiscalPeriod and calculated "
            "ImplFiscPer for annual (A, B) rows:"
        )
        display(
            discrepancy_rows[
                ["ID", "Frequency", "Original_FiscalPeriod",
                 "Str_FiscalPrd", "ImplFiscPer_Calculated"]
            ].head()
        )
        print(f"Total discrepancies for annual frequencies: {len(discrepancy_rows)}")
    else:
        print(
            "\nNo discrepancies between original FiscalPeriod and calculated "
            "ImplFiscPer for annual (A, B) rows."
        )

    # Overwrite FiscalPeriod with the implied year
    df["FiscalPeriod"] = df["ImplFiscPer_Calculated"]

    # Remove helper columns that are no longer needed
    df.drop(columns=["Original_FiscalPeriod", "ImplFiscPer_Calculated"], inplace=True)

    return df


# -----------------------------------------------------------------------------
# Apply encoding to the Annualized dataset
# -----------------------------------------------------------------------------
# Encode the annualized dataset when it has been loaded and is non-empty;
# otherwise report the problem and leave `annualized_encoded` as None so that
# later cells can detect the missing input.
if "annualized_raw" not in globals() or annualized_raw is None or annualized_raw.empty:
    print("annualized_raw not found or empty. Cannot perform encoding.")
    annualized_encoded = None
else:
    print(f"Applying fiscal period encoding to Annualized dataset for '{target_item_name}' ...")
    annualized_encoded = add_str_fiscalprd(annualized_raw)
    display(annualized_encoded.head())


Applying fiscal period encoding to Annualized dataset for 'Common_Equity' ...


  df.loc[m_quarter, "Str_FiscalPrd"] = (



No discrepancies between original FiscalPeriod and calculated ImplFiscPer for annual (A, B) rows.


Unnamed: 0,ID,CompanyName,ImplCountry,CurrentCurrency,HistCurrency,PIT Date,Frequency,UpdateCode,FiscalPeriod,FYE Month,ItemCode,Value,Str_FiscalPrd
0,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1992,December,3501,378.411011,Y92
1,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1993,December,3501,434.756779,Y93
2,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1994,December,3501,518.204722,Y94
3,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1996-05-03,A,3,1995,December,3501,386.14278,Y95
4,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1998-07-03,A,3,1996,December,3501,387.109014,Y96


#### Annualize data with most recent information (Check of output required!)

In [176]:
# =============================================================================
# ANNUALIZED PIPELINE: BUILD AnnPITValue FROM A/Q/S/T, QC, AND SAVE OUTPUT
# =============================================================================
# High-level overview:
#
#   1. Helper utilities
#      - _key, asof_numpy:
#          * Implement a fast, vectorized “as-of” join:
#              For each row in a left DataFrame, find the latest value in a
#              right DataFrame with the same keys and PIT Date <= left PIT Date.
#      - pctile, summarize_pct:
#          * Compute robust summary statistics for quality control, including
#            winsorized mean and decile percentiles.
#      - pick_latest_nonzero_within_year:
#          * For a given PIT Date and FiscalPeriod, evaluate all available
#            period values (A, Q1..Q4, S1..S2, T1..T3) with known origin
#            fiscal periods.
#          * Choose a single value as AnnPITValue based on:
#               - same-year vs prior-year vs other origin
#               - period priority (A > Q4 > T3 > S2 > Q3 > ... > Q1)
#               - the latest date within a one-year window before PIT.
#          * NEW: returns both the value and the period label from which it
#                 was chosen (AnnPITValue_Period).
#
#   2. Main pipeline for annualized_encoded:
#      - Filter out unsupported frequencies (E/L/R/U).
#      - Normalize types for PIT Date, FiscalPeriod, Value, and key columns.
#      - Derive QNUM, SNUM, TNUM indices from Str_FiscalPrd.
#      - Ensure all period- and date-columns (A/Q/S/T) exist.
#      - Build TrueValue from annual rows (A,B) as the last observed annual
#        value per (ID, FiscalPeriod, HistCurrency).
#      - Use asof_numpy to populate:
#           A, A_Date, A_OriginFP
#           Q1..Q4, S1..S2, T1..T3 and their dates + OriginFP (from origin FP).
#      - For each row, compute:
#           * AnnPITValue        = chosen value
#           * AnnPITValue_Period = 'A', 'Q4', 'S1', 'T3', etc.
#      - Check for any period dates that are after PIT Date (future-date errors).
#      - Compute AnnPITValue_Pct = AnnPITValue / TrueValue * 100 and drop rows
#        outside [50, 200] or with infinite ratios.
#      - Build a final, lean set of columns and save:
#           * full file:   <base_output_filename>.txt
#           * subset file: <base_output_filename>_subset.txt
#        NEW: AnnPITValue_Period is placed directly before AnnPITValue and
#             included in both full and subset outputs.
#      - Print a row-accounting overview for reconciliation.
# =============================================================================

import os
import gc
from datetime import timedelta

import numpy as np
import pandas as pd
from scipy.stats.mstats import winsorize

pd.options.mode.copy_on_write = True


# -----------------------------------------------------------------------------
# Helper: build a single key column from multiple columns
# -----------------------------------------------------------------------------
def _key(fr: pd.DataFrame, cols):
    """
    Build a composite string key by concatenating several columns with '||'.

    Every selected column is converted to `str` and the per-row values are
    joined in the order given by `cols`. The result keeps the index of `fr`.

    This is used to group records by (ID, HistCurrency, ItemCode, FiscalPeriod)
    as a single key for the as-of join.

    Example:
        _key(df, ['ID', 'HistCurrency']) -> "123||USD"
    """
    as_text = fr[cols].astype(str)
    # Plain tuple iteration produces the same strings as a row-wise
    # DataFrame.agg(axis=1) without building a Series object per row.
    joined = ['||'.join(parts) for parts in as_text.itertuples(index=False, name=None)]
    return pd.Series(joined, index=fr.index, dtype=object)


# -----------------------------------------------------------------------------
# Helper: fast as-of join (right.PIT <= left.PIT)
# -----------------------------------------------------------------------------
def asof_numpy(left_df: pd.DataFrame, right_df: pd.DataFrame, by_cols: list[str]):
    """
    For each row in left_df, find the latest (as-of) Value from right_df such that:

        1) by_cols are equal on both sides (e.g. ID, HistCurrency, ItemCode, FiscalPeriod)
        2) right_df['PIT Date'] <= left_df['PIT Date']

    Implementation notes:
      - Both PIT Date columns are converted to datetime (unparseable -> NaT)
        and floored to days *before* incomplete rows are filtered out, so a
        date that fails to parse can never take part in the match.
      - Rows with a missing key, PIT Date or (right side) Value are ignored.
      - Keys are compared through their string form (see `_key`).
      - The right-hand rows are sorted by key and PIT Date; for each key the
        matching slice of dates/values is kept.
      - Left-hand rows are grouped by key; per group np.searchsorted locates
        the last right PIT Date <= each left PIT Date.
      - Results are written by *row position* in left_df, so left_df may have
        any index (non-contiguous, non-integer, duplicated).

    Returns
    -------
    out_vals : np.ndarray
        Array of matched values (float64), one per row of left_df in row
        order; NaN where no match.
    out_dates : np.ndarray
        Array of matched dates (datetime64[ns]), same order; NaT where no match.
    """
    # Initialize output arrays with NaNs/NaT
    out_vals  = np.full(len(left_df), np.nan, dtype='float64')
    out_dates = np.full(len(left_df), 'NaT', dtype='datetime64[ns]')

    # Required columns: keys plus PIT Date (and Value on the right)
    left_req  = by_cols + ['PIT Date']
    right_req = by_cols + ['PIT Date', 'Value']

    left  = left_df[left_req].copy()
    right = right_df[right_req].copy()

    # Normalize PIT Date columns to datetime, day precision
    left['PIT Date']  = pd.to_datetime(left['PIT Date'], errors='coerce').dt.floor('D')
    right['PIT Date'] = pd.to_datetime(right['PIT Date'], errors='coerce').dt.floor('D')

    # Keep only complete rows (after date coercion, so NaT rows are dropped too)
    keep_left  = left.notna().all(axis=1).to_numpy()
    keep_right = right.notna().all(axis=1).to_numpy()
    if not keep_left.any() or not keep_right.any():
        return out_vals, out_dates

    # Positions of the usable left rows inside left_df / the output arrays
    left_pos = np.flatnonzero(keep_left)
    left = left.loc[keep_left]

    # Right side: sort by key and date so each key forms one date-ordered block
    right = right.loc[keep_right].copy()
    right['__k'] = _key(right, by_cols)
    right = right.sort_values(['__k', 'PIT Date']).reset_index(drop=True)

    rk   = right['__k'].to_numpy()
    rdt  = right['PIT Date'].to_numpy()
    rval = right['Value'].to_numpy()

    # Start offset of every key block; the next block's start is this block's end
    uniq, first = np.unique(rk, return_index=True)
    stops = np.append(first[1:], len(rk))
    slices = {k: (rdt[s:e], rval[s:e]) for k, s, e in zip(uniq, first, stops)}

    # Left side: stable sort by key so identical keys become contiguous
    lk  = _key(left, by_cols).to_numpy()
    ldt = left['PIT Date'].to_numpy()
    order = np.argsort(lk, kind='mergesort')
    sk, sd, sp = lk[order], ldt[order], left_pos[order]

    # Block boundaries: a new block starts wherever the sorted key changes
    changed = np.asarray(sk[1:] != sk[:-1], dtype=bool)
    starts = np.flatnonzero(np.r_[True, changed])
    ends = np.append(starts[1:], len(sk))

    for s, e in zip(starts, ends):
        hit = slices.get(sk[s])
        if hit is None:
            # key does not occur on the right side
            continue
        r_dates, r_vals = hit
        block_dates = sd[s:e]
        block_pos   = sp[s:e]
        # side='right' gives the index of the first element > date; minus 1
        # is the index of the last element <= date (or -1 if there is none).
        pos   = np.searchsorted(r_dates, block_dates, side='right') - 1
        valid = pos >= 0
        if valid.any():
            out_vals[block_pos[valid]]  = r_vals[pos[valid]]
            out_dates[block_pos[valid]] = r_dates[pos[valid]]

    return out_vals, out_dates


# -----------------------------------------------------------------------------
# Small helpers for QC statistics
# -----------------------------------------------------------------------------
def pctile(s: pd.Series, q: float):
    """
    Return the q-quantile of `s` (linear interpolation), or NaN on any error.

    This is a deliberate best-effort wrapper for QC printing: whatever
    exception `Series.quantile` raises is swallowed and NaN is returned.
    """
    try:
        value = s.quantile(q, interpolation='linear')
    except Exception:
        # Best-effort by design: a failing statistic must not stop the QC report.
        return np.nan
    return value


def summarize_pct(series: pd.Series):
    """
    Summarize a numeric series after discarding +/-inf and NaN entries.

    Returns an empty dict when nothing finite is left. Otherwise the dict
    contains, in this order:
      - finite_rows: number of remaining observations
      - mean, median
      - winsorized_mean_1pct: mean after winsorizing 1% on each tail
      - p10, p20, ..., p90: decile quantiles (via `pctile`)
    """
    finite = series.replace([np.inf, -np.inf], np.nan).dropna()
    if finite.empty:
        return {}

    stats = {
        "finite_rows": len(finite),
        "mean": finite.mean(),
        "median": finite.median(),
    }

    # winsorize needs a writable array, hence the explicit copy.
    writable = finite.to_numpy().copy()
    stats["winsorized_mean_1pct"] = winsorize(writable, limits=[0.01, 0.01]).mean()

    # Deciles 10%..90%
    for decile in range(10, 100, 10):
        stats[f"p{decile}"] = pctile(finite, decile / 100)

    return stats


# -----------------------------------------------------------------------------
# Period prioritization and label helper
# -----------------------------------------------------------------------------
# Period labels from strongest to weakest; the priority drops by 10 per step,
# from 100 for the annual value 'A' down to 10 for 'Q1'.
_PERIOD_PRIORITY = {
    label: 10 * (10 - position)
    for position, label in enumerate(
        ('A', 'Q4', 'T3', 'S2', 'Q3', 'T2', 'S1', 'Q2', 'T1', 'Q1')
    )
}


def _label_from_colname(colname: str) -> str:
    """
    Translate a value column name into its period label.

    At present the mapping is the identity: 'A' yields 'A', and every other
    name ('Q1'..'Q4', 'S1'.., 'T1'..) is returned unchanged.
    """
    if colname == 'A':
        return 'A'
    return colname


# -----------------------------------------------------------------------------
# AnnPITValue selection using OriginFP and priority rules
# -----------------------------------------------------------------------------
def pick_latest_nonzero_within_year(
    row,
    value_cols,
    date_cols,
    pit_col='PIT Date',
    fp_col='FiscalPeriod'
):
    """
    Choose one annualized value (AnnPITValue) and its period label for a row.

    Inputs read from `row`:
      - the PIT Date (`pit_col`) and FiscalPeriod (`fp_col`),
      - for every pair in zip(value_cols, date_cols): the period value and
        its date,
      - optionally '<label>_OriginFP', the fiscal period the value came from
        (falls back to the row's own FiscalPeriod when missing).

    Candidate filter:
      A (value, date) pair is a candidate only if both columns exist, the
      date is not missing, and PIT Date - 365 days <= date <= PIT Date.

    Year relation of each candidate (OriginFP vs. the row's FiscalPeriod FP):
      'same' (OriginFP == FP), 'prior' (OriginFP == FP - 1), 'other', or
      'unknown' when either side is not available.

    Selection, using only candidates whose value is neither NaN nor 0.0:
      1) same-year 'A'                       -> latest date
      2) same-year partial period (not 'A')  -> highest priority, then latest date
      3) prior-year 'A'                      -> latest date
      4) any remaining candidate             -> highest priority, then latest date
    If only zero-valued candidates exist, the zero with the highest priority
    (then latest date) is returned together with its label.

    Returns
    -------
    (value, label)
      value : float or NaN
      label : str or NaN (e.g. 'A', 'Q4', 'S1', 'T3')
      (NaN, NaN) is returned when the PIT Date is missing or nothing qualifies.
    """
    nothing = (np.nan, np.nan)

    pit_date = row[pit_col]
    if pd.isna(pit_date):
        return nothing

    # Oldest date that is still accepted (one year before PIT)
    window_start = pit_date - timedelta(days=365)

    # Fiscal period of the row itself as an int, or None if unusable
    row_fp = row.get(fp_col, np.nan)
    try:
        row_year = None if pd.isna(row_fp) else int(row_fp)
    except Exception:
        row_year = None

    def origin_year(label):
        """Origin fiscal period for `label`, defaulting to the row's own FP."""
        raw = row.get(f'{label}_OriginFP', np.nan)
        if pd.isna(raw):
            # origin not stored explicitly -> assume the row's fiscal period
            raw = row_year
        try:
            if raw is None or pd.isna(raw):
                return None
            return int(raw)
        except Exception:
            return row_year

    def relation(origin):
        """Classify an origin year relative to the row's fiscal period."""
        if row_year is None or origin is None:
            return 'unknown'
        if origin == row_year:
            return 'same'
        if origin == row_year - 1:
            return 'prior'
        return 'other'

    # Each candidate is a tuple: (label, priority, date, value, year_relation)
    pool = []
    for val_name, date_name in zip(value_cols, date_cols):
        if not (val_name in row and date_name in row):
            continue

        raw_val = row[val_name]
        when = row[date_name]

        # Skip missing or future dates
        if pd.isna(when) or when > pit_date:
            continue

        when = pd.to_datetime(when, errors='coerce')
        if pd.isna(when):
            continue

        when = when.floor('D')
        if when < window_start:
            # more than one year before PIT
            continue

        label = _label_from_colname(val_name)
        rank = _PERIOD_PRIORITY.get(label, -1)
        number = float(raw_val) if pd.notna(raw_val) else np.nan

        pool.append((label, rank, when, number, relation(origin_year(label))))

    if not pool:
        return nothing

    # Strong candidates: value present and different from zero
    usable = [c for c in pool if not np.isnan(c[3]) and c[3] != 0.0]

    def by_date(c):
        return c[2]

    def by_rank_then_date(c):
        return (c[1], c[2])

    # Ordered selection stages: (filter, ranking). The first stage that has
    # at least one match decides the result.
    stages = (
        (lambda c: c[0] == 'A' and c[4] == 'same', by_date),
        (lambda c: c[0] != 'A' and c[4] == 'same', by_rank_then_date),
        (lambda c: c[0] == 'A' and c[4] == 'prior', by_date),
        (lambda c: True, by_rank_then_date),
    )
    for accept, ranking in stages:
        matches = [c for c in usable if accept(c)]
        if matches:
            best = max(matches, key=ranking)
            return (best[3], best[0])

    # No strong candidate: return an explicit 0.0 (with label) if one exists.
    zero_hits = [c for c in pool if not np.isnan(c[3]) and c[3] == 0.0]
    if zero_hits:
        best_zero = max(zero_hits, key=by_rank_then_date)
        return (best_zero[3], best_zero[0])

    return nothing


# =============================================================================
# MAIN: annualized_encoded -> annualized_processed
# =============================================================================
if 'annualized_encoded' in globals() and annualized_encoded is not None:
    input_rows = len(annualized_encoded)
    print(f"Input dataset contains {input_rows:,} rows before processing.\n")

    # Work on a copy to avoid mutating the original DataFrame
    working = annualized_encoded.copy()

    # -------------------------------------------------------------------------
    # 1) Exclude frequencies that are not supported by this pipeline (E/L/R/U)
    # -------------------------------------------------------------------------
    excl_mask = working['Frequency'].astype(str).str.upper().isin(['E', 'L', 'R', 'U'])
    excluded_rows = int(excl_mask.sum())
    working = working.loc[~excl_mask].copy()

    # -------------------------------------------------------------------------
    # 2) Basic type normalization
    # -------------------------------------------------------------------------
    # PIT Date as datetime (day precision)
    working['PIT Date'] = pd.to_datetime(
        working['PIT Date'], format='%Y-%m-%d', errors='coerce'
    ).dt.floor('D')

    # FiscalPeriod and Value as numeric
    working['FiscalPeriod'] = pd.to_numeric(working['FiscalPeriod'], errors='coerce')
    working['Value']        = pd.to_numeric(working['Value'], errors='coerce')

    # Key-like columns as string (consistent joins and as-of keys)
    for c in ['ID', 'HistCurrency', 'ItemCode', 'Frequency', 'Str_FiscalPrd']:
        if c in working.columns:
            working[c] = working[c].astype(str)

    # -------------------------------------------------------------------------
    # 3) Parse Q/S/T sequence numbers from Str_FiscalPrd
    # -------------------------------------------------------------------------
    # Extract quarter index 1..4 from strings like "Q1Y23"
    working['QNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^Q([1-4])Y', expand=False),
        errors='coerce'
    )
    # Extract semiannual index 1..2 from "S1Y23"
    working['SNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^S([1-2])Y', expand=False),
        errors='coerce'
    )
    # Extract trimester index 1..3 from "T1Y23"
    working['TNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^T([1-3])Y', expand=False),
        errors='coerce'
    )

    # -------------------------------------------------------------------------
    # 4) Ensure A/Q/S/T value and date columns exist
    # -------------------------------------------------------------------------
    period_vals = [f'Q{i}' for i in range(1, 5)] + \
                  [f'S{i}' for i in range(1, 3)] + \
                  [f'T{i}' for i in range(1, 4)] + ['A']

    period_dates = [f'{p}_Date' for p in [f'Q{i}' for i in range(1, 5)] +
                                       [f'S{i}' for i in range(1, 3)] +
                                       [f'T{i}' for i in range(1, 4)]] + ['A_Date']

    # Create missing value/date columns initialized to NaN / NaT
    for c in period_vals:
        if c not in working.columns:
            working[c] = np.nan
    for c in period_dates:
        if c not in working.columns:
            working[c] = pd.NaT

    base_keys = ['ID', 'HistCurrency', 'ItemCode', 'FiscalPeriod']

    # -------------------------------------------------------------------------
    # 5) Build TrueValue from annual (A,B) rows
    # -------------------------------------------------------------------------
    # TrueValue is the last known annual value per (ID, FiscalPeriod, HistCurrency)
    mask_annual = working['Frequency'].isin(['A', 'B']) & working['Value'].notna()
    annual_src = (
        working.loc[mask_annual,
                    ['ID', 'FiscalPeriod', 'HistCurrency', 'PIT Date', 'Value']]
        .sort_values(['ID', 'FiscalPeriod', 'HistCurrency', 'PIT Date'])
        .drop_duplicates(['ID', 'FiscalPeriod', 'HistCurrency'], keep='last')
        .rename(columns={'Value': 'TrueValue', 'PIT Date': 'TrueValue_Date'})
    )
    working = working.merge(
        annual_src,
        on=['ID', 'FiscalPeriod', 'HistCurrency'],
        how='left'
    )

    # -------------------------------------------------------------------------
    # 6) As-of mapping for each frequency (same FiscalPeriod only)
    # -------------------------------------------------------------------------
    # 6.1 Annual as-of (A, B)
    src_A = working.loc[
        working['Frequency'].isin(['A', 'B']) & working['Value'].notna(),
        base_keys + ['PIT Date', 'Value']
    ].copy()
    vA, dA = asof_numpy(working, src_A, by_cols=base_keys)
    working['A'], working['A_Date'] = vA, dA
    working['A_OriginFP'] = np.where(
        working['A'].notna(), working['FiscalPeriod'], np.nan
    )

    # 6.2 Quarterly as-of (Q, C)
    src_Q = working.loc[
        working['Frequency'].isin(['Q', 'C']) & working['QNUM'].notna(),
        base_keys + ['QNUM', 'PIT Date', 'Value']
    ].copy()
    for q in (1, 2, 3, 4):
        rv = src_Q[src_Q['QNUM'] == q].drop(columns=['QNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'Q{q}', f'Q{q}_Date'
        working[col], working[dcol] = v, d
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        mask = working[col].notna() & working[ocol].isna()
        working.loc[mask, ocol] = working.loc[mask, 'FiscalPeriod']

    # 6.3 Semiannual as-of (S, F)
    src_S = working.loc[
        working['Frequency'].isin(['S', 'F']) & working['SNUM'].notna(),
        base_keys + ['SNUM', 'PIT Date', 'Value']
    ].copy()
    for s in (1, 2):
        rv = src_S[src_S['SNUM'] == s].drop(columns=['SNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'S{s}', f'S{s}_Date'
        working[col], working[dcol] = v, d
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        mask = working[col].notna() & working[ocol].isna()
        working.loc[mask, ocol] = working.loc[mask, 'FiscalPeriod']

    # 6.4 Trimester as-of (T, K)
    src_T = working.loc[
        working['Frequency'].isin(['T', 'K']) & working['TNUM'].notna(),
        base_keys + ['TNUM', 'PIT Date', 'Value']
    ].copy()
    for t in (1, 2, 3):
        rv = src_T[src_T['TNUM'] == t].drop(columns=['TNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'T{t}', f'T{t}_Date'
        working[col], working[dcol] = v, d
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        mask = working[col].notna() & working[ocol].isna()
        working.loc[mask, ocol] = working.loc[mask, 'FiscalPeriod']

    # -------------------------------------------------------------------------
    # 7) Normalize date columns (no forward-fill; only directly mapped values)
    # -------------------------------------------------------------------------
    working = working.sort_values(['ID', 'HistCurrency', 'FiscalPeriod', 'PIT Date'])

    value_labels  = period_vals
    date_labels   = period_dates
    origin_labels = [f'{lbl}_OriginFP' for lbl in value_labels]

    for c in date_labels:
        if c in working.columns:
            working[c] = pd.to_datetime(working[c], errors='coerce').dt.floor('D')

    # -------------------------------------------------------------------------
    # 8) Compute AnnPITValue and AnnPITValue_Period
    # -------------------------------------------------------------------------
    # NEW: we compute both the chosen annualized value and the period label
    # it came from (e.g., 'A', 'Q4', 'S1', 'T3') and store the label in
    # AnnPITValue_Period.
    ann_res = working.apply(
        lambda r: pd.Series(
            pick_latest_nonzero_within_year(
                r,
                value_cols=value_labels,
                date_cols=date_labels,
                pit_col='PIT Date',
                fp_col='FiscalPeriod'
            ),
            index=['AnnPITValue', 'AnnPITValue_Period']
        ),
        axis=1
    )
    working = pd.concat([working, ann_res], axis=1)

    # -------------------------------------------------------------------------
    # 9) Future-date QC check (period dates > PIT Date)
    # -------------------------------------------------------------------------
    date_cols_all = [
        'A_Date',
        'Q1_Date', 'Q2_Date', 'Q3_Date', 'Q4_Date',
        'S1_Date', 'S2_Date',
        'T1_Date', 'T2_Date', 'T3_Date'
    ]
    present = [c for c in date_cols_all if c in working.columns]
    viol_counts = {}
    any_mask = pd.Series(False, index=working.index)

    for c in present:
        m = (
            working[c].notna() &
            working['PIT Date'].notna() &
            (pd.to_datetime(working[c], errors='coerce') > working['PIT Date'])
        )
        viol_counts[c] = int(m.sum())
        any_mask |= m

    total_future_viol = int(any_mask.sum())
    print("\n=== Future-date check (period dates > PIT Date) ===")
    print("Per-label violations:", viol_counts)
    print(f"Rows with ANY future-dated period value: {total_future_viol}")
    working['HasFutureDateError'] = any_mask

    # -------------------------------------------------------------------------
    # 10) Compute AnnPITValue_Pct vs TrueValue and apply quality filter
    # -------------------------------------------------------------------------
    working['AnnPITValue_Pct'] = np.where(
        working['AnnPITValue'].notna() &
        working['TrueValue'].notna() &
        (working['TrueValue'] != 0),
        (working['AnnPITValue'] / working['TrueValue']) * 100,
        np.nan
    )

    pre_stats = summarize_pct(working['AnnPITValue_Pct'])
    print("\n=== AnnPITValue_Pct summary — BEFORE quality drop ===")
    for k, v in pre_stats.items():
        print(f"{k:>20}: {v}")

    pct = working['AnnPITValue_Pct']
    is_inf = np.isinf(pct)
    is_finite = np.isfinite(pct)
    out_of_range = is_finite & ((pct > 200) | (pct < 50))
    to_drop_quality = is_inf | out_of_range

    dropped_quality_rows = int(to_drop_quality.sum())
    print(f"\nRows to drop due to AnnPITValue_Pct (±inf or >200 or <50): {dropped_quality_rows:,}")

    working = working.loc[~to_drop_quality].copy()

    post_stats = summarize_pct(working['AnnPITValue_Pct'])
    print("\n=== AnnPITValue_Pct summary — AFTER quality drop ===")
    if post_stats:
        for k, v in post_stats.items():
            print(f"{k:>20}: {v}")
    else:
        print("No finite values remain after the quality drop.")

    # -------------------------------------------------------------------------
    # 11) Final columns and export
    # -------------------------------------------------------------------------
    base_cols = [
        'ID', 'CompanyName', 'ImplCountry', 'CurrentCurrency', 'HistCurrency',
        'PIT Date', 'Frequency', 'UpdateCode', 'FiscalPeriod', 'FYE Month',
        'ItemCode', 'Value', 'Str_FiscalPrd'
    ]

    freq_cols = []
    for i in range(1, 5):
        freq_cols += [f'Q{i}_Date', f'Q{i}']
    for i in range(1, 3):
        freq_cols += [f'S{i}_Date', f'S{i}']
    for i in range(1, 4):
        freq_cols += [f'T{i}_Date', f'T{i}']
    freq_cols += ['A_Date', 'A']

    # NEW: AnnPITValue_Period is included and explicitly placed
    # directly before AnnPITValue in the final column order.
    keep_cols = (
        [c for c in base_cols if c in working.columns] +
        ['TrueValue', 'AnnPITValue_Period', 'AnnPITValue',
         'AnnPITValue_Pct', 'HasFutureDateError'] +
        [c for c in freq_cols if c in working.columns]
    )

    # Drop helper columns that are only needed for intermediate computations
    drop_cols = [
        c for c in working.columns
        if c.endswith('_OriginFP') or c in ['QNUM', 'SNUM', 'TNUM', 'TrueValue_Date']
    ]
    working.drop(columns=drop_cols, inplace=True, errors='ignore')

    annualized_processed = working.reindex(columns=keep_cols)

    # -------------------------------------------------------------------------
    # 12) Save full and subset outputs
    # -------------------------------------------------------------------------
    assert 'Temp_file_path_DP' in globals(), "Temp_file_path_DP not found."
    assert 'base_output_filename' in globals(), "base_output_filename not found (set earlier)."

    out_full = os.path.join(Temp_file_path_DP, f"{base_output_filename}.txt")
    annualized_processed.to_csv(out_full, sep='|', index=False)
    print("\nSaved full:", out_full)

    # NEW: AnnPITValue_Period is included in the subset and appears before
    # AnnPITValue.
    subset_cols = [
        "ID", "PIT Date", "CompanyName", "HistCurrency",
        "FiscalPeriod", "AnnPITValue_Period", "AnnPITValue"
    ]
    subset_cols_existing = [col for col in subset_cols if col in annualized_processed.columns]
    subset_df = annualized_processed[subset_cols_existing].copy()
    out_subset = os.path.join(Temp_file_path_DP, f"{base_output_filename}_subset.txt")
    subset_df.to_csv(out_subset, sep='|', index=False)
    print("Saved subset:", out_subset)
    del subset_df

    # -------------------------------------------------------------------------
    # 13) Row-accounting overview
    # -------------------------------------------------------------------------
    output_rows = len(annualized_processed)
    print("\n=== Row Accounting ===")
    print(f"Input rows:                     {input_rows:,}")
    print(f"Excluded by Frequency (E/L/R/U):{excluded_rows:,}")
    print(f"Dropped by quality (Pct rules): {dropped_quality_rows:,}")
    print(f"Output rows (final):            {output_rows:,}")
    check_total = excluded_rows + dropped_quality_rows + output_rows
    print(f"Check: excluded + dropped + output = {check_total:,}")
    if check_total == input_rows:
        print("Row counts reconcile exactly.")
    else:
        print(f"Mismatch of {input_rows - check_total:+,} rows. Please investigate.")

    gc.collect()

else:
    print("annualized_encoded not found or None; skipping.")

Input dataset contains 3,633,273 rows before processing.


=== Future-date check (period dates > PIT Date) ===
Per-label violations: {'A_Date': 0, 'Q1_Date': 0, 'Q2_Date': 0, 'Q3_Date': 0, 'Q4_Date': 0, 'S1_Date': 0, 'S2_Date': 0, 'T1_Date': 0, 'T2_Date': 0, 'T3_Date': 0}
Rows with ANY future-dated period value: 0

=== AnnPITValue_Pct summary — BEFORE quality drop ===
         finite_rows: 3530172
                mean: 28242.62116630743
              median: 100.0
winsorized_mean_1pct: 99.00167277420879
                 p10: 90.74869375256846
                 p20: 97.06041111110679
                 p30: 100.0
                 p40: 100.0
                 p50: 100.0
                 p60: 100.0
                 p70: 100.0
                 p80: 100.0
                 p90: 103.58511892704527

Rows to drop due to AnnPITValue_Pct (±inf or >200 or <50): 119,253

=== AnnPITValue_Pct summary — AFTER quality drop ===
         finite_rows: 3410919
                mean: 99.7094299332743
           

### Annualized 4

#### Set Index

In [177]:
# =============================================================================
# SELECT A SINGLE ANNUALIZED_* ITEM AND PREPARE PATHS
# =============================================================================
# This cell:
#   1. Selects which Annualized_* item (from annualized_vars) should be processed.
#   2. Validates that annualized_vars and Temp_file_path_DP are available.
#   3. Builds the input file path for the selected "work_subset_<item>.txt".
#   4. Defines a base_output_filename used later when saving processed results.
#   5. Ensures the data-preparation temp directory exists.
#
# Usage:
#   - Change `annualized_index` to process a different Annualized_* dataset
#     (e.g. 1, 2, 10, ...).
#   - Assumes `annualized_vars` was created earlier (mapping "Annualized_n" to
#     sanitized item names) and `Temp_file_path_DP` was set in your environment
#     setup cell.
# =============================================================================

import os
from pathlib import Path

# 1) Choose which Annualized_* item to run
# This integer is interpolated into the key "Annualized_<index>" below.
annualized_index = 4  # change this to re-run a different dataset

# annualized_vars should look like: {'Annualized_1': 'SomeItem', ...}
# Fail fast if the mapping was not created by an earlier cell.
assert 'annualized_vars' in globals(), "annualized_vars dict not found in globals()."

# Build the key and fetch the corresponding sanitized item name.
# .get() returns None for an unknown key, so the assert below also rejects
# a missing entry (as well as an empty item name).
item_key = f"Annualized_{annualized_index}"
target_item_name = annualized_vars.get(item_key)
assert target_item_name, f"{item_key} not found in annualized_vars."

print(f"Selected: {item_key}  ->  ItemName: '{target_item_name}'")

# 2) Construct file paths based on the selected item
# Temp_file_path_DP is expected to come from the environment setup cell.
assert 'Temp_file_path_DP' in globals(), "Temp_file_path_DP not found."

# Input file produced by earlier steps (merging characteristics etc.):
#   <Temp_file_path_DP>/work_subset_<target_item_name>.txt
file_name = f"work_subset_{target_item_name}.txt"
file_path = os.path.join(Temp_file_path_DP, file_name)

# Base name for all output files created in the annualized pipeline;
# later cells append ".txt" / "_subset.txt" to it.
base_output_filename = f"Annualized_{target_item_name}_complete"

# 3) Ensure the output directory exists
# parents=True creates missing parent folders; exist_ok=True makes this a
# no-op when the directory is already there.
Path(Temp_file_path_DP).mkdir(parents=True, exist_ok=True)


Selected: Annualized_4  ->  ItemName: 'Current_Assets___Total'


#### Import relevant data



In [178]:
# =============================================================================
# LOAD THE FULL DATASET FOR THE SELECTED SPECIAL ITEM (ANNUALIZED VERSION)
# =============================================================================
# This cell:
#   1. Uses `target_item_name` and `file_path` (defined in the previous cell)
#      to load the corresponding work_subset file.
#   2. Imports the file using `import_file_to_dataframe`.
#   3. Performs safety checks for existence and emptiness.
#   4. Shows a preview of the loaded dataset.
#   5. Falls back to an empty DataFrame if loading fails.
#   6. Runs garbage collection afterwards.
# =============================================================================

print(f"\nImporting full annualized dataset for Item: '{target_item_name}' ...")

# Guard first: without the input file there is nothing to import, so the
# cell continues with an empty frame.
if not os.path.exists(file_path):
    print(f"File not found: {file_path}")
    annualized_raw = pd.DataFrame()
else:
    annualized_raw = import_file_to_dataframe(file_path)

    # A None result or a frame without rows both count as "nothing usable".
    if annualized_raw is None or annualized_raw.empty:
        print("Annualized dataset appears empty or could not be loaded.")
        annualized_raw = pd.DataFrame()
    else:
        print(f"Full annualized dataset loaded successfully: {len(annualized_raw):,} rows total.")
        # Rich preview when display() works; plain-text preview otherwise.
        try:
            display(annualized_raw.head())
        except Exception:
            print(annualized_raw.head().to_string(index=False))

# Release temporaries created during the import.
gc.collect()



Importing full annualized dataset for Item: 'Current_Assets___Total' ...
Full annualized dataset loaded successfully: 3,551,641 rows total.


Unnamed: 0,ID,CompanyName,ImplCountry,CurrentCurrency,HistCurrency,PIT Date,Frequency,UpdateCode,FiscalPeriod,FYE Month,ItemCode,Value
0,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1992,December,2201,748.140365
1,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1993,December,2201,864.906741
2,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1994,December,2201,975.879591
3,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1996-05-03,A,3,1995,December,2201,762.389782
4,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1998-07-03,A,3,1996,December,2201,665.184633


0

#### Encode Frequency Code (Check of output required!)

In [179]:
# =============================================================================
# FISCAL PERIOD ENCODING FOR ANNUALIZED DATASET
# =============================================================================
# This cell:
#   1. Defines helper functions:
#        - last2: extract last two digits of a number as a zero-padded string.
#        - add_str_fiscalprd: create Str_FiscalPrd from numeric FiscalPeriod
#          and Frequency, derive an implied full-year FiscalPeriod, and check
#          for inconsistencies on annual rows.
#   2. Applies this encoding to `annualized_raw` (if available) and stores
#      the result in `annualized_encoded`.
#   3. Shows a preview of the encoded DataFrame.
#
# Assumptions:
#   - `annualized_raw` has already been loaded in a previous cell.
#   - `target_item_name` is defined and is just used for printing context.
#   - DataFrame contains at least the columns: 'Frequency', 'FiscalPeriod'.
# =============================================================================

import numpy as np
import pandas as pd
from IPython.display import display


def last2(n):
    """
    Give the final two digits of a number as a string, padding with zeros.

    Missing input (anything pd.isna() reports as missing) yields None.

    Examples:
        last2(2023) -> "23"
        last2(85)   -> "85"
        last2(5)    -> "05"
        last2(NaN)  -> None
    """
    if pd.isna(n):
        return None
    # Truncate to an integer, left-pad to at least four characters
    # (5 -> "0005"), then keep only the trailing two.
    padded = str(int(n)).zfill(4)
    return padded[-2:]


def add_str_fiscalprd(df: pd.DataFrame) -> pd.DataFrame:
    """
    Build 'Str_FiscalPrd' and overwrite 'FiscalPeriod' with an implied full year.

    Logic:
      1) Normalize Frequency to an uppercase string (missing -> "").
      2) For each row, interpret numeric FiscalPeriod depending on Frequency
         and create a string fiscal-period label Str_FiscalPrd:
           - C, Q, E, R: quarter-based  -> "Q{1-4}Y{yy}"   (4 periods per year)
           - A, B:       annual         -> "Y{yy}"
           - F, S:       semiannual     -> "S{1-2}Y{yy}"   (2 periods per year)
           - K, T, L, U: trimester-like -> "T{1-3}Y{yy}"   (3 periods per year)
         With n periods per year, the period index is (FiscalPeriod % n) + 1
         and the year is FiscalPeriod // n; "yy" is that year modulo 100,
         zero-padded to two digits.
         Rows with an unknown Frequency, or with a missing / non-numeric
         FiscalPeriod, keep Str_FiscalPrd = NaN (no partial labels such as
         "Q<NA>Y" or "Y" are produced).
      3) Extract the "yy" part from Str_FiscalPrd and map to a full year:
           yy >= 80 -> 19yy (e.g. "85" -> 1985)
           yy <  80 -> 20yy (e.g. "23" -> 2023)
         This becomes ImplFiscPer_Calculated.
      4) For rows with annual frequency (A,B), compare ImplFiscPer_Calculated
         to the original FiscalPeriod and print a short discrepancy summary.
      5) Overwrite 'FiscalPeriod' with ImplFiscPer_Calculated and drop the
         helper columns used for the check.

    Parameters:
      df : DataFrame with at least the columns 'Frequency' and 'FiscalPeriod'
           ('ID' is additionally read when discrepancies are displayed).
           The input frame is not modified.

    Returns:
      A new DataFrame with:
        - 'Str_FiscalPrd' (object column; NaN where no label could be built)
        - updated 'FiscalPeriod' (implied full year; NaN where unavailable)
    """
    df = df.copy()

    # Normalize frequency codes for consistent logic
    df["Frequency"] = df["Frequency"].str.upper().fillna("")

    # Store original FiscalPeriod for validation later
    df["Original_FiscalPeriod"] = df["FiscalPeriod"]

    # Numeric version of FiscalPeriod for modular arithmetic
    fp = pd.to_numeric(df["FiscalPeriod"], errors="coerce")
    has_fp = fp.notna()

    # Frequency masks, paired with label prefix and periods-per-year.
    m_AB = df["Frequency"].isin(["A", "B"])
    encodings = (
        ("Q", 4, df["Frequency"].isin(["C", "Q", "E", "R"])),
        ("", 1, m_AB),
        ("S", 2, df["Frequency"].isin(["F", "S"])),
        ("T", 3, df["Frequency"].isin(["K", "T", "L", "U"])),
    )

    # The label column must be object-typed from the start: creating it as a
    # float NaN column and then writing strings into it relies on silent
    # upcasting (FutureWarning in pandas 2.x) and breaks the .str accessor
    # below when no row receives a label.
    df["Str_FiscalPrd"] = np.full(len(df), np.nan, dtype=object)

    for prefix, per_year, freq_mask in encodings:
        rows = freq_mask & has_fp
        if not rows.any():
            continue
        sub = fp[rows]

        # Two-digit year, vectorized (same result as last2() for
        # non-negative years): 2005 -> "05", 1992 -> "92".
        yy = ((sub // per_year).astype("int64") % 100).astype(str).str.zfill(2)

        if per_year == 1:
            # Annual rows carry no period index.
            label = "Y" + yy
        else:
            # Period index 1..per_year; the Int64 cast rejects fractional
            # values instead of truncating them silently.
            part = ((sub % per_year) + 1).astype("Int64").astype(str)
            label = prefix + part + "Y" + yy

        # Positional assignment: `label` is in the same row order as `rows`.
        df.loc[rows, "Str_FiscalPrd"] = label.to_numpy(dtype=object)

    # --- Derive implied full-year FiscalPeriod from Str_FiscalPrd ---
    # Extract the "yy" part following "Y" in labels like "Q1Y23", "Y21", etc.
    year_part = df["Str_FiscalPrd"].str.extract(r"Y(\d{2})", expand=False)
    year_numeric = pd.to_numeric(year_part, errors="coerce")

    # Map yy to either 19yy or 20yy, depending on cutoff at 80.
    # NaN compares False and stays NaN after the addition.
    century = np.where(year_numeric >= 80, 1900, 2000)
    df["ImplFiscPer_Calculated"] = year_numeric + century

    # --- Discrepancy check for annual rows (A,B only) ---
    annual_rows_for_check = df[m_AB]
    calculated = annual_rows_for_check["ImplFiscPer_Calculated"]
    original = pd.to_numeric(
        annual_rows_for_check["Original_FiscalPeriod"], errors="coerce"
    )
    # Rows agree when the numbers are equal or when both sides are missing.
    agrees = (calculated == original) | (calculated.isna() & original.isna())
    discrepancy_rows = annual_rows_for_check[~agrees]

    if not discrepancy_rows.empty:
        print(
            "\nDiscrepancies between original FiscalPeriod and calculated "
            "ImplFiscPer for annual (A, B) rows:"
        )
        display(
            discrepancy_rows[
                ["ID", "Frequency", "Original_FiscalPeriod",
                 "Str_FiscalPrd", "ImplFiscPer_Calculated"]
            ].head()
        )
        print(f"Total discrepancies for annual frequencies: {len(discrepancy_rows)}")
    else:
        print(
            "\nNo discrepancies between original FiscalPeriod and calculated "
            "ImplFiscPer for annual (A, B) rows."
        )

    # Overwrite FiscalPeriod with the implied year
    df["FiscalPeriod"] = df["ImplFiscPer_Calculated"]

    # Remove helper columns that are no longer needed
    df.drop(columns=["Original_FiscalPeriod", "ImplFiscPer_Calculated"], inplace=True)

    return df


# -----------------------------------------------------------------------------
# Apply encoding to the Annualized dataset
# -----------------------------------------------------------------------------
# Bail out when the raw frame is absent, None, or has no rows; otherwise
# encode it and preview the first rows of the result.
if "annualized_raw" not in globals() or annualized_raw is None or annualized_raw.empty:
    print("annualized_raw not found or empty. Cannot perform encoding.")
    annualized_encoded = None
else:
    print(f"Applying fiscal period encoding to Annualized dataset for '{target_item_name}' ...")
    annualized_encoded = add_str_fiscalprd(annualized_raw)
    display(annualized_encoded.head())


Applying fiscal period encoding to Annualized dataset for 'Current_Assets___Total' ...


  df.loc[m_quarter, "Str_FiscalPrd"] = (



No discrepancies between original FiscalPeriod and calculated ImplFiscPer for annual (A, B) rows.


Unnamed: 0,ID,CompanyName,ImplCountry,CurrentCurrency,HistCurrency,PIT Date,Frequency,UpdateCode,FiscalPeriod,FYE Month,ItemCode,Value,Str_FiscalPrd
0,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1992,December,2201,748.140365,Y92
1,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1993,December,2201,864.906741,Y93
2,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1994,December,2201,975.879591,Y94
3,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1996-05-03,A,3,1995,December,2201,762.389782,Y95
4,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1998-07-03,A,3,1996,December,2201,665.184633,Y96


#### Annualize data with most recent information (Check of output required!)

In [180]:
# =============================================================================
# ANNUALIZED PIPELINE: BUILD AnnPITValue FROM A/Q/S/T, QC, AND SAVE OUTPUT
# =============================================================================
# High-level overview:
#
#   1. Helper utilities
#      - _key, asof_numpy:
#          * Implement a fast, vectorized “as-of” join:
#              For each row in a left DataFrame, find the latest value in a
#              right DataFrame with the same keys and PIT Date <= left PIT Date.
#      - pctile, summarize_pct:
#          * Compute robust summary statistics for quality control, including
#            winsorized mean and decile percentiles.
#      - pick_latest_nonzero_within_year:
#          * For a given PIT Date and FiscalPeriod, evaluate all available
#            period values (A, Q1..Q4, S1..S2, T1..T3) with known origin
#            fiscal periods.
#          * Choose a single value as AnnPITValue based on:
#               - same-year vs prior-year vs other origin
#               - period priority (A > Q4 > T3 > S2 > Q3 > ... > Q1)
#               - the latest date within a one-year window before PIT.
#          * NEW: returns both the value and the period label from which it
#                 was chosen (AnnPITValue_Period).
#
#   2. Main pipeline for annualized_encoded:
#      - Filter out unsupported frequencies (E/L/R/U).
#      - Normalize types for PIT Date, FiscalPeriod, Value, and key columns.
#      - Derive QNUM, SNUM, TNUM indices from Str_FiscalPrd.
#      - Ensure all period- and date-columns (A/Q/S/T) exist.
#      - Build TrueValue from annual rows (A,B) as the last observed annual
#        value per (ID, FiscalPeriod, HistCurrency).
#      - Use asof_numpy to populate:
#           A, A_Date, A_OriginFP
#           Q1..Q4, S1..S2, T1..T3 and their dates + OriginFP (from origin FP).
#      - For each row, compute:
#           * AnnPITValue        = chosen value
#           * AnnPITValue_Period = 'A', 'Q4', 'S1', 'T3', etc.
#      - Check for any period dates that are after PIT Date (future-date errors).
#      - Compute AnnPITValue_Pct = AnnPITValue / TrueValue * 100 and drop rows
#        outside [50, 200] or with infinite ratios.
#      - Build a final, lean set of columns and save:
#           * full file:   <base_output_filename>.txt
#           * subset file: <base_output_filename>_subset.txt
#        NEW: AnnPITValue_Period is placed directly before AnnPITValue and
#             included in both full and subset outputs.
#      - Print a row-accounting overview for reconciliation.
# =============================================================================

import os
import gc
from datetime import timedelta

import numpy as np
import pandas as pd
from scipy.stats.mstats import winsorize

pd.options.mode.copy_on_write = True


# -----------------------------------------------------------------------------
# Helper: build a single key column from multiple columns
# -----------------------------------------------------------------------------
def _key(fr: pd.DataFrame, cols):
    """
    Build a composite string key by concatenating several columns with '||'.

    This is used to group records by (ID, HistCurrency, ItemCode, FiscalPeriod)
    as a single vectorizable key for the as-of join.

    Example:
        _key(df, ['ID', 'HistCurrency']) -> "123||USD"
    """
    return fr[cols].astype(str).agg('||'.join, axis=1)


# -----------------------------------------------------------------------------
# Helper: fast as-of join (right.PIT <= left.PIT)
# -----------------------------------------------------------------------------
def asof_numpy(left_df: pd.DataFrame, right_df: pd.DataFrame, by_cols: list[str]):
    """
    For each row in left_df, find the latest (as-of) Value from right_df such that:

        1) by_cols are equal on both sides (e.g. ID, HistCurrency, ItemCode, FiscalPeriod)
        2) right_df['PIT Date'] <= left_df['PIT Date']

    Implementation notes:
      - Both left and right PIT Date columns are converted to datetime and floored to days.
      - A composite key '__k' (string) is built from by_cols on both dataframes.
      - The right-hand dataframe is sorted by key and PIT Date.
      - For each distinct key, we keep a slice of PIT Date and Value arrays.
      - Left-hand keys are sorted; for each group of identical keys we:
          * binary-search in the right PIT Date array via np.searchsorted
            to find the index of the last PIT Date <= each left PIT Date.
          * fill out_vals and out_dates at the original left index positions.

    Returns
    -------
    out_vals : np.ndarray
        Array of matched values (float64), default NaN where no match.
    out_dates : np.ndarray
        Array of matched dates (datetime64[ns]), default NaT where no match.
    """
    # Initialize output arrays with NaNs/NaT
    out_vals  = np.full(len(left_df), np.nan, dtype='float64')
    out_dates = np.full(len(left_df), 'NaT', dtype='datetime64[ns]')

    # Required columns: keys plus PIT Date and Value on the right
    left_req  = by_cols + ['PIT Date']
    right_req = by_cols + ['PIT Date', 'Value']

    # Drop rows with missing key or PIT Date on either side
    lmask = left_df[left_req].notna().all(axis=1)
    rmask = right_df[right_req].notna().all(axis=1)
    if not lmask.any() or not rmask.any():
        return out_vals, out_dates

    l = left_df.loc[lmask, left_req].copy()
    r = right_df.loc[rmask, right_req].copy()

    # Normalize PIT Date columns to datetime, day precision
    l['PIT Date'] = pd.to_datetime(l['PIT Date'], errors='coerce').dt.floor('D')
    r['PIT Date'] = pd.to_datetime(r['PIT Date'], errors='coerce').dt.floor('D')

    # Composite keys for grouping
    l['__k'] = _key(l, by_cols)
    r['__k'] = _key(r, by_cols)

    # Sort right by key and PIT Date so we can slice by key and binary-search by date
    r = r.sort_values(['__k', 'PIT Date']).reset_index(drop=True)

    # Convert right side to NumPy arrays
    rk   = r['__k'].to_numpy()
    rdt  = r['PIT Date'].to_numpy()
    rval = r['Value'].to_numpy()

    # Find unique keys and the start index of each key block in rk
    uniq, first = np.unique(rk, return_index=True)

    # Pre-slice rdt, rval for each key to avoid repeated filtering
    slices = {}
    for i, k in enumerate(uniq):
        s = first[i]
        e = first[i + 1] if i + 1 < len(first) else len(r)
        slices[k] = (rdt[s:e], rval[s:e])

    # Original indices of the filtered left rows
    l_idx = l.index.to_numpy()
    lk    = l['__k'].to_numpy()
    ldt   = l['PIT Date'].to_numpy()

    # Sort left keys so that identical keys form contiguous blocks
    order = np.argsort(lk, kind='mergesort')
    sk, sd, sp = lk[order], ldt[order], l_idx[order]

    # Process each contiguous block of the same key
    i = 0
    n = len(sk)
    while i < n:
        k = sk[i]
        j = i + 1
        # identify the block [i, j) with the same key
        while j < n and sk[j] == k:
            j += 1

        block_dates = sd[i:j]
        block_pos   = sp[i:j]

        if k in slices:
            r_dates, r_vals = slices[k]
            # For each left date, search the insertion position in right dates
            # side='right' gives index of first element > date, minus 1 =>
            # index of the last element <= date.
            pos   = np.searchsorted(r_dates, block_dates, side='right') - 1
            valid = pos >= 0
            if np.any(valid):
                out_vals[block_pos[valid]]  = r_vals[pos[valid]]
                out_dates[block_pos[valid]] = r_dates[pos[valid]]
        i = j

    return out_vals, out_dates


# -----------------------------------------------------------------------------
# Small helpers for QC statistics
# -----------------------------------------------------------------------------
def pctile(s: pd.Series, q: float):
    """
    Quantile `q` of `s` with linear interpolation; NaN on any failure.

    Any exception raised by Series.quantile (or by `s` not being a Series
    at all) is swallowed and reported as NaN, so callers building summary
    tables never have to guard individual percentile lookups.
    """
    try:
        result = s.quantile(q, interpolation='linear')
    except Exception:
        result = np.nan
    return result


def summarize_pct(series: pd.Series):
    """
    Summarize the finite part of a numeric series for QC printing.

    +/-inf are turned into NaN and all NaN are dropped first. If nothing is
    left, an empty dict is returned. Otherwise the dict contains, in this
    order:
      - finite_rows: count of remaining observations
      - mean, median
      - winsorized_mean_1pct: mean after winsorizing 1% in each tail
      - p10, p20, ..., p90: the nine decile quantiles (via pctile)
    """
    finite = series.replace([np.inf, -np.inf], np.nan).dropna()
    if finite.empty:
        return {}

    # winsorize gets its own copy of the values so it is always handed a
    # writable array.
    stats = {
        "finite_rows": len(finite),
        "mean": finite.mean(),
        "median": finite.median(),
        "winsorized_mean_1pct": winsorize(finite.to_numpy().copy(), limits=[0.01, 0.01]).mean(),
    }

    # Deciles 10%..90%, appended after the headline statistics.
    for decile in range(10, 100, 10):
        stats[f"p{decile}"] = pctile(finite, decile / 100)

    return stats


# -----------------------------------------------------------------------------
# Period prioritization and label helper
# -----------------------------------------------------------------------------
_PERIOD_PRIORITY = {
    'A': 100,   # annual
    'Q4': 90,
    'T3': 80,
    'S2': 70,
    'Q3': 60,
    'T2': 50,
    'S1': 40,
    'Q2': 30,
    'T1': 20,
    'Q1': 10,
}


def _label_from_colname(colname: str) -> str:
    """
    Map a value column name to a period label.

    Currently this is a thin wrapper:
      - 'A' stays 'A'
      - 'Q1'..'Q4', 'S1'.., 'T1'.. remain unchanged.
    """
    return 'A' if colname == 'A' else colname


# -----------------------------------------------------------------------------
# AnnPITValue selection using OriginFP and priority rules
# -----------------------------------------------------------------------------
def pick_latest_nonzero_within_year(
    row,
    value_cols,
    date_cols,
    pit_col='PIT Date',
    fp_col='FiscalPeriod'
):
    """
    Select a single annualized value (AnnPITValue) for one row.

    The choice is based on:
      - the available period values (A, Q1..Q4, S1..S2, T1..T3),
      - their dates,
      - their origin fiscal periods (``<label>_OriginFP``),
      - the row's own PIT Date and FiscalPeriod.

    Parameters
    ----------
    row : mapping-like (typically a pandas Series for one DataFrame row)
        Must contain ``pit_col``; value/date columns that are absent are
        skipped.
    value_cols, date_cols : sequences of str
        Parallel sequences: ``date_cols[i]`` is the date of ``value_cols[i]``.
    pit_col : str
        Name of the point-in-time date field.
    fp_col : str
        Name of the fiscal-period (year) field.

    Logic
    -----
      1) A period is a candidate only if its date
           - can be interpreted as a datetime (it is coerced with
             ``pd.to_datetime(..., errors='coerce')`` *before* any
             comparison, so string dates are handled instead of failing),
           - is not after the PIT Date,
           - after truncation to the day, is not older than
             PIT Date - 365 days.
      2) Each candidate is classified by comparing its origin fiscal period
         with the row's fiscal period (FP):
           - same    : OriginFP == FP
           - prior   : OriginFP == FP - 1
           - other   : anything else
           - unknown : FP or OriginFP is not available
         A missing OriginFP falls back to the row's FP.
      3) NaN and 0.0 values are ignored in steps 4a-4d.
      4) Selection priority:
           a) same-year annual      -> latest date
           b) same-year partials    -> highest priority, then latest date
           c) prior-year annual     -> latest date (push-forward)
           d) any remaining value   -> highest priority, then latest date
           e) only zeros available  -> 0.0 with the label of the best zero

    Returns
    -------
    (value, label)
      value : float or NaN
      label : str or NaN (e.g. 'A', 'Q4', 'S1', 'T3')
      (NaN, NaN) is returned when the PIT Date is missing or no candidate
      qualifies.
    """
    pit = row[pit_col]
    if pd.isna(pit):
        return (np.nan, np.nan)

    cutoff = pit - timedelta(days=365)

    # Fiscal period of the row itself; None when missing or not integer-like.
    fp = row.get(fp_col, np.nan)
    try:
        fp_int = None if pd.isna(fp) else int(fp)
    except (TypeError, ValueError, OverflowError):
        fp_int = None

    # Each candidate is a tuple: (label, priority, date, value, year_rel)
    candidates = []
    for vcol, dcol in zip(value_cols, date_cols):
        if vcol not in row or dcol not in row:
            continue

        val = row[vcol]

        # Coerce BEFORE comparing: a raw string (or other non-Timestamp)
        # date must not reach the `>` comparison with the PIT Date.
        dt = pd.to_datetime(row[dcol], errors='coerce')
        if pd.isna(dt) or dt > pit:
            # missing, unparseable, or future-dated
            continue

        dt = dt.floor('D')
        if dt < cutoff:
            # older than 1 year before PIT
            continue

        # Map column name to period label (A, Q1..Q4, etc.) and priority
        label = _label_from_colname(vcol)
        prio = _PERIOD_PRIORITY.get(label, -1)

        # Convert value to float for numeric comparisons
        vnum = float(val) if pd.notna(val) else np.nan

        # Origin fiscal period; fall back to the row's FP when not stored
        origin_fp = row.get(f'{label}_OriginFP', np.nan)
        if pd.isna(origin_fp):
            origin_fp = fp_int
        try:
            origin_fp = None if pd.isna(origin_fp) else int(origin_fp)
        except (TypeError, ValueError, OverflowError):
            origin_fp = fp_int

        # Relationship between origin fiscal period and the row's FP
        if fp_int is None or origin_fp is None:
            year_rel = 'unknown'
        elif origin_fp == fp_int:
            year_rel = 'same'
        elif origin_fp == fp_int - 1:
            year_rel = 'prior'
        else:
            year_rel = 'other'

        candidates.append((label, prio, dt, vnum, year_rel))

    if not candidates:
        return (np.nan, np.nan)

    def by_date(cand):
        return cand[2]

    def by_priority_then_date(cand):
        return (cand[1], cand[2])

    # Only non-NaN, non-zero values count as real observations
    usable = [c for c in candidates if not np.isnan(c[3]) and c[3] != 0.0]

    # a) Same-year annual: latest annual matching the row's FP
    same_year_annuals = [c for c in usable if c[0] == 'A' and c[4] == 'same']
    if same_year_annuals:
        best = max(same_year_annuals, key=by_date)
        return (best[3], best[0])

    # b) Same-year partial periods (Q, S, T)
    same_year_partials = [c for c in usable if c[0] != 'A' and c[4] == 'same']
    if same_year_partials:
        best = max(same_year_partials, key=by_priority_then_date)
        return (best[3], best[0])

    # c) Prior-year annual push-forward
    prior_year_annuals = [c for c in usable if c[0] == 'A' and c[4] == 'prior']
    if prior_year_annuals:
        best = max(prior_year_annuals, key=by_date)
        return (best[3], best[0])

    # d) Fallback: any usable candidate by (priority, date)
    if usable:
        best = max(usable, key=by_priority_then_date)
        return (best[3], best[0])

    # e) Only zeros (and/or NaNs) remain: report 0.0 and keep its label
    zeros = [c for c in candidates if not np.isnan(c[3]) and c[3] == 0.0]
    if zeros:
        best_zero = max(zeros, key=by_priority_then_date)
        return (best_zero[3], best_zero[0])

    return (np.nan, np.nan)


# =============================================================================
# MAIN: annualized_encoded -> annualized_processed
# =============================================================================
# Runs only when `annualized_encoded` exists and is not None. It filters
# unsupported frequencies, maps period values as-of each row, selects
# AnnPITValue, applies the percentage quality filter, writes two pipe-delimited
# files and prints a row-accounting summary. Otherwise it prints a skip message.
if 'annualized_encoded' in globals() and annualized_encoded is not None:
    input_rows = len(annualized_encoded)
    print(f"Input dataset contains {input_rows:,} rows before processing.\n")

    # Work on a copy to avoid mutating the original DataFrame
    working = annualized_encoded.copy()

    # -------------------------------------------------------------------------
    # 1) Exclude frequencies that are not supported by this pipeline (E/L/R/U)
    # -------------------------------------------------------------------------
    # The count is kept for the row-accounting check in step 13.
    excl_mask = working['Frequency'].astype(str).str.upper().isin(['E', 'L', 'R', 'U'])
    excluded_rows = int(excl_mask.sum())
    working = working.loc[~excl_mask].copy()

    # -------------------------------------------------------------------------
    # 2) Basic type normalization
    # -------------------------------------------------------------------------
    # PIT Date as datetime (day precision); values that do not match the
    # '%Y-%m-%d' format become NaT because of errors='coerce'.
    working['PIT Date'] = pd.to_datetime(
        working['PIT Date'], format='%Y-%m-%d', errors='coerce'
    ).dt.floor('D')

    # FiscalPeriod and Value as numeric (non-numeric entries become NaN)
    working['FiscalPeriod'] = pd.to_numeric(working['FiscalPeriod'], errors='coerce')
    working['Value']        = pd.to_numeric(working['Value'], errors='coerce')

    # Key-like columns as string (consistent joins and as-of keys).
    # Note: astype(str) also turns missing entries into literal strings.
    for c in ['ID', 'HistCurrency', 'ItemCode', 'Frequency', 'Str_FiscalPrd']:
        if c in working.columns:
            working[c] = working[c].astype(str)

    # -------------------------------------------------------------------------
    # 3) Parse Q/S/T sequence numbers from Str_FiscalPrd
    # -------------------------------------------------------------------------
    # Each pattern is anchored at the start of the label, so a row gets at most
    # one of QNUM / SNUM / TNUM; the others stay NaN.
    # Extract quarter index 1..4 from strings like "Q1Y23"
    working['QNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^Q([1-4])Y', expand=False),
        errors='coerce'
    )
    # Extract semiannual index 1..2 from "S1Y23"
    working['SNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^S([1-2])Y', expand=False),
        errors='coerce'
    )
    # Extract trimester index 1..3 from "T1Y23"
    working['TNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^T([1-3])Y', expand=False),
        errors='coerce'
    )

    # -------------------------------------------------------------------------
    # 4) Ensure A/Q/S/T value and date columns exist
    # -------------------------------------------------------------------------
    # period_vals and period_dates are parallel lists (same order, 'A' last);
    # they are later zipped together when selecting AnnPITValue.
    period_vals = [f'Q{i}' for i in range(1, 5)] + \
                  [f'S{i}' for i in range(1, 3)] + \
                  [f'T{i}' for i in range(1, 4)] + ['A']

    period_dates = [f'{p}_Date' for p in [f'Q{i}' for i in range(1, 5)] +
                                       [f'S{i}' for i in range(1, 3)] +
                                       [f'T{i}' for i in range(1, 4)]] + ['A_Date']

    # Create missing value/date columns initialized to NaN / NaT
    for c in period_vals:
        if c not in working.columns:
            working[c] = np.nan
    for c in period_dates:
        if c not in working.columns:
            working[c] = pd.NaT

    # Keys shared by all as-of lookups in step 6
    base_keys = ['ID', 'HistCurrency', 'ItemCode', 'FiscalPeriod']

    # -------------------------------------------------------------------------
    # 5) Build TrueValue from annual (A,B) rows
    # -------------------------------------------------------------------------
    # TrueValue is the last known annual value per (ID, FiscalPeriod, HistCurrency):
    # rows are sorted by PIT Date and the last one per key is kept.
    # NOTE(review): the key does not include ItemCode — presumably the input
    # holds a single item per file; confirm against the upstream cells.
    mask_annual = working['Frequency'].isin(['A', 'B']) & working['Value'].notna()
    annual_src = (
        working.loc[mask_annual,
                    ['ID', 'FiscalPeriod', 'HistCurrency', 'PIT Date', 'Value']]
        .sort_values(['ID', 'FiscalPeriod', 'HistCurrency', 'PIT Date'])
        .drop_duplicates(['ID', 'FiscalPeriod', 'HistCurrency'], keep='last')
        .rename(columns={'Value': 'TrueValue', 'PIT Date': 'TrueValue_Date'})
    )
    # annual_src has one row per key after drop_duplicates, so the left merge
    # adds TrueValue / TrueValue_Date without multiplying rows.
    working = working.merge(
        annual_src,
        on=['ID', 'FiscalPeriod', 'HistCurrency'],
        how='left'
    )

    # -------------------------------------------------------------------------
    # 6) As-of mapping for each frequency (same FiscalPeriod only)
    # -------------------------------------------------------------------------
    # asof_numpy is defined earlier in this cell (outside this excerpt); it is
    # used here as returning a (values, dates) pair aligned with `working`.
    # For every label, <label>_OriginFP records the fiscal period the mapped
    # value belongs to (here always the row's own FiscalPeriod).
    # 6.1 Annual as-of (A, B)
    src_A = working.loc[
        working['Frequency'].isin(['A', 'B']) & working['Value'].notna(),
        base_keys + ['PIT Date', 'Value']
    ].copy()
    vA, dA = asof_numpy(working, src_A, by_cols=base_keys)
    working['A'], working['A_Date'] = vA, dA
    working['A_OriginFP'] = np.where(
        working['A'].notna(), working['FiscalPeriod'], np.nan
    )

    # 6.2 Quarterly as-of (Q, C): one lookup per quarter number
    src_Q = working.loc[
        working['Frequency'].isin(['Q', 'C']) & working['QNUM'].notna(),
        base_keys + ['QNUM', 'PIT Date', 'Value']
    ].copy()
    for q in (1, 2, 3, 4):
        rv = src_Q[src_Q['QNUM'] == q].drop(columns=['QNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'Q{q}', f'Q{q}_Date'
        working[col], working[dcol] = v, d
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        # Fill the origin only where a value was mapped and none is set yet
        mask = working[col].notna() & working[ocol].isna()
        working.loc[mask, ocol] = working.loc[mask, 'FiscalPeriod']

    # 6.3 Semiannual as-of (S, F): one lookup per half-year number
    src_S = working.loc[
        working['Frequency'].isin(['S', 'F']) & working['SNUM'].notna(),
        base_keys + ['SNUM', 'PIT Date', 'Value']
    ].copy()
    for s in (1, 2):
        rv = src_S[src_S['SNUM'] == s].drop(columns=['SNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'S{s}', f'S{s}_Date'
        working[col], working[dcol] = v, d
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        mask = working[col].notna() & working[ocol].isna()
        working.loc[mask, ocol] = working.loc[mask, 'FiscalPeriod']

    # 6.4 Trimester as-of (T, K): one lookup per trimester number
    src_T = working.loc[
        working['Frequency'].isin(['T', 'K']) & working['TNUM'].notna(),
        base_keys + ['TNUM', 'PIT Date', 'Value']
    ].copy()
    for t in (1, 2, 3):
        rv = src_T[src_T['TNUM'] == t].drop(columns=['TNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'T{t}', f'T{t}_Date'
        working[col], working[dcol] = v, d
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        mask = working[col].notna() & working[ocol].isna()
        working.loc[mask, ocol] = working.loc[mask, 'FiscalPeriod']

    # -------------------------------------------------------------------------
    # 7) Normalize date columns (no forward-fill; only directly mapped values)
    # -------------------------------------------------------------------------
    working = working.sort_values(['ID', 'HistCurrency', 'FiscalPeriod', 'PIT Date'])

    value_labels  = period_vals
    date_labels   = period_dates
    origin_labels = [f'{lbl}_OriginFP' for lbl in value_labels]

    # Bring every period date to day precision so it compares cleanly with
    # the day-precision PIT Date.
    for c in date_labels:
        if c in working.columns:
            working[c] = pd.to_datetime(working[c], errors='coerce').dt.floor('D')

    # -------------------------------------------------------------------------
    # 8) Compute AnnPITValue and AnnPITValue_Period
    # -------------------------------------------------------------------------
    # For every row, pick_latest_nonzero_within_year returns the chosen
    # annualized value together with the period label it came from
    # (e.g., 'A', 'Q4', 'S1', 'T3'); the label is stored in AnnPITValue_Period.
    # The selection runs row by row via DataFrame.apply(axis=1).
    ann_res = working.apply(
        lambda r: pd.Series(
            pick_latest_nonzero_within_year(
                r,
                value_cols=value_labels,
                date_cols=date_labels,
                pit_col='PIT Date',
                fp_col='FiscalPeriod'
            ),
            index=['AnnPITValue', 'AnnPITValue_Period']
        ),
        axis=1
    )
    working = pd.concat([working, ann_res], axis=1)

    # -------------------------------------------------------------------------
    # 9) Future-date QC check (period dates > PIT Date)
    # -------------------------------------------------------------------------
    # Diagnostic only: violations are counted and flagged in
    # HasFutureDateError, but no rows are removed here.
    date_cols_all = [
        'A_Date',
        'Q1_Date', 'Q2_Date', 'Q3_Date', 'Q4_Date',
        'S1_Date', 'S2_Date',
        'T1_Date', 'T2_Date', 'T3_Date'
    ]
    present = [c for c in date_cols_all if c in working.columns]
    viol_counts = {}
    any_mask = pd.Series(False, index=working.index)

    for c in present:
        m = (
            working[c].notna() &
            working['PIT Date'].notna() &
            (pd.to_datetime(working[c], errors='coerce') > working['PIT Date'])
        )
        viol_counts[c] = int(m.sum())
        any_mask |= m

    total_future_viol = int(any_mask.sum())
    print("\n=== Future-date check (period dates > PIT Date) ===")
    print("Per-label violations:", viol_counts)
    print(f"Rows with ANY future-dated period value: {total_future_viol}")
    working['HasFutureDateError'] = any_mask

    # -------------------------------------------------------------------------
    # 10) Compute AnnPITValue_Pct vs TrueValue and apply quality filter
    # -------------------------------------------------------------------------
    # Percentage of the selected value relative to TrueValue; left as NaN when
    # either side is missing or TrueValue is zero.
    working['AnnPITValue_Pct'] = np.where(
        working['AnnPITValue'].notna() &
        working['TrueValue'].notna() &
        (working['TrueValue'] != 0),
        (working['AnnPITValue'] / working['TrueValue']) * 100,
        np.nan
    )

    pre_stats = summarize_pct(working['AnnPITValue_Pct'])
    print("\n=== AnnPITValue_Pct summary — BEFORE quality drop ===")
    for k, v in pre_stats.items():
        print(f"{k:>20}: {v}")

    # Rows are dropped when the percentage is infinite or outside [50, 200].
    # Rows whose percentage is NaN satisfy neither test and are kept.
    pct = working['AnnPITValue_Pct']
    is_inf = np.isinf(pct)
    is_finite = np.isfinite(pct)
    out_of_range = is_finite & ((pct > 200) | (pct < 50))
    to_drop_quality = is_inf | out_of_range

    dropped_quality_rows = int(to_drop_quality.sum())
    print(f"\nRows to drop due to AnnPITValue_Pct (±inf or >200 or <50): {dropped_quality_rows:,}")

    working = working.loc[~to_drop_quality].copy()

    post_stats = summarize_pct(working['AnnPITValue_Pct'])
    print("\n=== AnnPITValue_Pct summary — AFTER quality drop ===")
    if post_stats:
        for k, v in post_stats.items():
            print(f"{k:>20}: {v}")
    else:
        print("No finite values remain after the quality drop.")

    # -------------------------------------------------------------------------
    # 11) Final columns and export
    # -------------------------------------------------------------------------
    base_cols = [
        'ID', 'CompanyName', 'ImplCountry', 'CurrentCurrency', 'HistCurrency',
        'PIT Date', 'Frequency', 'UpdateCode', 'FiscalPeriod', 'FYE Month',
        'ItemCode', 'Value', 'Str_FiscalPrd'
    ]

    # Period columns are emitted as (date, value) pairs: Q1..Q4, S1..S2,
    # T1..T3, then A.
    freq_cols = []
    for i in range(1, 5):
        freq_cols += [f'Q{i}_Date', f'Q{i}']
    for i in range(1, 3):
        freq_cols += [f'S{i}_Date', f'S{i}']
    for i in range(1, 4):
        freq_cols += [f'T{i}_Date', f'T{i}']
    freq_cols += ['A_Date', 'A']

    # AnnPITValue_Period is placed directly before AnnPITValue in the final
    # column order.
    keep_cols = (
        [c for c in base_cols if c in working.columns] +
        ['TrueValue', 'AnnPITValue_Period', 'AnnPITValue',
         'AnnPITValue_Pct', 'HasFutureDateError'] +
        [c for c in freq_cols if c in working.columns]
    )

    # Drop helper columns that are only needed for intermediate computations
    drop_cols = [
        c for c in working.columns
        if c.endswith('_OriginFP') or c in ['QNUM', 'SNUM', 'TNUM', 'TrueValue_Date']
    ]
    working.drop(columns=drop_cols, inplace=True, errors='ignore')

    # reindex fixes the column order to keep_cols (and would create an
    # all-NaN column for any name in keep_cols that is missing from working).
    annualized_processed = working.reindex(columns=keep_cols)

    # -------------------------------------------------------------------------
    # 12) Save full and subset outputs
    # -------------------------------------------------------------------------
    # Both names are expected to have been set by earlier notebook cells.
    assert 'Temp_file_path_DP' in globals(), "Temp_file_path_DP not found."
    assert 'base_output_filename' in globals(), "base_output_filename not found (set earlier)."

    out_full = os.path.join(Temp_file_path_DP, f"{base_output_filename}.txt")
    annualized_processed.to_csv(out_full, sep='|', index=False)
    print("\nSaved full:", out_full)

    # The subset keeps only identifying columns plus the selected value;
    # AnnPITValue_Period appears before AnnPITValue.
    subset_cols = [
        "ID", "PIT Date", "CompanyName", "HistCurrency",
        "FiscalPeriod", "AnnPITValue_Period", "AnnPITValue"
    ]
    subset_cols_existing = [col for col in subset_cols if col in annualized_processed.columns]
    subset_df = annualized_processed[subset_cols_existing].copy()
    out_subset = os.path.join(Temp_file_path_DP, f"{base_output_filename}_subset.txt")
    subset_df.to_csv(out_subset, sep='|', index=False)
    print("Saved subset:", out_subset)
    del subset_df

    # -------------------------------------------------------------------------
    # 13) Row-accounting overview
    # -------------------------------------------------------------------------
    # Sanity check: every input row must be either excluded by frequency,
    # dropped by the quality filter, or present in the output.
    output_rows = len(annualized_processed)
    print("\n=== Row Accounting ===")
    print(f"Input rows:                     {input_rows:,}")
    print(f"Excluded by Frequency (E/L/R/U):{excluded_rows:,}")
    print(f"Dropped by quality (Pct rules): {dropped_quality_rows:,}")
    print(f"Output rows (final):            {output_rows:,}")
    check_total = excluded_rows + dropped_quality_rows + output_rows
    print(f"Check: excluded + dropped + output = {check_total:,}")
    if check_total == input_rows:
        print("Row counts reconcile exactly.")
    else:
        print(f"Mismatch of {input_rows - check_total:+,} rows. Please investigate.")

    gc.collect()

else:
    print("annualized_encoded not found or None; skipping.")

Input dataset contains 3,551,641 rows before processing.


=== Future-date check (period dates > PIT Date) ===
Per-label violations: {'A_Date': 0, 'Q1_Date': 0, 'Q2_Date': 0, 'Q3_Date': 0, 'Q4_Date': 0, 'S1_Date': 0, 'S2_Date': 0, 'T1_Date': 0, 'T2_Date': 0, 'T3_Date': 0}
Rows with ANY future-dated period value: 0

=== AnnPITValue_Pct summary — BEFORE quality drop ===
         finite_rows: 3448829
                mean: 23618.976151775518
              median: 100.0
winsorized_mean_1pct: 100.29664618582204
                 p10: 87.85928855984785
                 p20: 97.47523938800086
                 p30: 100.0
                 p40: 100.0
                 p50: 100.0
                 p60: 100.0
                 p70: 100.0
                 p80: 100.21294398824752
                 p90: 109.39895940062765

Rows to drop due to AnnPITValue_Pct (±inf or >200 or <50): 88,373

=== AnnPITValue_Pct summary — AFTER quality drop ===
         finite_rows: 3360456
                mean: 100.1777373893

### Annualized 5

#### Set Index

In [181]:
# =============================================================================
# SELECT A SINGLE ANNUALIZED_* ITEM AND PREPARE PATHS
# =============================================================================
# This cell:
#   1. Selects which Annualized_* item (from annualized_vars) should be processed.
#   2. Validates that annualized_vars and Temp_file_path_DP are available.
#   3. Builds the input file path for the selected "work_subset_<item>.txt".
#   4. Defines a base_output_filename used later when saving processed results.
#   5. Ensures the data-preparation temp directory exists.
#
# Usage:
#   - Change `annualized_index` to process a different Annualized_* dataset
#     (e.g. 1, 2, 10, ...).
#   - Assumes `annualized_vars` was created earlier (mapping "Annualized_n" to
#     sanitized item names) and `Temp_file_path_DP` was set in your environment
#     setup cell.
# =============================================================================

import os
from pathlib import Path

# Selects one Annualized_* item and derives the input path and output base
# name that the following cells read (item_key, target_item_name, file_name,
# file_path, base_output_filename).

# 1) Choose which Annualized_* item to run
annualized_index = 5  # change this to re-run a different dataset

# annualized_vars should look like: {'Annualized_1': 'SomeItem', ...}
# It is expected to have been created by an earlier cell.
assert 'annualized_vars' in globals(), "annualized_vars dict not found in globals()."

# Build the key and fetch the corresponding sanitized item name.
# .get returns None for an unknown key, which the assertion below rejects
# (an empty item name is rejected as well).
item_key = f"Annualized_{annualized_index}"
target_item_name = annualized_vars.get(item_key)
assert target_item_name, f"{item_key} not found in annualized_vars."

print(f"Selected: {item_key}  ->  ItemName: '{target_item_name}'")

# 2) Construct file paths based on the selected item
assert 'Temp_file_path_DP' in globals(), "Temp_file_path_DP not found."

# Input file produced by earlier steps (merging characteristics etc.)
file_name = f"work_subset_{target_item_name}.txt"
file_path = os.path.join(Temp_file_path_DP, file_name)

# Base name for all output files created in the annualized pipeline
# (the pipeline appends ".txt" and "_subset.txt" to it when saving)
base_output_filename = f"Annualized_{target_item_name}_complete"

# 3) Ensure the output directory exists (no error if it is already there)
Path(Temp_file_path_DP).mkdir(parents=True, exist_ok=True)


Selected: Annualized_5  ->  ItemName: 'Current_Liabilities___Total'


#### Import relevant data



In [182]:
# =============================================================================
# LOAD THE FULL DATASET FOR THE SELECTED SPECIAL ITEM (ANNUALIZED VERSION)
# =============================================================================
# This cell:
#   1. Uses `target_item_name` and `file_path` (defined in the previous cell)
#      to load the corresponding work_subset file.
#   2. Imports the file using `import_file_to_dataframe`.
#   3. Performs safety checks for existence and emptiness.
#   4. Shows a preview of the loaded dataset.
#   5. Falls back to an empty DataFrame if loading fails.
#   6. Runs garbage collection afterwards.
# =============================================================================

# Load the work_subset file of the selected item into `annualized_raw`.
# Whatever goes wrong (file missing, loader returned nothing, file empty),
# `annualized_raw` ends up as an empty DataFrame so later cells can test it.
print(f"\nImporting full annualized dataset for Item: '{target_item_name}' ...")

if not os.path.exists(file_path):
    print(f"File not found: {file_path}")
    annualized_raw = pd.DataFrame()

else:
    annualized_raw = import_file_to_dataframe(file_path)

    if annualized_raw is None or annualized_raw.empty:
        print("Annualized dataset appears empty or could not be loaded.")
        annualized_raw = pd.DataFrame()
    else:
        print(f"Full annualized dataset loaded successfully: {len(annualized_raw):,} rows total.")
        # Rich preview when available, plain-text preview otherwise
        try:
            display(annualized_raw.head())
        except Exception:
            print(annualized_raw.head().to_string(index=False))

# Free memory held by temporaries of the import
gc.collect()



Importing full annualized dataset for Item: 'Current_Liabilities___Total' ...
Full annualized dataset loaded successfully: 3,568,264 rows total.


Unnamed: 0,ID,CompanyName,ImplCountry,CurrentCurrency,HistCurrency,PIT Date,Frequency,UpdateCode,FiscalPeriod,FYE Month,ItemCode,Value
0,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1992,December,3101,663.515421
1,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1993,December,3101,693.189519
2,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1994,December,3101,773.728435
3,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1996-05-03,A,3,1995,December,3101,852.251409
4,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1998-07-03,A,3,1996,December,3101,841.844876


0

#### Encode Frequency Code (Check of output required!)

In [183]:
# =============================================================================
# FISCAL PERIOD ENCODING FOR ANNUALIZED DATASET
# =============================================================================
# This cell:
#   1. Defines helper functions:
#        - last2: extract last two digits of a number as a zero-padded string.
#        - add_str_fiscalprd: create Str_FiscalPrd from numeric FiscalPeriod
#          and Frequency, derive an implied full-year FiscalPeriod, and check
#          for inconsistencies on annual rows.
#   2. Applies this encoding to `annualized_raw` (if available) and stores
#      the result in `annualized_encoded`.
#   3. Shows a preview of the encoded DataFrame.
#
# Assumptions:
#   - `annualized_raw` has already been loaded in a previous cell.
#   - `target_item_name` is defined and is just used for printing context.
#   - DataFrame contains at least the columns: 'Frequency', 'FiscalPeriod'.
# =============================================================================

import numpy as np
import pandas as pd
from IPython.display import display


def last2(n):
    """
    Give the final two digits of a number as a two-character string.

    Missing input (anything pd.isna reports as missing) yields None.

    Examples:
        last2(2023) -> "23"
        last2(85)   -> "85"
        last2(7)    -> "07"
        last2(NaN)  -> None
    """
    if pd.isna(n):
        return None
    # Left-pad the integer to at least 4 characters (e.g. 23 -> "0023") so
    # that short numbers still provide two characters to slice off the end.
    padded = str(int(n)).zfill(4)
    return padded[-2:]


def add_str_fiscalprd(df: pd.DataFrame) -> pd.DataFrame:
    """
    Build 'Str_FiscalPrd' and overwrite 'FiscalPeriod' with an implied full year.

    Logic:
      1) Normalize Frequency to uppercase string (missing -> "").
      2) For each row, interpret numeric FiscalPeriod depending on Frequency
         and create a string fiscal-period label Str_FiscalPrd:
           - C, Q, E, R: quarter-based  -> "Q{1-4}Y{yy}"  (fp % 4 + 1, fp // 4)
           - A, B:       annual         -> "Y{yy}"        (fp itself)
           - F, S:       semiannual     -> "S{1-2}Y{yy}"  (fp % 2 + 1, fp // 2)
           - K, T, L, U: trimester-like -> "T{1-3}Y{yy}"  (fp % 3 + 1, fp // 3)
         Rows with any other Frequency keep a missing label.
      3) Extract the "yy" part from Str_FiscalPrd and map to a full year:
           yy >= 80 -> 19yy (e.g. "85" -> 1985)
           yy <  80 -> 20yy (e.g. "23" -> 2023)
         This becomes ImplFiscPer_Calculated.
      4) For rows with annual frequency (A,B), compare ImplFiscPer_Calculated
         to the original FiscalPeriod and print a short discrepancy summary.
      5) Overwrite 'FiscalPeriod' with ImplFiscPer_Calculated and drop the
         helper columns used for the check.

    Notes:
      - The input DataFrame is not modified; a copy is returned.
      - 'Str_FiscalPrd' is created with object dtype from the start, so the
        string labels can be written into it without an incompatible-dtype
        upcast, and so the later `.str` access also works when no row
        received a label.
      - A row whose FiscalPeriod is not numeric still receives a label, but
        with a placeholder period index and an empty year part; its implied
        FiscalPeriod is NaN.

    Returns:
      A new DataFrame with:
        - 'Str_FiscalPrd'
        - updated 'FiscalPeriod' (implied full year; NaN where no year could
          be derived)
    """
    df = df.copy()

    # Normalize frequency codes for consistent logic
    df["Frequency"] = df["Frequency"].str.upper().fillna("")

    # Store original FiscalPeriod for validation later
    df["Original_FiscalPeriod"] = df["FiscalPeriod"]

    # Numeric version of FiscalPeriod for modular arithmetic
    fp = pd.to_numeric(df["FiscalPeriod"], errors="coerce")

    # Object dtype up front: the column is going to hold strings.
    df["Str_FiscalPrd"] = pd.Series(np.nan, index=df.index, dtype=object)

    # --- Annual (A, B): label is just "Y" + last two digits of the year ---
    m_AB = df["Frequency"].isin(["A", "B"])
    if m_AB.any():
        df.loc[m_AB, "Str_FiscalPrd"] = "Y" + fp[m_AB].apply(last2).fillna("")

    # --- Sub-annual frequencies: (codes, label prefix, periods per year) ---
    # The period index is fp % n + 1 and the year is fp // n.
    sub_annual = (
        (["C", "Q", "E", "R"], "Q", 4),   # quarter-based
        (["F", "S"],           "S", 2),   # semiannual
        (["K", "T", "L", "U"], "T", 3),   # trimester-like
    )
    for codes, prefix, per_year in sub_annual:
        mask = df["Frequency"].isin(codes)
        if not mask.any():
            continue  # nothing to label for this frequency group
        group_fp = fp[mask]
        period_idx = ((group_fp % per_year) + 1).astype("Int64").astype(str)
        yy = (group_fp // per_year).apply(last2).fillna("")
        df.loc[mask, "Str_FiscalPrd"] = prefix + period_idx + "Y" + yy

    # --- Derive implied full-year FiscalPeriod from Str_FiscalPrd ---
    # Extract the "yy" part following "Y" in labels like "Q1Y23", "Y21", etc.
    year_part = df["Str_FiscalPrd"].str.extract(r"Y(\d{2})", expand=False)
    year_numeric = pd.to_numeric(year_part, errors="coerce")

    # Map yy to either 19yy or 20yy, depending on cutoff at 80.
    # A missing yy compares False and stays NaN after the addition.
    century = np.where(year_numeric >= 80, 1900, 2000)
    df["ImplFiscPer_Calculated"] = year_numeric + century

    # --- Discrepancy check for annual rows (A,B only) ---
    annual_rows = df[m_AB]
    implied_year = annual_rows["ImplFiscPer_Calculated"]
    original_year = pd.to_numeric(
        annual_rows["Original_FiscalPeriod"], errors="coerce"
    )
    # Rows agree when the years are equal or when both are missing
    agrees = (implied_year == original_year) | (
        implied_year.isna() & original_year.isna()
    )
    discrepancy_rows = annual_rows[~agrees]

    if not discrepancy_rows.empty:
        print(
            "\nDiscrepancies between original FiscalPeriod and calculated "
            "ImplFiscPer for annual (A, B) rows:"
        )
        display(
            discrepancy_rows[
                ["ID", "Frequency", "Original_FiscalPeriod",
                 "Str_FiscalPrd", "ImplFiscPer_Calculated"]
            ].head()
        )
        print(f"Total discrepancies for annual frequencies: {len(discrepancy_rows)}")
    else:
        print(
            "\nNo discrepancies between original FiscalPeriod and calculated "
            "ImplFiscPer for annual (A, B) rows."
        )

    # Overwrite FiscalPeriod with the implied year
    df["FiscalPeriod"] = df["ImplFiscPer_Calculated"]

    # Remove helper columns that are no longer needed
    df.drop(columns=["Original_FiscalPeriod", "ImplFiscPer_Calculated"], inplace=True)

    return df


# -----------------------------------------------------------------------------
# Apply encoding to the Annualized dataset
# -----------------------------------------------------------------------------
# Encode fiscal periods only when a non-empty `annualized_raw` frame is available;
# otherwise publish `annualized_encoded = None` so later cells can skip cleanly.
if "annualized_raw" not in globals() or annualized_raw is None or annualized_raw.empty:
    print("annualized_raw not found or empty. Cannot perform encoding.")
    annualized_encoded = None
else:
    print(f"Applying fiscal period encoding to Annualized dataset for '{target_item_name}' ...")
    annualized_encoded = add_str_fiscalprd(annualized_raw)
    # Quick visual check of the encoded result
    display(annualized_encoded.head())


Applying fiscal period encoding to Annualized dataset for 'Current_Liabilities___Total' ...


  df.loc[m_quarter, "Str_FiscalPrd"] = (



No discrepancies between original FiscalPeriod and calculated ImplFiscPer for annual (A, B) rows.


Unnamed: 0,ID,CompanyName,ImplCountry,CurrentCurrency,HistCurrency,PIT Date,Frequency,UpdateCode,FiscalPeriod,FYE Month,ItemCode,Value,Str_FiscalPrd
0,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1992,December,3101,663.515421,Y92
1,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1993,December,3101,693.189519,Y93
2,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1994,December,3101,773.728435,Y94
3,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1996-05-03,A,3,1995,December,3101,852.251409,Y95
4,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1998-07-03,A,3,1996,December,3101,841.844876,Y96


#### Annualize data with most recent information (Check of output required!)

In [184]:
# =============================================================================
# ANNUALIZED PIPELINE: BUILD AnnPITValue FROM A/Q/S/T, QC, AND SAVE OUTPUT
# =============================================================================
# High-level overview:
#
#   1. Helper utilities
#      - _key, asof_numpy:
#          * Implement a fast, vectorized “as-of” join:
#              For each row in a left DataFrame, find the latest value in a
#              right DataFrame with the same keys and PIT Date <= left PIT Date.
#      - pctile, summarize_pct:
#          * Compute robust summary statistics for quality control, including
#            winsorized mean and decile percentiles.
#      - pick_latest_nonzero_within_year:
#          * For a given PIT Date and FiscalPeriod, evaluate all available
#            period values (A, Q1..Q4, S1..S2, T1..T3) with known origin
#            fiscal periods.
#          * Choose a single value as AnnPITValue based on:
#               - same-year vs prior-year vs other origin
#               - period priority (A > Q4 > T3 > S2 > Q3 > ... > Q1)
#               - the latest date within a one-year window before PIT.
#          * NEW: returns both the value and the period label from which it
#                 was chosen (AnnPITValue_Period).
#
#   2. Main pipeline for annualized_encoded:
#      - Filter out unsupported frequencies (E/L/R/U).
#      - Normalize types for PIT Date, FiscalPeriod, Value, and key columns.
#      - Derive QNUM, SNUM, TNUM indices from Str_FiscalPrd.
#      - Ensure all period- and date-columns (A/Q/S/T) exist.
#      - Build TrueValue from annual rows (A,B) as the last observed annual
#        value per (ID, FiscalPeriod, HistCurrency).
#      - Use asof_numpy to populate:
#           A, A_Date, A_OriginFP
#           Q1..Q4, S1..S2, T1..T3 and their dates + OriginFP (from origin FP).
#      - For each row, compute:
#           * AnnPITValue        = chosen value
#           * AnnPITValue_Period = 'A', 'Q4', 'S1', 'T3', etc.
#      - Check for any period dates that are after PIT Date (future-date errors).
#      - Compute AnnPITValue_Pct = AnnPITValue / TrueValue * 100 and drop rows
#        outside [50, 200] or with infinite ratios.
#      - Build a final, lean set of columns and save:
#           * full file:   <base_output_filename>.txt
#           * subset file: <base_output_filename>_subset.txt
#        NEW: AnnPITValue_Period is placed directly before AnnPITValue and
#             included in both full and subset outputs.
#      - Print a row-accounting overview for reconciliation.
# =============================================================================

import os
import gc
from datetime import timedelta

import numpy as np
import pandas as pd
from scipy.stats.mstats import winsorize

pd.options.mode.copy_on_write = True


# -----------------------------------------------------------------------------
# Helper: build a single key column from multiple columns
# -----------------------------------------------------------------------------
def _key(fr: pd.DataFrame, cols):
    """
    Return one string per row made of the selected columns joined by '||'.

    Every selected column is first converted to ``str``; the per-row pieces
    are then concatenated in the order given by ``cols``. The as-of join uses
    the result as a single grouping key.

    Example:
        _key(df, ['ID', 'HistCurrency']) -> "123||USD"
    """
    as_text = fr[cols].astype(str)
    separator = '||'
    return as_text.agg(separator.join, axis=1)


# -----------------------------------------------------------------------------
# Helper: fast as-of join (right.PIT <= left.PIT)
# -----------------------------------------------------------------------------
def asof_numpy(left_df: pd.DataFrame, right_df: pd.DataFrame, by_cols: list[str]):
    """
    For each row in left_df, find the latest (as-of) Value from right_df such that:

        1) by_cols are equal on both sides (e.g. ID, HistCurrency, ItemCode, FiscalPeriod)
        2) right_df['PIT Date'] <= left_df['PIT Date']

    Implementation notes:
      - Rows with a missing key column or PIT Date (and, on the right, a
        missing Value) are ignored.
      - PIT Dates are converted to datetime and floored to days on both sides;
        rows whose PIT Date cannot be parsed (NaT after coercion) are ignored
        as well, so an unparseable left date can never pick up a value.
      - A composite string key is built from by_cols via `_key`.
      - The right-hand rows are sorted by key and PIT Date; each key then owns
        one contiguous slice of the date/value arrays.
      - Left-hand rows are grouped by key; for each group np.searchsorted
        locates the last right PIT Date <= each left PIT Date.
      - Results are written at the *positional* row number of each left row,
        so the function works for any index on left_df (filtered, unsorted,
        non-integer or duplicated labels).

    Parameters
    ----------
    left_df : pd.DataFrame
        Rows to enrich; must contain by_cols and 'PIT Date'.
    right_df : pd.DataFrame
        Source rows; must contain by_cols, 'PIT Date' and 'Value'.
    by_cols : list[str]
        Columns that must match exactly between both frames.

    Returns
    -------
    out_vals : np.ndarray
        Array of matched values (float64), one entry per row of left_df in
        row order, NaN where no match.
    out_dates : np.ndarray
        Array of matched dates (datetime64[ns]), same layout, NaT where no match.
    """
    # Initialize output arrays with NaNs/NaT (one slot per left row, by position)
    n_left = len(left_df)
    out_vals  = np.full(n_left, np.nan, dtype='float64')
    out_dates = np.full(n_left, 'NaT', dtype='datetime64[ns]')

    # Required columns: keys plus PIT Date (and Value on the right)
    by_cols   = list(by_cols)
    left_req  = by_cols + ['PIT Date']
    right_req = by_cols + ['PIT Date', 'Value']

    # Boolean row masks as plain arrays, so selection never depends on labels
    lmask = left_df[left_req].notna().all(axis=1).to_numpy()
    rmask = right_df[right_req].notna().all(axis=1).to_numpy()
    if not lmask.any() or not rmask.any():
        return out_vals, out_dates

    l = left_df.loc[lmask, left_req].copy()
    r = right_df.loc[rmask, right_req].copy()

    # Positional row numbers (within left_df) of the rows that survived the mask
    l_pos = np.flatnonzero(lmask)

    # Normalize PIT Date columns to datetime, day precision
    l['PIT Date'] = pd.to_datetime(l['PIT Date'], errors='coerce').dt.floor('D')
    r['PIT Date'] = pd.to_datetime(r['PIT Date'], errors='coerce').dt.floor('D')

    # Coercion can turn non-null but unparseable dates into NaT; such rows
    # must not take part in the binary search below.
    l_ok = l['PIT Date'].notna().to_numpy()
    if not l_ok.all():
        l, l_pos = l.loc[l_ok], l_pos[l_ok]
    r_ok = r['PIT Date'].notna().to_numpy()
    if not r_ok.all():
        r = r.loc[r_ok].copy()
    if l.empty or r.empty:
        return out_vals, out_dates

    # Sort right by key and PIT Date so each key forms one date-ordered block
    r['__k'] = _key(r, by_cols)
    r = r.sort_values(['__k', 'PIT Date']).reset_index(drop=True)

    rk   = r['__k'].to_numpy()
    rdt  = r['PIT Date'].to_numpy()
    rval = r['Value'].to_numpy()

    # A new block starts wherever the key differs from the previous row
    r_starts = np.flatnonzero(np.concatenate(([True], rk[1:] != rk[:-1])))
    r_ends   = np.append(r_starts[1:], len(rk))
    slices = {rk[s]: (s, e) for s, e in zip(r_starts, r_ends)}

    # Group the left rows by key (stable sort keeps the original row order
    # inside each group)
    lk    = _key(l, by_cols).to_numpy()
    ldt   = l['PIT Date'].to_numpy()
    order = np.argsort(lk, kind='mergesort')
    sk, sd, sp = lk[order], ldt[order], l_pos[order]

    l_starts = np.flatnonzero(np.concatenate(([True], sk[1:] != sk[:-1])))
    l_ends   = np.append(l_starts[1:], len(sk))

    for s, e in zip(l_starts, l_ends):
        bounds = slices.get(sk[s])
        if bounds is None:
            # key never occurs on the right side -> leave NaN / NaT
            continue

        rs, re_ = bounds
        r_dates = rdt[rs:re_]
        r_vals  = rval[rs:re_]

        # side='right' gives the index of the first element > date; minus 1
        # is therefore the index of the last element <= date (-1 if none).
        pos   = np.searchsorted(r_dates, sd[s:e], side='right') - 1
        valid = pos >= 0
        if valid.any():
            target = sp[s:e][valid]
            out_vals[target]  = r_vals[pos[valid]]
            out_dates[target] = r_dates[pos[valid]]

    return out_vals, out_dates


# -----------------------------------------------------------------------------
# Small helpers for QC statistics
# -----------------------------------------------------------------------------
def pctile(s: pd.Series, q: float):
    """
    Return the q-quantile of ``s`` (linear interpolation).

    Any exception raised by ``Series.quantile`` is swallowed and NaN is
    returned instead, so callers can build summary tables without guarding
    each call.
    """
    try:
        result = s.quantile(q, interpolation='linear')
    except Exception:
        result = np.nan
    return result


def summarize_pct(series: pd.Series):
    """
    Summarize the finite part of a numeric series for quality control.

    +/-inf are treated as missing and dropped together with NaN. If nothing
    remains, an empty dict is returned. Otherwise the dict contains, in this
    order:
      - finite_rows: count of remaining observations
      - mean, median
      - winsorized_mean_1pct: mean after winsorizing 1% on each tail
      - p10, p20, ..., p90: decile quantiles (via pctile)
    """
    finite = series.replace([np.inf, -np.inf], np.nan).dropna()
    if finite.empty:
        return {}

    # Hand winsorize its own copy of the data so it gets a writable array
    winsorized = winsorize(finite.to_numpy().copy(), limits=[0.01, 0.01])

    stats = {
        "finite_rows": len(finite),
        "mean": finite.mean(),
        "median": finite.median(),
        "winsorized_mean_1pct": winsorized.mean(),
    }
    # Deciles p10 .. p90
    for decile in range(10, 100, 10):
        stats[f"p{decile}"] = pctile(finite, decile / 100)
    return stats


# -----------------------------------------------------------------------------
# Period prioritization and label helper
# -----------------------------------------------------------------------------
# Period labels ordered from most to least preferred; priorities run from
# 100 (annual) down to 10 (first quarter) in steps of 10.
_PERIOD_PRIORITY = dict(zip(
    ('A', 'Q4', 'T3', 'S2', 'Q3', 'T2', 'S1', 'Q2', 'T1', 'Q1'),
    range(100, 0, -10),
))


def _label_from_colname(colname: str) -> str:
    """
    Translate a value column name into its period label.

    At present the mapping is the identity: 'A' yields 'A', and every other
    name ('Q1'..'Q4', 'S1'.., 'T1'..) is returned as given.
    """
    if colname == 'A':
        return 'A'
    return colname


# -----------------------------------------------------------------------------
# AnnPITValue selection using OriginFP and priority rules
# -----------------------------------------------------------------------------
def pick_latest_nonzero_within_year(
    row,
    value_cols,
    date_cols,
    pit_col='PIT Date',
    fp_col='FiscalPeriod'
):
    """
    Choose the annualized value (AnnPITValue) and its source period for one row.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide ``pit_col``; may provide ``fp_col``, the value/date
        columns and the matching ``<label>_OriginFP`` columns.
    value_cols, date_cols : sequences of str
        Paired column names (value column, date column), e.g. 'Q1'/'Q1_Date'.
    pit_col, fp_col : str
        Names of the PIT date and fiscal period columns.

    Candidate collection
    --------------------
    A (value, date) pair becomes a candidate when its date is present, not
    after the PIT date, and - after flooring to the day - not older than
    PIT date minus 365 days. Each candidate is tagged with:
      - its period label and the priority from _PERIOD_PRIORITY (-1 if unknown),
      - its relation to the row's fiscal period, derived from
        ``<label>_OriginFP`` (falling back to the row's own fiscal period
        when that is missing):
            'same'    origin == FP
            'prior'   origin == FP - 1
            'other'   anything else
            'unknown' FP or origin not available

    Selection order (NaN and 0.0 values are skipped in tiers 1-4)
    -------------------------------------------------------------
      1. same-year annual            -> latest date
      2. same-year partial (Q/S/T)   -> highest priority, then latest date
      3. prior-year annual           -> latest date
      4. any remaining candidate     -> highest priority, then latest date
      5. candidates equal to 0.0     -> highest priority, then latest date

    Returns
    -------
    (value, label)
      value : float or NaN
      label : str or NaN (e.g. 'A', 'Q4', 'S1', 'T3')
      (NaN, NaN) when the PIT date is missing or nothing qualifies.
    """
    nothing = (np.nan, np.nan)

    pit_date = row[pit_col]
    if pd.isna(pit_date):
        return nothing

    oldest_allowed = pit_date - timedelta(days=365)

    # Fiscal period of the row itself, as int when possible
    raw_fp = row.get(fp_col, np.nan)
    try:
        row_fp = None if pd.isna(raw_fp) else int(raw_fp)
    except Exception:
        row_fp = None

    # Each entry: (label, priority, date, value, relation)
    pool = []
    for value_name, date_name in zip(value_cols, date_cols):
        if not (value_name in row and date_name in row):
            continue

        raw_value = row[value_name]
        stamp = row[date_name]

        # Skip absent dates and dates after the PIT date
        if pd.isna(stamp) or stamp > pit_date:
            continue

        stamp = pd.to_datetime(stamp, errors='coerce')
        if pd.isna(stamp):
            continue

        stamp = stamp.floor('D')
        if stamp < oldest_allowed:
            # outside the one-year look-back window
            continue

        period = _label_from_colname(value_name)
        rank = _PERIOD_PRIORITY.get(period, -1)

        amount = np.nan if pd.isna(raw_value) else float(raw_value)

        # Origin fiscal period of this candidate; default to the row's own
        source_fp = row.get(f'{period}_OriginFP', np.nan)
        if pd.isna(source_fp):
            source_fp = row_fp
        try:
            if source_fp is None or pd.isna(source_fp):
                source_fp = None
            else:
                source_fp = int(source_fp)
        except Exception:
            source_fp = row_fp

        # Relation between candidate origin and the row's fiscal period
        if row_fp is None or source_fp is None:
            relation = 'unknown'
        elif source_fp == row_fp:
            relation = 'same'
        elif source_fp == row_fp - 1:
            relation = 'prior'
        else:
            relation = 'other'

        pool.append((period, rank, stamp, amount, relation))

    if not pool:
        return nothing

    def by_date(cand):
        return cand[2]

    def by_rank_then_date(cand):
        return (cand[1], cand[2])

    # Candidates carrying a real (non-NaN, non-zero) value
    usable = [c for c in pool if not np.isnan(c[3]) and c[3] != 0.0]

    tiers = (
        # 1) same-year annual
        ([c for c in usable if c[0] == 'A' and c[4] == 'same'], by_date),
        # 2) same-year partial periods
        ([c for c in usable if c[0] != 'A' and c[4] == 'same'], by_rank_then_date),
        # 3) prior-year annual (push-forward)
        ([c for c in usable if c[0] == 'A' and c[4] == 'prior'], by_date),
        # 4) anything usable
        (usable, by_rank_then_date),
        # 5) only zeros left: report 0.0 and keep its label
        ([c for c in pool if not np.isnan(c[3]) and c[3] == 0.0], by_rank_then_date),
    )
    for group, sort_key in tiers:
        if group:
            winner = max(group, key=sort_key)
            return (winner[3], winner[0])

    return nothing


# =============================================================================
# MAIN: annualized_encoded -> annualized_processed
# =============================================================================
# Run the pipeline only when the encoding cell produced a DataFrame; the
# final `else` branch just reports that the step was skipped.
if 'annualized_encoded' in globals() and annualized_encoded is not None:
    input_rows = len(annualized_encoded)
    print(f"Input dataset contains {input_rows:,} rows before processing.\n")

    # Work on a copy to avoid mutating the original DataFrame
    working = annualized_encoded.copy()

    # -------------------------------------------------------------------------
    # 1) Exclude frequencies that are not supported by this pipeline (E/L/R/U)
    # -------------------------------------------------------------------------
    # The comparison is case-insensitive; the count is kept for the row
    # accounting printed at the end.
    excl_mask = working['Frequency'].astype(str).str.upper().isin(['E', 'L', 'R', 'U'])
    excluded_rows = int(excl_mask.sum())
    working = working.loc[~excl_mask].copy()

    # -------------------------------------------------------------------------
    # 2) Basic type normalization
    # -------------------------------------------------------------------------
    # PIT Date as datetime (day precision); values not matching %Y-%m-%d
    # are coerced to NaT rather than raising.
    working['PIT Date'] = pd.to_datetime(
        working['PIT Date'], format='%Y-%m-%d', errors='coerce'
    ).dt.floor('D')

    # FiscalPeriod and Value as numeric
    working['FiscalPeriod'] = pd.to_numeric(working['FiscalPeriod'], errors='coerce')
    working['Value']        = pd.to_numeric(working['Value'], errors='coerce')

    # Key-like columns as string (consistent joins and as-of keys)
    for c in ['ID', 'HistCurrency', 'ItemCode', 'Frequency', 'Str_FiscalPrd']:
        if c in working.columns:
            working[c] = working[c].astype(str)

    # -------------------------------------------------------------------------
    # 3) Parse Q/S/T sequence numbers from Str_FiscalPrd
    # -------------------------------------------------------------------------
    # Each extract yields NaN for rows whose Str_FiscalPrd does not start with
    # the respective pattern, so at most one of QNUM/SNUM/TNUM is set per row.
    # Extract quarter index 1..4 from strings like "Q1Y23"
    working['QNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^Q([1-4])Y', expand=False),
        errors='coerce'
    )
    # Extract semiannual index 1..2 from "S1Y23"
    working['SNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^S([1-2])Y', expand=False),
        errors='coerce'
    )
    # Extract trimester index 1..3 from "T1Y23"
    working['TNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^T([1-3])Y', expand=False),
        errors='coerce'
    )

    # -------------------------------------------------------------------------
    # 4) Ensure A/Q/S/T value and date columns exist
    # -------------------------------------------------------------------------
    # period_vals and period_dates are built in the same order
    # (Q1..Q4, S1..S2, T1..T3, A) because step 8 pairs them positionally.
    period_vals = [f'Q{i}' for i in range(1, 5)] + \
                  [f'S{i}' for i in range(1, 3)] + \
                  [f'T{i}' for i in range(1, 4)] + ['A']

    period_dates = [f'{p}_Date' for p in [f'Q{i}' for i in range(1, 5)] +
                                       [f'S{i}' for i in range(1, 3)] +
                                       [f'T{i}' for i in range(1, 4)]] + ['A_Date']

    # Create missing value/date columns initialized to NaN / NaT
    # (all of them are overwritten by the as-of mapping in step 6)
    for c in period_vals:
        if c not in working.columns:
            working[c] = np.nan
    for c in period_dates:
        if c not in working.columns:
            working[c] = pd.NaT

    # Keys used for every as-of lookup below
    base_keys = ['ID', 'HistCurrency', 'ItemCode', 'FiscalPeriod']

    # -------------------------------------------------------------------------
    # 5) Build TrueValue from annual (A,B) rows
    # -------------------------------------------------------------------------
    # TrueValue is the last known annual value per (ID, FiscalPeriod, HistCurrency)
    # Sorting by PIT Date and keeping the last duplicate selects the annual
    # value with the latest PIT Date overall - it is not limited to dates
    # on or before the PIT Date of the row it is merged onto.
    mask_annual = working['Frequency'].isin(['A', 'B']) & working['Value'].notna()
    annual_src = (
        working.loc[mask_annual,
                    ['ID', 'FiscalPeriod', 'HistCurrency', 'PIT Date', 'Value']]
        .sort_values(['ID', 'FiscalPeriod', 'HistCurrency', 'PIT Date'])
        .drop_duplicates(['ID', 'FiscalPeriod', 'HistCurrency'], keep='last')
        .rename(columns={'Value': 'TrueValue', 'PIT Date': 'TrueValue_Date'})
    )
    # annual_src holds one row per merge key (see drop_duplicates above), so
    # this left merge is not expected to change the number of rows.
    working = working.merge(
        annual_src,
        on=['ID', 'FiscalPeriod', 'HistCurrency'],
        how='left'
    )

    # -------------------------------------------------------------------------
    # 6) As-of mapping for each frequency (same FiscalPeriod only)
    # -------------------------------------------------------------------------
    # FiscalPeriod is part of base_keys, so every matched value stems from the
    # row's own fiscal period; the *_OriginFP columns are therefore filled
    # with the row's FiscalPeriod wherever a value was found.
    # 6.1 Annual as-of (A, B)
    src_A = working.loc[
        working['Frequency'].isin(['A', 'B']) & working['Value'].notna(),
        base_keys + ['PIT Date', 'Value']
    ].copy()
    vA, dA = asof_numpy(working, src_A, by_cols=base_keys)
    working['A'], working['A_Date'] = vA, dA
    working['A_OriginFP'] = np.where(
        working['A'].notna(), working['FiscalPeriod'], np.nan
    )

    # 6.2 Quarterly as-of (Q, C)
    src_Q = working.loc[
        working['Frequency'].isin(['Q', 'C']) & working['QNUM'].notna(),
        base_keys + ['QNUM', 'PIT Date', 'Value']
    ].copy()
    for q in (1, 2, 3, 4):
        # one as-of lookup per quarter number, against that quarter's rows only
        rv = src_Q[src_Q['QNUM'] == q].drop(columns=['QNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'Q{q}', f'Q{q}_Date'
        working[col], working[dcol] = v, d
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        # only fill OriginFP where a value exists and no origin is set yet
        mask = working[col].notna() & working[ocol].isna()
        working.loc[mask, ocol] = working.loc[mask, 'FiscalPeriod']

    # 6.3 Semiannual as-of (S, F)
    src_S = working.loc[
        working['Frequency'].isin(['S', 'F']) & working['SNUM'].notna(),
        base_keys + ['SNUM', 'PIT Date', 'Value']
    ].copy()
    for s in (1, 2):
        rv = src_S[src_S['SNUM'] == s].drop(columns=['SNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'S{s}', f'S{s}_Date'
        working[col], working[dcol] = v, d
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        mask = working[col].notna() & working[ocol].isna()
        working.loc[mask, ocol] = working.loc[mask, 'FiscalPeriod']

    # 6.4 Trimester as-of (T, K)
    src_T = working.loc[
        working['Frequency'].isin(['T', 'K']) & working['TNUM'].notna(),
        base_keys + ['TNUM', 'PIT Date', 'Value']
    ].copy()
    for t in (1, 2, 3):
        rv = src_T[src_T['TNUM'] == t].drop(columns=['TNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'T{t}', f'T{t}_Date'
        working[col], working[dcol] = v, d
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        mask = working[col].notna() & working[ocol].isna()
        working.loc[mask, ocol] = working.loc[mask, 'FiscalPeriod']

    # -------------------------------------------------------------------------
    # 7) Normalize date columns (no forward-fill; only directly mapped values)
    # -------------------------------------------------------------------------
    working = working.sort_values(['ID', 'HistCurrency', 'FiscalPeriod', 'PIT Date'])

    value_labels  = period_vals
    date_labels   = period_dates
    # NOTE(review): origin_labels is not referenced again in this cell.
    origin_labels = [f'{lbl}_OriginFP' for lbl in value_labels]

    for c in date_labels:
        if c in working.columns:
            working[c] = pd.to_datetime(working[c], errors='coerce').dt.floor('D')

    # -------------------------------------------------------------------------
    # 8) Compute AnnPITValue and AnnPITValue_Period
    # -------------------------------------------------------------------------
    # NEW: we compute both the chosen annualized value and the period label
    # it came from (e.g., 'A', 'Q4', 'S1', 'T3') and store the label in
    # AnnPITValue_Period.
    # The selection runs row by row; the returned (value, label) tuple is
    # wrapped in a Series so that apply yields two named columns.
    ann_res = working.apply(
        lambda r: pd.Series(
            pick_latest_nonzero_within_year(
                r,
                value_cols=value_labels,
                date_cols=date_labels,
                pit_col='PIT Date',
                fp_col='FiscalPeriod'
            ),
            index=['AnnPITValue', 'AnnPITValue_Period']
        ),
        axis=1
    )
    working = pd.concat([working, ann_res], axis=1)

    # -------------------------------------------------------------------------
    # 9) Future-date QC check (period dates > PIT Date)
    # -------------------------------------------------------------------------
    # Counts, per date column, the rows whose mapped period date lies after
    # the row's PIT Date, and flags rows with at least one such date.
    date_cols_all = [
        'A_Date',
        'Q1_Date', 'Q2_Date', 'Q3_Date', 'Q4_Date',
        'S1_Date', 'S2_Date',
        'T1_Date', 'T2_Date', 'T3_Date'
    ]
    present = [c for c in date_cols_all if c in working.columns]
    viol_counts = {}
    any_mask = pd.Series(False, index=working.index)

    for c in present:
        m = (
            working[c].notna() &
            working['PIT Date'].notna() &
            (pd.to_datetime(working[c], errors='coerce') > working['PIT Date'])
        )
        viol_counts[c] = int(m.sum())
        any_mask |= m

    total_future_viol = int(any_mask.sum())
    print("\n=== Future-date check (period dates > PIT Date) ===")
    print("Per-label violations:", viol_counts)
    print(f"Rows with ANY future-dated period value: {total_future_viol}")
    working['HasFutureDateError'] = any_mask

    # -------------------------------------------------------------------------
    # 10) Compute AnnPITValue_Pct vs TrueValue and apply quality filter
    # -------------------------------------------------------------------------
    # The ratio is left as NaN when either input is missing or TrueValue is 0.
    working['AnnPITValue_Pct'] = np.where(
        working['AnnPITValue'].notna() &
        working['TrueValue'].notna() &
        (working['TrueValue'] != 0),
        (working['AnnPITValue'] / working['TrueValue']) * 100,
        np.nan
    )

    pre_stats = summarize_pct(working['AnnPITValue_Pct'])
    print("\n=== AnnPITValue_Pct summary — BEFORE quality drop ===")
    for k, v in pre_stats.items():
        print(f"{k:>20}: {v}")

    # Drop rows whose ratio is infinite or finite but outside [50, 200].
    # Rows with a NaN ratio match neither condition and are kept.
    pct = working['AnnPITValue_Pct']
    is_inf = np.isinf(pct)
    is_finite = np.isfinite(pct)
    out_of_range = is_finite & ((pct > 200) | (pct < 50))
    to_drop_quality = is_inf | out_of_range

    dropped_quality_rows = int(to_drop_quality.sum())
    print(f"\nRows to drop due to AnnPITValue_Pct (±inf or >200 or <50): {dropped_quality_rows:,}")

    working = working.loc[~to_drop_quality].copy()

    post_stats = summarize_pct(working['AnnPITValue_Pct'])
    print("\n=== AnnPITValue_Pct summary — AFTER quality drop ===")
    if post_stats:
        for k, v in post_stats.items():
            print(f"{k:>20}: {v}")
    else:
        print("No finite values remain after the quality drop.")

    # -------------------------------------------------------------------------
    # 11) Final columns and export
    # -------------------------------------------------------------------------
    base_cols = [
        'ID', 'CompanyName', 'ImplCountry', 'CurrentCurrency', 'HistCurrency',
        'PIT Date', 'Frequency', 'UpdateCode', 'FiscalPeriod', 'FYE Month',
        'ItemCode', 'Value', 'Str_FiscalPrd'
    ]

    # Period columns in output order: date first, then value, for Q, S, T, A
    freq_cols = []
    for i in range(1, 5):
        freq_cols += [f'Q{i}_Date', f'Q{i}']
    for i in range(1, 3):
        freq_cols += [f'S{i}_Date', f'S{i}']
    for i in range(1, 4):
        freq_cols += [f'T{i}_Date', f'T{i}']
    freq_cols += ['A_Date', 'A']

    # NEW: AnnPITValue_Period is included and explicitly placed
    # directly before AnnPITValue in the final column order.
    keep_cols = (
        [c for c in base_cols if c in working.columns] +
        ['TrueValue', 'AnnPITValue_Period', 'AnnPITValue',
         'AnnPITValue_Pct', 'HasFutureDateError'] +
        [c for c in freq_cols if c in working.columns]
    )

    # Drop helper columns that are only needed for intermediate computations
    drop_cols = [
        c for c in working.columns
        if c.endswith('_OriginFP') or c in ['QNUM', 'SNUM', 'TNUM', 'TrueValue_Date']
    ]
    working.drop(columns=drop_cols, inplace=True, errors='ignore')

    # Select the final columns in the order defined by keep_cols
    annualized_processed = working.reindex(columns=keep_cols)

    # -------------------------------------------------------------------------
    # 12) Save full and subset outputs
    # -------------------------------------------------------------------------
    # NOTE(review): these checks only run here, after all computation above
    # has finished; a missing global is reported late.
    assert 'Temp_file_path_DP' in globals(), "Temp_file_path_DP not found."
    assert 'base_output_filename' in globals(), "base_output_filename not found (set earlier)."

    # Full output, pipe-separated, without the DataFrame index
    out_full = os.path.join(Temp_file_path_DP, f"{base_output_filename}.txt")
    annualized_processed.to_csv(out_full, sep='|', index=False)
    print("\nSaved full:", out_full)

    # NEW: AnnPITValue_Period is included in the subset and appears before
    # AnnPITValue.
    subset_cols = [
        "ID", "PIT Date", "CompanyName", "HistCurrency",
        "FiscalPeriod", "AnnPITValue_Period", "AnnPITValue"
    ]
    subset_cols_existing = [col for col in subset_cols if col in annualized_processed.columns]
    subset_df = annualized_processed[subset_cols_existing].copy()
    out_subset = os.path.join(Temp_file_path_DP, f"{base_output_filename}_subset.txt")
    subset_df.to_csv(out_subset, sep='|', index=False)
    print("Saved subset:", out_subset)
    del subset_df

    # -------------------------------------------------------------------------
    # 13) Row-accounting overview
    # -------------------------------------------------------------------------
    # Input rows should equal excluded + quality-dropped + output rows; any
    # difference is reported as a mismatch.
    output_rows = len(annualized_processed)
    print("\n=== Row Accounting ===")
    print(f"Input rows:                     {input_rows:,}")
    print(f"Excluded by Frequency (E/L/R/U):{excluded_rows:,}")
    print(f"Dropped by quality (Pct rules): {dropped_quality_rows:,}")
    print(f"Output rows (final):            {output_rows:,}")
    check_total = excluded_rows + dropped_quality_rows + output_rows
    print(f"Check: excluded + dropped + output = {check_total:,}")
    if check_total == input_rows:
        print("Row counts reconcile exactly.")
    else:
        print(f"Mismatch of {input_rows - check_total:+,} rows. Please investigate.")

    gc.collect()

else:
    print("annualized_encoded not found or None; skipping.")

Input dataset contains 3,568,264 rows before processing.


=== Future-date check (period dates > PIT Date) ===
Per-label violations: {'A_Date': 0, 'Q1_Date': 0, 'Q2_Date': 0, 'Q3_Date': 0, 'Q4_Date': 0, 'S1_Date': 0, 'S2_Date': 0, 'T1_Date': 0, 'T2_Date': 0, 'T3_Date': 0}
Rows with ANY future-dated period value: 0

=== AnnPITValue_Pct summary — BEFORE quality drop ===
         finite_rows: 3466846
                mean: 1573506.9621457502
              median: 100.0
winsorized_mean_1pct: 99.4730978559391
                 p10: 81.03415010611616
                 p20: 94.70694669530315
                 p30: 100.0
                 p40: 100.0
                 p50: 100.0
                 p60: 100.0
                 p70: 100.0
                 p80: 100.0
                 p90: 110.40676222588017

Rows to drop due to AnnPITValue_Pct (±inf or >200 or <50): 118,269

=== AnnPITValue_Pct summary — AFTER quality drop ===
         finite_rows: 3348577
                mean: 99.41491224027521
          

### Annualized 6

#### Set Index

In [185]:
# =============================================================================
# SELECT A SINGLE ANNUALIZED_* ITEM AND PREPARE PATHS
# =============================================================================
# This cell:
#   1. Selects which Annualized_* item (from annualized_vars) should be processed.
#   2. Validates that annualized_vars and Temp_file_path_DP are available.
#   3. Builds the input file path for the selected "work_subset_<item>.txt".
#   4. Defines a base_output_filename used later when saving processed results.
#   5. Ensures the data-preparation temp directory exists.
#
# Usage:
#   - Change `annualized_index` to process a different Annualized_* dataset
#     (e.g. 1, 2, 10, ...).
#   - Assumes `annualized_vars` was created earlier (mapping "Annualized_n" to
#     sanitized item names) and `Temp_file_path_DP` was set in your environment
#     setup cell.
# =============================================================================

import os
from pathlib import Path

# 1) Choose which Annualized_* item to run
annualized_index = 6  # change this to re-run a different dataset

# annualized_vars should look like: {'Annualized_1': 'SomeItem', ...}
# Explicit raises are used instead of `assert` so these sanity checks are
# still executed when Python runs with -O (which strips assert statements).
if 'annualized_vars' not in globals():
    raise NameError("annualized_vars dict not found in globals().")

# Build the key and fetch the corresponding sanitized item name.
# A missing key and an empty item name are both treated as "not found".
item_key = f"Annualized_{annualized_index}"
target_item_name = annualized_vars.get(item_key)
if not target_item_name:
    raise KeyError(f"{item_key} not found in annualized_vars.")

print(f"Selected: {item_key}  ->  ItemName: '{target_item_name}'")

# 2) Construct file paths based on the selected item
if 'Temp_file_path_DP' not in globals():
    raise NameError("Temp_file_path_DP not found.")

# Input file for the selected item: <Temp_file_path_DP>/work_subset_<item>.txt
file_name = f"work_subset_{target_item_name}.txt"
file_path = os.path.join(Temp_file_path_DP, file_name)

# Base name for all output files created in the annualized pipeline
base_output_filename = f"Annualized_{target_item_name}_complete"

# 3) Ensure the output directory exists (no error if it is already there)
Path(Temp_file_path_DP).mkdir(parents=True, exist_ok=True)


Selected: Annualized_6  ->  ItemName: 'Deferred_Taxes'


#### Import relevant data



In [186]:
# =============================================================================
# LOAD THE FULL DATASET FOR THE SELECTED SPECIAL ITEM (ANNUALIZED VERSION)
# =============================================================================
# This cell:
#   1. Uses `target_item_name` and `file_path` (defined in the previous cell)
#      to load the corresponding work_subset file.
#   2. Imports the file using `import_file_to_dataframe`.
#   3. Performs safety checks for existence and emptiness.
#   4. Shows a preview of the loaded dataset.
#   5. Falls back to an empty DataFrame if loading fails.
#   6. Runs garbage collection afterwards.
# =============================================================================

print(f"\nImporting full annualized dataset for Item: '{target_item_name}' ...")

if not os.path.exists(file_path):
    # No input file on disk for this item: continue with an empty frame
    print(f"File not found: {file_path}")
    annualized_raw = pd.DataFrame()
else:
    annualized_raw = import_file_to_dataframe(file_path)

    if annualized_raw is None or annualized_raw.empty:
        # Loader returned nothing usable: normalise to an empty frame
        print("Annualized dataset appears empty or could not be loaded.")
        annualized_raw = pd.DataFrame()
    else:
        print(f"Full annualized dataset loaded successfully: {len(annualized_raw):,} rows total.")
        try:
            # Rich preview when a display hook is available
            display(annualized_raw.head())
        except Exception:
            # Plain-text preview otherwise
            print(annualized_raw.head().to_string(index=False))

# Release memory held by temporaries created during the import
gc.collect()



Importing full annualized dataset for Item: 'Deferred_Taxes' ...
Full annualized dataset loaded successfully: 2,861,796 rows total.


Unnamed: 0,ID,CompanyName,ImplCountry,CurrentCurrency,HistCurrency,PIT Date,Frequency,UpdateCode,FiscalPeriod,FYE Month,ItemCode,Value
0,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1998-07-03,A,3,1997,December,3263,0.0
1,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1999-10-01,A,3,1998,December,3263,1.90201
2,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,2000-05-19,A,2,1999,October,3263,1.252426
3,C02520200,ACINDAR INDUSTRIA ARGENTINA DE ACEROS SA,Argentina,Ars,Ars,1996-05-03,A,3,1987,June,3263,0.0026782265
4,C02520200,ACINDAR INDUSTRIA ARGENTINA DE ACEROS SA,Argentina,Ars,Ars,1996-05-03,A,3,1991,June,3263,0.0


0

#### Encode Frequency Code (Check of output required!)

In [187]:
# =============================================================================
# FISCAL PERIOD ENCODING FOR ANNUALIZED DATASET
# =============================================================================
# This cell:
#   1. Defines helper functions:
#        - last2: extract last two digits of a number as a zero-padded string.
#        - add_str_fiscalprd: create Str_FiscalPrd from numeric FiscalPeriod
#          and Frequency, derive an implied full-year FiscalPeriod, and check
#          for inconsistencies on annual rows.
#   2. Applies this encoding to `annualized_raw` (if available) and stores
#      the result in `annualized_encoded`.
#   3. Shows a preview of the encoded DataFrame.
#
# Assumptions:
#   - `annualized_raw` has already been loaded in a previous cell.
#   - `target_item_name` is defined and is just used for printing context.
#   - DataFrame contains at least the columns: 'Frequency', 'FiscalPeriod'.
# =============================================================================

import numpy as np
import pandas as pd
from IPython.display import display


def last2(n):
    """
    Give the final two digits of a number as text, padded with zeros.

    Missing input (anything pd.isna reports as missing) yields None.

    Examples:
        2023 -> "23"
        85   -> "85"
        5    -> "05"
        NaN  -> None
    """
    if pd.isna(n):
        return None
    # Pad the integer to a width of four (sign-aware), then keep the tail.
    padded = str(int(n)).zfill(4)
    return padded[-2:]


def add_str_fiscalprd(df: pd.DataFrame) -> pd.DataFrame:
    """
    Build 'Str_FiscalPrd' and overwrite 'FiscalPeriod' with an implied full year.

    Logic:
      1) Normalize Frequency to an uppercase string (missing -> "").
      2) For each row, interpret the numeric FiscalPeriod depending on
         Frequency and create a string fiscal-period label Str_FiscalPrd:
           - C, Q, E, R: quarter-based  -> "Q{1-4}Y{yy}"  (fp % 4 + 1, fp // 4)
           - A, B:       annual         -> "Y{yy}"
           - F, S:       semiannual     -> "S{1-2}Y{yy}"  (fp % 2 + 1, fp // 2)
           - K, T, L, U: trimester-like -> "T{1-3}Y{yy}"  (fp % 3 + 1, fp // 3)
         Rows with any other Frequency keep a missing Str_FiscalPrd.
      3) Extract the "yy" part from Str_FiscalPrd and map it to a full year:
           yy >= 80 -> 19yy (e.g. "85" -> 1985)
           yy <  80 -> 20yy (e.g. "23" -> 2023)
      4) For annual rows (A, B), compare the implied year with the original
         FiscalPeriod and print a short discrepancy summary (first rows are
         shown via display()).
      5) Overwrite 'FiscalPeriod' with the implied year and drop the helper
         columns used for the check.

    Parameters:
      df : DataFrame with at least 'Frequency' and 'FiscalPeriod'; 'ID' is
           additionally read when discrepancies are reported.

    Returns:
      A new DataFrame (the input is not modified) with:
        - 'Str_FiscalPrd' (object dtype; missing where no label applies)
        - updated 'FiscalPeriod' (implied full year)
    """
    df = df.copy()

    # Normalize frequency codes for consistent logic
    df["Frequency"] = df["Frequency"].str.upper().fillna("")

    # Keep the untouched FiscalPeriod so it can be shown in the discrepancy report
    df["Original_FiscalPeriod"] = df["FiscalPeriod"]

    # Numeric version of FiscalPeriod for modular arithmetic
    fp = pd.to_numeric(df["FiscalPeriod"], errors="coerce")

    # Frequency masks
    m_quarter = df["Frequency"].isin(["C", "Q", "E", "R"])
    m_AB      = df["Frequency"].isin(["A", "B"])
    m_FS      = df["Frequency"].isin(["F", "S"])
    m_KTLU    = df["Frequency"].isin(["K", "T", "L", "U"])

    # The label column must be object dtype from the start: a float64 column
    # (plain np.nan) makes every string assignment below an incompatible-dtype
    # write (FutureWarning today, an error in future pandas) and breaks the
    # .str accessor further down when no row matches any mask.
    df["Str_FiscalPrd"] = pd.Series(np.nan, index=df.index, dtype="object")

    # --- Quarter-based (C, Q, E, R) ---
    # Quarter number: 1..4
    q_part = ((fp % 4) + 1).where(m_quarter)
    # Year component (integer), then reduced to last 2 digits
    q_year = (fp // 4).where(m_quarter).apply(last2)
    df.loc[m_quarter, "Str_FiscalPrd"] = (
        "Q" + q_part.astype("Int64").astype(str) + "Y" + q_year.fillna("")
    )

    # --- Annual (A, B) ---
    ab_year = fp.where(m_AB).apply(last2)
    df.loc[m_AB, "Str_FiscalPrd"] = "Y" + ab_year.fillna("")

    # --- Semiannual (F, S) ---
    fs_sem  = ((fp % 2) + 1).where(m_FS)     # semester index 1 or 2
    fs_year = (fp // 2).where(m_FS).apply(last2)
    df.loc[m_FS, "Str_FiscalPrd"] = (
        "S" + fs_sem.astype("Int64").astype(str) + "Y" + fs_year.fillna("")
    )

    # --- Trimester-like (K, T, L, U) ---
    t_term = ((fp % 3) + 1).where(m_KTLU)    # term index 1..3
    t_year = (fp // 3).where(m_KTLU).apply(last2)
    df.loc[m_KTLU, "Str_FiscalPrd"] = (
        "T" + t_term.astype("Int64").astype(str) + "Y" + t_year.fillna("")
    )

    # --- Derive implied full-year FiscalPeriod from Str_FiscalPrd ---
    # Extract the "yy" part following "Y" in labels like "Q1Y23", "Y21", etc.
    year_part = df["Str_FiscalPrd"].str.extract(r"Y(\d{2})", expand=False)
    year_numeric = pd.to_numeric(year_part, errors="coerce")

    # Map yy to either 19yy or 20yy, depending on cutoff at 80
    df["ImplFiscPer_Calculated"] = year_numeric.apply(
        lambda x: int(f"19{int(x):02d}") if pd.notna(x) and x >= 80
        else (int(f"20{int(x):02d}") if pd.notna(x) else np.nan)
    )

    # --- Discrepancy check for annual rows (A,B only) ---
    annual_rows_for_check = df[m_AB].copy()
    implied_annual = annual_rows_for_check["ImplFiscPer_Calculated"]
    # Converted once and reused for both the equality and the both-missing test
    original_annual = pd.to_numeric(
        annual_rows_for_check["Original_FiscalPeriod"],
        errors="coerce"
    )
    discrepancy_mask_annual = ~(
        # Case 1: numeric equality
        (implied_annual == original_annual)
        |
        # Case 2: both fields are NaN
        (implied_annual.isna() & original_annual.isna())
    )
    discrepancy_rows = annual_rows_for_check[discrepancy_mask_annual].copy()

    if not discrepancy_rows.empty:
        print(
            "\nDiscrepancies between original FiscalPeriod and calculated "
            "ImplFiscPer for annual (A, B) rows:"
        )
        display(
            discrepancy_rows[
                ["ID", "Frequency", "Original_FiscalPeriod",
                 "Str_FiscalPrd", "ImplFiscPer_Calculated"]
            ].head()
        )
        print(f"Total discrepancies for annual frequencies: {len(discrepancy_rows)}")
    else:
        print(
            "\nNo discrepancies between original FiscalPeriod and calculated "
            "ImplFiscPer for annual (A, B) rows."
        )

    # Overwrite FiscalPeriod with the implied year
    df["FiscalPeriod"] = df["ImplFiscPer_Calculated"]

    # Remove helper columns that are no longer needed
    df.drop(columns=["Original_FiscalPeriod", "ImplFiscPer_Calculated"], inplace=True)

    return df


# -----------------------------------------------------------------------------
# Run the fiscal-period encoding on the loaded Annualized dataset
# -----------------------------------------------------------------------------
if "annualized_raw" not in globals() or annualized_raw is None or annualized_raw.empty:
    # Nothing usable was loaded: mark the encoded result as unavailable
    print("annualized_raw not found or empty. Cannot perform encoding.")
    annualized_encoded = None
else:
    print(f"Applying fiscal period encoding to Annualized dataset for '{target_item_name}' ...")
    annualized_encoded = add_str_fiscalprd(annualized_raw)
    # Preview the first rows, including the new Str_FiscalPrd column
    display(annualized_encoded.head())


Applying fiscal period encoding to Annualized dataset for 'Deferred_Taxes' ...


  df.loc[m_quarter, "Str_FiscalPrd"] = (



No discrepancies between original FiscalPeriod and calculated ImplFiscPer for annual (A, B) rows.


Unnamed: 0,ID,CompanyName,ImplCountry,CurrentCurrency,HistCurrency,PIT Date,Frequency,UpdateCode,FiscalPeriod,FYE Month,ItemCode,Value,Str_FiscalPrd
0,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1998-07-03,A,3,1997,December,3263,0.0,Y97
1,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1999-10-01,A,3,1998,December,3263,1.90201,Y98
2,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,2000-05-19,A,2,1999,October,3263,1.252426,Y99
3,C02520200,ACINDAR INDUSTRIA ARGENTINA DE ACEROS SA,Argentina,Ars,Ars,1996-05-03,A,3,1987,June,3263,0.0026782265,Y87
4,C02520200,ACINDAR INDUSTRIA ARGENTINA DE ACEROS SA,Argentina,Ars,Ars,1996-05-03,A,3,1991,June,3263,0.0,Y91


#### Annualize data with most recent information (Check of output required!)

In [188]:
# =============================================================================
# ANNUALIZED PIPELINE: BUILD AnnPITValue FROM A/Q/S/T, QC, AND SAVE OUTPUT
# =============================================================================
# High-level overview:
#
#   1. Helper utilities
#      - _key, asof_numpy:
#          * Implement a fast, vectorized “as-of” join:
#              For each row in a left DataFrame, find the latest value in a
#              right DataFrame with the same keys and PIT Date <= left PIT Date.
#      - pctile, summarize_pct:
#          * Compute robust summary statistics for quality control, including
#            winsorized mean and decile percentiles.
#      - pick_latest_nonzero_within_year:
#          * For a given PIT Date and FiscalPeriod, evaluate all available
#            period values (A, Q1..Q4, S1..S2, T1..T3) with known origin
#            fiscal periods.
#          * Choose a single value as AnnPITValue based on:
#               - same-year vs prior-year vs other origin
#               - period priority (A > Q4 > T3 > S2 > Q3 > ... > Q1)
#               - the latest date within a one-year window before PIT.
#          * NEW: returns both the value and the period label from which it
#                 was chosen (AnnPITValue_Period).
#
#   2. Main pipeline for annualized_encoded:
#      - Filter out unsupported frequencies (E/L/R/U).
#      - Normalize types for PIT Date, FiscalPeriod, Value, and key columns.
#      - Derive QNUM, SNUM, TNUM indices from Str_FiscalPrd.
#      - Ensure all period- and date-columns (A/Q/S/T) exist.
#      - Build TrueValue from annual rows (A,B) as the last observed annual
#        value per (ID, FiscalPeriod, HistCurrency).
#      - Use asof_numpy to populate:
#           A, A_Date, A_OriginFP
#           Q1..Q4, S1..S2, T1..T3 and their dates + OriginFP (from origin FP).
#      - For each row, compute:
#           * AnnPITValue        = chosen value
#           * AnnPITValue_Period = 'A', 'Q4', 'S1', 'T3', etc.
#      - Check for any period dates that are after PIT Date (future-date errors).
#      - Compute AnnPITValue_Pct = AnnPITValue / TrueValue * 100 and drop rows
#        outside [50, 200] or with infinite ratios.
#      - Build a final, lean set of columns and save:
#           * full file:   <base_output_filename>.txt
#           * subset file: <base_output_filename>_subset.txt
#        NEW: AnnPITValue_Period is placed directly before AnnPITValue and
#             included in both full and subset outputs.
#      - Print a row-accounting overview for reconciliation.
# =============================================================================

import os
import gc
from datetime import timedelta

import numpy as np
import pandas as pd
from scipy.stats.mstats import winsorize

pd.options.mode.copy_on_write = True


# -----------------------------------------------------------------------------
# Helper: build a single key column from multiple columns
# -----------------------------------------------------------------------------
def _key(fr: pd.DataFrame, cols) -> pd.Series:
    """
    Build a composite string key by concatenating several columns with '||'.

    Each column is converted to str and the pieces are joined column-wise
    (vectorized), instead of joining row by row in Python. The strings are
    the same as '||'.join(...) over the stringified row would give.

    Parameters:
        fr   : DataFrame holding the key columns.
        cols : iterable of column names, in the order they appear in the key.

    Returns:
        Series of str, indexed like `fr` (unnamed). With no columns every key
        is the empty string.

    Example:
        _key(df, ['ID', 'HistCurrency']) -> "123||USD"
    """
    columns = list(cols)
    if not columns:
        # '||'.join([]) == '' for every row
        return pd.Series("", index=fr.index, dtype="object")

    key = fr[columns[0]].astype(str)
    for name in columns[1:]:
        # Same index on both operands, so this is a plain element-wise concat
        key = key + "||" + fr[name].astype(str)

    key.name = None
    return key


# -----------------------------------------------------------------------------
# Helper: fast as-of join (right.PIT <= left.PIT)
# -----------------------------------------------------------------------------
def asof_numpy(left_df: pd.DataFrame, right_df: pd.DataFrame, by_cols: list[str]):
    """
    For each row in left_df, find the latest (as-of) Value from right_df such that:

        1) by_cols are equal on both sides (e.g. ID, HistCurrency, ItemCode, FiscalPeriod)
        2) right_df['PIT Date'] <= left_df['PIT Date']

    Implementation notes:
      - Rows with a missing key column or PIT Date (and, on the right, a
        missing Value) are ignored; so are rows whose PIT Date cannot be
        parsed as a date.
      - Both PIT Date columns are converted to datetime and floored to days.
      - A composite string key is built from by_cols on both sides via _key.
      - The right side is sorted by key and PIT Date; for every distinct key
        the matching slice of dates and values is kept in a dict.
      - Left rows are grouped by key; within each group np.searchsorted
        locates the last right date <= each left date.
      - Results are written by row POSITION in left_df, so the function does
        not depend on left_df having a default 0..n-1 index.

    Returns
    -------
    out_vals : np.ndarray
        Array of matched values (float64), NaN where no match. One entry per
        row of left_df, in left_df's row order.
    out_dates : np.ndarray
        Array of matched dates (datetime64[ns]), NaT where no match.
    """
    # Initialize output arrays with NaNs/NaT
    out_vals  = np.full(len(left_df), np.nan, dtype='float64')
    out_dates = np.full(len(left_df), 'NaT', dtype='datetime64[ns]')

    # Required columns: keys plus PIT Date on the left, plus Value on the right
    left_req  = by_cols + ['PIT Date']
    right_req = by_cols + ['PIT Date', 'Value']

    # Drop rows with missing key or PIT Date on either side
    lmask = left_df[left_req].notna().all(axis=1)
    rmask = right_df[right_req].notna().all(axis=1)
    if not lmask.any() or not rmask.any():
        return out_vals, out_dates

    # Row positions (not index labels) of the left rows that take part;
    # these are valid offsets into out_vals / out_dates for any index.
    l_pos = np.flatnonzero(lmask.to_numpy())

    l = left_df.loc[lmask, left_req].copy()
    r = right_df.loc[rmask, right_req].copy()

    # Normalize PIT Date columns to datetime, day precision
    l['PIT Date'] = pd.to_datetime(l['PIT Date'], errors='coerce').dt.floor('D')
    r['PIT Date'] = pd.to_datetime(r['PIT Date'], errors='coerce').dt.floor('D')

    # errors='coerce' may have produced NaT; such rows cannot take part in a
    # date comparison (a NaT left date would otherwise match the newest row).
    l_ok = l['PIT Date'].notna().to_numpy()
    if not l_ok.all():
        l = l.loc[l_ok].copy()
        l_pos = l_pos[l_ok]
    r_ok = r['PIT Date'].notna().to_numpy()
    if not r_ok.all():
        r = r.loc[r_ok].copy()
    if l.empty or r.empty:
        return out_vals, out_dates

    # Composite keys for grouping
    lk = _key(l, by_cols).to_numpy()
    r['__k'] = _key(r, by_cols)

    # Sort right by key and PIT Date so we can slice by key and binary-search by date
    r = r.sort_values(['__k', 'PIT Date']).reset_index(drop=True)

    # Convert right side to NumPy arrays
    rk   = r['__k'].to_numpy()
    rdt  = r['PIT Date'].to_numpy()
    rval = r['Value'].to_numpy()

    # Find unique keys and the start index of each key block in rk
    uniq, first = np.unique(rk, return_index=True)

    # Pre-slice rdt, rval for each key to avoid repeated filtering
    slices = {}
    for i, k in enumerate(uniq):
        s = first[i]
        e = first[i + 1] if i + 1 < len(first) else len(r)
        slices[k] = (rdt[s:e], rval[s:e])

    ldt = l['PIT Date'].to_numpy()

    # Sort left keys (stable) so that identical keys form contiguous blocks
    order = np.argsort(lk, kind='mergesort')
    sk, sd, sp = lk[order], ldt[order], l_pos[order]

    # Process each contiguous block of the same key
    i = 0
    n = len(sk)
    while i < n:
        k = sk[i]
        j = i + 1
        # identify the block [i, j) with the same key
        while j < n and sk[j] == k:
            j += 1

        block_dates = sd[i:j]
        block_pos   = sp[i:j]

        if k in slices:
            r_dates, r_vals = slices[k]
            # side='right' gives the index of the first element > date;
            # minus 1 is the index of the last element <= date (-1: none).
            pos   = np.searchsorted(r_dates, block_dates, side='right') - 1
            valid = pos >= 0
            if np.any(valid):
                out_vals[block_pos[valid]]  = r_vals[pos[valid]]
                out_dates[block_pos[valid]] = r_dates[pos[valid]]
        i = j

    return out_vals, out_dates


# -----------------------------------------------------------------------------
# Small helpers for QC statistics
# -----------------------------------------------------------------------------
def pctile(s: pd.Series, q: float):
    """
    Quantile of `s` at level `q` (linear interpolation).

    Any exception raised while computing it is swallowed and NaN is
    returned instead.
    """
    try:
        result = s.quantile(q, interpolation='linear')
    except Exception:
        result = np.nan
    return result


def summarize_pct(series: pd.Series):
    """
    Summarize the finite observations of a numeric series.

    +/-inf and NaN are discarded first. If nothing is left, an empty dict
    is returned. Otherwise the dict holds, in this order:
      - finite_rows: count of finite observations
      - mean, median
      - winsorized_mean_1pct: mean after winsorizing 1% in each tail
      - p10, p20, ..., p90: deciles (via pctile)
    """
    finite = series.replace([np.inf, -np.inf], np.nan).dropna()
    if finite.empty:
        return {}

    # Hand winsorize() its own copy so it always receives a writable array
    clipped = winsorize(finite.to_numpy().copy(), limits=[0.01, 0.01])

    summary = {
        "finite_rows": len(finite),
        "mean": finite.mean(),
        "median": finite.median(),
        "winsorized_mean_1pct": clipped.mean(),
    }
    # Deciles 10%..90%, keyed "p10".."p90"
    summary.update(
        {f"p{pct}": pctile(finite, pct / 100) for pct in range(10, 100, 10)}
    )
    return summary


# -----------------------------------------------------------------------------
# Period prioritization and label helper
# -----------------------------------------------------------------------------
# Ranking of period labels, strongest first: 'A' (annual) gets 100 and each
# following label 10 less, down to 'Q1' with 10.
_PERIOD_PRIORITY = {
    label: 100 - 10 * position
    for position, label in enumerate(
        ('A', 'Q4', 'T3', 'S2', 'Q3', 'T2', 'S1', 'Q2', 'T1', 'Q1')
    )
}


def _label_from_colname(colname: str) -> str:
    """
    Translate a value column name into its period label.

    At present the label equals the column name: 'A' maps to 'A', and any
    other name (e.g. 'Q1'..'Q4', 'S1', 'T3') is returned as given.
    """
    if colname == 'A':
        return 'A'
    return colname


# -----------------------------------------------------------------------------
# AnnPITValue selection using OriginFP and priority rules
# -----------------------------------------------------------------------------
def pick_latest_nonzero_within_year(
    row,
    value_cols,
    date_cols,
    pit_col='PIT Date',
    fp_col='FiscalPeriod'
):
    """
    Choose one annualized value (AnnPITValue) and its period label for a row.

    Inputs read from `row`:
      - the period values in `value_cols` and their dates in `date_cols`
        (paired positionally),
      - '<label>_OriginFP' for each period label, when present,
      - the row's PIT date (`pit_col`) and fiscal period (`fp_col`).

    Steps:
      1) A period becomes a candidate only if its date is present, not after
         the PIT date, and not more than 365 days before it.
      2) Each candidate is tagged with how its origin fiscal period relates
         to the row's fiscal period FP:
           same    : OriginFP == FP
           prior   : OriginFP == FP - 1
           other   : any other year
           unknown : FP or OriginFP unavailable
         A missing OriginFP falls back to the row's FP.
      3) Candidates with a non-NaN, non-zero value are tried in stages; the
         first stage with a match decides:
           a) same-year 'A'                -> latest date
           b) same-year partial (not 'A')  -> highest priority, then latest date
           c) prior-year 'A'               -> latest date
           d) anything left                -> highest priority, then latest date
      4) If only zero-valued candidates exist, 0.0 is returned with the label
         of the best zero (highest priority, then latest date).
      5) With no candidate at all, or only NaN values, (NaN, NaN) is returned.

    Returns
    -------
    (value, label)
      value : float or NaN
      label : str or NaN (e.g. 'A', 'Q4', 'S1', 'T3')
    """
    nothing = (np.nan, np.nan)

    pit_date = row[pit_col]
    if pd.isna(pit_date):
        return nothing

    window_start = pit_date - timedelta(days=365)

    # Fiscal period of the row itself, as int when available
    raw_fp = row.get(fp_col, np.nan)
    try:
        row_fp = None if pd.isna(raw_fp) else int(raw_fp)
    except Exception:
        row_fp = None

    # Each entry: (label, priority, date, value, year relation)
    pool = []
    for value_name, date_name in zip(value_cols, date_cols):
        if value_name not in row or date_name not in row:
            continue

        raw_value = row[value_name]
        stamp = row[date_name]

        # Skip missing dates and dates after the PIT date
        if pd.isna(stamp) or stamp > pit_date:
            continue

        stamp = pd.to_datetime(stamp, errors='coerce')
        if pd.isna(stamp):
            continue

        stamp = stamp.floor('D')
        if stamp < window_start:
            # more than one year before the PIT date
            continue

        period = _label_from_colname(value_name)
        rank = _PERIOD_PRIORITY.get(period, -1)

        amount = np.nan if pd.isna(raw_value) else float(raw_value)

        # Origin fiscal period; without one, assume the row's own FP
        origin = row.get(f'{period}_OriginFP', np.nan)
        if pd.isna(origin):
            origin = row_fp
        try:
            if origin is None or pd.isna(origin):
                origin = None
            else:
                origin = int(origin)
        except Exception:
            origin = row_fp

        # Relation of the origin year to the row's fiscal period
        if row_fp is None or origin is None:
            relation = 'unknown'
        elif origin == row_fp:
            relation = 'same'
        elif origin == row_fp - 1:
            relation = 'prior'
        else:
            relation = 'other'

        pool.append((period, rank, stamp, amount, relation))

    if not pool:
        return nothing

    def by_date(entry):
        return entry[2]

    def by_rank_then_date(entry):
        return (entry[1], entry[2])

    # Strong candidates: value present and different from zero
    usable = [c for c in pool if not np.isnan(c[3]) and c[3] != 0.0]

    # Ordered selection stages: (which candidates qualify, how to rank them)
    stages = (
        (lambda c: c[0] == 'A' and c[4] == 'same', by_date),            # same-year annual
        (lambda c: c[0] != 'A' and c[4] == 'same', by_rank_then_date),  # same-year partial
        (lambda c: c[0] == 'A' and c[4] == 'prior', by_date),           # prior-year annual
        (lambda c: True, by_rank_then_date),                            # fallback
    )
    for qualifies, ranking in stages:
        shortlist = [c for c in usable if qualifies(c)]
        if shortlist:
            winner = max(shortlist, key=ranking)
            return (winner[3], winner[0])

    # Only zeros (or NaNs) remain: report an explicit 0.0 with its label
    zero_valued = [c for c in pool if not np.isnan(c[3]) and c[3] == 0.0]
    if zero_valued:
        winner = max(zero_valued, key=by_rank_then_date)
        return (winner[3], winner[0])

    return nothing


# =============================================================================
# MAIN: annualized_encoded -> annualized_processed
# =============================================================================
if 'annualized_encoded' in globals() and annualized_encoded is not None:
    input_rows = len(annualized_encoded)
    print(f"Input dataset contains {input_rows:,} rows before processing.\n")

    # Work on a copy to avoid mutating the original DataFrame
    working = annualized_encoded.copy()

    # -------------------------------------------------------------------------
    # 1) Exclude frequencies that are not supported by this pipeline (E/L/R/U)
    # -------------------------------------------------------------------------
    excl_mask = working['Frequency'].astype(str).str.upper().isin(['E', 'L', 'R', 'U'])
    excluded_rows = int(excl_mask.sum())
    working = working.loc[~excl_mask].copy()

    # -------------------------------------------------------------------------
    # 2) Basic type normalization
    # -------------------------------------------------------------------------
    # PIT Date as datetime (day precision)
    working['PIT Date'] = pd.to_datetime(
        working['PIT Date'], format='%Y-%m-%d', errors='coerce'
    ).dt.floor('D')

    # FiscalPeriod and Value as numeric
    working['FiscalPeriod'] = pd.to_numeric(working['FiscalPeriod'], errors='coerce')
    working['Value']        = pd.to_numeric(working['Value'], errors='coerce')

    # Key-like columns as string (consistent joins and as-of keys)
    for c in ['ID', 'HistCurrency', 'ItemCode', 'Frequency', 'Str_FiscalPrd']:
        if c in working.columns:
            working[c] = working[c].astype(str)

    # -------------------------------------------------------------------------
    # 3) Parse Q/S/T sequence numbers from Str_FiscalPrd
    # -------------------------------------------------------------------------
    # Extract quarter index 1..4 from strings like "Q1Y23"
    working['QNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^Q([1-4])Y', expand=False),
        errors='coerce'
    )
    # Extract semiannual index 1..2 from "S1Y23"
    working['SNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^S([1-2])Y', expand=False),
        errors='coerce'
    )
    # Extract trimester index 1..3 from "T1Y23"
    working['TNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^T([1-3])Y', expand=False),
        errors='coerce'
    )

    # -------------------------------------------------------------------------
    # 4) Ensure A/Q/S/T value and date columns exist
    # -------------------------------------------------------------------------
    period_vals = [f'Q{i}' for i in range(1, 5)] + \
                  [f'S{i}' for i in range(1, 3)] + \
                  [f'T{i}' for i in range(1, 4)] + ['A']

    period_dates = [f'{p}_Date' for p in [f'Q{i}' for i in range(1, 5)] +
                                       [f'S{i}' for i in range(1, 3)] +
                                       [f'T{i}' for i in range(1, 4)]] + ['A_Date']

    # Create missing value/date columns initialized to NaN / NaT
    for c in period_vals:
        if c not in working.columns:
            working[c] = np.nan
    for c in period_dates:
        if c not in working.columns:
            working[c] = pd.NaT

    base_keys = ['ID', 'HistCurrency', 'ItemCode', 'FiscalPeriod']

    # -------------------------------------------------------------------------
    # 5) Build TrueValue from annual (A,B) rows
    # -------------------------------------------------------------------------
    # TrueValue is the last known annual value per (ID, FiscalPeriod, HistCurrency)
    mask_annual = working['Frequency'].isin(['A', 'B']) & working['Value'].notna()
    annual_src = (
        working.loc[mask_annual,
                    ['ID', 'FiscalPeriod', 'HistCurrency', 'PIT Date', 'Value']]
        .sort_values(['ID', 'FiscalPeriod', 'HistCurrency', 'PIT Date'])
        .drop_duplicates(['ID', 'FiscalPeriod', 'HistCurrency'], keep='last')
        .rename(columns={'Value': 'TrueValue', 'PIT Date': 'TrueValue_Date'})
    )
    working = working.merge(
        annual_src,
        on=['ID', 'FiscalPeriod', 'HistCurrency'],
        how='left'
    )

    # -------------------------------------------------------------------------
    # 6) As-of mapping for each frequency (same FiscalPeriod only)
    # -------------------------------------------------------------------------
    # 6.1 Annual as-of (A, B)
    src_A = working.loc[
        working['Frequency'].isin(['A', 'B']) & working['Value'].notna(),
        base_keys + ['PIT Date', 'Value']
    ].copy()
    vA, dA = asof_numpy(working, src_A, by_cols=base_keys)
    working['A'], working['A_Date'] = vA, dA
    working['A_OriginFP'] = np.where(
        working['A'].notna(), working['FiscalPeriod'], np.nan
    )

    # 6.2 Quarterly as-of (Q, C)
    src_Q = working.loc[
        working['Frequency'].isin(['Q', 'C']) & working['QNUM'].notna(),
        base_keys + ['QNUM', 'PIT Date', 'Value']
    ].copy()
    for q in (1, 2, 3, 4):
        rv = src_Q[src_Q['QNUM'] == q].drop(columns=['QNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'Q{q}', f'Q{q}_Date'
        working[col], working[dcol] = v, d
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        mask = working[col].notna() & working[ocol].isna()
        working.loc[mask, ocol] = working.loc[mask, 'FiscalPeriod']

    # 6.3 Semiannual as-of (S, F)
    src_S = working.loc[
        working['Frequency'].isin(['S', 'F']) & working['SNUM'].notna(),
        base_keys + ['SNUM', 'PIT Date', 'Value']
    ].copy()
    for s in (1, 2):
        rv = src_S[src_S['SNUM'] == s].drop(columns=['SNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'S{s}', f'S{s}_Date'
        working[col], working[dcol] = v, d
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        mask = working[col].notna() & working[ocol].isna()
        working.loc[mask, ocol] = working.loc[mask, 'FiscalPeriod']

    # 6.4 Trimester as-of (T, K)
    src_T = working.loc[
        working['Frequency'].isin(['T', 'K']) & working['TNUM'].notna(),
        base_keys + ['TNUM', 'PIT Date', 'Value']
    ].copy()
    for t in (1, 2, 3):
        rv = src_T[src_T['TNUM'] == t].drop(columns=['TNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'T{t}', f'T{t}_Date'
        working[col], working[dcol] = v, d
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        mask = working[col].notna() & working[ocol].isna()
        working.loc[mask, ocol] = working.loc[mask, 'FiscalPeriod']

    # -------------------------------------------------------------------------
    # 7) Normalize date columns (no forward-fill; only directly mapped values)
    # -------------------------------------------------------------------------
    working = working.sort_values(['ID', 'HistCurrency', 'FiscalPeriod', 'PIT Date'])

    value_labels  = period_vals
    date_labels   = period_dates
    origin_labels = [f'{lbl}_OriginFP' for lbl in value_labels]

    for c in date_labels:
        if c in working.columns:
            working[c] = pd.to_datetime(working[c], errors='coerce').dt.floor('D')

    # -------------------------------------------------------------------------
    # 8) Compute AnnPITValue and AnnPITValue_Period
    # -------------------------------------------------------------------------
    # NEW: we compute both the chosen annualized value and the period label
    # it came from (e.g., 'A', 'Q4', 'S1', 'T3') and store the label in
    # AnnPITValue_Period.
    ann_res = working.apply(
        lambda r: pd.Series(
            pick_latest_nonzero_within_year(
                r,
                value_cols=value_labels,
                date_cols=date_labels,
                pit_col='PIT Date',
                fp_col='FiscalPeriod'
            ),
            index=['AnnPITValue', 'AnnPITValue_Period']
        ),
        axis=1
    )
    working = pd.concat([working, ann_res], axis=1)

    # -------------------------------------------------------------------------
    # 9) Future-date QC check (period dates > PIT Date)
    # -------------------------------------------------------------------------
    date_cols_all = [
        'A_Date',
        'Q1_Date', 'Q2_Date', 'Q3_Date', 'Q4_Date',
        'S1_Date', 'S2_Date',
        'T1_Date', 'T2_Date', 'T3_Date'
    ]
    present = [c for c in date_cols_all if c in working.columns]
    viol_counts = {}
    any_mask = pd.Series(False, index=working.index)

    for c in present:
        m = (
            working[c].notna() &
            working['PIT Date'].notna() &
            (pd.to_datetime(working[c], errors='coerce') > working['PIT Date'])
        )
        viol_counts[c] = int(m.sum())
        any_mask |= m

    total_future_viol = int(any_mask.sum())
    print("\n=== Future-date check (period dates > PIT Date) ===")
    print("Per-label violations:", viol_counts)
    print(f"Rows with ANY future-dated period value: {total_future_viol}")
    working['HasFutureDateError'] = any_mask

    # -------------------------------------------------------------------------
    # 10) Compute AnnPITValue_Pct vs TrueValue and apply quality filter
    # -------------------------------------------------------------------------
    working['AnnPITValue_Pct'] = np.where(
        working['AnnPITValue'].notna() &
        working['TrueValue'].notna() &
        (working['TrueValue'] != 0),
        (working['AnnPITValue'] / working['TrueValue']) * 100,
        np.nan
    )

    pre_stats = summarize_pct(working['AnnPITValue_Pct'])
    print("\n=== AnnPITValue_Pct summary — BEFORE quality drop ===")
    for k, v in pre_stats.items():
        print(f"{k:>20}: {v}")

    pct = working['AnnPITValue_Pct']
    is_inf = np.isinf(pct)
    is_finite = np.isfinite(pct)
    out_of_range = is_finite & ((pct > 200) | (pct < 50))
    to_drop_quality = is_inf | out_of_range

    dropped_quality_rows = int(to_drop_quality.sum())
    print(f"\nRows to drop due to AnnPITValue_Pct (±inf or >200 or <50): {dropped_quality_rows:,}")

    working = working.loc[~to_drop_quality].copy()

    post_stats = summarize_pct(working['AnnPITValue_Pct'])
    print("\n=== AnnPITValue_Pct summary — AFTER quality drop ===")
    if post_stats:
        for k, v in post_stats.items():
            print(f"{k:>20}: {v}")
    else:
        print("No finite values remain after the quality drop.")

    # -------------------------------------------------------------------------
    # 11) Final columns and export
    # -------------------------------------------------------------------------
    base_cols = [
        'ID', 'CompanyName', 'ImplCountry', 'CurrentCurrency', 'HistCurrency',
        'PIT Date', 'Frequency', 'UpdateCode', 'FiscalPeriod', 'FYE Month',
        'ItemCode', 'Value', 'Str_FiscalPrd'
    ]

    freq_cols = []
    for i in range(1, 5):
        freq_cols += [f'Q{i}_Date', f'Q{i}']
    for i in range(1, 3):
        freq_cols += [f'S{i}_Date', f'S{i}']
    for i in range(1, 4):
        freq_cols += [f'T{i}_Date', f'T{i}']
    freq_cols += ['A_Date', 'A']

    # NEW: AnnPITValue_Period is included and explicitly placed
    # directly before AnnPITValue in the final column order.
    keep_cols = (
        [c for c in base_cols if c in working.columns] +
        ['TrueValue', 'AnnPITValue_Period', 'AnnPITValue',
         'AnnPITValue_Pct', 'HasFutureDateError'] +
        [c for c in freq_cols if c in working.columns]
    )

    # Drop helper columns that are only needed for intermediate computations
    drop_cols = [
        c for c in working.columns
        if c.endswith('_OriginFP') or c in ['QNUM', 'SNUM', 'TNUM', 'TrueValue_Date']
    ]
    working.drop(columns=drop_cols, inplace=True, errors='ignore')

    annualized_processed = working.reindex(columns=keep_cols)

    # -------------------------------------------------------------------------
    # 12) Save full and subset outputs
    # -------------------------------------------------------------------------
    assert 'Temp_file_path_DP' in globals(), "Temp_file_path_DP not found."
    assert 'base_output_filename' in globals(), "base_output_filename not found (set earlier)."

    out_full = os.path.join(Temp_file_path_DP, f"{base_output_filename}.txt")
    annualized_processed.to_csv(out_full, sep='|', index=False)
    print("\nSaved full:", out_full)

    # NEW: AnnPITValue_Period is included in the subset and appears before
    # AnnPITValue.
    subset_cols = [
        "ID", "PIT Date", "CompanyName", "HistCurrency",
        "FiscalPeriod", "AnnPITValue_Period", "AnnPITValue"
    ]
    subset_cols_existing = [col for col in subset_cols if col in annualized_processed.columns]
    subset_df = annualized_processed[subset_cols_existing].copy()
    out_subset = os.path.join(Temp_file_path_DP, f"{base_output_filename}_subset.txt")
    subset_df.to_csv(out_subset, sep='|', index=False)
    print("Saved subset:", out_subset)
    del subset_df

    # -------------------------------------------------------------------------
    # 13) Row-accounting overview
    # -------------------------------------------------------------------------
    output_rows = len(annualized_processed)
    print("\n=== Row Accounting ===")
    print(f"Input rows:                     {input_rows:,}")
    print(f"Excluded by Frequency (E/L/R/U):{excluded_rows:,}")
    print(f"Dropped by quality (Pct rules): {dropped_quality_rows:,}")
    print(f"Output rows (final):            {output_rows:,}")
    check_total = excluded_rows + dropped_quality_rows + output_rows
    print(f"Check: excluded + dropped + output = {check_total:,}")
    if check_total == input_rows:
        print("Row counts reconcile exactly.")
    else:
        print(f"Mismatch of {input_rows - check_total:+,} rows. Please investigate.")

    gc.collect()

else:
    print("annualized_encoded not found or None; skipping.")

Input dataset contains 2,861,796 rows before processing.


=== Future-date check (period dates > PIT Date) ===
Per-label violations: {'A_Date': 0, 'Q1_Date': 0, 'Q2_Date': 0, 'Q3_Date': 0, 'Q4_Date': 0, 'S1_Date': 0, 'S2_Date': 0, 'T1_Date': 0, 'T2_Date': 0, 'T3_Date': 0}
Rows with ANY future-dated period value: 0

=== AnnPITValue_Pct summary — BEFORE quality drop ===
         finite_rows: 2424156
                mean: 15695.481367535955
              median: 100.0
winsorized_mean_1pct: 95.98993743459458
                 p10: 52.828383544285686
                 p20: 90.59805665799917
                 p30: 100.0
                 p40: 100.0
                 p50: 100.0
                 p60: 100.0
                 p70: 100.0
                 p80: 100.0
                 p90: 116.12178467155692

Rows to drop due to AnnPITValue_Pct (±inf or >200 or <50): 311,755

=== AnnPITValue_Pct summary — AFTER quality drop ===
         finite_rows: 2112401
                mean: 100.5711365907513
        

### Annualized 7

#### Set Index

In [189]:
# =============================================================================
# SELECT A SINGLE ANNUALIZED_* ITEM AND PREPARE PATHS
# =============================================================================
# This cell:
#   1. Selects which Annualized_* item (from annualized_vars) should be processed.
#   2. Validates that annualized_vars and Temp_file_path_DP are available.
#   3. Builds the input file path for the selected "work_subset_<item>.txt".
#   4. Defines a base_output_filename used later when saving processed results.
#   5. Ensures the data-preparation temp directory exists.
#
# Usage:
#   - Change `annualized_index` to process a different Annualized_* dataset
#     (e.g. 1, 2, 10, ...).
#   - Assumes `annualized_vars` was created earlier (mapping "Annualized_n" to
#     sanitized item names) and `Temp_file_path_DP` was set in your environment
#     setup cell.
# =============================================================================

import os
from pathlib import Path

# --- Step 1: pick the Annualized_* dataset to process -------------------------
# Edit this number to run the pipeline for a different item.
annualized_index = 7

# Expected shape of annualized_vars: {'Annualized_1': 'SomeItem', ...}
assert 'annualized_vars' in globals(), "annualized_vars dict not found in globals()."

# Resolve "Annualized_<n>" to its sanitized item name; a missing key or an
# empty name stops the cell here.
item_key = "Annualized_" + str(annualized_index)
target_item_name = annualized_vars.get(item_key)
assert target_item_name, item_key + " not found in annualized_vars."

print("Selected: {}  ->  ItemName: '{}'".format(item_key, target_item_name))

# --- Step 2: derive input/output names from the selected item -----------------
assert 'Temp_file_path_DP' in globals(), "Temp_file_path_DP not found."

# Input: the per-item work_subset file located in the data-preparation folder.
file_name = "work_subset_{}.txt".format(target_item_name)
file_path = os.path.join(Temp_file_path_DP, file_name)

# Output: common stem reused by later cells when they save their results.
base_output_filename = "Annualized_{}_complete".format(target_item_name)

# --- Step 3: make sure the data-preparation folder is there -------------------
Path(Temp_file_path_DP).mkdir(parents=True, exist_ok=True)


Selected: Annualized_7  ->  ItemName: 'Income_Taxes_Payable'


#### Import relevant data



In [190]:
# =============================================================================
# LOAD THE FULL DATASET FOR THE SELECTED SPECIAL ITEM (ANNUALIZED VERSION)
# =============================================================================
# This cell:
#   1. Uses `target_item_name` and `file_path` (defined in the previous cell)
#      to load the corresponding work_subset file.
#   2. Imports the file using `import_file_to_dataframe`.
#   3. Performs safety checks for existence and emptiness.
#   4. Shows a preview of the loaded dataset.
#   5. Falls back to an empty DataFrame if loading fails.
#   6. Runs garbage collection afterwards.
# =============================================================================

print(f"\nImporting full annualized dataset for Item: '{target_item_name}' ...")

if not os.path.exists(file_path):
    # Nothing on disk for this item: continue with an empty frame so that
    # later cells can detect the situation via `.empty`.
    print(f"File not found: {file_path}")
    annualized_raw = pd.DataFrame()
else:
    annualized_raw = import_file_to_dataframe(file_path)

    if annualized_raw is None or annualized_raw.empty:
        # The loader returned nothing usable; normalize to an empty frame.
        print("Annualized dataset appears empty or could not be loaded.")
        annualized_raw = pd.DataFrame()
    else:
        print(f"Full annualized dataset loaded successfully: {len(annualized_raw):,} rows total.")
        # Rich preview when display() works, plain-text preview otherwise.
        try:
            display(annualized_raw.head())
        except Exception:
            print(annualized_raw.head().to_string(index=False))

# Kept as the last expression of the cell, as in the original.
gc.collect()



Importing full annualized dataset for Item: 'Income_Taxes_Payable' ...
Full annualized dataset loaded successfully: 1,979,636 rows total.


Unnamed: 0,ID,CompanyName,ImplCountry,CurrentCurrency,HistCurrency,PIT Date,Frequency,UpdateCode,FiscalPeriod,FYE Month,ItemCode,Value
0,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1992,December,3063,10.582409
1,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1993,December,3063,12.508193
2,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1994,December,3063,8.751102
3,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1996-05-03,A,3,1995,December,3063,9.041866
4,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1998-07-03,A,3,1996,December,3063,9.454759


0

#### Encode Frequency Code (Check of output required!)

In [191]:
# =============================================================================
# FISCAL PERIOD ENCODING FOR ANNUALIZED DATASET
# =============================================================================
# This cell:
#   1. Defines helper functions:
#        - last2: extract last two digits of a number as a zero-padded string.
#        - add_str_fiscalprd: create Str_FiscalPrd from numeric FiscalPeriod
#          and Frequency, derive an implied full-year FiscalPeriod, and check
#          for inconsistencies on annual rows.
#   2. Applies this encoding to `annualized_raw` (if available) and stores
#      the result in `annualized_encoded`.
#   3. Shows a preview of the encoded DataFrame.
#
# Assumptions:
#   - `annualized_raw` has already been loaded in a previous cell.
#   - `target_item_name` is defined and is just used for printing context.
#   - DataFrame contains at least the columns: 'Frequency', 'FiscalPeriod'.
# =============================================================================

import numpy as np
import pandas as pd
from IPython.display import display


def last2(n):
    """
    Give the final two digits of ``n`` as a two-character string.

    The value is truncated to an integer, left-padded with zeros to at least
    four characters, and the last two characters of that text are returned.
    Missing input (anything ``pd.isna`` reports as missing) yields ``None``.

    Examples:
        last2(2023) -> "23"
        last2(85)   -> "85"
        last2(5)    -> "05"
        last2(NaN)  -> None
    """
    if not pd.isna(n):
        # e.g. 23 -> "0023" -> "23"
        digits = str(int(n)).zfill(4)
        return digits[-2:]
    return None


def add_str_fiscalprd(df: pd.DataFrame) -> pd.DataFrame:
    """
    Build 'Str_FiscalPrd' and overwrite 'FiscalPeriod' with an implied full year.

    Logic:
      1) Work on a copy of ``df``; normalize Frequency to an uppercase string
         (missing Frequency becomes "").
      2) Coerce FiscalPeriod to a number ``fp`` and decode it per frequency
         family into the label stored in Str_FiscalPrd:
           - C, Q, E, R: "Q{(fp % 4) + 1}Y{yy}"  with yy taken from fp // 4
           - A, B:       "Y{yy}"                 with yy taken from fp itself
           - F, S:       "S{(fp % 2) + 1}Y{yy}"  with yy taken from fp // 2
           - K, T, L, U: "T{(fp % 3) + 1}Y{yy}"  with yy taken from fp // 3
         ``yy`` is the zero-padded last two digits of the year component.
         Rows whose FiscalPeriod is missing or non-numeric, or whose Frequency
         belongs to none of the families, keep a missing label (NaN) instead
         of a malformed one such as "Q<NA>Y".
      3) Extract ``yy`` from the label and map it to a full year:
           yy >= 80 -> 19yy (e.g. "85" -> 1985)
           yy <  80 -> 20yy (e.g. "23" -> 2023)
      4) For annual rows (A, B) compare that implied year with the original
         FiscalPeriod and print a short discrepancy summary.
      5) Overwrite 'FiscalPeriod' with the implied year and drop the helper
         columns used for the check.

    Parameters:
      df: frame with at least 'Frequency' and 'FiscalPeriod'; 'ID' is also
          read when discrepancies have to be shown. The input is not modified.

    Returns:
      A new DataFrame with:
        - 'Str_FiscalPrd' (object column; NaN where no label could be built)
        - updated 'FiscalPeriod' (implied full year; NaN where unknown)
    """
    df = df.copy()

    # Normalize frequency codes for consistent logic
    df["Frequency"] = df["Frequency"].str.upper().fillna("")

    # Keep the raw FiscalPeriod around for the annual consistency check
    df["Original_FiscalPeriod"] = df["FiscalPeriod"]

    # Numeric FiscalPeriod for the modular arithmetic; only rows that have one
    # can be decoded into a label.
    fp = pd.to_numeric(df["FiscalPeriod"], errors="coerce")
    has_fp = fp.notna()

    # Object dtype from the start: a float NaN column would have to be upcast
    # when the string labels are written (pandas warns about that and plans to
    # raise), and `.str` below would reject it if no label were written at all.
    df["Str_FiscalPrd"] = np.full(len(df), np.nan, dtype=object)

    def two_digit_year(year: pd.Series) -> pd.Series:
        # Vectorized equivalent of applying last2() to every element:
        # last two digits of the truncated absolute value, zero-padded.
        return (year.abs().astype("int64") % 100).astype(str).str.zfill(2)

    def assign_labels(codes, prefix, periods_per_year):
        # Write the labels of one frequency family into Str_FiscalPrd.
        rows = df["Frequency"].isin(codes) & has_fp
        if not rows.any():
            return
        encoded = fp[rows]
        if periods_per_year == 1:
            # Annual rows: FiscalPeriod already is the year.
            labels = "Y" + two_digit_year(encoded)
        else:
            # Sub-annual rows: remainder -> period index (1..n), quotient -> year.
            part = ((encoded % periods_per_year) + 1).astype("Int64").astype(str)
            labels = prefix + part + "Y" + two_digit_year(encoded // periods_per_year)
        # Positional write (plain array) so no index alignment is involved.
        df.loc[rows, "Str_FiscalPrd"] = labels.to_numpy(dtype=object)

    assign_labels(["C", "Q", "E", "R"], "Q", 4)   # quarter-based
    assign_labels(["A", "B"], "", 1)              # annual
    assign_labels(["F", "S"], "S", 2)             # semiannual
    assign_labels(["K", "T", "L", "U"], "T", 3)   # trimester-like

    # --- Derive implied full-year FiscalPeriod from Str_FiscalPrd ---
    # Extract the "yy" part following "Y" in labels like "Q1Y23", "Y21", etc.
    year_part = df["Str_FiscalPrd"].str.extract(r"Y(\d{2})", expand=False)
    year_numeric = pd.to_numeric(year_part, errors="coerce")

    # yy >= 80 -> 19yy, otherwise 20yy; a missing yy stays missing because
    # NaN plus the century is still NaN.
    century = np.where(year_numeric >= 80, 1900, 2000)
    df["ImplFiscPer_Calculated"] = year_numeric + century

    # --- Discrepancy check for annual rows (A, B only) ---
    m_AB = df["Frequency"].isin(["A", "B"])
    annual_rows = df[m_AB]
    original_year = pd.to_numeric(annual_rows["Original_FiscalPeriod"], errors="coerce")
    implied_year = annual_rows["ImplFiscPer_Calculated"]
    # Consistent = numerically equal, or both sides missing.
    consistent = (implied_year == original_year) | (
        implied_year.isna() & original_year.isna()
    )
    discrepancy_rows = annual_rows[~consistent]

    if not discrepancy_rows.empty:
        print(
            "\nDiscrepancies between original FiscalPeriod and calculated "
            "ImplFiscPer for annual (A, B) rows:"
        )
        preview = discrepancy_rows[
            ["ID", "Frequency", "Original_FiscalPeriod",
             "Str_FiscalPrd", "ImplFiscPer_Calculated"]
        ].head()
        # Rich output inside a notebook; plain text when display() is not defined.
        try:
            display(preview)
        except NameError:
            print(preview.to_string(index=False))
        print(f"Total discrepancies for annual frequencies: {len(discrepancy_rows)}")
    else:
        print(
            "\nNo discrepancies between original FiscalPeriod and calculated "
            "ImplFiscPer for annual (A, B) rows."
        )

    # Overwrite FiscalPeriod with the implied year
    df["FiscalPeriod"] = df["ImplFiscPer_Calculated"]

    # Remove helper columns that are no longer needed
    df.drop(columns=["Original_FiscalPeriod", "ImplFiscPer_Calculated"], inplace=True)

    return df


# -----------------------------------------------------------------------------
# Run the fiscal-period encoding on the loaded Annualized dataset
# -----------------------------------------------------------------------------
# Without a usable annualized_raw (undefined, None, or empty) the result is
# flagged as missing by setting annualized_encoded to None.
if globals().get("annualized_raw") is None or annualized_raw.empty:
    print("annualized_raw not found or empty. Cannot perform encoding.")
    annualized_encoded = None
else:
    print(f"Applying fiscal period encoding to Annualized dataset for '{target_item_name}' ...")
    annualized_encoded = add_str_fiscalprd(annualized_raw)
    display(annualized_encoded.head())


Applying fiscal period encoding to Annualized dataset for 'Income_Taxes_Payable' ...


  df.loc[m_quarter, "Str_FiscalPrd"] = (



No discrepancies between original FiscalPeriod and calculated ImplFiscPer for annual (A, B) rows.


Unnamed: 0,ID,CompanyName,ImplCountry,CurrentCurrency,HistCurrency,PIT Date,Frequency,UpdateCode,FiscalPeriod,FYE Month,ItemCode,Value,Str_FiscalPrd
0,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1992,December,3063,10.582409,Y92
1,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1993,December,3063,12.508193,Y93
2,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1994,December,3063,8.751102,Y94
3,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1996-05-03,A,3,1995,December,3063,9.041866,Y95
4,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1998-07-03,A,3,1996,December,3063,9.454759,Y96


#### Annualize data with most recent information (Check of output required!)

In [192]:
# =============================================================================
# ANNUALIZED PIPELINE: BUILD AnnPITValue FROM A/Q/S/T, QC, AND SAVE OUTPUT
# =============================================================================
# High-level overview:
#
#   1. Helper utilities
#      - _key, asof_numpy:
#          * Implement a fast, vectorized “as-of” join:
#              For each row in a left DataFrame, find the latest value in a
#              right DataFrame with the same keys and PIT Date <= left PIT Date.
#      - pctile, summarize_pct:
#          * Compute robust summary statistics for quality control, including
#            winsorized mean and decile percentiles.
#      - pick_latest_nonzero_within_year:
#          * For a given PIT Date and FiscalPeriod, evaluate all available
#            period values (A, Q1..Q4, S1..S2, T1..T3) with known origin
#            fiscal periods.
#          * Choose a single value as AnnPITValue based on:
#               - same-year vs prior-year vs other origin
#               - period priority (A > Q4 > T3 > S2 > Q3 > ... > Q1)
#               - the latest date within a one-year window before PIT.
#          * NEW: returns both the value and the period label from which it
#                 was chosen (AnnPITValue_Period).
#
#   2. Main pipeline for annualized_encoded:
#      - Filter out unsupported frequencies (E/L/R/U).
#      - Normalize types for PIT Date, FiscalPeriod, Value, and key columns.
#      - Derive QNUM, SNUM, TNUM indices from Str_FiscalPrd.
#      - Ensure all period- and date-columns (A/Q/S/T) exist.
#      - Build TrueValue from annual rows (A,B) as the last observed annual
#        value per (ID, FiscalPeriod, HistCurrency).
#      - Use asof_numpy to populate:
#           A, A_Date, A_OriginFP
#           Q1..Q4, S1..S2, T1..T3 and their dates + OriginFP (from origin FP).
#      - For each row, compute:
#           * AnnPITValue        = chosen value
#           * AnnPITValue_Period = 'A', 'Q4', 'S1', 'T3', etc.
#      - Check for any period dates that are after PIT Date (future-date errors).
#      - Compute AnnPITValue_Pct = AnnPITValue / TrueValue * 100 and drop rows
#        outside [50, 200] or with infinite ratios.
#      - Build a final, lean set of columns and save:
#           * full file:   <base_output_filename>.txt
#           * subset file: <base_output_filename>_subset.txt
#        NEW: AnnPITValue_Period is placed directly before AnnPITValue and
#             included in both full and subset outputs.
#      - Print a row-accounting overview for reconciliation.
# =============================================================================

import os
import gc
from datetime import timedelta

import numpy as np
import pandas as pd
from scipy.stats.mstats import winsorize

# Opt in to pandas Copy-on-Write. This is a session-wide pandas option, so it
# also applies to every cell executed after this one.
pd.options.mode.copy_on_write = True


# -----------------------------------------------------------------------------
# Helper: build a single key column from multiple columns
# -----------------------------------------------------------------------------
def _key(fr: pd.DataFrame, cols):
    """
    Build a composite string key by concatenating several columns with '||'.

    Each column in ``cols`` is converted with ``astype(str)`` and the pieces
    are joined column by column, so the work is done by vectorized Series
    operations rather than one Python-level join per row.

    Parameters:
        fr:   frame holding the key columns.
        cols: column names, in the order they should appear in the key.

    Returns:
        An unnamed Series on ``fr.index`` with one key string per row. With no
        columns at all, every key is the empty string.

    Example:
        _key(df, ['ID', 'HistCurrency']) -> "123||USD"
    """
    parts = [fr[col].astype(str) for col in cols]
    if not parts:
        return pd.Series("", index=fr.index, dtype=object)

    key = parts[0]
    for part in parts[1:]:
        key = key + "||" + part
    # A single-column key would otherwise keep that column's name.
    return key.rename(None)


# -----------------------------------------------------------------------------
# Helper: fast as-of join (right.PIT <= left.PIT)
# -----------------------------------------------------------------------------
def asof_numpy(left_df: pd.DataFrame, right_df: pd.DataFrame, by_cols: list[str]):
    """
    For each row in left_df, find the latest (as-of) Value from right_df such that:

        1) by_cols are equal on both sides (e.g. ID, HistCurrency, ItemCode, FiscalPeriod)
        2) right_df['PIT Date'] <= left_df['PIT Date']

    Implementation notes:
      - Rows with a missing key column or PIT Date (and, on the right, a
        missing Value) are ignored; so are rows whose PIT Date cannot be
        parsed as a date.
      - PIT Dates on both sides are converted to datetime and floored to days.
      - A composite string key '__k' is built from by_cols on both frames.
      - The right frame is sorted by key and PIT Date and cut into one
        (dates, values) slice per key.
      - Left rows are grouped by key; for each group np.searchsorted locates
        the last right PIT Date <= each left PIT Date.
      - Results are written by row *position* in left_df, so the outputs line
        up with left_df row by row whatever its index looks like.

    Parameters
    ----------
    left_df : pd.DataFrame
        Rows to fill; must contain by_cols and 'PIT Date'.
    right_df : pd.DataFrame
        Source observations; must contain by_cols, 'PIT Date' and 'Value'.
    by_cols : list[str]
        Columns that must match exactly between both sides.

    Returns
    -------
    out_vals : np.ndarray
        Array of matched values (float64), default NaN where no match.
    out_dates : np.ndarray
        Array of matched dates (datetime64[ns]), default NaT where no match.
    """
    # Outputs are positional: element i belongs to the i-th row of left_df.
    n_left = len(left_df)
    out_vals  = np.full(n_left, np.nan, dtype='float64')
    out_dates = np.full(n_left, 'NaT', dtype='datetime64[ns]')

    # Required columns: keys plus PIT Date, and Value on the right
    left_req  = by_cols + ['PIT Date']
    right_req = by_cols + ['PIT Date', 'Value']

    # Keep only rows that are complete in the required columns
    lmask = left_df[left_req].notna().all(axis=1)
    rmask = right_df[right_req].notna().all(axis=1)
    if not lmask.any() or not rmask.any():
        return out_vals, out_dates

    # Row positions (not index labels) of the usable left rows; .loc with a
    # boolean mask keeps the original row order, so both stay in step.
    l_pos = np.flatnonzero(lmask.to_numpy())
    l = left_df.loc[lmask, left_req].copy()
    r = right_df.loc[rmask, right_req].copy()

    # Normalize PIT Date columns to datetime, day precision
    l['PIT Date'] = pd.to_datetime(l['PIT Date'], errors='coerce').dt.floor('D')
    r['PIT Date'] = pd.to_datetime(r['PIT Date'], errors='coerce').dt.floor('D')

    # errors='coerce' can turn unparseable dates into NaT after the not-null
    # filter above; such rows must not take part in the date search.
    l_parsed = l['PIT Date'].notna().to_numpy()
    if not l_parsed.all():
        l, l_pos = l[l_parsed].copy(), l_pos[l_parsed]
    r_parsed = r['PIT Date'].notna().to_numpy()
    if not r_parsed.all():
        r = r[r_parsed].copy()
    if l.empty or r.empty:
        return out_vals, out_dates

    # Composite keys for grouping
    l['__k'] = _key(l, by_cols)
    r['__k'] = _key(r, by_cols)

    # Sort right by key and PIT Date so each key is one date-ordered block
    r = r.sort_values(['__k', 'PIT Date']).reset_index(drop=True)
    rk   = r['__k'].to_numpy()
    rdt  = r['PIT Date'].to_numpy()
    rval = r['Value'].to_numpy()

    # One (dates, values) slice per right-hand key
    r_keys, r_starts = np.unique(rk, return_index=True)
    r_stops = np.append(r_starts[1:], len(rk))
    slices = {
        k: (rdt[s:e], rval[s:e])
        for k, s, e in zip(r_keys, r_starts, r_stops)
    }

    # Sort left rows by key (stable) so identical keys form contiguous blocks
    lk  = l['__k'].to_numpy()
    ldt = l['PIT Date'].to_numpy()
    order = np.argsort(lk, kind='mergesort')
    sk, sd, sp = lk[order], ldt[order], l_pos[order]

    l_keys, l_starts = np.unique(sk, return_index=True)
    l_stops = np.append(l_starts[1:], len(sk))

    for k, start, stop in zip(l_keys, l_starts, l_stops):
        match = slices.get(k)
        if match is None:
            continue  # no right-hand observation for this key
        r_dates, r_vals = match
        # side='right' gives the index of the first right date > left date;
        # minus 1 is therefore the last right date <= left date (-1: none).
        pos   = np.searchsorted(r_dates, sd[start:stop], side='right') - 1
        valid = pos >= 0
        if valid.any():
            target = sp[start:stop][valid]
            out_vals[target]  = r_vals[pos[valid]]
            out_dates[target] = r_dates[pos[valid]]

    return out_vals, out_dates


# -----------------------------------------------------------------------------
# Small helpers for QC statistics
# -----------------------------------------------------------------------------
def pctile(s: pd.Series, q: float):
    """
    Quantile ``q`` of ``s`` with linear interpolation; NaN on any failure.

    Every exception raised by the quantile call is swallowed on purpose so
    that a summary can still be produced when one statistic cannot be
    computed.
    """
    try:
        value = s.quantile(q, interpolation='linear')
    except Exception:
        value = np.nan
    return value


def summarize_pct(series: pd.Series):
    """
    Build a dictionary of descriptive statistics for a numeric series.

    Infinite values are converted to NaN and every NaN entry is removed
    before anything is computed. If no observation survives that cleaning,
    an empty dictionary is returned.

    Keys of the returned dictionary, in insertion order:
      - finite_rows: number of observations left after cleaning
      - mean, median
      - winsorized_mean_1pct: mean after winsorizing 1% in each tail
      - p10, p20, ..., p90: deciles obtained through pctile()
    """
    finite = series.replace([np.inf, -np.inf], np.nan).dropna()
    if finite.empty:
        return {}

    # winsorize gets its own copy of the values so it is handed a writable array
    clipped = winsorize(finite.to_numpy().copy(), limits=[0.01, 0.01])

    stats = {
        "finite_rows": len(finite),
        "mean": finite.mean(),
        "median": finite.median(),
        "winsorized_mean_1pct": clipped.mean(),
    }
    # Deciles 10%..90%, appended in ascending order
    for decile in range(10, 100, 10):
        stats[f"p{decile}"] = pctile(finite, decile / 100)
    return stats


# -----------------------------------------------------------------------------
# Period prioritization and label helper
# -----------------------------------------------------------------------------
# Ranking of period labels: a larger number wins when several periods compete.
# Annual data ranks highest; within a year, later periods outrank earlier ones.
_PERIOD_PRIORITY = dict(
    A=100,   # annual
    Q4=90,
    T3=80,
    S2=70,
    Q3=60,
    T2=50,
    S1=40,
    Q2=30,
    T1=20,
    Q1=10,
)


def _label_from_colname(colname: str) -> str:
    """
    Translate a value column name into its period label.

    At present the mapping is the identity: the annual column 'A' yields
    'A', and every other column name (e.g. 'Q1'..'Q4', 'S1', 'S2',
    'T1'..'T3') is returned as given.
    """
    if colname == 'A':
        return 'A'
    return colname


# -----------------------------------------------------------------------------
# AnnPITValue selection using OriginFP and priority rules
# -----------------------------------------------------------------------------
def pick_latest_nonzero_within_year(
    row,
    value_cols,
    date_cols,
    pit_col='PIT Date',
    fp_col='FiscalPeriod'
):
    """
    Select a single annualized value (AnnPITValue) for a row, using information
    about:
      - available period values (A, Q1..Q4, S1..S2, T1..T3),
      - their dates,
      - their origin fiscal periods (*_OriginFP),
      - the current PIT Date and FiscalPeriod of the row.

    The logic:
      1) Only consider candidates where:
           - date is not missing (after coercion with pd.to_datetime),
           - date <= PIT Date,
           - date (floored to the day) >= PIT Date - 365 days.
      2) Determine year relation between each candidate's origin fiscal period
         and the current row's FiscalPeriod (FP):
           - same  : OriginFP == FP
           - prior : OriginFP == FP - 1
           - other : everything else
           - unknown: if either FP or OriginFP is missing
         A missing OriginFP falls back to the row's own FP.
      3) For all candidates, ignore NaN and 0.0 values when selecting.
         If we only find zeros and no positive/negative values, we return 0.0
         and still record the period label.
         If there are no candidates at all, return NaN.
      4) Selection priority:
           - same-year Annual ('A', year_rel == 'same'):
               pick the latest by date.
           - else, same-year partials (Q/S/T, year_rel == 'same'):
               pick the highest priority (e.g. Q4 > Q3 > ...), then latest date.
           - else, prior-year Annual ('A', year_rel == 'prior'):
               pick the latest by date (push-forward).
           - else, fallback:
               pick candidate with highest priority, then latest date.

    Parameters
    ----------
    row : mapping-like (e.g. pd.Series)
        Must support ``row[key]``, ``key in row`` and ``row.get(key, default)``.
    value_cols, date_cols : sequences of str
        Paired column names; ``value_cols[i]`` is dated by ``date_cols[i]``.
        Pairs whose columns are absent from ``row`` are skipped.
    pit_col : str
        Column holding the point-in-time date of the row.
    fp_col : str
        Column holding the fiscal period (year) of the row.

    Returns
    -------
    (value, label)
      value : float or NaN
      label : str or NaN (e.g. 'A', 'Q4', 'S1', 'T3')
    """
    pit = row[pit_col]
    if pd.isna(pit):
        return (np.nan, np.nan)

    cutoff = pit - timedelta(days=365)

    # Current row's fiscal period, used to interpret origin fiscal periods
    fp = row.get(fp_col, np.nan)
    try:
        fp_int = int(fp) if not pd.isna(fp) else None
    except Exception:
        # Non-numeric / non-finite fiscal period: treat as unknown
        fp_int = None

    candidates = []
    for vcol, dcol in zip(value_cols, date_cols):
        if vcol not in row or dcol not in row:
            continue

        val = row[vcol]

        # Parse first, compare second: comparing a raw string (or other
        # non-datetime object) against the PIT Timestamp raises TypeError,
        # so the coercion has to happen before the future-date test.
        dt = pd.to_datetime(row[dcol], errors='coerce')

        # Ignore missing/unparseable or future dates
        if pd.isna(dt) or dt > pit:
            continue

        dt = dt.floor('D')
        if dt < cutoff:
            # older than 1 year before PIT
            continue

        # Map column name to period label (A, Q1..Q4, etc.) and priority
        label = _label_from_colname(vcol)
        prio = _PERIOD_PRIORITY.get(label, -1)

        # Convert value to float for numeric comparisons
        vnum = float(val) if pd.notna(val) else np.nan

        # Determine origin fiscal period
        origin_col = f'{label}_OriginFP'
        origin_fp = row.get(origin_col, np.nan)
        if pd.isna(origin_fp):
            # fallback to current FP if origin not explicitly stored
            origin_fp = fp_int
        try:
            if origin_fp is not None and not pd.isna(origin_fp):
                origin_fp = int(origin_fp)
            else:
                origin_fp = None
        except Exception:
            # Origin could not be interpreted as an integer year
            origin_fp = fp_int

        # Compute relationship between origin fiscal period and current FP
        if fp_int is not None and origin_fp is not None:
            if origin_fp == fp_int:
                year_rel = 'same'
            elif origin_fp == fp_int - 1:
                year_rel = 'prior'
            else:
                year_rel = 'other'
        else:
            year_rel = 'unknown'

        # Tuple layout: (label, priority, date, numeric value, year relation)
        candidates.append((label, prio, dt, vnum, year_rel))

    if not candidates:
        return (np.nan, np.nan)

    # Only non-NaN, non-zero values are considered as strong candidates
    def valid(seq):
        return [c for c in seq if not np.isnan(c[3]) and c[3] != 0.0]

    # 1) Same-year Annual A: prefer the latest annual that matches the row's FP
    same_year_annuals = valid(c for c in candidates if c[0] == 'A' and c[4] == 'same')
    if same_year_annuals:
        best = max(same_year_annuals, key=lambda x: x[2])  # latest date
        return (best[3], best[0])

    # 2) Same-year partial periods (Q, S, T) if no same-year A is available
    same_year_partials = valid(c for c in candidates if c[0] != 'A' and c[4] == 'same')
    if same_year_partials:
        # choose best by (priority, date)
        best = max(same_year_partials, key=lambda x: (x[1], x[2]))
        return (best[3], best[0])

    # 3) Prior-year annual push-forward: last annual from previous FP
    prior_year_annuals = valid(c for c in candidates if c[0] == 'A' and c[4] == 'prior')
    if prior_year_annuals:
        best = max(prior_year_annuals, key=lambda x: x[2])
        return (best[3], best[0])

    # 4) Fallback: any candidate by (priority, date)
    others = valid(candidates)
    if others:
        best = max(others, key=lambda x: (x[1], x[2]))
        return (best[3], best[0])

    # If we get here, only zeros are present. Return 0.0 explicitly and keep label.
    zeros = [c for c in candidates if not np.isnan(c[3]) and c[3] == 0.0]
    if zeros:
        best_zero = max(zeros, key=lambda x: (x[1], x[2]))
        return (best_zero[3], best_zero[0])

    # Every in-window candidate carried a NaN value
    return (np.nan, np.nan)


# =============================================================================
# MAIN: annualized_encoded -> annualized_processed
# =============================================================================
# Driver for this cell: turns `annualized_encoded` into `annualized_processed`,
# writes a full and a subset pipe-delimited file, and prints QC statistics plus
# a row-accounting summary. Skips everything if `annualized_encoded` is missing.
if 'annualized_encoded' in globals() and annualized_encoded is not None:
    input_rows = len(annualized_encoded)
    print(f"Input dataset contains {input_rows:,} rows before processing.\n")

    # Work on a copy to avoid mutating the original DataFrame
    working = annualized_encoded.copy()

    # -------------------------------------------------------------------------
    # 1) Exclude frequencies that are not supported by this pipeline (E/L/R/U)
    # -------------------------------------------------------------------------
    excl_mask = working['Frequency'].astype(str).str.upper().isin(['E', 'L', 'R', 'U'])
    excluded_rows = int(excl_mask.sum())
    working = working.loc[~excl_mask].copy()

    # -------------------------------------------------------------------------
    # 2) Basic type normalization
    # -------------------------------------------------------------------------
    # PIT Date as datetime (day precision); values not matching %Y-%m-%d become NaT
    working['PIT Date'] = pd.to_datetime(
        working['PIT Date'], format='%Y-%m-%d', errors='coerce'
    ).dt.floor('D')

    # FiscalPeriod and Value as numeric (unparseable entries become NaN)
    working['FiscalPeriod'] = pd.to_numeric(working['FiscalPeriod'], errors='coerce')
    working['Value']        = pd.to_numeric(working['Value'], errors='coerce')

    # Key-like columns as string (consistent joins and as-of keys)
    for c in ['ID', 'HistCurrency', 'ItemCode', 'Frequency', 'Str_FiscalPrd']:
        if c in working.columns:
            working[c] = working[c].astype(str)

    # -------------------------------------------------------------------------
    # 3) Parse Q/S/T sequence numbers from Str_FiscalPrd
    # -------------------------------------------------------------------------
    # Each regex is anchored at the start, so a row gets at most one of
    # QNUM / SNUM / TNUM; the others stay NaN.
    # Extract quarter index 1..4 from strings like "Q1Y23"
    working['QNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^Q([1-4])Y', expand=False),
        errors='coerce'
    )
    # Extract semiannual index 1..2 from "S1Y23"
    working['SNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^S([1-2])Y', expand=False),
        errors='coerce'
    )
    # Extract trimester index 1..3 from "T1Y23"
    working['TNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^T([1-3])Y', expand=False),
        errors='coerce'
    )

    # -------------------------------------------------------------------------
    # 4) Ensure A/Q/S/T value and date columns exist
    # -------------------------------------------------------------------------
    # period_vals and period_dates are built in the same order so that they can
    # later be zipped pairwise (value column <-> its date column).
    period_vals = [f'Q{i}' for i in range(1, 5)] + \
                  [f'S{i}' for i in range(1, 3)] + \
                  [f'T{i}' for i in range(1, 4)] + ['A']

    period_dates = [f'{p}_Date' for p in [f'Q{i}' for i in range(1, 5)] +
                                       [f'S{i}' for i in range(1, 3)] +
                                       [f'T{i}' for i in range(1, 4)]] + ['A_Date']

    # Create missing value/date columns initialized to NaN / NaT
    for c in period_vals:
        if c not in working.columns:
            working[c] = np.nan
    for c in period_dates:
        if c not in working.columns:
            working[c] = pd.NaT

    base_keys = ['ID', 'HistCurrency', 'ItemCode', 'FiscalPeriod']

    # -------------------------------------------------------------------------
    # 5) Build TrueValue from annual (A,B) rows
    # -------------------------------------------------------------------------
    # TrueValue is the last known annual value per (ID, FiscalPeriod, HistCurrency)
    mask_annual = working['Frequency'].isin(['A', 'B']) & working['Value'].notna()
    annual_src = (
        working.loc[mask_annual,
                    ['ID', 'FiscalPeriod', 'HistCurrency', 'PIT Date', 'Value']]
        .sort_values(['ID', 'FiscalPeriod', 'HistCurrency', 'PIT Date'])
        .drop_duplicates(['ID', 'FiscalPeriod', 'HistCurrency'], keep='last')
        .rename(columns={'Value': 'TrueValue', 'PIT Date': 'TrueValue_Date'})
    )
    # annual_src is unique on the merge keys (see drop_duplicates above), so this
    # left merge cannot multiply rows; the row accounting in step 13 relies on that.
    # NOTE(review): a column-on-column merge returns a frame with a fresh 0..n-1
    # index; asof_numpy appears to use the left frame's index values as positions
    # when writing results -- confirm before changing the order of these steps.
    working = working.merge(
        annual_src,
        on=['ID', 'FiscalPeriod', 'HistCurrency'],
        how='left'
    )

    # -------------------------------------------------------------------------
    # 6) As-of mapping for each frequency (same FiscalPeriod only)
    # -------------------------------------------------------------------------
    # FiscalPeriod is part of base_keys, so every mapped value originates from a
    # source row with the same FiscalPeriod as the target row.
    # 6.1 Annual as-of (A, B)
    src_A = working.loc[
        working['Frequency'].isin(['A', 'B']) & working['Value'].notna(),
        base_keys + ['PIT Date', 'Value']
    ].copy()
    vA, dA = asof_numpy(working, src_A, by_cols=base_keys)
    working['A'], working['A_Date'] = vA, dA
    # Origin FP of a mapped annual value is the row's own FiscalPeriod
    working['A_OriginFP'] = np.where(
        working['A'].notna(), working['FiscalPeriod'], np.nan
    )

    # 6.2 Quarterly as-of (Q, C)
    src_Q = working.loc[
        working['Frequency'].isin(['Q', 'C']) & working['QNUM'].notna(),
        base_keys + ['QNUM', 'PIT Date', 'Value']
    ].copy()
    for q in (1, 2, 3, 4):
        rv = src_Q[src_Q['QNUM'] == q].drop(columns=['QNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'Q{q}', f'Q{q}_Date'
        working[col], working[dcol] = v, d
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        # Only fill the origin where a value was mapped and no origin is stored yet
        mask = working[col].notna() & working[ocol].isna()
        working.loc[mask, ocol] = working.loc[mask, 'FiscalPeriod']

    # 6.3 Semiannual as-of (S, F)
    src_S = working.loc[
        working['Frequency'].isin(['S', 'F']) & working['SNUM'].notna(),
        base_keys + ['SNUM', 'PIT Date', 'Value']
    ].copy()
    for s in (1, 2):
        rv = src_S[src_S['SNUM'] == s].drop(columns=['SNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'S{s}', f'S{s}_Date'
        working[col], working[dcol] = v, d
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        # Only fill the origin where a value was mapped and no origin is stored yet
        mask = working[col].notna() & working[ocol].isna()
        working.loc[mask, ocol] = working.loc[mask, 'FiscalPeriod']

    # 6.4 Trimester as-of (T, K)
    src_T = working.loc[
        working['Frequency'].isin(['T', 'K']) & working['TNUM'].notna(),
        base_keys + ['TNUM', 'PIT Date', 'Value']
    ].copy()
    for t in (1, 2, 3):
        rv = src_T[src_T['TNUM'] == t].drop(columns=['TNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'T{t}', f'T{t}_Date'
        working[col], working[dcol] = v, d
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        # Only fill the origin where a value was mapped and no origin is stored yet
        mask = working[col].notna() & working[ocol].isna()
        working.loc[mask, ocol] = working.loc[mask, 'FiscalPeriod']

    # -------------------------------------------------------------------------
    # 7) Normalize date columns (no forward-fill; only directly mapped values)
    # -------------------------------------------------------------------------
    # Sorting reorders rows but keeps their index labels (no reset here)
    working = working.sort_values(['ID', 'HistCurrency', 'FiscalPeriod', 'PIT Date'])

    value_labels  = period_vals
    date_labels   = period_dates
    # origin_labels is not referenced again in this block
    origin_labels = [f'{lbl}_OriginFP' for lbl in value_labels]

    for c in date_labels:
        if c in working.columns:
            working[c] = pd.to_datetime(working[c], errors='coerce').dt.floor('D')

    # -------------------------------------------------------------------------
    # 8) Compute AnnPITValue and AnnPITValue_Period
    # -------------------------------------------------------------------------
    # NEW: we compute both the chosen annualized value and the period label
    # it came from (e.g., 'A', 'Q4', 'S1', 'T3') and store the label in
    # AnnPITValue_Period.
    # Row-wise apply: the Python selector runs once per row and each result is
    # wrapped in a Series, so this is likely the slowest step on large inputs.
    ann_res = working.apply(
        lambda r: pd.Series(
            pick_latest_nonzero_within_year(
                r,
                value_cols=value_labels,
                date_cols=date_labels,
                pit_col='PIT Date',
                fp_col='FiscalPeriod'
            ),
            index=['AnnPITValue', 'AnnPITValue_Period']
        ),
        axis=1
    )
    # concat(axis=1) aligns on the index, which ann_res inherits from working
    working = pd.concat([working, ann_res], axis=1)

    # -------------------------------------------------------------------------
    # 9) Future-date QC check (period dates > PIT Date)
    # -------------------------------------------------------------------------
    date_cols_all = [
        'A_Date',
        'Q1_Date', 'Q2_Date', 'Q3_Date', 'Q4_Date',
        'S1_Date', 'S2_Date',
        'T1_Date', 'T2_Date', 'T3_Date'
    ]
    present = [c for c in date_cols_all if c in working.columns]
    viol_counts = {}
    any_mask = pd.Series(False, index=working.index)

    for c in present:
        # A violation needs both dates present and the period date after PIT Date
        m = (
            working[c].notna() &
            working['PIT Date'].notna() &
            (pd.to_datetime(working[c], errors='coerce') > working['PIT Date'])
        )
        viol_counts[c] = int(m.sum())
        any_mask |= m

    total_future_viol = int(any_mask.sum())
    print("\n=== Future-date check (period dates > PIT Date) ===")
    print("Per-label violations:", viol_counts)
    print(f"Rows with ANY future-dated period value: {total_future_viol}")
    # The flag is informational only; flagged rows are not removed here
    working['HasFutureDateError'] = any_mask

    # -------------------------------------------------------------------------
    # 10) Compute AnnPITValue_Pct vs TrueValue and apply quality filter
    # -------------------------------------------------------------------------
    # Percentage is only defined where both values exist and TrueValue != 0;
    # everything else is NaN.
    working['AnnPITValue_Pct'] = np.where(
        working['AnnPITValue'].notna() &
        working['TrueValue'].notna() &
        (working['TrueValue'] != 0),
        (working['AnnPITValue'] / working['TrueValue']) * 100,
        np.nan
    )

    pre_stats = summarize_pct(working['AnnPITValue_Pct'])
    print("\n=== AnnPITValue_Pct summary — BEFORE quality drop ===")
    for k, v in pre_stats.items():
        print(f"{k:>20}: {v}")

    # Rows with NaN percentage are neither infinite nor finite, so they are
    # never marked for dropping and remain in the output.
    pct = working['AnnPITValue_Pct']
    is_inf = np.isinf(pct)
    is_finite = np.isfinite(pct)
    out_of_range = is_finite & ((pct > 200) | (pct < 50))
    to_drop_quality = is_inf | out_of_range

    dropped_quality_rows = int(to_drop_quality.sum())
    print(f"\nRows to drop due to AnnPITValue_Pct (±inf or >200 or <50): {dropped_quality_rows:,}")

    working = working.loc[~to_drop_quality].copy()

    post_stats = summarize_pct(working['AnnPITValue_Pct'])
    print("\n=== AnnPITValue_Pct summary — AFTER quality drop ===")
    if post_stats:
        for k, v in post_stats.items():
            print(f"{k:>20}: {v}")
    else:
        print("No finite values remain after the quality drop.")

    # -------------------------------------------------------------------------
    # 11) Final columns and export
    # -------------------------------------------------------------------------
    base_cols = [
        'ID', 'CompanyName', 'ImplCountry', 'CurrentCurrency', 'HistCurrency',
        'PIT Date', 'Frequency', 'UpdateCode', 'FiscalPeriod', 'FYE Month',
        'ItemCode', 'Value', 'Str_FiscalPrd'
    ]

    # Output order per period: date column first, then the value column
    freq_cols = []
    for i in range(1, 5):
        freq_cols += [f'Q{i}_Date', f'Q{i}']
    for i in range(1, 3):
        freq_cols += [f'S{i}_Date', f'S{i}']
    for i in range(1, 4):
        freq_cols += [f'T{i}_Date', f'T{i}']
    freq_cols += ['A_Date', 'A']

    # NEW: AnnPITValue_Period is included and explicitly placed
    # directly before AnnPITValue in the final column order.
    keep_cols = (
        [c for c in base_cols if c in working.columns] +
        ['TrueValue', 'AnnPITValue_Period', 'AnnPITValue',
         'AnnPITValue_Pct', 'HasFutureDateError'] +
        [c for c in freq_cols if c in working.columns]
    )

    # Drop helper columns that are only needed for intermediate computations
    # (the reindex below would exclude them anyway, since they are not in keep_cols)
    drop_cols = [
        c for c in working.columns
        if c.endswith('_OriginFP') or c in ['QNUM', 'SNUM', 'TNUM', 'TrueValue_Date']
    ]
    working.drop(columns=drop_cols, inplace=True, errors='ignore')

    annualized_processed = working.reindex(columns=keep_cols)

    # -------------------------------------------------------------------------
    # 12) Save full and subset outputs
    # -------------------------------------------------------------------------
    # Both names are expected to be defined by earlier notebook cells.
    # NOTE(review): asserts are skipped when Python runs with -O.
    assert 'Temp_file_path_DP' in globals(), "Temp_file_path_DP not found."
    assert 'base_output_filename' in globals(), "base_output_filename not found (set earlier)."

    out_full = os.path.join(Temp_file_path_DP, f"{base_output_filename}.txt")
    annualized_processed.to_csv(out_full, sep='|', index=False)
    print("\nSaved full:", out_full)

    # NEW: AnnPITValue_Period is included in the subset and appears before
    # AnnPITValue.
    subset_cols = [
        "ID", "PIT Date", "CompanyName", "HistCurrency",
        "FiscalPeriod", "AnnPITValue_Period", "AnnPITValue"
    ]
    subset_cols_existing = [col for col in subset_cols if col in annualized_processed.columns]
    subset_df = annualized_processed[subset_cols_existing].copy()
    out_subset = os.path.join(Temp_file_path_DP, f"{base_output_filename}_subset.txt")
    subset_df.to_csv(out_subset, sep='|', index=False)
    print("Saved subset:", out_subset)
    del subset_df

    # -------------------------------------------------------------------------
    # 13) Row-accounting overview
    # -------------------------------------------------------------------------
    # Rows leave the pipeline only via the frequency exclusion (step 1) and the
    # quality drop (step 10), so the three counts should add up to the input.
    output_rows = len(annualized_processed)
    print("\n=== Row Accounting ===")
    print(f"Input rows:                     {input_rows:,}")
    print(f"Excluded by Frequency (E/L/R/U):{excluded_rows:,}")
    print(f"Dropped by quality (Pct rules): {dropped_quality_rows:,}")
    print(f"Output rows (final):            {output_rows:,}")
    check_total = excluded_rows + dropped_quality_rows + output_rows
    print(f"Check: excluded + dropped + output = {check_total:,}")
    if check_total == input_rows:
        print("Row counts reconcile exactly.")
    else:
        print(f"Mismatch of {input_rows - check_total:+,} rows. Please investigate.")

    gc.collect()

else:
    print("annualized_encoded not found or None; skipping.")

Input dataset contains 1,979,636 rows before processing.


=== Future-date check (period dates > PIT Date) ===
Per-label violations: {'A_Date': 0, 'Q1_Date': 0, 'Q2_Date': 0, 'Q3_Date': 0, 'Q4_Date': 0, 'S1_Date': 0, 'S2_Date': 0, 'T1_Date': 0, 'T2_Date': 0, 'T3_Date': 0}
Rows with ANY future-dated period value: 0

=== AnnPITValue_Pct summary — BEFORE quality drop ===
         finite_rows: 1776396
                mean: 247102.09123122873
              median: 100.0
winsorized_mean_1pct: 143.2997482565826
                 p10: 52.03050494652123
                 p20: 90.74235927873305
                 p30: 100.0
                 p40: 100.0
                 p50: 100.0
                 p60: 100.0
                 p70: 100.0
                 p80: 101.39946722695382
                 p90: 175.95969101080811

Rows to drop due to AnnPITValue_Pct (±inf or >200 or <50): 321,621

=== AnnPITValue_Pct summary — AFTER quality drop ===
         finite_rows: 1454775
                mean: 101.2298320945

### Annualized 8

#### Set Index

In [193]:
# =============================================================================
# SELECT A SINGLE ANNUALIZED_* ITEM AND PREPARE PATHS
# =============================================================================
# This cell:
#   1. Selects which Annualized_* item (from annualized_vars) should be processed.
#   2. Validates that annualized_vars and Temp_file_path_DP are available.
#   3. Builds the input file path for the selected "work_subset_<item>.txt".
#   4. Defines a base_output_filename used later when saving processed results.
#   5. Ensures the data-preparation temp directory exists.
#
# Usage:
#   - Change `annualized_index` to process a different Annualized_* dataset
#     (e.g. 1, 2, 10, ...).
#   - Assumes `annualized_vars` was created earlier (mapping "Annualized_n" to
#     sanitized item names) and `Temp_file_path_DP` was set in your environment
#     setup cell.
# =============================================================================

import os
from pathlib import Path

# 1) Dataset selector: edit this number to process another Annualized_* item
annualized_index = 8

# The mapping {'Annualized_1': 'SomeItem', ...} has to come from an earlier cell
assert "annualized_vars" in globals(), "annualized_vars dict not found in globals()."

# Resolve the dictionary key and the sanitized item name stored under it
item_key = f"Annualized_{annualized_index}"
target_item_name = annualized_vars.get(item_key)
assert target_item_name, f"{item_key} not found in annualized_vars."

print(f"Selected: {item_key}  ->  ItemName: '{target_item_name}'")

# 2) Derive all file locations from the selected item
assert "Temp_file_path_DP" in globals(), "Temp_file_path_DP not found."

# Input: the work_subset file written by earlier steps for this item
file_name = f"work_subset_{target_item_name}.txt"
file_path = os.path.join(Temp_file_path_DP, file_name)

# Common stem for every output file of the annualized pipeline
base_output_filename = f"Annualized_{target_item_name}_complete"

# 3) Create the temp directory (including parents) when it is missing
Path(Temp_file_path_DP).mkdir(parents=True, exist_ok=True)


Selected: Annualized_8  ->  ItemName: 'Inventories___Total'


#### Import relevant data



In [194]:
# =============================================================================
# LOAD THE FULL DATASET FOR THE SELECTED SPECIAL ITEM (ANNUALIZED VERSION)
# =============================================================================
# This cell:
#   1. Uses `target_item_name` and `file_path` (defined in the previous cell)
#      to load the corresponding work_subset file.
#   2. Imports the file using `import_file_to_dataframe`.
#   3. Performs safety checks for existence and emptiness.
#   4. Shows a preview of the loaded dataset.
#   5. Falls back to an empty DataFrame if loading fails.
#   6. Runs garbage collection afterwards.
# =============================================================================

print(f"\nImporting full annualized dataset for Item: '{target_item_name}' ...")

if not os.path.exists(file_path):
    # Nothing on disk for this item: report it and continue with an empty frame
    print(f"File not found: {file_path}")
    annualized_raw = pd.DataFrame()
else:
    annualized_raw = import_file_to_dataframe(file_path)
    has_rows = annualized_raw is not None and not annualized_raw.empty

    if not has_rows:
        # Loader returned None or an empty frame: normalize to an empty DataFrame
        print("Annualized dataset appears empty or could not be loaded.")
        annualized_raw = pd.DataFrame()
    else:
        print(f"Full annualized dataset loaded successfully: {len(annualized_raw):,} rows total.")
        # Rich preview when display() works, plain-text preview otherwise
        try:
            display(annualized_raw.head())
        except Exception:
            print(annualized_raw.head().to_string(index=False))

gc.collect()



Importing full annualized dataset for Item: 'Inventories___Total' ...
Full annualized dataset loaded successfully: 3,452,774 rows total.


Unnamed: 0,ID,CompanyName,ImplCountry,CurrentCurrency,HistCurrency,PIT Date,Frequency,UpdateCode,FiscalPeriod,FYE Month,ItemCode,Value
0,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1992,December,2101,237.4096
1,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1993,December,2101,201.182443
2,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1994,December,2101,289.925405
3,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1996-05-03,A,3,1995,December,2101,223.628192
4,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1998-07-03,A,3,1996,December,2101,177.096926


0

#### Encode Frequency Code (Check of output required!)

In [195]:
# =============================================================================
# FISCAL PERIOD ENCODING FOR ANNUALIZED DATASET
# =============================================================================
# This cell:
#   1. Defines helper functions:
#        - last2: extract last two digits of a number as a zero-padded string.
#        - add_str_fiscalprd: create Str_FiscalPrd from numeric FiscalPeriod
#          and Frequency, derive an implied full-year FiscalPeriod, and check
#          for inconsistencies on annual rows.
#   2. Applies this encoding to `annualized_raw` (if available) and stores
#      the result in `annualized_encoded`.
#   3. Shows a preview of the encoded DataFrame.
#
# Assumptions:
#   - `annualized_raw` has already been loaded in a previous cell.
#   - `target_item_name` is defined and is just used for printing context.
#   - DataFrame contains at least the columns: 'Frequency', 'FiscalPeriod'.
# =============================================================================

import numpy as np
import pandas as pd
from IPython.display import display


def last2(n):
    """
    Give the final two digits of a number as a two-character string.

    The value is truncated to an integer, left-padded with zeros to at
    least four characters, and the last two characters are returned.
    Missing input (NaN / None) yields None.

    Examples:
        2023 -> "23"
        85   -> "85"
        5    -> "05"
        NaN  -> None
    """
    if pd.isna(n):
        return None
    # Pad to four characters (e.g. 23 -> "0023") so short numbers still give two digits
    digits = str(int(n)).zfill(4)
    return digits[-2:]


def add_str_fiscalprd(df: pd.DataFrame) -> pd.DataFrame:
    """
    Build 'Str_FiscalPrd' and overwrite 'FiscalPeriod' with an implied full year.

    Logic:
      1) Normalize Frequency to an uppercase string (missing -> "").
      2) For each row with a numeric FiscalPeriod, build a string label
         Str_FiscalPrd depending on Frequency:
           - C, Q, E, R: quarter-based  -> "Q{1-4}Y{yy}"  (fp % 4 + 1, fp // 4)
           - A, B:       annual         -> "Y{yy}"
           - F, S:       semiannual     -> "S{1-2}Y{yy}"  (fp % 2 + 1, fp // 2)
           - K, T, L, U: trimester-like -> "T{1-3}Y{yy}"  (fp % 3 + 1, fp // 3)
         Rows with any other Frequency, or with a missing / non-numeric
         FiscalPeriod, keep a missing (NaN) Str_FiscalPrd.
      3) Extract the "yy" part from Str_FiscalPrd and map it to a full year:
           yy >= 80 -> 19yy (e.g. "85" -> 1985)
           yy <  80 -> 20yy (e.g. "23" -> 2023)
      4) For annual rows (A, B), compare that implied year with the original
         FiscalPeriod and print a short discrepancy summary.
      5) Overwrite 'FiscalPeriod' with the implied year and drop the helper
         columns used for the check.

    Parameters:
      df: DataFrame with at least the columns 'Frequency' and 'FiscalPeriod'.
          The input is not modified.

    Returns:
      A new DataFrame with:
        - 'Str_FiscalPrd' (object column of labels / NaN)
        - updated 'FiscalPeriod' (implied full year; NaN where no label exists)
    """
    df = df.copy()

    # Normalize frequency codes for consistent logic
    df["Frequency"] = df["Frequency"].str.upper().fillna("")

    # Keep the untouched FiscalPeriod for the annual consistency check below
    df["Original_FiscalPeriod"] = df["FiscalPeriod"]

    # Numeric FiscalPeriod for the modular arithmetic; unparseable -> NaN
    fp = pd.to_numeric(df["FiscalPeriod"], errors="coerce")
    has_fp = fp.notna()

    m_AB = df["Frequency"].isin(["A", "B"])

    # Object dtype from the start: string labels can then be written without
    # pandas upcasting a float column (which emits a FutureWarning), and the
    # .str accessor below also works when no row receives a label.
    df["Str_FiscalPrd"] = np.full(len(df), np.nan, dtype=object)

    # --- Annual (A, B): "Y{yy}" ---
    annual_rows = (m_AB & has_fp).to_numpy()
    if annual_rows.any():
        # .to_numpy() makes the assignment positional (no index alignment)
        df.loc[annual_rows, "Str_FiscalPrd"] = (
            "Y" + fp[annual_rows].apply(last2)
        ).to_numpy()

    # --- Sub-annual frequencies: "{prefix}{index}Y{yy}" ---
    # (label prefix, frequency codes, number of periods per fiscal year)
    subannual_specs = (
        ("Q", ["C", "Q", "E", "R"], 4),
        ("S", ["F", "S"], 2),
        ("T", ["K", "T", "L", "U"], 3),
    )
    for prefix, codes, per_year in subannual_specs:
        rows = (df["Frequency"].isin(codes) & has_fp).to_numpy()
        if not rows.any():
            continue
        sub_fp = fp[rows]
        # 1-based position of the period within its fiscal year
        index_part = ((sub_fp % per_year) + 1).astype("Int64").astype(str)
        # Year component, reduced to its last two digits
        year_part = (sub_fp // per_year).apply(last2)
        df.loc[rows, "Str_FiscalPrd"] = (
            prefix + index_part + "Y" + year_part
        ).to_numpy()

    # --- Derive implied full-year FiscalPeriod from Str_FiscalPrd ---
    # Extract the "yy" part following "Y" in labels like "Q1Y23", "Y21", etc.
    yy_text = df["Str_FiscalPrd"].str.extract(r"Y(\d{2})", expand=False)
    yy = pd.to_numeric(yy_text, errors="coerce")

    # yy >= 80 -> 19yy, otherwise 20yy. NaN compares False, and NaN + 2000
    # stays NaN, so rows without a label remain missing.
    century = np.where(yy >= 80, 1900, 2000)
    df["ImplFiscPer_Calculated"] = yy + century

    # --- Discrepancy check for annual rows (A, B only) ---
    annual_all = m_AB.to_numpy()
    implied = df.loc[annual_all, "ImplFiscPer_Calculated"]
    original = pd.to_numeric(
        df.loc[annual_all, "Original_FiscalPeriod"], errors="coerce"
    )
    # A row agrees when both years are equal, or when both are missing
    agrees = (implied == original) | (implied.isna() & original.isna())
    discrepancy_rows = df.loc[annual_all].loc[(~agrees).to_numpy()]

    if not discrepancy_rows.empty:
        print(
            "\nDiscrepancies between original FiscalPeriod and calculated "
            "ImplFiscPer for annual (A, B) rows:"
        )
        # Only preview columns that exist; 'ID' is not a required input column
        preview_cols = [
            c for c in (
                "ID", "Frequency", "Original_FiscalPeriod",
                "Str_FiscalPrd", "ImplFiscPer_Calculated",
            )
            if c in discrepancy_rows.columns
        ]
        display(discrepancy_rows[preview_cols].head())
        print(f"Total discrepancies for annual frequencies: {len(discrepancy_rows)}")
    else:
        print(
            "\nNo discrepancies between original FiscalPeriod and calculated "
            "ImplFiscPer for annual (A, B) rows."
        )

    # Overwrite FiscalPeriod with the implied year
    df["FiscalPeriod"] = df["ImplFiscPer_Calculated"]

    # Remove helper columns that are no longer needed
    df = df.drop(columns=["Original_FiscalPeriod", "ImplFiscPer_Calculated"])

    return df


# -----------------------------------------------------------------------------
# Run the fiscal-period encoding on the Annualized dataset
# -----------------------------------------------------------------------------
# Guard first: encoding only makes sense for a loaded, non-empty annualized_raw.
if "annualized_raw" not in globals() or annualized_raw is None or annualized_raw.empty:
    print("annualized_raw not found or empty. Cannot perform encoding.")
    annualized_encoded = None
else:
    print(f"Applying fiscal period encoding to Annualized dataset for '{target_item_name}' ...")
    annualized_encoded = add_str_fiscalprd(annualized_raw)
    # Preview the first rows of the encoded result
    display(annualized_encoded.head())


Applying fiscal period encoding to Annualized dataset for 'Inventories___Total' ...


  df.loc[m_quarter, "Str_FiscalPrd"] = (



No discrepancies between original FiscalPeriod and calculated ImplFiscPer for annual (A, B) rows.


Unnamed: 0,ID,CompanyName,ImplCountry,CurrentCurrency,HistCurrency,PIT Date,Frequency,UpdateCode,FiscalPeriod,FYE Month,ItemCode,Value,Str_FiscalPrd
0,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1992,December,2101,237.4096,Y92
1,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1993,December,2101,201.182443,Y93
2,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1994,December,2101,289.925405,Y94
3,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1996-05-03,A,3,1995,December,2101,223.628192,Y95
4,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1998-07-03,A,3,1996,December,2101,177.096926,Y96


#### Annualize data with most recent information (Check of output required!)

In [196]:
# =============================================================================
# ANNUALIZED PIPELINE: BUILD AnnPITValue FROM A/Q/S/T, QC, AND SAVE OUTPUT
# =============================================================================
# High-level overview:
#
#   1. Helper utilities
#      - _key, asof_numpy:
#          * Implement a fast, vectorized “as-of” join:
#              For each row in a left DataFrame, find the latest value in a
#              right DataFrame with the same keys and PIT Date <= left PIT Date.
#      - pctile, summarize_pct:
#          * Compute robust summary statistics for quality control, including
#            winsorized mean and decile percentiles.
#      - pick_latest_nonzero_within_year:
#          * For a given PIT Date and FiscalPeriod, evaluate all available
#            period values (A, Q1..Q4, S1..S2, T1..T3) with known origin
#            fiscal periods.
#          * Choose a single value as AnnPITValue based on:
#               - same-year vs prior-year vs other origin
#               - period priority (A > Q4 > T3 > S2 > Q3 > ... > Q1)
#               - the latest date within a one-year window before PIT.
#          * NEW: returns both the value and the period label from which it
#                 was chosen (AnnPITValue_Period).
#
#   2. Main pipeline for annualized_encoded:
#      - Filter out unsupported frequencies (E/L/R/U).
#      - Normalize types for PIT Date, FiscalPeriod, Value, and key columns.
#      - Derive QNUM, SNUM, TNUM indices from Str_FiscalPrd.
#      - Ensure all period- and date-columns (A/Q/S/T) exist.
#      - Build TrueValue from annual rows (A,B) as the last observed annual
#        value per (ID, FiscalPeriod, HistCurrency).
#      - Use asof_numpy to populate:
#           A, A_Date, A_OriginFP
#           Q1..Q4, S1..S2, T1..T3 and their dates + OriginFP (from origin FP).
#      - For each row, compute:
#           * AnnPITValue        = chosen value
#           * AnnPITValue_Period = 'A', 'Q4', 'S1', 'T3', etc.
#      - Check for any period dates that are after PIT Date (future-date errors).
#      - Compute AnnPITValue_Pct = AnnPITValue / TrueValue * 100 and drop rows
#        outside [50, 200] or with infinite ratios.
#      - Build a final, lean set of columns and save:
#           * full file:   <base_output_filename>.txt
#           * subset file: <base_output_filename>_subset.txt
#        NEW: AnnPITValue_Period is placed directly before AnnPITValue and
#             included in both full and subset outputs.
#      - Print a row-accounting overview for reconciliation.
# =============================================================================

import os
import gc
from datetime import timedelta

import numpy as np
import pandas as pd
from scipy.stats.mstats import winsorize

pd.options.mode.copy_on_write = True


# -----------------------------------------------------------------------------
# Helper: build a single key column from multiple columns
# -----------------------------------------------------------------------------
def _key(fr: pd.DataFrame, cols):
    """
    Build a composite string key by concatenating several columns with '||'.

    This is used to group records by (ID, HistCurrency, ItemCode, FiscalPeriod)
    as a single vectorizable key for the as-of join.

    Example:
        _key(df, ['ID', 'HistCurrency']) -> "123||USD"
    """
    return fr[cols].astype(str).agg('||'.join, axis=1)


# -----------------------------------------------------------------------------
# Helper: fast as-of join (right.PIT <= left.PIT)
# -----------------------------------------------------------------------------
def asof_numpy(left_df: pd.DataFrame, right_df: pd.DataFrame, by_cols: list[str]):
    """
    For each row in left_df, find the latest (as-of) Value from right_df such that:

        1) by_cols are equal on both sides (e.g. ID, HistCurrency, ItemCode, FiscalPeriod)
        2) right_df['PIT Date'] <= left_df['PIT Date']

    Implementation notes:
      - Rows with a missing key column, PIT Date or (right side) Value are ignored.
      - PIT Dates are converted to datetime and floored to days; rows whose date
        cannot be parsed are ignored as well.
      - Keys are compared as strings built by _key(...) from by_cols.
      - The right side is sorted by key and PIT Date and cut into one contiguous
        slice per key.
      - The left side is grouped by key; each group is resolved with one
        np.searchsorted call against the matching right slice.
      - Results are written by row *position* in left_df, so left_df may carry
        any index (gaps, duplicates, non-integer labels).

    Returns
    -------
    out_vals : np.ndarray
        Array of matched values (float64), NaN where no match. Length len(left_df).
    out_dates : np.ndarray
        Array of matched dates (datetime64[ns]), NaT where no match.
    """
    # Initialize output arrays with NaNs/NaT
    out_vals  = np.full(len(left_df), np.nan, dtype='float64')
    out_dates = np.full(len(left_df), 'NaT', dtype='datetime64[ns]')

    by_cols = list(by_cols)
    left_req  = by_cols + ['PIT Date']
    right_req = by_cols + ['PIT Date', 'Value']

    # Boolean row masks as plain arrays -> selection below is purely positional
    lmask = left_df[left_req].notna().all(axis=1).to_numpy()
    rmask = right_df[right_req].notna().all(axis=1).to_numpy()
    if not lmask.any() or not rmask.any():
        return out_vals, out_dates

    l = left_df.loc[lmask, left_req].copy()
    r = right_df.loc[rmask, right_req].copy()
    # Positions of the kept left rows inside left_df / the output arrays
    l_pos = np.flatnonzero(lmask)

    # Normalize PIT Date columns to datetime, day precision
    l['PIT Date'] = pd.to_datetime(l['PIT Date'], errors='coerce').dt.floor('D')
    r['PIT Date'] = pd.to_datetime(r['PIT Date'], errors='coerce').dt.floor('D')

    # Unparseable dates became NaT. NaT sorts last, so leaving it in would make
    # such a left row match the newest right value; drop those rows instead.
    l_ok = l['PIT Date'].notna().to_numpy()
    r_ok = r['PIT Date'].notna().to_numpy()
    if not l_ok.any() or not r_ok.any():
        return out_vals, out_dates
    l = l.loc[l_ok]
    l_pos = l_pos[l_ok]
    r = r.loc[r_ok].copy()

    # Sort right by key and PIT Date so each key is one contiguous, date-ordered run
    r['__k'] = _key(r, by_cols).to_numpy()
    r = r.sort_values(['__k', 'PIT Date'])

    rk   = r['__k'].to_numpy()
    rdt  = r['PIT Date'].to_numpy()
    rval = r['Value'].to_numpy()

    # Run boundaries are read off the sorted key array itself
    r_starts = np.flatnonzero(np.r_[True, rk[1:] != rk[:-1]])
    r_stops  = np.append(r_starts[1:], len(rk))
    slices = {rk[s]: (rdt[s:e], rval[s:e]) for s, e in zip(r_starts, r_stops)}

    lk  = _key(l, by_cols).to_numpy()
    ldt = l['PIT Date'].to_numpy()

    # Stable sort of the left keys so that identical keys form contiguous blocks
    order = np.argsort(lk, kind='mergesort')
    sk, sd, sp = lk[order], ldt[order], l_pos[order]

    l_starts = np.flatnonzero(np.r_[True, sk[1:] != sk[:-1]])
    l_stops  = np.append(l_starts[1:], len(sk))

    for s, e in zip(l_starts, l_stops):
        hit = slices.get(sk[s])
        if hit is None:
            # key never appears on the right -> outputs stay NaN / NaT
            continue
        r_dates, r_vals = hit

        # side='right' gives the index of the first right date > left date;
        # minus 1 is therefore the last right date <= left date (-1: none).
        pos   = np.searchsorted(r_dates, sd[s:e], side='right') - 1
        valid = pos >= 0
        if valid.any():
            target = sp[s:e][valid]
            out_vals[target]  = r_vals[pos[valid]]
            out_dates[target] = r_dates[pos[valid]]

    return out_vals, out_dates


# -----------------------------------------------------------------------------
# Small helpers for QC statistics
# -----------------------------------------------------------------------------
def pctile(s: pd.Series, q: float):
    """
    Best-effort quantile: the linearly interpolated q-quantile of `s`,
    or NaN when the computation raises for any reason.
    """
    try:
        result = s.quantile(q, interpolation='linear')
    except Exception:
        # Deliberately tolerant: QC summaries should never abort the pipeline.
        result = np.nan
    return result


def summarize_pct(series: pd.Series):
    """
    Summarize the finite part of a numeric series.

    +/-inf are treated as missing and all missing values are dropped before
    any statistic is computed. An empty dict is returned when nothing is left.

    Keys of the returned dict, in order:
      - finite_rows: count of remaining observations
      - mean, median
      - winsorized_mean_1pct: mean after winsorizing 1% in each tail
      - p10, p20, ..., p90: deciles, computed through pctile()
    """
    finite = series.replace([np.inf, -np.inf], np.nan).dropna()
    if finite.empty:
        return {}

    # winsorize is handed its own writable copy of the values
    clipped = winsorize(finite.to_numpy().copy(), limits=[0.01, 0.01])

    stats = {
        "finite_rows": len(finite),
        "mean": finite.mean(),
        "median": finite.median(),
        "winsorized_mean_1pct": clipped.mean(),
    }
    # Deciles 10%..90% in ascending order
    for decile in range(10, 100, 10):
        stats[f"p{decile}"] = pctile(finite, decile / 100)
    return stats


# -----------------------------------------------------------------------------
# Period prioritization and label helper
# -----------------------------------------------------------------------------
_PERIOD_PRIORITY = {
    'A': 100,   # annual
    'Q4': 90,
    'T3': 80,
    'S2': 70,
    'Q3': 60,
    'T2': 50,
    'S1': 40,
    'Q2': 30,
    'T1': 20,
    'Q1': 10,
}


def _label_from_colname(colname: str) -> str:
    """
    Map a value column name to a period label.

    Currently this is a thin wrapper:
      - 'A' stays 'A'
      - 'Q1'..'Q4', 'S1'.., 'T1'.. remain unchanged.
    """
    return 'A' if colname == 'A' else colname


# -----------------------------------------------------------------------------
# AnnPITValue selection using OriginFP and priority rules
# -----------------------------------------------------------------------------
def pick_latest_nonzero_within_year(
    row,
    value_cols,
    date_cols,
    pit_col='PIT Date',
    fp_col='FiscalPeriod'
):
    """
    Choose one annualized value (AnnPITValue) and its period label for a row.

    Inputs read from `row`:
      - the PIT Date (`pit_col`) and FiscalPeriod (`fp_col`),
      - for every pair in zip(value_cols, date_cols): the value and its date,
      - optionally '<label>_OriginFP', the fiscal period the value came from
        (the row's own FiscalPeriod is used when it is missing).

    Candidate collection:
      A value/date pair becomes a candidate when both columns exist and the
      date is present, not after the PIT Date, and not more than 365 days
      before it. Each candidate is classified by its origin fiscal period
      relative to the row's FiscalPeriod (FP):
        same (origin == FP), prior (origin == FP - 1), other, or unknown
        (FP or origin not available).

    Selection, first non-empty tier wins; NaN and 0.0 values are skipped in
    tiers 1-4:
      1) same-year 'A'                  -> latest date
      2) same-year non-'A' periods      -> highest priority, then latest date
      3) prior-year 'A' (push-forward)  -> latest date
      4) any remaining candidate        -> highest priority, then latest date
      5) only zeros available           -> 0.0 from the highest priority,
                                           then latest date

    Returns
    -------
    (value, label)
      value : float or NaN
      label : str or NaN (e.g. 'A', 'Q4', 'S1', 'T3')
      (NaN, NaN) when the PIT Date is missing or nothing qualifies.
    """
    pit_date = row[pit_col]
    if pd.isna(pit_date):
        return (np.nan, np.nan)

    # Oldest date still accepted: one year (365 days) before the PIT Date
    window_start = pit_date - timedelta(days=365)

    # Row's own fiscal period as int, or None when missing / not convertible
    raw_fp = row.get(fp_col, np.nan)
    try:
        row_fp = None if pd.isna(raw_fp) else int(raw_fp)
    except Exception:
        row_fp = None

    # Each entry: (label, priority, date, value, year_relation)
    pool = []
    for value_name, date_name in zip(value_cols, date_cols):
        if not (value_name in row and date_name in row):
            continue

        raw_value = row[value_name]
        when = row[date_name]

        # Skip missing or future dates
        if pd.isna(when) or when > pit_date:
            continue

        when = pd.to_datetime(when, errors='coerce')
        if pd.isna(when):
            continue

        when = when.floor('D')
        if when < window_start:
            # more than one year before PIT
            continue

        period = _label_from_colname(value_name)
        rank = _PERIOD_PRIORITY.get(period, -1)

        amount = np.nan if pd.isna(raw_value) else float(raw_value)

        # Origin fiscal period: explicit column if filled, else the row's FP
        origin = row.get(f'{period}_OriginFP', np.nan)
        if pd.isna(origin):
            origin = row_fp
        try:
            if origin is None or pd.isna(origin):
                origin = None
            else:
                origin = int(origin)
        except Exception:
            origin = row_fp

        # Classify the origin year against the row's fiscal period
        if row_fp is None or origin is None:
            relation = 'unknown'
        elif origin == row_fp:
            relation = 'same'
        elif origin == row_fp - 1:
            relation = 'prior'
        else:
            relation = 'other'

        pool.append((period, rank, when, amount, relation))

    if not pool:
        return (np.nan, np.nan)

    def _by_date(cand):
        return cand[2]

    def _by_rank_then_date(cand):
        return (cand[1], cand[2])

    def _result(subset, sort_key):
        winner = max(subset, key=sort_key)
        return (winner[3], winner[0])

    # Candidates that carry a real (non-NaN, non-zero) value
    usable = [c for c in pool if not np.isnan(c[3]) and c[3] != 0.0]

    # Tier 1: annual value of the row's own fiscal year
    tier = [c for c in usable if c[0] == 'A' and c[4] == 'same']
    if tier:
        return _result(tier, _by_date)

    # Tier 2: partial periods (Q, S, T) of the row's own fiscal year
    tier = [c for c in usable if c[0] != 'A' and c[4] == 'same']
    if tier:
        return _result(tier, _by_rank_then_date)

    # Tier 3: annual value pushed forward from the previous fiscal year
    tier = [c for c in usable if c[0] == 'A' and c[4] == 'prior']
    if tier:
        return _result(tier, _by_date)

    # Tier 4: whatever usable candidate ranks best
    if usable:
        return _result(usable, _by_rank_then_date)

    # Tier 5: nothing but zeros (and/or NaN) left; report an explicit 0.0
    zero_only = [c for c in pool if c[3] == 0.0]
    if zero_only:
        return _result(zero_only, _by_rank_then_date)

    return (np.nan, np.nan)


# =============================================================================
# MAIN: annualized_encoded -> annualized_processed
# =============================================================================
if 'annualized_encoded' in globals() and annualized_encoded is not None:
    input_rows = len(annualized_encoded)
    print(f"Input dataset contains {input_rows:,} rows before processing.\n")

    # Work on a copy to avoid mutating the original DataFrame
    working = annualized_encoded.copy()

    # -------------------------------------------------------------------------
    # 1) Exclude frequencies that are not supported by this pipeline (E/L/R/U)
    # -------------------------------------------------------------------------
    excl_mask = working['Frequency'].astype(str).str.upper().isin(['E', 'L', 'R', 'U'])
    excluded_rows = int(excl_mask.sum())
    working = working.loc[~excl_mask].copy()

    # -------------------------------------------------------------------------
    # 2) Basic type normalization
    # -------------------------------------------------------------------------
    # PIT Date as datetime (day precision)
    working['PIT Date'] = pd.to_datetime(
        working['PIT Date'], format='%Y-%m-%d', errors='coerce'
    ).dt.floor('D')

    # FiscalPeriod and Value as numeric
    working['FiscalPeriod'] = pd.to_numeric(working['FiscalPeriod'], errors='coerce')
    working['Value']        = pd.to_numeric(working['Value'], errors='coerce')

    # Key-like columns as string (consistent joins and as-of keys)
    for c in ['ID', 'HistCurrency', 'ItemCode', 'Frequency', 'Str_FiscalPrd']:
        if c in working.columns:
            working[c] = working[c].astype(str)

    # -------------------------------------------------------------------------
    # 3) Parse Q/S/T sequence numbers from Str_FiscalPrd
    # -------------------------------------------------------------------------
    # Extract quarter index 1..4 from strings like "Q1Y23"
    working['QNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^Q([1-4])Y', expand=False),
        errors='coerce'
    )
    # Extract semiannual index 1..2 from "S1Y23"
    working['SNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^S([1-2])Y', expand=False),
        errors='coerce'
    )
    # Extract trimester index 1..3 from "T1Y23"
    working['TNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^T([1-3])Y', expand=False),
        errors='coerce'
    )

    # -------------------------------------------------------------------------
    # 4) Ensure A/Q/S/T value and date columns exist
    # -------------------------------------------------------------------------
    period_vals = [f'Q{i}' for i in range(1, 5)] + \
                  [f'S{i}' for i in range(1, 3)] + \
                  [f'T{i}' for i in range(1, 4)] + ['A']

    period_dates = [f'{p}_Date' for p in [f'Q{i}' for i in range(1, 5)] +
                                       [f'S{i}' for i in range(1, 3)] +
                                       [f'T{i}' for i in range(1, 4)]] + ['A_Date']

    # Create missing value/date columns initialized to NaN / NaT
    for c in period_vals:
        if c not in working.columns:
            working[c] = np.nan
    for c in period_dates:
        if c not in working.columns:
            working[c] = pd.NaT

    base_keys = ['ID', 'HistCurrency', 'ItemCode', 'FiscalPeriod']

    # -------------------------------------------------------------------------
    # 5) Build TrueValue from annual (A,B) rows
    # -------------------------------------------------------------------------
    # TrueValue is the last known annual value per (ID, FiscalPeriod, HistCurrency)
    mask_annual = working['Frequency'].isin(['A', 'B']) & working['Value'].notna()
    annual_src = (
        working.loc[mask_annual,
                    ['ID', 'FiscalPeriod', 'HistCurrency', 'PIT Date', 'Value']]
        .sort_values(['ID', 'FiscalPeriod', 'HistCurrency', 'PIT Date'])
        .drop_duplicates(['ID', 'FiscalPeriod', 'HistCurrency'], keep='last')
        .rename(columns={'Value': 'TrueValue', 'PIT Date': 'TrueValue_Date'})
    )
    working = working.merge(
        annual_src,
        on=['ID', 'FiscalPeriod', 'HistCurrency'],
        how='left'
    )

    # -------------------------------------------------------------------------
    # 6) As-of mapping for each frequency (same FiscalPeriod only)
    # -------------------------------------------------------------------------
    # 6.1 Annual as-of (A, B)
    src_A = working.loc[
        working['Frequency'].isin(['A', 'B']) & working['Value'].notna(),
        base_keys + ['PIT Date', 'Value']
    ].copy()
    vA, dA = asof_numpy(working, src_A, by_cols=base_keys)
    working['A'], working['A_Date'] = vA, dA
    working['A_OriginFP'] = np.where(
        working['A'].notna(), working['FiscalPeriod'], np.nan
    )

    # 6.2 Quarterly as-of (Q, C)
    src_Q = working.loc[
        working['Frequency'].isin(['Q', 'C']) & working['QNUM'].notna(),
        base_keys + ['QNUM', 'PIT Date', 'Value']
    ].copy()
    for q in (1, 2, 3, 4):
        rv = src_Q[src_Q['QNUM'] == q].drop(columns=['QNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'Q{q}', f'Q{q}_Date'
        working[col], working[dcol] = v, d
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        mask = working[col].notna() & working[ocol].isna()
        working.loc[mask, ocol] = working.loc[mask, 'FiscalPeriod']

    # 6.3 Semiannual as-of (S, F)
    src_S = working.loc[
        working['Frequency'].isin(['S', 'F']) & working['SNUM'].notna(),
        base_keys + ['SNUM', 'PIT Date', 'Value']
    ].copy()
    for s in (1, 2):
        rv = src_S[src_S['SNUM'] == s].drop(columns=['SNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'S{s}', f'S{s}_Date'
        working[col], working[dcol] = v, d
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        mask = working[col].notna() & working[ocol].isna()
        working.loc[mask, ocol] = working.loc[mask, 'FiscalPeriod']

    # 6.4 Trimester as-of (T, K)
    src_T = working.loc[
        working['Frequency'].isin(['T', 'K']) & working['TNUM'].notna(),
        base_keys + ['TNUM', 'PIT Date', 'Value']
    ].copy()
    for t in (1, 2, 3):
        rv = src_T[src_T['TNUM'] == t].drop(columns=['TNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'T{t}', f'T{t}_Date'
        working[col], working[dcol] = v, d
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        mask = working[col].notna() & working[ocol].isna()
        working.loc[mask, ocol] = working.loc[mask, 'FiscalPeriod']

    # -------------------------------------------------------------------------
    # 7) Normalize date columns (no forward-fill; only directly mapped values)
    # -------------------------------------------------------------------------
    working = working.sort_values(['ID', 'HistCurrency', 'FiscalPeriod', 'PIT Date'])

    value_labels  = period_vals
    date_labels   = period_dates
    origin_labels = [f'{lbl}_OriginFP' for lbl in value_labels]

    for c in date_labels:
        if c in working.columns:
            working[c] = pd.to_datetime(working[c], errors='coerce').dt.floor('D')

    # -------------------------------------------------------------------------
    # 8) Compute AnnPITValue and AnnPITValue_Period
    # -------------------------------------------------------------------------
    # NEW: we compute both the chosen annualized value and the period label
    # it came from (e.g., 'A', 'Q4', 'S1', 'T3') and store the label in
    # AnnPITValue_Period.
    ann_res = working.apply(
        lambda r: pd.Series(
            pick_latest_nonzero_within_year(
                r,
                value_cols=value_labels,
                date_cols=date_labels,
                pit_col='PIT Date',
                fp_col='FiscalPeriod'
            ),
            index=['AnnPITValue', 'AnnPITValue_Period']
        ),
        axis=1
    )
    working = pd.concat([working, ann_res], axis=1)

    # -------------------------------------------------------------------------
    # 9) Future-date QC check (period dates > PIT Date)
    # -------------------------------------------------------------------------
    date_cols_all = [
        'A_Date',
        'Q1_Date', 'Q2_Date', 'Q3_Date', 'Q4_Date',
        'S1_Date', 'S2_Date',
        'T1_Date', 'T2_Date', 'T3_Date'
    ]
    present = [c for c in date_cols_all if c in working.columns]
    viol_counts = {}
    any_mask = pd.Series(False, index=working.index)

    for c in present:
        m = (
            working[c].notna() &
            working['PIT Date'].notna() &
            (pd.to_datetime(working[c], errors='coerce') > working['PIT Date'])
        )
        viol_counts[c] = int(m.sum())
        any_mask |= m

    total_future_viol = int(any_mask.sum())
    print("\n=== Future-date check (period dates > PIT Date) ===")
    print("Per-label violations:", viol_counts)
    print(f"Rows with ANY future-dated period value: {total_future_viol}")
    working['HasFutureDateError'] = any_mask

    # -------------------------------------------------------------------------
    # 10) Compute AnnPITValue_Pct vs TrueValue and apply quality filter
    # -------------------------------------------------------------------------
    working['AnnPITValue_Pct'] = np.where(
        working['AnnPITValue'].notna() &
        working['TrueValue'].notna() &
        (working['TrueValue'] != 0),
        (working['AnnPITValue'] / working['TrueValue']) * 100,
        np.nan
    )

    pre_stats = summarize_pct(working['AnnPITValue_Pct'])
    print("\n=== AnnPITValue_Pct summary — BEFORE quality drop ===")
    for k, v in pre_stats.items():
        print(f"{k:>20}: {v}")

    pct = working['AnnPITValue_Pct']
    is_inf = np.isinf(pct)
    is_finite = np.isfinite(pct)
    out_of_range = is_finite & ((pct > 200) | (pct < 50))
    to_drop_quality = is_inf | out_of_range

    dropped_quality_rows = int(to_drop_quality.sum())
    print(f"\nRows to drop due to AnnPITValue_Pct (±inf or >200 or <50): {dropped_quality_rows:,}")

    working = working.loc[~to_drop_quality].copy()

    post_stats = summarize_pct(working['AnnPITValue_Pct'])
    print("\n=== AnnPITValue_Pct summary — AFTER quality drop ===")
    if post_stats:
        for k, v in post_stats.items():
            print(f"{k:>20}: {v}")
    else:
        print("No finite values remain after the quality drop.")

    # -------------------------------------------------------------------------
    # 11) Final columns and export
    # -------------------------------------------------------------------------
    base_cols = [
        'ID', 'CompanyName', 'ImplCountry', 'CurrentCurrency', 'HistCurrency',
        'PIT Date', 'Frequency', 'UpdateCode', 'FiscalPeriod', 'FYE Month',
        'ItemCode', 'Value', 'Str_FiscalPrd'
    ]

    freq_cols = []
    for i in range(1, 5):
        freq_cols += [f'Q{i}_Date', f'Q{i}']
    for i in range(1, 3):
        freq_cols += [f'S{i}_Date', f'S{i}']
    for i in range(1, 4):
        freq_cols += [f'T{i}_Date', f'T{i}']
    freq_cols += ['A_Date', 'A']

    # NEW: AnnPITValue_Period is included and explicitly placed
    # directly before AnnPITValue in the final column order.
    keep_cols = (
        [c for c in base_cols if c in working.columns] +
        ['TrueValue', 'AnnPITValue_Period', 'AnnPITValue',
         'AnnPITValue_Pct', 'HasFutureDateError'] +
        [c for c in freq_cols if c in working.columns]
    )

    # Drop helper columns that are only needed for intermediate computations
    drop_cols = [
        c for c in working.columns
        if c.endswith('_OriginFP') or c in ['QNUM', 'SNUM', 'TNUM', 'TrueValue_Date']
    ]
    working.drop(columns=drop_cols, inplace=True, errors='ignore')

    annualized_processed = working.reindex(columns=keep_cols)

    # -------------------------------------------------------------------------
    # 12) Save full and subset outputs
    # -------------------------------------------------------------------------
    assert 'Temp_file_path_DP' in globals(), "Temp_file_path_DP not found."
    assert 'base_output_filename' in globals(), "base_output_filename not found (set earlier)."

    out_full = os.path.join(Temp_file_path_DP, f"{base_output_filename}.txt")
    annualized_processed.to_csv(out_full, sep='|', index=False)
    print("\nSaved full:", out_full)

    # NEW: AnnPITValue_Period is included in the subset and appears before
    # AnnPITValue.
    subset_cols = [
        "ID", "PIT Date", "CompanyName", "HistCurrency",
        "FiscalPeriod", "AnnPITValue_Period", "AnnPITValue"
    ]
    subset_cols_existing = [col for col in subset_cols if col in annualized_processed.columns]
    subset_df = annualized_processed[subset_cols_existing].copy()
    out_subset = os.path.join(Temp_file_path_DP, f"{base_output_filename}_subset.txt")
    subset_df.to_csv(out_subset, sep='|', index=False)
    print("Saved subset:", out_subset)
    del subset_df

    # -------------------------------------------------------------------------
    # 13) Row-accounting overview
    # -------------------------------------------------------------------------
    output_rows = len(annualized_processed)
    print("\n=== Row Accounting ===")
    print(f"Input rows:                     {input_rows:,}")
    print(f"Excluded by Frequency (E/L/R/U):{excluded_rows:,}")
    print(f"Dropped by quality (Pct rules): {dropped_quality_rows:,}")
    print(f"Output rows (final):            {output_rows:,}")
    check_total = excluded_rows + dropped_quality_rows + output_rows
    print(f"Check: excluded + dropped + output = {check_total:,}")
    if check_total == input_rows:
        print("Row counts reconcile exactly.")
    else:
        print(f"Mismatch of {input_rows - check_total:+,} rows. Please investigate.")

    gc.collect()

else:
    print("annualized_encoded not found or None; skipping.")

Input dataset contains 3,452,774 rows before processing.


=== Future-date check (period dates > PIT Date) ===
Per-label violations: {'A_Date': 0, 'Q1_Date': 0, 'Q2_Date': 0, 'Q3_Date': 0, 'Q4_Date': 0, 'S1_Date': 0, 'S2_Date': 0, 'T1_Date': 0, 'T2_Date': 0, 'T3_Date': 0}
Rows with ANY future-dated period value: 0

=== AnnPITValue_Pct summary — BEFORE quality drop ===
         finite_rows: 2901116
                mean: 25288.08700486864
              median: 100.0
winsorized_mean_1pct: 102.10634046382053
                 p10: 86.05661339096373
                 p20: 98.49794910040086
                 p30: 100.0
                 p40: 100.0
                 p50: 100.0
                 p60: 100.0
                 p70: 100.0
                 p80: 101.44238389602155
                 p90: 115.48123728513532

Rows to drop due to AnnPITValue_Pct (±inf or >200 or <50): 124,089

=== AnnPITValue_Pct summary — AFTER quality drop ===
         finite_rows: 2777027
                mean: 101.1651566502

### Annualized 9

#### Set Index

In [197]:
# =============================================================================
# SELECT A SINGLE ANNUALIZED_* ITEM AND PREPARE PATHS
# =============================================================================
# This cell:
#   1. Selects which Annualized_* item (from annualized_vars) should be processed.
#   2. Validates that annualized_vars and Temp_file_path_DP are available.
#   3. Builds the input file path for the selected "work_subset_<item>.txt".
#   4. Defines a base_output_filename used later when saving processed results.
#   5. Ensures the data-preparation temp directory exists.
#
# Usage:
#   - Change `annualized_index` to process a different Annualized_* dataset
#     (e.g. 1, 2, 10, ...).
#   - Assumes `annualized_vars` was created earlier (mapping "Annualized_n" to
#     sanitized item names) and `Temp_file_path_DP` was set in your environment
#     setup cell.
# =============================================================================

import os
from pathlib import Path

# --- Selection ---------------------------------------------------------------
# Index of the Annualized_* entry to process; edit this value and re-run the
# cell to switch to another dataset.
annualized_index = 9

# The mapping {'Annualized_1': 'SomeItem', ...} has to exist already.
assert 'annualized_vars' in globals(), "annualized_vars dict not found in globals()."

item_key = f"Annualized_{annualized_index}"
target_item_name = annualized_vars.get(item_key)
# A missing key and an empty item name are both rejected by this check.
assert target_item_name, f"{item_key} not found in annualized_vars."
print(f"Selected: {item_key}  ->  ItemName: '{target_item_name}'")

# --- Paths -------------------------------------------------------------------
assert 'Temp_file_path_DP' in globals(), "Temp_file_path_DP not found."

# Stem shared by every output file written by the annualized pipeline.
base_output_filename = f"Annualized_{target_item_name}_complete"

# Input file written by the earlier preparation steps.
file_name = f"work_subset_{target_item_name}.txt"
file_path = os.path.join(Temp_file_path_DP, file_name)

# Create the data-preparation directory (and any parents) when it is missing.
Path(Temp_file_path_DP).mkdir(parents=True, exist_ok=True)


Selected: Annualized_9  ->  ItemName: 'Investments_in_Associated_Companies'


#### Import relevant data



In [198]:
# =============================================================================
# LOAD THE FULL DATASET FOR THE SELECTED SPECIAL ITEM (ANNUALIZED VERSION)
# =============================================================================
# This cell:
#   1. Uses `target_item_name` and `file_path` (defined in the previous cell)
#      to load the corresponding work_subset file.
#   2. Imports the file using `import_file_to_dataframe`.
#   3. Performs safety checks for existence and emptiness.
#   4. Shows a preview of the loaded dataset.
#   5. Falls back to an empty DataFrame if loading fails.
#   6. Runs garbage collection afterwards.
# =============================================================================

print(f"\nImporting full annualized dataset for Item: '{target_item_name}' ...")

if not os.path.exists(file_path):
    # Nothing on disk: continue with an empty frame so later cells can skip.
    print(f"File not found: {file_path}")
    annualized_raw = pd.DataFrame()
else:
    annualized_raw = import_file_to_dataframe(file_path)

    if annualized_raw is None or annualized_raw.empty:
        # The loader returned nothing usable; normalise to an empty frame.
        print("Annualized dataset appears empty or could not be loaded.")
        annualized_raw = pd.DataFrame()
    else:
        print(f"Full annualized dataset loaded successfully: {len(annualized_raw):,} rows total.")
        try:
            # Rich preview when a notebook display hook is available ...
            display(annualized_raw.head())
        except Exception:
            # ... plain-text preview otherwise.
            print(annualized_raw.head().to_string(index=False))

gc.collect()



Importing full annualized dataset for Item: 'Investments_in_Associated_Companies' ...
Full annualized dataset loaded successfully: 2,761,273 rows total.


Unnamed: 0,ID,CompanyName,ImplCountry,CurrentCurrency,HistCurrency,PIT Date,Frequency,UpdateCode,FiscalPeriod,FYE Month,ItemCode,Value
0,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1992,December,2256,52.290086
1,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1993,December,2256,12.353737
2,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1994,December,2256,15.324648
3,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1996-05-03,A,3,1995,December,2256,14.287399
4,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1998-07-03,A,3,1996,December,2256,74.116172


0

#### Encode Frequency Code (Check of output required!)

In [199]:
# =============================================================================
# FISCAL PERIOD ENCODING FOR ANNUALIZED DATASET
# =============================================================================
# This cell:
#   1. Defines helper functions:
#        - last2: extract last two digits of a number as a zero-padded string.
#        - add_str_fiscalprd: create Str_FiscalPrd from numeric FiscalPeriod
#          and Frequency, derive an implied full-year FiscalPeriod, and check
#          for inconsistencies on annual rows.
#   2. Applies this encoding to `annualized_raw` (if available) and stores
#      the result in `annualized_encoded`.
#   3. Shows a preview of the encoded DataFrame.
#
# Assumptions:
#   - `annualized_raw` has already been loaded in a previous cell.
#   - `target_item_name` is defined and is just used for printing context.
#   - DataFrame contains at least the columns: 'Frequency', 'FiscalPeriod'.
# =============================================================================

import numpy as np
import pandas as pd
from IPython.display import display


def last2(n):
    """
    Return the last two digits of a number as a zero-padded string.

    Examples:
        n = 2023 -> "23"
        n = 85   -> "85"
        n = 5    -> "05"
        n = NaN  -> None
    """
    if pd.isna(n):
        return None
    # Pad to at least four digits first so small values keep a leading zero,
    # then keep the last two characters.
    return f"{int(n):04d}"[-2:]


def add_str_fiscalprd(df: pd.DataFrame) -> pd.DataFrame:
    """
    Build 'Str_FiscalPrd' and overwrite 'FiscalPeriod' with an implied full year.

    Logic:
      1) Normalize Frequency to an uppercase string (missing -> "").
      2) For each row, interpret the numeric FiscalPeriod depending on
         Frequency and create a string fiscal-period label Str_FiscalPrd:
           - C, Q, E, R: quarter-based  -> "Q{1-4}Y{yy}"  (fp % 4 + 1, fp // 4)
           - A, B:       annual         -> "Y{yy}"        (fp itself)
           - F, S:       semiannual     -> "S{1-2}Y{yy}"  (fp % 2 + 1, fp // 2)
           - K, T, L, U: trimester-like -> "T{1-3}Y{yy}"  (fp % 3 + 1, fp // 3)
         Rows with any other Frequency keep a missing Str_FiscalPrd.
      3) Extract the "yy" part from Str_FiscalPrd and map it to a full year:
           yy >= 80 -> 19yy (e.g. "85" -> 1985)
           yy <  80 -> 20yy (e.g. "23" -> 2023)
         Rows without a two-digit year get NaN.
      4) For rows with annual frequency (A, B), compare the implied year with
         the original numeric FiscalPeriod and print a short discrepancy
         summary (two missing values count as agreement).
      5) Overwrite 'FiscalPeriod' with the implied year and drop the helper
         columns used for the check.

    Parameters:
      df : DataFrame with at least the columns 'Frequency' and 'FiscalPeriod'
           ('ID' is additionally needed to display discrepancies).  The input
           frame is not modified.

    Returns:
      A new DataFrame with:
        - 'Str_FiscalPrd' (object dtype; strings or NaN)
        - updated 'FiscalPeriod' (implied full year, NaN where undetermined)
    """
    df = df.copy()

    # Normalize frequency codes for consistent logic
    df["Frequency"] = df["Frequency"].str.upper().fillna("")

    # Keep the original FiscalPeriod so it can be shown in the discrepancy table
    df["Original_FiscalPeriod"] = df["FiscalPeriod"]

    # Numeric version of FiscalPeriod for modular arithmetic
    fp = pd.to_numeric(df["FiscalPeriod"], errors="coerce")

    # Frequency masks
    m_quarter = df["Frequency"].isin(["C", "Q", "E", "R"])
    m_AB      = df["Frequency"].isin(["A", "B"])
    m_FS      = df["Frequency"].isin(["F", "S"])
    m_KTLU    = df["Frequency"].isin(["K", "T", "L", "U"])

    # Initialize the label column with object dtype.  A float NaN column would
    # (a) make pandas warn about an incompatible dtype as soon as strings are
    # written into it and (b) break the `.str` accessor below when no row
    # matches any frequency group.
    df["Str_FiscalPrd"] = np.full(len(df), np.nan, dtype=object)

    # --- Quarter-based (C, Q, E, R) ---
    # Quarter number: 1..4
    q_part = ((fp % 4) + 1).where(m_quarter)
    # Year component (integer), then reduced to last 2 digits
    q_year = (fp // 4).where(m_quarter).apply(last2)
    df.loc[m_quarter, "Str_FiscalPrd"] = (
        "Q" + q_part.astype("Int64").astype(str) + "Y" + q_year.fillna("")
    )

    # --- Annual (A, B) ---
    ab_year = fp.where(m_AB).apply(last2)
    df.loc[m_AB, "Str_FiscalPrd"] = "Y" + ab_year.fillna("")

    # --- Semiannual (F, S) ---
    fs_sem  = ((fp % 2) + 1).where(m_FS)     # semester index 1 or 2
    fs_year = (fp // 2).where(m_FS).apply(last2)
    df.loc[m_FS, "Str_FiscalPrd"] = (
        "S" + fs_sem.astype("Int64").astype(str) + "Y" + fs_year.fillna("")
    )

    # --- Trimester-like (K, T, L, U) ---
    t_term = ((fp % 3) + 1).where(m_KTLU)    # term index 1..3
    t_year = (fp // 3).where(m_KTLU).apply(last2)
    df.loc[m_KTLU, "Str_FiscalPrd"] = (
        "T" + t_term.astype("Int64").astype(str) + "Y" + t_year.fillna("")
    )

    # --- Derive implied full-year FiscalPeriod from Str_FiscalPrd ---
    # Extract the "yy" part following "Y" in labels like "Q1Y23", "Y21", etc.
    year_part = df["Str_FiscalPrd"].str.extract(r"Y(\d{2})", expand=False)
    year_numeric = pd.to_numeric(year_part, errors="coerce")

    # Map yy to either 19yy or 20yy, depending on the cutoff at 80.
    # NaN compares False, lands in the 2000 branch and stays NaN after the sum.
    century = np.where(year_numeric >= 80, 1900, 2000)
    df["ImplFiscPer_Calculated"] = year_numeric + century

    # --- Discrepancy check for annual rows (A,B only) ---
    # `fp` already is the numeric form of the original FiscalPeriod.
    implied = df["ImplFiscPer_Calculated"]
    agrees = (implied == fp) | (implied.isna() & fp.isna())
    discrepancy_rows = df.loc[m_AB & ~agrees]

    if not discrepancy_rows.empty:
        print(
            "\nDiscrepancies between original FiscalPeriod and calculated "
            "ImplFiscPer for annual (A, B) rows:"
        )
        display(
            discrepancy_rows[
                ["ID", "Frequency", "Original_FiscalPeriod",
                 "Str_FiscalPrd", "ImplFiscPer_Calculated"]
            ].head()
        )
        print(f"Total discrepancies for annual frequencies: {len(discrepancy_rows)}")
    else:
        print(
            "\nNo discrepancies between original FiscalPeriod and calculated "
            "ImplFiscPer for annual (A, B) rows."
        )

    # Overwrite FiscalPeriod with the implied year
    df["FiscalPeriod"] = df["ImplFiscPer_Calculated"]

    # Remove helper columns that are no longer needed
    df.drop(columns=["Original_FiscalPeriod", "ImplFiscPer_Calculated"], inplace=True)

    return df


# -----------------------------------------------------------------------------
# Run the fiscal-period encoding on the loaded Annualized dataset
# -----------------------------------------------------------------------------
if "annualized_raw" not in globals() or annualized_raw is None or annualized_raw.empty:
    # Nothing to encode: leave a None marker for the cells that follow.
    print("annualized_raw not found or empty. Cannot perform encoding.")
    annualized_encoded = None
else:
    print(f"Applying fiscal period encoding to Annualized dataset for '{target_item_name}' ...")
    annualized_encoded = add_str_fiscalprd(annualized_raw)
    display(annualized_encoded.head())


Applying fiscal period encoding to Annualized dataset for 'Investments_in_Associated_Companies' ...


  df.loc[m_quarter, "Str_FiscalPrd"] = (



No discrepancies between original FiscalPeriod and calculated ImplFiscPer for annual (A, B) rows.


Unnamed: 0,ID,CompanyName,ImplCountry,CurrentCurrency,HistCurrency,PIT Date,Frequency,UpdateCode,FiscalPeriod,FYE Month,ItemCode,Value,Str_FiscalPrd
0,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1992,December,2256,52.290086,Y92
1,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1993,December,2256,12.353737,Y93
2,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1994,December,2256,15.324648,Y94
3,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1996-05-03,A,3,1995,December,2256,14.287399,Y95
4,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1998-07-03,A,3,1996,December,2256,74.116172,Y96


#### Annualize data with most recent information (Check of output required!)

In [200]:
# =============================================================================
# ANNUALIZED PIPELINE: BUILD AnnPITValue FROM A/Q/S/T, QC, AND SAVE OUTPUT
# =============================================================================
# High-level overview:
#
#   1. Helper utilities
#      - _key, asof_numpy:
#          * Implement a fast, vectorized “as-of” join:
#              For each row in a left DataFrame, find the latest value in a
#              right DataFrame with the same keys and PIT Date <= left PIT Date.
#      - pctile, summarize_pct:
#          * Compute robust summary statistics for quality control, including
#            winsorized mean and decile percentiles.
#      - pick_latest_nonzero_within_year:
#          * For a given PIT Date and FiscalPeriod, evaluate all available
#            period values (A, Q1..Q4, S1..S2, T1..T3) with known origin
#            fiscal periods.
#          * Choose a single value as AnnPITValue based on:
#               - same-year vs prior-year vs other origin
#               - period priority (A > Q4 > T3 > S2 > Q3 > ... > Q1)
#               - the latest date within a one-year window before PIT.
#          * NEW: returns both the value and the period label from which it
#                 was chosen (AnnPITValue_Period).
#
#   2. Main pipeline for annualized_encoded:
#      - Filter out unsupported frequencies (E/L/R/U).
#      - Normalize types for PIT Date, FiscalPeriod, Value, and key columns.
#      - Derive QNUM, SNUM, TNUM indices from Str_FiscalPrd.
#      - Ensure all period- and date-columns (A/Q/S/T) exist.
#      - Build TrueValue from annual rows (A,B) as the last observed annual
#        value per (ID, FiscalPeriod, HistCurrency).
#      - Use asof_numpy to populate:
#           A, A_Date, A_OriginFP
#           Q1..Q4, S1..S2, T1..T3 and their dates + OriginFP (from origin FP).
#      - For each row, compute:
#           * AnnPITValue        = chosen value
#           * AnnPITValue_Period = 'A', 'Q4', 'S1', 'T3', etc.
#      - Check for any period dates that are after PIT Date (future-date errors).
#      - Compute AnnPITValue_Pct = AnnPITValue / TrueValue * 100 and drop rows
#        outside [50, 200] or with infinite ratios.
#      - Build a final, lean set of columns and save:
#           * full file:   <base_output_filename>.txt
#           * subset file: <base_output_filename>_subset.txt
#        NEW: AnnPITValue_Period is placed directly before AnnPITValue and
#             included in both full and subset outputs.
#      - Print a row-accounting overview for reconciliation.
# =============================================================================

import os
import gc
from datetime import timedelta

import numpy as np
import pandas as pd
from scipy.stats.mstats import winsorize

pd.options.mode.copy_on_write = True


# -----------------------------------------------------------------------------
# Helper: build a single key column from multiple columns
# -----------------------------------------------------------------------------
def _key(fr: pd.DataFrame, cols):
    """
    Build a composite string key by concatenating several columns with '||'.

    This is used to group records by (ID, HistCurrency, ItemCode, FiscalPeriod)
    as a single vectorizable key for the as-of join.

    Example:
        _key(df, ['ID', 'HistCurrency']) -> "123||USD"
    """
    return fr[cols].astype(str).agg('||'.join, axis=1)


# -----------------------------------------------------------------------------
# Helper: fast as-of join (right.PIT <= left.PIT)
# -----------------------------------------------------------------------------
def asof_numpy(left_df: pd.DataFrame, right_df: pd.DataFrame, by_cols: list[str]):
    """
    For each row in left_df, find the latest (as-of) Value from right_df such that:

        1) by_cols are equal on both sides (e.g. ID, HistCurrency, ItemCode, FiscalPeriod)
        2) right_df['PIT Date'] <= left_df['PIT Date']

    Implementation notes:
      - Both left and right PIT Date columns are converted to datetime and floored to days.
      - A composite key '__k' (string) is built from by_cols on both dataframes.
      - The right-hand dataframe is sorted by key and PIT Date.
      - For each distinct key, we keep a slice of PIT Date and Value arrays.
      - Left-hand keys are sorted; for each group of identical keys we:
          * binary-search in the right PIT Date array via np.searchsorted
            to find the index of the last PIT Date <= each left PIT Date.
          * fill out_vals and out_dates at the original left index positions.

    Returns
    -------
    out_vals : np.ndarray
        Array of matched values (float64), default NaN where no match.
    out_dates : np.ndarray
        Array of matched dates (datetime64[ns]), default NaT where no match.
    """
    # Initialize output arrays with NaNs/NaT
    out_vals  = np.full(len(left_df), np.nan, dtype='float64')
    out_dates = np.full(len(left_df), 'NaT', dtype='datetime64[ns]')

    # Required columns: keys plus PIT Date and Value on the right
    left_req  = by_cols + ['PIT Date']
    right_req = by_cols + ['PIT Date', 'Value']

    # Drop rows with missing key or PIT Date on either side
    lmask = left_df[left_req].notna().all(axis=1)
    rmask = right_df[right_req].notna().all(axis=1)
    if not lmask.any() or not rmask.any():
        return out_vals, out_dates

    l = left_df.loc[lmask, left_req].copy()
    r = right_df.loc[rmask, right_req].copy()

    # Normalize PIT Date columns to datetime, day precision
    l['PIT Date'] = pd.to_datetime(l['PIT Date'], errors='coerce').dt.floor('D')
    r['PIT Date'] = pd.to_datetime(r['PIT Date'], errors='coerce').dt.floor('D')

    # Composite keys for grouping
    l['__k'] = _key(l, by_cols)
    r['__k'] = _key(r, by_cols)

    # Sort right by key and PIT Date so we can slice by key and binary-search by date
    r = r.sort_values(['__k', 'PIT Date']).reset_index(drop=True)

    # Convert right side to NumPy arrays
    rk   = r['__k'].to_numpy()
    rdt  = r['PIT Date'].to_numpy()
    rval = r['Value'].to_numpy()

    # Find unique keys and the start index of each key block in rk
    uniq, first = np.unique(rk, return_index=True)

    # Pre-slice rdt, rval for each key to avoid repeated filtering
    slices = {}
    for i, k in enumerate(uniq):
        s = first[i]
        e = first[i + 1] if i + 1 < len(first) else len(r)
        slices[k] = (rdt[s:e], rval[s:e])

    # Original indices of the filtered left rows
    l_idx = l.index.to_numpy()
    lk    = l['__k'].to_numpy()
    ldt   = l['PIT Date'].to_numpy()

    # Sort left keys so that identical keys form contiguous blocks
    order = np.argsort(lk, kind='mergesort')
    sk, sd, sp = lk[order], ldt[order], l_idx[order]

    # Process each contiguous block of the same key
    i = 0
    n = len(sk)
    while i < n:
        k = sk[i]
        j = i + 1
        # identify the block [i, j) with the same key
        while j < n and sk[j] == k:
            j += 1

        block_dates = sd[i:j]
        block_pos   = sp[i:j]

        if k in slices:
            r_dates, r_vals = slices[k]
            # For each left date, search the insertion position in right dates
            # side='right' gives index of first element > date, minus 1 =>
            # index of the last element <= date.
            pos   = np.searchsorted(r_dates, block_dates, side='right') - 1
            valid = pos >= 0
            if np.any(valid):
                out_vals[block_pos[valid]]  = r_vals[pos[valid]]
                out_dates[block_pos[valid]] = r_dates[pos[valid]]
        i = j

    return out_vals, out_dates


# -----------------------------------------------------------------------------
# Small helpers for QC statistics
# -----------------------------------------------------------------------------
def pctile(s: pd.Series, q: float):
    """
    Return the q-quantile of `s` (linear interpolation).

    Any exception raised while computing the quantile is swallowed and NaN
    is returned instead, so a QC summary never aborts because of one value.
    """
    try:
        quantile_value = s.quantile(q, interpolation='linear')
    except Exception:
        return np.nan
    else:
        return quantile_value


def summarize_pct(series: pd.Series):
    """
    Summarize a numeric series for QC, looking only at finite observations.

    +/-inf entries are turned into NaN and all NaN entries are dropped before
    anything is computed.  When nothing is left, an empty dict is returned.

    Keys of the returned dictionary (in this order):
      - finite_rows: count of the remaining observations
      - mean, median
      - winsorized_mean_1pct: mean after winsorizing 1% in each tail
      - p10, p20, ..., p90: the nine decile quantiles
    """
    finite = series.replace([np.inf, -np.inf], np.nan).dropna()
    if finite.empty:
        return {}

    # winsorize() is handed its own copy of the data so that it receives a
    # writable array.
    clipped = winsorize(finite.to_numpy().copy(), limits=[0.01, 0.01])

    stats = {
        "finite_rows": len(finite),
        "mean": finite.mean(),
        "median": finite.median(),
        "winsorized_mean_1pct": clipped.mean(),
    }
    for decile in range(10, 100, 10):
        stats[f"p{decile}"] = pctile(finite, decile / 100)
    return stats


# -----------------------------------------------------------------------------
# Period prioritization and label helper
# -----------------------------------------------------------------------------
# Rank of each period label when several period values compete for one row:
# the candidate with the larger number wins. Labels that are missing from
# this table are ranked -1 by pick_latest_nonzero_within_year.
_PERIOD_PRIORITY = {
    'A': 100,   # annual
    'Q4': 90,
    'T3': 80,
    'S2': 70,
    'Q3': 60,
    'T2': 50,
    'S1': 40,
    'Q2': 30,
    'T1': 20,
    'Q1': 10,
}


def _label_from_colname(colname: str) -> str:
    """
    Map a value column name to its period label.

    Currently a pass-through: 'A' stays 'A', and every other name
    ('Q1'..'Q4', 'S1'.., 'T1'..) is returned unchanged.
    """
    return 'A' if colname == 'A' else colname


# -----------------------------------------------------------------------------
# AnnPITValue selection using OriginFP and priority rules
# -----------------------------------------------------------------------------
def pick_latest_nonzero_within_year(
    row,
    value_cols,
    date_cols,
    pit_col='PIT Date',
    fp_col='FiscalPeriod'
):
    """
    Select a single annualized value (AnnPITValue) for one row.

    Parameters
    ----------
    row : pandas.Series
        One record. It must contain ``pit_col``; ``fp_col``, the value/date
        columns and the optional ``<label>_OriginFP`` columns are looked up
        when present.
    value_cols, date_cols : sequences of str
        Parallel lists: ``value_cols[i]`` holds a period value and
        ``date_cols[i]`` the date of that value. Pairs for which either
        column is absent from ``row`` are skipped.
    pit_col : str
        Name of the point-in-time date column.
    fp_col : str
        Name of the fiscal-period (year) column of the row.

    The logic:
      1) A (value, date) pair becomes a candidate only if its date
           - is not missing and can be converted to a timestamp,
           - is <= PIT Date,
           - is, at day precision, >= PIT Date - 365 days.
         Dates that are not yet timestamps (e.g. ISO strings) are converted
         before any comparison takes place.
      2) Each candidate is related to the row's FiscalPeriod (FP) through its
         origin fiscal period (``<label>_OriginFP``):
           - same   : OriginFP == FP
           - prior  : OriginFP == FP - 1
           - other  : everything else
           - unknown: the row's FP is missing or not convertible to int
         A missing (or non-convertible) OriginFP falls back to the row's FP,
         so such a candidate counts as 'same' whenever FP is known.
      3) NaN and 0.0 values are ignored while selecting. If only zeros are
         available, 0.0 is returned together with the label of the best
         zero. If nothing usable exists, (NaN, NaN) is returned.
      4) Selection priority among the non-zero candidates:
           - same-year Annual ('A', 'same'): latest date;
           - else same-year partials (Q/S/T, 'same'): highest priority
             (e.g. Q4 > Q3 > ...), then latest date;
           - else prior-year Annual ('A', 'prior'): latest date
             (push-forward);
           - else any candidate: highest priority, then latest date.

    Returns
    -------
    (value, label)
      value : float or NaN
      label : str or NaN (e.g. 'A', 'Q4', 'S1', 'T3')
    """
    pit = row[pit_col]
    if pd.isna(pit):
        return (np.nan, np.nan)

    cutoff = pit - timedelta(days=365)

    # Fiscal period of the row itself; None when missing or not convertible.
    fp = row.get(fp_col, np.nan)
    try:
        fp_int = None if pd.isna(fp) else int(fp)
    except (TypeError, ValueError, OverflowError):
        fp_int = None

    # Candidate layout: (label, priority, date, value, year_relation)
    candidates = []
    for vcol, dcol in zip(value_cols, date_cols):
        if vcol not in row or dcol not in row:
            continue

        val = row[vcol]
        dt = row[dcol]
        if pd.isna(dt):
            continue

        # Coerce BEFORE comparing: ordering a str (or other non-timestamp)
        # against a Timestamp raises TypeError. Values that already are
        # Timestamps skip the comparatively slow conversion.
        if not isinstance(dt, pd.Timestamp):
            dt = pd.to_datetime(dt, errors='coerce')
            if pd.isna(dt):
                continue

        # Ignore future dates
        if dt > pit:
            continue

        dt = dt.floor('D')
        if dt < cutoff:
            # older than 1 year before PIT
            continue

        # Map column name to period label (A, Q1..Q4, etc.) and priority
        label = _label_from_colname(vcol)
        prio = _PERIOD_PRIORITY.get(label, -1)

        # Convert value to float for numeric comparisons
        vnum = float(val) if pd.notna(val) else np.nan

        # Origin fiscal period; falls back to the row's FP when it is not
        # stored or cannot be converted to int.
        raw_origin = row.get(f'{label}_OriginFP', np.nan)
        if pd.isna(raw_origin):
            origin_fp = fp_int
        else:
            try:
                origin_fp = int(raw_origin)
            except (TypeError, ValueError, OverflowError):
                origin_fp = fp_int

        # Relationship between origin fiscal period and current FP
        if fp_int is None or origin_fp is None:
            year_rel = 'unknown'
        elif origin_fp == fp_int:
            year_rel = 'same'
        elif origin_fp == fp_int - 1:
            year_rel = 'prior'
        else:
            year_rel = 'other'

        candidates.append((label, prio, dt, vnum, year_rel))

    if not candidates:
        return (np.nan, np.nan)

    def _pick(pool, by_priority):
        """Return (value, label) of the best entry of a non-empty pool."""
        if by_priority:
            best = max(pool, key=lambda c: (c[1], c[2]))  # priority, then date
        else:
            best = max(pool, key=lambda c: c[2])          # latest date
        return (best[3], best[0])

    # Only non-NaN, non-zero values are considered as strong candidates
    nonzero = [c for c in candidates if not np.isnan(c[3]) and c[3] != 0.0]

    # 1) Same-year Annual A: prefer the latest annual that matches the row's FP
    pool = [c for c in nonzero if c[0] == 'A' and c[4] == 'same']
    if pool:
        return _pick(pool, by_priority=False)

    # 2) Same-year partial periods (Q, S, T) if no same-year A is available
    pool = [c for c in nonzero if c[0] != 'A' and c[4] == 'same']
    if pool:
        return _pick(pool, by_priority=True)

    # 3) Prior-year annual push-forward: last annual from previous FP
    pool = [c for c in nonzero if c[0] == 'A' and c[4] == 'prior']
    if pool:
        return _pick(pool, by_priority=False)

    # 4) Fallback: any non-zero candidate by (priority, date)
    if nonzero:
        return _pick(nonzero, by_priority=True)

    # Only zeros and/or NaNs are left. Return 0.0 explicitly and keep label.
    zeros = [c for c in candidates if not np.isnan(c[3]) and c[3] == 0.0]
    if zeros:
        return _pick(zeros, by_priority=True)

    return (np.nan, np.nan)


# =============================================================================
# MAIN: annualized_encoded -> annualized_processed
# =============================================================================
# Builds `annualized_processed` from `annualized_encoded` (skipped with a
# message when that frame is undefined or None). In order, the cell:
#   1-2)  drops unsupported frequencies and normalizes column types,
#   3-4)  parses Q/S/T sequence numbers and guarantees the period columns,
#   5)    attaches TrueValue (latest annual value per ID/FiscalPeriod/currency),
#   6)    fills A/Q*/S*/T* values and dates through asof_numpy,
#   7-8)  normalizes dates and picks AnnPITValue per row,
#   9-10) runs the future-date QC and the AnnPITValue_Pct quality filter,
#   11-13) selects the final columns, writes two '|'-separated files and
#          prints a row-count reconciliation.
# Everything assigned below lives at module level (this is a top-level `if`).
if 'annualized_encoded' in globals() and annualized_encoded is not None:
    input_rows = len(annualized_encoded)
    print(f"Input dataset contains {input_rows:,} rows before processing.\n")

    # Work on a copy to avoid mutating the original DataFrame
    working = annualized_encoded.copy()

    # -------------------------------------------------------------------------
    # 1) Exclude frequencies that are not supported by this pipeline (E/L/R/U)
    # -------------------------------------------------------------------------
    # Frequency is upper-cased for the comparison only; the column itself is
    # not modified here.
    excl_mask = working['Frequency'].astype(str).str.upper().isin(['E', 'L', 'R', 'U'])
    excluded_rows = int(excl_mask.sum())
    working = working.loc[~excl_mask].copy()

    # -------------------------------------------------------------------------
    # 2) Basic type normalization
    # -------------------------------------------------------------------------
    # PIT Date as datetime (day precision); with errors='coerce', values that
    # do not parse as %Y-%m-%d become NaT.
    working['PIT Date'] = pd.to_datetime(
        working['PIT Date'], format='%Y-%m-%d', errors='coerce'
    ).dt.floor('D')

    # FiscalPeriod and Value as numeric (unparseable entries become NaN)
    working['FiscalPeriod'] = pd.to_numeric(working['FiscalPeriod'], errors='coerce')
    working['Value']        = pd.to_numeric(working['Value'], errors='coerce')

    # Key-like columns as string (consistent joins and as-of keys)
    # NOTE(review): astype(str) also stringifies missing values (NaN becomes
    # the text 'nan'); confirm that the key columns contain no missing entries.
    for c in ['ID', 'HistCurrency', 'ItemCode', 'Frequency', 'Str_FiscalPrd']:
        if c in working.columns:
            working[c] = working[c].astype(str)

    # -------------------------------------------------------------------------
    # 3) Parse Q/S/T sequence numbers from Str_FiscalPrd
    # -------------------------------------------------------------------------
    # Each pattern is anchored at the start of the string; rows that do not
    # match end up with NaN in the corresponding *NUM column.
    # Extract quarter index 1..4 from strings like "Q1Y23"
    working['QNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^Q([1-4])Y', expand=False),
        errors='coerce'
    )
    # Extract semiannual index 1..2 from "S1Y23"
    working['SNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^S([1-2])Y', expand=False),
        errors='coerce'
    )
    # Extract trimester index 1..3 from "T1Y23"
    working['TNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^T([1-3])Y', expand=False),
        errors='coerce'
    )

    # -------------------------------------------------------------------------
    # 4) Ensure A/Q/S/T value and date columns exist
    # -------------------------------------------------------------------------
    # period_vals and period_dates are parallel lists (same order: Q1..Q4,
    # S1..S2, T1..T3, A); step 8 pairs them by position.
    period_vals = [f'Q{i}' for i in range(1, 5)] + \
                  [f'S{i}' for i in range(1, 3)] + \
                  [f'T{i}' for i in range(1, 4)] + ['A']

    period_dates = [f'{p}_Date' for p in [f'Q{i}' for i in range(1, 5)] +
                                       [f'S{i}' for i in range(1, 3)] +
                                       [f'T{i}' for i in range(1, 4)]] + ['A_Date']

    # Create missing value/date columns initialized to NaN / NaT
    for c in period_vals:
        if c not in working.columns:
            working[c] = np.nan
    for c in period_dates:
        if c not in working.columns:
            working[c] = pd.NaT

    # Grouping keys handed to asof_numpy in step 6
    base_keys = ['ID', 'HistCurrency', 'ItemCode', 'FiscalPeriod']

    # -------------------------------------------------------------------------
    # 5) Build TrueValue from annual (A,B) rows
    # -------------------------------------------------------------------------
    # TrueValue is the last known annual value per (ID, FiscalPeriod, HistCurrency)
    # ("last" = latest PIT Date after sorting).
    # NOTE(review): unlike base_keys, this key does not include ItemCode;
    # confirm that the frame holds a single ItemCode, otherwise TrueValue may
    # come from a different item.
    mask_annual = working['Frequency'].isin(['A', 'B']) & working['Value'].notna()
    annual_src = (
        working.loc[mask_annual,
                    ['ID', 'FiscalPeriod', 'HistCurrency', 'PIT Date', 'Value']]
        .sort_values(['ID', 'FiscalPeriod', 'HistCurrency', 'PIT Date'])
        .drop_duplicates(['ID', 'FiscalPeriod', 'HistCurrency'], keep='last')
        .rename(columns={'Value': 'TrueValue', 'PIT Date': 'TrueValue_Date'})
    )
    # annual_src is unique on the join keys, so this left merge adds the two
    # TrueValue columns without changing the number of rows.
    working = working.merge(
        annual_src,
        on=['ID', 'FiscalPeriod', 'HistCurrency'],
        how='left'
    )

    # -------------------------------------------------------------------------
    # 6) As-of mapping for each frequency (same FiscalPeriod only)
    # -------------------------------------------------------------------------
    # asof_numpy is defined elsewhere in this file. It is called with the
    # target frame, a source frame and the grouping keys, and its two results
    # are stored as the value column and the date column.
    # NOTE(review): presumably both results are aligned with the rows of
    # `working` -- confirm against the helper.
    # 6.1 Annual as-of (A, B)
    src_A = working.loc[
        working['Frequency'].isin(['A', 'B']) & working['Value'].notna(),
        base_keys + ['PIT Date', 'Value']
    ].copy()
    vA, dA = asof_numpy(working, src_A, by_cols=base_keys)
    working['A'], working['A_Date'] = vA, dA
    # Origin fiscal period of A = the row's own FiscalPeriod wherever A is set
    working['A_OriginFP'] = np.where(
        working['A'].notna(), working['FiscalPeriod'], np.nan
    )

    # 6.2 Quarterly as-of (Q, C)
    src_Q = working.loc[
        working['Frequency'].isin(['Q', 'C']) & working['QNUM'].notna(),
        base_keys + ['QNUM', 'PIT Date', 'Value']
    ].copy()
    for q in (1, 2, 3, 4):
        # Source rows of this quarter only; QNUM is not passed to asof_numpy
        rv = src_Q[src_Q['QNUM'] == q].drop(columns=['QNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'Q{q}', f'Q{q}_Date'
        working[col], working[dcol] = v, d
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        # Fill the origin FP only where a value exists and no origin is set yet
        mask = working[col].notna() & working[ocol].isna()
        working.loc[mask, ocol] = working.loc[mask, 'FiscalPeriod']

    # 6.3 Semiannual as-of (S, F) -- same procedure as 6.2
    src_S = working.loc[
        working['Frequency'].isin(['S', 'F']) & working['SNUM'].notna(),
        base_keys + ['SNUM', 'PIT Date', 'Value']
    ].copy()
    for s in (1, 2):
        rv = src_S[src_S['SNUM'] == s].drop(columns=['SNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'S{s}', f'S{s}_Date'
        working[col], working[dcol] = v, d
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        mask = working[col].notna() & working[ocol].isna()
        working.loc[mask, ocol] = working.loc[mask, 'FiscalPeriod']

    # 6.4 Trimester as-of (T, K) -- same procedure as 6.2
    src_T = working.loc[
        working['Frequency'].isin(['T', 'K']) & working['TNUM'].notna(),
        base_keys + ['TNUM', 'PIT Date', 'Value']
    ].copy()
    for t in (1, 2, 3):
        rv = src_T[src_T['TNUM'] == t].drop(columns=['TNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'T{t}', f'T{t}_Date'
        working[col], working[dcol] = v, d
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        mask = working[col].notna() & working[ocol].isna()
        working.loc[mask, ocol] = working.loc[mask, 'FiscalPeriod']

    # -------------------------------------------------------------------------
    # 7) Normalize date columns (no forward-fill; only directly mapped values)
    # -------------------------------------------------------------------------
    working = working.sort_values(['ID', 'HistCurrency', 'FiscalPeriod', 'PIT Date'])

    value_labels  = period_vals
    date_labels   = period_dates
    # origin_labels is not referenced again within this block
    origin_labels = [f'{lbl}_OriginFP' for lbl in value_labels]

    # All period date columns become datetime at day precision
    for c in date_labels:
        if c in working.columns:
            working[c] = pd.to_datetime(working[c], errors='coerce').dt.floor('D')

    # -------------------------------------------------------------------------
    # 8) Compute AnnPITValue and AnnPITValue_Period
    # -------------------------------------------------------------------------
    # NEW: we compute both the chosen annualized value and the period label
    # it came from (e.g., 'A', 'Q4', 'S1', 'T3') and store the label in
    # AnnPITValue_Period.
    # NOTE(review): this is a Python-level call per row that also builds one
    # pd.Series per row -- likely the slowest step of the cell on large inputs.
    ann_res = working.apply(
        lambda r: pd.Series(
            pick_latest_nonzero_within_year(
                r,
                value_cols=value_labels,
                date_cols=date_labels,
                pit_col='PIT Date',
                fp_col='FiscalPeriod'
            ),
            index=['AnnPITValue', 'AnnPITValue_Period']
        ),
        axis=1
    )
    # ann_res shares the index of `working`, so the two new columns line up
    working = pd.concat([working, ann_res], axis=1)

    # -------------------------------------------------------------------------
    # 9) Future-date QC check (period dates > PIT Date)
    # -------------------------------------------------------------------------
    # Purely diagnostic: violations are counted and flagged in
    # HasFutureDateError, but no row is removed here.
    date_cols_all = [
        'A_Date',
        'Q1_Date', 'Q2_Date', 'Q3_Date', 'Q4_Date',
        'S1_Date', 'S2_Date',
        'T1_Date', 'T2_Date', 'T3_Date'
    ]
    present = [c for c in date_cols_all if c in working.columns]
    viol_counts = {}
    any_mask = pd.Series(False, index=working.index)

    for c in present:
        # The date columns were already converted in step 7; to_datetime is
        # applied once more before the comparison.
        m = (
            working[c].notna() &
            working['PIT Date'].notna() &
            (pd.to_datetime(working[c], errors='coerce') > working['PIT Date'])
        )
        viol_counts[c] = int(m.sum())
        any_mask |= m

    total_future_viol = int(any_mask.sum())
    print("\n=== Future-date check (period dates > PIT Date) ===")
    print("Per-label violations:", viol_counts)
    print(f"Rows with ANY future-dated period value: {total_future_viol}")
    working['HasFutureDateError'] = any_mask

    # -------------------------------------------------------------------------
    # 10) Compute AnnPITValue_Pct vs TrueValue and apply quality filter
    # -------------------------------------------------------------------------
    # Percentage of TrueValue; NaN when either input is missing or TrueValue
    # is exactly 0.
    working['AnnPITValue_Pct'] = np.where(
        working['AnnPITValue'].notna() &
        working['TrueValue'].notna() &
        (working['TrueValue'] != 0),
        (working['AnnPITValue'] / working['TrueValue']) * 100,
        np.nan
    )

    pre_stats = summarize_pct(working['AnnPITValue_Pct'])
    print("\n=== AnnPITValue_Pct summary — BEFORE quality drop ===")
    for k, v in pre_stats.items():
        print(f"{k:>20}: {v}")

    # Drop rows whose percentage is infinite or outside [50, 200]. Rows with
    # a NaN percentage are neither infinite nor finite and are therefore KEPT.
    pct = working['AnnPITValue_Pct']
    is_inf = np.isinf(pct)
    is_finite = np.isfinite(pct)
    out_of_range = is_finite & ((pct > 200) | (pct < 50))
    to_drop_quality = is_inf | out_of_range

    dropped_quality_rows = int(to_drop_quality.sum())
    print(f"\nRows to drop due to AnnPITValue_Pct (±inf or >200 or <50): {dropped_quality_rows:,}")

    working = working.loc[~to_drop_quality].copy()

    post_stats = summarize_pct(working['AnnPITValue_Pct'])
    print("\n=== AnnPITValue_Pct summary — AFTER quality drop ===")
    if post_stats:
        for k, v in post_stats.items():
            print(f"{k:>20}: {v}")
    else:
        print("No finite values remain after the quality drop.")

    # -------------------------------------------------------------------------
    # 11) Final columns and export
    # -------------------------------------------------------------------------
    base_cols = [
        'ID', 'CompanyName', 'ImplCountry', 'CurrentCurrency', 'HistCurrency',
        'PIT Date', 'Frequency', 'UpdateCode', 'FiscalPeriod', 'FYE Month',
        'ItemCode', 'Value', 'Str_FiscalPrd'
    ]

    # Date column first, then the value column, for every period
    freq_cols = []
    for i in range(1, 5):
        freq_cols += [f'Q{i}_Date', f'Q{i}']
    for i in range(1, 3):
        freq_cols += [f'S{i}_Date', f'S{i}']
    for i in range(1, 4):
        freq_cols += [f'T{i}_Date', f'T{i}']
    freq_cols += ['A_Date', 'A']

    # NEW: AnnPITValue_Period is included and explicitly placed
    # directly before AnnPITValue in the final column order.
    keep_cols = (
        [c for c in base_cols if c in working.columns] +
        ['TrueValue', 'AnnPITValue_Period', 'AnnPITValue',
         'AnnPITValue_Pct', 'HasFutureDateError'] +
        [c for c in freq_cols if c in working.columns]
    )

    # Drop helper columns that are only needed for intermediate computations
    drop_cols = [
        c for c in working.columns
        if c.endswith('_OriginFP') or c in ['QNUM', 'SNUM', 'TNUM', 'TrueValue_Date']
    ]
    working.drop(columns=drop_cols, inplace=True, errors='ignore')

    # reindex keeps exactly keep_cols, in that order
    annualized_processed = working.reindex(columns=keep_cols)

    # -------------------------------------------------------------------------
    # 12) Save full and subset outputs
    # -------------------------------------------------------------------------
    # NOTE(review): these precondition checks run only after all processing
    # above has completed, and `assert` is skipped entirely under `python -O`.
    assert 'Temp_file_path_DP' in globals(), "Temp_file_path_DP not found."
    assert 'base_output_filename' in globals(), "base_output_filename not found (set earlier)."

    out_full = os.path.join(Temp_file_path_DP, f"{base_output_filename}.txt")
    annualized_processed.to_csv(out_full, sep='|', index=False)
    print("\nSaved full:", out_full)

    # NEW: AnnPITValue_Period is included in the subset and appears before
    # AnnPITValue.
    subset_cols = [
        "ID", "PIT Date", "CompanyName", "HistCurrency",
        "FiscalPeriod", "AnnPITValue_Period", "AnnPITValue"
    ]
    subset_cols_existing = [col for col in subset_cols if col in annualized_processed.columns]
    subset_df = annualized_processed[subset_cols_existing].copy()
    out_subset = os.path.join(Temp_file_path_DP, f"{base_output_filename}_subset.txt")
    subset_df.to_csv(out_subset, sep='|', index=False)
    print("Saved subset:", out_subset)
    del subset_df

    # -------------------------------------------------------------------------
    # 13) Row-accounting overview
    # -------------------------------------------------------------------------
    # Rows leave the pipeline only in step 1 (frequency exclusion) and in
    # step 10 (quality filter); the three counts should add up to input_rows.
    output_rows = len(annualized_processed)
    print("\n=== Row Accounting ===")
    print(f"Input rows:                     {input_rows:,}")
    print(f"Excluded by Frequency (E/L/R/U):{excluded_rows:,}")
    print(f"Dropped by quality (Pct rules): {dropped_quality_rows:,}")
    print(f"Output rows (final):            {output_rows:,}")
    check_total = excluded_rows + dropped_quality_rows + output_rows
    print(f"Check: excluded + dropped + output = {check_total:,}")
    if check_total == input_rows:
        print("Row counts reconcile exactly.")
    else:
        print(f"Mismatch of {input_rows - check_total:+,} rows. Please investigate.")

    gc.collect()

else:
    print("annualized_encoded not found or None; skipping.")

Input dataset contains 2,761,273 rows before processing.


=== Future-date check (period dates > PIT Date) ===
Per-label violations: {'A_Date': 0, 'Q1_Date': 0, 'Q2_Date': 0, 'Q3_Date': 0, 'Q4_Date': 0, 'S1_Date': 0, 'S2_Date': 0, 'T1_Date': 0, 'T2_Date': 0, 'T3_Date': 0}
Rows with ANY future-dated period value: 0

=== AnnPITValue_Pct summary — BEFORE quality drop ===
         finite_rows: 1138479
                mean: 154913.18387507054
              median: 100.0
winsorized_mean_1pct: 97.34392593012758
                 p10: 14.194587333883005
                 p20: 95.38924156364851
                 p30: 100.0
                 p40: 100.0
                 p50: 100.0
                 p60: 100.0
                 p70: 100.0
                 p80: 100.0
                 p90: 104.11422880173936

Rows to drop due to AnnPITValue_Pct (±inf or >200 or <50): 158,833

=== AnnPITValue_Pct summary — AFTER quality drop ===
         finite_rows: 979646
                mean: 100.42279177292231
        