### Mount Google Drive, Import Libraries and Define Paths

In [54]:
# =============================================================================
# ENVIRONMENT SETUP + PATH CONFIGURATION (SERVER / COLAB COMPATIBLE)
# =============================================================================

import os
import sys
import importlib
from pathlib import Path
import string
import re
import gc
from datetime import timedelta
from scipy.stats.mstats import winsorize

# -----------------------------------------------------------------------------
# 0) HARD SAFETY: cap native thread usage (prevents pthread_create EAGAIN)
#    MUST be set before importing numpy / scipy / pandas
# -----------------------------------------------------------------------------
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["OPENBLAS_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["NUMEXPR_MAX_THREADS"] = "1"
os.environ["VECLIB_MAXIMUM_THREADS"] = "1"
os.environ["BLIS_NUM_THREADS"] = "1"

# -----------------------------------------------------------------------------
# 1) Detect environment
# -----------------------------------------------------------------------------
IN_COLAB = "google.colab" in sys.modules

# -----------------------------------------------------------------------------
# 2) (Colab only) Mount Google Drive
# -----------------------------------------------------------------------------
if IN_COLAB:
    from google.colab import drive
    drive.mount("/content/drive")
    BASE_PATH = "/content/drive/MyDrive/Colab Notebooks"
else:
    # Server base path (your target)
    BASE_PATH = "/home/jovyan/work/hpool1/pseidel/test"

print("IN_COLAB:", IN_COLAB)
print("BASE_PATH:", BASE_PATH)

# -----------------------------------------------------------------------------
# 3) Sanity checks: path exists + write permission
# -----------------------------------------------------------------------------
BASE = Path(BASE_PATH)
if not BASE.exists():
    raise FileNotFoundError(f"BASE_PATH does not exist: {BASE}")

# quick write test (fails fast if you don't have permissions)
test_file = BASE / ".write_test_tmp"
try:
    test_file.write_text("ok", encoding="utf-8")
    test_file.unlink()
except Exception as e:
    raise PermissionError(f"No write permission in {BASE}. Error: {e}")

# -----------------------------------------------------------------------------
# 4) Environment check: ensure required packages import cleanly
# -----------------------------------------------------------------------------
required_packages = ["numpy", "scipy", "pandas", "linearmodels", "xlsxwriter"]

for pkg in required_packages:
    print(f"Importing {pkg} ...")
    importlib.import_module(pkg)
    print(f"{pkg} OK")

import numpy as np
import pandas as pd

# -----------------------------------------------------------------------------
# 5) Base paths and input/output locations
# -----------------------------------------------------------------------------
Input_file_path   = str(BASE / "Input")
Temp_file_path    = str(BASE / "Temp")
Output_file_path  = str(BASE / "Output")

Fundamentals_file_path = f"{Input_file_path}/WSFV_f_20250131.txt"
Current_file_path      = f"{Input_file_path}/WSCurrent_f_20250131.txt"
Calendar_file_path     = f"{Input_file_path}/WSCalendarPrd_f_20250131.txt"
Meta_file_path         = f"{Input_file_path}/WSMetaData_f_20250131.txt"
Excel_file_path        = f"{Input_file_path}/WS PIT Table Definitions V5 with start dates.xls"

MarketValues_file_path          = f"{Input_file_path}/Daily MV USD"
MarketValues_file_path_LC       = f"{Input_file_path}/Daily MV LC"
DailyTotalReturns_file_path     = f"{Input_file_path}/Daily Returns USD"
DailyIndexReturns_file_path     = f"{Input_file_path}/Daily Index Returns USD"
Constituents_file_path          = f"{Input_file_path}/Constituents.01.csv"
UniversalMatching_file_path     = f"{Input_file_path}/Universal Matching File"

Temp_file_path_GO  = f"{Temp_file_path}/TempGeneralOverview"
Temp_file_path_EoC = f"{Temp_file_path}/TempExtractionofCharacteristics"
Temp_file_path_DP  = f"{Temp_file_path}/TempDataPreparation"
Temp_file_path_A   = f"{Temp_file_path}/TempAnomalies"
Temp_file_path_R   = f"{Temp_file_path}/TempRegressionModel"

Relevant_items_path   = f"{Input_file_path}/RelevantItems.txt"
Relevant_items_path_A = f"{Input_file_path}/RelevantItems.txt"
Relevant_items_path_B = f"{Input_file_path}/RelevantItemsB.txt"
Relevant_items_path_C = f"{Input_file_path}/RelevantItemsC.txt"
Relevant_items_path_D = f"{Input_file_path}/RelevantItemsD.txt"

Subset_file_path = f"{Temp_file_path_GO}/Subsets"
Fundamentals_clean_file_path = f"{Temp_file_path_GO}/Fundamentals_clean.txt"
Current_clean_file_path      = f"{Temp_file_path_GO}/Current_clean.txt"
Calendar_clean_file_path     = f"{Temp_file_path_GO}/Input/Calendar_clean.txt"
Meta_clean_file_path         = f"{Temp_file_path_GO}/Input/Meta_clean.txt"

# -----------------------------------------------------------------------------
# 6) Ensure required directories exist
# -----------------------------------------------------------------------------
Path(Output_file_path).mkdir(parents=True, exist_ok=True)
Path(Temp_file_path_GO).mkdir(parents=True, exist_ok=True)
Path(Temp_file_path_EoC).mkdir(parents=True, exist_ok=True)
Path(Temp_file_path_DP).mkdir(parents=True, exist_ok=True)
Path(Temp_file_path_A).mkdir(parents=True, exist_ok=True)
Path(Temp_file_path_R).mkdir(parents=True, exist_ok=True)
Path(Subset_file_path).mkdir(parents=True, exist_ok=True)
Path(Path(Calendar_clean_file_path).parent).mkdir(parents=True, exist_ok=True)

# -----------------------------------------------------------------------------
# 7) Streaming / deduplication settings
# -----------------------------------------------------------------------------
CHUNK_SIZE = 2_000_000
DATE_COL = "PIT Date"
DEDUP_KEYS = ["ID", "ItemCode", DATE_COL]

print("Paths configured. Temp outputs ->", Temp_file_path_GO)
print("Example input path ->", Fundamentals_file_path)


IN_COLAB: False
BASE_PATH: /home/jovyan/work/hpool1/pseidel/test
Importing numpy ...
numpy OK
Importing scipy ...
scipy OK
Importing pandas ...
pandas OK
Importing linearmodels ...
linearmodels OK
Importing xlsxwriter ...
xlsxwriter OK
Paths configured. Temp outputs -> /home/jovyan/work/hpool1/pseidel/test/Temp/TempGeneralOverview
Example input path -> /home/jovyan/work/hpool1/pseidel/test/Input/WSFV_f_20250131.txt


### Import Data Files to DataFrames

In [55]:
# =====================================================================================
# SUMMARY
# =====================================================================================
# This cell:
#
#   1. Defines a helper function `import_file_to_dataframe` that reads a pipe-delimited
#      text file into a pandas DataFrame (all columns as string; returns None on error).
#   2. Imports a list of "input" files from Input_file_path into DataFrames
#      (RelevantItems, CountryCodes, ...), storing them in globals() by filename.
#   3. Imports a list of "temp" files from Temp_file_path_EoC into DataFrames
#      (ADR_clean, CompanyName_clean, CurrencyCodes_clean, FYE_clean, ID_clean,
#       UpdateCodes_clean, ValueCoding), also stored in globals().
#   4. Identifies which subset_*.txt files exist in Subset_file_path based on the IDs
#      listed in RelevantItems.txt, and records their names (without .txt) in
#      `successful_subset_names`.
#
# No actual subset data is loaded here; that is deferred to later steps to keep
# memory usage under control.


# Function to import a file and return a pandas DataFrame
def import_file_to_dataframe(file_path):
    """
    Import a pipe-separated text file as a pandas DataFrame.

    - Uses sep='|' to read pipe-delimited files.
    - Reads all columns as strings (dtype=str), which helps preserve things like
      leading zeros in codes (e.g., NatCo, ItemCode).
    - Returns None on failure and prints a brief error message.
    """
    try:
        df = pd.read_csv(file_path, sep='|', dtype=str)
        return df
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        return None
    except Exception as e:
        print(f"Error importing file {file_path}: {e}")
        return None


# -------------------------------------------------------------------------
# Import files from Input directory
# -------------------------------------------------------------------------
input_files_to_import = ["RelevantItems.txt", "CountryCodes.txt"]

for file_name in input_files_to_import:
    file_path = os.path.join(Input_file_path, file_name)
    var_name = file_name.replace(".txt", "")  # e.g. "RelevantItems"
    globals()[var_name] = import_file_to_dataframe(file_path)

    if globals()[var_name] is not None:
        print(f"\nImported {file_name} as DataFrame '{var_name}'")
        print(f"Preview of '{var_name}':")
        print(globals()[var_name].head(), "\n")


# -------------------------------------------------------------------------
# Import files from Temp directory (end-of-cleaning stage)
# -------------------------------------------------------------------------
temp_files_to_import = [
    "ADR_clean.txt",
    "CompanyName_clean.txt",
    "CurrencyCodes_clean.txt",
    "FYE_clean.txt",
    "ID_clean.txt",
    "UpdateCodes_clean.txt",
    "ValueCoding.txt"
]

for file_name in temp_files_to_import:
    file_path = os.path.join(Temp_file_path_EoC, file_name)
    var_name = file_name.replace(".txt", "")  # e.g. "ADR_clean"
    globals()[var_name] = import_file_to_dataframe(file_path)

    if globals()[var_name] is not None:
        print(f"\nImported {file_name} as DataFrame '{var_name}'")
        print(f"Preview of '{var_name}':")
        print(globals()[var_name].head(), "\n")


# -------------------------------------------------------------------------
# Identify subset files that exist for the relevant items
# -------------------------------------------------------------------------
successful_subset_names = []

if 'RelevantItems' in globals() and RelevantItems is not None:
    # Assume first column of RelevantItems holds the item IDs used in subset filenames
    relevant_ids = RelevantItems.iloc[:, 0].astype(str).tolist()

    print("\nIdentifying subset files to process...")
    for item_id in relevant_ids:
        file_name = f"subset_{item_id}.txt"
        file_path = os.path.join(Subset_file_path, file_name)

        # Check the existence of each candidate subset file
        if os.path.exists(file_path):
            successful_subset_names.append(f"subset_{item_id}")
            print(f"  Found {file_name}")
        else:
            print(f"  File not found: {file_name}. Skipping.")

    print(f"\nIdentified {len(successful_subset_names)} subset files for processing.")
else:
    print("RelevantItems DataFrame not found or is empty. Cannot identify subset files.")

# Note: actual loading and processing of subset files happens later, in
# batch-based steps, to manage memory usage.



Imported RelevantItems.txt as DataFrame 'RelevantItems'
Preview of 'RelevantItems':
  ItemCode
0    01001
1    01051
2    01075
3    01101
4    01151 


Imported CountryCodes.txt as DataFrame 'CountryCodes'
Preview of 'CountryCodes':
  NatCo ImplCountry
0   012     Algeria
1   440   Lithuania
2   025   Argentina
3   442  Luxembourg
4   036   Australia 


Imported ADR_clean.txt as DataFrame 'ADR_clean'
Preview of 'ADR_clean':
          ID ADRIndicator
0  C036F63D0            N
1  C056879S0            X
2  C2461T100            N
3  C2504O500            N
4  C250C9180            N 


Imported CompanyName_clean.txt as DataFrame 'CompanyName_clean'
Preview of 'CompanyName_clean':
          ID                               CompanyName
0  C00948205             AGRIFORCE GROWING SYSTEMS LTD
1  C02500770            PEUGEOT CITROEN ARGENTINA S.A.
2  C02520200  ACINDAR INDUSTRIA ARGENTINA DE ACEROS SA
3  C02520220                       ALPARGATAS S.A.I.C.
4  C02520230               ALUAR ALUMINI

# 4.0. Extracting the most recent, annualized values per PIT Date (incl. Plausibility checks for the data)

## 4.1. Split according to source

In [56]:
# =====================================================================================
# SUMMARY
# =====================================================================================
# This cell processes a ValueCoding DataFrame and assigns a Category to each item
# (per sanitized item name), based on its data sources:
#
#   1. Validates that `ValueCoding` exists and is non-empty.
#   2. Sanitizes `ItemName` to a filesystem-safe `ItemName_Sanitized` (same rules as
#      used for filenames).
#   3. Normalizes the `Source` column (string type, trimmed).
#   4. Groups all distinct sources per `ItemName_Sanitized`.
#   5. Uses `decide_category` to map each sanitized name to a Category:
#        - Hardcoded overrides for certain items.
#        - Generic rules:
#             * presence of IS / Other  -> "Mixed"
#             * presence of Market / BS -> "Annualized"
#             * presence of CFS         -> "Special"
#        - otherwise                    -> None
#   6. Attaches the Category back to each row based on `ItemName_Sanitized`.
#   7. Creates three unique-item DataFrames:
#        - `annualized_items`
#        - `mixed_items`
#        - `special_items`
#   8. Exposes the processed objects in `globals()` for use in later cells.
#   9. Shows a sample and prints counts of each category.
#
# If `ValueCoding` is not present or is empty, processing is skipped.

# CELL 1 — Process ValueCoding and assign Category per ItemName_Sanitized

if 'ValueCoding' in globals() and ValueCoding is not None and not ValueCoding.empty:
    # Inform that processing of ValueCoding is starting
    print("Processing ValueCoding DataFrame...")

    # Work on a copy to avoid mutating the original ValueCoding
    value_coding_processed = ValueCoding.copy()

    # --- Sanitize ItemName ---
    # Ensure ItemName is treated as string (avoid issues with numbers / NaNs)
    value_coding_processed['ItemName'] = value_coding_processed['ItemName'].astype(str)

    # First pass: replace spaces and certain filesystem-unsafe characters with underscores
    # Same rule set as used for building filenames elsewhere
    value_coding_processed['ItemName_Sanitized'] = value_coding_processed['ItemName'].str.replace(
        r'[ \-/\:\\*\?"<>|]', '_', regex=True
    )
    # Second pass: strip any remaining characters not in [word chars, dot, hyphen]
    value_coding_processed['ItemName_Sanitized'] = value_coding_processed['ItemName_Sanitized'].str.replace(
        r'[^\w.-]', '', regex=True
    )

    # --- Normalize Source ---
    # Convert Source to string and strip leading/trailing whitespace
    value_coding_processed['Source'] = (
        value_coding_processed['Source']
        .astype(str)
        .str.strip()
    )

    # ------------------------------------------------------------------
    # Group all sources per sanitized name
    # ------------------------------------------------------------------
    # For each ItemName_Sanitized, collect the set of all non-null sources
    sources_per_name = (
        value_coding_processed
        .groupby('ItemName_Sanitized')['Source']
        .apply(lambda s: set(s.dropna()))
        .to_dict()
    )

    # ------------------------------------------------------------------
    # Helper to decide category per sanitized name
    # ------------------------------------------------------------------
    def decide_category(name, sources: set):
        """
        Decide a category string ("Mixed", "Annualized", "Special", or None)
        for a given sanitized item name based on its set of sources.
        """
        # Item-specific overrides (these take precedence over generic rules)
        if name == 'Depreciation_Depletion__Amortization':
            # Prefer 'IS' interpretation -> treat as Mixed
            return 'Mixed'
        if name == 'Minority_Interest':
            # Prefer 'BS' interpretation -> treat as Annualized
            return 'Annualized'

        # Generic rules:

        # If any of the sources is Income Statement or "Other", classify as Mixed
        if any(src in ['IS', 'Market'] for src in sources):
            return 'Mixed'

        # If any of the sources is Market or Balance Sheet, classify as Annualized
        if any(src in ['BS'] for src in sources):
            return 'Annualized'

        # If any of the sources is Cash Flow Statement, classify as Special
        if any(src in ['CFS'] for src in sources):
            return 'Special'

        # If none of the rules matched, leave as None (no clear mapping)
        return None

    # ------------------------------------------------------------------
    # Build category_map for all sanitized names
    # ------------------------------------------------------------------
    # Map each sanitized item name to a Category by applying decide_category
    category_map = {
        name: decide_category(name, srcs)
        for name, srcs in sources_per_name.items()
    }

    # Attach final Category back to each row, via ItemName_Sanitized
    value_coding_processed['Category'] = (
        value_coding_processed['ItemName_Sanitized'].map(category_map)
    )

    # ------------------------------------------------------------------
    # Derive annualized_items / mixed_items / special_items
    # as unique per sanitized name
    # ------------------------------------------------------------------
    annualized_items = (
        value_coding_processed[value_coding_processed['Category'] == 'Annualized']
        .drop_duplicates(subset=['ItemName_Sanitized'])
        .copy()
    )
    mixed_items = (
        value_coding_processed[value_coding_processed['Category'] == 'Mixed']
        .drop_duplicates(subset=['ItemName_Sanitized'])
        .copy()
    )
    special_items = (
        value_coding_processed[value_coding_processed['Category'] == 'Special']
        .drop_duplicates(subset=['ItemName_Sanitized'])
        .copy()
    )

    # ------------------------------------------------------------------
    # Export to globals for use in later cells
    # ------------------------------------------------------------------
    globals()['value_coding_processed'] = value_coding_processed
    globals()['annualized_items'] = annualized_items
    globals()['mixed_items'] = mixed_items
    globals()['special_items'] = special_items
    globals()['category_map'] = category_map

    # ------------------------------------------------------------------
    # Display sample and counts
    # ------------------------------------------------------------------
    print("\nProcessed ValueCoding DataFrame (sample):")
    display(value_coding_processed.head())

    print(f"\nNumber of Annualized items: {len(annualized_items)}")
    print(f"Number of Mixed items: {len(mixed_items)}")
    print(f"Number of Special items: {len(special_items)}")

else:
    # If ValueCoding is not available or has no rows, skip processing
    print("ValueCoding DataFrame not found or is empty. Skipping processing.")


Processing ValueCoding DataFrame...

Processed ValueCoding DataFrame (sample):


Unnamed: 0,ItemCode,ItemName,Source,ItemName_Sanitized,Category
0,5006,Market Price Current,Market,Market_Price_Current,Mixed
1,5007,Market Price YTD High Current,Market,Market_Price_YTD_High_Current,Mixed
2,5008,Market Price YTD Low Current,Market,Market_Price_YTD_Low_Current,Mixed
3,5009,Date of Current Price,Market,Date_of_Current_Price,Mixed
4,5091,Market Price 52 Week High Current,Market,Market_Price_52_Week_High_Current,Mixed



Number of Annualized items: 148
Number of Mixed items: 141
Number of Special items: 62


### Sort into correct bucket

In [57]:
# =====================================================================================
# SUMMARY
# =====================================================================================
# CELL 2 maps work_subset_*.txt files to categories ("Annualized", "Mixed", "Special")
# based on the ItemName_Sanitized that was derived in the previous cell.
#
# Steps:
#   1. Check that the required categorized DataFrames (annualized_items, mixed_items,
#      special_items) and the temporary directory path Temp_file_path_DP exist.
#   2. Build three sets of sanitized item names (annualized_names, mixed_names,
#      special_names) from those DataFrames.
#   3. List all files in Temp_file_path_DP and filter for those matching
#      "work_subset_*.txt".
#   4. For each work_subset file:
#        - Extract the sanitized item name from the filename.
#        - Determine whether it belongs to Mixed, Annualized, or Special based on
#          the sets created in step 2.
#        - Assign it a variable name (Mixed_n, Annualized_n, Special_n) and store
#          that mapping in dicts mixed_vars, annualized_vars, special_vars.
#   5. Store these dicts in globals() for use in later cells.
#   6. Print summary information and display the created dictionaries.
#   7. Perform garbage collection at the end.
#
# If any of the prerequisites are missing, it prints a message and skips the mapping.


# CELL 2 — Map work_subset files to categories using ItemName_Sanitized

if ('annualized_items' not in globals() or annualized_items is None or
    'mixed_items' not in globals() or mixed_items is None or
    'special_items' not in globals() or special_items is None or
    'Temp_file_path_DP' not in globals()):
    # If required DataFrames or directory path are missing, do not proceed
    print("Required DataFrames (annualized_items, mixed_items, special_items) or Temp_file_path_DP not found. Please run the categorization cell.")
else:
    print("Identifying work_subset files and creating variables based on categories...")

    # Sets of sanitized names that are final Annualized/Mixed/Special
    annualized_names = set(annualized_items['ItemName_Sanitized'].dropna())
    mixed_names      = set(mixed_items['ItemName_Sanitized'].dropna())
    special_names    = set(special_items['ItemName_Sanitized'].dropna())

    # ------------------------------------------------------------------
    # Get a list of work_subset files from the temp directory
    # ------------------------------------------------------------------
    temp_files = os.listdir(Temp_file_path_DP)
    work_subset_files = [
        f for f in temp_files
        if f.startswith('work_subset_') and f.endswith('.txt')
    ]

    # Dictionaries to hold mappings:
    #   "Annualized_1" -> "SomeItemName"
    #   "Mixed_1"      -> "AnotherItemName"
    #   "Special_1"    -> "SpecialItemName"
    annualized_vars = {}
    mixed_vars = {}
    special_vars = {}

    print(f"\nFound {len(work_subset_files)} work_subset files in Temp directory.")

    # Sort files to have deterministic order when assigning variable names
    work_subset_files.sort()

    # Counters for how many items fall into each category; used for variable suffixes
    annualized_count = 0
    mixed_count = 0
    special_count = 0

    # ------------------------------------------------------------------
    # Iterate over each work_subset file and map it to a category
    # ------------------------------------------------------------------
    for file_name in work_subset_files:
        # Extract sanitized item name from filename, expecting "work_subset_<name>.txt"
        match = re.match(r'work_subset_(.+)\.txt$', file_name)
        if not match:
            print(f"  Filename format not as expected for '{file_name}'. Skipping processing.")
            continue

        sanitized_item_name = match.group(1)

        # Use the resolved sets. No more ambiguous precedence:
        # priority Mixed -> Annualized -> Special, in this order of checks.
        if sanitized_item_name in mixed_names:
            mixed_count += 1
            var_name = f"Mixed_{mixed_count}"
            mixed_vars[var_name] = sanitized_item_name
            print(f"  '{file_name}' -> Mixed (variable '{var_name}').")

        elif sanitized_item_name in annualized_names:
            annualized_count += 1
            var_name = f"Annualized_{annualized_count}"
            annualized_vars[var_name] = sanitized_item_name
            print(f"  '{file_name}' -> Annualized (variable '{var_name}').")

        elif sanitized_item_name in special_names:
            special_count += 1
            var_name = f"Special_{special_count}"
            special_vars[var_name] = sanitized_item_name
            print(f"  '{file_name}' -> Special (variable '{var_name}').")

        else:
            # No category mapping found for this sanitized name
            print(f"  '{file_name}' -> No matching Category (might be unmapped or ambiguous). Skipping.")

    # ------------------------------------------------------------------
    # Expose the mapping dictionaries globally for later use
    # ------------------------------------------------------------------
    globals()['annualized_vars'] = annualized_vars
    globals()['mixed_vars'] = mixed_vars
    globals()['special_vars'] = special_vars

    # ------------------------------------------------------------------
    # Summary output and inspection
    # ------------------------------------------------------------------
    print(f"\nVariable creation complete.")
    print(f"Created {len(annualized_vars)} Annualized variables.")
    print(f"Created {len(mixed_vars)} Mixed variables.")
    print(f"Created {len(special_vars)} Special variables.")

    print("\nAnnualized Variables:")
    display(annualized_vars)

    print("\nMixed Variables:")
    display(mixed_vars)

    print("\nSpecial Variables:")
    display(special_vars)

    # Run garbage collection after building mappings
    gc.collect()


Identifying work_subset files and creating variables based on categories...

Found 49 work_subset files in Temp directory.
  'work_subset_Accounts_Payable.txt' -> Annualized (variable 'Annualized_1').
  'work_subset_Capital_Expenditures_Addtns_to_Fixed_Assets.txt' -> Special (variable 'Special_1').
  'work_subset_Cash_Dividends_Paid___Total.txt' -> Special (variable 'Special_2').
  'work_subset_Cash__Short_Term_Investments.txt' -> Annualized (variable 'Annualized_2').
  'work_subset_Com_Pfd_Redeemed_Retired_Converted_Etc..txt' -> Special (variable 'Special_3').
  'work_subset_Common_Equity.txt' -> Annualized (variable 'Annualized_3').
  'work_subset_Cost_of_Goods_Sold_Excl_Depreciation.txt' -> Mixed (variable 'Mixed_1').
  'work_subset_Current_Assets___Total.txt' -> Annualized (variable 'Annualized_4').
  'work_subset_Current_Liabilities___Total.txt' -> Annualized (variable 'Annualized_5').
  'work_subset_Deferred_Taxes.txt' -> Annualized (variable 'Annualized_6').
  'work_subset_Depre

{'Annualized_1': 'Accounts_Payable',
 'Annualized_2': 'Cash__Short_Term_Investments',
 'Annualized_3': 'Common_Equity',
 'Annualized_4': 'Current_Assets___Total',
 'Annualized_5': 'Current_Liabilities___Total',
 'Annualized_6': 'Deferred_Taxes',
 'Annualized_7': 'Income_Taxes_Payable',
 'Annualized_8': 'Inventories___Total',
 'Annualized_9': 'Investments_in_Associated_Companies',
 'Annualized_10': 'Investments_in_Sales__Direct_Financing_Leases',
 'Annualized_11': 'Long_Term_Debt',
 'Annualized_12': 'Long_Term_Receivables',
 'Annualized_13': 'Minority_Interest',
 'Annualized_14': 'Other_Assets___Total',
 'Annualized_15': 'Other_Current_Assets',
 'Annualized_16': 'Other_Current_Liabilities',
 'Annualized_17': 'Other_Investments',
 'Annualized_18': 'Other_Liabilities',
 'Annualized_19': 'Preferred_Stock',
 'Annualized_20': 'Property_Plant__Equipment___Net',
 'Annualized_21': 'ReceivablesNet',
 'Annualized_22': 'Short_Term_Debt__Current_Portion_of_LT_Debt',
 'Annualized_23': 'Total_Assets'


Mixed Variables:


{'Mixed_1': 'Cost_of_Goods_Sold_Excl_Depreciation',
 'Mixed_2': 'Depreciation_Depletion__Amortization',
 'Mixed_3': 'Earnings_Per_Share_Fiscal_Year_End',
 'Mixed_4': 'Income_Taxes',
 'Mixed_5': 'Interest_Expense___Total',
 'Mixed_6': 'Net_Income_Before_Extra_Items_Preferred_Divs',
 'Mixed_7': 'Net_Income_Used_to_Calculate_Basic_EPS',
 'Mixed_8': 'Net_Sales_or_Revenues',
 'Mixed_9': 'Operating_Income',
 'Mixed_10': 'Sales_Per_Share',
 'Mixed_11': 'Selling_General__Administrative_Expenses'}


Special Variables:


{'Special_1': 'Capital_Expenditures_Addtns_to_Fixed_Assets',
 'Special_2': 'Cash_Dividends_Paid___Total',
 'Special_3': 'Com_Pfd_Redeemed_Retired_Converted_Etc.',
 'Special_4': 'Disposal_of_Fixed_Assets',
 'Special_5': 'Extraordinary_Items',
 'Special_6': 'Funds_From_For_Other_Operating_Activities',
 'Special_7': 'Funds_From_Operations',
 'Special_8': 'Long_Term_Borrowings',
 'Special_9': 'Net_Cash_Flow___Financing',
 'Special_10': 'Net_Cash_Flow___Investing',
 'Special_11': 'Net_Cash_Flow___Operating_Activities',
 'Special_12': 'Net_Proceeds_From_Sale_Issue_of_Com__Pfd',
 'Special_13': 'Reduction_in_Long_Term_Debt'}

## 4.2. Income Statement

### Mixed 1

#### Set Index

In [58]:
# =====================================================================================
# SUMMARY
# =====================================================================================
# This code selects which Mixed_* dataset should be processed by choosing an index
# (e.g., Mixed_1, Mixed_2, ...). It then:
#
#   1. Ensures that a dictionary `mixed_vars` exists, mapping keys like "Mixed_1"
#      to item names.
#   2. Builds the key corresponding to the selected index and retrieves the
#      associated item name (`target_item_name`).
#   3. Prints which Mixed_* item was selected.
#   4. Constructs paths and filenames based on global variables and the selected item.
#   5. Ensures that the output directory exists by creating it if necessary.
#
# The goal is to centralize selection of a single Mixed_* dataset and prepare paths
# for downstream processing.


# === Select which Mixed_* item to run ===
mixed_index = 1  # Change this to process another dataset (e.g., 10)

# Validate that the dictionary of mixed item names exists
assert 'mixed_vars' in globals(), "mixed_vars dict not found in globals()."

# Build the key (e.g., "Mixed_1") and retrieve the associated item name
item_key = f"Mixed_{mixed_index}"
target_item_name = mixed_vars.get(item_key)
assert target_item_name, f"{item_key} not found in mixed_vars."

# Inform which item was selected
print(f"Selected: {item_key}  ->  ItemName: '{target_item_name}'")

# === Paths (reusing globals) ===
assert 'Temp_file_path_DP' in globals(), "Temp_file_path_DP not found."

# Construct the name of the input file for the selected item
file_name = f"work_subset_{target_item_name}.txt"
file_path = os.path.join(Temp_file_path_DP, file_name)

# Construct the base name for output files (will later be suffixed)
base_output_filename = f"Mixed_{target_item_name}_complete"

# Ensure the output directory exists; create it (including parent dirs) if needed
Path(Temp_file_path_DP).mkdir(parents=True, exist_ok=True)


Selected: Mixed_1  ->  ItemName: 'Cost_of_Goods_Sold_Excl_Depreciation'


#### Import relevant data



In [59]:
# =====================================================================================
# SUMMARY
# =====================================================================================
# This block:
#   1. Announces the import of a full dataset for the given `target_item_name`.
#   2. Checks whether the file at `file_path` exists.
#   3. If it exists, calls `import_file_to_dataframe(file_path)` to load the data
#      into `mixed_raw`.
#   4. If the loaded DataFrame is non-empty, prints a success message including
#      the number of rows and shows the first few rows (via display or fallback
#      to text printing).
#   5. If the load fails or returns an empty DataFrame, prints a warning and
#      creates an empty DataFrame.
#   6. If the file does not exist, prints an error message and sets `mixed_raw`
#      to an empty DataFrame.
#   7. Finally, it runs `gc.collect()` to trigger garbage collection and free
#      memory.
#
# Note: All previous emoji symbols in the print statements have been removed.

# Inform the user that we are starting the import for this item
print(f"\nImporting full dataset for Item: '{target_item_name}' ...")

# Check if the file path exists on disk
if os.path.exists(file_path):
    # If the file exists, attempt to import it into a DataFrame
    mixed_raw = import_file_to_dataframe(file_path)

    # If the DataFrame is successfully loaded and not empty
    if mixed_raw is not None and not mixed_raw.empty:
        # Print a success message including row count with thousands separator
        print(f"Full dataset loaded successfully: {len(mixed_raw):,} rows total.")

        try:
            # Try to display the first few rows (Jupyter / IPython display)
            display(mixed_raw.head())
        except Exception:
            # If display is not available, fall back to printing as plain text
            print(mixed_raw.head().to_string(index=False))
    else:
        # If DataFrame is None or empty, warn and create an empty DataFrame
        print("Dataset appears empty or could not be loaded.")
        mixed_raw = pd.DataFrame()
else:
    # If the file does not exist, report an error and set mixed_raw to empty
    print(f"File not found: {file_path}")
    mixed_raw = pd.DataFrame()

# Run garbage collection to free up memory after the load attempt
gc.collect()



Importing full dataset for Item: 'Cost_of_Goods_Sold_Excl_Depreciation' ...
Full dataset loaded successfully: 4,227,003 rows total.


Unnamed: 0,ID,CompanyName,ImplCountry,CurrentCurrency,HistCurrency,PIT Date,Frequency,UpdateCode,FiscalPeriod,FYE Month,ItemCode,Value
0,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1992,December,1051,1415.675687
1,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1993,December,1051,1621.318491
2,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1994,December,1051,1968.910031
3,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1996-05-03,A,3,1995,December,1051,1270.982588
4,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1998-07-03,A,3,1996,December,1051,1007.403728


0

#### Encode Frequency Code (Check of output required!)

In [60]:
# =====================================================================================
# SUMMARY
# =====================================================================================
# This snippet provides:
#
# 1. A helper function `last2` that returns the last two digits of a number as a
#    zero-padded string (for building YY strings).
#
# 2. A function `add_str_fiscalprd(df)` which:
#    - Works on a copy of an input DataFrame containing financial periods.
#    - Normalizes the 'Frequency' (upper-case, no missing).
#    - Stores the original 'FiscalPeriod' and converts it to numeric.
#    - Creates a string representation 'Str_FiscalPrd' depending on the frequency:
#         - Q/C/E/R: quarter-based ("QnYyy")
#         - A/B: annual ("Yyy")
#         - F/S: semiannual ("SnYyy")
#         - K/T/L/U: trimester-like ("TnYyy")
#    - Derives an implied full-year integer 'ImplFiscPer_Calculated' from the
#      two-digit year (80–99 => 19xx, else 20xx).
#    - For annual rows (A/B), checks discrepancies between original
#      'FiscalPeriod' and implied full-year; prints a small preview & total count.
#    - Overwrites 'FiscalPeriod' with 'ImplFiscPer_Calculated' and drops helper
#      columns.
#
# 3. A small driver block that:
#    - Checks that `mixed_raw` exists and is non-empty.
#    - Applies `add_str_fiscalprd` to produce `mixed_encoded`.
#    - Displays a head preview or prints a message and sets `mixed_encoded = None`
#      if input is missing/empty.

def last2(n):
    """Return last two digits as zero-padded string, or None if NaN."""
    # If input is NaN (or pandas-style missing), return None (no digits)
    if pd.isna(n):
        return None
    # Convert to int, format as 4-digit zero-padded string, take last 2 chars
    # Example: n=2023 -> "2023"[-2:] -> "23"
    return f"{int(n):04d}"[-2:]


def add_str_fiscalprd(df):
    """
    Creates 'Str_FiscalPrd' and overwrites 'FiscalPeriod' with an implied full year.
    Also prints discrepancies for Annual (A,B) rows between original and calculated year.

    The mapping logic:
      - Quarterly (C, Q, E, R):   fp encodes a quarter index -> "Q{1-4}Y{yy}"
      - Annual   (A, B):         fp is the year itself      -> "Y{yy}"
      - Semiannual (F, S):       fp encodes half-year       -> "S{1-2}Y{yy}"
      - Trimester-like (K, T,
         L, U):                  fp encodes trimester       -> "T{1-3}Y{yy}"
    Then we re-infer the full year from the YY part with 19xx/20xx rule.
    """
    # Work on a copy to avoid mutating the original input DataFrame
    df = df.copy()

    # Normalize frequency codes:
    # - uppercase for consistency
    # - fill missing values with empty string
    df["Frequency"] = df["Frequency"].str.upper().fillna("")

    # Store original FiscalPeriod in a separate column for later comparison
    df['Original_FiscalPeriod'] = df['FiscalPeriod']

    # Convert FiscalPeriod to numeric, coerce errors to NaN
    fp = pd.to_numeric(df["FiscalPeriod"], errors="coerce")

    # Masks for different frequency groups
    m_quarter = df["Frequency"].isin(["C", "Q", "E", "R"])  # quarter-based
    m_AB      = df["Frequency"].isin(["A", "B"])            # annual
    m_FS      = df["Frequency"].isin(["F", "S"])            # semiannual
    m_KTLU    = df["Frequency"].isin(["K", "T", "L", "U"])  # trimester-like

    # Initialize Str_FiscalPrd as NaN; we will fill per frequency group
    df["Str_FiscalPrd"] = np.nan

    # -------------------------
    # Quarter-based encoding
    # -------------------------
    # Quarter number: (fp % 4) + 1 => yields 1..4 (if fp is integer-based encoding)
    q_part = ((fp % 4) + 1).where(m_quarter)
    # Year part for quarter: integer division by 4, then take last two digits
    q_year = (fp // 4).where(m_quarter).apply(last2)
    # Assign "Q{quarter}Y{yy}" for quarter frequencies
    df.loc[m_quarter, "Str_FiscalPrd"] = (
        "Q" +
        q_part.astype("Int64").astype(str) +
        "Y" +
        q_year.fillna('')
    )

    # -------------------------
    # Annual encoding (A, B)
    # -------------------------
    # Year part is fp itself for A/B; take last two digits via last2
    ab_year = fp.where(m_AB).apply(last2)
    # Assign "Y{yy}" for annual frequencies
    df.loc[m_AB, "Str_FiscalPrd"] = "Y" + ab_year.fillna('')

    # -------------------------
    # Semiannual encoding (F, S)
    # -------------------------
    # Semester number: (fp % 2) + 1 => 1 or 2
    fs_sem  = ((fp % 2) + 1).where(m_FS)
    # Year part: fp // 2, then last two digits
    fs_year = (fp // 2).where(m_FS).apply(last2)
    # Assign "S{sem}Y{yy}" for F/S
    df.loc[m_FS, "Str_FiscalPrd"] = (
        "S" +
        fs_sem.astype("Int64").astype(str) +
        "Y" +
        fs_year.fillna('')
    )

    # -------------------------
    # Trimester-like encoding (K, T, L, U)
    # -------------------------
    # Term number: (fp % 3) + 1 => 1..3
    t_term  = ((fp % 3) + 1).where(m_KTLU)
    # Year part: fp // 3, then last two digits
    t_year  = (fp // 3).where(m_KTLU).apply(last2)
    # Assign "T{term}Y{yy}" for K/T/L/U frequencies
    df.loc[m_KTLU, "Str_FiscalPrd"] = (
        "T" +
        t_term.astype("Int64").astype(str) +
        "Y" +
        t_year.fillna('')
    )

    # ---------------------------------------------------------------------
    # Implied full year from Str_FiscalPrd (19xx / 20xx reconstruction)
    # ---------------------------------------------------------------------
    # Extract two-digit year from "Yyy" pattern, e.g., "Q1Y23" -> "23"
    year_part = df['Str_FiscalPrd'].str.extract(r'Y(\d{2})', expand=False)
    # Convert that to numeric (NaN if not parseable)
    year_numeric = pd.to_numeric(year_part, errors='coerce')

    # Map two-digit year to full year:
    #  - 80–99 => 19xx
    #  - else  => 20xx
    df['ImplFiscPer_Calculated'] = year_numeric.apply(
        lambda x: int(f"19{int(x):02d}") if pd.notna(x) and x >= 80
        else (int(f"20{int(x):02d}") if pd.notna(x) else np.nan)
    )

    # ---------------------------------------------------------------------
    # Discrepancies check for Annual frequencies (A, B)
    # ---------------------------------------------------------------------
    # Work on a subset of annual frequencies only
    annual_rows_for_check = df[m_AB].copy()

    # Build boolean mask where implied full year does NOT match the original
    discrepancy_mask_annual = ~(
        # Either they match numerically...
        (annual_rows_for_check['ImplFiscPer_Calculated'] ==
         pd.to_numeric(annual_rows_for_check['Original_FiscalPeriod'], errors='coerce'))
        |
        # ...or both are NaN
        (annual_rows_for_check['ImplFiscPer_Calculated'].isna() &
         pd.to_numeric(annual_rows_for_check['Original_FiscalPeriod'], errors='coerce').isna())
    )

    # Subset rows with discrepancies
    discrepancy_rows = annual_rows_for_check[discrepancy_mask_annual].copy()

    # If we have any, show a short sample and the total count
    if not discrepancy_rows.empty:
        print("\nDiscrepancies found between original FiscalPeriod and calculated ImplFiscPer for Annual (A, B) frequencies:")
        display(
            discrepancy_rows[
                ['ID', 'Frequency', 'Original_FiscalPeriod', 'Str_FiscalPrd', 'ImplFiscPer_Calculated']
            ].head()
        )
        print(f"Total discrepancies found for Annual frequencies: {len(discrepancy_rows)}")
    else:
        print("\nNo discrepancies found between original FiscalPeriod and calculated ImplFiscPer for Annual (A, B) frequencies.")

    # ---------------------------------------------------------------------
    # Overwrite FiscalPeriod and drop temporary helper columns
    # ---------------------------------------------------------------------
    # Replace 'FiscalPeriod' with the implied full-year value we just calculated
    df['FiscalPeriod'] = df['ImplFiscPer_Calculated']
    # Drop helper columns not needed downstream
    df.drop(columns=['Original_FiscalPeriod', 'ImplFiscPer_Calculated'], inplace=True)

    # Return the modified DataFrame
    return df


# =============================================================================
# Driver: apply encoding to mixed_raw if present and non-empty
# =============================================================================
if 'mixed_raw' in globals() and mixed_raw is not None and not mixed_raw.empty:
    # Inform which item we are encoding for
    print(f"Applying encoding to Mixed dataset for '{target_item_name}' ...")
    # Apply fiscal period encoding
    mixed_encoded = add_str_fiscalprd(mixed_raw)
    # Show a preview of the encoded data
    display(mixed_encoded.head())
else:
    # If mixed_raw is missing or empty, we skip and mark mixed_encoded as None
    print("mixed_raw not found or empty. Cannot perform encoding.")
    mixed_encoded = None


Applying encoding to Mixed dataset for 'Cost_of_Goods_Sold_Excl_Depreciation' ...


  df.loc[m_quarter, "Str_FiscalPrd"] = (



No discrepancies found between original FiscalPeriod and calculated ImplFiscPer for Annual (A, B) frequencies.


Unnamed: 0,ID,CompanyName,ImplCountry,CurrentCurrency,HistCurrency,PIT Date,Frequency,UpdateCode,FiscalPeriod,FYE Month,ItemCode,Value,Str_FiscalPrd
0,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1992,December,1051,1415.675687,Y92
1,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1993,December,1051,1621.318491,Y93
2,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1994,December,1051,1968.910031,Y94
3,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1996-05-03,A,3,1995,December,1051,1270.982588,Y95
4,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1998-07-03,A,3,1996,December,1051,1007.403728,Y96


#### Annualize data with most recent information (Check of output required!)

In [61]:
# @title
# =====================================================================================
# SUMMARY
# =====================================================================================
# This script takes an input DataFrame `mixed_encoded` (if present in the global scope)
# that contains financial time-series data (per company, item, currency, fiscal period,
# and PIT Date). It then:
#
# 1. Cleans and standardizes key columns (dates, numeric types, string IDs).
# 2. Excludes rows with certain frequencies (E/L/R/U).
# 3. Parses fiscal period strings into quarter/semester/trimester indicators (QNUM/SNUM/TNUM).
# 4. Uses a custom, vectorized "as-of" join (`asof_numpy`) to attach the most recent
#    annual, quarterly, semiannual, and trimester values for each (ID, HistCurrency,
#    ItemCode, FiscalPeriod) up to each row’s PIT Date.
# 5. Builds "full-year" candidate values from:
#       - actual annuals (A),
#       - sum of Q1..Q4 (Q4 proxy),
#       - sum of S1..S2 (S2 proxy),
#       - sum of T1..T3 (T3 proxy),
#    and selects the best candidate based on priorities and relationship to the row’s
#    fiscal period (same-year vs prior-year).
# 6. Computes an annual PIT-based metric `AnnPITValue` and compares it to the “true”
#    annual value (`TrueValue`) to derive a percentage `AnnPITValue_Pct` for QC.
# 7. Performs quality checks:
#       - Ensures no period-date is after the PIT Date.
#       - Drops rows whose `AnnPITValue_Pct` is outside the range [50, 200] or infinite.
# 8. Keeps a curated set of columns, drops helper columns, and saves:
#       - a full output file
#       - a subset file with key columns for quick inspection.
# 9. Prints row-accounting stats and frees some memory.
#
# If `mixed_encoded` is not defined or is None, it simply prints a message and exits.

import pandas as pd
import numpy as np
import os
import gc
from datetime import timedelta            # <--- Added to fix NameError
from scipy.stats.mstats import winsorize  # <--- Added to fix NameError

# Enable pandas "copy-on-write" behavior to reduce unintended chained assignment effects
pd.options.mode.copy_on_write = True

# ---------- Helper: fast as-of (right.PIT <= left.PIT) ----------

def _key(fr, cols):
    """
    Helper function to build a string key from multiple columns.
    For each row, join the values of 'cols' with '||'.
    """
    return fr[cols].astype(str).agg('||'.join, axis=1)


def asof_numpy(left_df: pd.DataFrame, right_df: pd.DataFrame, by_cols: list[str]):
    """
    For each row in left_df, find the latest (as-of) Value from right_df
    with matching by_cols and right_df['PIT Date'] <= left_df['PIT Date'].

    This is a manual, NumPy-based implementation of an "as-of merge" grouped by `by_cols`.
    It:
    - filters out rows with missing required fields,
    - normalizes PIT Date to daily precision,
    - groups right_df by composite key of `by_cols`,
    - for each left row, binary-searches into the matching right group by PIT Date,
      picking the last date <= the left PIT Date,
    - returns two aligned arrays: (values, dates) for the left_df rows.
    """
    # Initialize outputs with NaNs and NaT for all left_df rows
    out_vals  = np.full(len(left_df), np.nan, dtype='float64')
    out_dates = np.full(len(left_df), 'NaT', dtype='datetime64[ns]')

    # Required columns on each side
    left_req  = by_cols + ['PIT Date']
    right_req = by_cols + ['PIT Date', 'Value']

    # Mask: rows that have all required fields non-null
    lmask = left_df[left_req].notna().all(axis=1)
    rmask = right_df[right_req].notna().all(axis=1)

    # If either side has no valid rows, return the default empty outputs
    if not lmask.any() or not rmask.any():
        return out_vals, out_dates

    # Work on filtered copies only (avoid side effects)
    l = left_df.loc[lmask, left_req].copy()
    r = right_df.loc[rmask, right_req].copy()

    # Normalize PIT Date columns to datetime at day precision
    l['PIT Date'] = pd.to_datetime(l['PIT Date'], errors='coerce').dt.floor('D')
    r['PIT Date'] = pd.to_datetime(r['PIT Date'], errors='coerce').dt.floor('D')

    # Build composite keys for group-level match (based on by_cols)
    l['__k'] = _key(l, by_cols)
    r['__k'] = _key(r, by_cols)

    # Sort right side by key and date to enable binary search per key
    r = r.sort_values(['__k', 'PIT Date']).reset_index(drop=True)

    # Extract NumPy arrays for right side
    rk   = r['__k'].to_numpy()
    rdt  = r['PIT Date'].to_numpy()
    rval = r['Value'].to_numpy()

    # Find unique keys and first index for each key in the sorted right side
    uniq, first = np.unique(rk, return_index=True)

    # Build a dict: key -> (dates array, values array) slice
    slices = {}
    for i, k in enumerate(uniq):
        s = first[i]  # start of this key
        e = first[i + 1] if i + 1 < len(first) else len(r)  # end of this key
        slices[k] = (rdt[s:e], rval[s:e])

    # Left side indices and arrays
    l_idx = l.index.to_numpy()
    lk    = l['__k'].to_numpy()
    ldt   = l['PIT Date'].to_numpy()

    # Sort left side by key (stable mergesort to preserve original row order within key)
    order = np.argsort(lk, kind='mergesort')
    sk, sd, sp = lk[order], ldt[order], l_idx[order]

    # i iterates over the sorted left rows
    i = 0
    n = len(sk)
    while i < n:
        k = sk[i]  # current composite key
        j = i + 1
        # Find contiguous block [i:j) for this key
        while j < n and sk[j] == k:
            j += 1

        # Dates and original positions of this key’s left rows
        block_dates = sd[i:j]
        block_pos   = sp[i:j]

        # If we have matching right-hand slices for this key, do the as-of search
        if k in slices:
            r_dates, r_vals = slices[k]
            # searchsorted(..., 'right') - 1 gives index of last r_date <= block_date
            pos   = np.searchsorted(r_dates, block_dates, side='right') - 1
            valid = pos >= 0  # positions where such a date exists
            if np.any(valid):
                # Fill outputs for left rows where we found a valid match
                out_vals[block_pos[valid]]  = r_vals[pos[valid]]
                out_dates[block_pos[valid]] = r_dates[pos[valid]]
        # Move to next key block
        i = j

    return out_vals, out_dates


# ---------- Small helpers ----------

def pctile(s, q):
    """
    Safe percentile calculation: returns quantile q of Series `s`,
    or NaN if quantile fails.
    """
    try:
        return s.quantile(q, interpolation='linear')
    except Exception:
        return np.nan


def summarize_pct(series: pd.Series):
    """
    Computes a summary of finite values of a series:
      - count
      - mean, median
      - 1% winsorized mean
      - selected deciles (p10, p20, ..., p90)
    Infinite values are treated as NaN and removed.
    If no finite values remain, returns an empty dict.
    """
    # Replace +/-inf with NaN and drop all NaNs
    s = series.replace([np.inf, -np.inf], np.nan).dropna()
    if s.empty:
        return {}
    
    # FIX APPLIED HERE: .to_numpy().copy() ensures the array is writable for winsorize
    w_mean = winsorize(s.to_numpy().copy(), limits=[0.01, 0.01]).mean()
    
    return {
        "finite_rows": len(s),
        "mean": s.mean(),
        "median": s.median(),
        "winsorized_mean_1pct": w_mean,
        "p10": pctile(s, 0.10), "p20": pctile(s, 0.20), "p30": pctile(s, 0.30),
        "p40": pctile(s, 0.40), "p50": pctile(s, 0.50), "p60": pctile(s, 0.60),
        "p70": pctile(s, 0.70), "p80": pctile(s, 0.80), "p90": pctile(s, 0.90)
    }


# ---------- Priority for full-year candidates ----------

# Fixed priority mapping for full-year candidates:
#   'A'  : actual annual value
#   'Q4' : annual proxy from four quarters
#   'T3' : annual proxy from three trimesters
#   'S2' : annual proxy from two semesters
_PERIOD_PRIORITY = {
    'A': 100,  # highest priority: actual annual
    'Q4': 90,  # then Q1+Q2+Q3+Q4
    'T3': 80,  # then T1+T2+T3
    'S2': 70,  # then S1+S2
}

# ============================ MAIN ============================

# Only run the main logic if a global DataFrame `mixed_encoded` exists and is not None
if 'mixed_encoded' in globals() and mixed_encoded is not None:
    # Count initial input rows
    input_rows = len(mixed_encoded)
    print(f"Input dataset contains {input_rows:,} rows before processing.\n")

    # Work on a copy of the input dataset
    working = mixed_encoded.copy()

    # -------------------------------------------------------------------------
    # Exclude certain frequencies (E/L/R/U)
    # -------------------------------------------------------------------------
    # Create mask of rows whose Frequency is one of E, L, R, U (case-insensitive)
    excl_mask = working['Frequency'].astype(str).str.upper().isin(['E', 'L', 'R', 'U'])
    # Count how many rows will be excluded
    excluded_rows = int(excl_mask.sum())
    # Keep only rows that are NOT in the exclusion set
    working = working.loc[~excl_mask].copy()

    # -------------------------------------------------------------------------
    # Set dtypes and normalize important columns
    # -------------------------------------------------------------------------
    # Convert PIT Date to datetime (coerce errors -> NaT), floor to day
    working['PIT Date']     = pd.to_datetime(working['PIT Date'], errors='coerce').dt.floor('D')
    # FiscalPeriod: numeric (e.g., 2021, 2022, ...)
    working['FiscalPeriod'] = pd.to_numeric(working['FiscalPeriod'], errors='coerce')
    # Value: numeric (float)
    working['Value']        = pd.to_numeric(working['Value'], errors='coerce')

    # Convert key ID / code columns to string to ensure consistency
    for c in ['ID', 'HistCurrency', 'ItemCode', 'Frequency', 'Str_FiscalPrd']:
        if c in working.columns:
            working[c] = working[c].astype(str)

    # -------------------------------------------------------------------------
    # Parse Q/S/T markers from Str_FiscalPrd (like 'Q1Y2023')
    # -------------------------------------------------------------------------
    # Extract quarter number Q1..Q4 from e.g. "Q1Y2023" into QNUM
    working['QNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^Q([1-4])Y', expand=False),
        errors='coerce'
    )
    # Extract semiannual number S1..S2 into SNUM
    working['SNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^S([1-2])Y', expand=False),
        errors='coerce'
    )
    # Extract trimester number T1..T3 into TNUM
    working['TNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^T([1-3])Y', expand=False),
        errors='coerce'
    )

    # -------------------------------------------------------------------------
    # Ensure period columns exist (Q1..Q4, S1..S2, T1..T3, A + their date cols)
    # -------------------------------------------------------------------------
    # Create value columns for Q1..Q4, S1..S2, T1..T3, A if they are missing
    for c in [*(f'Q{i}' for i in range(1, 5)),
              *(f'S{i}' for i in range(1, 3)),
              *(f'T{i}' for i in range(1, 4)),
              'A']:
        if c not in working.columns:
            working[c] = np.nan

    # Create corresponding *_Date columns if missing
    for c in [*(f'Q{i}_Date' for i in range(1, 5)),
              *(f'S{i}_Date' for i in range(1, 3)),
              *(f'T{i}_Date' for i in range(1, 4)),
              'A_Date']:
        if c not in working.columns:
            working[c] = pd.NaT

    # Base key for many of the as-of mappings
    base_keys = ['ID', 'HistCurrency', 'ItemCode', 'FiscalPeriod']

    # -------------------------------------------------------------------------
    # 1) Derive TrueValue from annuals (A/B frequencies)
    # -------------------------------------------------------------------------
    # Mask annual-like rows where Value is present
    mask_annual = working['Frequency'].isin(['A', 'B']) & working['Value'].notna()
    # annual_src: one row per (ID, FiscalPeriod, HistCurrency) with last PIT Date
    annual_src = (
        working.loc[mask_annual,
                    ['ID', 'FiscalPeriod', 'HistCurrency', 'PIT Date', 'Value']]
        .sort_values(['ID', 'FiscalPeriod', 'HistCurrency', 'PIT Date'])
        .drop_duplicates(['ID', 'FiscalPeriod', 'HistCurrency'], keep='last')
        .rename(columns={'Value': 'TrueValue', 'PIT Date': 'TrueValue_Date'})
    )
    # Left-join true annual value back onto working
    working = working.merge(
        annual_src,
        on=['ID', 'FiscalPeriod', 'HistCurrency'],
        how='left'
    )

    # -------------------------------------------------------------------------
    # 2) As-of mapping (same FiscalPeriod) for A/Q/S/T
    # -------------------------------------------------------------------------

    # ----- Annual -----
    # Source rows for annual frequencies A/B
    src_A = working.loc[
        working['Frequency'].isin(['A', 'B']) & working['Value'].notna(),
        base_keys + ['PIT Date', 'Value']
    ].copy()
    # As-of join: for each working row, get most recent annual value by PIT Date
    vA, dA = asof_numpy(working, src_A, by_cols=base_keys)
    working['A'], working['A_Date'] = vA, dA
    # Origin fiscal period of annual value (same as row's FiscalPeriod when present)
    working['A_OriginFP'] = np.where(
        working['A'].notna(), working['FiscalPeriod'], np.nan
    )

    # ----- Quarterly -----
    # Source rows for quarterly frequencies (Q/C) with valid QNUM
    src_Q = working.loc[
        working['Frequency'].isin(['Q', 'C']) & working['QNUM'].notna(),
        base_keys + ['QNUM', 'PIT Date', 'Value']
    ].copy()
    for q in (1, 2, 3, 4):
        # Restrict to a specific quarter q
        rv = src_Q[src_Q['QNUM'] == q].drop(columns=['QNUM'])
        # As-of join for that quarter
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'Q{q}', f'Q{q}_Date'
        working[col], working[dcol] = v, d
        # Origin FP column for that quarter
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        # Fill origin FP only where quarter value is non-null and origin not yet set
        m = working[col].notna() & working[ocol].isna()
        working.loc[m, ocol] = working.loc[m, 'FiscalPeriod']

    # ----- Semiannual -----
    # Source rows for semiannual frequencies (S/F) with valid SNUM
    src_S = working.loc[
        working['Frequency'].isin(['S', 'F']) & working['SNUM'].notna(),
        base_keys + ['SNUM', 'PIT Date', 'Value']
    ].copy()
    for s in (1, 2):
        rv = src_S[src_S['SNUM'] == s].drop(columns=['SNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'S{s}', f'S{s}_Date'
        working[col], working[dcol] = v, d
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        m = working[col].notna() & working[ocol].isna()
        working.loc[m, ocol] = working.loc[m, 'FiscalPeriod']

    # ----- Trimester -----
    # Source rows for trimester frequencies (T/K) with valid TNUM
    src_T = working.loc[
        working['Frequency'].isin(['T', 'K']) & working['TNUM'].notna(),
        base_keys + ['TNUM', 'PIT Date', 'Value']
    ].copy()
    for t in (1, 2, 3):
        rv = src_T[src_T['TNUM'] == t].drop(columns=['TNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'T{t}', f'T{t}_Date'
        working[col], working[dcol] = v, d
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        m = working[col].notna() & working[ocol].isna()
        working.loc[m, ocol] = working.loc[m, 'FiscalPeriod']

    # -------------------------------------------------------------------------
    # 3) Prepare labels & normalize dates (NO prev-year fill, NO forward-fill)
    # -------------------------------------------------------------------------
    # Sort working data consistently for downstream calculations
    working = working.sort_values(['ID', 'HistCurrency', 'FiscalPeriod', 'PIT Date'])

    # List of all period value columns
    value_cols_all  = [f'Q{i}' for i in range(1, 5)] + \
                      [f'S{i}' for i in range(1, 3)] + \
                      [f'T{i}' for i in range(1, 4)] + ['A']
    # Corresponding date columns
    date_cols_all   = [f'{c}_Date' for c in value_cols_all]
    # Corresponding origin FP columns
    origin_cols_all = [f'{c}_OriginFP' for c in value_cols_all]

    # Ensure that all date columns are proper datetimes (floor to day)
    # Note: explicitly no groupby-forward-fill here – only asof-filled values remain
    for c in date_cols_all:
        if c in working.columns:
            working[c] = pd.to_datetime(working[c], errors='coerce').dt.floor('D')

    # -------------------------------------------------------------------------
    # 4) Build full-year candidates from fixed sets (Q1–Q4, S1–S2, T1–T3)
    # -------------------------------------------------------------------------
    def full_year_from_fixed(row, labels, pit, cutoff):
        """
        Fixed full-year from given labels (e.g. Q1..Q4, S1..S2, T1..T3):

          - Requires ALL labels to have:
              value, date, origin_fp.
          - All dates must be within [cutoff, pit].
          - origin_fp = max(origin_fp_i)  (the 'newest year' across components).

        Returns (dt, val, origin_fp) or (NaT, NaN, NaN) if any requirement fails.
        """
        vals, dts, fps = [], [], []
        for lbl in labels:
            v = row.get(lbl, np.nan)
            d = row.get(f'{lbl}_Date', pd.NaT)
            o = row.get(f'{lbl}_OriginFP', np.nan)

            # Require non-null for value, date, origin FP
            if pd.isna(v) or pd.isna(d) or pd.isna(o):
                return pd.NaT, np.nan, np.nan

            # Normalize date and check it lies within [cutoff, pit]
            d = pd.to_datetime(d, errors='coerce')
            if pd.isna(d) or not (cutoff <= d <= pit):
                return pd.NaT, np.nan, np.nan

            vals.append(float(v))
            dts.append(d)
            fps.append(int(o))

        # Sum all component values, pick latest date and max origin FP
        total_val = float(np.nansum(vals))
        latest_dt = max(dts)
        origin_fp = max(fps)
        return latest_dt, total_val, origin_fp

    def pick_annpit_sum_with_origin(row):
        """
        For each row, pick the best annual PIT-based value (AnnPITValue) by:

        1) Building a time window [PIT - 365 days, PIT].
        2) Generating candidate annual values:
           - actual annual A (if within window and with origin FP),
           - Q4 proxy from Q1..Q4,
           - S2 proxy from S1..S2,
           - T3 proxy from T1..T3,
           each with associated priority and origin FP.
        3) Ranking candidates according to:
           - Same-year A (highest),
           - Same-year proxies (Q4/T3/S2),
           - Prior-year A,
           - Prior-year proxies,
           - Otherwise: any remaining candidate by (priority, latest date).
        4) Returning the chosen candidate's value (keeping zeros, but filtering NaNs).
        """
        pit = row['PIT Date']
        if pd.isna(pit):
            return np.nan
        cutoff = pit - timedelta(days=365)  # Now works because timedelta is imported

        # Current row's fiscal period, cast to int if possible
        fp = row.get('FiscalPeriod', np.nan)
        try:
            fp_int = int(fp) if not pd.isna(fp) else None
        except Exception:
            fp_int = None

        # Collect candidate tuples: (label, priority, date, value, origin_fp)
        candidates = []

        # --- Candidate A: actual annual (0 is allowed)
        A_val = row.get('A', np.nan)
        A_dt  = row.get('A_Date', pd.NaT)
        A_ofp = row.get('A_OriginFP', np.nan)
        if pd.notna(A_val) and pd.notna(A_dt) and not pd.isna(A_ofp):
            A_dt = pd.to_datetime(A_dt, errors='coerce')
            if pd.notna(A_dt) and (cutoff <= A_dt <= pit):
                candidates.append(('A', _PERIOD_PRIORITY['A'], A_dt, float(A_val), int(A_ofp)))

        # --- Candidate Q4: sum of Q1..Q4 (0 allowed)
        q4_dt, q4_val, q4_fp = full_year_from_fixed(
            row, [f'Q{i}' for i in range(1, 5)], pit, cutoff
        )
        if pd.notna(q4_val) and pd.notna(q4_dt) and not pd.isna(q4_fp):
            candidates.append(('Q4', _PERIOD_PRIORITY['Q4'], q4_dt, float(q4_val), int(q4_fp)))

        # --- Candidate S2: sum of S1..S2 (0 allowed)
        s2_dt, s2_val, s2_fp = full_year_from_fixed(
            row, [f'S{i}' for i in range(1, 3)], pit, cutoff
        )
        if pd.notna(s2_val) and pd.notna(s2_dt) and not pd.isna(s2_fp):
            candidates.append(('S2', _PERIOD_PRIORITY['S2'], s2_dt, float(s2_val), int(s2_fp)))

        # --- Candidate T3: sum of T1..T3 (0 allowed)
        t3_dt, t3_val, t3_fp = full_year_from_fixed(
            row, [f'T{i}' for i in range(1, 4)], pit, cutoff
        )
        if pd.notna(t3_val) and pd.notna(t3_dt) and not pd.isna(t3_fp):
            candidates.append(('T3', _PERIOD_PRIORITY['T3'], t3_dt, float(t3_val), int(t3_fp)))

        # If no candidates, return NaN
        if not candidates:
            return np.nan

        # Filter out NaN-valued candidates (keep 0-valued ones)
        def valid(seq):
            return [c for c in seq if not np.isnan(c[3])]

        # Relationship between row's fiscal period and candidate's origin FP:
        # same / prior / other / unknown
        def rel(c):
            _, _, _, _, ofp = c
            if fp_int is None or ofp is None:
                return 'unknown'
            if ofp == fp_int:
                return 'same'
            if ofp == fp_int - 1:
                return 'prior'
            return 'other'

        # 1) Same-year actual annual A (max by date)
        same_A = valid(c for c in candidates if c[0] == 'A' and rel(c) == 'same')
        if same_A:
            best = max(same_A, key=lambda x: x[2])
            return best[3]

        # 2) Same-year proxies (Q4/T3/S2), max by (priority, date)
        same_proxies = valid(
            c for c in candidates
            if c[0] in ('Q4', 'S2', 'T3') and rel(c) == 'same'
        )
        if same_proxies:
            best = max(same_proxies, key=lambda x: (x[1], x[2]))
            return best[3]

        # 3) Prior-year actual annual A
        prior_A = valid(c for c in candidates if c[0] == 'A' and rel(c) == 'prior')
        if prior_A:
            best = max(prior_A, key=lambda x: x[2])
            return best[3]

        # 4) Prior-year proxies (Q4/S2/T3) if no prior A
        prior_proxies = valid(
            c for c in candidates
            if c[0] in ('Q4', 'S2', 'T3') and rel(c) == 'prior'
        )
        if prior_proxies:
            best = max(prior_proxies, key=lambda x: (x[1], x[2]))
            return best[3]

        # 5) Fallback: any candidate (other/unknown) by (priority, date)
        others = valid(candidates)
        if others:
            best = max(others, key=lambda x: (x[1], x[2]))
            return best[3]

        # Final fallback: 0.0 (should rarely be reached)
        return 0.0

    # Apply the selection function row-wise to produce AnnPITValue
    working['AnnPITValue'] = working.apply(pick_annpit_sum_with_origin, axis=1)

    # -------------------------------------------------------------------------
    # 5) QC: Future-date check + PRE-DROP stats
    # -------------------------------------------------------------------------
    # Columns whose dates should not exceed PIT Date
    date_cols = [
        'A_Date',
        'Q1_Date', 'Q2_Date', 'Q3_Date', 'Q4_Date',
        'S1_Date', 'S2_Date',
        'T1_Date', 'T2_Date', 'T3_Date'
    ]
    # Restrict to ones actually present
    present = [c for c in date_cols if c in working.columns]

    viol_counts = {}  # per-label violation counts
    # Mask for rows with any future-dated period
    any_mask = pd.Series(False, index=working.index)

    for c in present:
        # A violation is when period date > PIT Date (both need to be non-null)
        m = (
            working[c].notna() &
            working['PIT Date'].notna() &
            (pd.to_datetime(working[c], errors='coerce') > working['PIT Date'])
        )
        viol_counts[c] = int(m.sum())
        any_mask |= m  # accumulate violations across columns

    total_future_viol = int(any_mask.sum())
    print("\n=== Future-date check (period dates > PIT Date) ===")
    print("Per-label violations:", viol_counts)
    print(f"Rows with ANY future-dated period value: {total_future_viol}")
    # Flagrows with at least one future-date error
    working['HasFutureDateError'] = any_mask

    # -------------------------------------------------------------------------
    # 6) AnnPITValue_Pct + quality drop
    # -------------------------------------------------------------------------
    # Compute AnnPITValue as % of TrueValue (only when TrueValue != 0)
    working['AnnPITValue_Pct'] = np.where(
        working['AnnPITValue'].notna() &
        working['TrueValue'].notna() &
        (working['TrueValue'] != 0),
        (working['AnnPITValue'] / working['TrueValue']) * 100,
        np.nan
    )

    # Stats before dropping low-quality rows
    pre_stats = summarize_pct(working['AnnPITValue_Pct'])
    print("\n=== AnnPITValue_Pct summary (finite only) — BEFORE quality drop ===")
    for k, v in pre_stats.items():
        print(f"{k:>20}: {v}")

    # Build masks for dropping:
    pct = working['AnnPITValue_Pct']
    is_inf = np.isinf(pct)  # infinite percentages
    is_finite = np.isfinite(pct)
    # Out-of-range if % > 200 or % < 50 (but finite)
    out_of_range = is_finite & ((pct > 200) | (pct < 50))
    # Rows to drop: infinite or out-of-range values
    to_drop_quality = is_inf | out_of_range

    dropped_quality_rows = int(to_drop_quality.sum())
    print(f"\nRows to drop due to AnnPITValue_Pct (±inf or >200 or <50): {dropped_quality_rows:,}")

    # Keep only rows that passed the quality filter
    working = working.loc[~to_drop_quality].copy()

    # Stats after dropping
    post_stats = summarize_pct(working['AnnPITValue_Pct'])
    print("\n=== AnnPITValue_Pct summary — AFTER quality drop ===")
    if post_stats:
        for k, v in post_stats.items():
            print(f"{k:>20}: {v}")
    else:
        print("No finite values remain after the quality drop.")

    # -------------------------------------------------------------------------
    # 7) Final columns & save
    # -------------------------------------------------------------------------
    # Base descriptive columns to keep (if present)
    base_cols = [
        'ID', 'CompanyName', 'ImplCountry', 'CurrentCurrency', 'HistCurrency',
        'PIT Date', 'Frequency', 'UpdateCode', 'FiscalPeriod', 'FYE Month',
        'ItemCode', 'Value', 'Str_FiscalPrd'
    ]

    # Build ordered list of period date/value columns
    freq_cols = []
    for i in range(1, 5):
        freq_cols += [f'Q{i}_Date', f'Q{i}']
    for i in range(1, 3):
        freq_cols += [f'S{i}_Date', f'S{i}']
    for i in range(1, 4):
        freq_cols += [f'T{i}_Date', f'T{i}']
    freq_cols += ['A_Date', 'A']

    # Final set of columns to keep in output
    keep_cols = (
        [c for c in base_cols if c in working.columns] +
        ['TrueValue', 'AnnPITValue', 'AnnPITValue_Pct', 'HasFutureDateError'] +
        [c for c in freq_cols if c in working.columns]
    )

    # Helper columns to drop before export
    drop_cols = ['QNUM', 'SNUM', 'TNUM', 'TrueValue_Date']
    # Also drop all *_OriginFP columns
    drop_cols += [c for c in working.columns if c.endswith('_OriginFP')]
    working.drop(columns=[c for c in drop_cols if c in working.columns],
                 inplace=True, errors='ignore')

    # Reorder and restrict columns to the final layout
    mixed_processed = working.reindex(columns=keep_cols)

    # Sanity checks: necessary globals must exist
    assert 'Temp_file_path_DP' in globals(), "Temp_file_path_DP not found."
    assert 'base_output_filename' in globals(), "base_output_filename not found (set in Cell 0)."

    # Build full output path and save pipe-delimited file
    out_full = os.path.join(Temp_file_path_DP, f"{base_output_filename}.txt")
    mixed_processed.to_csv(out_full, sep='|', index=False)
    print("\nSaved full:", out_full)

    # Create a subset for lighter inspection
    subset_cols = ["ID", "PIT Date", "CompanyName", "HistCurrency",
                   "FiscalPeriod", "AnnPITValue"]
    # Only keep subset columns that actually exist
    subset_cols_existing = [col for col in subset_cols if col in mixed_processed.columns]
    subset_df = mixed_processed[subset_cols_existing].copy()
    out_subset = os.path.join(Temp_file_path_DP,
                              f"{base_output_filename}_subset.txt")
    subset_df.to_csv(out_subset, sep='|', index=False)
    print("Saved subset:", out_subset)
    # Explicitly delete subset_df to free memory
    del subset_df

    # -------------------------------------------------------------------------
    # 8) Row-accounting overview
    # -------------------------------------------------------------------------
    output_rows = len(mixed_processed)
    print("\n=== Row Accounting ===")
    print(f"Input rows:                     {input_rows:,}")
    print(f"Excludedby Frequency (E/L/R/U): {excluded_rows:,}")
    print(f"Dropped by quality (Pct rules): {dropped_quality_rows:,}")
    print(f"Output rows (final):            {output_rows:,}")
    # Sum up excluded + dropped + remaining and check against original count
    check_total = excluded_rows + dropped_quality_rows + output_rows
    print(f"Check: excluded + dropped + output = {check_total:,}")
    if check_total == input_rows:
        print("Row counts reconcile exactly.")
    else:
        print(f"Mismatch of {input_rows - check_total:+,} rows. "
              "Investigate upstream filtering or unexpected drops.")

    # Trigger garbage collection as a final cleanup step
    gc.collect()

else:
    # If the main input dataset is not available, skip all processing
    print("mixed_encoded not found or None; skipping.")

Input dataset contains 4,227,003 rows before processing.


=== Future-date check (period dates > PIT Date) ===
Per-label violations: {'A_Date': 0, 'Q1_Date': 0, 'Q2_Date': 0, 'Q3_Date': 0, 'Q4_Date': 0, 'S1_Date': 0, 'S2_Date': 0, 'T1_Date': 0, 'T2_Date': 0, 'T3_Date': 0}
Rows with ANY future-dated period value: 0

=== AnnPITValue_Pct summary (finite only) — BEFORE quality drop ===
         finite_rows: 2488370
                mean: 22087.72593423967
              median: 100.0
winsorized_mean_1pct: 102.20628914622627
                 p10: 99.14078087541672
                 p20: 100.0
                 p30: 100.0
                 p40: 100.0
                 p50: 100.0
                 p60: 100.0
                 p70: 100.0
                 p80: 100.0
                 p90: 103.0348917912802

Rows to drop due to AnnPITValue_Pct (±inf or >200 or <50): 70,143

=== AnnPITValue_Pct summary — AFTER quality drop ===
         finite_rows: 2418227
                mean: 101.12530173470029
        

### Mixed 2

#### Set Index

In [62]:
# =====================================================================================
# SUMMARY
# =====================================================================================
# This code selects which Mixed_* dataset should be processed by choosing an index
# (e.g., Mixed_1, Mixed_2, ...). It then:
#
#   1. Ensures that a dictionary `mixed_vars` exists, mapping keys like "Mixed_1"
#      to item names.
#   2. Builds the key corresponding to the selected index and retrieves the
#      associated item name (`target_item_name`).
#   3. Prints which Mixed_* item was selected.
#   4. Constructs paths and filenames based on global variables and the selected item.
#   5. Ensures that the output directory exists by creating it if necessary.
#
# The goal is to centralize selection of a single Mixed_* dataset and prepare paths
# for downstream processing.


# === Select which Mixed_* item to run ===
mixed_index = 2  # Change this to process another dataset (e.g., 10)

# Validate that the dictionary of mixed item names exists
assert 'mixed_vars' in globals(), "mixed_vars dict not found in globals()."

# Build the key (e.g., "Mixed_1") and retrieve the associated item name
item_key = f"Mixed_{mixed_index}"
target_item_name = mixed_vars.get(item_key)
assert target_item_name, f"{item_key} not found in mixed_vars."

# Inform which item was selected
print(f"Selected: {item_key}  ->  ItemName: '{target_item_name}'")

# === Paths (reusing globals) ===
assert 'Temp_file_path_DP' in globals(), "Temp_file_path_DP not found."

# Construct the name of the input file for the selected item
file_name = f"work_subset_{target_item_name}.txt"
file_path = os.path.join(Temp_file_path_DP, file_name)

# Construct the base name for output files (will later be suffixed)
base_output_filename = f"Mixed_{target_item_name}_complete"

# Ensure the output directory exists; create it (including parent dirs) if needed
Path(Temp_file_path_DP).mkdir(parents=True, exist_ok=True)


Selected: Mixed_2  ->  ItemName: 'Depreciation_Depletion__Amortization'


#### Import relevant data



In [63]:
# =====================================================================================
# SUMMARY
# =====================================================================================
# This block:
#   1. Announces the import of a full dataset for the given `target_item_name`.
#   2. Checks whether the file at `file_path` exists.
#   3. If it exists, calls `import_file_to_dataframe(file_path)` to load the data
#      into `mixed_raw`.
#   4. If the loaded DataFrame is non-empty, prints a success message including
#      the number of rows and shows the first few rows (via display or fallback
#      to text printing).
#   5. If the load fails or returns an empty DataFrame, prints a warning and
#      creates an empty DataFrame.
#   6. If the file does not exist, prints an error message and sets `mixed_raw`
#      to an empty DataFrame.
#   7. Finally, it runs `gc.collect()` to trigger garbage collection and free
#      memory.
#
# Note: All previous emoji symbols in the print statements have been removed.

# Inform the user that we are starting the import for this item
print(f"\nImporting full dataset for Item: '{target_item_name}' ...")

# Check if the file path exists on disk
if os.path.exists(file_path):
    # If the file exists, attempt to import it into a DataFrame
    mixed_raw = import_file_to_dataframe(file_path)

    # If the DataFrame is successfully loaded and not empty
    if mixed_raw is not None and not mixed_raw.empty:
        # Print a success message including row count with thousands separator
        print(f"Full dataset loaded successfully: {len(mixed_raw):,} rows total.")

        try:
            # Try to display the first few rows (Jupyter / IPython display)
            display(mixed_raw.head())
        except Exception:
            # If display is not available, fall back to printing as plain text
            print(mixed_raw.head().to_string(index=False))
    else:
        # If DataFrame is None or empty, warn and create an empty DataFrame
        print("Dataset appears empty or could not be loaded.")
        mixed_raw = pd.DataFrame()
else:
    # If the file does not exist, report an error and set mixed_raw to empty
    print(f"File not found: {file_path}")
    mixed_raw = pd.DataFrame()

# Run garbage collection to free up memory after the load attempt
gc.collect()



Importing full dataset for Item: 'Depreciation_Depletion__Amortization' ...


Full dataset loaded successfully: 3,390,222 rows total.


Unnamed: 0,ID,CompanyName,ImplCountry,CurrentCurrency,HistCurrency,PIT Date,Frequency,UpdateCode,FiscalPeriod,FYE Month,ItemCode,Value
0,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1992,December,1151,38.094417
1,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1993,December,1151,68.245694
2,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1994,December,1151,89.54678
3,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1996-05-03,A,3,1995,December,1151,98.504938
4,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1998-07-03,A,3,1996,December,1151,84.007512


3

#### Encode Frequency Code (Check of output required!)

In [64]:
# =====================================================================================
# SUMMARY
# =====================================================================================
# This snippet provides:
#
# 1. A helper function `last2` that returns the last two digits of a number as a
#    zero-padded string (for building YY strings).
#
# 2. A function `add_str_fiscalprd(df)` which:
#    - Works on a copy of an input DataFrame containing financial periods.
#    - Normalizes the 'Frequency' (upper-case, no missing).
#    - Stores the original 'FiscalPeriod' and converts it to numeric.
#    - Creates a string representation 'Str_FiscalPrd' depending on the frequency:
#         - Q/C/E/R: quarter-based ("QnYyy")
#         - A/B: annual ("Yyy")
#         - F/S: semiannual ("SnYyy")
#         - K/T/L/U: trimester-like ("TnYyy")
#    - Derives an implied full-year integer 'ImplFiscPer_Calculated' from the
#      two-digit year (80–99 => 19xx, else 20xx).
#    - For annual rows (A/B), checks discrepancies between original
#      'FiscalPeriod' and implied full-year; prints a small preview & total count.
#    - Overwrites 'FiscalPeriod' with 'ImplFiscPer_Calculated' and drops helper
#      columns.
#
# 3. A small driver block that:
#    - Checks that `mixed_raw` exists and is non-empty.
#    - Applies `add_str_fiscalprd` to produce `mixed_encoded`.
#    - Displays a head preview or prints a message and sets `mixed_encoded = None`
#      if input is missing/empty.

def last2(n):
    """Return last two digits as zero-padded string, or None if NaN."""
    # If input is NaN (or pandas-style missing), return None (no digits)
    if pd.isna(n):
        return None
    # Convert to int, format as 4-digit zero-padded string, take last 2 chars
    # Example: n=2023 -> "2023"[-2:] -> "23"
    return f"{int(n):04d}"[-2:]


def add_str_fiscalprd(df):
    """
    Creates 'Str_FiscalPrd' and overwrites 'FiscalPeriod' with an implied full year.
    Also prints discrepancies for Annual (A,B) rows between original and calculated year.

    The mapping logic:
      - Quarterly (C, Q, E, R):   fp encodes a quarter index -> "Q{1-4}Y{yy}"
      - Annual   (A, B):         fp is the year itself      -> "Y{yy}"
      - Semiannual (F, S):       fp encodes half-year       -> "S{1-2}Y{yy}"
      - Trimester-like (K, T,
         L, U):                  fp encodes trimester       -> "T{1-3}Y{yy}"
    Then we re-infer the full year from the YY part with 19xx/20xx rule.
    """
    # Work on a copy to avoid mutating the original input DataFrame
    df = df.copy()

    # Normalize frequency codes:
    # - uppercase for consistency
    # - fill missing values with empty string
    df["Frequency"] = df["Frequency"].str.upper().fillna("")

    # Store original FiscalPeriod in a separate column for later comparison
    df['Original_FiscalPeriod'] = df['FiscalPeriod']

    # Convert FiscalPeriod to numeric, coerce errors to NaN
    fp = pd.to_numeric(df["FiscalPeriod"], errors="coerce")

    # Masks for different frequency groups
    m_quarter = df["Frequency"].isin(["C", "Q", "E", "R"])  # quarter-based
    m_AB      = df["Frequency"].isin(["A", "B"])            # annual
    m_FS      = df["Frequency"].isin(["F", "S"])            # semiannual
    m_KTLU    = df["Frequency"].isin(["K", "T", "L", "U"])  # trimester-like

    # Initialize Str_FiscalPrd as NaN; we will fill per frequency group
    df["Str_FiscalPrd"] = np.nan

    # -------------------------
    # Quarter-based encoding
    # -------------------------
    # Quarter number: (fp % 4) + 1 => yields 1..4 (if fp is integer-based encoding)
    q_part = ((fp % 4) + 1).where(m_quarter)
    # Year part for quarter: integer division by 4, then take last two digits
    q_year = (fp // 4).where(m_quarter).apply(last2)
    # Assign "Q{quarter}Y{yy}" for quarter frequencies
    df.loc[m_quarter, "Str_FiscalPrd"] = (
        "Q" +
        q_part.astype("Int64").astype(str) +
        "Y" +
        q_year.fillna('')
    )

    # -------------------------
    # Annual encoding (A, B)
    # -------------------------
    # Year part is fp itself for A/B; take last two digits via last2
    ab_year = fp.where(m_AB).apply(last2)
    # Assign "Y{yy}" for annual frequencies
    df.loc[m_AB, "Str_FiscalPrd"] = "Y" + ab_year.fillna('')

    # -------------------------
    # Semiannual encoding (F, S)
    # -------------------------
    # Semester number: (fp % 2) + 1 => 1 or 2
    fs_sem  = ((fp % 2) + 1).where(m_FS)
    # Year part: fp // 2, then last two digits
    fs_year = (fp // 2).where(m_FS).apply(last2)
    # Assign "S{sem}Y{yy}" for F/S
    df.loc[m_FS, "Str_FiscalPrd"] = (
        "S" +
        fs_sem.astype("Int64").astype(str) +
        "Y" +
        fs_year.fillna('')
    )

    # -------------------------
    # Trimester-like encoding (K, T, L, U)
    # -------------------------
    # Term number: (fp % 3) + 1 => 1..3
    t_term  = ((fp % 3) + 1).where(m_KTLU)
    # Year part: fp // 3, then last two digits
    t_year  = (fp // 3).where(m_KTLU).apply(last2)
    # Assign "T{term}Y{yy}" for K/T/L/U frequencies
    df.loc[m_KTLU, "Str_FiscalPrd"] = (
        "T" +
        t_term.astype("Int64").astype(str) +
        "Y" +
        t_year.fillna('')
    )

    # ---------------------------------------------------------------------
    # Implied full year from Str_FiscalPrd (19xx / 20xx reconstruction)
    # ---------------------------------------------------------------------
    # Extract two-digit year from "Yyy" pattern, e.g., "Q1Y23" -> "23"
    year_part = df['Str_FiscalPrd'].str.extract(r'Y(\d{2})', expand=False)
    # Convert that to numeric (NaN if not parseable)
    year_numeric = pd.to_numeric(year_part, errors='coerce')

    # Map two-digit year to full year:
    #  - 80–99 => 19xx
    #  - else  => 20xx
    df['ImplFiscPer_Calculated'] = year_numeric.apply(
        lambda x: int(f"19{int(x):02d}") if pd.notna(x) and x >= 80
        else (int(f"20{int(x):02d}") if pd.notna(x) else np.nan)
    )

    # ---------------------------------------------------------------------
    # Discrepancies check for Annual frequencies (A, B)
    # ---------------------------------------------------------------------
    # Work on a subset of annual frequencies only
    annual_rows_for_check = df[m_AB].copy()

    # Build boolean mask where implied full year does NOT match the original
    discrepancy_mask_annual = ~(
        # Either they match numerically...
        (annual_rows_for_check['ImplFiscPer_Calculated'] ==
         pd.to_numeric(annual_rows_for_check['Original_FiscalPeriod'], errors='coerce'))
        |
        # ...or both are NaN
        (annual_rows_for_check['ImplFiscPer_Calculated'].isna() &
         pd.to_numeric(annual_rows_for_check['Original_FiscalPeriod'], errors='coerce').isna())
    )

    # Subset rows with discrepancies
    discrepancy_rows = annual_rows_for_check[discrepancy_mask_annual].copy()

    # If we have any, show a short sample and the total count
    if not discrepancy_rows.empty:
        print("\nDiscrepancies found between original FiscalPeriod and calculated ImplFiscPer for Annual (A, B) frequencies:")
        display(
            discrepancy_rows[
                ['ID', 'Frequency', 'Original_FiscalPeriod', 'Str_FiscalPrd', 'ImplFiscPer_Calculated']
            ].head()
        )
        print(f"Total discrepancies found for Annual frequencies: {len(discrepancy_rows)}")
    else:
        print("\nNo discrepancies found between original FiscalPeriod and calculated ImplFiscPer for Annual (A, B) frequencies.")

    # ---------------------------------------------------------------------
    # Overwrite FiscalPeriod and drop temporary helper columns
    # ---------------------------------------------------------------------
    # Replace 'FiscalPeriod' with the implied full-year value we just calculated
    df['FiscalPeriod'] = df['ImplFiscPer_Calculated']
    # Drop helper columns not needed downstream
    df.drop(columns=['Original_FiscalPeriod', 'ImplFiscPer_Calculated'], inplace=True)

    # Return the modified DataFrame
    return df


# =============================================================================
# Driver: apply encoding to mixed_raw if present and non-empty
# =============================================================================
if 'mixed_raw' in globals() and mixed_raw is not None and not mixed_raw.empty:
    # Inform which item we are encoding for
    print(f"Applying encoding to Mixed dataset for '{target_item_name}' ...")
    # Apply fiscal period encoding
    mixed_encoded = add_str_fiscalprd(mixed_raw)
    # Show a preview of the encoded data
    display(mixed_encoded.head())
else:
    # If mixed_raw is missing or empty, we skip and mark mixed_encoded as None
    print("mixed_raw not found or empty. Cannot perform encoding.")
    mixed_encoded = None


Applying encoding to Mixed dataset for 'Depreciation_Depletion__Amortization' ...


  df.loc[m_quarter, "Str_FiscalPrd"] = (



No discrepancies found between original FiscalPeriod and calculated ImplFiscPer for Annual (A, B) frequencies.


Unnamed: 0,ID,CompanyName,ImplCountry,CurrentCurrency,HistCurrency,PIT Date,Frequency,UpdateCode,FiscalPeriod,FYE Month,ItemCode,Value,Str_FiscalPrd
0,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1992,December,1151,38.094417,Y92
1,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1993,December,1151,68.245694,Y93
2,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1994,December,1151,89.54678,Y94
3,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1996-05-03,A,3,1995,December,1151,98.504938,Y95
4,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1998-07-03,A,3,1996,December,1151,84.007512,Y96


#### Annualize data with most recent information (Check of output required!)

In [65]:
# @title
# =====================================================================================
# SUMMARY
# =====================================================================================
# This script takes an input DataFrame `mixed_encoded` (if present in the global scope)
# that contains financial time-series data (per company, item, currency, fiscal period,
# and PIT Date). It then:
#
# 1. Cleans and standardizes key columns (dates, numeric types, string IDs).
# 2. Excludes rows with certain frequencies (E/L/R/U).
# 3. Parses fiscal period strings into quarter/semester/trimester indicators (QNUM/SNUM/TNUM).
# 4. Uses a custom, vectorized "as-of" join (`asof_numpy`) to attach the most recent
#    annual, quarterly, semiannual, and trimester values for each (ID, HistCurrency,
#    ItemCode, FiscalPeriod) up to each row’s PIT Date.
# 5. Builds "full-year" candidate values from:
#       - actual annuals (A),
#       - sum of Q1..Q4 (Q4 proxy),
#       - sum of S1..S2 (S2 proxy),
#       - sum of T1..T3 (T3 proxy),
#    and selects the best candidate based on priorities and relationship to the row’s
#    fiscal period (same-year vs prior-year).
# 6. Computes an annual PIT-based metric `AnnPITValue` and compares it to the “true”
#    annual value (`TrueValue`) to derive a percentage `AnnPITValue_Pct` for QC.
# 7. Performs quality checks:
#       - Ensures no period-date is after the PIT Date.
#       - Drops rows whose `AnnPITValue_Pct` is outside the range [50, 200] or infinite.
# 8. Keeps a curated set of columns, drops helper columns, and saves:
#       - a full output file
#       - a subset file with key columns for quick inspection.
# 9. Prints row-accounting stats and frees some memory.
#
# If `mixed_encoded` is not defined or is None, it simply prints a message and exits.

import pandas as pd
import numpy as np
import os
import gc
from datetime import timedelta            # <--- Added to fix NameError
from scipy.stats.mstats import winsorize  # <--- Added to fix NameError

# Enable pandas "copy-on-write" behavior to reduce unintended chained assignment effects
pd.options.mode.copy_on_write = True

# ---------- Helper: fast as-of (right.PIT <= left.PIT) ----------

def _key(fr, cols):
    """
    Helper function to build a string key from multiple columns.
    For each row, join the values of 'cols' with '||'.
    """
    return fr[cols].astype(str).agg('||'.join, axis=1)


def asof_numpy(left_df: pd.DataFrame, right_df: pd.DataFrame, by_cols: list[str]):
    """
    For each row in left_df, find the latest (as-of) Value from right_df
    with matching by_cols and right_df['PIT Date'] <= left_df['PIT Date'].

    This is a manual, NumPy-based implementation of an "as-of merge" grouped by `by_cols`.
    It:
    - filters out rows with missing required fields,
    - normalizes PIT Date to daily precision,
    - groups right_df by composite key of `by_cols`,
    - for each left row, binary-searches into the matching right group by PIT Date,
      picking the last date <= the left PIT Date,
    - returns two aligned arrays: (values, dates) for the left_df rows.
    """
    # Initialize outputs with NaNs and NaT for all left_df rows
    out_vals  = np.full(len(left_df), np.nan, dtype='float64')
    out_dates = np.full(len(left_df), 'NaT', dtype='datetime64[ns]')

    # Required columns on each side
    left_req  = by_cols + ['PIT Date']
    right_req = by_cols + ['PIT Date', 'Value']

    # Mask: rows that have all required fields non-null
    lmask = left_df[left_req].notna().all(axis=1)
    rmask = right_df[right_req].notna().all(axis=1)

    # If either side has no valid rows, return the default empty outputs
    if not lmask.any() or not rmask.any():
        return out_vals, out_dates

    # Work on filtered copies only (avoid side effects)
    l = left_df.loc[lmask, left_req].copy()
    r = right_df.loc[rmask, right_req].copy()

    # Normalize PIT Date columns to datetime at day precision
    l['PIT Date'] = pd.to_datetime(l['PIT Date'], errors='coerce').dt.floor('D')
    r['PIT Date'] = pd.to_datetime(r['PIT Date'], errors='coerce').dt.floor('D')

    # Build composite keys for group-level match (based on by_cols)
    l['__k'] = _key(l, by_cols)
    r['__k'] = _key(r, by_cols)

    # Sort right side by key and date to enable binary search per key
    r = r.sort_values(['__k', 'PIT Date']).reset_index(drop=True)

    # Extract NumPy arrays for right side
    rk   = r['__k'].to_numpy()
    rdt  = r['PIT Date'].to_numpy()
    rval = r['Value'].to_numpy()

    # Find unique keys and first index for each key in the sorted right side
    uniq, first = np.unique(rk, return_index=True)

    # Build a dict: key -> (dates array, values array) slice
    slices = {}
    for i, k in enumerate(uniq):
        s = first[i]  # start of this key
        e = first[i + 1] if i + 1 < len(first) else len(r)  # end of this key
        slices[k] = (rdt[s:e], rval[s:e])

    # Left side indices and arrays
    l_idx = l.index.to_numpy()
    lk    = l['__k'].to_numpy()
    ldt   = l['PIT Date'].to_numpy()

    # Sort left side by key (stable mergesort to preserve original row order within key)
    order = np.argsort(lk, kind='mergesort')
    sk, sd, sp = lk[order], ldt[order], l_idx[order]

    # i iterates over the sorted left rows
    i = 0
    n = len(sk)
    while i < n:
        k = sk[i]  # current composite key
        j = i + 1
        # Find contiguous block [i:j) for this key
        while j < n and sk[j] == k:
            j += 1

        # Dates and original positions of this key’s left rows
        block_dates = sd[i:j]
        block_pos   = sp[i:j]

        # If we have matching right-hand slices for this key, do the as-of search
        if k in slices:
            r_dates, r_vals = slices[k]
            # searchsorted(..., 'right') - 1 gives index of last r_date <= block_date
            pos   = np.searchsorted(r_dates, block_dates, side='right') - 1
            valid = pos >= 0  # positions where such a date exists
            if np.any(valid):
                # Fill outputs for left rows where we found a valid match
                out_vals[block_pos[valid]]  = r_vals[pos[valid]]
                out_dates[block_pos[valid]] = r_dates[pos[valid]]
        # Move to next key block
        i = j

    return out_vals, out_dates


# ---------- Small helpers ----------

def pctile(s, q):
    """
    Safe percentile calculation: returns quantile q of Series `s`,
    or NaN if quantile fails.
    """
    try:
        return s.quantile(q, interpolation='linear')
    except Exception:
        return np.nan


def summarize_pct(series: pd.Series):
    """
    Computes a summary of finite values of a series:
      - count
      - mean, median
      - 1% winsorized mean
      - selected deciles (p10, p20, ..., p90)
    Infinite values are treated as NaN and removed.
    If no finite values remain, returns an empty dict.
    """
    # Replace +/-inf with NaN and drop all NaNs
    s = series.replace([np.inf, -np.inf], np.nan).dropna()
    if s.empty:
        return {}
    
    # FIX APPLIED HERE: .to_numpy().copy() ensures the array is writable for winsorize
    w_mean = winsorize(s.to_numpy().copy(), limits=[0.01, 0.01]).mean()
    
    return {
        "finite_rows": len(s),
        "mean": s.mean(),
        "median": s.median(),
        "winsorized_mean_1pct": w_mean,
        "p10": pctile(s, 0.10), "p20": pctile(s, 0.20), "p30": pctile(s, 0.30),
        "p40": pctile(s, 0.40), "p50": pctile(s, 0.50), "p60": pctile(s, 0.60),
        "p70": pctile(s, 0.70), "p80": pctile(s, 0.80), "p90": pctile(s, 0.90)
    }


# ---------- Priority for full-year candidates ----------

# Fixed priority mapping for full-year candidates:
#   'A'  : actual annual value
#   'Q4' : annual proxy from four quarters
#   'T3' : annual proxy from three trimesters
#   'S2' : annual proxy from two semesters
_PERIOD_PRIORITY = {
    'A': 100,  # highest priority: actual annual
    'Q4': 90,  # then Q1+Q2+Q3+Q4
    'T3': 80,  # then T1+T2+T3
    'S2': 70,  # then S1+S2
}

# ============================ MAIN ============================

# Only run the main logic if a global DataFrame `mixed_encoded` exists and is not None
if 'mixed_encoded' in globals() and mixed_encoded is not None:
    # Count initial input rows
    input_rows = len(mixed_encoded)
    print(f"Input dataset contains {input_rows:,} rows before processing.\n")

    # Work on a copy of the input dataset
    working = mixed_encoded.copy()

    # -------------------------------------------------------------------------
    # Exclude certain frequencies (E/L/R/U)
    # -------------------------------------------------------------------------
    # Create mask of rows whose Frequency is one of E, L, R, U (case-insensitive)
    excl_mask = working['Frequency'].astype(str).str.upper().isin(['E', 'L', 'R', 'U'])
    # Count how many rows will be excluded
    excluded_rows = int(excl_mask.sum())
    # Keep only rows that are NOT in the exclusion set
    working = working.loc[~excl_mask].copy()

    # -------------------------------------------------------------------------
    # Set dtypes and normalize important columns
    # -------------------------------------------------------------------------
    # Convert PIT Date to datetime (coerce errors -> NaT), floor to day
    working['PIT Date']     = pd.to_datetime(working['PIT Date'], errors='coerce').dt.floor('D')
    # FiscalPeriod: numeric (e.g., 2021, 2022, ...)
    working['FiscalPeriod'] = pd.to_numeric(working['FiscalPeriod'], errors='coerce')
    # Value: numeric (float)
    working['Value']        = pd.to_numeric(working['Value'], errors='coerce')

    # Convert key ID / code columns to string to ensure consistency
    for c in ['ID', 'HistCurrency', 'ItemCode', 'Frequency', 'Str_FiscalPrd']:
        if c in working.columns:
            working[c] = working[c].astype(str)

    # -------------------------------------------------------------------------
    # Parse Q/S/T markers from Str_FiscalPrd (like 'Q1Y2023')
    # -------------------------------------------------------------------------
    # Extract quarter number Q1..Q4 from e.g. "Q1Y2023" into QNUM
    working['QNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^Q([1-4])Y', expand=False),
        errors='coerce'
    )
    # Extract semiannual number S1..S2 into SNUM
    working['SNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^S([1-2])Y', expand=False),
        errors='coerce'
    )
    # Extract trimester number T1..T3 into TNUM
    working['TNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^T([1-3])Y', expand=False),
        errors='coerce'
    )

    # -------------------------------------------------------------------------
    # Ensure period columns exist (Q1..Q4, S1..S2, T1..T3, A + their date cols)
    # -------------------------------------------------------------------------
    # Create value columns for Q1..Q4, S1..S2, T1..T3, A if they are missing
    for c in [*(f'Q{i}' for i in range(1, 5)),
              *(f'S{i}' for i in range(1, 3)),
              *(f'T{i}' for i in range(1, 4)),
              'A']:
        if c not in working.columns:
            working[c] = np.nan

    # Create corresponding *_Date columns if missing
    for c in [*(f'Q{i}_Date' for i in range(1, 5)),
              *(f'S{i}_Date' for i in range(1, 3)),
              *(f'T{i}_Date' for i in range(1, 4)),
              'A_Date']:
        if c not in working.columns:
            working[c] = pd.NaT

    # Base key for many of the as-of mappings
    base_keys = ['ID', 'HistCurrency', 'ItemCode', 'FiscalPeriod']

    # -------------------------------------------------------------------------
    # 1) Derive TrueValue from annuals (A/B frequencies)
    # -------------------------------------------------------------------------
    # Mask annual-like rows where Value is present
    mask_annual = working['Frequency'].isin(['A', 'B']) & working['Value'].notna()
    # annual_src: one row per (ID, FiscalPeriod, HistCurrency) with last PIT Date
    annual_src = (
        working.loc[mask_annual,
                    ['ID', 'FiscalPeriod', 'HistCurrency', 'PIT Date', 'Value']]
        .sort_values(['ID', 'FiscalPeriod', 'HistCurrency', 'PIT Date'])
        .drop_duplicates(['ID', 'FiscalPeriod', 'HistCurrency'], keep='last')
        .rename(columns={'Value': 'TrueValue', 'PIT Date': 'TrueValue_Date'})
    )
    # Left-join true annual value back onto working
    working = working.merge(
        annual_src,
        on=['ID', 'FiscalPeriod', 'HistCurrency'],
        how='left'
    )

    # -------------------------------------------------------------------------
    # 2) As-of mapping (same FiscalPeriod) for A/Q/S/T
    # -------------------------------------------------------------------------

    # ----- Annual -----
    # Source rows for annual frequencies A/B
    src_A = working.loc[
        working['Frequency'].isin(['A', 'B']) & working['Value'].notna(),
        base_keys + ['PIT Date', 'Value']
    ].copy()
    # As-of join: for each working row, get most recent annual value by PIT Date
    vA, dA = asof_numpy(working, src_A, by_cols=base_keys)
    working['A'], working['A_Date'] = vA, dA
    # Origin fiscal period of annual value (same as row's FiscalPeriod when present)
    working['A_OriginFP'] = np.where(
        working['A'].notna(), working['FiscalPeriod'], np.nan
    )

    # ----- Quarterly -----
    # Source rows for quarterly frequencies (Q/C) with valid QNUM
    src_Q = working.loc[
        working['Frequency'].isin(['Q', 'C']) & working['QNUM'].notna(),
        base_keys + ['QNUM', 'PIT Date', 'Value']
    ].copy()
    for q in (1, 2, 3, 4):
        # Restrict to a specific quarter q
        rv = src_Q[src_Q['QNUM'] == q].drop(columns=['QNUM'])
        # As-of join for that quarter
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'Q{q}', f'Q{q}_Date'
        working[col], working[dcol] = v, d
        # Origin FP column for that quarter
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        # Fill origin FP only where quarter value is non-null and origin not yet set
        m = working[col].notna() & working[ocol].isna()
        working.loc[m, ocol] = working.loc[m, 'FiscalPeriod']

    # ----- Semiannual -----
    # Source rows for semiannual frequencies (S/F) with valid SNUM
    src_S = working.loc[
        working['Frequency'].isin(['S', 'F']) & working['SNUM'].notna(),
        base_keys + ['SNUM', 'PIT Date', 'Value']
    ].copy()
    for s in (1, 2):
        rv = src_S[src_S['SNUM'] == s].drop(columns=['SNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'S{s}', f'S{s}_Date'
        working[col], working[dcol] = v, d
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        m = working[col].notna() & working[ocol].isna()
        working.loc[m, ocol] = working.loc[m, 'FiscalPeriod']

    # ----- Trimester -----
    # Source rows for trimester frequencies (T/K) with valid TNUM
    src_T = working.loc[
        working['Frequency'].isin(['T', 'K']) & working['TNUM'].notna(),
        base_keys + ['TNUM', 'PIT Date', 'Value']
    ].copy()
    for t in (1, 2, 3):
        rv = src_T[src_T['TNUM'] == t].drop(columns=['TNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'T{t}', f'T{t}_Date'
        working[col], working[dcol] = v, d
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        m = working[col].notna() & working[ocol].isna()
        working.loc[m, ocol] = working.loc[m, 'FiscalPeriod']

    # -------------------------------------------------------------------------
    # 3) Prepare labels & normalize dates (NO prev-year fill, NO forward-fill)
    # -------------------------------------------------------------------------
    # Sort working data consistently for downstream calculations
    working = working.sort_values(['ID', 'HistCurrency', 'FiscalPeriod', 'PIT Date'])

    # List of all period value columns
    value_cols_all  = [f'Q{i}' for i in range(1, 5)] + \
                      [f'S{i}' for i in range(1, 3)] + \
                      [f'T{i}' for i in range(1, 4)] + ['A']
    # Corresponding date columns
    date_cols_all   = [f'{c}_Date' for c in value_cols_all]
    # Corresponding origin FP columns
    origin_cols_all = [f'{c}_OriginFP' for c in value_cols_all]

    # Ensure that all date columns are proper datetimes (floor to day)
    # Note: explicitly no groupby-forward-fill here – only asof-filled values remain
    for c in date_cols_all:
        if c in working.columns:
            working[c] = pd.to_datetime(working[c], errors='coerce').dt.floor('D')

    # -------------------------------------------------------------------------
    # 4) Build full-year candidates from fixed sets (Q1–Q4, S1–S2, T1–T3)
    # -------------------------------------------------------------------------
    def full_year_from_fixed(row, labels, pit, cutoff):
        """
        Fixed full-year from given labels (e.g. Q1..Q4, S1..S2, T1..T3):

          - Requires ALL labels to have:
              value, date, origin_fp.
          - All dates must be within [cutoff, pit].
          - origin_fp = max(origin_fp_i)  (the 'newest year' across components).

        Returns (dt, val, origin_fp) or (NaT, NaN, NaN) if any requirement fails.
        """
        vals, dts, fps = [], [], []
        for lbl in labels:
            v = row.get(lbl, np.nan)
            d = row.get(f'{lbl}_Date', pd.NaT)
            o = row.get(f'{lbl}_OriginFP', np.nan)

            # Require non-null for value, date, origin FP
            if pd.isna(v) or pd.isna(d) or pd.isna(o):
                return pd.NaT, np.nan, np.nan

            # Normalize date and check it lies within [cutoff, pit]
            d = pd.to_datetime(d, errors='coerce')
            if pd.isna(d) or not (cutoff <= d <= pit):
                return pd.NaT, np.nan, np.nan

            vals.append(float(v))
            dts.append(d)
            fps.append(int(o))

        # Sum all component values, pick latest date and max origin FP
        total_val = float(np.nansum(vals))
        latest_dt = max(dts)
        origin_fp = max(fps)
        return latest_dt, total_val, origin_fp

    def pick_annpit_sum_with_origin(row):
        """
        For each row, pick the best annual PIT-based value (AnnPITValue) by:

        1) Building a time window [PIT - 365 days, PIT].
        2) Generating candidate annual values:
           - actual annual A (if within window and with origin FP),
           - Q4 proxy from Q1..Q4,
           - S2 proxy from S1..S2,
           - T3 proxy from T1..T3,
           each with associated priority and origin FP.
        3) Ranking candidates according to:
           - Same-year A (highest),
           - Same-year proxies (Q4/T3/S2),
           - Prior-year A,
           - Prior-year proxies,
           - Otherwise: any remaining candidate by (priority, latest date).
        4) Returning the chosen candidate's value (keeping zeros, but filtering NaNs).
        """
        pit = row['PIT Date']
        if pd.isna(pit):
            return np.nan
        cutoff = pit - timedelta(days=365)  # Now works because timedelta is imported

        # Current row's fiscal period, cast to int if possible
        fp = row.get('FiscalPeriod', np.nan)
        try:
            fp_int = int(fp) if not pd.isna(fp) else None
        except Exception:
            fp_int = None

        # Collect candidate tuples: (label, priority, date, value, origin_fp)
        candidates = []

        # --- Candidate A: actual annual (0 is allowed)
        A_val = row.get('A', np.nan)
        A_dt  = row.get('A_Date', pd.NaT)
        A_ofp = row.get('A_OriginFP', np.nan)
        if pd.notna(A_val) and pd.notna(A_dt) and not pd.isna(A_ofp):
            A_dt = pd.to_datetime(A_dt, errors='coerce')
            if pd.notna(A_dt) and (cutoff <= A_dt <= pit):
                candidates.append(('A', _PERIOD_PRIORITY['A'], A_dt, float(A_val), int(A_ofp)))

        # --- Candidate Q4: sum of Q1..Q4 (0 allowed)
        q4_dt, q4_val, q4_fp = full_year_from_fixed(
            row, [f'Q{i}' for i in range(1, 5)], pit, cutoff
        )
        if pd.notna(q4_val) and pd.notna(q4_dt) and not pd.isna(q4_fp):
            candidates.append(('Q4', _PERIOD_PRIORITY['Q4'], q4_dt, float(q4_val), int(q4_fp)))

        # --- Candidate S2: sum of S1..S2 (0 allowed)
        s2_dt, s2_val, s2_fp = full_year_from_fixed(
            row, [f'S{i}' for i in range(1, 3)], pit, cutoff
        )
        if pd.notna(s2_val) and pd.notna(s2_dt) and not pd.isna(s2_fp):
            candidates.append(('S2', _PERIOD_PRIORITY['S2'], s2_dt, float(s2_val), int(s2_fp)))

        # --- Candidate T3: sum of T1..T3 (0 allowed)
        t3_dt, t3_val, t3_fp = full_year_from_fixed(
            row, [f'T{i}' for i in range(1, 4)], pit, cutoff
        )
        if pd.notna(t3_val) and pd.notna(t3_dt) and not pd.isna(t3_fp):
            candidates.append(('T3', _PERIOD_PRIORITY['T3'], t3_dt, float(t3_val), int(t3_fp)))

        # If no candidates, return NaN
        if not candidates:
            return np.nan

        # Filter out NaN-valued candidates (keep 0-valued ones)
        def valid(seq):
            return [c for c in seq if not np.isnan(c[3])]

        # Relationship between row's fiscal period and candidate's origin FP:
        # same / prior / other / unknown
        def rel(c):
            _, _, _, _, ofp = c
            if fp_int is None or ofp is None:
                return 'unknown'
            if ofp == fp_int:
                return 'same'
            if ofp == fp_int - 1:
                return 'prior'
            return 'other'

        # 1) Same-year actual annual A (max by date)
        same_A = valid(c for c in candidates if c[0] == 'A' and rel(c) == 'same')
        if same_A:
            best = max(same_A, key=lambda x: x[2])
            return best[3]

        # 2) Same-year proxies (Q4/T3/S2), max by (priority, date)
        same_proxies = valid(
            c for c in candidates
            if c[0] in ('Q4', 'S2', 'T3') and rel(c) == 'same'
        )
        if same_proxies:
            best = max(same_proxies, key=lambda x: (x[1], x[2]))
            return best[3]

        # 3) Prior-year actual annual A
        prior_A = valid(c for c in candidates if c[0] == 'A' and rel(c) == 'prior')
        if prior_A:
            best = max(prior_A, key=lambda x: x[2])
            return best[3]

        # 4) Prior-year proxies (Q4/S2/T3) if no prior A
        prior_proxies = valid(
            c for c in candidates
            if c[0] in ('Q4', 'S2', 'T3') and rel(c) == 'prior'
        )
        if prior_proxies:
            best = max(prior_proxies, key=lambda x: (x[1], x[2]))
            return best[3]

        # 5) Fallback: any candidate (other/unknown) by (priority, date)
        others = valid(candidates)
        if others:
            best = max(others, key=lambda x: (x[1], x[2]))
            return best[3]

        # Final fallback: 0.0 (should rarely be reached)
        return 0.0

    # Apply the selection function row-wise to produce AnnPITValue
    working['AnnPITValue'] = working.apply(pick_annpit_sum_with_origin, axis=1)

    # -------------------------------------------------------------------------
    # 5) QC: Future-date check + PRE-DROP stats
    # -------------------------------------------------------------------------
    # Columns whose dates should not exceed PIT Date
    date_cols = [
        'A_Date',
        'Q1_Date', 'Q2_Date', 'Q3_Date', 'Q4_Date',
        'S1_Date', 'S2_Date',
        'T1_Date', 'T2_Date', 'T3_Date'
    ]
    # Restrict to ones actually present
    present = [c for c in date_cols if c in working.columns]

    viol_counts = {}  # per-label violation counts
    # Mask for rows with any future-dated period
    any_mask = pd.Series(False, index=working.index)

    for c in present:
        # A violation is when period date > PIT Date (both need to be non-null)
        m = (
            working[c].notna() &
            working['PIT Date'].notna() &
            (pd.to_datetime(working[c], errors='coerce') > working['PIT Date'])
        )
        viol_counts[c] = int(m.sum())
        any_mask |= m  # accumulate violations across columns

    total_future_viol = int(any_mask.sum())
    print("\n=== Future-date check (period dates > PIT Date) ===")
    print("Per-label violations:", viol_counts)
    print(f"Rows with ANY future-dated period value: {total_future_viol}")
    # Flagrows with at least one future-date error
    working['HasFutureDateError'] = any_mask

    # -------------------------------------------------------------------------
    # 6) AnnPITValue_Pct + quality drop
    # -------------------------------------------------------------------------
    # Compute AnnPITValue as % of TrueValue (only when TrueValue != 0)
    working['AnnPITValue_Pct'] = np.where(
        working['AnnPITValue'].notna() &
        working['TrueValue'].notna() &
        (working['TrueValue'] != 0),
        (working['AnnPITValue'] / working['TrueValue']) * 100,
        np.nan
    )

    # Stats before dropping low-quality rows
    pre_stats = summarize_pct(working['AnnPITValue_Pct'])
    print("\n=== AnnPITValue_Pct summary (finite only) — BEFORE quality drop ===")
    for k, v in pre_stats.items():
        print(f"{k:>20}: {v}")

    # Build masks for dropping:
    pct = working['AnnPITValue_Pct']
    is_inf = np.isinf(pct)  # infinite percentages
    is_finite = np.isfinite(pct)
    # Out-of-range if % > 200 or % < 50 (but finite)
    out_of_range = is_finite & ((pct > 200) | (pct < 50))
    # Rows to drop: infinite or out-of-range values
    to_drop_quality = is_inf | out_of_range

    dropped_quality_rows = int(to_drop_quality.sum())
    print(f"\nRows to drop due to AnnPITValue_Pct (±inf or >200 or <50): {dropped_quality_rows:,}")

    # Keep only rows that passed the quality filter
    working = working.loc[~to_drop_quality].copy()

    # Stats after dropping
    post_stats = summarize_pct(working['AnnPITValue_Pct'])
    print("\n=== AnnPITValue_Pct summary — AFTER quality drop ===")
    if post_stats:
        for k, v in post_stats.items():
            print(f"{k:>20}: {v}")
    else:
        print("No finite values remain after the quality drop.")

    # -------------------------------------------------------------------------
    # 7) Final columns & save
    # -------------------------------------------------------------------------
    # Base descriptive columns to keep (if present)
    base_cols = [
        'ID', 'CompanyName', 'ImplCountry', 'CurrentCurrency', 'HistCurrency',
        'PIT Date', 'Frequency', 'UpdateCode', 'FiscalPeriod', 'FYE Month',
        'ItemCode', 'Value', 'Str_FiscalPrd'
    ]

    # Build ordered list of period date/value columns
    freq_cols = []
    for i in range(1, 5):
        freq_cols += [f'Q{i}_Date', f'Q{i}']
    for i in range(1, 3):
        freq_cols += [f'S{i}_Date', f'S{i}']
    for i in range(1, 4):
        freq_cols += [f'T{i}_Date', f'T{i}']
    freq_cols += ['A_Date', 'A']

    # Final set of columns to keep in output
    keep_cols = (
        [c for c in base_cols if c in working.columns] +
        ['TrueValue', 'AnnPITValue', 'AnnPITValue_Pct', 'HasFutureDateError'] +
        [c for c in freq_cols if c in working.columns]
    )

    # Helper columns to drop before export
    drop_cols = ['QNUM', 'SNUM', 'TNUM', 'TrueValue_Date']
    # Also drop all *_OriginFP columns
    drop_cols += [c for c in working.columns if c.endswith('_OriginFP')]
    working.drop(columns=[c for c in drop_cols if c in working.columns],
                 inplace=True, errors='ignore')

    # Reorder and restrict columns to the final layout
    mixed_processed = working.reindex(columns=keep_cols)

    # Sanity checks: necessary globals must exist
    assert 'Temp_file_path_DP' in globals(), "Temp_file_path_DP not found."
    assert 'base_output_filename' in globals(), "base_output_filename not found (set in Cell 0)."

    # Build full output path and save pipe-delimited file
    out_full = os.path.join(Temp_file_path_DP, f"{base_output_filename}.txt")
    mixed_processed.to_csv(out_full, sep='|', index=False)
    print("\nSaved full:", out_full)

    # Create a subset for lighter inspection
    subset_cols = ["ID", "PIT Date", "CompanyName", "HistCurrency",
                   "FiscalPeriod", "AnnPITValue"]
    # Only keep subset columns that actually exist
    subset_cols_existing = [col for col in subset_cols if col in mixed_processed.columns]
    subset_df = mixed_processed[subset_cols_existing].copy()
    out_subset = os.path.join(Temp_file_path_DP,
                              f"{base_output_filename}_subset.txt")
    subset_df.to_csv(out_subset, sep='|', index=False)
    print("Saved subset:", out_subset)
    # Explicitly delete subset_df to free memory
    del subset_df

    # -------------------------------------------------------------------------
    # 8) Row-accounting overview
    # -------------------------------------------------------------------------
    output_rows = len(mixed_processed)
    print("\n=== Row Accounting ===")
    print(f"Input rows:                     {input_rows:,}")
    print(f"Excludedby Frequency (E/L/R/U): {excluded_rows:,}")
    print(f"Dropped by quality (Pct rules): {dropped_quality_rows:,}")
    print(f"Output rows (final):            {output_rows:,}")
    # Sum up excluded + dropped + remaining and check against original count
    check_total = excluded_rows + dropped_quality_rows + output_rows
    print(f"Check: excluded + dropped + output = {check_total:,}")
    if check_total == input_rows:
        print("Row counts reconcile exactly.")
    else:
        print(f"Mismatch of {input_rows - check_total:+,} rows. "
              "Investigate upstream filtering or unexpected drops.")

    # Trigger garbage collection as a final cleanup step
    gc.collect()

else:
    # If the main input dataset is not available, skip all processing
    print("mixed_encoded not found or None; skipping.")

Input dataset contains 3,390,222 rows before processing.


=== Future-date check (period dates > PIT Date) ===
Per-label violations: {'A_Date': 0, 'Q1_Date': 0, 'Q2_Date': 0, 'Q3_Date': 0, 'Q4_Date': 0, 'S1_Date': 0, 'S2_Date': 0, 'T1_Date': 0, 'T2_Date': 0, 'T3_Date': 0}
Rows with ANY future-dated period value: 0

=== AnnPITValue_Pct summary (finite only) — BEFORE quality drop ===
         finite_rows: 2062861
                mean: 22266.512812640318
              median: 100.0
winsorized_mean_1pct: 100.66014210935954
                 p10: 100.0
                 p20: 100.0
                 p30: 100.0
                 p40: 100.0
                 p50: 100.0
                 p60: 100.0
                 p70: 100.0
                 p80: 100.0
                 p90: 100.0

Rows to drop due to AnnPITValue_Pct (±inf or >200 or <50): 37,553

=== AnnPITValue_Pct summary — AFTER quality drop ===
         finite_rows: 2025308
                mean: 100.40068385378902
              median: 100.0
win

### Mixed 3

#### Set Index

In [66]:
# =====================================================================================
# SUMMARY
# =====================================================================================
# This code selects which Mixed_* dataset should be processed by choosing an index
# (e.g., Mixed_1, Mixed_2, ...). It then:
#
#   1. Ensures that a dictionary `mixed_vars` exists, mapping keys like "Mixed_1"
#      to item names.
#   2. Builds the key corresponding to the selected index and retrieves the
#      associated item name (`target_item_name`).
#   3. Prints which Mixed_* item was selected.
#   4. Constructs paths and filenames based on global variables and the selected item.
#   5. Ensures that the output directory exists by creating it if necessary.
#
# The goal is to centralize selection of a single Mixed_* dataset and prepare paths
# for downstream processing.


# === Select which Mixed_* item to run ===
mixed_index = 3  # Change this to process another dataset (e.g., 10)

# Validate that the dictionary of mixed item names exists
assert 'mixed_vars' in globals(), "mixed_vars dict not found in globals()."

# Build the key (e.g., "Mixed_1") and retrieve the associated item name
item_key = f"Mixed_{mixed_index}"
target_item_name = mixed_vars.get(item_key)
assert target_item_name, f"{item_key} not found in mixed_vars."

# Inform which item was selected
print(f"Selected: {item_key}  ->  ItemName: '{target_item_name}'")

# === Paths (reusing globals) ===
assert 'Temp_file_path_DP' in globals(), "Temp_file_path_DP not found."

# Construct the name of the input file for the selected item
file_name = f"work_subset_{target_item_name}.txt"
file_path = os.path.join(Temp_file_path_DP, file_name)

# Construct the base name for output files (will later be suffixed)
base_output_filename = f"Mixed_{target_item_name}_complete"

# Ensure the output directory exists; create it (including parent dirs) if needed
Path(Temp_file_path_DP).mkdir(parents=True, exist_ok=True)


Selected: Mixed_3  ->  ItemName: 'Earnings_Per_Share_Fiscal_Year_End'


#### Import relevant data



In [67]:
# =====================================================================================
# SUMMARY
# =====================================================================================
# This block:
#   1. Announces the import of a full dataset for the given `target_item_name`.
#   2. Checks whether the file at `file_path` exists.
#   3. If it exists, calls `import_file_to_dataframe(file_path)` to load the data
#      into `mixed_raw`.
#   4. If the loaded DataFrame is non-empty, prints a success message including
#      the number of rows and shows the first few rows (via display or fallback
#      to text printing).
#   5. If the load fails or returns an empty DataFrame, prints a warning and
#      creates an empty DataFrame.
#   6. If the file does not exist, prints an error message and sets `mixed_raw`
#      to an empty DataFrame.
#   7. Finally, it runs `gc.collect()` to trigger garbage collection and free
#      memory.
#
# Note: All previous emoji symbols in the print statements have been removed.

# Inform the user that we are starting the import for this item
print(f"\nImporting full dataset for Item: '{target_item_name}' ...")

# Check if the file path exists on disk
if os.path.exists(file_path):
    # If the file exists, attempt to import it into a DataFrame
    mixed_raw = import_file_to_dataframe(file_path)

    # If the DataFrame is successfully loaded and not empty
    if mixed_raw is not None and not mixed_raw.empty:
        # Print a success message including row count with thousands separator
        print(f"Full dataset loaded successfully: {len(mixed_raw):,} rows total.")

        try:
            # Try to display the first few rows (Jupyter / IPython display)
            display(mixed_raw.head())
        except Exception:
            # If display is not available, fall back to printing as plain text
            print(mixed_raw.head().to_string(index=False))
    else:
        # If DataFrame is None or empty, warn and create an empty DataFrame
        print("Dataset appears empty or could not be loaded.")
        mixed_raw = pd.DataFrame()
else:
    # If the file does not exist, report an error and set mixed_raw to empty
    print(f"File not found: {file_path}")
    mixed_raw = pd.DataFrame()

# Run garbage collection to free up memory after the load attempt
gc.collect()



Importing full dataset for Item: 'Earnings_Per_Share_Fiscal_Year_End' ...
Full dataset loaded successfully: 8,799,912 rows total.


Unnamed: 0,ID,CompanyName,ImplCountry,CurrentCurrency,HistCurrency,PIT Date,Frequency,UpdateCode,FiscalPeriod,FYE Month,ItemCode,Value
0,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1992,December,5202,0.09
1,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1993,December,5202,0.08
2,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1994,December,5202,0.1
3,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1996-05-03,A,3,1995,December,5202,-0.04
4,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1998-07-03,A,3,1996,December,5202,0.03


0

#### Encode Frequency Code (Check of output required!)

In [68]:
# =====================================================================================
# SUMMARY
# =====================================================================================
# This snippet provides:
#
# 1. A helper function `last2` that returns the last two digits of a number as a
#    zero-padded string (for building YY strings).
#
# 2. A function `add_str_fiscalprd(df)` which:
#    - Works on a copy of an input DataFrame containing financial periods.
#    - Normalizes the 'Frequency' (upper-case, no missing).
#    - Stores the original 'FiscalPeriod' and converts it to numeric.
#    - Creates a string representation 'Str_FiscalPrd' depending on the frequency:
#         - Q/C/E/R: quarter-based ("QnYyy")
#         - A/B: annual ("Yyy")
#         - F/S: semiannual ("SnYyy")
#         - K/T/L/U: trimester-like ("TnYyy")
#    - Derives an implied full-year integer 'ImplFiscPer_Calculated' from the
#      two-digit year (80–99 => 19xx, else 20xx).
#    - For annual rows (A/B), checks discrepancies between original
#      'FiscalPeriod' and implied full-year; prints a small preview & total count.
#    - Overwrites 'FiscalPeriod' with 'ImplFiscPer_Calculated' and drops helper
#      columns.
#
# 3. A small driver block that:
#    - Checks that `mixed_raw` exists and is non-empty.
#    - Applies `add_str_fiscalprd` to produce `mixed_encoded`.
#    - Displays a head preview or prints a message and sets `mixed_encoded = None`
#      if input is missing/empty.

def last2(n):
    """Return last two digits as zero-padded string, or None if NaN."""
    # If input is NaN (or pandas-style missing), return None (no digits)
    if pd.isna(n):
        return None
    # Convert to int, format as 4-digit zero-padded string, take last 2 chars
    # Example: n=2023 -> "2023"[-2:] -> "23"
    return f"{int(n):04d}"[-2:]


def add_str_fiscalprd(df):
    """
    Creates 'Str_FiscalPrd' and overwrites 'FiscalPeriod' with an implied full year.
    Also prints discrepancies for Annual (A,B) rows between original and calculated year.

    The mapping logic:
      - Quarterly (C, Q, E, R):   fp encodes a quarter index -> "Q{1-4}Y{yy}"
      - Annual   (A, B):         fp is the year itself      -> "Y{yy}"
      - Semiannual (F, S):       fp encodes half-year       -> "S{1-2}Y{yy}"
      - Trimester-like (K, T,
         L, U):                  fp encodes trimester       -> "T{1-3}Y{yy}"
    Then we re-infer the full year from the YY part with 19xx/20xx rule.
    """
    # Work on a copy to avoid mutating the original input DataFrame
    df = df.copy()

    # Normalize frequency codes:
    # - uppercase for consistency
    # - fill missing values with empty string
    df["Frequency"] = df["Frequency"].str.upper().fillna("")

    # Store original FiscalPeriod in a separate column for later comparison
    df['Original_FiscalPeriod'] = df['FiscalPeriod']

    # Convert FiscalPeriod to numeric, coerce errors to NaN
    fp = pd.to_numeric(df["FiscalPeriod"], errors="coerce")

    # Masks for different frequency groups
    m_quarter = df["Frequency"].isin(["C", "Q", "E", "R"])  # quarter-based
    m_AB      = df["Frequency"].isin(["A", "B"])            # annual
    m_FS      = df["Frequency"].isin(["F", "S"])            # semiannual
    m_KTLU    = df["Frequency"].isin(["K", "T", "L", "U"])  # trimester-like

    # Initialize Str_FiscalPrd as NaN; we will fill per frequency group
    df["Str_FiscalPrd"] = np.nan

    # -------------------------
    # Quarter-based encoding
    # -------------------------
    # Quarter number: (fp % 4) + 1 => yields 1..4 (if fp is integer-based encoding)
    q_part = ((fp % 4) + 1).where(m_quarter)
    # Year part for quarter: integer division by 4, then take last two digits
    q_year = (fp // 4).where(m_quarter).apply(last2)
    # Assign "Q{quarter}Y{yy}" for quarter frequencies
    df.loc[m_quarter, "Str_FiscalPrd"] = (
        "Q" +
        q_part.astype("Int64").astype(str) +
        "Y" +
        q_year.fillna('')
    )

    # -------------------------
    # Annual encoding (A, B)
    # -------------------------
    # Year part is fp itself for A/B; take last two digits via last2
    ab_year = fp.where(m_AB).apply(last2)
    # Assign "Y{yy}" for annual frequencies
    df.loc[m_AB, "Str_FiscalPrd"] = "Y" + ab_year.fillna('')

    # -------------------------
    # Semiannual encoding (F, S)
    # -------------------------
    # Semester number: (fp % 2) + 1 => 1 or 2
    fs_sem  = ((fp % 2) + 1).where(m_FS)
    # Year part: fp // 2, then last two digits
    fs_year = (fp // 2).where(m_FS).apply(last2)
    # Assign "S{sem}Y{yy}" for F/S
    df.loc[m_FS, "Str_FiscalPrd"] = (
        "S" +
        fs_sem.astype("Int64").astype(str) +
        "Y" +
        fs_year.fillna('')
    )

    # -------------------------
    # Trimester-like encoding (K, T, L, U)
    # -------------------------
    # Term number: (fp % 3) + 1 => 1..3
    t_term  = ((fp % 3) + 1).where(m_KTLU)
    # Year part: fp // 3, then last two digits
    t_year  = (fp // 3).where(m_KTLU).apply(last2)
    # Assign "T{term}Y{yy}" for K/T/L/U frequencies
    df.loc[m_KTLU, "Str_FiscalPrd"] = (
        "T" +
        t_term.astype("Int64").astype(str) +
        "Y" +
        t_year.fillna('')
    )

    # ---------------------------------------------------------------------
    # Implied full year from Str_FiscalPrd (19xx / 20xx reconstruction)
    # ---------------------------------------------------------------------
    # Extract two-digit year from "Yyy" pattern, e.g., "Q1Y23" -> "23"
    year_part = df['Str_FiscalPrd'].str.extract(r'Y(\d{2})', expand=False)
    # Convert that to numeric (NaN if not parseable)
    year_numeric = pd.to_numeric(year_part, errors='coerce')

    # Map two-digit year to full year:
    #  - 80–99 => 19xx
    #  - else  => 20xx
    df['ImplFiscPer_Calculated'] = year_numeric.apply(
        lambda x: int(f"19{int(x):02d}") if pd.notna(x) and x >= 80
        else (int(f"20{int(x):02d}") if pd.notna(x) else np.nan)
    )

    # ---------------------------------------------------------------------
    # Discrepancies check for Annual frequencies (A, B)
    # ---------------------------------------------------------------------
    # Work on a subset of annual frequencies only
    annual_rows_for_check = df[m_AB].copy()

    # Build boolean mask where implied full year does NOT match the original
    discrepancy_mask_annual = ~(
        # Either they match numerically...
        (annual_rows_for_check['ImplFiscPer_Calculated'] ==
         pd.to_numeric(annual_rows_for_check['Original_FiscalPeriod'], errors='coerce'))
        |
        # ...or both are NaN
        (annual_rows_for_check['ImplFiscPer_Calculated'].isna() &
         pd.to_numeric(annual_rows_for_check['Original_FiscalPeriod'], errors='coerce').isna())
    )

    # Subset rows with discrepancies
    discrepancy_rows = annual_rows_for_check[discrepancy_mask_annual].copy()

    # If we have any, show a short sample and the total count
    if not discrepancy_rows.empty:
        print("\nDiscrepancies found between original FiscalPeriod and calculated ImplFiscPer for Annual (A, B) frequencies:")
        display(
            discrepancy_rows[
                ['ID', 'Frequency', 'Original_FiscalPeriod', 'Str_FiscalPrd', 'ImplFiscPer_Calculated']
            ].head()
        )
        print(f"Total discrepancies found for Annual frequencies: {len(discrepancy_rows)}")
    else:
        print("\nNo discrepancies found between original FiscalPeriod and calculated ImplFiscPer for Annual (A, B) frequencies.")

    # ---------------------------------------------------------------------
    # Overwrite FiscalPeriod and drop temporary helper columns
    # ---------------------------------------------------------------------
    # Replace 'FiscalPeriod' with the implied full-year value we just calculated
    df['FiscalPeriod'] = df['ImplFiscPer_Calculated']
    # Drop helper columns not needed downstream
    df.drop(columns=['Original_FiscalPeriod', 'ImplFiscPer_Calculated'], inplace=True)

    # Return the modified DataFrame
    return df


# =============================================================================
# Driver: apply encoding to mixed_raw if present and non-empty
# =============================================================================
if 'mixed_raw' in globals() and mixed_raw is not None and not mixed_raw.empty:
    # Inform which item we are encoding for
    print(f"Applying encoding to Mixed dataset for '{target_item_name}' ...")
    # Apply fiscal period encoding
    mixed_encoded = add_str_fiscalprd(mixed_raw)
    # Show a preview of the encoded data
    display(mixed_encoded.head())
else:
    # If mixed_raw is missing or empty, we skip and mark mixed_encoded as None
    print("mixed_raw not found or empty. Cannot perform encoding.")
    mixed_encoded = None


Applying encoding to Mixed dataset for 'Earnings_Per_Share_Fiscal_Year_End' ...


  df.loc[m_quarter, "Str_FiscalPrd"] = (



No discrepancies found between original FiscalPeriod and calculated ImplFiscPer for Annual (A, B) frequencies.


Unnamed: 0,ID,CompanyName,ImplCountry,CurrentCurrency,HistCurrency,PIT Date,Frequency,UpdateCode,FiscalPeriod,FYE Month,ItemCode,Value,Str_FiscalPrd
0,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1992,December,5202,0.09,Y92
1,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1993,December,5202,0.08,Y93
2,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1994,December,5202,0.1,Y94
3,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1996-05-03,A,3,1995,December,5202,-0.04,Y95
4,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1998-07-03,A,3,1996,December,5202,0.03,Y96


#### Annualize data with most recent information (Check of output required!)

In [69]:
# @title
# =====================================================================================
# SUMMARY
# =====================================================================================
# This script takes an input DataFrame `mixed_encoded` (if present in the global scope)
# that contains financial time-series data (per company, item, currency, fiscal period,
# and PIT Date). It then:
#
# 1. Cleans and standardizes key columns (dates, numeric types, string IDs).
# 2. Excludes rows with certain frequencies (E/L/R/U).
# 3. Parses fiscal period strings into quarter/semester/trimester indicators (QNUM/SNUM/TNUM).
# 4. Uses a custom, vectorized "as-of" join (`asof_numpy`) to attach the most recent
#    annual, quarterly, semiannual, and trimester values for each (ID, HistCurrency,
#    ItemCode, FiscalPeriod) up to each row’s PIT Date.
# 5. Builds "full-year" candidate values from:
#       - actual annuals (A),
#       - sum of Q1..Q4 (Q4 proxy),
#       - sum of S1..S2 (S2 proxy),
#       - sum of T1..T3 (T3 proxy),
#    and selects the best candidate based on priorities and relationship to the row’s
#    fiscal period (same-year vs prior-year).
# 6. Computes an annual PIT-based metric `AnnPITValue` and compares it to the “true”
#    annual value (`TrueValue`) to derive a percentage `AnnPITValue_Pct` for QC.
# 7. Performs quality checks:
#       - Ensures no period-date is after the PIT Date.
#       - Drops rows whose `AnnPITValue_Pct` is outside the range [50, 200] or infinite.
# 8. Keeps a curated set of columns, drops helper columns, and saves:
#       - a full output file
#       - a subset file with key columns for quick inspection.
# 9. Prints row-accounting stats and frees some memory.
#
# If `mixed_encoded` is not defined or is None, it simply prints a message and exits.

import pandas as pd
import numpy as np
import os
import gc
from datetime import timedelta            # <--- Added to fix NameError
from scipy.stats.mstats import winsorize  # <--- Added to fix NameError

# Enable pandas "copy-on-write" behavior to reduce unintended chained assignment effects
pd.options.mode.copy_on_write = True

# ---------- Helper: fast as-of (right.PIT <= left.PIT) ----------

def _key(fr, cols):
    """
    Helper function to build a string key from multiple columns.
    For each row, join the values of 'cols' with '||'.
    """
    return fr[cols].astype(str).agg('||'.join, axis=1)


def asof_numpy(left_df: pd.DataFrame, right_df: pd.DataFrame, by_cols: list[str]):
    """
    For each row in left_df, find the latest (as-of) Value from right_df
    with matching by_cols and right_df['PIT Date'] <= left_df['PIT Date'].

    This is a manual, NumPy-based implementation of an "as-of merge" grouped by `by_cols`.
    It:
    - filters out rows with missing required fields,
    - normalizes PIT Date to daily precision,
    - groups right_df by composite key of `by_cols`,
    - for each left row, binary-searches into the matching right group by PIT Date,
      picking the last date <= the left PIT Date,
    - returns two aligned arrays: (values, dates) for the left_df rows.
    """
    # Initialize outputs with NaNs and NaT for all left_df rows
    out_vals  = np.full(len(left_df), np.nan, dtype='float64')
    out_dates = np.full(len(left_df), 'NaT', dtype='datetime64[ns]')

    # Required columns on each side
    left_req  = by_cols + ['PIT Date']
    right_req = by_cols + ['PIT Date', 'Value']

    # Mask: rows that have all required fields non-null
    lmask = left_df[left_req].notna().all(axis=1)
    rmask = right_df[right_req].notna().all(axis=1)

    # If either side has no valid rows, return the default empty outputs
    if not lmask.any() or not rmask.any():
        return out_vals, out_dates

    # Work on filtered copies only (avoid side effects)
    l = left_df.loc[lmask, left_req].copy()
    r = right_df.loc[rmask, right_req].copy()

    # Normalize PIT Date columns to datetime at day precision
    l['PIT Date'] = pd.to_datetime(l['PIT Date'], errors='coerce').dt.floor('D')
    r['PIT Date'] = pd.to_datetime(r['PIT Date'], errors='coerce').dt.floor('D')

    # Build composite keys for group-level match (based on by_cols)
    l['__k'] = _key(l, by_cols)
    r['__k'] = _key(r, by_cols)

    # Sort right side by key and date to enable binary search per key
    r = r.sort_values(['__k', 'PIT Date']).reset_index(drop=True)

    # Extract NumPy arrays for right side
    rk   = r['__k'].to_numpy()
    rdt  = r['PIT Date'].to_numpy()
    rval = r['Value'].to_numpy()

    # Find unique keys and first index for each key in the sorted right side
    uniq, first = np.unique(rk, return_index=True)

    # Build a dict: key -> (dates array, values array) slice
    slices = {}
    for i, k in enumerate(uniq):
        s = first[i]  # start of this key
        e = first[i + 1] if i + 1 < len(first) else len(r)  # end of this key
        slices[k] = (rdt[s:e], rval[s:e])

    # Left side indices and arrays
    l_idx = l.index.to_numpy()
    lk    = l['__k'].to_numpy()
    ldt   = l['PIT Date'].to_numpy()

    # Sort left side by key (stable mergesort to preserve original row order within key)
    order = np.argsort(lk, kind='mergesort')
    sk, sd, sp = lk[order], ldt[order], l_idx[order]

    # i iterates over the sorted left rows
    i = 0
    n = len(sk)
    while i < n:
        k = sk[i]  # current composite key
        j = i + 1
        # Find contiguous block [i:j) for this key
        while j < n and sk[j] == k:
            j += 1

        # Dates and original positions of this key’s left rows
        block_dates = sd[i:j]
        block_pos   = sp[i:j]

        # If we have matching right-hand slices for this key, do the as-of search
        if k in slices:
            r_dates, r_vals = slices[k]
            # searchsorted(..., 'right') - 1 gives index of last r_date <= block_date
            pos   = np.searchsorted(r_dates, block_dates, side='right') - 1
            valid = pos >= 0  # positions where such a date exists
            if np.any(valid):
                # Fill outputs for left rows where we found a valid match
                out_vals[block_pos[valid]]  = r_vals[pos[valid]]
                out_dates[block_pos[valid]] = r_dates[pos[valid]]
        # Move to next key block
        i = j

    return out_vals, out_dates


# ---------- Small helpers ----------

def pctile(s, q):
    """
    Safe percentile calculation: returns quantile q of Series `s`,
    or NaN if quantile fails.
    """
    try:
        return s.quantile(q, interpolation='linear')
    except Exception:
        return np.nan


def summarize_pct(series: pd.Series):
    """
    Computes a summary of finite values of a series:
      - count
      - mean, median
      - 1% winsorized mean
      - selected deciles (p10, p20, ..., p90)
    Infinite values are treated as NaN and removed.
    If no finite values remain, returns an empty dict.
    """
    # Replace +/-inf with NaN and drop all NaNs
    s = series.replace([np.inf, -np.inf], np.nan).dropna()
    if s.empty:
        return {}
    
    # FIX APPLIED HERE: .to_numpy().copy() ensures the array is writable for winsorize
    w_mean = winsorize(s.to_numpy().copy(), limits=[0.01, 0.01]).mean()
    
    return {
        "finite_rows": len(s),
        "mean": s.mean(),
        "median": s.median(),
        "winsorized_mean_1pct": w_mean,
        "p10": pctile(s, 0.10), "p20": pctile(s, 0.20), "p30": pctile(s, 0.30),
        "p40": pctile(s, 0.40), "p50": pctile(s, 0.50), "p60": pctile(s, 0.60),
        "p70": pctile(s, 0.70), "p80": pctile(s, 0.80), "p90": pctile(s, 0.90)
    }


# ---------- Priority for full-year candidates ----------

# Fixed priority mapping for full-year candidates:
#   'A'  : actual annual value
#   'Q4' : annual proxy from four quarters
#   'T3' : annual proxy from three trimesters
#   'S2' : annual proxy from two semesters
_PERIOD_PRIORITY = {
    'A': 100,  # highest priority: actual annual
    'Q4': 90,  # then Q1+Q2+Q3+Q4
    'T3': 80,  # then T1+T2+T3
    'S2': 70,  # then S1+S2
}

# ============================ MAIN ============================

# Only run the main logic if a global DataFrame `mixed_encoded` exists and is not None
if 'mixed_encoded' in globals() and mixed_encoded is not None:
    # Count initial input rows
    input_rows = len(mixed_encoded)
    print(f"Input dataset contains {input_rows:,} rows before processing.\n")

    # Work on a copy of the input dataset
    working = mixed_encoded.copy()

    # -------------------------------------------------------------------------
    # Exclude certain frequencies (E/L/R/U)
    # -------------------------------------------------------------------------
    # Create mask of rows whose Frequency is one of E, L, R, U (case-insensitive)
    excl_mask = working['Frequency'].astype(str).str.upper().isin(['E', 'L', 'R', 'U'])
    # Count how many rows will be excluded
    excluded_rows = int(excl_mask.sum())
    # Keep only rows that are NOT in the exclusion set
    working = working.loc[~excl_mask].copy()

    # -------------------------------------------------------------------------
    # Set dtypes and normalize important columns
    # -------------------------------------------------------------------------
    # Convert PIT Date to datetime (coerce errors -> NaT), floor to day
    working['PIT Date']     = pd.to_datetime(working['PIT Date'], errors='coerce').dt.floor('D')
    # FiscalPeriod: numeric (e.g., 2021, 2022, ...)
    working['FiscalPeriod'] = pd.to_numeric(working['FiscalPeriod'], errors='coerce')
    # Value: numeric (float)
    working['Value']        = pd.to_numeric(working['Value'], errors='coerce')

    # Convert key ID / code columns to string to ensure consistency
    for c in ['ID', 'HistCurrency', 'ItemCode', 'Frequency', 'Str_FiscalPrd']:
        if c in working.columns:
            working[c] = working[c].astype(str)

    # -------------------------------------------------------------------------
    # Parse Q/S/T markers from Str_FiscalPrd (like 'Q1Y2023')
    # -------------------------------------------------------------------------
    # Extract quarter number Q1..Q4 from e.g. "Q1Y2023" into QNUM
    working['QNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^Q([1-4])Y', expand=False),
        errors='coerce'
    )
    # Extract semiannual number S1..S2 into SNUM
    working['SNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^S([1-2])Y', expand=False),
        errors='coerce'
    )
    # Extract trimester number T1..T3 into TNUM
    working['TNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^T([1-3])Y', expand=False),
        errors='coerce'
    )

    # -------------------------------------------------------------------------
    # Ensure period columns exist (Q1..Q4, S1..S2, T1..T3, A + their date cols)
    # -------------------------------------------------------------------------
    # Create value columns for Q1..Q4, S1..S2, T1..T3, A if they are missing
    for c in [*(f'Q{i}' for i in range(1, 5)),
              *(f'S{i}' for i in range(1, 3)),
              *(f'T{i}' for i in range(1, 4)),
              'A']:
        if c not in working.columns:
            working[c] = np.nan

    # Create corresponding *_Date columns if missing
    for c in [*(f'Q{i}_Date' for i in range(1, 5)),
              *(f'S{i}_Date' for i in range(1, 3)),
              *(f'T{i}_Date' for i in range(1, 4)),
              'A_Date']:
        if c not in working.columns:
            working[c] = pd.NaT

    # Base key for many of the as-of mappings
    base_keys = ['ID', 'HistCurrency', 'ItemCode', 'FiscalPeriod']

    # -------------------------------------------------------------------------
    # 1) Derive TrueValue from annuals (A/B frequencies)
    # -------------------------------------------------------------------------
    # Mask annual-like rows where Value is present
    mask_annual = working['Frequency'].isin(['A', 'B']) & working['Value'].notna()
    # annual_src: one row per (ID, FiscalPeriod, HistCurrency) with last PIT Date
    annual_src = (
        working.loc[mask_annual,
                    ['ID', 'FiscalPeriod', 'HistCurrency', 'PIT Date', 'Value']]
        .sort_values(['ID', 'FiscalPeriod', 'HistCurrency', 'PIT Date'])
        .drop_duplicates(['ID', 'FiscalPeriod', 'HistCurrency'], keep='last')
        .rename(columns={'Value': 'TrueValue', 'PIT Date': 'TrueValue_Date'})
    )
    # Left-join true annual value back onto working
    working = working.merge(
        annual_src,
        on=['ID', 'FiscalPeriod', 'HistCurrency'],
        how='left'
    )

    # -------------------------------------------------------------------------
    # 2) As-of mapping (same FiscalPeriod) for A/Q/S/T
    # -------------------------------------------------------------------------

    # ----- Annual -----
    # Source rows for annual frequencies A/B
    src_A = working.loc[
        working['Frequency'].isin(['A', 'B']) & working['Value'].notna(),
        base_keys + ['PIT Date', 'Value']
    ].copy()
    # As-of join: for each working row, get most recent annual value by PIT Date
    vA, dA = asof_numpy(working, src_A, by_cols=base_keys)
    working['A'], working['A_Date'] = vA, dA
    # Origin fiscal period of annual value (same as row's FiscalPeriod when present)
    working['A_OriginFP'] = np.where(
        working['A'].notna(), working['FiscalPeriod'], np.nan
    )

    # ----- Quarterly -----
    # Source rows for quarterly frequencies (Q/C) with valid QNUM
    src_Q = working.loc[
        working['Frequency'].isin(['Q', 'C']) & working['QNUM'].notna(),
        base_keys + ['QNUM', 'PIT Date', 'Value']
    ].copy()
    for q in (1, 2, 3, 4):
        # Restrict to a specific quarter q
        rv = src_Q[src_Q['QNUM'] == q].drop(columns=['QNUM'])
        # As-of join for that quarter
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'Q{q}', f'Q{q}_Date'
        working[col], working[dcol] = v, d
        # Origin FP column for that quarter
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        # Fill origin FP only where quarter value is non-null and origin not yet set
        m = working[col].notna() & working[ocol].isna()
        working.loc[m, ocol] = working.loc[m, 'FiscalPeriod']

    # ----- Semiannual -----
    # Source rows for semiannual frequencies (S/F) with valid SNUM
    src_S = working.loc[
        working['Frequency'].isin(['S', 'F']) & working['SNUM'].notna(),
        base_keys + ['SNUM', 'PIT Date', 'Value']
    ].copy()
    for s in (1, 2):
        rv = src_S[src_S['SNUM'] == s].drop(columns=['SNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'S{s}', f'S{s}_Date'
        working[col], working[dcol] = v, d
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        m = working[col].notna() & working[ocol].isna()
        working.loc[m, ocol] = working.loc[m, 'FiscalPeriod']

    # ----- Trimester -----
    # Source rows for trimester frequencies (T/K) with valid TNUM
    src_T = working.loc[
        working['Frequency'].isin(['T', 'K']) & working['TNUM'].notna(),
        base_keys + ['TNUM', 'PIT Date', 'Value']
    ].copy()
    for t in (1, 2, 3):
        rv = src_T[src_T['TNUM'] == t].drop(columns=['TNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'T{t}', f'T{t}_Date'
        working[col], working[dcol] = v, d
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        m = working[col].notna() & working[ocol].isna()
        working.loc[m, ocol] = working.loc[m, 'FiscalPeriod']

    # -------------------------------------------------------------------------
    # 3) Prepare labels & normalize dates (NO prev-year fill, NO forward-fill)
    # -------------------------------------------------------------------------
    # Sort working data consistently for downstream calculations
    working = working.sort_values(['ID', 'HistCurrency', 'FiscalPeriod', 'PIT Date'])

    # List of all period value columns
    value_cols_all  = [f'Q{i}' for i in range(1, 5)] + \
                      [f'S{i}' for i in range(1, 3)] + \
                      [f'T{i}' for i in range(1, 4)] + ['A']
    # Corresponding date columns
    date_cols_all   = [f'{c}_Date' for c in value_cols_all]
    # Corresponding origin FP columns
    origin_cols_all = [f'{c}_OriginFP' for c in value_cols_all]

    # Ensure that all date columns are proper datetimes (floor to day)
    # Note: explicitly no groupby-forward-fill here – only asof-filled values remain
    for c in date_cols_all:
        if c in working.columns:
            working[c] = pd.to_datetime(working[c], errors='coerce').dt.floor('D')

    # -------------------------------------------------------------------------
    # 4) Build full-year candidates from fixed sets (Q1–Q4, S1–S2, T1–T3)
    # -------------------------------------------------------------------------
    def full_year_from_fixed(row, labels, pit, cutoff):
        """
        Fixed full-year from given labels (e.g. Q1..Q4, S1..S2, T1..T3):

          - Requires ALL labels to have:
              value, date, origin_fp.
          - All dates must be within [cutoff, pit].
          - origin_fp = max(origin_fp_i)  (the 'newest year' across components).

        Returns (dt, val, origin_fp) or (NaT, NaN, NaN) if any requirement fails.
        """
        vals, dts, fps = [], [], []
        for lbl in labels:
            v = row.get(lbl, np.nan)
            d = row.get(f'{lbl}_Date', pd.NaT)
            o = row.get(f'{lbl}_OriginFP', np.nan)

            # Require non-null for value, date, origin FP
            if pd.isna(v) or pd.isna(d) or pd.isna(o):
                return pd.NaT, np.nan, np.nan

            # Normalize date and check it lies within [cutoff, pit]
            d = pd.to_datetime(d, errors='coerce')
            if pd.isna(d) or not (cutoff <= d <= pit):
                return pd.NaT, np.nan, np.nan

            vals.append(float(v))
            dts.append(d)
            fps.append(int(o))

        # Sum all component values, pick latest date and max origin FP
        total_val = float(np.nansum(vals))
        latest_dt = max(dts)
        origin_fp = max(fps)
        return latest_dt, total_val, origin_fp

    def pick_annpit_sum_with_origin(row):
        """
        For each row, pick the best annual PIT-based value (AnnPITValue) by:

        1) Building a time window [PIT - 365 days, PIT].
        2) Generating candidate annual values:
           - actual annual A (if within window and with origin FP),
           - Q4 proxy from Q1..Q4,
           - S2 proxy from S1..S2,
           - T3 proxy from T1..T3,
           each with associated priority and origin FP.
        3) Ranking candidates according to:
           - Same-year A (highest),
           - Same-year proxies (Q4/T3/S2),
           - Prior-year A,
           - Prior-year proxies,
           - Otherwise: any remaining candidate by (priority, latest date).
        4) Returning the chosen candidate's value (keeping zeros, but filtering NaNs).
        """
        pit = row['PIT Date']
        if pd.isna(pit):
            return np.nan
        cutoff = pit - timedelta(days=365)  # Now works because timedelta is imported

        # Current row's fiscal period, cast to int if possible
        fp = row.get('FiscalPeriod', np.nan)
        try:
            fp_int = int(fp) if not pd.isna(fp) else None
        except Exception:
            fp_int = None

        # Collect candidate tuples: (label, priority, date, value, origin_fp)
        candidates = []

        # --- Candidate A: actual annual (0 is allowed)
        A_val = row.get('A', np.nan)
        A_dt  = row.get('A_Date', pd.NaT)
        A_ofp = row.get('A_OriginFP', np.nan)
        if pd.notna(A_val) and pd.notna(A_dt) and not pd.isna(A_ofp):
            A_dt = pd.to_datetime(A_dt, errors='coerce')
            if pd.notna(A_dt) and (cutoff <= A_dt <= pit):
                candidates.append(('A', _PERIOD_PRIORITY['A'], A_dt, float(A_val), int(A_ofp)))

        # --- Candidate Q4: sum of Q1..Q4 (0 allowed)
        q4_dt, q4_val, q4_fp = full_year_from_fixed(
            row, [f'Q{i}' for i in range(1, 5)], pit, cutoff
        )
        if pd.notna(q4_val) and pd.notna(q4_dt) and not pd.isna(q4_fp):
            candidates.append(('Q4', _PERIOD_PRIORITY['Q4'], q4_dt, float(q4_val), int(q4_fp)))

        # --- Candidate S2: sum of S1..S2 (0 allowed)
        s2_dt, s2_val, s2_fp = full_year_from_fixed(
            row, [f'S{i}' for i in range(1, 3)], pit, cutoff
        )
        if pd.notna(s2_val) and pd.notna(s2_dt) and not pd.isna(s2_fp):
            candidates.append(('S2', _PERIOD_PRIORITY['S2'], s2_dt, float(s2_val), int(s2_fp)))

        # --- Candidate T3: sum of T1..T3 (0 allowed)
        t3_dt, t3_val, t3_fp = full_year_from_fixed(
            row, [f'T{i}' for i in range(1, 4)], pit, cutoff
        )
        if pd.notna(t3_val) and pd.notna(t3_dt) and not pd.isna(t3_fp):
            candidates.append(('T3', _PERIOD_PRIORITY['T3'], t3_dt, float(t3_val), int(t3_fp)))

        # If no candidates, return NaN
        if not candidates:
            return np.nan

        # Filter out NaN-valued candidates (keep 0-valued ones)
        def valid(seq):
            return [c for c in seq if not np.isnan(c[3])]

        # Relationship between row's fiscal period and candidate's origin FP:
        # same / prior / other / unknown
        def rel(c):
            _, _, _, _, ofp = c
            if fp_int is None or ofp is None:
                return 'unknown'
            if ofp == fp_int:
                return 'same'
            if ofp == fp_int - 1:
                return 'prior'
            return 'other'

        # 1) Same-year actual annual A (max by date)
        same_A = valid(c for c in candidates if c[0] == 'A' and rel(c) == 'same')
        if same_A:
            best = max(same_A, key=lambda x: x[2])
            return best[3]

        # 2) Same-year proxies (Q4/T3/S2), max by (priority, date)
        same_proxies = valid(
            c for c in candidates
            if c[0] in ('Q4', 'S2', 'T3') and rel(c) == 'same'
        )
        if same_proxies:
            best = max(same_proxies, key=lambda x: (x[1], x[2]))
            return best[3]

        # 3) Prior-year actual annual A
        prior_A = valid(c for c in candidates if c[0] == 'A' and rel(c) == 'prior')
        if prior_A:
            best = max(prior_A, key=lambda x: x[2])
            return best[3]

        # 4) Prior-year proxies (Q4/S2/T3) if no prior A
        prior_proxies = valid(
            c for c in candidates
            if c[0] in ('Q4', 'S2', 'T3') and rel(c) == 'prior'
        )
        if prior_proxies:
            best = max(prior_proxies, key=lambda x: (x[1], x[2]))
            return best[3]

        # 5) Fallback: any candidate (other/unknown) by (priority, date)
        others = valid(candidates)
        if others:
            best = max(others, key=lambda x: (x[1], x[2]))
            return best[3]

        # Final fallback: 0.0 (should rarely be reached)
        return 0.0

    # Apply the selection function row-wise to produce AnnPITValue
    working['AnnPITValue'] = working.apply(pick_annpit_sum_with_origin, axis=1)

    # -------------------------------------------------------------------------
    # 5) QC: Future-date check + PRE-DROP stats
    # -------------------------------------------------------------------------
    # Columns whose dates should not exceed PIT Date
    date_cols = [
        'A_Date',
        'Q1_Date', 'Q2_Date', 'Q3_Date', 'Q4_Date',
        'S1_Date', 'S2_Date',
        'T1_Date', 'T2_Date', 'T3_Date'
    ]
    # Restrict to ones actually present
    present = [c for c in date_cols if c in working.columns]

    viol_counts = {}  # per-label violation counts
    # Mask for rows with any future-dated period
    any_mask = pd.Series(False, index=working.index)

    for c in present:
        # A violation is when period date > PIT Date (both need to be non-null)
        m = (
            working[c].notna() &
            working['PIT Date'].notna() &
            (pd.to_datetime(working[c], errors='coerce') > working['PIT Date'])
        )
        viol_counts[c] = int(m.sum())
        any_mask |= m  # accumulate violations across columns

    total_future_viol = int(any_mask.sum())
    print("\n=== Future-date check (period dates > PIT Date) ===")
    print("Per-label violations:", viol_counts)
    print(f"Rows with ANY future-dated period value: {total_future_viol}")
    # Flagrows with at least one future-date error
    working['HasFutureDateError'] = any_mask

    # -------------------------------------------------------------------------
    # 6) AnnPITValue_Pct + quality drop
    # -------------------------------------------------------------------------
    # Compute AnnPITValue as % of TrueValue (only when TrueValue != 0)
    working['AnnPITValue_Pct'] = np.where(
        working['AnnPITValue'].notna() &
        working['TrueValue'].notna() &
        (working['TrueValue'] != 0),
        (working['AnnPITValue'] / working['TrueValue']) * 100,
        np.nan
    )

    # Stats before dropping low-quality rows
    pre_stats = summarize_pct(working['AnnPITValue_Pct'])
    print("\n=== AnnPITValue_Pct summary (finite only) — BEFORE quality drop ===")
    for k, v in pre_stats.items():
        print(f"{k:>20}: {v}")

    # Build masks for dropping:
    pct = working['AnnPITValue_Pct']
    is_inf = np.isinf(pct)  # infinite percentages
    is_finite = np.isfinite(pct)
    # Out-of-range if % > 200 or % < 50 (but finite)
    out_of_range = is_finite & ((pct > 200) | (pct < 50))
    # Rows to drop: infinite or out-of-range values
    to_drop_quality = is_inf | out_of_range

    dropped_quality_rows = int(to_drop_quality.sum())
    print(f"\nRows to drop due to AnnPITValue_Pct (±inf or >200 or <50): {dropped_quality_rows:,}")

    # Keep only rows that passed the quality filter
    working = working.loc[~to_drop_quality].copy()

    # Stats after dropping
    post_stats = summarize_pct(working['AnnPITValue_Pct'])
    print("\n=== AnnPITValue_Pct summary — AFTER quality drop ===")
    if post_stats:
        for k, v in post_stats.items():
            print(f"{k:>20}: {v}")
    else:
        print("No finite values remain after the quality drop.")

    # -------------------------------------------------------------------------
    # 7) Final columns & save
    # -------------------------------------------------------------------------
    # Base descriptive columns to keep (if present)
    base_cols = [
        'ID', 'CompanyName', 'ImplCountry', 'CurrentCurrency', 'HistCurrency',
        'PIT Date', 'Frequency', 'UpdateCode', 'FiscalPeriod', 'FYE Month',
        'ItemCode', 'Value', 'Str_FiscalPrd'
    ]

    # Build ordered list of period date/value columns
    freq_cols = []
    for i in range(1, 5):
        freq_cols += [f'Q{i}_Date', f'Q{i}']
    for i in range(1, 3):
        freq_cols += [f'S{i}_Date', f'S{i}']
    for i in range(1, 4):
        freq_cols += [f'T{i}_Date', f'T{i}']
    freq_cols += ['A_Date', 'A']

    # Final set of columns to keep in output
    keep_cols = (
        [c for c in base_cols if c in working.columns] +
        ['TrueValue', 'AnnPITValue', 'AnnPITValue_Pct', 'HasFutureDateError'] +
        [c for c in freq_cols if c in working.columns]
    )

    # Helper columns to drop before export
    drop_cols = ['QNUM', 'SNUM', 'TNUM', 'TrueValue_Date']
    # Also drop all *_OriginFP columns
    drop_cols += [c for c in working.columns if c.endswith('_OriginFP')]
    working.drop(columns=[c for c in drop_cols if c in working.columns],
                 inplace=True, errors='ignore')

    # Reorder and restrict columns to the final layout
    mixed_processed = working.reindex(columns=keep_cols)

    # Sanity checks: necessary globals must exist
    assert 'Temp_file_path_DP' in globals(), "Temp_file_path_DP not found."
    assert 'base_output_filename' in globals(), "base_output_filename not found (set in Cell 0)."

    # Build full output path and save pipe-delimited file
    out_full = os.path.join(Temp_file_path_DP, f"{base_output_filename}.txt")
    mixed_processed.to_csv(out_full, sep='|', index=False)
    print("\nSaved full:", out_full)

    # Create a subset for lighter inspection
    subset_cols = ["ID", "PIT Date", "CompanyName", "HistCurrency",
                   "FiscalPeriod", "AnnPITValue"]
    # Only keep subset columns that actually exist
    subset_cols_existing = [col for col in subset_cols if col in mixed_processed.columns]
    subset_df = mixed_processed[subset_cols_existing].copy()
    out_subset = os.path.join(Temp_file_path_DP,
                              f"{base_output_filename}_subset.txt")
    subset_df.to_csv(out_subset, sep='|', index=False)
    print("Saved subset:", out_subset)
    # Explicitly delete subset_df to free memory
    del subset_df

    # -------------------------------------------------------------------------
    # 8) Row-accounting overview
    # -------------------------------------------------------------------------
    output_rows = len(mixed_processed)
    print("\n=== Row Accounting ===")
    print(f"Input rows:                     {input_rows:,}")
    print(f"Excludedby Frequency (E/L/R/U): {excluded_rows:,}")
    print(f"Dropped by quality (Pct rules): {dropped_quality_rows:,}")
    print(f"Output rows (final):            {output_rows:,}")
    # Sum up excluded + dropped + remaining and check against original count
    check_total = excluded_rows + dropped_quality_rows + output_rows
    print(f"Check: excluded + dropped + output = {check_total:,}")
    if check_total == input_rows:
        print("Row counts reconcile exactly.")
    else:
        print(f"Mismatch of {input_rows - check_total:+,} rows. "
              "Investigate upstream filtering or unexpected drops.")

    # Trigger garbage collection as a final cleanup step
    gc.collect()

else:
    # If the main input dataset is not available, skip all processing
    print("mixed_encoded not found or None; skipping.")

Input dataset contains 8,799,912 rows before processing.


=== Future-date check (period dates > PIT Date) ===
Per-label violations: {'A_Date': 0, 'Q1_Date': 0, 'Q2_Date': 0, 'Q3_Date': 0, 'Q4_Date': 0, 'S1_Date': 0, 'S2_Date': 0, 'T1_Date': 0, 'T2_Date': 0, 'T3_Date': 0}
Rows with ANY future-dated period value: 0

=== AnnPITValue_Pct summary (finite only) — BEFORE quality drop ===
         finite_rows: 6394790
                mean: 7176573831.922597
              median: 100.0
winsorized_mean_1pct: 149.79110460094466
                 p10: 33.774834437086085
                 p20: 100.0
                 p30: 100.0
                 p40: 100.0
                 p50: 100.0
                 p60: 100.0
                 p70: 100.75566750629723
                 p80: 121.15601310083875
                 p90: 200.0

Rows to drop due to AnnPITValue_Pct (±inf or >200 or <50): 1,320,622

=== AnnPITValue_Pct summary — AFTER quality drop ===
         finite_rows: 5074168
                mean: 107.66009

### Mixed 4

#### Set Index

In [70]:
# =====================================================================================
# SUMMARY
# =====================================================================================
# This code selects which Mixed_* dataset should be processed by choosing an index
# (e.g., Mixed_1, Mixed_2, ...). It then:
#
#   1. Ensures that a dictionary `mixed_vars` exists, mapping keys like "Mixed_1"
#      to item names.
#   2. Builds the key corresponding to the selected index and retrieves the
#      associated item name (`target_item_name`).
#   3. Prints which Mixed_* item was selected.
#   4. Constructs paths and filenames based on global variables and the selected item.
#   5. Ensures that the output directory exists by creating it if necessary.
#
# The goal is to centralize selection of a single Mixed_* dataset and prepare paths
# for downstream processing.


# === Select which Mixed_* item to run ===
mixed_index = 4  # Change this to process another dataset (e.g., 10)

# Validate that the dictionary of mixed item names exists
assert 'mixed_vars' in globals(), "mixed_vars dict not found in globals()."

# Build the key (e.g., "Mixed_1") and retrieve the associated item name
item_key = f"Mixed_{mixed_index}"
target_item_name = mixed_vars.get(item_key)
assert target_item_name, f"{item_key} not found in mixed_vars."

# Inform which item was selected
print(f"Selected: {item_key}  ->  ItemName: '{target_item_name}'")

# === Paths (reusing globals) ===
assert 'Temp_file_path_DP' in globals(), "Temp_file_path_DP not found."

# Construct the name of the input file for the selected item
file_name = f"work_subset_{target_item_name}.txt"
file_path = os.path.join(Temp_file_path_DP, file_name)

# Construct the base name for output files (will later be suffixed)
base_output_filename = f"Mixed_{target_item_name}_complete"

# Ensure the output directory exists; create it (including parent dirs) if needed
Path(Temp_file_path_DP).mkdir(parents=True, exist_ok=True)


Selected: Mixed_4  ->  ItemName: 'Income_Taxes'


#### Import relevant data



In [71]:
# =====================================================================================
# SUMMARY
# =====================================================================================
# This block:
#   1. Announces the import of a full dataset for the given `target_item_name`.
#   2. Checks whether the file at `file_path` exists.
#   3. If it exists, calls `import_file_to_dataframe(file_path)` to load the data
#      into `mixed_raw`.
#   4. If the loaded DataFrame is non-empty, prints a success message including
#      the number of rows and shows the first few rows (via display or fallback
#      to text printing).
#   5. If the load fails or returns an empty DataFrame, prints a warning and
#      creates an empty DataFrame.
#   6. If the file does not exist, prints an error message and sets `mixed_raw`
#      to an empty DataFrame.
#   7. Finally, it runs `gc.collect()` to trigger garbage collection and free
#      memory.
#
# Note: All previous emoji symbols in the print statements have been removed.

# Inform the user that we are starting the import for this item
print(f"\nImporting full dataset for Item: '{target_item_name}' ...")

# Check if the file path exists on disk
if os.path.exists(file_path):
    # If the file exists, attempt to import it into a DataFrame
    mixed_raw = import_file_to_dataframe(file_path)

    # If the DataFrame is successfully loaded and not empty
    if mixed_raw is not None and not mixed_raw.empty:
        # Print a success message including row count with thousands separator
        print(f"Full dataset loaded successfully: {len(mixed_raw):,} rows total.")

        try:
            # Try to display the first few rows (Jupyter / IPython display)
            display(mixed_raw.head())
        except Exception:
            # If display is not available, fall back to printing as plain text
            print(mixed_raw.head().to_string(index=False))
    else:
        # If DataFrame is None or empty, warn and create an empty DataFrame
        print("Dataset appears empty or could not be loaded.")
        mixed_raw = pd.DataFrame()
else:
    # If the file does not exist, report an error and set mixed_raw to empty
    print(f"File not found: {file_path}")
    mixed_raw = pd.DataFrame()

# Run garbage collection to free up memory after the load attempt
gc.collect()



Importing full dataset for Item: 'Income_Taxes' ...
Full dataset loaded successfully: 3,556,775 rows total.


Unnamed: 0,ID,CompanyName,ImplCountry,CurrentCurrency,HistCurrency,PIT Date,Frequency,UpdateCode,FiscalPeriod,FYE Month,ItemCode,Value
0,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1992,December,1451,24.615657
1,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1993,December,1451,20.007064
2,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1994,December,1451,26.888541
3,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1996-05-03,A,3,1995,December,1451,4.741
4,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1998-07-03,A,3,1996,December,1451,2.526497


0

#### Encode Frequency Code (Check of output required!)

In [72]:
# =====================================================================================
# SUMMARY
# =====================================================================================
# This snippet provides:
#
# 1. A helper function `last2` that returns the last two digits of a number as a
#    zero-padded string (for building YY strings).
#
# 2. A function `add_str_fiscalprd(df)` which:
#    - Works on a copy of an input DataFrame containing financial periods.
#    - Normalizes the 'Frequency' (upper-case, no missing).
#    - Stores the original 'FiscalPeriod' and converts it to numeric.
#    - Creates a string representation 'Str_FiscalPrd' depending on the frequency:
#         - Q/C/E/R: quarter-based ("QnYyy")
#         - A/B: annual ("Yyy")
#         - F/S: semiannual ("SnYyy")
#         - K/T/L/U: trimester-like ("TnYyy")
#    - Derives an implied full-year integer 'ImplFiscPer_Calculated' from the
#      two-digit year (80–99 => 19xx, else 20xx).
#    - For annual rows (A/B), checks discrepancies between original
#      'FiscalPeriod' and implied full-year; prints a small preview & total count.
#    - Overwrites 'FiscalPeriod' with 'ImplFiscPer_Calculated' and drops helper
#      columns.
#
# 3. A small driver block that:
#    - Checks that `mixed_raw` exists and is non-empty.
#    - Applies `add_str_fiscalprd` to produce `mixed_encoded`.
#    - Displays a head preview or prints a message and sets `mixed_encoded = None`
#      if input is missing/empty.

def last2(n):
    """Return last two digits as zero-padded string, or None if NaN."""
    # If input is NaN (or pandas-style missing), return None (no digits)
    if pd.isna(n):
        return None
    # Convert to int, format as 4-digit zero-padded string, take last 2 chars
    # Example: n=2023 -> "2023"[-2:] -> "23"
    return f"{int(n):04d}"[-2:]


def add_str_fiscalprd(df):
    """
    Creates 'Str_FiscalPrd' and overwrites 'FiscalPeriod' with an implied full year.
    Also prints discrepancies for Annual (A,B) rows between original and calculated year.

    The mapping logic:
      - Quarterly (C, Q, E, R):   fp encodes a quarter index -> "Q{1-4}Y{yy}"
      - Annual   (A, B):         fp is the year itself      -> "Y{yy}"
      - Semiannual (F, S):       fp encodes half-year       -> "S{1-2}Y{yy}"
      - Trimester-like (K, T,
         L, U):                  fp encodes trimester       -> "T{1-3}Y{yy}"
    Then we re-infer the full year from the YY part with 19xx/20xx rule.
    """
    # Work on a copy to avoid mutating the original input DataFrame
    df = df.copy()

    # Normalize frequency codes:
    # - uppercase for consistency
    # - fill missing values with empty string
    df["Frequency"] = df["Frequency"].str.upper().fillna("")

    # Store original FiscalPeriod in a separate column for later comparison
    df['Original_FiscalPeriod'] = df['FiscalPeriod']

    # Convert FiscalPeriod to numeric, coerce errors to NaN
    fp = pd.to_numeric(df["FiscalPeriod"], errors="coerce")

    # Masks for different frequency groups
    m_quarter = df["Frequency"].isin(["C", "Q", "E", "R"])  # quarter-based
    m_AB      = df["Frequency"].isin(["A", "B"])            # annual
    m_FS      = df["Frequency"].isin(["F", "S"])            # semiannual
    m_KTLU    = df["Frequency"].isin(["K", "T", "L", "U"])  # trimester-like

    # Initialize Str_FiscalPrd as NaN; we will fill per frequency group
    df["Str_FiscalPrd"] = np.nan

    # -------------------------
    # Quarter-based encoding
    # -------------------------
    # Quarter number: (fp % 4) + 1 => yields 1..4 (if fp is integer-based encoding)
    q_part = ((fp % 4) + 1).where(m_quarter)
    # Year part for quarter: integer division by 4, then take last two digits
    q_year = (fp // 4).where(m_quarter).apply(last2)
    # Assign "Q{quarter}Y{yy}" for quarter frequencies
    df.loc[m_quarter, "Str_FiscalPrd"] = (
        "Q" +
        q_part.astype("Int64").astype(str) +
        "Y" +
        q_year.fillna('')
    )

    # -------------------------
    # Annual encoding (A, B)
    # -------------------------
    # Year part is fp itself for A/B; take last two digits via last2
    ab_year = fp.where(m_AB).apply(last2)
    # Assign "Y{yy}" for annual frequencies
    df.loc[m_AB, "Str_FiscalPrd"] = "Y" + ab_year.fillna('')

    # -------------------------
    # Semiannual encoding (F, S)
    # -------------------------
    # Semester number: (fp % 2) + 1 => 1 or 2
    fs_sem  = ((fp % 2) + 1).where(m_FS)
    # Year part: fp // 2, then last two digits
    fs_year = (fp // 2).where(m_FS).apply(last2)
    # Assign "S{sem}Y{yy}" for F/S
    df.loc[m_FS, "Str_FiscalPrd"] = (
        "S" +
        fs_sem.astype("Int64").astype(str) +
        "Y" +
        fs_year.fillna('')
    )

    # -------------------------
    # Trimester-like encoding (K, T, L, U)
    # -------------------------
    # Term number: (fp % 3) + 1 => 1..3
    t_term  = ((fp % 3) + 1).where(m_KTLU)
    # Year part: fp // 3, then last two digits
    t_year  = (fp // 3).where(m_KTLU).apply(last2)
    # Assign "T{term}Y{yy}" for K/T/L/U frequencies
    df.loc[m_KTLU, "Str_FiscalPrd"] = (
        "T" +
        t_term.astype("Int64").astype(str) +
        "Y" +
        t_year.fillna('')
    )

    # ---------------------------------------------------------------------
    # Implied full year from Str_FiscalPrd (19xx / 20xx reconstruction)
    # ---------------------------------------------------------------------
    # Extract two-digit year from "Yyy" pattern, e.g., "Q1Y23" -> "23"
    year_part = df['Str_FiscalPrd'].str.extract(r'Y(\d{2})', expand=False)
    # Convert that to numeric (NaN if not parseable)
    year_numeric = pd.to_numeric(year_part, errors='coerce')

    # Map two-digit year to full year:
    #  - 80–99 => 19xx
    #  - else  => 20xx
    df['ImplFiscPer_Calculated'] = year_numeric.apply(
        lambda x: int(f"19{int(x):02d}") if pd.notna(x) and x >= 80
        else (int(f"20{int(x):02d}") if pd.notna(x) else np.nan)
    )

    # ---------------------------------------------------------------------
    # Discrepancies check for Annual frequencies (A, B)
    # ---------------------------------------------------------------------
    # Work on a subset of annual frequencies only
    annual_rows_for_check = df[m_AB].copy()

    # Build boolean mask where implied full year does NOT match the original
    discrepancy_mask_annual = ~(
        # Either they match numerically...
        (annual_rows_for_check['ImplFiscPer_Calculated'] ==
         pd.to_numeric(annual_rows_for_check['Original_FiscalPeriod'], errors='coerce'))
        |
        # ...or both are NaN
        (annual_rows_for_check['ImplFiscPer_Calculated'].isna() &
         pd.to_numeric(annual_rows_for_check['Original_FiscalPeriod'], errors='coerce').isna())
    )

    # Subset rows with discrepancies
    discrepancy_rows = annual_rows_for_check[discrepancy_mask_annual].copy()

    # If we have any, show a short sample and the total count
    if not discrepancy_rows.empty:
        print("\nDiscrepancies found between original FiscalPeriod and calculated ImplFiscPer for Annual (A, B) frequencies:")
        display(
            discrepancy_rows[
                ['ID', 'Frequency', 'Original_FiscalPeriod', 'Str_FiscalPrd', 'ImplFiscPer_Calculated']
            ].head()
        )
        print(f"Total discrepancies found for Annual frequencies: {len(discrepancy_rows)}")
    else:
        print("\nNo discrepancies found between original FiscalPeriod and calculated ImplFiscPer for Annual (A, B) frequencies.")

    # ---------------------------------------------------------------------
    # Overwrite FiscalPeriod and drop temporary helper columns
    # ---------------------------------------------------------------------
    # Replace 'FiscalPeriod' with the implied full-year value we just calculated
    df['FiscalPeriod'] = df['ImplFiscPer_Calculated']
    # Drop helper columns not needed downstream
    df.drop(columns=['Original_FiscalPeriod', 'ImplFiscPer_Calculated'], inplace=True)

    # Return the modified DataFrame
    return df


# =============================================================================
# Driver: apply encoding to mixed_raw if present and non-empty
# =============================================================================
if 'mixed_raw' in globals() and mixed_raw is not None and not mixed_raw.empty:
    # Inform which item we are encoding for
    print(f"Applying encoding to Mixed dataset for '{target_item_name}' ...")
    # Apply fiscal period encoding
    mixed_encoded = add_str_fiscalprd(mixed_raw)
    # Show a preview of the encoded data
    display(mixed_encoded.head())
else:
    # If mixed_raw is missing or empty, we skip and mark mixed_encoded as None
    print("mixed_raw not found or empty. Cannot perform encoding.")
    mixed_encoded = None


Applying encoding to Mixed dataset for 'Income_Taxes' ...


  df.loc[m_quarter, "Str_FiscalPrd"] = (



No discrepancies found between original FiscalPeriod and calculated ImplFiscPer for Annual (A, B) frequencies.


Unnamed: 0,ID,CompanyName,ImplCountry,CurrentCurrency,HistCurrency,PIT Date,Frequency,UpdateCode,FiscalPeriod,FYE Month,ItemCode,Value,Str_FiscalPrd
0,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1992,December,1451,24.615657,Y92
1,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1993,December,1451,20.007064,Y93
2,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1994,December,1451,26.888541,Y94
3,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1996-05-03,A,3,1995,December,1451,4.741,Y95
4,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1998-07-03,A,3,1996,December,1451,2.526497,Y96


#### Annualize data with most recent information (Check of output required!)

In [73]:
# @title
# =====================================================================================
# SUMMARY
# =====================================================================================
# This script takes an input DataFrame `mixed_encoded` (if present in the global scope)
# that contains financial time-series data (per company, item, currency, fiscal period,
# and PIT Date). It then:
#
# 1. Cleans and standardizes key columns (dates, numeric types, string IDs).
# 2. Excludes rows with certain frequencies (E/L/R/U).
# 3. Parses fiscal period strings into quarter/semester/trimester indicators (QNUM/SNUM/TNUM).
# 4. Uses a custom, vectorized "as-of" join (`asof_numpy`) to attach the most recent
#    annual, quarterly, semiannual, and trimester values for each (ID, HistCurrency,
#    ItemCode, FiscalPeriod) up to each row’s PIT Date.
# 5. Builds "full-year" candidate values from:
#       - actual annuals (A),
#       - sum of Q1..Q4 (Q4 proxy),
#       - sum of S1..S2 (S2 proxy),
#       - sum of T1..T3 (T3 proxy),
#    and selects the best candidate based on priorities and relationship to the row’s
#    fiscal period (same-year vs prior-year).
# 6. Computes an annual PIT-based metric `AnnPITValue` and compares it to the “true”
#    annual value (`TrueValue`) to derive a percentage `AnnPITValue_Pct` for QC.
# 7. Performs quality checks:
#       - Ensures no period-date is after the PIT Date.
#       - Drops rows whose `AnnPITValue_Pct` is outside the range [50, 200] or infinite.
# 8. Keeps a curated set of columns, drops helper columns, and saves:
#       - a full output file
#       - a subset file with key columns for quick inspection.
# 9. Prints row-accounting stats and frees some memory.
#
# If `mixed_encoded` is not defined or is None, it simply prints a message and exits.

import pandas as pd
import numpy as np
import os
import gc
from datetime import timedelta            # <--- Added to fix NameError
from scipy.stats.mstats import winsorize  # <--- Added to fix NameError

# Enable pandas "copy-on-write" behavior to reduce unintended chained assignment effects
pd.options.mode.copy_on_write = True

# ---------- Helper: fast as-of (right.PIT <= left.PIT) ----------

def _key(fr, cols):
    """
    Helper function to build a string key from multiple columns.
    For each row, join the values of 'cols' with '||'.
    """
    return fr[cols].astype(str).agg('||'.join, axis=1)


def asof_numpy(left_df: pd.DataFrame, right_df: pd.DataFrame, by_cols: list[str]):
    """
    For each row in left_df, find the latest (as-of) Value from right_df
    with matching by_cols and right_df['PIT Date'] <= left_df['PIT Date'].

    This is a manual, NumPy-based implementation of an "as-of merge" grouped by `by_cols`.
    It:
    - filters out rows with missing required fields,
    - normalizes PIT Date to daily precision,
    - groups right_df by composite key of `by_cols`,
    - for each left row, binary-searches into the matching right group by PIT Date,
      picking the last date <= the left PIT Date,
    - returns two aligned arrays: (values, dates) for the left_df rows.
    """
    # Initialize outputs with NaNs and NaT for all left_df rows
    out_vals  = np.full(len(left_df), np.nan, dtype='float64')
    out_dates = np.full(len(left_df), 'NaT', dtype='datetime64[ns]')

    # Required columns on each side
    left_req  = by_cols + ['PIT Date']
    right_req = by_cols + ['PIT Date', 'Value']

    # Mask: rows that have all required fields non-null
    lmask = left_df[left_req].notna().all(axis=1)
    rmask = right_df[right_req].notna().all(axis=1)

    # If either side has no valid rows, return the default empty outputs
    if not lmask.any() or not rmask.any():
        return out_vals, out_dates

    # Work on filtered copies only (avoid side effects)
    l = left_df.loc[lmask, left_req].copy()
    r = right_df.loc[rmask, right_req].copy()

    # Normalize PIT Date columns to datetime at day precision
    l['PIT Date'] = pd.to_datetime(l['PIT Date'], errors='coerce').dt.floor('D')
    r['PIT Date'] = pd.to_datetime(r['PIT Date'], errors='coerce').dt.floor('D')

    # Build composite keys for group-level match (based on by_cols)
    l['__k'] = _key(l, by_cols)
    r['__k'] = _key(r, by_cols)

    # Sort right side by key and date to enable binary search per key
    r = r.sort_values(['__k', 'PIT Date']).reset_index(drop=True)

    # Extract NumPy arrays for right side
    rk   = r['__k'].to_numpy()
    rdt  = r['PIT Date'].to_numpy()
    rval = r['Value'].to_numpy()

    # Find unique keys and first index for each key in the sorted right side
    uniq, first = np.unique(rk, return_index=True)

    # Build a dict: key -> (dates array, values array) slice
    slices = {}
    for i, k in enumerate(uniq):
        s = first[i]  # start of this key
        e = first[i + 1] if i + 1 < len(first) else len(r)  # end of this key
        slices[k] = (rdt[s:e], rval[s:e])

    # Left side indices and arrays
    l_idx = l.index.to_numpy()
    lk    = l['__k'].to_numpy()
    ldt   = l['PIT Date'].to_numpy()

    # Sort left side by key (stable mergesort to preserve original row order within key)
    order = np.argsort(lk, kind='mergesort')
    sk, sd, sp = lk[order], ldt[order], l_idx[order]

    # i iterates over the sorted left rows
    i = 0
    n = len(sk)
    while i < n:
        k = sk[i]  # current composite key
        j = i + 1
        # Find contiguous block [i:j) for this key
        while j < n and sk[j] == k:
            j += 1

        # Dates and original positions of this key’s left rows
        block_dates = sd[i:j]
        block_pos   = sp[i:j]

        # If we have matching right-hand slices for this key, do the as-of search
        if k in slices:
            r_dates, r_vals = slices[k]
            # searchsorted(..., 'right') - 1 gives index of last r_date <= block_date
            pos   = np.searchsorted(r_dates, block_dates, side='right') - 1
            valid = pos >= 0  # positions where such a date exists
            if np.any(valid):
                # Fill outputs for left rows where we found a valid match
                out_vals[block_pos[valid]]  = r_vals[pos[valid]]
                out_dates[block_pos[valid]] = r_dates[pos[valid]]
        # Move to next key block
        i = j

    return out_vals, out_dates


# ---------- Small helpers ----------

def pctile(s, q):
    """
    Safe percentile calculation: returns quantile q of Series `s`,
    or NaN if quantile fails.
    """
    try:
        return s.quantile(q, interpolation='linear')
    except Exception:
        return np.nan


def summarize_pct(series: pd.Series):
    """
    Computes a summary of finite values of a series:
      - count
      - mean, median
      - 1% winsorized mean
      - selected deciles (p10, p20, ..., p90)
    Infinite values are treated as NaN and removed.
    If no finite values remain, returns an empty dict.
    """
    # Replace +/-inf with NaN and drop all NaNs
    s = series.replace([np.inf, -np.inf], np.nan).dropna()
    if s.empty:
        return {}
    
    # FIX APPLIED HERE: .to_numpy().copy() ensures the array is writable for winsorize
    w_mean = winsorize(s.to_numpy().copy(), limits=[0.01, 0.01]).mean()
    
    return {
        "finite_rows": len(s),
        "mean": s.mean(),
        "median": s.median(),
        "winsorized_mean_1pct": w_mean,
        "p10": pctile(s, 0.10), "p20": pctile(s, 0.20), "p30": pctile(s, 0.30),
        "p40": pctile(s, 0.40), "p50": pctile(s, 0.50), "p60": pctile(s, 0.60),
        "p70": pctile(s, 0.70), "p80": pctile(s, 0.80), "p90": pctile(s, 0.90)
    }


# ---------- Priority for full-year candidates ----------

# Fixed priority mapping for full-year candidates:
#   'A'  : actual annual value
#   'Q4' : annual proxy from four quarters
#   'T3' : annual proxy from three trimesters
#   'S2' : annual proxy from two semesters
_PERIOD_PRIORITY = {
    'A': 100,  # highest priority: actual annual
    'Q4': 90,  # then Q1+Q2+Q3+Q4
    'T3': 80,  # then T1+T2+T3
    'S2': 70,  # then S1+S2
}

# ============================ MAIN ============================

# Only run the main logic if a global DataFrame `mixed_encoded` exists and is not None
if 'mixed_encoded' in globals() and mixed_encoded is not None:
    # Count initial input rows
    input_rows = len(mixed_encoded)
    print(f"Input dataset contains {input_rows:,} rows before processing.\n")

    # Work on a copy of the input dataset
    working = mixed_encoded.copy()

    # -------------------------------------------------------------------------
    # Exclude certain frequencies (E/L/R/U)
    # -------------------------------------------------------------------------
    # Create mask of rows whose Frequency is one of E, L, R, U (case-insensitive)
    excl_mask = working['Frequency'].astype(str).str.upper().isin(['E', 'L', 'R', 'U'])
    # Count how many rows will be excluded
    excluded_rows = int(excl_mask.sum())
    # Keep only rows that are NOT in the exclusion set
    working = working.loc[~excl_mask].copy()

    # -------------------------------------------------------------------------
    # Set dtypes and normalize important columns
    # -------------------------------------------------------------------------
    # Convert PIT Date to datetime (coerce errors -> NaT), floor to day
    working['PIT Date']     = pd.to_datetime(working['PIT Date'], errors='coerce').dt.floor('D')
    # FiscalPeriod: numeric (e.g., 2021, 2022, ...)
    working['FiscalPeriod'] = pd.to_numeric(working['FiscalPeriod'], errors='coerce')
    # Value: numeric (float)
    working['Value']        = pd.to_numeric(working['Value'], errors='coerce')

    # Convert key ID / code columns to string to ensure consistency
    for c in ['ID', 'HistCurrency', 'ItemCode', 'Frequency', 'Str_FiscalPrd']:
        if c in working.columns:
            working[c] = working[c].astype(str)

    # -------------------------------------------------------------------------
    # Parse Q/S/T markers from Str_FiscalPrd (like 'Q1Y2023')
    # -------------------------------------------------------------------------
    # Extract quarter number Q1..Q4 from e.g. "Q1Y2023" into QNUM
    working['QNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^Q([1-4])Y', expand=False),
        errors='coerce'
    )
    # Extract semiannual number S1..S2 into SNUM
    working['SNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^S([1-2])Y', expand=False),
        errors='coerce'
    )
    # Extract trimester number T1..T3 into TNUM
    working['TNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^T([1-3])Y', expand=False),
        errors='coerce'
    )

    # -------------------------------------------------------------------------
    # Ensure period columns exist (Q1..Q4, S1..S2, T1..T3, A + their date cols)
    # -------------------------------------------------------------------------
    # Create value columns for Q1..Q4, S1..S2, T1..T3, A if they are missing
    for c in [*(f'Q{i}' for i in range(1, 5)),
              *(f'S{i}' for i in range(1, 3)),
              *(f'T{i}' for i in range(1, 4)),
              'A']:
        if c not in working.columns:
            working[c] = np.nan

    # Create corresponding *_Date columns if missing
    for c in [*(f'Q{i}_Date' for i in range(1, 5)),
              *(f'S{i}_Date' for i in range(1, 3)),
              *(f'T{i}_Date' for i in range(1, 4)),
              'A_Date']:
        if c not in working.columns:
            working[c] = pd.NaT

    # Base key for many of the as-of mappings
    base_keys = ['ID', 'HistCurrency', 'ItemCode', 'FiscalPeriod']

    # -------------------------------------------------------------------------
    # 1) Derive TrueValue from annuals (A/B frequencies)
    # -------------------------------------------------------------------------
    # Mask annual-like rows where Value is present
    mask_annual = working['Frequency'].isin(['A', 'B']) & working['Value'].notna()
    # annual_src: one row per (ID, FiscalPeriod, HistCurrency) with last PIT Date
    annual_src = (
        working.loc[mask_annual,
                    ['ID', 'FiscalPeriod', 'HistCurrency', 'PIT Date', 'Value']]
        .sort_values(['ID', 'FiscalPeriod', 'HistCurrency', 'PIT Date'])
        .drop_duplicates(['ID', 'FiscalPeriod', 'HistCurrency'], keep='last')
        .rename(columns={'Value': 'TrueValue', 'PIT Date': 'TrueValue_Date'})
    )
    # Left-join true annual value back onto working
    working = working.merge(
        annual_src,
        on=['ID', 'FiscalPeriod', 'HistCurrency'],
        how='left'
    )

    # -------------------------------------------------------------------------
    # 2) As-of mapping (same FiscalPeriod) for A/Q/S/T
    # -------------------------------------------------------------------------

    # ----- Annual -----
    # Source rows for annual frequencies A/B
    src_A = working.loc[
        working['Frequency'].isin(['A', 'B']) & working['Value'].notna(),
        base_keys + ['PIT Date', 'Value']
    ].copy()
    # As-of join: for each working row, get most recent annual value by PIT Date
    vA, dA = asof_numpy(working, src_A, by_cols=base_keys)
    working['A'], working['A_Date'] = vA, dA
    # Origin fiscal period of annual value (same as row's FiscalPeriod when present)
    working['A_OriginFP'] = np.where(
        working['A'].notna(), working['FiscalPeriod'], np.nan
    )

    # ----- Quarterly -----
    # Source rows for quarterly frequencies (Q/C) with valid QNUM
    src_Q = working.loc[
        working['Frequency'].isin(['Q', 'C']) & working['QNUM'].notna(),
        base_keys + ['QNUM', 'PIT Date', 'Value']
    ].copy()
    for q in (1, 2, 3, 4):
        # Restrict to a specific quarter q
        rv = src_Q[src_Q['QNUM'] == q].drop(columns=['QNUM'])
        # As-of join for that quarter
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'Q{q}', f'Q{q}_Date'
        working[col], working[dcol] = v, d
        # Origin FP column for that quarter
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        # Fill origin FP only where quarter value is non-null and origin not yet set
        m = working[col].notna() & working[ocol].isna()
        working.loc[m, ocol] = working.loc[m, 'FiscalPeriod']

    # ----- Semiannual -----
    # Source rows for semiannual frequencies (S/F) with valid SNUM
    src_S = working.loc[
        working['Frequency'].isin(['S', 'F']) & working['SNUM'].notna(),
        base_keys + ['SNUM', 'PIT Date', 'Value']
    ].copy()
    for s in (1, 2):
        rv = src_S[src_S['SNUM'] == s].drop(columns=['SNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'S{s}', f'S{s}_Date'
        working[col], working[dcol] = v, d
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        m = working[col].notna() & working[ocol].isna()
        working.loc[m, ocol] = working.loc[m, 'FiscalPeriod']

    # ----- Trimester -----
    # Source rows for trimester frequencies (T/K) with valid TNUM
    src_T = working.loc[
        working['Frequency'].isin(['T', 'K']) & working['TNUM'].notna(),
        base_keys + ['TNUM', 'PIT Date', 'Value']
    ].copy()
    for t in (1, 2, 3):
        rv = src_T[src_T['TNUM'] == t].drop(columns=['TNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'T{t}', f'T{t}_Date'
        working[col], working[dcol] = v, d
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        m = working[col].notna() & working[ocol].isna()
        working.loc[m, ocol] = working.loc[m, 'FiscalPeriod']

    # -------------------------------------------------------------------------
    # 3) Prepare labels & normalize dates (NO prev-year fill, NO forward-fill)
    # -------------------------------------------------------------------------
    # Sort working data consistently for downstream calculations
    working = working.sort_values(['ID', 'HistCurrency', 'FiscalPeriod', 'PIT Date'])

    # List of all period value columns
    value_cols_all  = [f'Q{i}' for i in range(1, 5)] + \
                      [f'S{i}' for i in range(1, 3)] + \
                      [f'T{i}' for i in range(1, 4)] + ['A']
    # Corresponding date columns
    date_cols_all   = [f'{c}_Date' for c in value_cols_all]
    # Corresponding origin FP columns
    origin_cols_all = [f'{c}_OriginFP' for c in value_cols_all]

    # Ensure that all date columns are proper datetimes (floor to day)
    # Note: explicitly no groupby-forward-fill here – only asof-filled values remain
    for c in date_cols_all:
        if c in working.columns:
            working[c] = pd.to_datetime(working[c], errors='coerce').dt.floor('D')

    # -------------------------------------------------------------------------
    # 4) Build full-year candidates from fixed sets (Q1–Q4, S1–S2, T1–T3)
    # -------------------------------------------------------------------------
    def full_year_from_fixed(row, labels, pit, cutoff):
        """
        Fixed full-year from given labels (e.g. Q1..Q4, S1..S2, T1..T3):

          - Requires ALL labels to have:
              value, date, origin_fp.
          - All dates must be within [cutoff, pit].
          - origin_fp = max(origin_fp_i)  (the 'newest year' across components).

        Returns (dt, val, origin_fp) or (NaT, NaN, NaN) if any requirement fails.
        """
        vals, dts, fps = [], [], []
        for lbl in labels:
            v = row.get(lbl, np.nan)
            d = row.get(f'{lbl}_Date', pd.NaT)
            o = row.get(f'{lbl}_OriginFP', np.nan)

            # Require non-null for value, date, origin FP
            if pd.isna(v) or pd.isna(d) or pd.isna(o):
                return pd.NaT, np.nan, np.nan

            # Normalize date and check it lies within [cutoff, pit]
            d = pd.to_datetime(d, errors='coerce')
            if pd.isna(d) or not (cutoff <= d <= pit):
                return pd.NaT, np.nan, np.nan

            vals.append(float(v))
            dts.append(d)
            fps.append(int(o))

        # Sum all component values, pick latest date and max origin FP
        total_val = float(np.nansum(vals))
        latest_dt = max(dts)
        origin_fp = max(fps)
        return latest_dt, total_val, origin_fp

    def pick_annpit_sum_with_origin(row):
        """
        For each row, pick the best annual PIT-based value (AnnPITValue) by:

        1) Building a time window [PIT - 365 days, PIT].
        2) Generating candidate annual values:
           - actual annual A (if within window and with origin FP),
           - Q4 proxy from Q1..Q4,
           - S2 proxy from S1..S2,
           - T3 proxy from T1..T3,
           each with associated priority and origin FP.
        3) Ranking candidates according to:
           - Same-year A (highest),
           - Same-year proxies (Q4/T3/S2),
           - Prior-year A,
           - Prior-year proxies,
           - Otherwise: any remaining candidate by (priority, latest date).
        4) Returning the chosen candidate's value (keeping zeros, but filtering NaNs).
        """
        pit = row['PIT Date']
        if pd.isna(pit):
            return np.nan
        cutoff = pit - timedelta(days=365)  # Now works because timedelta is imported

        # Current row's fiscal period, cast to int if possible
        fp = row.get('FiscalPeriod', np.nan)
        try:
            fp_int = int(fp) if not pd.isna(fp) else None
        except Exception:
            fp_int = None

        # Collect candidate tuples: (label, priority, date, value, origin_fp)
        candidates = []

        # --- Candidate A: actual annual (0 is allowed)
        A_val = row.get('A', np.nan)
        A_dt  = row.get('A_Date', pd.NaT)
        A_ofp = row.get('A_OriginFP', np.nan)
        if pd.notna(A_val) and pd.notna(A_dt) and not pd.isna(A_ofp):
            A_dt = pd.to_datetime(A_dt, errors='coerce')
            if pd.notna(A_dt) and (cutoff <= A_dt <= pit):
                candidates.append(('A', _PERIOD_PRIORITY['A'], A_dt, float(A_val), int(A_ofp)))

        # --- Candidate Q4: sum of Q1..Q4 (0 allowed)
        q4_dt, q4_val, q4_fp = full_year_from_fixed(
            row, [f'Q{i}' for i in range(1, 5)], pit, cutoff
        )
        if pd.notna(q4_val) and pd.notna(q4_dt) and not pd.isna(q4_fp):
            candidates.append(('Q4', _PERIOD_PRIORITY['Q4'], q4_dt, float(q4_val), int(q4_fp)))

        # --- Candidate S2: sum of S1..S2 (0 allowed)
        s2_dt, s2_val, s2_fp = full_year_from_fixed(
            row, [f'S{i}' for i in range(1, 3)], pit, cutoff
        )
        if pd.notna(s2_val) and pd.notna(s2_dt) and not pd.isna(s2_fp):
            candidates.append(('S2', _PERIOD_PRIORITY['S2'], s2_dt, float(s2_val), int(s2_fp)))

        # --- Candidate T3: sum of T1..T3 (0 allowed)
        t3_dt, t3_val, t3_fp = full_year_from_fixed(
            row, [f'T{i}' for i in range(1, 4)], pit, cutoff
        )
        if pd.notna(t3_val) and pd.notna(t3_dt) and not pd.isna(t3_fp):
            candidates.append(('T3', _PERIOD_PRIORITY['T3'], t3_dt, float(t3_val), int(t3_fp)))

        # If no candidates, return NaN
        if not candidates:
            return np.nan

        # Filter out NaN-valued candidates (keep 0-valued ones)
        def valid(seq):
            return [c for c in seq if not np.isnan(c[3])]

        # Relationship between row's fiscal period and candidate's origin FP:
        # same / prior / other / unknown
        def rel(c):
            _, _, _, _, ofp = c
            if fp_int is None or ofp is None:
                return 'unknown'
            if ofp == fp_int:
                return 'same'
            if ofp == fp_int - 1:
                return 'prior'
            return 'other'

        # 1) Same-year actual annual A (max by date)
        same_A = valid(c for c in candidates if c[0] == 'A' and rel(c) == 'same')
        if same_A:
            best = max(same_A, key=lambda x: x[2])
            return best[3]

        # 2) Same-year proxies (Q4/T3/S2), max by (priority, date)
        same_proxies = valid(
            c for c in candidates
            if c[0] in ('Q4', 'S2', 'T3') and rel(c) == 'same'
        )
        if same_proxies:
            best = max(same_proxies, key=lambda x: (x[1], x[2]))
            return best[3]

        # 3) Prior-year actual annual A
        prior_A = valid(c for c in candidates if c[0] == 'A' and rel(c) == 'prior')
        if prior_A:
            best = max(prior_A, key=lambda x: x[2])
            return best[3]

        # 4) Prior-year proxies (Q4/S2/T3) if no prior A
        prior_proxies = valid(
            c for c in candidates
            if c[0] in ('Q4', 'S2', 'T3') and rel(c) == 'prior'
        )
        if prior_proxies:
            best = max(prior_proxies, key=lambda x: (x[1], x[2]))
            return best[3]

        # 5) Fallback: any candidate (other/unknown) by (priority, date)
        others = valid(candidates)
        if others:
            best = max(others, key=lambda x: (x[1], x[2]))
            return best[3]

        # Final fallback: 0.0 (should rarely be reached)
        return 0.0

    # Apply the selection function row-wise to produce AnnPITValue
    working['AnnPITValue'] = working.apply(pick_annpit_sum_with_origin, axis=1)

    # -------------------------------------------------------------------------
    # 5) QC: Future-date check + PRE-DROP stats
    # -------------------------------------------------------------------------
    # Columns whose dates should not exceed PIT Date
    date_cols = [
        'A_Date',
        'Q1_Date', 'Q2_Date', 'Q3_Date', 'Q4_Date',
        'S1_Date', 'S2_Date',
        'T1_Date', 'T2_Date', 'T3_Date'
    ]
    # Restrict to ones actually present
    present = [c for c in date_cols if c in working.columns]

    viol_counts = {}  # per-label violation counts
    # Mask for rows with any future-dated period
    any_mask = pd.Series(False, index=working.index)

    for c in present:
        # A violation is when period date > PIT Date (both need to be non-null)
        m = (
            working[c].notna() &
            working['PIT Date'].notna() &
            (pd.to_datetime(working[c], errors='coerce') > working['PIT Date'])
        )
        viol_counts[c] = int(m.sum())
        any_mask |= m  # accumulate violations across columns

    total_future_viol = int(any_mask.sum())
    print("\n=== Future-date check (period dates > PIT Date) ===")
    print("Per-label violations:", viol_counts)
    print(f"Rows with ANY future-dated period value: {total_future_viol}")
    # Flagrows with at least one future-date error
    working['HasFutureDateError'] = any_mask

    # -------------------------------------------------------------------------
    # 6) AnnPITValue_Pct + quality drop
    # -------------------------------------------------------------------------
    # Compute AnnPITValue as % of TrueValue (only when TrueValue != 0)
    working['AnnPITValue_Pct'] = np.where(
        working['AnnPITValue'].notna() &
        working['TrueValue'].notna() &
        (working['TrueValue'] != 0),
        (working['AnnPITValue'] / working['TrueValue']) * 100,
        np.nan
    )

    # Stats before dropping low-quality rows
    pre_stats = summarize_pct(working['AnnPITValue_Pct'])
    print("\n=== AnnPITValue_Pct summary (finite only) — BEFORE quality drop ===")
    for k, v in pre_stats.items():
        print(f"{k:>20}: {v}")

    # Build masks for dropping:
    pct = working['AnnPITValue_Pct']
    is_inf = np.isinf(pct)  # infinite percentages
    is_finite = np.isfinite(pct)
    # Out-of-range if % > 200 or % < 50 (but finite)
    out_of_range = is_finite & ((pct > 200) | (pct < 50))
    # Rows to drop: infinite or out-of-range values
    to_drop_quality = is_inf | out_of_range

    dropped_quality_rows = int(to_drop_quality.sum())
    print(f"\nRows to drop due to AnnPITValue_Pct (±inf or >200 or <50): {dropped_quality_rows:,}")

    # Keep only rows that passed the quality filter
    working = working.loc[~to_drop_quality].copy()

    # Stats after dropping
    post_stats = summarize_pct(working['AnnPITValue_Pct'])
    print("\n=== AnnPITValue_Pct summary — AFTER quality drop ===")
    if post_stats:
        for k, v in post_stats.items():
            print(f"{k:>20}: {v}")
    else:
        print("No finite values remain after the quality drop.")

    # -------------------------------------------------------------------------
    # 7) Final columns & save
    # -------------------------------------------------------------------------
    # Base descriptive columns to keep (if present)
    base_cols = [
        'ID', 'CompanyName', 'ImplCountry', 'CurrentCurrency', 'HistCurrency',
        'PIT Date', 'Frequency', 'UpdateCode', 'FiscalPeriod', 'FYE Month',
        'ItemCode', 'Value', 'Str_FiscalPrd'
    ]

    # Build ordered list of period date/value columns
    freq_cols = []
    for i in range(1, 5):
        freq_cols += [f'Q{i}_Date', f'Q{i}']
    for i in range(1, 3):
        freq_cols += [f'S{i}_Date', f'S{i}']
    for i in range(1, 4):
        freq_cols += [f'T{i}_Date', f'T{i}']
    freq_cols += ['A_Date', 'A']

    # Final set of columns to keep in output
    keep_cols = (
        [c for c in base_cols if c in working.columns] +
        ['TrueValue', 'AnnPITValue', 'AnnPITValue_Pct', 'HasFutureDateError'] +
        [c for c in freq_cols if c in working.columns]
    )

    # Helper columns to drop before export
    drop_cols = ['QNUM', 'SNUM', 'TNUM', 'TrueValue_Date']
    # Also drop all *_OriginFP columns
    drop_cols += [c for c in working.columns if c.endswith('_OriginFP')]
    working.drop(columns=[c for c in drop_cols if c in working.columns],
                 inplace=True, errors='ignore')

    # Reorder and restrict columns to the final layout
    mixed_processed = working.reindex(columns=keep_cols)

    # Sanity checks: necessary globals must exist
    assert 'Temp_file_path_DP' in globals(), "Temp_file_path_DP not found."
    assert 'base_output_filename' in globals(), "base_output_filename not found (set in Cell 0)."

    # Build full output path and save pipe-delimited file
    out_full = os.path.join(Temp_file_path_DP, f"{base_output_filename}.txt")
    mixed_processed.to_csv(out_full, sep='|', index=False)
    print("\nSaved full:", out_full)

    # Create a subset for lighter inspection
    subset_cols = ["ID", "PIT Date", "CompanyName", "HistCurrency",
                   "FiscalPeriod", "AnnPITValue"]
    # Only keep subset columns that actually exist
    subset_cols_existing = [col for col in subset_cols if col in mixed_processed.columns]
    subset_df = mixed_processed[subset_cols_existing].copy()
    out_subset = os.path.join(Temp_file_path_DP,
                              f"{base_output_filename}_subset.txt")
    subset_df.to_csv(out_subset, sep='|', index=False)
    print("Saved subset:", out_subset)
    # Explicitly delete subset_df to free memory
    del subset_df

    # -------------------------------------------------------------------------
    # 8) Row-accounting overview
    # -------------------------------------------------------------------------
    output_rows = len(mixed_processed)
    print("\n=== Row Accounting ===")
    print(f"Input rows:                     {input_rows:,}")
    print(f"Excludedby Frequency (E/L/R/U): {excluded_rows:,}")
    print(f"Dropped by quality (Pct rules): {dropped_quality_rows:,}")
    print(f"Output rows (final):            {output_rows:,}")
    # Sum up excluded + dropped + remaining and check against original count
    check_total = excluded_rows + dropped_quality_rows + output_rows
    print(f"Check: excluded + dropped + output = {check_total:,}")
    if check_total == input_rows:
        print("Row counts reconcile exactly.")
    else:
        print(f"Mismatch of {input_rows - check_total:+,} rows. "
              "Investigate upstream filtering or unexpected drops.")

    # Trigger garbage collection as a final cleanup step
    gc.collect()

else:
    # If the main input dataset is not available, skip all processing
    print("mixed_encoded not found or None; skipping.")

Input dataset contains 3,556,775 rows before processing.


=== Future-date check (period dates > PIT Date) ===
Per-label violations: {'A_Date': 0, 'Q1_Date': 0, 'Q2_Date': 0, 'Q3_Date': 0, 'Q4_Date': 0, 'S1_Date': 0, 'S2_Date': 0, 'T1_Date': 0, 'T2_Date': 0, 'T3_Date': 0}
Rows with ANY future-dated period value: 0

=== AnnPITValue_Pct summary (finite only) — BEFORE quality drop ===
         finite_rows: 1834011
                mean: 24484.88774335648
              median: 100.0
winsorized_mean_1pct: 98.53536041208515
                 p10: 100.0
                 p20: 100.0
                 p30: 100.0
                 p40: 100.0
                 p50: 100.0
                 p60: 100.0
                 p70: 100.0
                 p80: 100.0
                 p90: 100.0

Rows to drop due to AnnPITValue_Pct (±inf or >200 or <50): 51,482

=== AnnPITValue_Pct summary — AFTER quality drop ===
         finite_rows: 1782529
                mean: 100.26270636271235
              median: 100.0
winso

### Mixed 5

#### Set Index

In [74]:
# =====================================================================================
# SUMMARY
# =====================================================================================
# This code selects which Mixed_* dataset should be processed by choosing an index
# (e.g., Mixed_1, Mixed_2, ...). It then:
#
#   1. Ensures that a dictionary `mixed_vars` exists, mapping keys like "Mixed_1"
#      to item names.
#   2. Builds the key corresponding to the selected index and retrieves the
#      associated item name (`target_item_name`).
#   3. Prints which Mixed_* item was selected.
#   4. Constructs paths and filenames based on global variables and the selected item.
#   5. Ensures that the output directory exists by creating it if necessary.
#
# The goal is to centralize selection of a single Mixed_* dataset and prepare paths
# for downstream processing.


# === Select which Mixed_* item to run ===
mixed_index = 5  # Change this to process another dataset (e.g., 10)

# Validate that the dictionary of mixed item names exists
assert 'mixed_vars' in globals(), "mixed_vars dict not found in globals()."

# Build the key (e.g., "Mixed_1") and retrieve the associated item name
item_key = f"Mixed_{mixed_index}"
target_item_name = mixed_vars.get(item_key)
assert target_item_name, f"{item_key} not found in mixed_vars."

# Inform which item was selected
print(f"Selected: {item_key}  ->  ItemName: '{target_item_name}'")

# === Paths (reusing globals) ===
assert 'Temp_file_path_DP' in globals(), "Temp_file_path_DP not found."

# Construct the name of the input file for the selected item
file_name = f"work_subset_{target_item_name}.txt"
file_path = os.path.join(Temp_file_path_DP, file_name)

# Construct the base name for output files (will later be suffixed)
base_output_filename = f"Mixed_{target_item_name}_complete"

# Ensure the output directory exists; create it (including parent dirs) if needed
Path(Temp_file_path_DP).mkdir(parents=True, exist_ok=True)


Selected: Mixed_5  ->  ItemName: 'Interest_Expense___Total'


#### Import relevant data



In [75]:
# =====================================================================================
# SUMMARY
# =====================================================================================
# This block:
#   1. Announces the import of a full dataset for the given `target_item_name`.
#   2. Checks whether the file at `file_path` exists.
#   3. If it exists, calls `import_file_to_dataframe(file_path)` to load the data
#      into `mixed_raw`.
#   4. If the loaded DataFrame is non-empty, prints a success message including
#      the number of rows and shows the first few rows (via display or fallback
#      to text printing).
#   5. If the load fails or returns an empty DataFrame, prints a warning and
#      creates an empty DataFrame.
#   6. If the file does not exist, prints an error message and sets `mixed_raw`
#      to an empty DataFrame.
#   7. Finally, it runs `gc.collect()` to trigger garbage collection and free
#      memory.
#
# Note: All previous emoji symbols in the print statements have been removed.

# Inform the user that we are starting the import for this item
print(f"\nImporting full dataset for Item: '{target_item_name}' ...")

# Check if the file path exists on disk
if os.path.exists(file_path):
    # If the file exists, attempt to import it into a DataFrame
    mixed_raw = import_file_to_dataframe(file_path)

    # If the DataFrame is successfully loaded and not empty
    if mixed_raw is not None and not mixed_raw.empty:
        # Print a success message including row count with thousands separator
        print(f"Full dataset loaded successfully: {len(mixed_raw):,} rows total.")

        try:
            # Try to display the first few rows (Jupyter / IPython display)
            display(mixed_raw.head())
        except Exception:
            # If display is not available, fall back to printing as plain text
            print(mixed_raw.head().to_string(index=False))
    else:
        # If DataFrame is None or empty, warn and create an empty DataFrame
        print("Dataset appears empty or could not be loaded.")
        mixed_raw = pd.DataFrame()
else:
    # If the file does not exist, report an error and set mixed_raw to empty
    print(f"File not found: {file_path}")
    mixed_raw = pd.DataFrame()

# Run garbage collection to free up memory after the load attempt
gc.collect()



Importing full dataset for Item: 'Interest_Expense___Total' ...
Full dataset loaded successfully: 121,029 rows total.


Unnamed: 0,ID,CompanyName,ImplCountry,CurrentCurrency,HistCurrency,PIT Date,Frequency,UpdateCode,FiscalPeriod,FYE Month,ItemCode,Value
0,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1992,December,1075,11.207
1,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1993,December,1075,15.753
2,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1994,December,1075,13.255
3,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1996-05-03,A,3,1995,December,1075,30.678
4,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1998-07-03,A,3,1996,December,1075,32.078


0

#### Encode Frequency Code (Check of output required!)

In [76]:
# =====================================================================================
# SUMMARY
# =====================================================================================
# This snippet provides:
#
# 1. A helper function `last2` that returns the last two digits of a number as a
#    zero-padded string (for building YY strings).
#
# 2. A function `add_str_fiscalprd(df)` which:
#    - Works on a copy of an input DataFrame containing financial periods.
#    - Normalizes the 'Frequency' (upper-case, no missing).
#    - Stores the original 'FiscalPeriod' and converts it to numeric.
#    - Creates a string representation 'Str_FiscalPrd' depending on the frequency:
#         - Q/C/E/R: quarter-based ("QnYyy")
#         - A/B: annual ("Yyy")
#         - F/S: semiannual ("SnYyy")
#         - K/T/L/U: trimester-like ("TnYyy")
#    - Derives an implied full-year integer 'ImplFiscPer_Calculated' from the
#      two-digit year (80–99 => 19xx, else 20xx).
#    - For annual rows (A/B), checks discrepancies between original
#      'FiscalPeriod' and implied full-year; prints a small preview & total count.
#    - Overwrites 'FiscalPeriod' with 'ImplFiscPer_Calculated' and drops helper
#      columns.
#
# 3. A small driver block that:
#    - Checks that `mixed_raw` exists and is non-empty.
#    - Applies `add_str_fiscalprd` to produce `mixed_encoded`.
#    - Displays a head preview or prints a message and sets `mixed_encoded = None`
#      if input is missing/empty.

def last2(n):
    """Return last two digits as zero-padded string, or None if NaN."""
    # If input is NaN (or pandas-style missing), return None (no digits)
    if pd.isna(n):
        return None
    # Convert to int, format as 4-digit zero-padded string, take last 2 chars
    # Example: n=2023 -> "2023"[-2:] -> "23"
    return f"{int(n):04d}"[-2:]


def add_str_fiscalprd(df):
    """
    Creates 'Str_FiscalPrd' and overwrites 'FiscalPeriod' with an implied full year.
    Also prints discrepancies for Annual (A,B) rows between original and calculated year.

    The mapping logic:
      - Quarterly (C, Q, E, R):   fp encodes a quarter index -> "Q{1-4}Y{yy}"
      - Annual   (A, B):         fp is the year itself      -> "Y{yy}"
      - Semiannual (F, S):       fp encodes half-year       -> "S{1-2}Y{yy}"
      - Trimester-like (K, T,
         L, U):                  fp encodes trimester       -> "T{1-3}Y{yy}"
    Then we re-infer the full year from the YY part with 19xx/20xx rule.
    """
    # Work on a copy to avoid mutating the original input DataFrame
    df = df.copy()

    # Normalize frequency codes:
    # - uppercase for consistency
    # - fill missing values with empty string
    df["Frequency"] = df["Frequency"].str.upper().fillna("")

    # Store original FiscalPeriod in a separate column for later comparison
    df['Original_FiscalPeriod'] = df['FiscalPeriod']

    # Convert FiscalPeriod to numeric, coerce errors to NaN
    fp = pd.to_numeric(df["FiscalPeriod"], errors="coerce")

    # Masks for different frequency groups
    m_quarter = df["Frequency"].isin(["C", "Q", "E", "R"])  # quarter-based
    m_AB      = df["Frequency"].isin(["A", "B"])            # annual
    m_FS      = df["Frequency"].isin(["F", "S"])            # semiannual
    m_KTLU    = df["Frequency"].isin(["K", "T", "L", "U"])  # trimester-like

    # Initialize Str_FiscalPrd as NaN; we will fill per frequency group
    df["Str_FiscalPrd"] = np.nan

    # -------------------------
    # Quarter-based encoding
    # -------------------------
    # Quarter number: (fp % 4) + 1 => yields 1..4 (if fp is integer-based encoding)
    q_part = ((fp % 4) + 1).where(m_quarter)
    # Year part for quarter: integer division by 4, then take last two digits
    q_year = (fp // 4).where(m_quarter).apply(last2)
    # Assign "Q{quarter}Y{yy}" for quarter frequencies
    df.loc[m_quarter, "Str_FiscalPrd"] = (
        "Q" +
        q_part.astype("Int64").astype(str) +
        "Y" +
        q_year.fillna('')
    )

    # -------------------------
    # Annual encoding (A, B)
    # -------------------------
    # Year part is fp itself for A/B; take last two digits via last2
    ab_year = fp.where(m_AB).apply(last2)
    # Assign "Y{yy}" for annual frequencies
    df.loc[m_AB, "Str_FiscalPrd"] = "Y" + ab_year.fillna('')

    # -------------------------
    # Semiannual encoding (F, S)
    # -------------------------
    # Semester number: (fp % 2) + 1 => 1 or 2
    fs_sem  = ((fp % 2) + 1).where(m_FS)
    # Year part: fp // 2, then last two digits
    fs_year = (fp // 2).where(m_FS).apply(last2)
    # Assign "S{sem}Y{yy}" for F/S
    df.loc[m_FS, "Str_FiscalPrd"] = (
        "S" +
        fs_sem.astype("Int64").astype(str) +
        "Y" +
        fs_year.fillna('')
    )

    # -------------------------
    # Trimester-like encoding (K, T, L, U)
    # -------------------------
    # Term number: (fp % 3) + 1 => 1..3
    t_term  = ((fp % 3) + 1).where(m_KTLU)
    # Year part: fp // 3, then last two digits
    t_year  = (fp // 3).where(m_KTLU).apply(last2)
    # Assign "T{term}Y{yy}" for K/T/L/U frequencies
    df.loc[m_KTLU, "Str_FiscalPrd"] = (
        "T" +
        t_term.astype("Int64").astype(str) +
        "Y" +
        t_year.fillna('')
    )

    # ---------------------------------------------------------------------
    # Implied full year from Str_FiscalPrd (19xx / 20xx reconstruction)
    # ---------------------------------------------------------------------
    # Extract two-digit year from "Yyy" pattern, e.g., "Q1Y23" -> "23"
    year_part = df['Str_FiscalPrd'].str.extract(r'Y(\d{2})', expand=False)
    # Convert that to numeric (NaN if not parseable)
    year_numeric = pd.to_numeric(year_part, errors='coerce')

    # Map two-digit year to full year:
    #  - 80–99 => 19xx
    #  - else  => 20xx
    df['ImplFiscPer_Calculated'] = year_numeric.apply(
        lambda x: int(f"19{int(x):02d}") if pd.notna(x) and x >= 80
        else (int(f"20{int(x):02d}") if pd.notna(x) else np.nan)
    )

    # ---------------------------------------------------------------------
    # Discrepancies check for Annual frequencies (A, B)
    # ---------------------------------------------------------------------
    # Work on a subset of annual frequencies only
    annual_rows_for_check = df[m_AB].copy()

    # Build boolean mask where implied full year does NOT match the original
    discrepancy_mask_annual = ~(
        # Either they match numerically...
        (annual_rows_for_check['ImplFiscPer_Calculated'] ==
         pd.to_numeric(annual_rows_for_check['Original_FiscalPeriod'], errors='coerce'))
        |
        # ...or both are NaN
        (annual_rows_for_check['ImplFiscPer_Calculated'].isna() &
         pd.to_numeric(annual_rows_for_check['Original_FiscalPeriod'], errors='coerce').isna())
    )

    # Subset rows with discrepancies
    discrepancy_rows = annual_rows_for_check[discrepancy_mask_annual].copy()

    # If we have any, show a short sample and the total count
    if not discrepancy_rows.empty:
        print("\nDiscrepancies found between original FiscalPeriod and calculated ImplFiscPer for Annual (A, B) frequencies:")
        display(
            discrepancy_rows[
                ['ID', 'Frequency', 'Original_FiscalPeriod', 'Str_FiscalPrd', 'ImplFiscPer_Calculated']
            ].head()
        )
        print(f"Total discrepancies found for Annual frequencies: {len(discrepancy_rows)}")
    else:
        print("\nNo discrepancies found between original FiscalPeriod and calculated ImplFiscPer for Annual (A, B) frequencies.")

    # ---------------------------------------------------------------------
    # Overwrite FiscalPeriod and drop temporary helper columns
    # ---------------------------------------------------------------------
    # Replace 'FiscalPeriod' with the implied full-year value we just calculated
    df['FiscalPeriod'] = df['ImplFiscPer_Calculated']
    # Drop helper columns not needed downstream
    df.drop(columns=['Original_FiscalPeriod', 'ImplFiscPer_Calculated'], inplace=True)

    # Return the modified DataFrame
    return df


# =============================================================================
# Driver: apply encoding to mixed_raw if present and non-empty
# =============================================================================
if 'mixed_raw' in globals() and mixed_raw is not None and not mixed_raw.empty:
    # Inform which item we are encoding for
    print(f"Applying encoding to Mixed dataset for '{target_item_name}' ...")
    # Apply fiscal period encoding
    mixed_encoded = add_str_fiscalprd(mixed_raw)
    # Show a preview of the encoded data
    display(mixed_encoded.head())
else:
    # If mixed_raw is missing or empty, we skip and mark mixed_encoded as None
    print("mixed_raw not found or empty. Cannot perform encoding.")
    mixed_encoded = None


Applying encoding to Mixed dataset for 'Interest_Expense___Total' ...


  df.loc[m_quarter, "Str_FiscalPrd"] = (



No discrepancies found between original FiscalPeriod and calculated ImplFiscPer for Annual (A, B) frequencies.


Unnamed: 0,ID,CompanyName,ImplCountry,CurrentCurrency,HistCurrency,PIT Date,Frequency,UpdateCode,FiscalPeriod,FYE Month,ItemCode,Value,Str_FiscalPrd
0,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1992,December,1075,11.207,Y92
1,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1993,December,1075,15.753,Y93
2,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1994,December,1075,13.255,Y94
3,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1996-05-03,A,3,1995,December,1075,30.678,Y95
4,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1998-07-03,A,3,1996,December,1075,32.078,Y96


#### Annualize data with most recent information (Check of output required!)

In [77]:
# @title
# =====================================================================================
# SUMMARY
# =====================================================================================
# This script takes an input DataFrame `mixed_encoded` (if present in the global scope)
# that contains financial time-series data (per company, item, currency, fiscal period,
# and PIT Date). It then:
#
# 1. Cleans and standardizes key columns (dates, numeric types, string IDs).
# 2. Excludes rows with certain frequencies (E/L/R/U).
# 3. Parses fiscal period strings into quarter/semester/trimester indicators (QNUM/SNUM/TNUM).
# 4. Uses a custom, vectorized "as-of" join (`asof_numpy`) to attach the most recent
#    annual, quarterly, semiannual, and trimester values for each (ID, HistCurrency,
#    ItemCode, FiscalPeriod) up to each row’s PIT Date.
# 5. Builds "full-year" candidate values from:
#       - actual annuals (A),
#       - sum of Q1..Q4 (Q4 proxy),
#       - sum of S1..S2 (S2 proxy),
#       - sum of T1..T3 (T3 proxy),
#    and selects the best candidate based on priorities and relationship to the row’s
#    fiscal period (same-year vs prior-year).
# 6. Computes an annual PIT-based metric `AnnPITValue` and compares it to the “true”
#    annual value (`TrueValue`) to derive a percentage `AnnPITValue_Pct` for QC.
# 7. Performs quality checks:
#       - Ensures no period-date is after the PIT Date.
#       - Drops rows whose `AnnPITValue_Pct` is outside the range [50, 200] or infinite.
# 8. Keeps a curated set of columns, drops helper columns, and saves:
#       - a full output file
#       - a subset file with key columns for quick inspection.
# 9. Prints row-accounting stats and frees some memory.
#
# If `mixed_encoded` is not defined or is None, it simply prints a message and exits.

import pandas as pd
import numpy as np
import os
import gc
from datetime import timedelta            # <--- Added to fix NameError
from scipy.stats.mstats import winsorize  # <--- Added to fix NameError

# Enable pandas "copy-on-write" behavior to reduce unintended chained assignment effects
pd.options.mode.copy_on_write = True

# ---------- Helper: fast as-of (right.PIT <= left.PIT) ----------

def _key(fr, cols):
    """
    Helper function to build a string key from multiple columns.
    For each row, join the values of 'cols' with '||'.
    """
    return fr[cols].astype(str).agg('||'.join, axis=1)


def asof_numpy(left_df: pd.DataFrame, right_df: pd.DataFrame, by_cols: list[str]):
    """
    For each row in left_df, find the latest (as-of) Value from right_df
    with matching by_cols and right_df['PIT Date'] <= left_df['PIT Date'].

    This is a manual, NumPy-based implementation of an "as-of merge" grouped by `by_cols`.
    It:
    - filters out rows with missing required fields,
    - normalizes PIT Date to daily precision,
    - groups right_df by composite key of `by_cols`,
    - for each left row, binary-searches into the matching right group by PIT Date,
      picking the last date <= the left PIT Date,
    - returns two aligned arrays: (values, dates) for the left_df rows.
    """
    # Initialize outputs with NaNs and NaT for all left_df rows
    out_vals  = np.full(len(left_df), np.nan, dtype='float64')
    out_dates = np.full(len(left_df), 'NaT', dtype='datetime64[ns]')

    # Required columns on each side
    left_req  = by_cols + ['PIT Date']
    right_req = by_cols + ['PIT Date', 'Value']

    # Mask: rows that have all required fields non-null
    lmask = left_df[left_req].notna().all(axis=1)
    rmask = right_df[right_req].notna().all(axis=1)

    # If either side has no valid rows, return the default empty outputs
    if not lmask.any() or not rmask.any():
        return out_vals, out_dates

    # Work on filtered copies only (avoid side effects)
    l = left_df.loc[lmask, left_req].copy()
    r = right_df.loc[rmask, right_req].copy()

    # Normalize PIT Date columns to datetime at day precision
    l['PIT Date'] = pd.to_datetime(l['PIT Date'], errors='coerce').dt.floor('D')
    r['PIT Date'] = pd.to_datetime(r['PIT Date'], errors='coerce').dt.floor('D')

    # Build composite keys for group-level match (based on by_cols)
    l['__k'] = _key(l, by_cols)
    r['__k'] = _key(r, by_cols)

    # Sort right side by key and date to enable binary search per key
    r = r.sort_values(['__k', 'PIT Date']).reset_index(drop=True)

    # Extract NumPy arrays for right side
    rk   = r['__k'].to_numpy()
    rdt  = r['PIT Date'].to_numpy()
    rval = r['Value'].to_numpy()

    # Find unique keys and first index for each key in the sorted right side
    uniq, first = np.unique(rk, return_index=True)

    # Build a dict: key -> (dates array, values array) slice
    slices = {}
    for i, k in enumerate(uniq):
        s = first[i]  # start of this key
        e = first[i + 1] if i + 1 < len(first) else len(r)  # end of this key
        slices[k] = (rdt[s:e], rval[s:e])

    # Left side indices and arrays
    l_idx = l.index.to_numpy()
    lk    = l['__k'].to_numpy()
    ldt   = l['PIT Date'].to_numpy()

    # Sort left side by key (stable mergesort to preserve original row order within key)
    order = np.argsort(lk, kind='mergesort')
    sk, sd, sp = lk[order], ldt[order], l_idx[order]

    # i iterates over the sorted left rows
    i = 0
    n = len(sk)
    while i < n:
        k = sk[i]  # current composite key
        j = i + 1
        # Find contiguous block [i:j) for this key
        while j < n and sk[j] == k:
            j += 1

        # Dates and original positions of this key’s left rows
        block_dates = sd[i:j]
        block_pos   = sp[i:j]

        # If we have matching right-hand slices for this key, do the as-of search
        if k in slices:
            r_dates, r_vals = slices[k]
            # searchsorted(..., 'right') - 1 gives index of last r_date <= block_date
            pos   = np.searchsorted(r_dates, block_dates, side='right') - 1
            valid = pos >= 0  # positions where such a date exists
            if np.any(valid):
                # Fill outputs for left rows where we found a valid match
                out_vals[block_pos[valid]]  = r_vals[pos[valid]]
                out_dates[block_pos[valid]] = r_dates[pos[valid]]
        # Move to next key block
        i = j

    return out_vals, out_dates


# ---------- Small helpers ----------

def pctile(s, q):
    """
    Safe percentile calculation: returns quantile q of Series `s`,
    or NaN if quantile fails.
    """
    try:
        return s.quantile(q, interpolation='linear')
    except Exception:
        return np.nan


def summarize_pct(series: pd.Series):
    """
    Computes a summary of finite values of a series:
      - count
      - mean, median
      - 1% winsorized mean
      - selected deciles (p10, p20, ..., p90)
    Infinite values are treated as NaN and removed.
    If no finite values remain, returns an empty dict.
    """
    # Replace +/-inf with NaN and drop all NaNs
    s = series.replace([np.inf, -np.inf], np.nan).dropna()
    if s.empty:
        return {}
    
    # FIX APPLIED HERE: .to_numpy().copy() ensures the array is writable for winsorize
    w_mean = winsorize(s.to_numpy().copy(), limits=[0.01, 0.01]).mean()
    
    return {
        "finite_rows": len(s),
        "mean": s.mean(),
        "median": s.median(),
        "winsorized_mean_1pct": w_mean,
        "p10": pctile(s, 0.10), "p20": pctile(s, 0.20), "p30": pctile(s, 0.30),
        "p40": pctile(s, 0.40), "p50": pctile(s, 0.50), "p60": pctile(s, 0.60),
        "p70": pctile(s, 0.70), "p80": pctile(s, 0.80), "p90": pctile(s, 0.90)
    }


# ---------- Priority for full-year candidates ----------

# Fixed priority mapping for full-year candidates:
#   'A'  : actual annual value
#   'Q4' : annual proxy from four quarters
#   'T3' : annual proxy from three trimesters
#   'S2' : annual proxy from two semesters
_PERIOD_PRIORITY = {
    'A': 100,  # highest priority: actual annual
    'Q4': 90,  # then Q1+Q2+Q3+Q4
    'T3': 80,  # then T1+T2+T3
    'S2': 70,  # then S1+S2
}

# ============================ MAIN ============================

# Only run the main logic if a global DataFrame `mixed_encoded` exists and is not None
if 'mixed_encoded' in globals() and mixed_encoded is not None:
    # Count initial input rows
    input_rows = len(mixed_encoded)
    print(f"Input dataset contains {input_rows:,} rows before processing.\n")

    # Work on a copy of the input dataset
    working = mixed_encoded.copy()

    # -------------------------------------------------------------------------
    # Exclude certain frequencies (E/L/R/U)
    # -------------------------------------------------------------------------
    # Create mask of rows whose Frequency is one of E, L, R, U (case-insensitive)
    excl_mask = working['Frequency'].astype(str).str.upper().isin(['E', 'L', 'R', 'U'])
    # Count how many rows will be excluded
    excluded_rows = int(excl_mask.sum())
    # Keep only rows that are NOT in the exclusion set
    working = working.loc[~excl_mask].copy()

    # -------------------------------------------------------------------------
    # Set dtypes and normalize important columns
    # -------------------------------------------------------------------------
    # Convert PIT Date to datetime (coerce errors -> NaT), floor to day
    working['PIT Date']     = pd.to_datetime(working['PIT Date'], errors='coerce').dt.floor('D')
    # FiscalPeriod: numeric (e.g., 2021, 2022, ...)
    working['FiscalPeriod'] = pd.to_numeric(working['FiscalPeriod'], errors='coerce')
    # Value: numeric (float)
    working['Value']        = pd.to_numeric(working['Value'], errors='coerce')

    # Convert key ID / code columns to string to ensure consistency
    for c in ['ID', 'HistCurrency', 'ItemCode', 'Frequency', 'Str_FiscalPrd']:
        if c in working.columns:
            working[c] = working[c].astype(str)

    # -------------------------------------------------------------------------
    # Parse Q/S/T markers from Str_FiscalPrd (like 'Q1Y2023')
    # -------------------------------------------------------------------------
    # Extract quarter number Q1..Q4 from e.g. "Q1Y2023" into QNUM
    working['QNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^Q([1-4])Y', expand=False),
        errors='coerce'
    )
    # Extract semiannual number S1..S2 into SNUM
    working['SNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^S([1-2])Y', expand=False),
        errors='coerce'
    )
    # Extract trimester number T1..T3 into TNUM
    working['TNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^T([1-3])Y', expand=False),
        errors='coerce'
    )

    # -------------------------------------------------------------------------
    # Ensure period columns exist (Q1..Q4, S1..S2, T1..T3, A + their date cols)
    # -------------------------------------------------------------------------
    # Create value columns for Q1..Q4, S1..S2, T1..T3, A if they are missing
    for c in [*(f'Q{i}' for i in range(1, 5)),
              *(f'S{i}' for i in range(1, 3)),
              *(f'T{i}' for i in range(1, 4)),
              'A']:
        if c not in working.columns:
            working[c] = np.nan

    # Create corresponding *_Date columns if missing
    for c in [*(f'Q{i}_Date' for i in range(1, 5)),
              *(f'S{i}_Date' for i in range(1, 3)),
              *(f'T{i}_Date' for i in range(1, 4)),
              'A_Date']:
        if c not in working.columns:
            working[c] = pd.NaT

    # Base key for many of the as-of mappings
    base_keys = ['ID', 'HistCurrency', 'ItemCode', 'FiscalPeriod']

    # -------------------------------------------------------------------------
    # 1) Derive TrueValue from annuals (A/B frequencies)
    # -------------------------------------------------------------------------
    # Mask annual-like rows where Value is present
    mask_annual = working['Frequency'].isin(['A', 'B']) & working['Value'].notna()
    # annual_src: one row per (ID, FiscalPeriod, HistCurrency) with last PIT Date
    annual_src = (
        working.loc[mask_annual,
                    ['ID', 'FiscalPeriod', 'HistCurrency', 'PIT Date', 'Value']]
        .sort_values(['ID', 'FiscalPeriod', 'HistCurrency', 'PIT Date'])
        .drop_duplicates(['ID', 'FiscalPeriod', 'HistCurrency'], keep='last')
        .rename(columns={'Value': 'TrueValue', 'PIT Date': 'TrueValue_Date'})
    )
    # Left-join true annual value back onto working
    working = working.merge(
        annual_src,
        on=['ID', 'FiscalPeriod', 'HistCurrency'],
        how='left'
    )

    # -------------------------------------------------------------------------
    # 2) As-of mapping (same FiscalPeriod) for A/Q/S/T
    # -------------------------------------------------------------------------

    # ----- Annual -----
    # Source rows for annual frequencies A/B
    src_A = working.loc[
        working['Frequency'].isin(['A', 'B']) & working['Value'].notna(),
        base_keys + ['PIT Date', 'Value']
    ].copy()
    # As-of join: for each working row, get most recent annual value by PIT Date
    vA, dA = asof_numpy(working, src_A, by_cols=base_keys)
    working['A'], working['A_Date'] = vA, dA
    # Origin fiscal period of annual value (same as row's FiscalPeriod when present)
    working['A_OriginFP'] = np.where(
        working['A'].notna(), working['FiscalPeriod'], np.nan
    )

    # ----- Quarterly -----
    # Source rows for quarterly frequencies (Q/C) with valid QNUM
    src_Q = working.loc[
        working['Frequency'].isin(['Q', 'C']) & working['QNUM'].notna(),
        base_keys + ['QNUM', 'PIT Date', 'Value']
    ].copy()
    for q in (1, 2, 3, 4):
        # Restrict to a specific quarter q
        rv = src_Q[src_Q['QNUM'] == q].drop(columns=['QNUM'])
        # As-of join for that quarter
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'Q{q}', f'Q{q}_Date'
        working[col], working[dcol] = v, d
        # Origin FP column for that quarter
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        # Fill origin FP only where quarter value is non-null and origin not yet set
        m = working[col].notna() & working[ocol].isna()
        working.loc[m, ocol] = working.loc[m, 'FiscalPeriod']

    # ----- Semiannual -----
    # Source rows for semiannual frequencies (S/F) with valid SNUM
    src_S = working.loc[
        working['Frequency'].isin(['S', 'F']) & working['SNUM'].notna(),
        base_keys + ['SNUM', 'PIT Date', 'Value']
    ].copy()
    for s in (1, 2):
        rv = src_S[src_S['SNUM'] == s].drop(columns=['SNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'S{s}', f'S{s}_Date'
        working[col], working[dcol] = v, d
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        m = working[col].notna() & working[ocol].isna()
        working.loc[m, ocol] = working.loc[m, 'FiscalPeriod']

    # ----- Trimester -----
    # Source rows for trimester frequencies (T/K) with valid TNUM
    src_T = working.loc[
        working['Frequency'].isin(['T', 'K']) & working['TNUM'].notna(),
        base_keys + ['TNUM', 'PIT Date', 'Value']
    ].copy()
    for t in (1, 2, 3):
        rv = src_T[src_T['TNUM'] == t].drop(columns=['TNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'T{t}', f'T{t}_Date'
        working[col], working[dcol] = v, d
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        m = working[col].notna() & working[ocol].isna()
        working.loc[m, ocol] = working.loc[m, 'FiscalPeriod']

    # -------------------------------------------------------------------------
    # 3) Prepare labels & normalize dates (NO prev-year fill, NO forward-fill)
    # -------------------------------------------------------------------------
    # Sort working data consistently for downstream calculations
    working = working.sort_values(['ID', 'HistCurrency', 'FiscalPeriod', 'PIT Date'])

    # List of all period value columns
    value_cols_all  = [f'Q{i}' for i in range(1, 5)] + \
                      [f'S{i}' for i in range(1, 3)] + \
                      [f'T{i}' for i in range(1, 4)] + ['A']
    # Corresponding date columns
    date_cols_all   = [f'{c}_Date' for c in value_cols_all]
    # Corresponding origin FP columns
    origin_cols_all = [f'{c}_OriginFP' for c in value_cols_all]

    # Ensure that all date columns are proper datetimes (floor to day)
    # Note: explicitly no groupby-forward-fill here – only asof-filled values remain
    for c in date_cols_all:
        if c in working.columns:
            working[c] = pd.to_datetime(working[c], errors='coerce').dt.floor('D')

    # -------------------------------------------------------------------------
    # 4) Build full-year candidates from fixed sets (Q1–Q4, S1–S2, T1–T3)
    # -------------------------------------------------------------------------
    def full_year_from_fixed(row, labels, pit, cutoff):
        """
        Fixed full-year from given labels (e.g. Q1..Q4, S1..S2, T1..T3):

          - Requires ALL labels to have:
              value, date, origin_fp.
          - All dates must be within [cutoff, pit].
          - origin_fp = max(origin_fp_i)  (the 'newest year' across components).

        Returns (dt, val, origin_fp) or (NaT, NaN, NaN) if any requirement fails.
        """
        vals, dts, fps = [], [], []
        for lbl in labels:
            v = row.get(lbl, np.nan)
            d = row.get(f'{lbl}_Date', pd.NaT)
            o = row.get(f'{lbl}_OriginFP', np.nan)

            # Require non-null for value, date, origin FP
            if pd.isna(v) or pd.isna(d) or pd.isna(o):
                return pd.NaT, np.nan, np.nan

            # Normalize date and check it lies within [cutoff, pit]
            d = pd.to_datetime(d, errors='coerce')
            if pd.isna(d) or not (cutoff <= d <= pit):
                return pd.NaT, np.nan, np.nan

            vals.append(float(v))
            dts.append(d)
            fps.append(int(o))

        # Sum all component values, pick latest date and max origin FP
        total_val = float(np.nansum(vals))
        latest_dt = max(dts)
        origin_fp = max(fps)
        return latest_dt, total_val, origin_fp

    def pick_annpit_sum_with_origin(row):
        """
        For each row, pick the best annual PIT-based value (AnnPITValue) by:

        1) Building a time window [PIT - 365 days, PIT].
        2) Generating candidate annual values:
           - actual annual A (if within window and with origin FP),
           - Q4 proxy from Q1..Q4,
           - S2 proxy from S1..S2,
           - T3 proxy from T1..T3,
           each with associated priority and origin FP.
        3) Ranking candidates according to:
           - Same-year A (highest),
           - Same-year proxies (Q4/T3/S2),
           - Prior-year A,
           - Prior-year proxies,
           - Otherwise: any remaining candidate by (priority, latest date).
        4) Returning the chosen candidate's value (keeping zeros, but filtering NaNs).
        """
        pit = row['PIT Date']
        if pd.isna(pit):
            return np.nan
        cutoff = pit - timedelta(days=365)  # Now works because timedelta is imported

        # Current row's fiscal period, cast to int if possible
        fp = row.get('FiscalPeriod', np.nan)
        try:
            fp_int = int(fp) if not pd.isna(fp) else None
        except Exception:
            fp_int = None

        # Collect candidate tuples: (label, priority, date, value, origin_fp)
        candidates = []

        # --- Candidate A: actual annual (0 is allowed)
        A_val = row.get('A', np.nan)
        A_dt  = row.get('A_Date', pd.NaT)
        A_ofp = row.get('A_OriginFP', np.nan)
        if pd.notna(A_val) and pd.notna(A_dt) and not pd.isna(A_ofp):
            A_dt = pd.to_datetime(A_dt, errors='coerce')
            if pd.notna(A_dt) and (cutoff <= A_dt <= pit):
                candidates.append(('A', _PERIOD_PRIORITY['A'], A_dt, float(A_val), int(A_ofp)))

        # --- Candidate Q4: sum of Q1..Q4 (0 allowed)
        q4_dt, q4_val, q4_fp = full_year_from_fixed(
            row, [f'Q{i}' for i in range(1, 5)], pit, cutoff
        )
        if pd.notna(q4_val) and pd.notna(q4_dt) and not pd.isna(q4_fp):
            candidates.append(('Q4', _PERIOD_PRIORITY['Q4'], q4_dt, float(q4_val), int(q4_fp)))

        # --- Candidate S2: sum of S1..S2 (0 allowed)
        s2_dt, s2_val, s2_fp = full_year_from_fixed(
            row, [f'S{i}' for i in range(1, 3)], pit, cutoff
        )
        if pd.notna(s2_val) and pd.notna(s2_dt) and not pd.isna(s2_fp):
            candidates.append(('S2', _PERIOD_PRIORITY['S2'], s2_dt, float(s2_val), int(s2_fp)))

        # --- Candidate T3: sum of T1..T3 (0 allowed)
        t3_dt, t3_val, t3_fp = full_year_from_fixed(
            row, [f'T{i}' for i in range(1, 4)], pit, cutoff
        )
        if pd.notna(t3_val) and pd.notna(t3_dt) and not pd.isna(t3_fp):
            candidates.append(('T3', _PERIOD_PRIORITY['T3'], t3_dt, float(t3_val), int(t3_fp)))

        # If no candidates, return NaN
        if not candidates:
            return np.nan

        # Filter out NaN-valued candidates (keep 0-valued ones)
        def valid(seq):
            return [c for c in seq if not np.isnan(c[3])]

        # Relationship between row's fiscal period and candidate's origin FP:
        # same / prior / other / unknown
        def rel(c):
            _, _, _, _, ofp = c
            if fp_int is None or ofp is None:
                return 'unknown'
            if ofp == fp_int:
                return 'same'
            if ofp == fp_int - 1:
                return 'prior'
            return 'other'

        # 1) Same-year actual annual A (max by date)
        same_A = valid(c for c in candidates if c[0] == 'A' and rel(c) == 'same')
        if same_A:
            best = max(same_A, key=lambda x: x[2])
            return best[3]

        # 2) Same-year proxies (Q4/T3/S2), max by (priority, date)
        same_proxies = valid(
            c for c in candidates
            if c[0] in ('Q4', 'S2', 'T3') and rel(c) == 'same'
        )
        if same_proxies:
            best = max(same_proxies, key=lambda x: (x[1], x[2]))
            return best[3]

        # 3) Prior-year actual annual A
        prior_A = valid(c for c in candidates if c[0] == 'A' and rel(c) == 'prior')
        if prior_A:
            best = max(prior_A, key=lambda x: x[2])
            return best[3]

        # 4) Prior-year proxies (Q4/S2/T3) if no prior A
        prior_proxies = valid(
            c for c in candidates
            if c[0] in ('Q4', 'S2', 'T3') and rel(c) == 'prior'
        )
        if prior_proxies:
            best = max(prior_proxies, key=lambda x: (x[1], x[2]))
            return best[3]

        # 5) Fallback: any candidate (other/unknown) by (priority, date)
        others = valid(candidates)
        if others:
            best = max(others, key=lambda x: (x[1], x[2]))
            return best[3]

        # Final fallback: 0.0 (should rarely be reached)
        return 0.0

    # Apply the selection function row-wise to produce AnnPITValue
    working['AnnPITValue'] = working.apply(pick_annpit_sum_with_origin, axis=1)

    # -------------------------------------------------------------------------
    # 5) QC: Future-date check + PRE-DROP stats
    # -------------------------------------------------------------------------
    # Columns whose dates should not exceed PIT Date
    date_cols = [
        'A_Date',
        'Q1_Date', 'Q2_Date', 'Q3_Date', 'Q4_Date',
        'S1_Date', 'S2_Date',
        'T1_Date', 'T2_Date', 'T3_Date'
    ]
    # Restrict to ones actually present
    present = [c for c in date_cols if c in working.columns]

    viol_counts = {}  # per-label violation counts
    # Mask for rows with any future-dated period
    any_mask = pd.Series(False, index=working.index)

    for c in present:
        # A violation is when period date > PIT Date (both need to be non-null)
        m = (
            working[c].notna() &
            working['PIT Date'].notna() &
            (pd.to_datetime(working[c], errors='coerce') > working['PIT Date'])
        )
        viol_counts[c] = int(m.sum())
        any_mask |= m  # accumulate violations across columns

    total_future_viol = int(any_mask.sum())
    print("\n=== Future-date check (period dates > PIT Date) ===")
    print("Per-label violations:", viol_counts)
    print(f"Rows with ANY future-dated period value: {total_future_viol}")
    # Flagrows with at least one future-date error
    working['HasFutureDateError'] = any_mask

    # -------------------------------------------------------------------------
    # 6) AnnPITValue_Pct + quality drop
    # -------------------------------------------------------------------------
    # Compute AnnPITValue as % of TrueValue (only when TrueValue != 0)
    working['AnnPITValue_Pct'] = np.where(
        working['AnnPITValue'].notna() &
        working['TrueValue'].notna() &
        (working['TrueValue'] != 0),
        (working['AnnPITValue'] / working['TrueValue']) * 100,
        np.nan
    )

    # Stats before dropping low-quality rows
    pre_stats = summarize_pct(working['AnnPITValue_Pct'])
    print("\n=== AnnPITValue_Pct summary (finite only) — BEFORE quality drop ===")
    for k, v in pre_stats.items():
        print(f"{k:>20}: {v}")

    # Build masks for dropping:
    pct = working['AnnPITValue_Pct']
    is_inf = np.isinf(pct)  # infinite percentages
    is_finite = np.isfinite(pct)
    # Out-of-range if % > 200 or % < 50 (but finite)
    out_of_range = is_finite & ((pct > 200) | (pct < 50))
    # Rows to drop: infinite or out-of-range values
    to_drop_quality = is_inf | out_of_range

    dropped_quality_rows = int(to_drop_quality.sum())
    print(f"\nRows to drop due to AnnPITValue_Pct (±inf or >200 or <50): {dropped_quality_rows:,}")

    # Keep only rows that passed the quality filter
    working = working.loc[~to_drop_quality].copy()

    # Stats after dropping
    post_stats = summarize_pct(working['AnnPITValue_Pct'])
    print("\n=== AnnPITValue_Pct summary — AFTER quality drop ===")
    if post_stats:
        for k, v in post_stats.items():
            print(f"{k:>20}: {v}")
    else:
        print("No finite values remain after the quality drop.")

    # -------------------------------------------------------------------------
    # 7) Final columns & save
    # -------------------------------------------------------------------------
    # Base descriptive columns to keep (if present)
    base_cols = [
        'ID', 'CompanyName', 'ImplCountry', 'CurrentCurrency', 'HistCurrency',
        'PIT Date', 'Frequency', 'UpdateCode', 'FiscalPeriod', 'FYE Month',
        'ItemCode', 'Value', 'Str_FiscalPrd'
    ]

    # Build ordered list of period date/value columns
    freq_cols = []
    for i in range(1, 5):
        freq_cols += [f'Q{i}_Date', f'Q{i}']
    for i in range(1, 3):
        freq_cols += [f'S{i}_Date', f'S{i}']
    for i in range(1, 4):
        freq_cols += [f'T{i}_Date', f'T{i}']
    freq_cols += ['A_Date', 'A']

    # Final set of columns to keep in output
    keep_cols = (
        [c for c in base_cols if c in working.columns] +
        ['TrueValue', 'AnnPITValue', 'AnnPITValue_Pct', 'HasFutureDateError'] +
        [c for c in freq_cols if c in working.columns]
    )

    # Helper columns to drop before export
    drop_cols = ['QNUM', 'SNUM', 'TNUM', 'TrueValue_Date']
    # Also drop all *_OriginFP columns
    drop_cols += [c for c in working.columns if c.endswith('_OriginFP')]
    working.drop(columns=[c for c in drop_cols if c in working.columns],
                 inplace=True, errors='ignore')

    # Reorder and restrict columns to the final layout
    mixed_processed = working.reindex(columns=keep_cols)

    # Sanity checks: necessary globals must exist
    assert 'Temp_file_path_DP' in globals(), "Temp_file_path_DP not found."
    assert 'base_output_filename' in globals(), "base_output_filename not found (set in Cell 0)."

    # Build full output path and save pipe-delimited file
    out_full = os.path.join(Temp_file_path_DP, f"{base_output_filename}.txt")
    mixed_processed.to_csv(out_full, sep='|', index=False)
    print("\nSaved full:", out_full)

    # Create a subset for lighter inspection
    subset_cols = ["ID", "PIT Date", "CompanyName", "HistCurrency",
                   "FiscalPeriod", "AnnPITValue"]
    # Only keep subset columns that actually exist
    subset_cols_existing = [col for col in subset_cols if col in mixed_processed.columns]
    subset_df = mixed_processed[subset_cols_existing].copy()
    out_subset = os.path.join(Temp_file_path_DP,
                              f"{base_output_filename}_subset.txt")
    subset_df.to_csv(out_subset, sep='|', index=False)
    print("Saved subset:", out_subset)
    # Explicitly delete subset_df to free memory
    del subset_df

    # -------------------------------------------------------------------------
    # 8) Row-accounting overview
    # -------------------------------------------------------------------------
    output_rows = len(mixed_processed)
    print("\n=== Row Accounting ===")
    print(f"Input rows:                     {input_rows:,}")
    print(f"Excludedby Frequency (E/L/R/U): {excluded_rows:,}")
    print(f"Dropped by quality (Pct rules): {dropped_quality_rows:,}")
    print(f"Output rows (final):            {output_rows:,}")
    # Sum up excluded + dropped + remaining and check against original count
    check_total = excluded_rows + dropped_quality_rows + output_rows
    print(f"Check: excluded + dropped + output = {check_total:,}")
    if check_total == input_rows:
        print("Row counts reconcile exactly.")
    else:
        print(f"Mismatch of {input_rows - check_total:+,} rows. "
              "Investigate upstream filtering or unexpected drops.")

    # Trigger garbage collection as a final cleanup step
    gc.collect()

else:
    # If the main input dataset is not available, skip all processing
    print("mixed_encoded not found or None; skipping.")

Input dataset contains 121,029 rows before processing.


=== Future-date check (period dates > PIT Date) ===
Per-label violations: {'A_Date': 0, 'Q1_Date': 0, 'Q2_Date': 0, 'Q3_Date': 0, 'Q4_Date': 0, 'S1_Date': 0, 'S2_Date': 0, 'T1_Date': 0, 'T2_Date': 0, 'T3_Date': 0}
Rows with ANY future-dated period value: 0

=== AnnPITValue_Pct summary (finite only) — BEFORE quality drop ===
         finite_rows: 113141
                mean: 104.68247919935635
              median: 100.0
winsorized_mean_1pct: 99.96134819807163
                 p10: 100.0
                 p20: 100.0
                 p30: 100.0
                 p40: 100.0
                 p50: 100.0
                 p60: 100.0
                 p70: 100.0
                 p80: 100.0
                 p90: 100.0

Rows to drop due to AnnPITValue_Pct (±inf or >200 or <50): 518

=== AnnPITValue_Pct summary — AFTER quality drop ===
         finite_rows: 112623
                mean: 100.01269112887455
              median: 100.0
winsorized_

### Mixed 6

#### Set Index

In [78]:
# =====================================================================================
# SUMMARY
# =====================================================================================
# This code selects which Mixed_* dataset should be processed by choosing an index
# (e.g., Mixed_1, Mixed_2, ...). It then:
#
#   1. Ensures that a dictionary `mixed_vars` exists, mapping keys like "Mixed_1"
#      to item names.
#   2. Builds the key corresponding to the selected index and retrieves the
#      associated item name (`target_item_name`).
#   3. Prints which Mixed_* item was selected.
#   4. Constructs paths and filenames based on global variables and the selected item.
#   5. Ensures that the output directory exists by creating it if necessary.
#
# The goal is to centralize selection of a single Mixed_* dataset and prepare paths
# for downstream processing.


# === Select which Mixed_* item to run ===
mixed_index = 6  # Change this to process another dataset (e.g., 10)

# Validate that the dictionary of mixed item names exists
assert 'mixed_vars' in globals(), "mixed_vars dict not found in globals()."

# Build the key (e.g., "Mixed_1") and retrieve the associated item name
item_key = f"Mixed_{mixed_index}"
target_item_name = mixed_vars.get(item_key)
assert target_item_name, f"{item_key} not found in mixed_vars."

# Inform which item was selected
print(f"Selected: {item_key}  ->  ItemName: '{target_item_name}'")

# === Paths (reusing globals) ===
assert 'Temp_file_path_DP' in globals(), "Temp_file_path_DP not found."

# Construct the name of the input file for the selected item
file_name = f"work_subset_{target_item_name}.txt"
file_path = os.path.join(Temp_file_path_DP, file_name)

# Construct the base name for output files (will later be suffixed)
base_output_filename = f"Mixed_{target_item_name}_complete"

# Ensure the output directory exists; create it (including parent dirs) if needed
Path(Temp_file_path_DP).mkdir(parents=True, exist_ok=True)


Selected: Mixed_6  ->  ItemName: 'Net_Income_Before_Extra_Items_Preferred_Divs'


#### Import relevant data



In [79]:
# =====================================================================================
# SUMMARY
# =====================================================================================
# This block:
#   1. Announces the import of a full dataset for the given `target_item_name`.
#   2. Checks whether the file at `file_path` exists.
#   3. If it exists, calls `import_file_to_dataframe(file_path)` to load the data
#      into `mixed_raw`.
#   4. If the loaded DataFrame is non-empty, prints a success message including
#      the number of rows and shows the first few rows (via display or fallback
#      to text printing).
#   5. If the load fails or returns an empty DataFrame, prints a warning and
#      creates an empty DataFrame.
#   6. If the file does not exist, prints an error message and sets `mixed_raw`
#      to an empty DataFrame.
#   7. Finally, it runs `gc.collect()` to trigger garbage collection and free
#      memory.
#
# Note: All previous emoji symbols in the print statements have been removed.

# Inform the user that we are starting the import for this item
print(f"\nImporting full dataset for Item: '{target_item_name}' ...")

# Check if the file path exists on disk
if os.path.exists(file_path):
    # If the file exists, attempt to import it into a DataFrame
    mixed_raw = import_file_to_dataframe(file_path)

    # If the DataFrame is successfully loaded and not empty
    if mixed_raw is not None and not mixed_raw.empty:
        # Print a success message including row count with thousands separator
        print(f"Full dataset loaded successfully: {len(mixed_raw):,} rows total.")

        try:
            # Try to display the first few rows (Jupyter / IPython display)
            display(mixed_raw.head())
        except Exception:
            # If display is not available, fall back to printing as plain text
            print(mixed_raw.head().to_string(index=False))
    else:
        # If DataFrame is None or empty, warn and create an empty DataFrame
        print("Dataset appears empty or could not be loaded.")
        mixed_raw = pd.DataFrame()
else:
    # If the file does not exist, report an error and set mixed_raw to empty
    print(f"File not found: {file_path}")
    mixed_raw = pd.DataFrame()

# Run garbage collection to free up memory after the load attempt
gc.collect()



Importing full dataset for Item: 'Net_Income_Before_Extra_Items_Preferred_Divs' ...
Full dataset loaded successfully: 3,906,799 rows total.


Unnamed: 0,ID,CompanyName,ImplCountry,CurrentCurrency,HistCurrency,PIT Date,Frequency,UpdateCode,FiscalPeriod,FYE Month,ItemCode,Value
0,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1992,December,1551,97.90298
1,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1993,December,1551,92.230294
2,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1994,December,1551,110.555887
3,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1996-05-03,A,3,1995,December,1551,-43.448501
4,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1998-07-03,A,3,1996,December,1551,39.188143


0

#### Encode Frequency Code (Check of output required!)

In [80]:
# =====================================================================================
# SUMMARY
# =====================================================================================
# This snippet provides:
#
# 1. A helper function `last2` that returns the last two digits of a number as a
#    zero-padded string (for building YY strings).
#
# 2. A function `add_str_fiscalprd(df)` which:
#    - Works on a copy of an input DataFrame containing financial periods.
#    - Normalizes the 'Frequency' (upper-case, no missing).
#    - Stores the original 'FiscalPeriod' and converts it to numeric.
#    - Creates a string representation 'Str_FiscalPrd' depending on the frequency:
#         - Q/C/E/R: quarter-based ("QnYyy")
#         - A/B: annual ("Yyy")
#         - F/S: semiannual ("SnYyy")
#         - K/T/L/U: trimester-like ("TnYyy")
#    - Derives an implied full-year integer 'ImplFiscPer_Calculated' from the
#      two-digit year (80–99 => 19xx, else 20xx).
#    - For annual rows (A/B), checks discrepancies between original
#      'FiscalPeriod' and implied full-year; prints a small preview & total count.
#    - Overwrites 'FiscalPeriod' with 'ImplFiscPer_Calculated' and drops helper
#      columns.
#
# 3. A small driver block that:
#    - Checks that `mixed_raw` exists and is non-empty.
#    - Applies `add_str_fiscalprd` to produce `mixed_encoded`.
#    - Displays a head preview or prints a message and sets `mixed_encoded = None`
#      if input is missing/empty.

def last2(n):
    """Return last two digits as zero-padded string, or None if NaN."""
    # If input is NaN (or pandas-style missing), return None (no digits)
    if pd.isna(n):
        return None
    # Convert to int, format as 4-digit zero-padded string, take last 2 chars
    # Example: n=2023 -> "2023"[-2:] -> "23"
    return f"{int(n):04d}"[-2:]


def add_str_fiscalprd(df):
    """
    Creates 'Str_FiscalPrd' and overwrites 'FiscalPeriod' with an implied full year.
    Also prints discrepancies for Annual (A,B) rows between original and calculated year.

    The mapping logic:
      - Quarterly (C, Q, E, R):   fp encodes a quarter index -> "Q{1-4}Y{yy}"
      - Annual   (A, B):         fp is the year itself      -> "Y{yy}"
      - Semiannual (F, S):       fp encodes half-year       -> "S{1-2}Y{yy}"
      - Trimester-like (K, T,
         L, U):                  fp encodes trimester       -> "T{1-3}Y{yy}"
    Then we re-infer the full year from the YY part with 19xx/20xx rule.
    """
    # Work on a copy to avoid mutating the original input DataFrame
    df = df.copy()

    # Normalize frequency codes:
    # - uppercase for consistency
    # - fill missing values with empty string
    df["Frequency"] = df["Frequency"].str.upper().fillna("")

    # Store original FiscalPeriod in a separate column for later comparison
    df['Original_FiscalPeriod'] = df['FiscalPeriod']

    # Convert FiscalPeriod to numeric, coerce errors to NaN
    fp = pd.to_numeric(df["FiscalPeriod"], errors="coerce")

    # Masks for different frequency groups
    m_quarter = df["Frequency"].isin(["C", "Q", "E", "R"])  # quarter-based
    m_AB      = df["Frequency"].isin(["A", "B"])            # annual
    m_FS      = df["Frequency"].isin(["F", "S"])            # semiannual
    m_KTLU    = df["Frequency"].isin(["K", "T", "L", "U"])  # trimester-like

    # Initialize Str_FiscalPrd as NaN; we will fill per frequency group
    df["Str_FiscalPrd"] = np.nan

    # -------------------------
    # Quarter-based encoding
    # -------------------------
    # Quarter number: (fp % 4) + 1 => yields 1..4 (if fp is integer-based encoding)
    q_part = ((fp % 4) + 1).where(m_quarter)
    # Year part for quarter: integer division by 4, then take last two digits
    q_year = (fp // 4).where(m_quarter).apply(last2)
    # Assign "Q{quarter}Y{yy}" for quarter frequencies
    df.loc[m_quarter, "Str_FiscalPrd"] = (
        "Q" +
        q_part.astype("Int64").astype(str) +
        "Y" +
        q_year.fillna('')
    )

    # -------------------------
    # Annual encoding (A, B)
    # -------------------------
    # Year part is fp itself for A/B; take last two digits via last2
    ab_year = fp.where(m_AB).apply(last2)
    # Assign "Y{yy}" for annual frequencies
    df.loc[m_AB, "Str_FiscalPrd"] = "Y" + ab_year.fillna('')

    # -------------------------
    # Semiannual encoding (F, S)
    # -------------------------
    # Semester number: (fp % 2) + 1 => 1 or 2
    fs_sem  = ((fp % 2) + 1).where(m_FS)
    # Year part: fp // 2, then last two digits
    fs_year = (fp // 2).where(m_FS).apply(last2)
    # Assign "S{sem}Y{yy}" for F/S
    df.loc[m_FS, "Str_FiscalPrd"] = (
        "S" +
        fs_sem.astype("Int64").astype(str) +
        "Y" +
        fs_year.fillna('')
    )

    # -------------------------
    # Trimester-like encoding (K, T, L, U)
    # -------------------------
    # Term number: (fp % 3) + 1 => 1..3
    t_term  = ((fp % 3) + 1).where(m_KTLU)
    # Year part: fp // 3, then last two digits
    t_year  = (fp // 3).where(m_KTLU).apply(last2)
    # Assign "T{term}Y{yy}" for K/T/L/U frequencies
    df.loc[m_KTLU, "Str_FiscalPrd"] = (
        "T" +
        t_term.astype("Int64").astype(str) +
        "Y" +
        t_year.fillna('')
    )

    # ---------------------------------------------------------------------
    # Implied full year from Str_FiscalPrd (19xx / 20xx reconstruction)
    # ---------------------------------------------------------------------
    # Extract two-digit year from "Yyy" pattern, e.g., "Q1Y23" -> "23"
    year_part = df['Str_FiscalPrd'].str.extract(r'Y(\d{2})', expand=False)
    # Convert that to numeric (NaN if not parseable)
    year_numeric = pd.to_numeric(year_part, errors='coerce')

    # Map two-digit year to full year:
    #  - 80–99 => 19xx
    #  - else  => 20xx
    df['ImplFiscPer_Calculated'] = year_numeric.apply(
        lambda x: int(f"19{int(x):02d}") if pd.notna(x) and x >= 80
        else (int(f"20{int(x):02d}") if pd.notna(x) else np.nan)
    )

    # ---------------------------------------------------------------------
    # Discrepancies check for Annual frequencies (A, B)
    # ---------------------------------------------------------------------
    # Work on a subset of annual frequencies only
    annual_rows_for_check = df[m_AB].copy()

    # Build boolean mask where implied full year does NOT match the original
    discrepancy_mask_annual = ~(
        # Either they match numerically...
        (annual_rows_for_check['ImplFiscPer_Calculated'] ==
         pd.to_numeric(annual_rows_for_check['Original_FiscalPeriod'], errors='coerce'))
        |
        # ...or both are NaN
        (annual_rows_for_check['ImplFiscPer_Calculated'].isna() &
         pd.to_numeric(annual_rows_for_check['Original_FiscalPeriod'], errors='coerce').isna())
    )

    # Subset rows with discrepancies
    discrepancy_rows = annual_rows_for_check[discrepancy_mask_annual].copy()

    # If we have any, show a short sample and the total count
    if not discrepancy_rows.empty:
        print("\nDiscrepancies found between original FiscalPeriod and calculated ImplFiscPer for Annual (A, B) frequencies:")
        display(
            discrepancy_rows[
                ['ID', 'Frequency', 'Original_FiscalPeriod', 'Str_FiscalPrd', 'ImplFiscPer_Calculated']
            ].head()
        )
        print(f"Total discrepancies found for Annual frequencies: {len(discrepancy_rows)}")
    else:
        print("\nNo discrepancies found between original FiscalPeriod and calculated ImplFiscPer for Annual (A, B) frequencies.")

    # ---------------------------------------------------------------------
    # Overwrite FiscalPeriod and drop temporary helper columns
    # ---------------------------------------------------------------------
    # Replace 'FiscalPeriod' with the implied full-year value we just calculated
    df['FiscalPeriod'] = df['ImplFiscPer_Calculated']
    # Drop helper columns not needed downstream
    df.drop(columns=['Original_FiscalPeriod', 'ImplFiscPer_Calculated'], inplace=True)

    # Return the modified DataFrame
    return df


# =============================================================================
# Driver: apply encoding to mixed_raw if present and non-empty
# =============================================================================
if 'mixed_raw' in globals() and mixed_raw is not None and not mixed_raw.empty:
    # Inform which item we are encoding for
    print(f"Applying encoding to Mixed dataset for '{target_item_name}' ...")
    # Apply fiscal period encoding
    mixed_encoded = add_str_fiscalprd(mixed_raw)
    # Show a preview of the encoded data
    display(mixed_encoded.head())
else:
    # If mixed_raw is missing or empty, we skip and mark mixed_encoded as None
    print("mixed_raw not found or empty. Cannot perform encoding.")
    mixed_encoded = None


Applying encoding to Mixed dataset for 'Net_Income_Before_Extra_Items_Preferred_Divs' ...


  df.loc[m_quarter, "Str_FiscalPrd"] = (



No discrepancies found between original FiscalPeriod and calculated ImplFiscPer for Annual (A, B) frequencies.


Unnamed: 0,ID,CompanyName,ImplCountry,CurrentCurrency,HistCurrency,PIT Date,Frequency,UpdateCode,FiscalPeriod,FYE Month,ItemCode,Value,Str_FiscalPrd
0,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1992,December,1551,97.90298,Y92
1,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1993,December,1551,92.230294,Y93
2,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1994,December,1551,110.555887,Y94
3,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1996-05-03,A,3,1995,December,1551,-43.448501,Y95
4,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1998-07-03,A,3,1996,December,1551,39.188143,Y96


#### Annualize data with most recent information (Check of output required!)

In [81]:
# @title
# =====================================================================================
# SUMMARY
# =====================================================================================
# This script takes an input DataFrame `mixed_encoded` (if present in the global scope)
# that contains financial time-series data (per company, item, currency, fiscal period,
# and PIT Date). It then:
#
# 1. Cleans and standardizes key columns (dates, numeric types, string IDs).
# 2. Excludes rows with certain frequencies (E/L/R/U).
# 3. Parses fiscal period strings into quarter/semester/trimester indicators (QNUM/SNUM/TNUM).
# 4. Uses a custom, vectorized "as-of" join (`asof_numpy`) to attach the most recent
#    annual, quarterly, semiannual, and trimester values for each (ID, HistCurrency,
#    ItemCode, FiscalPeriod) up to each row’s PIT Date.
# 5. Builds "full-year" candidate values from:
#       - actual annuals (A),
#       - sum of Q1..Q4 (Q4 proxy),
#       - sum of S1..S2 (S2 proxy),
#       - sum of T1..T3 (T3 proxy),
#    and selects the best candidate based on priorities and relationship to the row’s
#    fiscal period (same-year vs prior-year).
# 6. Computes an annual PIT-based metric `AnnPITValue` and compares it to the “true”
#    annual value (`TrueValue`) to derive a percentage `AnnPITValue_Pct` for QC.
# 7. Performs quality checks:
#       - Ensures no period-date is after the PIT Date.
#       - Drops rows whose `AnnPITValue_Pct` is outside the range [50, 200] or infinite.
# 8. Keeps a curated set of columns, drops helper columns, and saves:
#       - a full output file
#       - a subset file with key columns for quick inspection.
# 9. Prints row-accounting stats and frees some memory.
#
# If `mixed_encoded` is not defined or is None, it simply prints a message and exits.

import pandas as pd
import numpy as np
import os
import gc
from datetime import timedelta            # <--- Added to fix NameError
from scipy.stats.mstats import winsorize  # <--- Added to fix NameError

# Enable pandas "copy-on-write" behavior to reduce unintended chained assignment effects
pd.options.mode.copy_on_write = True

# ---------- Helper: fast as-of (right.PIT <= left.PIT) ----------

def _key(fr, cols):
    """
    Helper function to build a string key from multiple columns.
    For each row, join the values of 'cols' with '||'.
    """
    return fr[cols].astype(str).agg('||'.join, axis=1)


def asof_numpy(left_df: pd.DataFrame, right_df: pd.DataFrame, by_cols: list[str]):
    """
    For each row in left_df, find the latest (as-of) Value from right_df
    with matching by_cols and right_df['PIT Date'] <= left_df['PIT Date'].

    This is a manual, NumPy-based implementation of an "as-of merge" grouped by `by_cols`.
    It:
    - filters out rows with missing required fields,
    - normalizes PIT Date to daily precision,
    - groups right_df by composite key of `by_cols`,
    - for each left row, binary-searches into the matching right group by PIT Date,
      picking the last date <= the left PIT Date,
    - returns two aligned arrays: (values, dates) for the left_df rows.
    """
    # Initialize outputs with NaNs and NaT for all left_df rows
    out_vals  = np.full(len(left_df), np.nan, dtype='float64')
    out_dates = np.full(len(left_df), 'NaT', dtype='datetime64[ns]')

    # Required columns on each side
    left_req  = by_cols + ['PIT Date']
    right_req = by_cols + ['PIT Date', 'Value']

    # Mask: rows that have all required fields non-null
    lmask = left_df[left_req].notna().all(axis=1)
    rmask = right_df[right_req].notna().all(axis=1)

    # If either side has no valid rows, return the default empty outputs
    if not lmask.any() or not rmask.any():
        return out_vals, out_dates

    # Work on filtered copies only (avoid side effects)
    l = left_df.loc[lmask, left_req].copy()
    r = right_df.loc[rmask, right_req].copy()

    # Normalize PIT Date columns to datetime at day precision
    l['PIT Date'] = pd.to_datetime(l['PIT Date'], errors='coerce').dt.floor('D')
    r['PIT Date'] = pd.to_datetime(r['PIT Date'], errors='coerce').dt.floor('D')

    # Build composite keys for group-level match (based on by_cols)
    l['__k'] = _key(l, by_cols)
    r['__k'] = _key(r, by_cols)

    # Sort right side by key and date to enable binary search per key
    r = r.sort_values(['__k', 'PIT Date']).reset_index(drop=True)

    # Extract NumPy arrays for right side
    rk   = r['__k'].to_numpy()
    rdt  = r['PIT Date'].to_numpy()
    rval = r['Value'].to_numpy()

    # Find unique keys and first index for each key in the sorted right side
    uniq, first = np.unique(rk, return_index=True)

    # Build a dict: key -> (dates array, values array) slice
    slices = {}
    for i, k in enumerate(uniq):
        s = first[i]  # start of this key
        e = first[i + 1] if i + 1 < len(first) else len(r)  # end of this key
        slices[k] = (rdt[s:e], rval[s:e])

    # Left side indices and arrays
    l_idx = l.index.to_numpy()
    lk    = l['__k'].to_numpy()
    ldt   = l['PIT Date'].to_numpy()

    # Sort left side by key (stable mergesort to preserve original row order within key)
    order = np.argsort(lk, kind='mergesort')
    sk, sd, sp = lk[order], ldt[order], l_idx[order]

    # i iterates over the sorted left rows
    i = 0
    n = len(sk)
    while i < n:
        k = sk[i]  # current composite key
        j = i + 1
        # Find contiguous block [i:j) for this key
        while j < n and sk[j] == k:
            j += 1

        # Dates and original positions of this key’s left rows
        block_dates = sd[i:j]
        block_pos   = sp[i:j]

        # If we have matching right-hand slices for this key, do the as-of search
        if k in slices:
            r_dates, r_vals = slices[k]
            # searchsorted(..., 'right') - 1 gives index of last r_date <= block_date
            pos   = np.searchsorted(r_dates, block_dates, side='right') - 1
            valid = pos >= 0  # positions where such a date exists
            if np.any(valid):
                # Fill outputs for left rows where we found a valid match
                out_vals[block_pos[valid]]  = r_vals[pos[valid]]
                out_dates[block_pos[valid]] = r_dates[pos[valid]]
        # Move to next key block
        i = j

    return out_vals, out_dates


# ---------- Small helpers ----------

def pctile(s, q):
    """
    Safe percentile calculation: returns quantile q of Series `s`,
    or NaN if quantile fails.
    """
    try:
        return s.quantile(q, interpolation='linear')
    except Exception:
        return np.nan


def summarize_pct(series: pd.Series):
    """
    Computes a summary of finite values of a series:
      - count
      - mean, median
      - 1% winsorized mean
      - selected deciles (p10, p20, ..., p90)
    Infinite values are treated as NaN and removed.
    If no finite values remain, returns an empty dict.
    """
    # Replace +/-inf with NaN and drop all NaNs
    s = series.replace([np.inf, -np.inf], np.nan).dropna()
    if s.empty:
        return {}
    
    # FIX APPLIED HERE: .to_numpy().copy() ensures the array is writable for winsorize
    w_mean = winsorize(s.to_numpy().copy(), limits=[0.01, 0.01]).mean()
    
    return {
        "finite_rows": len(s),
        "mean": s.mean(),
        "median": s.median(),
        "winsorized_mean_1pct": w_mean,
        "p10": pctile(s, 0.10), "p20": pctile(s, 0.20), "p30": pctile(s, 0.30),
        "p40": pctile(s, 0.40), "p50": pctile(s, 0.50), "p60": pctile(s, 0.60),
        "p70": pctile(s, 0.70), "p80": pctile(s, 0.80), "p90": pctile(s, 0.90)
    }


# ---------- Priority for full-year candidates ----------

# Fixed priority mapping for full-year candidates:
#   'A'  : actual annual value
#   'Q4' : annual proxy from four quarters
#   'T3' : annual proxy from three trimesters
#   'S2' : annual proxy from two semesters
_PERIOD_PRIORITY = {
    'A': 100,  # highest priority: actual annual
    'Q4': 90,  # then Q1+Q2+Q3+Q4
    'T3': 80,  # then T1+T2+T3
    'S2': 70,  # then S1+S2
}

# ============================ MAIN ============================

# Only run the main logic if a global DataFrame `mixed_encoded` exists and is not None
if 'mixed_encoded' in globals() and mixed_encoded is not None:
    # Count initial input rows
    input_rows = len(mixed_encoded)
    print(f"Input dataset contains {input_rows:,} rows before processing.\n")

    # Work on a copy of the input dataset
    working = mixed_encoded.copy()

    # -------------------------------------------------------------------------
    # Exclude certain frequencies (E/L/R/U)
    # -------------------------------------------------------------------------
    # Create mask of rows whose Frequency is one of E, L, R, U (case-insensitive)
    excl_mask = working['Frequency'].astype(str).str.upper().isin(['E', 'L', 'R', 'U'])
    # Count how many rows will be excluded
    excluded_rows = int(excl_mask.sum())
    # Keep only rows that are NOT in the exclusion set
    working = working.loc[~excl_mask].copy()

    # -------------------------------------------------------------------------
    # Set dtypes and normalize important columns
    # -------------------------------------------------------------------------
    # Convert PIT Date to datetime (coerce errors -> NaT), floor to day
    working['PIT Date']     = pd.to_datetime(working['PIT Date'], errors='coerce').dt.floor('D')
    # FiscalPeriod: numeric (e.g., 2021, 2022, ...)
    working['FiscalPeriod'] = pd.to_numeric(working['FiscalPeriod'], errors='coerce')
    # Value: numeric (float)
    working['Value']        = pd.to_numeric(working['Value'], errors='coerce')

    # Convert key ID / code columns to string to ensure consistency
    for c in ['ID', 'HistCurrency', 'ItemCode', 'Frequency', 'Str_FiscalPrd']:
        if c in working.columns:
            working[c] = working[c].astype(str)

    # -------------------------------------------------------------------------
    # Parse Q/S/T markers from Str_FiscalPrd (like 'Q1Y2023')
    # -------------------------------------------------------------------------
    # Extract quarter number Q1..Q4 from e.g. "Q1Y2023" into QNUM
    working['QNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^Q([1-4])Y', expand=False),
        errors='coerce'
    )
    # Extract semiannual number S1..S2 into SNUM
    working['SNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^S([1-2])Y', expand=False),
        errors='coerce'
    )
    # Extract trimester number T1..T3 into TNUM
    working['TNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^T([1-3])Y', expand=False),
        errors='coerce'
    )

    # -------------------------------------------------------------------------
    # Ensure period columns exist (Q1..Q4, S1..S2, T1..T3, A + their date cols)
    # -------------------------------------------------------------------------
    # Create value columns for Q1..Q4, S1..S2, T1..T3, A if they are missing
    for c in [*(f'Q{i}' for i in range(1, 5)),
              *(f'S{i}' for i in range(1, 3)),
              *(f'T{i}' for i in range(1, 4)),
              'A']:
        if c not in working.columns:
            working[c] = np.nan

    # Create corresponding *_Date columns if missing
    for c in [*(f'Q{i}_Date' for i in range(1, 5)),
              *(f'S{i}_Date' for i in range(1, 3)),
              *(f'T{i}_Date' for i in range(1, 4)),
              'A_Date']:
        if c not in working.columns:
            working[c] = pd.NaT

    # Base key for many of the as-of mappings
    base_keys = ['ID', 'HistCurrency', 'ItemCode', 'FiscalPeriod']

    # -------------------------------------------------------------------------
    # 1) Derive TrueValue from annuals (A/B frequencies)
    # -------------------------------------------------------------------------
    # Mask annual-like rows where Value is present
    mask_annual = working['Frequency'].isin(['A', 'B']) & working['Value'].notna()
    # annual_src: one row per (ID, FiscalPeriod, HistCurrency) with last PIT Date
    annual_src = (
        working.loc[mask_annual,
                    ['ID', 'FiscalPeriod', 'HistCurrency', 'PIT Date', 'Value']]
        .sort_values(['ID', 'FiscalPeriod', 'HistCurrency', 'PIT Date'])
        .drop_duplicates(['ID', 'FiscalPeriod', 'HistCurrency'], keep='last')
        .rename(columns={'Value': 'TrueValue', 'PIT Date': 'TrueValue_Date'})
    )
    # Left-join true annual value back onto working
    working = working.merge(
        annual_src,
        on=['ID', 'FiscalPeriod', 'HistCurrency'],
        how='left'
    )

    # -------------------------------------------------------------------------
    # 2) As-of mapping (same FiscalPeriod) for A/Q/S/T
    # -------------------------------------------------------------------------

    # ----- Annual -----
    # Source rows for annual frequencies A/B
    src_A = working.loc[
        working['Frequency'].isin(['A', 'B']) & working['Value'].notna(),
        base_keys + ['PIT Date', 'Value']
    ].copy()
    # As-of join: for each working row, get most recent annual value by PIT Date
    vA, dA = asof_numpy(working, src_A, by_cols=base_keys)
    working['A'], working['A_Date'] = vA, dA
    # Origin fiscal period of annual value (same as row's FiscalPeriod when present)
    working['A_OriginFP'] = np.where(
        working['A'].notna(), working['FiscalPeriod'], np.nan
    )

    # ----- Quarterly -----
    # Source rows for quarterly frequencies (Q/C) with valid QNUM
    src_Q = working.loc[
        working['Frequency'].isin(['Q', 'C']) & working['QNUM'].notna(),
        base_keys + ['QNUM', 'PIT Date', 'Value']
    ].copy()
    for q in (1, 2, 3, 4):
        # Restrict to a specific quarter q
        rv = src_Q[src_Q['QNUM'] == q].drop(columns=['QNUM'])
        # As-of join for that quarter
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'Q{q}', f'Q{q}_Date'
        working[col], working[dcol] = v, d
        # Origin FP column for that quarter
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        # Fill origin FP only where quarter value is non-null and origin not yet set
        m = working[col].notna() & working[ocol].isna()
        working.loc[m, ocol] = working.loc[m, 'FiscalPeriod']

    # ----- Semiannual -----
    # Source rows for semiannual frequencies (S/F) with valid SNUM
    src_S = working.loc[
        working['Frequency'].isin(['S', 'F']) & working['SNUM'].notna(),
        base_keys + ['SNUM', 'PIT Date', 'Value']
    ].copy()
    for s in (1, 2):
        rv = src_S[src_S['SNUM'] == s].drop(columns=['SNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'S{s}', f'S{s}_Date'
        working[col], working[dcol] = v, d
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        m = working[col].notna() & working[ocol].isna()
        working.loc[m, ocol] = working.loc[m, 'FiscalPeriod']

    # ----- Trimester -----
    # Source rows for trimester frequencies (T/K) with valid TNUM
    src_T = working.loc[
        working['Frequency'].isin(['T', 'K']) & working['TNUM'].notna(),
        base_keys + ['TNUM', 'PIT Date', 'Value']
    ].copy()
    for t in (1, 2, 3):
        rv = src_T[src_T['TNUM'] == t].drop(columns=['TNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'T{t}', f'T{t}_Date'
        working[col], working[dcol] = v, d
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        m = working[col].notna() & working[ocol].isna()
        working.loc[m, ocol] = working.loc[m, 'FiscalPeriod']

    # -------------------------------------------------------------------------
    # 3) Prepare labels & normalize dates (NO prev-year fill, NO forward-fill)
    # -------------------------------------------------------------------------
    # Sort working data consistently for downstream calculations
    working = working.sort_values(['ID', 'HistCurrency', 'FiscalPeriod', 'PIT Date'])

    # List of all period value columns
    value_cols_all  = [f'Q{i}' for i in range(1, 5)] + \
                      [f'S{i}' for i in range(1, 3)] + \
                      [f'T{i}' for i in range(1, 4)] + ['A']
    # Corresponding date columns
    date_cols_all   = [f'{c}_Date' for c in value_cols_all]
    # Corresponding origin FP columns
    origin_cols_all = [f'{c}_OriginFP' for c in value_cols_all]

    # Ensure that all date columns are proper datetimes (floor to day)
    # Note: explicitly no groupby-forward-fill here – only asof-filled values remain
    for c in date_cols_all:
        if c in working.columns:
            working[c] = pd.to_datetime(working[c], errors='coerce').dt.floor('D')

    # -------------------------------------------------------------------------
    # 4) Build full-year candidates from fixed sets (Q1–Q4, S1–S2, T1–T3)
    # -------------------------------------------------------------------------
    def full_year_from_fixed(row, labels, pit, cutoff):
        """
        Fixed full-year from given labels (e.g. Q1..Q4, S1..S2, T1..T3):

          - Requires ALL labels to have:
              value, date, origin_fp.
          - All dates must be within [cutoff, pit].
          - origin_fp = max(origin_fp_i)  (the 'newest year' across components).

        Returns (dt, val, origin_fp) or (NaT, NaN, NaN) if any requirement fails.
        """
        vals, dts, fps = [], [], []
        for lbl in labels:
            v = row.get(lbl, np.nan)
            d = row.get(f'{lbl}_Date', pd.NaT)
            o = row.get(f'{lbl}_OriginFP', np.nan)

            # Require non-null for value, date, origin FP
            if pd.isna(v) or pd.isna(d) or pd.isna(o):
                return pd.NaT, np.nan, np.nan

            # Normalize date and check it lies within [cutoff, pit]
            d = pd.to_datetime(d, errors='coerce')
            if pd.isna(d) or not (cutoff <= d <= pit):
                return pd.NaT, np.nan, np.nan

            vals.append(float(v))
            dts.append(d)
            fps.append(int(o))

        # Sum all component values, pick latest date and max origin FP
        total_val = float(np.nansum(vals))
        latest_dt = max(dts)
        origin_fp = max(fps)
        return latest_dt, total_val, origin_fp

    def pick_annpit_sum_with_origin(row):
        """
        For each row, pick the best annual PIT-based value (AnnPITValue) by:

        1) Building a time window [PIT - 365 days, PIT].
        2) Generating candidate annual values:
           - actual annual A (if within window and with origin FP),
           - Q4 proxy from Q1..Q4,
           - S2 proxy from S1..S2,
           - T3 proxy from T1..T3,
           each with associated priority and origin FP.
        3) Ranking candidates according to:
           - Same-year A (highest),
           - Same-year proxies (Q4/T3/S2),
           - Prior-year A,
           - Prior-year proxies,
           - Otherwise: any remaining candidate by (priority, latest date).
        4) Returning the chosen candidate's value (keeping zeros, but filtering NaNs).
        """
        pit = row['PIT Date']
        if pd.isna(pit):
            return np.nan
        cutoff = pit - timedelta(days=365)  # Now works because timedelta is imported

        # Current row's fiscal period, cast to int if possible
        fp = row.get('FiscalPeriod', np.nan)
        try:
            fp_int = int(fp) if not pd.isna(fp) else None
        except Exception:
            fp_int = None

        # Collect candidate tuples: (label, priority, date, value, origin_fp)
        candidates = []

        # --- Candidate A: actual annual (0 is allowed)
        A_val = row.get('A', np.nan)
        A_dt  = row.get('A_Date', pd.NaT)
        A_ofp = row.get('A_OriginFP', np.nan)
        if pd.notna(A_val) and pd.notna(A_dt) and not pd.isna(A_ofp):
            A_dt = pd.to_datetime(A_dt, errors='coerce')
            if pd.notna(A_dt) and (cutoff <= A_dt <= pit):
                candidates.append(('A', _PERIOD_PRIORITY['A'], A_dt, float(A_val), int(A_ofp)))

        # --- Candidate Q4: sum of Q1..Q4 (0 allowed)
        q4_dt, q4_val, q4_fp = full_year_from_fixed(
            row, [f'Q{i}' for i in range(1, 5)], pit, cutoff
        )
        if pd.notna(q4_val) and pd.notna(q4_dt) and not pd.isna(q4_fp):
            candidates.append(('Q4', _PERIOD_PRIORITY['Q4'], q4_dt, float(q4_val), int(q4_fp)))

        # --- Candidate S2: sum of S1..S2 (0 allowed)
        s2_dt, s2_val, s2_fp = full_year_from_fixed(
            row, [f'S{i}' for i in range(1, 3)], pit, cutoff
        )
        if pd.notna(s2_val) and pd.notna(s2_dt) and not pd.isna(s2_fp):
            candidates.append(('S2', _PERIOD_PRIORITY['S2'], s2_dt, float(s2_val), int(s2_fp)))

        # --- Candidate T3: sum of T1..T3 (0 allowed)
        t3_dt, t3_val, t3_fp = full_year_from_fixed(
            row, [f'T{i}' for i in range(1, 4)], pit, cutoff
        )
        if pd.notna(t3_val) and pd.notna(t3_dt) and not pd.isna(t3_fp):
            candidates.append(('T3', _PERIOD_PRIORITY['T3'], t3_dt, float(t3_val), int(t3_fp)))

        # If no candidates, return NaN
        if not candidates:
            return np.nan

        # Filter out NaN-valued candidates (keep 0-valued ones)
        def valid(seq):
            return [c for c in seq if not np.isnan(c[3])]

        # Relationship between row's fiscal period and candidate's origin FP:
        # same / prior / other / unknown
        def rel(c):
            _, _, _, _, ofp = c
            if fp_int is None or ofp is None:
                return 'unknown'
            if ofp == fp_int:
                return 'same'
            if ofp == fp_int - 1:
                return 'prior'
            return 'other'

        # 1) Same-year actual annual A (max by date)
        same_A = valid(c for c in candidates if c[0] == 'A' and rel(c) == 'same')
        if same_A:
            best = max(same_A, key=lambda x: x[2])
            return best[3]

        # 2) Same-year proxies (Q4/T3/S2), max by (priority, date)
        same_proxies = valid(
            c for c in candidates
            if c[0] in ('Q4', 'S2', 'T3') and rel(c) == 'same'
        )
        if same_proxies:
            best = max(same_proxies, key=lambda x: (x[1], x[2]))
            return best[3]

        # 3) Prior-year actual annual A
        prior_A = valid(c for c in candidates if c[0] == 'A' and rel(c) == 'prior')
        if prior_A:
            best = max(prior_A, key=lambda x: x[2])
            return best[3]

        # 4) Prior-year proxies (Q4/S2/T3) if no prior A
        prior_proxies = valid(
            c for c in candidates
            if c[0] in ('Q4', 'S2', 'T3') and rel(c) == 'prior'
        )
        if prior_proxies:
            best = max(prior_proxies, key=lambda x: (x[1], x[2]))
            return best[3]

        # 5) Fallback: any candidate (other/unknown) by (priority, date)
        others = valid(candidates)
        if others:
            best = max(others, key=lambda x: (x[1], x[2]))
            return best[3]

        # Final fallback: 0.0 (should rarely be reached)
        return 0.0

    # Apply the selection function row-wise to produce AnnPITValue
    working['AnnPITValue'] = working.apply(pick_annpit_sum_with_origin, axis=1)

    # -------------------------------------------------------------------------
    # 5) QC: Future-date check + PRE-DROP stats
    # -------------------------------------------------------------------------
    # Columns whose dates should not exceed PIT Date
    date_cols = [
        'A_Date',
        'Q1_Date', 'Q2_Date', 'Q3_Date', 'Q4_Date',
        'S1_Date', 'S2_Date',
        'T1_Date', 'T2_Date', 'T3_Date'
    ]
    # Restrict to ones actually present
    present = [c for c in date_cols if c in working.columns]

    viol_counts = {}  # per-label violation counts
    # Mask for rows with any future-dated period
    any_mask = pd.Series(False, index=working.index)

    for c in present:
        # A violation is when period date > PIT Date (both need to be non-null)
        m = (
            working[c].notna() &
            working['PIT Date'].notna() &
            (pd.to_datetime(working[c], errors='coerce') > working['PIT Date'])
        )
        viol_counts[c] = int(m.sum())
        any_mask |= m  # accumulate violations across columns

    total_future_viol = int(any_mask.sum())
    print("\n=== Future-date check (period dates > PIT Date) ===")
    print("Per-label violations:", viol_counts)
    print(f"Rows with ANY future-dated period value: {total_future_viol}")
    # Flagrows with at least one future-date error
    working['HasFutureDateError'] = any_mask

    # -------------------------------------------------------------------------
    # 6) AnnPITValue_Pct + quality drop
    # -------------------------------------------------------------------------
    # Compute AnnPITValue as % of TrueValue (only when TrueValue != 0)
    working['AnnPITValue_Pct'] = np.where(
        working['AnnPITValue'].notna() &
        working['TrueValue'].notna() &
        (working['TrueValue'] != 0),
        (working['AnnPITValue'] / working['TrueValue']) * 100,
        np.nan
    )

    # Stats before dropping low-quality rows
    pre_stats = summarize_pct(working['AnnPITValue_Pct'])
    print("\n=== AnnPITValue_Pct summary (finite only) — BEFORE quality drop ===")
    for k, v in pre_stats.items():
        print(f"{k:>20}: {v}")

    # Build masks for dropping:
    pct = working['AnnPITValue_Pct']
    is_inf = np.isinf(pct)  # infinite percentages
    is_finite = np.isfinite(pct)
    # Out-of-range if % > 200 or % < 50 (but finite)
    out_of_range = is_finite & ((pct > 200) | (pct < 50))
    # Rows to drop: infinite or out-of-range values
    to_drop_quality = is_inf | out_of_range

    dropped_quality_rows = int(to_drop_quality.sum())
    print(f"\nRows to drop due to AnnPITValue_Pct (±inf or >200 or <50): {dropped_quality_rows:,}")

    # Keep only rows that passed the quality filter
    working = working.loc[~to_drop_quality].copy()

    # Stats after dropping
    post_stats = summarize_pct(working['AnnPITValue_Pct'])
    print("\n=== AnnPITValue_Pct summary — AFTER quality drop ===")
    if post_stats:
        for k, v in post_stats.items():
            print(f"{k:>20}: {v}")
    else:
        print("No finite values remain after the quality drop.")

    # -------------------------------------------------------------------------
    # 7) Final columns & save
    # -------------------------------------------------------------------------
    # Base descriptive columns to keep (if present)
    base_cols = [
        'ID', 'CompanyName', 'ImplCountry', 'CurrentCurrency', 'HistCurrency',
        'PIT Date', 'Frequency', 'UpdateCode', 'FiscalPeriod', 'FYE Month',
        'ItemCode', 'Value', 'Str_FiscalPrd'
    ]

    # Build ordered list of period date/value columns
    freq_cols = []
    for i in range(1, 5):
        freq_cols += [f'Q{i}_Date', f'Q{i}']
    for i in range(1, 3):
        freq_cols += [f'S{i}_Date', f'S{i}']
    for i in range(1, 4):
        freq_cols += [f'T{i}_Date', f'T{i}']
    freq_cols += ['A_Date', 'A']

    # Final set of columns to keep in output
    keep_cols = (
        [c for c in base_cols if c in working.columns] +
        ['TrueValue', 'AnnPITValue', 'AnnPITValue_Pct', 'HasFutureDateError'] +
        [c for c in freq_cols if c in working.columns]
    )

    # Helper columns to drop before export
    drop_cols = ['QNUM', 'SNUM', 'TNUM', 'TrueValue_Date']
    # Also drop all *_OriginFP columns
    drop_cols += [c for c in working.columns if c.endswith('_OriginFP')]
    working.drop(columns=[c for c in drop_cols if c in working.columns],
                 inplace=True, errors='ignore')

    # Reorder and restrict columns to the final layout
    mixed_processed = working.reindex(columns=keep_cols)

    # Sanity checks: necessary globals must exist
    assert 'Temp_file_path_DP' in globals(), "Temp_file_path_DP not found."
    assert 'base_output_filename' in globals(), "base_output_filename not found (set in Cell 0)."

    # Build full output path and save pipe-delimited file
    out_full = os.path.join(Temp_file_path_DP, f"{base_output_filename}.txt")
    mixed_processed.to_csv(out_full, sep='|', index=False)
    print("\nSaved full:", out_full)

    # Create a subset for lighter inspection
    subset_cols = ["ID", "PIT Date", "CompanyName", "HistCurrency",
                   "FiscalPeriod", "AnnPITValue"]
    # Only keep subset columns that actually exist
    subset_cols_existing = [col for col in subset_cols if col in mixed_processed.columns]
    subset_df = mixed_processed[subset_cols_existing].copy()
    out_subset = os.path.join(Temp_file_path_DP,
                              f"{base_output_filename}_subset.txt")
    subset_df.to_csv(out_subset, sep='|', index=False)
    print("Saved subset:", out_subset)
    # Explicitly delete subset_df to free memory
    del subset_df

    # -------------------------------------------------------------------------
    # 8) Row-accounting overview
    # -------------------------------------------------------------------------
    output_rows = len(mixed_processed)
    print("\n=== Row Accounting ===")
    print(f"Input rows:                     {input_rows:,}")
    print(f"Excludedby Frequency (E/L/R/U): {excluded_rows:,}")
    print(f"Dropped by quality (Pct rules): {dropped_quality_rows:,}")
    print(f"Output rows (final):            {output_rows:,}")
    # Sum up excluded + dropped + remaining and check against original count
    check_total = excluded_rows + dropped_quality_rows + output_rows
    print(f"Check: excluded + dropped + output = {check_total:,}")
    if check_total == input_rows:
        print("Row counts reconcile exactly.")
    else:
        print(f"Mismatch of {input_rows - check_total:+,} rows. "
              "Investigate upstream filtering or unexpected drops.")

    # Trigger garbage collection as a final cleanup step
    gc.collect()

else:
    # If the main input dataset is not available, skip all processing
    print("mixed_encoded not found or None; skipping.")

Input dataset contains 3,906,799 rows before processing.


=== Future-date check (period dates > PIT Date) ===
Per-label violations: {'A_Date': 0, 'Q1_Date': 0, 'Q2_Date': 0, 'Q3_Date': 0, 'Q4_Date': 0, 'S1_Date': 0, 'S2_Date': 0, 'T1_Date': 0, 'T2_Date': 0, 'T3_Date': 0}
Rows with ANY future-dated period value: 0

=== AnnPITValue_Pct summary (finite only) — BEFORE quality drop ===
         finite_rows: 2307432
                mean: 28322.45151943236
              median: 100.0
winsorized_mean_1pct: 99.55537956717451
                 p10: 100.0
                 p20: 100.0
                 p30: 100.0
                 p40: 100.0
                 p50: 100.0
                 p60: 100.0
                 p70: 100.0
                 p80: 100.0
                 p90: 100.0

Rows to drop due to AnnPITValue_Pct (±inf or >200 or <50): 40,188

=== AnnPITValue_Pct summary — AFTER quality drop ===
         finite_rows: 2267244
                mean: 100.18154532626606
              median: 100.0
winso

### Mixed 7

#### Set Index

In [82]:
# =====================================================================================
# SUMMARY
# =====================================================================================
# This code selects which Mixed_* dataset should be processed by choosing an index
# (e.g., Mixed_1, Mixed_2, ...). It then:
#
#   1. Ensures that a dictionary `mixed_vars` exists, mapping keys like "Mixed_1"
#      to item names.
#   2. Builds the key corresponding to the selected index and retrieves the
#      associated item name (`target_item_name`).
#   3. Prints which Mixed_* item was selected.
#   4. Constructs paths and filenames based on global variables and the selected item.
#   5. Ensures that the output directory exists by creating it if necessary.
#
# The goal is to centralize selection of a single Mixed_* dataset and prepare paths
# for downstream processing.


# === Select which Mixed_* item to run ===
mixed_index = 7  # Change this to process another dataset (e.g., 10)

# Validate that the dictionary of mixed item names exists
assert 'mixed_vars' in globals(), "mixed_vars dict not found in globals()."

# Build the key (e.g., "Mixed_1") and retrieve the associated item name
item_key = f"Mixed_{mixed_index}"
target_item_name = mixed_vars.get(item_key)
assert target_item_name, f"{item_key} not found in mixed_vars."

# Inform which item was selected
print(f"Selected: {item_key}  ->  ItemName: '{target_item_name}'")

# === Paths (reusing globals) ===
assert 'Temp_file_path_DP' in globals(), "Temp_file_path_DP not found."

# Construct the name of the input file for the selected item
file_name = f"work_subset_{target_item_name}.txt"
file_path = os.path.join(Temp_file_path_DP, file_name)

# Construct the base name for output files (will later be suffixed)
base_output_filename = f"Mixed_{target_item_name}_complete"

# Ensure the output directory exists; create it (including parent dirs) if needed
Path(Temp_file_path_DP).mkdir(parents=True, exist_ok=True)


Selected: Mixed_7  ->  ItemName: 'Net_Income_Used_to_Calculate_Basic_EPS'


#### Import relevant data



In [83]:
# =====================================================================================
# SUMMARY
# =====================================================================================
# This block:
#   1. Announces the import of a full dataset for the given `target_item_name`.
#   2. Checks whether the file at `file_path` exists.
#   3. If it exists, calls `import_file_to_dataframe(file_path)` to load the data
#      into `mixed_raw`.
#   4. If the loaded DataFrame is non-empty, prints a success message including
#      the number of rows and shows the first few rows (via display or fallback
#      to text printing).
#   5. If the load fails or returns an empty DataFrame, prints a warning and
#      creates an empty DataFrame.
#   6. If the file does not exist, prints an error message and sets `mixed_raw`
#      to an empty DataFrame.
#   7. Finally, it runs `gc.collect()` to trigger garbage collection and free
#      memory.
#
# Note: All previous emoji symbols in the print statements have been removed.

# Inform the user that we are starting the import for this item
print(f"\nImporting full dataset for Item: '{target_item_name}' ...")

# Check if the file path exists on disk
if os.path.exists(file_path):
    # If the file exists, attempt to import it into a DataFrame
    mixed_raw = import_file_to_dataframe(file_path)

    # If the DataFrame is successfully loaded and not empty
    if mixed_raw is not None and not mixed_raw.empty:
        # Print a success message including row count with thousands separator
        print(f"Full dataset loaded successfully: {len(mixed_raw):,} rows total.")

        try:
            # Try to display the first few rows (Jupyter / IPython display)
            display(mixed_raw.head())
        except Exception:
            # If display is not available, fall back to printing as plain text
            print(mixed_raw.head().to_string(index=False))
    else:
        # If DataFrame is None or empty, warn and create an empty DataFrame
        print("Dataset appears empty or could not be loaded.")
        mixed_raw = pd.DataFrame()
else:
    # If the file does not exist, report an error and set mixed_raw to empty
    print(f"File not found: {file_path}")
    mixed_raw = pd.DataFrame()

# Run garbage collection to free up memory after the load attempt
gc.collect()



Importing full dataset for Item: 'Net_Income_Used_to_Calculate_Basic_EPS' ...
Full dataset loaded successfully: 3,750,698 rows total.


Unnamed: 0,ID,CompanyName,ImplCountry,CurrentCurrency,HistCurrency,PIT Date,Frequency,UpdateCode,FiscalPeriod,FYE Month,ItemCode,Value
0,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1992,December,1706,97.90298
1,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1993,December,1706,92.230294
2,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1994,December,1706,110.555887
3,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1996-05-03,A,3,1995,December,1706,-43.448501
4,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1998-07-03,A,3,1996,December,1706,39.188143


0

#### Encode Frequency Code (Check of output required!)

In [84]:
# =====================================================================================
# SUMMARY
# =====================================================================================
# This snippet provides:
#
# 1. A helper function `last2` that returns the last two digits of a number as a
#    zero-padded string (for building YY strings).
#
# 2. A function `add_str_fiscalprd(df)` which:
#    - Works on a copy of an input DataFrame containing financial periods.
#    - Normalizes the 'Frequency' (upper-case, no missing).
#    - Stores the original 'FiscalPeriod' and converts it to numeric.
#    - Creates a string representation 'Str_FiscalPrd' depending on the frequency:
#         - Q/C/E/R: quarter-based ("QnYyy")
#         - A/B: annual ("Yyy")
#         - F/S: semiannual ("SnYyy")
#         - K/T/L/U: trimester-like ("TnYyy")
#    - Derives an implied full-year integer 'ImplFiscPer_Calculated' from the
#      two-digit year (80–99 => 19xx, else 20xx).
#    - For annual rows (A/B), checks discrepancies between original
#      'FiscalPeriod' and implied full-year; prints a small preview & total count.
#    - Overwrites 'FiscalPeriod' with 'ImplFiscPer_Calculated' and drops helper
#      columns.
#
# 3. A small driver block that:
#    - Checks that `mixed_raw` exists and is non-empty.
#    - Applies `add_str_fiscalprd` to produce `mixed_encoded`.
#    - Displays a head preview or prints a message and sets `mixed_encoded = None`
#      if input is missing/empty.

def last2(n):
    """Return last two digits as zero-padded string, or None if NaN."""
    # If input is NaN (or pandas-style missing), return None (no digits)
    if pd.isna(n):
        return None
    # Convert to int, format as 4-digit zero-padded string, take last 2 chars
    # Example: n=2023 -> "2023"[-2:] -> "23"
    return f"{int(n):04d}"[-2:]


def add_str_fiscalprd(df):
    """
    Creates 'Str_FiscalPrd' and overwrites 'FiscalPeriod' with an implied full year.
    Also prints discrepancies for Annual (A,B) rows between original and calculated year.

    The mapping logic:
      - Quarterly (C, Q, E, R):   fp encodes a quarter index -> "Q{1-4}Y{yy}"
      - Annual   (A, B):         fp is the year itself      -> "Y{yy}"
      - Semiannual (F, S):       fp encodes half-year       -> "S{1-2}Y{yy}"
      - Trimester-like (K, T,
         L, U):                  fp encodes trimester       -> "T{1-3}Y{yy}"
    Then we re-infer the full year from the YY part with 19xx/20xx rule.
    """
    # Work on a copy to avoid mutating the original input DataFrame
    df = df.copy()

    # Normalize frequency codes:
    # - uppercase for consistency
    # - fill missing values with empty string
    df["Frequency"] = df["Frequency"].str.upper().fillna("")

    # Store original FiscalPeriod in a separate column for later comparison
    df['Original_FiscalPeriod'] = df['FiscalPeriod']

    # Convert FiscalPeriod to numeric, coerce errors to NaN
    fp = pd.to_numeric(df["FiscalPeriod"], errors="coerce")

    # Masks for different frequency groups
    m_quarter = df["Frequency"].isin(["C", "Q", "E", "R"])  # quarter-based
    m_AB      = df["Frequency"].isin(["A", "B"])            # annual
    m_FS      = df["Frequency"].isin(["F", "S"])            # semiannual
    m_KTLU    = df["Frequency"].isin(["K", "T", "L", "U"])  # trimester-like

    # Initialize Str_FiscalPrd as NaN; we will fill per frequency group
    df["Str_FiscalPrd"] = np.nan

    # -------------------------
    # Quarter-based encoding
    # -------------------------
    # Quarter number: (fp % 4) + 1 => yields 1..4 (if fp is integer-based encoding)
    q_part = ((fp % 4) + 1).where(m_quarter)
    # Year part for quarter: integer division by 4, then take last two digits
    q_year = (fp // 4).where(m_quarter).apply(last2)
    # Assign "Q{quarter}Y{yy}" for quarter frequencies
    df.loc[m_quarter, "Str_FiscalPrd"] = (
        "Q" +
        q_part.astype("Int64").astype(str) +
        "Y" +
        q_year.fillna('')
    )

    # -------------------------
    # Annual encoding (A, B)
    # -------------------------
    # Year part is fp itself for A/B; take last two digits via last2
    ab_year = fp.where(m_AB).apply(last2)
    # Assign "Y{yy}" for annual frequencies
    df.loc[m_AB, "Str_FiscalPrd"] = "Y" + ab_year.fillna('')

    # -------------------------
    # Semiannual encoding (F, S)
    # -------------------------
    # Semester number: (fp % 2) + 1 => 1 or 2
    fs_sem  = ((fp % 2) + 1).where(m_FS)
    # Year part: fp // 2, then last two digits
    fs_year = (fp // 2).where(m_FS).apply(last2)
    # Assign "S{sem}Y{yy}" for F/S
    df.loc[m_FS, "Str_FiscalPrd"] = (
        "S" +
        fs_sem.astype("Int64").astype(str) +
        "Y" +
        fs_year.fillna('')
    )

    # -------------------------
    # Trimester-like encoding (K, T, L, U)
    # -------------------------
    # Term number: (fp % 3) + 1 => 1..3
    t_term  = ((fp % 3) + 1).where(m_KTLU)
    # Year part: fp // 3, then last two digits
    t_year  = (fp // 3).where(m_KTLU).apply(last2)
    # Assign "T{term}Y{yy}" for K/T/L/U frequencies
    df.loc[m_KTLU, "Str_FiscalPrd"] = (
        "T" +
        t_term.astype("Int64").astype(str) +
        "Y" +
        t_year.fillna('')
    )

    # ---------------------------------------------------------------------
    # Implied full year from Str_FiscalPrd (19xx / 20xx reconstruction)
    # ---------------------------------------------------------------------
    # Extract two-digit year from "Yyy" pattern, e.g., "Q1Y23" -> "23"
    year_part = df['Str_FiscalPrd'].str.extract(r'Y(\d{2})', expand=False)
    # Convert that to numeric (NaN if not parseable)
    year_numeric = pd.to_numeric(year_part, errors='coerce')

    # Map two-digit year to full year:
    #  - 80–99 => 19xx
    #  - else  => 20xx
    df['ImplFiscPer_Calculated'] = year_numeric.apply(
        lambda x: int(f"19{int(x):02d}") if pd.notna(x) and x >= 80
        else (int(f"20{int(x):02d}") if pd.notna(x) else np.nan)
    )

    # ---------------------------------------------------------------------
    # Discrepancies check for Annual frequencies (A, B)
    # ---------------------------------------------------------------------
    # Work on a subset of annual frequencies only
    annual_rows_for_check = df[m_AB].copy()

    # Build boolean mask where implied full year does NOT match the original
    discrepancy_mask_annual = ~(
        # Either they match numerically...
        (annual_rows_for_check['ImplFiscPer_Calculated'] ==
         pd.to_numeric(annual_rows_for_check['Original_FiscalPeriod'], errors='coerce'))
        |
        # ...or both are NaN
        (annual_rows_for_check['ImplFiscPer_Calculated'].isna() &
         pd.to_numeric(annual_rows_for_check['Original_FiscalPeriod'], errors='coerce').isna())
    )

    # Subset rows with discrepancies
    discrepancy_rows = annual_rows_for_check[discrepancy_mask_annual].copy()

    # If we have any, show a short sample and the total count
    if not discrepancy_rows.empty:
        print("\nDiscrepancies found between original FiscalPeriod and calculated ImplFiscPer for Annual (A, B) frequencies:")
        display(
            discrepancy_rows[
                ['ID', 'Frequency', 'Original_FiscalPeriod', 'Str_FiscalPrd', 'ImplFiscPer_Calculated']
            ].head()
        )
        print(f"Total discrepancies found for Annual frequencies: {len(discrepancy_rows)}")
    else:
        print("\nNo discrepancies found between original FiscalPeriod and calculated ImplFiscPer for Annual (A, B) frequencies.")

    # ---------------------------------------------------------------------
    # Overwrite FiscalPeriod and drop temporary helper columns
    # ---------------------------------------------------------------------
    # Replace 'FiscalPeriod' with the implied full-year value we just calculated
    df['FiscalPeriod'] = df['ImplFiscPer_Calculated']
    # Drop helper columns not needed downstream
    df.drop(columns=['Original_FiscalPeriod', 'ImplFiscPer_Calculated'], inplace=True)

    # Return the modified DataFrame
    return df


# =============================================================================
# Driver: apply encoding to mixed_raw if present and non-empty
# =============================================================================
if 'mixed_raw' in globals() and mixed_raw is not None and not mixed_raw.empty:
    # Inform which item we are encoding for
    print(f"Applying encoding to Mixed dataset for '{target_item_name}' ...")
    # Apply fiscal period encoding
    mixed_encoded = add_str_fiscalprd(mixed_raw)
    # Show a preview of the encoded data
    display(mixed_encoded.head())
else:
    # If mixed_raw is missing or empty, we skip and mark mixed_encoded as None
    print("mixed_raw not found or empty. Cannot perform encoding.")
    mixed_encoded = None


Applying encoding to Mixed dataset for 'Net_Income_Used_to_Calculate_Basic_EPS' ...


  df.loc[m_quarter, "Str_FiscalPrd"] = (



No discrepancies found between original FiscalPeriod and calculated ImplFiscPer for Annual (A, B) frequencies.


Unnamed: 0,ID,CompanyName,ImplCountry,CurrentCurrency,HistCurrency,PIT Date,Frequency,UpdateCode,FiscalPeriod,FYE Month,ItemCode,Value,Str_FiscalPrd
0,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1992,December,1706,97.90298,Y92
1,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1993,December,1706,92.230294,Y93
2,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1994,December,1706,110.555887,Y94
3,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1996-05-03,A,3,1995,December,1706,-43.448501,Y95
4,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1998-07-03,A,3,1996,December,1706,39.188143,Y96


#### Annualize data with most recent information (Check of output required!)

In [85]:
# @title
# =====================================================================================
# SUMMARY
# =====================================================================================
# This script takes an input DataFrame `mixed_encoded` (if present in the global scope)
# that contains financial time-series data (per company, item, currency, fiscal period,
# and PIT Date). It then:
#
# 1. Cleans and standardizes key columns (dates, numeric types, string IDs).
# 2. Excludes rows with certain frequencies (E/L/R/U).
# 3. Parses fiscal period strings into quarter/semester/trimester indicators (QNUM/SNUM/TNUM).
# 4. Uses a custom, vectorized "as-of" join (`asof_numpy`) to attach the most recent
#    annual, quarterly, semiannual, and trimester values for each (ID, HistCurrency,
#    ItemCode, FiscalPeriod) up to each row’s PIT Date.
# 5. Builds "full-year" candidate values from:
#       - actual annuals (A),
#       - sum of Q1..Q4 (Q4 proxy),
#       - sum of S1..S2 (S2 proxy),
#       - sum of T1..T3 (T3 proxy),
#    and selects the best candidate based on priorities and relationship to the row’s
#    fiscal period (same-year vs prior-year).
# 6. Computes an annual PIT-based metric `AnnPITValue` and compares it to the “true”
#    annual value (`TrueValue`) to derive a percentage `AnnPITValue_Pct` for QC.
# 7. Performs quality checks:
#       - Ensures no period-date is after the PIT Date.
#       - Drops rows whose `AnnPITValue_Pct` is outside the range [50, 200] or infinite.
# 8. Keeps a curated set of columns, drops helper columns, and saves:
#       - a full output file
#       - a subset file with key columns for quick inspection.
# 9. Prints row-accounting stats and frees some memory.
#
# If `mixed_encoded` is not defined or is None, it simply prints a message and exits.

import pandas as pd
import numpy as np
import os
import gc
from datetime import timedelta            # <--- Added to fix NameError
from scipy.stats.mstats import winsorize  # <--- Added to fix NameError

# Enable pandas "copy-on-write" behavior to reduce unintended chained assignment effects
pd.options.mode.copy_on_write = True

# ---------- Helper: fast as-of (right.PIT <= left.PIT) ----------

def _key(fr, cols):
    """
    Helper function to build a string key from multiple columns.
    For each row, join the values of 'cols' with '||'.
    """
    return fr[cols].astype(str).agg('||'.join, axis=1)


def asof_numpy(left_df: pd.DataFrame, right_df: pd.DataFrame, by_cols: list[str]):
    """
    For each row in left_df, find the latest (as-of) Value from right_df
    with matching by_cols and right_df['PIT Date'] <= left_df['PIT Date'].

    This is a manual, NumPy-based implementation of an "as-of merge" grouped by `by_cols`.
    It:
    - filters out rows with missing required fields,
    - normalizes PIT Date to daily precision,
    - groups right_df by composite key of `by_cols`,
    - for each left row, binary-searches into the matching right group by PIT Date,
      picking the last date <= the left PIT Date,
    - returns two aligned arrays: (values, dates) for the left_df rows.
    """
    # Initialize outputs with NaNs and NaT for all left_df rows
    out_vals  = np.full(len(left_df), np.nan, dtype='float64')
    out_dates = np.full(len(left_df), 'NaT', dtype='datetime64[ns]')

    # Required columns on each side
    left_req  = by_cols + ['PIT Date']
    right_req = by_cols + ['PIT Date', 'Value']

    # Mask: rows that have all required fields non-null
    lmask = left_df[left_req].notna().all(axis=1)
    rmask = right_df[right_req].notna().all(axis=1)

    # If either side has no valid rows, return the default empty outputs
    if not lmask.any() or not rmask.any():
        return out_vals, out_dates

    # Work on filtered copies only (avoid side effects)
    l = left_df.loc[lmask, left_req].copy()
    r = right_df.loc[rmask, right_req].copy()

    # Normalize PIT Date columns to datetime at day precision
    l['PIT Date'] = pd.to_datetime(l['PIT Date'], errors='coerce').dt.floor('D')
    r['PIT Date'] = pd.to_datetime(r['PIT Date'], errors='coerce').dt.floor('D')

    # Build composite keys for group-level match (based on by_cols)
    l['__k'] = _key(l, by_cols)
    r['__k'] = _key(r, by_cols)

    # Sort right side by key and date to enable binary search per key
    r = r.sort_values(['__k', 'PIT Date']).reset_index(drop=True)

    # Extract NumPy arrays for right side
    rk   = r['__k'].to_numpy()
    rdt  = r['PIT Date'].to_numpy()
    rval = r['Value'].to_numpy()

    # Find unique keys and first index for each key in the sorted right side
    uniq, first = np.unique(rk, return_index=True)

    # Build a dict: key -> (dates array, values array) slice
    slices = {}
    for i, k in enumerate(uniq):
        s = first[i]  # start of this key
        e = first[i + 1] if i + 1 < len(first) else len(r)  # end of this key
        slices[k] = (rdt[s:e], rval[s:e])

    # Left side indices and arrays
    l_idx = l.index.to_numpy()
    lk    = l['__k'].to_numpy()
    ldt   = l['PIT Date'].to_numpy()

    # Sort left side by key (stable mergesort to preserve original row order within key)
    order = np.argsort(lk, kind='mergesort')
    sk, sd, sp = lk[order], ldt[order], l_idx[order]

    # i iterates over the sorted left rows
    i = 0
    n = len(sk)
    while i < n:
        k = sk[i]  # current composite key
        j = i + 1
        # Find contiguous block [i:j) for this key
        while j < n and sk[j] == k:
            j += 1

        # Dates and original positions of this key’s left rows
        block_dates = sd[i:j]
        block_pos   = sp[i:j]

        # If we have matching right-hand slices for this key, do the as-of search
        if k in slices:
            r_dates, r_vals = slices[k]
            # searchsorted(..., 'right') - 1 gives index of last r_date <= block_date
            pos   = np.searchsorted(r_dates, block_dates, side='right') - 1
            valid = pos >= 0  # positions where such a date exists
            if np.any(valid):
                # Fill outputs for left rows where we found a valid match
                out_vals[block_pos[valid]]  = r_vals[pos[valid]]
                out_dates[block_pos[valid]] = r_dates[pos[valid]]
        # Move to next key block
        i = j

    return out_vals, out_dates


# ---------- Small helpers ----------

def pctile(s, q):
    """
    Safe percentile calculation: returns quantile q of Series `s`,
    or NaN if quantile fails.
    """
    try:
        return s.quantile(q, interpolation='linear')
    except Exception:
        return np.nan


def summarize_pct(series: pd.Series):
    """
    Computes a summary of finite values of a series:
      - count
      - mean, median
      - 1% winsorized mean
      - selected deciles (p10, p20, ..., p90)
    Infinite values are treated as NaN and removed.
    If no finite values remain, returns an empty dict.
    """
    # Replace +/-inf with NaN and drop all NaNs
    s = series.replace([np.inf, -np.inf], np.nan).dropna()
    if s.empty:
        return {}
    
    # FIX APPLIED HERE: .to_numpy().copy() ensures the array is writable for winsorize
    w_mean = winsorize(s.to_numpy().copy(), limits=[0.01, 0.01]).mean()
    
    return {
        "finite_rows": len(s),
        "mean": s.mean(),
        "median": s.median(),
        "winsorized_mean_1pct": w_mean,
        "p10": pctile(s, 0.10), "p20": pctile(s, 0.20), "p30": pctile(s, 0.30),
        "p40": pctile(s, 0.40), "p50": pctile(s, 0.50), "p60": pctile(s, 0.60),
        "p70": pctile(s, 0.70), "p80": pctile(s, 0.80), "p90": pctile(s, 0.90)
    }


# ---------- Priority for full-year candidates ----------

# Fixed priority mapping for full-year candidates:
#   'A'  : actual annual value
#   'Q4' : annual proxy from four quarters
#   'T3' : annual proxy from three trimesters
#   'S2' : annual proxy from two semesters
_PERIOD_PRIORITY = {
    'A': 100,  # highest priority: actual annual
    'Q4': 90,  # then Q1+Q2+Q3+Q4
    'T3': 80,  # then T1+T2+T3
    'S2': 70,  # then S1+S2
}

# ============================ MAIN ============================

# Only run the main logic if a global DataFrame `mixed_encoded` exists and is not None
if 'mixed_encoded' in globals() and mixed_encoded is not None:
    # Count initial input rows
    input_rows = len(mixed_encoded)
    print(f"Input dataset contains {input_rows:,} rows before processing.\n")

    # Work on a copy of the input dataset
    working = mixed_encoded.copy()

    # -------------------------------------------------------------------------
    # Exclude certain frequencies (E/L/R/U)
    # -------------------------------------------------------------------------
    # Create mask of rows whose Frequency is one of E, L, R, U (case-insensitive)
    excl_mask = working['Frequency'].astype(str).str.upper().isin(['E', 'L', 'R', 'U'])
    # Count how many rows will be excluded
    excluded_rows = int(excl_mask.sum())
    # Keep only rows that are NOT in the exclusion set
    working = working.loc[~excl_mask].copy()

    # -------------------------------------------------------------------------
    # Set dtypes and normalize important columns
    # -------------------------------------------------------------------------
    # Convert PIT Date to datetime (coerce errors -> NaT), floor to day
    working['PIT Date']     = pd.to_datetime(working['PIT Date'], errors='coerce').dt.floor('D')
    # FiscalPeriod: numeric (e.g., 2021, 2022, ...)
    working['FiscalPeriod'] = pd.to_numeric(working['FiscalPeriod'], errors='coerce')
    # Value: numeric (float)
    working['Value']        = pd.to_numeric(working['Value'], errors='coerce')

    # Convert key ID / code columns to string to ensure consistency
    for c in ['ID', 'HistCurrency', 'ItemCode', 'Frequency', 'Str_FiscalPrd']:
        if c in working.columns:
            working[c] = working[c].astype(str)

    # -------------------------------------------------------------------------
    # Parse Q/S/T markers from Str_FiscalPrd (like 'Q1Y2023')
    # -------------------------------------------------------------------------
    # Extract quarter number Q1..Q4 from e.g. "Q1Y2023" into QNUM
    working['QNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^Q([1-4])Y', expand=False),
        errors='coerce'
    )
    # Extract semiannual number S1..S2 into SNUM
    working['SNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^S([1-2])Y', expand=False),
        errors='coerce'
    )
    # Extract trimester number T1..T3 into TNUM
    working['TNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^T([1-3])Y', expand=False),
        errors='coerce'
    )

    # -------------------------------------------------------------------------
    # Ensure period columns exist (Q1..Q4, S1..S2, T1..T3, A + their date cols)
    # -------------------------------------------------------------------------
    # Create value columns for Q1..Q4, S1..S2, T1..T3, A if they are missing
    for c in [*(f'Q{i}' for i in range(1, 5)),
              *(f'S{i}' for i in range(1, 3)),
              *(f'T{i}' for i in range(1, 4)),
              'A']:
        if c not in working.columns:
            working[c] = np.nan

    # Create corresponding *_Date columns if missing
    for c in [*(f'Q{i}_Date' for i in range(1, 5)),
              *(f'S{i}_Date' for i in range(1, 3)),
              *(f'T{i}_Date' for i in range(1, 4)),
              'A_Date']:
        if c not in working.columns:
            working[c] = pd.NaT

    # Base key for many of the as-of mappings
    base_keys = ['ID', 'HistCurrency', 'ItemCode', 'FiscalPeriod']

    # -------------------------------------------------------------------------
    # 1) Derive TrueValue from annuals (A/B frequencies)
    # -------------------------------------------------------------------------
    # Mask annual-like rows where Value is present
    mask_annual = working['Frequency'].isin(['A', 'B']) & working['Value'].notna()
    # annual_src: one row per (ID, FiscalPeriod, HistCurrency) with last PIT Date
    annual_src = (
        working.loc[mask_annual,
                    ['ID', 'FiscalPeriod', 'HistCurrency', 'PIT Date', 'Value']]
        .sort_values(['ID', 'FiscalPeriod', 'HistCurrency', 'PIT Date'])
        .drop_duplicates(['ID', 'FiscalPeriod', 'HistCurrency'], keep='last')
        .rename(columns={'Value': 'TrueValue', 'PIT Date': 'TrueValue_Date'})
    )
    # Left-join true annual value back onto working
    working = working.merge(
        annual_src,
        on=['ID', 'FiscalPeriod', 'HistCurrency'],
        how='left'
    )

    # -------------------------------------------------------------------------
    # 2) As-of mapping (same FiscalPeriod) for A/Q/S/T
    # -------------------------------------------------------------------------

    # ----- Annual -----
    # Source rows for annual frequencies A/B
    src_A = working.loc[
        working['Frequency'].isin(['A', 'B']) & working['Value'].notna(),
        base_keys + ['PIT Date', 'Value']
    ].copy()
    # As-of join: for each working row, get most recent annual value by PIT Date
    vA, dA = asof_numpy(working, src_A, by_cols=base_keys)
    working['A'], working['A_Date'] = vA, dA
    # Origin fiscal period of annual value (same as row's FiscalPeriod when present)
    working['A_OriginFP'] = np.where(
        working['A'].notna(), working['FiscalPeriod'], np.nan
    )

    # ----- Quarterly -----
    # Source rows for quarterly frequencies (Q/C) with valid QNUM
    src_Q = working.loc[
        working['Frequency'].isin(['Q', 'C']) & working['QNUM'].notna(),
        base_keys + ['QNUM', 'PIT Date', 'Value']
    ].copy()
    for q in (1, 2, 3, 4):
        # Restrict to a specific quarter q
        rv = src_Q[src_Q['QNUM'] == q].drop(columns=['QNUM'])
        # As-of join for that quarter
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'Q{q}', f'Q{q}_Date'
        working[col], working[dcol] = v, d
        # Origin FP column for that quarter
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        # Fill origin FP only where quarter value is non-null and origin not yet set
        m = working[col].notna() & working[ocol].isna()
        working.loc[m, ocol] = working.loc[m, 'FiscalPeriod']

    # ----- Semiannual -----
    # Source rows for semiannual frequencies (S/F) with valid SNUM
    src_S = working.loc[
        working['Frequency'].isin(['S', 'F']) & working['SNUM'].notna(),
        base_keys + ['SNUM', 'PIT Date', 'Value']
    ].copy()
    for s in (1, 2):
        rv = src_S[src_S['SNUM'] == s].drop(columns=['SNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'S{s}', f'S{s}_Date'
        working[col], working[dcol] = v, d
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        m = working[col].notna() & working[ocol].isna()
        working.loc[m, ocol] = working.loc[m, 'FiscalPeriod']

    # ----- Trimester -----
    # Source rows for trimester frequencies (T/K) with valid TNUM
    src_T = working.loc[
        working['Frequency'].isin(['T', 'K']) & working['TNUM'].notna(),
        base_keys + ['TNUM', 'PIT Date', 'Value']
    ].copy()
    for t in (1, 2, 3):
        rv = src_T[src_T['TNUM'] == t].drop(columns=['TNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'T{t}', f'T{t}_Date'
        working[col], working[dcol] = v, d
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        m = working[col].notna() & working[ocol].isna()
        working.loc[m, ocol] = working.loc[m, 'FiscalPeriod']

    # -------------------------------------------------------------------------
    # 3) Prepare labels & normalize dates (NO prev-year fill, NO forward-fill)
    # -------------------------------------------------------------------------
    # Sort working data consistently for downstream calculations
    working = working.sort_values(['ID', 'HistCurrency', 'FiscalPeriod', 'PIT Date'])

    # List of all period value columns
    value_cols_all  = [f'Q{i}' for i in range(1, 5)] + \
                      [f'S{i}' for i in range(1, 3)] + \
                      [f'T{i}' for i in range(1, 4)] + ['A']
    # Corresponding date columns
    date_cols_all   = [f'{c}_Date' for c in value_cols_all]
    # Corresponding origin FP columns
    origin_cols_all = [f'{c}_OriginFP' for c in value_cols_all]

    # Ensure that all date columns are proper datetimes (floor to day)
    # Note: explicitly no groupby-forward-fill here – only asof-filled values remain
    for c in date_cols_all:
        if c in working.columns:
            working[c] = pd.to_datetime(working[c], errors='coerce').dt.floor('D')

    # -------------------------------------------------------------------------
    # 4) Build full-year candidates from fixed sets (Q1–Q4, S1–S2, T1–T3)
    # -------------------------------------------------------------------------
    def full_year_from_fixed(row, labels, pit, cutoff):
        """
        Fixed full-year from given labels (e.g. Q1..Q4, S1..S2, T1..T3):

          - Requires ALL labels to have:
              value, date, origin_fp.
          - All dates must be within [cutoff, pit].
          - origin_fp = max(origin_fp_i)  (the 'newest year' across components).

        Returns (dt, val, origin_fp) or (NaT, NaN, NaN) if any requirement fails.
        """
        vals, dts, fps = [], [], []
        for lbl in labels:
            v = row.get(lbl, np.nan)
            d = row.get(f'{lbl}_Date', pd.NaT)
            o = row.get(f'{lbl}_OriginFP', np.nan)

            # Require non-null for value, date, origin FP
            if pd.isna(v) or pd.isna(d) or pd.isna(o):
                return pd.NaT, np.nan, np.nan

            # Normalize date and check it lies within [cutoff, pit]
            d = pd.to_datetime(d, errors='coerce')
            if pd.isna(d) or not (cutoff <= d <= pit):
                return pd.NaT, np.nan, np.nan

            vals.append(float(v))
            dts.append(d)
            fps.append(int(o))

        # Sum all component values, pick latest date and max origin FP
        total_val = float(np.nansum(vals))
        latest_dt = max(dts)
        origin_fp = max(fps)
        return latest_dt, total_val, origin_fp

    def pick_annpit_sum_with_origin(row):
        """
        For each row, pick the best annual PIT-based value (AnnPITValue) by:

        1) Building a time window [PIT - 365 days, PIT].
        2) Generating candidate annual values:
           - actual annual A (if within window and with origin FP),
           - Q4 proxy from Q1..Q4,
           - S2 proxy from S1..S2,
           - T3 proxy from T1..T3,
           each with associated priority and origin FP.
        3) Ranking candidates according to:
           - Same-year A (highest),
           - Same-year proxies (Q4/T3/S2),
           - Prior-year A,
           - Prior-year proxies,
           - Otherwise: any remaining candidate by (priority, latest date).
        4) Returning the chosen candidate's value (keeping zeros, but filtering NaNs).
        """
        pit = row['PIT Date']
        if pd.isna(pit):
            return np.nan
        cutoff = pit - timedelta(days=365)  # Now works because timedelta is imported

        # Current row's fiscal period, cast to int if possible
        fp = row.get('FiscalPeriod', np.nan)
        try:
            fp_int = int(fp) if not pd.isna(fp) else None
        except Exception:
            fp_int = None

        # Collect candidate tuples: (label, priority, date, value, origin_fp)
        candidates = []

        # --- Candidate A: actual annual (0 is allowed)
        A_val = row.get('A', np.nan)
        A_dt  = row.get('A_Date', pd.NaT)
        A_ofp = row.get('A_OriginFP', np.nan)
        if pd.notna(A_val) and pd.notna(A_dt) and not pd.isna(A_ofp):
            A_dt = pd.to_datetime(A_dt, errors='coerce')
            if pd.notna(A_dt) and (cutoff <= A_dt <= pit):
                candidates.append(('A', _PERIOD_PRIORITY['A'], A_dt, float(A_val), int(A_ofp)))

        # --- Candidate Q4: sum of Q1..Q4 (0 allowed)
        q4_dt, q4_val, q4_fp = full_year_from_fixed(
            row, [f'Q{i}' for i in range(1, 5)], pit, cutoff
        )
        if pd.notna(q4_val) and pd.notna(q4_dt) and not pd.isna(q4_fp):
            candidates.append(('Q4', _PERIOD_PRIORITY['Q4'], q4_dt, float(q4_val), int(q4_fp)))

        # --- Candidate S2: sum of S1..S2 (0 allowed)
        s2_dt, s2_val, s2_fp = full_year_from_fixed(
            row, [f'S{i}' for i in range(1, 3)], pit, cutoff
        )
        if pd.notna(s2_val) and pd.notna(s2_dt) and not pd.isna(s2_fp):
            candidates.append(('S2', _PERIOD_PRIORITY['S2'], s2_dt, float(s2_val), int(s2_fp)))

        # --- Candidate T3: sum of T1..T3 (0 allowed)
        t3_dt, t3_val, t3_fp = full_year_from_fixed(
            row, [f'T{i}' for i in range(1, 4)], pit, cutoff
        )
        if pd.notna(t3_val) and pd.notna(t3_dt) and not pd.isna(t3_fp):
            candidates.append(('T3', _PERIOD_PRIORITY['T3'], t3_dt, float(t3_val), int(t3_fp)))

        # If no candidates, return NaN
        if not candidates:
            return np.nan

        # Filter out NaN-valued candidates (keep 0-valued ones)
        def valid(seq):
            return [c for c in seq if not np.isnan(c[3])]

        # Relationship between row's fiscal period and candidate's origin FP:
        # same / prior / other / unknown
        def rel(c):
            _, _, _, _, ofp = c
            if fp_int is None or ofp is None:
                return 'unknown'
            if ofp == fp_int:
                return 'same'
            if ofp == fp_int - 1:
                return 'prior'
            return 'other'

        # 1) Same-year actual annual A (max by date)
        same_A = valid(c for c in candidates if c[0] == 'A' and rel(c) == 'same')
        if same_A:
            best = max(same_A, key=lambda x: x[2])
            return best[3]

        # 2) Same-year proxies (Q4/T3/S2), max by (priority, date)
        same_proxies = valid(
            c for c in candidates
            if c[0] in ('Q4', 'S2', 'T3') and rel(c) == 'same'
        )
        if same_proxies:
            best = max(same_proxies, key=lambda x: (x[1], x[2]))
            return best[3]

        # 3) Prior-year actual annual A
        prior_A = valid(c for c in candidates if c[0] == 'A' and rel(c) == 'prior')
        if prior_A:
            best = max(prior_A, key=lambda x: x[2])
            return best[3]

        # 4) Prior-year proxies (Q4/S2/T3) if no prior A
        prior_proxies = valid(
            c for c in candidates
            if c[0] in ('Q4', 'S2', 'T3') and rel(c) == 'prior'
        )
        if prior_proxies:
            best = max(prior_proxies, key=lambda x: (x[1], x[2]))
            return best[3]

        # 5) Fallback: any candidate (other/unknown) by (priority, date)
        others = valid(candidates)
        if others:
            best = max(others, key=lambda x: (x[1], x[2]))
            return best[3]

        # Final fallback: 0.0 (should rarely be reached)
        return 0.0

    # Apply the selection function row-wise to produce AnnPITValue
    working['AnnPITValue'] = working.apply(pick_annpit_sum_with_origin, axis=1)

    # -------------------------------------------------------------------------
    # 5) QC: Future-date check + PRE-DROP stats
    # -------------------------------------------------------------------------
    # Columns whose dates should not exceed PIT Date
    date_cols = [
        'A_Date',
        'Q1_Date', 'Q2_Date', 'Q3_Date', 'Q4_Date',
        'S1_Date', 'S2_Date',
        'T1_Date', 'T2_Date', 'T3_Date'
    ]
    # Restrict to ones actually present
    present = [c for c in date_cols if c in working.columns]

    viol_counts = {}  # per-label violation counts
    # Mask for rows with any future-dated period
    any_mask = pd.Series(False, index=working.index)

    for c in present:
        # A violation is when period date > PIT Date (both need to be non-null)
        m = (
            working[c].notna() &
            working['PIT Date'].notna() &
            (pd.to_datetime(working[c], errors='coerce') > working['PIT Date'])
        )
        viol_counts[c] = int(m.sum())
        any_mask |= m  # accumulate violations across columns

    total_future_viol = int(any_mask.sum())
    print("\n=== Future-date check (period dates > PIT Date) ===")
    print("Per-label violations:", viol_counts)
    print(f"Rows with ANY future-dated period value: {total_future_viol}")
    # Flagrows with at least one future-date error
    working['HasFutureDateError'] = any_mask

    # -------------------------------------------------------------------------
    # 6) AnnPITValue_Pct + quality drop
    # -------------------------------------------------------------------------
    # Compute AnnPITValue as % of TrueValue (only when TrueValue != 0)
    working['AnnPITValue_Pct'] = np.where(
        working['AnnPITValue'].notna() &
        working['TrueValue'].notna() &
        (working['TrueValue'] != 0),
        (working['AnnPITValue'] / working['TrueValue']) * 100,
        np.nan
    )

    # Stats before dropping low-quality rows
    pre_stats = summarize_pct(working['AnnPITValue_Pct'])
    print("\n=== AnnPITValue_Pct summary (finite only) — BEFORE quality drop ===")
    for k, v in pre_stats.items():
        print(f"{k:>20}: {v}")

    # Build masks for dropping:
    pct = working['AnnPITValue_Pct']
    is_inf = np.isinf(pct)  # infinite percentages
    is_finite = np.isfinite(pct)
    # Out-of-range if % > 200 or % < 50 (but finite)
    out_of_range = is_finite & ((pct > 200) | (pct < 50))
    # Rows to drop: infinite or out-of-range values
    to_drop_quality = is_inf | out_of_range

    dropped_quality_rows = int(to_drop_quality.sum())
    print(f"\nRows to drop due to AnnPITValue_Pct (±inf or >200 or <50): {dropped_quality_rows:,}")

    # Keep only rows that passed the quality filter
    working = working.loc[~to_drop_quality].copy()

    # Stats after dropping
    post_stats = summarize_pct(working['AnnPITValue_Pct'])
    print("\n=== AnnPITValue_Pct summary — AFTER quality drop ===")
    if post_stats:
        for k, v in post_stats.items():
            print(f"{k:>20}: {v}")
    else:
        print("No finite values remain after the quality drop.")

    # -------------------------------------------------------------------------
    # 7) Final columns & save
    # -------------------------------------------------------------------------
    # Base descriptive columns to keep (if present)
    base_cols = [
        'ID', 'CompanyName', 'ImplCountry', 'CurrentCurrency', 'HistCurrency',
        'PIT Date', 'Frequency', 'UpdateCode', 'FiscalPeriod', 'FYE Month',
        'ItemCode', 'Value', 'Str_FiscalPrd'
    ]

    # Build ordered list of period date/value columns
    freq_cols = []
    for i in range(1, 5):
        freq_cols += [f'Q{i}_Date', f'Q{i}']
    for i in range(1, 3):
        freq_cols += [f'S{i}_Date', f'S{i}']
    for i in range(1, 4):
        freq_cols += [f'T{i}_Date', f'T{i}']
    freq_cols += ['A_Date', 'A']

    # Final set of columns to keep in output
    keep_cols = (
        [c for c in base_cols if c in working.columns] +
        ['TrueValue', 'AnnPITValue', 'AnnPITValue_Pct', 'HasFutureDateError'] +
        [c for c in freq_cols if c in working.columns]
    )

    # Helper columns to drop before export
    drop_cols = ['QNUM', 'SNUM', 'TNUM', 'TrueValue_Date']
    # Also drop all *_OriginFP columns
    drop_cols += [c for c in working.columns if c.endswith('_OriginFP')]
    working.drop(columns=[c for c in drop_cols if c in working.columns],
                 inplace=True, errors='ignore')

    # Reorder and restrict columns to the final layout
    mixed_processed = working.reindex(columns=keep_cols)

    # Sanity checks: necessary globals must exist
    assert 'Temp_file_path_DP' in globals(), "Temp_file_path_DP not found."
    assert 'base_output_filename' in globals(), "base_output_filename not found (set in Cell 0)."

    # Build full output path and save pipe-delimited file
    out_full = os.path.join(Temp_file_path_DP, f"{base_output_filename}.txt")
    mixed_processed.to_csv(out_full, sep='|', index=False)
    print("\nSaved full:", out_full)

    # Create a subset for lighter inspection
    subset_cols = ["ID", "PIT Date", "CompanyName", "HistCurrency",
                   "FiscalPeriod", "AnnPITValue"]
    # Only keep subset columns that actually exist
    subset_cols_existing = [col for col in subset_cols if col in mixed_processed.columns]
    subset_df = mixed_processed[subset_cols_existing].copy()
    out_subset = os.path.join(Temp_file_path_DP,
                              f"{base_output_filename}_subset.txt")
    subset_df.to_csv(out_subset, sep='|', index=False)
    print("Saved subset:", out_subset)
    # Explicitly delete subset_df to free memory
    del subset_df

    # -------------------------------------------------------------------------
    # 8) Row-accounting overview
    # -------------------------------------------------------------------------
    output_rows = len(mixed_processed)
    print("\n=== Row Accounting ===")
    print(f"Input rows:                     {input_rows:,}")
    print(f"Excludedby Frequency (E/L/R/U): {excluded_rows:,}")
    print(f"Dropped by quality (Pct rules): {dropped_quality_rows:,}")
    print(f"Output rows (final):            {output_rows:,}")
    # Sum up excluded + dropped + remaining and check against original count
    check_total = excluded_rows + dropped_quality_rows + output_rows
    print(f"Check: excluded + dropped + output = {check_total:,}")
    if check_total == input_rows:
        print("Row counts reconcile exactly.")
    else:
        print(f"Mismatch of {input_rows - check_total:+,} rows. "
              "Investigate upstream filtering or unexpected drops.")

    # Trigger garbage collection as a final cleanup step
    gc.collect()

else:
    # If the main input dataset is not available, skip all processing
    print("mixed_encoded not found or None; skipping.")

Input dataset contains 3,750,698 rows before processing.


=== Future-date check (period dates > PIT Date) ===
Per-label violations: {'A_Date': 0, 'Q1_Date': 0, 'Q2_Date': 0, 'Q3_Date': 0, 'Q4_Date': 0, 'S1_Date': 0, 'S2_Date': 0, 'T1_Date': 0, 'T2_Date': 0, 'T3_Date': 0}
Rows with ANY future-dated period value: 0

=== AnnPITValue_Pct summary (finite only) — BEFORE quality drop ===
         finite_rows: 2187307
                mean: 27694.772770478314
              median: 100.0
winsorized_mean_1pct: 99.52074516226975
                 p10: 100.0
                 p20: 100.0
                 p30: 100.0
                 p40: 100.0
                 p50: 100.0
                 p60: 100.0
                 p70: 100.0
                 p80: 100.0
                 p90: 100.0

Rows to drop due to AnnPITValue_Pct (±inf or >200 or <50): 38,759

=== AnnPITValue_Pct summary — AFTER quality drop ===
         finite_rows: 2148548
                mean: 100.18606378381752
              median: 100.0
wins

### Mixed 8

#### Set Index

In [86]:
# =====================================================================================
# SUMMARY
# =====================================================================================
# This code selects which Mixed_* dataset should be processed by choosing an index
# (e.g., Mixed_1, Mixed_2, ...). It then:
#
#   1. Ensures that a dictionary `mixed_vars` exists, mapping keys like "Mixed_1"
#      to item names.
#   2. Builds the key corresponding to the selected index and retrieves the
#      associated item name (`target_item_name`).
#   3. Prints which Mixed_* item was selected.
#   4. Constructs paths and filenames based on global variables and the selected item.
#   5. Ensures that the output directory exists by creating it if necessary.
#
# The goal is to centralize selection of a single Mixed_* dataset and prepare paths
# for downstream processing.


# === Select which Mixed_* item to run ===
mixed_index = 8  # Change this to process another dataset (e.g., 10)

# Validate that the dictionary of mixed item names exists
assert 'mixed_vars' in globals(), "mixed_vars dict not found in globals()."

# Build the key (e.g., "Mixed_1") and retrieve the associated item name
item_key = f"Mixed_{mixed_index}"
target_item_name = mixed_vars.get(item_key)
assert target_item_name, f"{item_key} not found in mixed_vars."

# Inform which item was selected
print(f"Selected: {item_key}  ->  ItemName: '{target_item_name}'")

# === Paths (reusing globals) ===
assert 'Temp_file_path_DP' in globals(), "Temp_file_path_DP not found."

# Construct the name of the input file for the selected item
file_name = f"work_subset_{target_item_name}.txt"
file_path = os.path.join(Temp_file_path_DP, file_name)

# Construct the base name for output files (will later be suffixed)
base_output_filename = f"Mixed_{target_item_name}_complete"

# Ensure the output directory exists; create it (including parent dirs) if needed
Path(Temp_file_path_DP).mkdir(parents=True, exist_ok=True)


Selected: Mixed_8  ->  ItemName: 'Net_Sales_or_Revenues'


#### Import relevant data



In [87]:
# =====================================================================================
# SUMMARY
# =====================================================================================
# This block:
#   1. Announces the import of a full dataset for the given `target_item_name`.
#   2. Checks whether the file at `file_path` exists.
#   3. If it exists, calls `import_file_to_dataframe(file_path)` to load the data
#      into `mixed_raw`.
#   4. If the loaded DataFrame is non-empty, prints a success message including
#      the number of rows and shows the first few rows (via display or fallback
#      to text printing).
#   5. If the load fails or returns an empty DataFrame, prints a warning and
#      creates an empty DataFrame.
#   6. If the file does not exist, prints an error message and sets `mixed_raw`
#      to an empty DataFrame.
#   7. Finally, it runs `gc.collect()` to trigger garbage collection and free
#      memory.
#
# Note: All previous emoji symbols in the print statements have been removed.

# Inform the user that we are starting the import for this item
print(f"\nImporting full dataset for Item: '{target_item_name}' ...")

# Check if the file path exists on disk
if os.path.exists(file_path):
    # If the file exists, attempt to import it into a DataFrame
    mixed_raw = import_file_to_dataframe(file_path)

    # If the DataFrame is successfully loaded and not empty
    if mixed_raw is not None and not mixed_raw.empty:
        # Print a success message including row count with thousands separator
        print(f"Full dataset loaded successfully: {len(mixed_raw):,} rows total.")

        try:
            # Try to display the first few rows (Jupyter / IPython display)
            display(mixed_raw.head())
        except Exception:
            # If display is not available, fall back to printing as plain text
            print(mixed_raw.head().to_string(index=False))
    else:
        # If DataFrame is None or empty, warn and create an empty DataFrame
        print("Dataset appears empty or could not be loaded.")
        mixed_raw = pd.DataFrame()
else:
    # If the file does not exist, report an error and set mixed_raw to empty
    print(f"File not found: {file_path}")
    mixed_raw = pd.DataFrame()

# Run garbage collection to free up memory after the load attempt
gc.collect()



Importing full dataset for Item: 'Net_Sales_or_Revenues' ...
Full dataset loaded successfully: 4,003,129 rows total.


Unnamed: 0,ID,CompanyName,ImplCountry,CurrentCurrency,HistCurrency,PIT Date,Frequency,UpdateCode,FiscalPeriod,FYE Month,ItemCode,Value
0,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1992,December,1001,1707.334371
1,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1993,December,1001,1921.517802
2,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1994,December,1001,2263.705199
3,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1996-05-03,A,3,1995,December,1001,1447.33116
4,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1998-07-03,A,3,1996,December,1001,1217.952697


0

#### Encode Frequency Code (Check of output required!)

In [88]:
# =====================================================================================
# SUMMARY
# =====================================================================================
# This snippet provides:
#
# 1. A helper function `last2` that returns the last two digits of a number as a
#    zero-padded string (for building YY strings).
#
# 2. A function `add_str_fiscalprd(df)` which:
#    - Works on a copy of an input DataFrame containing financial periods.
#    - Normalizes the 'Frequency' (upper-case, no missing).
#    - Stores the original 'FiscalPeriod' and converts it to numeric.
#    - Creates a string representation 'Str_FiscalPrd' depending on the frequency:
#         - Q/C/E/R: quarter-based ("QnYyy")
#         - A/B: annual ("Yyy")
#         - F/S: semiannual ("SnYyy")
#         - K/T/L/U: trimester-like ("TnYyy")
#    - Derives an implied full-year integer 'ImplFiscPer_Calculated' from the
#      two-digit year (80–99 => 19xx, else 20xx).
#    - For annual rows (A/B), checks discrepancies between original
#      'FiscalPeriod' and implied full-year; prints a small preview & total count.
#    - Overwrites 'FiscalPeriod' with 'ImplFiscPer_Calculated' and drops helper
#      columns.
#
# 3. A small driver block that:
#    - Checks that `mixed_raw` exists and is non-empty.
#    - Applies `add_str_fiscalprd` to produce `mixed_encoded`.
#    - Displays a head preview or prints a message and sets `mixed_encoded = None`
#      if input is missing/empty.

def last2(n):
    """Return last two digits as zero-padded string, or None if NaN."""
    # If input is NaN (or pandas-style missing), return None (no digits)
    if pd.isna(n):
        return None
    # Convert to int, format as 4-digit zero-padded string, take last 2 chars
    # Example: n=2023 -> "2023"[-2:] -> "23"
    return f"{int(n):04d}"[-2:]


def add_str_fiscalprd(df):
    """
    Creates 'Str_FiscalPrd' and overwrites 'FiscalPeriod' with an implied full year.
    Also prints discrepancies for Annual (A,B) rows between original and calculated year.

    The mapping logic:
      - Quarterly (C, Q, E, R):   fp encodes a quarter index -> "Q{1-4}Y{yy}"
      - Annual   (A, B):         fp is the year itself      -> "Y{yy}"
      - Semiannual (F, S):       fp encodes half-year       -> "S{1-2}Y{yy}"
      - Trimester-like (K, T,
         L, U):                  fp encodes trimester       -> "T{1-3}Y{yy}"
    Then we re-infer the full year from the YY part with 19xx/20xx rule.
    """
    # Work on a copy to avoid mutating the original input DataFrame
    df = df.copy()

    # Normalize frequency codes:
    # - uppercase for consistency
    # - fill missing values with empty string
    df["Frequency"] = df["Frequency"].str.upper().fillna("")

    # Store original FiscalPeriod in a separate column for later comparison
    df['Original_FiscalPeriod'] = df['FiscalPeriod']

    # Convert FiscalPeriod to numeric, coerce errors to NaN
    fp = pd.to_numeric(df["FiscalPeriod"], errors="coerce")

    # Masks for different frequency groups
    m_quarter = df["Frequency"].isin(["C", "Q", "E", "R"])  # quarter-based
    m_AB      = df["Frequency"].isin(["A", "B"])            # annual
    m_FS      = df["Frequency"].isin(["F", "S"])            # semiannual
    m_KTLU    = df["Frequency"].isin(["K", "T", "L", "U"])  # trimester-like

    # Initialize Str_FiscalPrd as NaN; we will fill per frequency group
    df["Str_FiscalPrd"] = np.nan

    # -------------------------
    # Quarter-based encoding
    # -------------------------
    # Quarter number: (fp % 4) + 1 => yields 1..4 (if fp is integer-based encoding)
    q_part = ((fp % 4) + 1).where(m_quarter)
    # Year part for quarter: integer division by 4, then take last two digits
    q_year = (fp // 4).where(m_quarter).apply(last2)
    # Assign "Q{quarter}Y{yy}" for quarter frequencies
    df.loc[m_quarter, "Str_FiscalPrd"] = (
        "Q" +
        q_part.astype("Int64").astype(str) +
        "Y" +
        q_year.fillna('')
    )

    # -------------------------
    # Annual encoding (A, B)
    # -------------------------
    # Year part is fp itself for A/B; take last two digits via last2
    ab_year = fp.where(m_AB).apply(last2)
    # Assign "Y{yy}" for annual frequencies
    df.loc[m_AB, "Str_FiscalPrd"] = "Y" + ab_year.fillna('')

    # -------------------------
    # Semiannual encoding (F, S)
    # -------------------------
    # Semester number: (fp % 2) + 1 => 1 or 2
    fs_sem  = ((fp % 2) + 1).where(m_FS)
    # Year part: fp // 2, then last two digits
    fs_year = (fp // 2).where(m_FS).apply(last2)
    # Assign "S{sem}Y{yy}" for F/S
    df.loc[m_FS, "Str_FiscalPrd"] = (
        "S" +
        fs_sem.astype("Int64").astype(str) +
        "Y" +
        fs_year.fillna('')
    )

    # -------------------------
    # Trimester-like encoding (K, T, L, U)
    # -------------------------
    # Term number: (fp % 3) + 1 => 1..3
    t_term  = ((fp % 3) + 1).where(m_KTLU)
    # Year part: fp // 3, then last two digits
    t_year  = (fp // 3).where(m_KTLU).apply(last2)
    # Assign "T{term}Y{yy}" for K/T/L/U frequencies
    df.loc[m_KTLU, "Str_FiscalPrd"] = (
        "T" +
        t_term.astype("Int64").astype(str) +
        "Y" +
        t_year.fillna('')
    )

    # ---------------------------------------------------------------------
    # Implied full year from Str_FiscalPrd (19xx / 20xx reconstruction)
    # ---------------------------------------------------------------------
    # Extract two-digit year from "Yyy" pattern, e.g., "Q1Y23" -> "23"
    year_part = df['Str_FiscalPrd'].str.extract(r'Y(\d{2})', expand=False)
    # Convert that to numeric (NaN if not parseable)
    year_numeric = pd.to_numeric(year_part, errors='coerce')

    # Map two-digit year to full year:
    #  - 80–99 => 19xx
    #  - else  => 20xx
    df['ImplFiscPer_Calculated'] = year_numeric.apply(
        lambda x: int(f"19{int(x):02d}") if pd.notna(x) and x >= 80
        else (int(f"20{int(x):02d}") if pd.notna(x) else np.nan)
    )

    # ---------------------------------------------------------------------
    # Discrepancies check for Annual frequencies (A, B)
    # ---------------------------------------------------------------------
    # Work on a subset of annual frequencies only
    annual_rows_for_check = df[m_AB].copy()

    # Build boolean mask where implied full year does NOT match the original
    discrepancy_mask_annual = ~(
        # Either they match numerically...
        (annual_rows_for_check['ImplFiscPer_Calculated'] ==
         pd.to_numeric(annual_rows_for_check['Original_FiscalPeriod'], errors='coerce'))
        |
        # ...or both are NaN
        (annual_rows_for_check['ImplFiscPer_Calculated'].isna() &
         pd.to_numeric(annual_rows_for_check['Original_FiscalPeriod'], errors='coerce').isna())
    )

    # Subset rows with discrepancies
    discrepancy_rows = annual_rows_for_check[discrepancy_mask_annual].copy()

    # If we have any, show a short sample and the total count
    if not discrepancy_rows.empty:
        print("\nDiscrepancies found between original FiscalPeriod and calculated ImplFiscPer for Annual (A, B) frequencies:")
        display(
            discrepancy_rows[
                ['ID', 'Frequency', 'Original_FiscalPeriod', 'Str_FiscalPrd', 'ImplFiscPer_Calculated']
            ].head()
        )
        print(f"Total discrepancies found for Annual frequencies: {len(discrepancy_rows)}")
    else:
        print("\nNo discrepancies found between original FiscalPeriod and calculated ImplFiscPer for Annual (A, B) frequencies.")

    # ---------------------------------------------------------------------
    # Overwrite FiscalPeriod and drop temporary helper columns
    # ---------------------------------------------------------------------
    # Replace 'FiscalPeriod' with the implied full-year value we just calculated
    df['FiscalPeriod'] = df['ImplFiscPer_Calculated']
    # Drop helper columns not needed downstream
    df.drop(columns=['Original_FiscalPeriod', 'ImplFiscPer_Calculated'], inplace=True)

    # Return the modified DataFrame
    return df


# =============================================================================
# Driver: apply encoding to mixed_raw if present and non-empty
# =============================================================================
if 'mixed_raw' in globals() and mixed_raw is not None and not mixed_raw.empty:
    # Inform which item we are encoding for
    print(f"Applying encoding to Mixed dataset for '{target_item_name}' ...")
    # Apply fiscal period encoding
    mixed_encoded = add_str_fiscalprd(mixed_raw)
    # Show a preview of the encoded data
    display(mixed_encoded.head())
else:
    # If mixed_raw is missing or empty, we skip and mark mixed_encoded as None
    print("mixed_raw not found or empty. Cannot perform encoding.")
    mixed_encoded = None


Applying encoding to Mixed dataset for 'Net_Sales_or_Revenues' ...


  df.loc[m_quarter, "Str_FiscalPrd"] = (



No discrepancies found between original FiscalPeriod and calculated ImplFiscPer for Annual (A, B) frequencies.


Unnamed: 0,ID,CompanyName,ImplCountry,CurrentCurrency,HistCurrency,PIT Date,Frequency,UpdateCode,FiscalPeriod,FYE Month,ItemCode,Value,Str_FiscalPrd
0,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1992,December,1001,1707.334371,Y92
1,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1993,December,1001,1921.517802,Y93
2,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1994,December,1001,2263.705199,Y94
3,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1996-05-03,A,3,1995,December,1001,1447.33116,Y95
4,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1998-07-03,A,3,1996,December,1001,1217.952697,Y96


#### Annualize data with most recent information (Check of output required!)

In [89]:
# @title
# =====================================================================================
# SUMMARY
# =====================================================================================
# This script takes an input DataFrame `mixed_encoded` (if present in the global scope)
# that contains financial time-series data (per company, item, currency, fiscal period,
# and PIT Date). It then:
#
# 1. Cleans and standardizes key columns (dates, numeric types, string IDs).
# 2. Excludes rows with certain frequencies (E/L/R/U).
# 3. Parses fiscal period strings into quarter/semester/trimester indicators (QNUM/SNUM/TNUM).
# 4. Uses a custom, vectorized "as-of" join (`asof_numpy`) to attach the most recent
#    annual, quarterly, semiannual, and trimester values for each (ID, HistCurrency,
#    ItemCode, FiscalPeriod) up to each row’s PIT Date.
# 5. Builds "full-year" candidate values from:
#       - actual annuals (A),
#       - sum of Q1..Q4 (Q4 proxy),
#       - sum of S1..S2 (S2 proxy),
#       - sum of T1..T3 (T3 proxy),
#    and selects the best candidate based on priorities and relationship to the row’s
#    fiscal period (same-year vs prior-year).
# 6. Computes an annual PIT-based metric `AnnPITValue` and compares it to the “true”
#    annual value (`TrueValue`) to derive a percentage `AnnPITValue_Pct` for QC.
# 7. Performs quality checks:
#       - Ensures no period-date is after the PIT Date.
#       - Drops rows whose `AnnPITValue_Pct` is outside the range [50, 200] or infinite.
# 8. Keeps a curated set of columns, drops helper columns, and saves:
#       - a full output file
#       - a subset file with key columns for quick inspection.
# 9. Prints row-accounting stats and frees some memory.
#
# If `mixed_encoded` is not defined or is None, it simply prints a message and exits.

import pandas as pd
import numpy as np
import os
import gc
from datetime import timedelta            # <--- Added to fix NameError
from scipy.stats.mstats import winsorize  # <--- Added to fix NameError

# Enable pandas "copy-on-write" behavior to reduce unintended chained assignment effects
pd.options.mode.copy_on_write = True

# ---------- Helper: fast as-of (right.PIT <= left.PIT) ----------

def _key(fr, cols):
    """
    Helper function to build a string key from multiple columns.
    For each row, join the values of 'cols' with '||'.
    """
    return fr[cols].astype(str).agg('||'.join, axis=1)


def asof_numpy(left_df: pd.DataFrame, right_df: pd.DataFrame, by_cols: list[str]):
    """
    For each row in left_df, find the latest (as-of) Value from right_df
    with matching by_cols and right_df['PIT Date'] <= left_df['PIT Date'].

    This is a manual, NumPy-based implementation of an "as-of merge" grouped by `by_cols`.
    It:
    - filters out rows with missing required fields,
    - normalizes PIT Date to daily precision,
    - groups right_df by composite key of `by_cols`,
    - for each left row, binary-searches into the matching right group by PIT Date,
      picking the last date <= the left PIT Date,
    - returns two aligned arrays: (values, dates) for the left_df rows.
    """
    # Initialize outputs with NaNs and NaT for all left_df rows
    out_vals  = np.full(len(left_df), np.nan, dtype='float64')
    out_dates = np.full(len(left_df), 'NaT', dtype='datetime64[ns]')

    # Required columns on each side
    left_req  = by_cols + ['PIT Date']
    right_req = by_cols + ['PIT Date', 'Value']

    # Mask: rows that have all required fields non-null
    lmask = left_df[left_req].notna().all(axis=1)
    rmask = right_df[right_req].notna().all(axis=1)

    # If either side has no valid rows, return the default empty outputs
    if not lmask.any() or not rmask.any():
        return out_vals, out_dates

    # Work on filtered copies only (avoid side effects)
    l = left_df.loc[lmask, left_req].copy()
    r = right_df.loc[rmask, right_req].copy()

    # Normalize PIT Date columns to datetime at day precision
    l['PIT Date'] = pd.to_datetime(l['PIT Date'], errors='coerce').dt.floor('D')
    r['PIT Date'] = pd.to_datetime(r['PIT Date'], errors='coerce').dt.floor('D')

    # Build composite keys for group-level match (based on by_cols)
    l['__k'] = _key(l, by_cols)
    r['__k'] = _key(r, by_cols)

    # Sort right side by key and date to enable binary search per key
    r = r.sort_values(['__k', 'PIT Date']).reset_index(drop=True)

    # Extract NumPy arrays for right side
    rk   = r['__k'].to_numpy()
    rdt  = r['PIT Date'].to_numpy()
    rval = r['Value'].to_numpy()

    # Find unique keys and first index for each key in the sorted right side
    uniq, first = np.unique(rk, return_index=True)

    # Build a dict: key -> (dates array, values array) slice
    slices = {}
    for i, k in enumerate(uniq):
        s = first[i]  # start of this key
        e = first[i + 1] if i + 1 < len(first) else len(r)  # end of this key
        slices[k] = (rdt[s:e], rval[s:e])

    # Left side indices and arrays
    l_idx = l.index.to_numpy()
    lk    = l['__k'].to_numpy()
    ldt   = l['PIT Date'].to_numpy()

    # Sort left side by key (stable mergesort to preserve original row order within key)
    order = np.argsort(lk, kind='mergesort')
    sk, sd, sp = lk[order], ldt[order], l_idx[order]

    # i iterates over the sorted left rows
    i = 0
    n = len(sk)
    while i < n:
        k = sk[i]  # current composite key
        j = i + 1
        # Find contiguous block [i:j) for this key
        while j < n and sk[j] == k:
            j += 1

        # Dates and original positions of this key’s left rows
        block_dates = sd[i:j]
        block_pos   = sp[i:j]

        # If we have matching right-hand slices for this key, do the as-of search
        if k in slices:
            r_dates, r_vals = slices[k]
            # searchsorted(..., 'right') - 1 gives index of last r_date <= block_date
            pos   = np.searchsorted(r_dates, block_dates, side='right') - 1
            valid = pos >= 0  # positions where such a date exists
            if np.any(valid):
                # Fill outputs for left rows where we found a valid match
                out_vals[block_pos[valid]]  = r_vals[pos[valid]]
                out_dates[block_pos[valid]] = r_dates[pos[valid]]
        # Move to next key block
        i = j

    return out_vals, out_dates


# ---------- Small helpers ----------

def pctile(s, q):
    """
    Safe percentile calculation: returns quantile q of Series `s`,
    or NaN if quantile fails.
    """
    try:
        return s.quantile(q, interpolation='linear')
    except Exception:
        return np.nan


def summarize_pct(series: pd.Series):
    """
    Computes a summary of finite values of a series:
      - count
      - mean, median
      - 1% winsorized mean
      - selected deciles (p10, p20, ..., p90)
    Infinite values are treated as NaN and removed.
    If no finite values remain, returns an empty dict.
    """
    # Replace +/-inf with NaN and drop all NaNs
    s = series.replace([np.inf, -np.inf], np.nan).dropna()
    if s.empty:
        return {}
    
    # FIX APPLIED HERE: .to_numpy().copy() ensures the array is writable for winsorize
    w_mean = winsorize(s.to_numpy().copy(), limits=[0.01, 0.01]).mean()
    
    return {
        "finite_rows": len(s),
        "mean": s.mean(),
        "median": s.median(),
        "winsorized_mean_1pct": w_mean,
        "p10": pctile(s, 0.10), "p20": pctile(s, 0.20), "p30": pctile(s, 0.30),
        "p40": pctile(s, 0.40), "p50": pctile(s, 0.50), "p60": pctile(s, 0.60),
        "p70": pctile(s, 0.70), "p80": pctile(s, 0.80), "p90": pctile(s, 0.90)
    }


# ---------- Priority for full-year candidates ----------

# Fixed priority mapping for full-year candidates:
#   'A'  : actual annual value
#   'Q4' : annual proxy from four quarters
#   'T3' : annual proxy from three trimesters
#   'S2' : annual proxy from two semesters
_PERIOD_PRIORITY = {
    'A': 100,  # highest priority: actual annual
    'Q4': 90,  # then Q1+Q2+Q3+Q4
    'T3': 80,  # then T1+T2+T3
    'S2': 70,  # then S1+S2
}

# ============================ MAIN ============================

# Only run the main logic if a global DataFrame `mixed_encoded` exists and is not None
if 'mixed_encoded' in globals() and mixed_encoded is not None:
    # Count initial input rows
    input_rows = len(mixed_encoded)
    print(f"Input dataset contains {input_rows:,} rows before processing.\n")

    # Work on a copy of the input dataset
    working = mixed_encoded.copy()

    # -------------------------------------------------------------------------
    # Exclude certain frequencies (E/L/R/U)
    # -------------------------------------------------------------------------
    # Create mask of rows whose Frequency is one of E, L, R, U (case-insensitive)
    excl_mask = working['Frequency'].astype(str).str.upper().isin(['E', 'L', 'R', 'U'])
    # Count how many rows will be excluded
    excluded_rows = int(excl_mask.sum())
    # Keep only rows that are NOT in the exclusion set
    working = working.loc[~excl_mask].copy()

    # -------------------------------------------------------------------------
    # Set dtypes and normalize important columns
    # -------------------------------------------------------------------------
    # Convert PIT Date to datetime (coerce errors -> NaT), floor to day
    working['PIT Date']     = pd.to_datetime(working['PIT Date'], errors='coerce').dt.floor('D')
    # FiscalPeriod: numeric (e.g., 2021, 2022, ...)
    working['FiscalPeriod'] = pd.to_numeric(working['FiscalPeriod'], errors='coerce')
    # Value: numeric (float)
    working['Value']        = pd.to_numeric(working['Value'], errors='coerce')

    # Convert key ID / code columns to string to ensure consistency
    for c in ['ID', 'HistCurrency', 'ItemCode', 'Frequency', 'Str_FiscalPrd']:
        if c in working.columns:
            working[c] = working[c].astype(str)

    # -------------------------------------------------------------------------
    # Parse Q/S/T markers from Str_FiscalPrd (like 'Q1Y2023')
    # -------------------------------------------------------------------------
    # Extract quarter number Q1..Q4 from e.g. "Q1Y2023" into QNUM
    working['QNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^Q([1-4])Y', expand=False),
        errors='coerce'
    )
    # Extract semiannual number S1..S2 into SNUM
    working['SNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^S([1-2])Y', expand=False),
        errors='coerce'
    )
    # Extract trimester number T1..T3 into TNUM
    working['TNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^T([1-3])Y', expand=False),
        errors='coerce'
    )

    # -------------------------------------------------------------------------
    # Ensure period columns exist (Q1..Q4, S1..S2, T1..T3, A + their date cols)
    # -------------------------------------------------------------------------
    # Create value columns for Q1..Q4, S1..S2, T1..T3, A if they are missing
    for c in [*(f'Q{i}' for i in range(1, 5)),
              *(f'S{i}' for i in range(1, 3)),
              *(f'T{i}' for i in range(1, 4)),
              'A']:
        if c not in working.columns:
            working[c] = np.nan

    # Create corresponding *_Date columns if missing
    for c in [*(f'Q{i}_Date' for i in range(1, 5)),
              *(f'S{i}_Date' for i in range(1, 3)),
              *(f'T{i}_Date' for i in range(1, 4)),
              'A_Date']:
        if c not in working.columns:
            working[c] = pd.NaT

    # Base key for many of the as-of mappings
    base_keys = ['ID', 'HistCurrency', 'ItemCode', 'FiscalPeriod']

    # -------------------------------------------------------------------------
    # 1) Derive TrueValue from annuals (A/B frequencies)
    # -------------------------------------------------------------------------
    # Mask annual-like rows where Value is present
    mask_annual = working['Frequency'].isin(['A', 'B']) & working['Value'].notna()
    # annual_src: one row per (ID, FiscalPeriod, HistCurrency) with last PIT Date
    annual_src = (
        working.loc[mask_annual,
                    ['ID', 'FiscalPeriod', 'HistCurrency', 'PIT Date', 'Value']]
        .sort_values(['ID', 'FiscalPeriod', 'HistCurrency', 'PIT Date'])
        .drop_duplicates(['ID', 'FiscalPeriod', 'HistCurrency'], keep='last')
        .rename(columns={'Value': 'TrueValue', 'PIT Date': 'TrueValue_Date'})
    )
    # Left-join true annual value back onto working
    working = working.merge(
        annual_src,
        on=['ID', 'FiscalPeriod', 'HistCurrency'],
        how='left'
    )

    # -------------------------------------------------------------------------
    # 2) As-of mapping (same FiscalPeriod) for A/Q/S/T
    # -------------------------------------------------------------------------

    # ----- Annual -----
    # Source rows for annual frequencies A/B
    src_A = working.loc[
        working['Frequency'].isin(['A', 'B']) & working['Value'].notna(),
        base_keys + ['PIT Date', 'Value']
    ].copy()
    # As-of join: for each working row, get most recent annual value by PIT Date
    vA, dA = asof_numpy(working, src_A, by_cols=base_keys)
    working['A'], working['A_Date'] = vA, dA
    # Origin fiscal period of annual value (same as row's FiscalPeriod when present)
    working['A_OriginFP'] = np.where(
        working['A'].notna(), working['FiscalPeriod'], np.nan
    )

    # ----- Quarterly -----
    # Source rows for quarterly frequencies (Q/C) with valid QNUM
    src_Q = working.loc[
        working['Frequency'].isin(['Q', 'C']) & working['QNUM'].notna(),
        base_keys + ['QNUM', 'PIT Date', 'Value']
    ].copy()
    for q in (1, 2, 3, 4):
        # Restrict to a specific quarter q
        rv = src_Q[src_Q['QNUM'] == q].drop(columns=['QNUM'])
        # As-of join for that quarter
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'Q{q}', f'Q{q}_Date'
        working[col], working[dcol] = v, d
        # Origin FP column for that quarter
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        # Fill origin FP only where quarter value is non-null and origin not yet set
        m = working[col].notna() & working[ocol].isna()
        working.loc[m, ocol] = working.loc[m, 'FiscalPeriod']

    # ----- Semiannual -----
    # Source rows for semiannual frequencies (S/F) with valid SNUM
    src_S = working.loc[
        working['Frequency'].isin(['S', 'F']) & working['SNUM'].notna(),
        base_keys + ['SNUM', 'PIT Date', 'Value']
    ].copy()
    for s in (1, 2):
        rv = src_S[src_S['SNUM'] == s].drop(columns=['SNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'S{s}', f'S{s}_Date'
        working[col], working[dcol] = v, d
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        m = working[col].notna() & working[ocol].isna()
        working.loc[m, ocol] = working.loc[m, 'FiscalPeriod']

    # ----- Trimester -----
    # Source rows for trimester frequencies (T/K) with valid TNUM
    src_T = working.loc[
        working['Frequency'].isin(['T', 'K']) & working['TNUM'].notna(),
        base_keys + ['TNUM', 'PIT Date', 'Value']
    ].copy()
    for t in (1, 2, 3):
        rv = src_T[src_T['TNUM'] == t].drop(columns=['TNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'T{t}', f'T{t}_Date'
        working[col], working[dcol] = v, d
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        m = working[col].notna() & working[ocol].isna()
        working.loc[m, ocol] = working.loc[m, 'FiscalPeriod']

    # -------------------------------------------------------------------------
    # 3) Prepare labels & normalize dates (NO prev-year fill, NO forward-fill)
    # -------------------------------------------------------------------------
    # Sort working data consistently for downstream calculations
    working = working.sort_values(['ID', 'HistCurrency', 'FiscalPeriod', 'PIT Date'])

    # List of all period value columns
    value_cols_all  = [f'Q{i}' for i in range(1, 5)] + \
                      [f'S{i}' for i in range(1, 3)] + \
                      [f'T{i}' for i in range(1, 4)] + ['A']
    # Corresponding date columns
    date_cols_all   = [f'{c}_Date' for c in value_cols_all]
    # Corresponding origin FP columns
    origin_cols_all = [f'{c}_OriginFP' for c in value_cols_all]

    # Ensure that all date columns are proper datetimes (floor to day)
    # Note: explicitly no groupby-forward-fill here – only asof-filled values remain
    for c in date_cols_all:
        if c in working.columns:
            working[c] = pd.to_datetime(working[c], errors='coerce').dt.floor('D')

    # -------------------------------------------------------------------------
    # 4) Build full-year candidates from fixed sets (Q1–Q4, S1–S2, T1–T3)
    # -------------------------------------------------------------------------
    def full_year_from_fixed(row, labels, pit, cutoff):
        """
        Fixed full-year from given labels (e.g. Q1..Q4, S1..S2, T1..T3):

          - Requires ALL labels to have:
              value, date, origin_fp.
          - All dates must be within [cutoff, pit].
          - origin_fp = max(origin_fp_i)  (the 'newest year' across components).

        Returns (dt, val, origin_fp) or (NaT, NaN, NaN) if any requirement fails.
        """
        vals, dts, fps = [], [], []
        for lbl in labels:
            v = row.get(lbl, np.nan)
            d = row.get(f'{lbl}_Date', pd.NaT)
            o = row.get(f'{lbl}_OriginFP', np.nan)

            # Require non-null for value, date, origin FP
            if pd.isna(v) or pd.isna(d) or pd.isna(o):
                return pd.NaT, np.nan, np.nan

            # Normalize date and check it lies within [cutoff, pit]
            d = pd.to_datetime(d, errors='coerce')
            if pd.isna(d) or not (cutoff <= d <= pit):
                return pd.NaT, np.nan, np.nan

            vals.append(float(v))
            dts.append(d)
            fps.append(int(o))

        # Sum all component values, pick latest date and max origin FP
        total_val = float(np.nansum(vals))
        latest_dt = max(dts)
        origin_fp = max(fps)
        return latest_dt, total_val, origin_fp

    def pick_annpit_sum_with_origin(row):
        """
        For each row, pick the best annual PIT-based value (AnnPITValue) by:

        1) Building a time window [PIT - 365 days, PIT].
        2) Generating candidate annual values:
           - actual annual A (if within window and with origin FP),
           - Q4 proxy from Q1..Q4,
           - S2 proxy from S1..S2,
           - T3 proxy from T1..T3,
           each with associated priority and origin FP.
        3) Ranking candidates according to:
           - Same-year A (highest),
           - Same-year proxies (Q4/T3/S2),
           - Prior-year A,
           - Prior-year proxies,
           - Otherwise: any remaining candidate by (priority, latest date).
        4) Returning the chosen candidate's value (keeping zeros, but filtering NaNs).
        """
        pit = row['PIT Date']
        if pd.isna(pit):
            return np.nan
        cutoff = pit - timedelta(days=365)  # Now works because timedelta is imported

        # Current row's fiscal period, cast to int if possible
        fp = row.get('FiscalPeriod', np.nan)
        try:
            fp_int = int(fp) if not pd.isna(fp) else None
        except Exception:
            fp_int = None

        # Collect candidate tuples: (label, priority, date, value, origin_fp)
        candidates = []

        # --- Candidate A: actual annual (0 is allowed)
        A_val = row.get('A', np.nan)
        A_dt  = row.get('A_Date', pd.NaT)
        A_ofp = row.get('A_OriginFP', np.nan)
        if pd.notna(A_val) and pd.notna(A_dt) and not pd.isna(A_ofp):
            A_dt = pd.to_datetime(A_dt, errors='coerce')
            if pd.notna(A_dt) and (cutoff <= A_dt <= pit):
                candidates.append(('A', _PERIOD_PRIORITY['A'], A_dt, float(A_val), int(A_ofp)))

        # --- Candidate Q4: sum of Q1..Q4 (0 allowed)
        q4_dt, q4_val, q4_fp = full_year_from_fixed(
            row, [f'Q{i}' for i in range(1, 5)], pit, cutoff
        )
        if pd.notna(q4_val) and pd.notna(q4_dt) and not pd.isna(q4_fp):
            candidates.append(('Q4', _PERIOD_PRIORITY['Q4'], q4_dt, float(q4_val), int(q4_fp)))

        # --- Candidate S2: sum of S1..S2 (0 allowed)
        s2_dt, s2_val, s2_fp = full_year_from_fixed(
            row, [f'S{i}' for i in range(1, 3)], pit, cutoff
        )
        if pd.notna(s2_val) and pd.notna(s2_dt) and not pd.isna(s2_fp):
            candidates.append(('S2', _PERIOD_PRIORITY['S2'], s2_dt, float(s2_val), int(s2_fp)))

        # --- Candidate T3: sum of T1..T3 (0 allowed)
        t3_dt, t3_val, t3_fp = full_year_from_fixed(
            row, [f'T{i}' for i in range(1, 4)], pit, cutoff
        )
        if pd.notna(t3_val) and pd.notna(t3_dt) and not pd.isna(t3_fp):
            candidates.append(('T3', _PERIOD_PRIORITY['T3'], t3_dt, float(t3_val), int(t3_fp)))

        # If no candidates, return NaN
        if not candidates:
            return np.nan

        # Filter out NaN-valued candidates (keep 0-valued ones)
        def valid(seq):
            return [c for c in seq if not np.isnan(c[3])]

        # Relationship between row's fiscal period and candidate's origin FP:
        # same / prior / other / unknown
        def rel(c):
            _, _, _, _, ofp = c
            if fp_int is None or ofp is None:
                return 'unknown'
            if ofp == fp_int:
                return 'same'
            if ofp == fp_int - 1:
                return 'prior'
            return 'other'

        # 1) Same-year actual annual A (max by date)
        same_A = valid(c for c in candidates if c[0] == 'A' and rel(c) == 'same')
        if same_A:
            best = max(same_A, key=lambda x: x[2])
            return best[3]

        # 2) Same-year proxies (Q4/T3/S2), max by (priority, date)
        same_proxies = valid(
            c for c in candidates
            if c[0] in ('Q4', 'S2', 'T3') and rel(c) == 'same'
        )
        if same_proxies:
            best = max(same_proxies, key=lambda x: (x[1], x[2]))
            return best[3]

        # 3) Prior-year actual annual A
        prior_A = valid(c for c in candidates if c[0] == 'A' and rel(c) == 'prior')
        if prior_A:
            best = max(prior_A, key=lambda x: x[2])
            return best[3]

        # 4) Prior-year proxies (Q4/S2/T3) if no prior A
        prior_proxies = valid(
            c for c in candidates
            if c[0] in ('Q4', 'S2', 'T3') and rel(c) == 'prior'
        )
        if prior_proxies:
            best = max(prior_proxies, key=lambda x: (x[1], x[2]))
            return best[3]

        # 5) Fallback: any candidate (other/unknown) by (priority, date)
        others = valid(candidates)
        if others:
            best = max(others, key=lambda x: (x[1], x[2]))
            return best[3]

        # Final fallback: 0.0 (should rarely be reached)
        return 0.0

    # Apply the selection function row-wise to produce AnnPITValue
    working['AnnPITValue'] = working.apply(pick_annpit_sum_with_origin, axis=1)

    # -------------------------------------------------------------------------
    # 5) QC: Future-date check + PRE-DROP stats
    # -------------------------------------------------------------------------
    # Columns whose dates should not exceed PIT Date
    date_cols = [
        'A_Date',
        'Q1_Date', 'Q2_Date', 'Q3_Date', 'Q4_Date',
        'S1_Date', 'S2_Date',
        'T1_Date', 'T2_Date', 'T3_Date'
    ]
    # Restrict to ones actually present
    present = [c for c in date_cols if c in working.columns]

    viol_counts = {}  # per-label violation counts
    # Mask for rows with any future-dated period
    any_mask = pd.Series(False, index=working.index)

    for c in present:
        # A violation is when period date > PIT Date (both need to be non-null)
        m = (
            working[c].notna() &
            working['PIT Date'].notna() &
            (pd.to_datetime(working[c], errors='coerce') > working['PIT Date'])
        )
        viol_counts[c] = int(m.sum())
        any_mask |= m  # accumulate violations across columns

    total_future_viol = int(any_mask.sum())
    print("\n=== Future-date check (period dates > PIT Date) ===")
    print("Per-label violations:", viol_counts)
    print(f"Rows with ANY future-dated period value: {total_future_viol}")
    # Flagrows with at least one future-date error
    working['HasFutureDateError'] = any_mask

    # -------------------------------------------------------------------------
    # 6) AnnPITValue_Pct + quality drop
    # -------------------------------------------------------------------------
    # Compute AnnPITValue as % of TrueValue (only when TrueValue != 0)
    working['AnnPITValue_Pct'] = np.where(
        working['AnnPITValue'].notna() &
        working['TrueValue'].notna() &
        (working['TrueValue'] != 0),
        (working['AnnPITValue'] / working['TrueValue']) * 100,
        np.nan
    )

    # Stats before dropping low-quality rows
    pre_stats = summarize_pct(working['AnnPITValue_Pct'])
    print("\n=== AnnPITValue_Pct summary (finite only) — BEFORE quality drop ===")
    for k, v in pre_stats.items():
        print(f"{k:>20}: {v}")

    # Build masks for dropping:
    pct = working['AnnPITValue_Pct']
    is_inf = np.isinf(pct)  # infinite percentages
    is_finite = np.isfinite(pct)
    # Out-of-range if % > 200 or % < 50 (but finite)
    out_of_range = is_finite & ((pct > 200) | (pct < 50))
    # Rows to drop: infinite or out-of-range values
    to_drop_quality = is_inf | out_of_range

    dropped_quality_rows = int(to_drop_quality.sum())
    print(f"\nRows to drop due to AnnPITValue_Pct (±inf or >200 or <50): {dropped_quality_rows:,}")

    # Keep only rows that passed the quality filter
    working = working.loc[~to_drop_quality].copy()

    # Stats after dropping
    post_stats = summarize_pct(working['AnnPITValue_Pct'])
    print("\n=== AnnPITValue_Pct summary — AFTER quality drop ===")
    if post_stats:
        for k, v in post_stats.items():
            print(f"{k:>20}: {v}")
    else:
        print("No finite values remain after the quality drop.")

    # -------------------------------------------------------------------------
    # 7) Final columns & save
    # -------------------------------------------------------------------------
    # Base descriptive columns to keep (if present)
    base_cols = [
        'ID', 'CompanyName', 'ImplCountry', 'CurrentCurrency', 'HistCurrency',
        'PIT Date', 'Frequency', 'UpdateCode', 'FiscalPeriod', 'FYE Month',
        'ItemCode', 'Value', 'Str_FiscalPrd'
    ]

    # Build ordered list of period date/value columns
    freq_cols = []
    for i in range(1, 5):
        freq_cols += [f'Q{i}_Date', f'Q{i}']
    for i in range(1, 3):
        freq_cols += [f'S{i}_Date', f'S{i}']
    for i in range(1, 4):
        freq_cols += [f'T{i}_Date', f'T{i}']
    freq_cols += ['A_Date', 'A']

    # Final set of columns to keep in output
    keep_cols = (
        [c for c in base_cols if c in working.columns] +
        ['TrueValue', 'AnnPITValue', 'AnnPITValue_Pct', 'HasFutureDateError'] +
        [c for c in freq_cols if c in working.columns]
    )

    # Helper columns to drop before export
    drop_cols = ['QNUM', 'SNUM', 'TNUM', 'TrueValue_Date']
    # Also drop all *_OriginFP columns
    drop_cols += [c for c in working.columns if c.endswith('_OriginFP')]
    working.drop(columns=[c for c in drop_cols if c in working.columns],
                 inplace=True, errors='ignore')

    # Reorder and restrict columns to the final layout
    mixed_processed = working.reindex(columns=keep_cols)

    # Sanity checks: necessary globals must exist
    assert 'Temp_file_path_DP' in globals(), "Temp_file_path_DP not found."
    assert 'base_output_filename' in globals(), "base_output_filename not found (set in Cell 0)."

    # Build full output path and save pipe-delimited file
    out_full = os.path.join(Temp_file_path_DP, f"{base_output_filename}.txt")
    mixed_processed.to_csv(out_full, sep='|', index=False)
    print("\nSaved full:", out_full)

    # Create a subset for lighter inspection
    subset_cols = ["ID", "PIT Date", "CompanyName", "HistCurrency",
                   "FiscalPeriod", "AnnPITValue"]
    # Only keep subset columns that actually exist
    subset_cols_existing = [col for col in subset_cols if col in mixed_processed.columns]
    subset_df = mixed_processed[subset_cols_existing].copy()
    out_subset = os.path.join(Temp_file_path_DP,
                              f"{base_output_filename}_subset.txt")
    subset_df.to_csv(out_subset, sep='|', index=False)
    print("Saved subset:", out_subset)
    # Explicitly delete subset_df to free memory
    del subset_df

    # -------------------------------------------------------------------------
    # 8) Row-accounting overview
    # -------------------------------------------------------------------------
    output_rows = len(mixed_processed)
    print("\n=== Row Accounting ===")
    print(f"Input rows:                     {input_rows:,}")
    print(f"Excludedby Frequency (E/L/R/U): {excluded_rows:,}")
    print(f"Dropped by quality (Pct rules): {dropped_quality_rows:,}")
    print(f"Output rows (final):            {output_rows:,}")
    # Sum up excluded + dropped + remaining and check against original count
    check_total = excluded_rows + dropped_quality_rows + output_rows
    print(f"Check: excluded + dropped + output = {check_total:,}")
    if check_total == input_rows:
        print("Row counts reconcile exactly.")
    else:
        print(f"Mismatch of {input_rows - check_total:+,} rows. "
              "Investigate upstream filtering or unexpected drops.")

    # Trigger garbage collection as a final cleanup step
    gc.collect()

else:
    # If the main input dataset is not available, skip all processing
    print("mixed_encoded not found or None; skipping.")

Input dataset contains 4,003,129 rows before processing.


=== Future-date check (period dates > PIT Date) ===
Per-label violations: {'A_Date': 0, 'Q1_Date': 0, 'Q2_Date': 0, 'Q3_Date': 0, 'Q4_Date': 0, 'S1_Date': 0, 'S2_Date': 0, 'T1_Date': 0, 'T2_Date': 0, 'T3_Date': 0}
Rows with ANY future-dated period value: 0

=== AnnPITValue_Pct summary (finite only) — BEFORE quality drop ===
         finite_rows: 2275935
                mean: 28875.64133107392
              median: 100.0
winsorized_mean_1pct: 100.10577968502807
                 p10: 100.0
                 p20: 100.0
                 p30: 100.0
                 p40: 100.0
                 p50: 100.0
                 p60: 100.0
                 p70: 100.0
                 p80: 100.0
                 p90: 100.0

Rows to drop due to AnnPITValue_Pct (±inf or >200 or <50): 29,212

=== AnnPITValue_Pct summary — AFTER quality drop ===
         finite_rows: 2246723
                mean: 100.32657752247285
              median: 100.0
wins

### Mixed 9

#### Set Index

In [90]:
# =====================================================================================
# SUMMARY
# =====================================================================================
# This code selects which Mixed_* dataset should be processed by choosing an index
# (e.g., Mixed_1, Mixed_2, ...). It then:
#
#   1. Ensures that a dictionary `mixed_vars` exists, mapping keys like "Mixed_1"
#      to item names.
#   2. Builds the key corresponding to the selected index and retrieves the
#      associated item name (`target_item_name`).
#   3. Prints which Mixed_* item was selected.
#   4. Constructs paths and filenames based on global variables and the selected item.
#   5. Ensures that the output directory exists by creating it if necessary.
#
# The goal is to centralize selection of a single Mixed_* dataset and prepare paths
# for downstream processing.


# === Select which Mixed_* item to run ===
mixed_index = 9  # Change this to process another dataset (e.g., 10)

# Validate that the dictionary of mixed item names exists
assert 'mixed_vars' in globals(), "mixed_vars dict not found in globals()."

# Build the key (e.g., "Mixed_1") and retrieve the associated item name
item_key = f"Mixed_{mixed_index}"
target_item_name = mixed_vars.get(item_key)
assert target_item_name, f"{item_key} not found in mixed_vars."

# Inform which item was selected
print(f"Selected: {item_key}  ->  ItemName: '{target_item_name}'")

# === Paths (reusing globals) ===
assert 'Temp_file_path_DP' in globals(), "Temp_file_path_DP not found."

# Construct the name of the input file for the selected item
file_name = f"work_subset_{target_item_name}.txt"
file_path = os.path.join(Temp_file_path_DP, file_name)

# Construct the base name for output files (will later be suffixed)
base_output_filename = f"Mixed_{target_item_name}_complete"

# Ensure the output directory exists; create it (including parent dirs) if needed
Path(Temp_file_path_DP).mkdir(parents=True, exist_ok=True)


Selected: Mixed_9  ->  ItemName: 'Operating_Income'


#### Import relevant data



In [91]:
# =====================================================================================
# SUMMARY
# =====================================================================================
# This block:
#   1. Announces the import of a full dataset for the given `target_item_name`.
#   2. Checks whether the file at `file_path` exists.
#   3. If it exists, calls `import_file_to_dataframe(file_path)` to load the data
#      into `mixed_raw`.
#   4. If the loaded DataFrame is non-empty, prints a success message including
#      the number of rows and shows the first few rows (via display or fallback
#      to text printing).
#   5. If the load fails or returns an empty DataFrame, prints a warning and
#      creates an empty DataFrame.
#   6. If the file does not exist, prints an error message and sets `mixed_raw`
#      to an empty DataFrame.
#   7. Finally, it runs `gc.collect()` to trigger garbage collection and free
#      memory.
#
# Note: All previous emoji symbols in the print statements have been removed.

# Inform the user that we are starting the import for this item
print(f"\nImporting full dataset for Item: '{target_item_name}' ...")

# Check if the file path exists on disk
if os.path.exists(file_path):
    # If the file exists, attempt to import it into a DataFrame
    mixed_raw = import_file_to_dataframe(file_path)

    # If the DataFrame is successfully loaded and not empty
    if mixed_raw is not None and not mixed_raw.empty:
        # Print a success message including row count with thousands separator
        print(f"Full dataset loaded successfully: {len(mixed_raw):,} rows total.")

        try:
            # Try to display the first few rows (Jupyter / IPython display)
            display(mixed_raw.head())
        except Exception:
            # If display is not available, fall back to printing as plain text
            print(mixed_raw.head().to_string(index=False))
    else:
        # If DataFrame is None or empty, warn and create an empty DataFrame
        print("Dataset appears empty or could not be loaded.")
        mixed_raw = pd.DataFrame()
else:
    # If the file does not exist, report an error and set mixed_raw to empty
    print(f"File not found: {file_path}")
    mixed_raw = pd.DataFrame()

# Run garbage collection to free up memory after the load attempt
gc.collect()



Importing full dataset for Item: 'Operating_Income' ...
Full dataset loaded successfully: 4,253,913 rows total.


Unnamed: 0,ID,CompanyName,ImplCountry,CurrentCurrency,HistCurrency,PIT Date,Frequency,UpdateCode,FiscalPeriod,FYE Month,ItemCode,Value
0,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1992,December,1250,81.946988
1,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1993,December,1250,42.404318
2,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1994,December,1250,75.373553
3,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1996-05-03,A,3,1995,December,1250,-59.739189
4,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1998-07-03,A,3,1996,December,1250,27.980425


0

#### Encode Frequency Code (Check of output required!)

In [92]:
# =====================================================================================
# SUMMARY
# =====================================================================================
# This snippet provides:
#
# 1. A helper function `last2` that returns the last two digits of a number as a
#    zero-padded string (for building YY strings).
#
# 2. A function `add_str_fiscalprd(df)` which:
#    - Works on a copy of an input DataFrame containing financial periods.
#    - Normalizes the 'Frequency' (upper-case, no missing).
#    - Stores the original 'FiscalPeriod' and converts it to numeric.
#    - Creates a string representation 'Str_FiscalPrd' depending on the frequency:
#         - Q/C/E/R: quarter-based ("QnYyy")
#         - A/B: annual ("Yyy")
#         - F/S: semiannual ("SnYyy")
#         - K/T/L/U: trimester-like ("TnYyy")
#    - Derives an implied full-year integer 'ImplFiscPer_Calculated' from the
#      two-digit year (80–99 => 19xx, else 20xx).
#    - For annual rows (A/B), checks discrepancies between original
#      'FiscalPeriod' and implied full-year; prints a small preview & total count.
#    - Overwrites 'FiscalPeriod' with 'ImplFiscPer_Calculated' and drops helper
#      columns.
#
# 3. A small driver block that:
#    - Checks that `mixed_raw` exists and is non-empty.
#    - Applies `add_str_fiscalprd` to produce `mixed_encoded`.
#    - Displays a head preview or prints a message and sets `mixed_encoded = None`
#      if input is missing/empty.

def last2(n):
    """Return last two digits as zero-padded string, or None if NaN."""
    # If input is NaN (or pandas-style missing), return None (no digits)
    if pd.isna(n):
        return None
    # Convert to int, format as 4-digit zero-padded string, take last 2 chars
    # Example: n=2023 -> "2023"[-2:] -> "23"
    return f"{int(n):04d}"[-2:]


def add_str_fiscalprd(df):
    """
    Creates 'Str_FiscalPrd' and overwrites 'FiscalPeriod' with an implied full year.
    Also prints discrepancies for Annual (A,B) rows between original and calculated year.

    The mapping logic:
      - Quarterly (C, Q, E, R):   fp encodes a quarter index -> "Q{1-4}Y{yy}"
      - Annual   (A, B):         fp is the year itself      -> "Y{yy}"
      - Semiannual (F, S):       fp encodes half-year       -> "S{1-2}Y{yy}"
      - Trimester-like (K, T,
         L, U):                  fp encodes trimester       -> "T{1-3}Y{yy}"
    Then we re-infer the full year from the YY part with 19xx/20xx rule.
    """
    # Work on a copy to avoid mutating the original input DataFrame
    df = df.copy()

    # Normalize frequency codes:
    # - uppercase for consistency
    # - fill missing values with empty string
    df["Frequency"] = df["Frequency"].str.upper().fillna("")

    # Store original FiscalPeriod in a separate column for later comparison
    df['Original_FiscalPeriod'] = df['FiscalPeriod']

    # Convert FiscalPeriod to numeric, coerce errors to NaN
    fp = pd.to_numeric(df["FiscalPeriod"], errors="coerce")

    # Masks for different frequency groups
    m_quarter = df["Frequency"].isin(["C", "Q", "E", "R"])  # quarter-based
    m_AB      = df["Frequency"].isin(["A", "B"])            # annual
    m_FS      = df["Frequency"].isin(["F", "S"])            # semiannual
    m_KTLU    = df["Frequency"].isin(["K", "T", "L", "U"])  # trimester-like

    # Initialize Str_FiscalPrd as NaN; we will fill per frequency group
    df["Str_FiscalPrd"] = np.nan

    # -------------------------
    # Quarter-based encoding
    # -------------------------
    # Quarter number: (fp % 4) + 1 => yields 1..4 (if fp is integer-based encoding)
    q_part = ((fp % 4) + 1).where(m_quarter)
    # Year part for quarter: integer division by 4, then take last two digits
    q_year = (fp // 4).where(m_quarter).apply(last2)
    # Assign "Q{quarter}Y{yy}" for quarter frequencies
    df.loc[m_quarter, "Str_FiscalPrd"] = (
        "Q" +
        q_part.astype("Int64").astype(str) +
        "Y" +
        q_year.fillna('')
    )

    # -------------------------
    # Annual encoding (A, B)
    # -------------------------
    # Year part is fp itself for A/B; take last two digits via last2
    ab_year = fp.where(m_AB).apply(last2)
    # Assign "Y{yy}" for annual frequencies
    df.loc[m_AB, "Str_FiscalPrd"] = "Y" + ab_year.fillna('')

    # -------------------------
    # Semiannual encoding (F, S)
    # -------------------------
    # Semester number: (fp % 2) + 1 => 1 or 2
    fs_sem  = ((fp % 2) + 1).where(m_FS)
    # Year part: fp // 2, then last two digits
    fs_year = (fp // 2).where(m_FS).apply(last2)
    # Assign "S{sem}Y{yy}" for F/S
    df.loc[m_FS, "Str_FiscalPrd"] = (
        "S" +
        fs_sem.astype("Int64").astype(str) +
        "Y" +
        fs_year.fillna('')
    )

    # -------------------------
    # Trimester-like encoding (K, T, L, U)
    # -------------------------
    # Term number: (fp % 3) + 1 => 1..3
    t_term  = ((fp % 3) + 1).where(m_KTLU)
    # Year part: fp // 3, then last two digits
    t_year  = (fp // 3).where(m_KTLU).apply(last2)
    # Assign "T{term}Y{yy}" for K/T/L/U frequencies
    df.loc[m_KTLU, "Str_FiscalPrd"] = (
        "T" +
        t_term.astype("Int64").astype(str) +
        "Y" +
        t_year.fillna('')
    )

    # ---------------------------------------------------------------------
    # Implied full year from Str_FiscalPrd (19xx / 20xx reconstruction)
    # ---------------------------------------------------------------------
    # Extract two-digit year from "Yyy" pattern, e.g., "Q1Y23" -> "23"
    year_part = df['Str_FiscalPrd'].str.extract(r'Y(\d{2})', expand=False)
    # Convert that to numeric (NaN if not parseable)
    year_numeric = pd.to_numeric(year_part, errors='coerce')

    # Map two-digit year to full year:
    #  - 80–99 => 19xx
    #  - else  => 20xx
    df['ImplFiscPer_Calculated'] = year_numeric.apply(
        lambda x: int(f"19{int(x):02d}") if pd.notna(x) and x >= 80
        else (int(f"20{int(x):02d}") if pd.notna(x) else np.nan)
    )

    # ---------------------------------------------------------------------
    # Discrepancies check for Annual frequencies (A, B)
    # ---------------------------------------------------------------------
    # Work on a subset of annual frequencies only
    annual_rows_for_check = df[m_AB].copy()

    # Build boolean mask where implied full year does NOT match the original
    discrepancy_mask_annual = ~(
        # Either they match numerically...
        (annual_rows_for_check['ImplFiscPer_Calculated'] ==
         pd.to_numeric(annual_rows_for_check['Original_FiscalPeriod'], errors='coerce'))
        |
        # ...or both are NaN
        (annual_rows_for_check['ImplFiscPer_Calculated'].isna() &
         pd.to_numeric(annual_rows_for_check['Original_FiscalPeriod'], errors='coerce').isna())
    )

    # Subset rows with discrepancies
    discrepancy_rows = annual_rows_for_check[discrepancy_mask_annual].copy()

    # If we have any, show a short sample and the total count
    if not discrepancy_rows.empty:
        print("\nDiscrepancies found between original FiscalPeriod and calculated ImplFiscPer for Annual (A, B) frequencies:")
        display(
            discrepancy_rows[
                ['ID', 'Frequency', 'Original_FiscalPeriod', 'Str_FiscalPrd', 'ImplFiscPer_Calculated']
            ].head()
        )
        print(f"Total discrepancies found for Annual frequencies: {len(discrepancy_rows)}")
    else:
        print("\nNo discrepancies found between original FiscalPeriod and calculated ImplFiscPer for Annual (A, B) frequencies.")

    # ---------------------------------------------------------------------
    # Overwrite FiscalPeriod and drop temporary helper columns
    # ---------------------------------------------------------------------
    # Replace 'FiscalPeriod' with the implied full-year value we just calculated
    df['FiscalPeriod'] = df['ImplFiscPer_Calculated']
    # Drop helper columns not needed downstream
    df.drop(columns=['Original_FiscalPeriod', 'ImplFiscPer_Calculated'], inplace=True)

    # Return the modified DataFrame
    return df


# =============================================================================
# Driver: apply encoding to mixed_raw if present and non-empty
# =============================================================================
if 'mixed_raw' in globals() and mixed_raw is not None and not mixed_raw.empty:
    # Inform which item we are encoding for
    print(f"Applying encoding to Mixed dataset for '{target_item_name}' ...")
    # Apply fiscal period encoding
    mixed_encoded = add_str_fiscalprd(mixed_raw)
    # Show a preview of the encoded data
    display(mixed_encoded.head())
else:
    # If mixed_raw is missing or empty, we skip and mark mixed_encoded as None
    print("mixed_raw not found or empty. Cannot perform encoding.")
    mixed_encoded = None


Applying encoding to Mixed dataset for 'Operating_Income' ...


  df.loc[m_quarter, "Str_FiscalPrd"] = (



No discrepancies found between original FiscalPeriod and calculated ImplFiscPer for Annual (A, B) frequencies.


Unnamed: 0,ID,CompanyName,ImplCountry,CurrentCurrency,HistCurrency,PIT Date,Frequency,UpdateCode,FiscalPeriod,FYE Month,ItemCode,Value,Str_FiscalPrd
0,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1992,December,1250,81.946988,Y92
1,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1993,December,1250,42.404318,Y93
2,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1994,December,1250,75.373553,Y94
3,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1996-05-03,A,3,1995,December,1250,-59.739189,Y95
4,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1998-07-03,A,3,1996,December,1250,27.980425,Y96


#### Annualize data with most recent information (Check of output required!)

In [93]:
# @title
# =====================================================================================
# SUMMARY
# =====================================================================================
# This script takes an input DataFrame `mixed_encoded` (if present in the global scope)
# that contains financial time-series data (per company, item, currency, fiscal period,
# and PIT Date). It then:
#
# 1. Cleans and standardizes key columns (dates, numeric types, string IDs).
# 2. Excludes rows with certain frequencies (E/L/R/U).
# 3. Parses fiscal period strings into quarter/semester/trimester indicators (QNUM/SNUM/TNUM).
# 4. Uses a custom, vectorized "as-of" join (`asof_numpy`) to attach the most recent
#    annual, quarterly, semiannual, and trimester values for each (ID, HistCurrency,
#    ItemCode, FiscalPeriod) up to each row’s PIT Date.
# 5. Builds "full-year" candidate values from:
#       - actual annuals (A),
#       - sum of Q1..Q4 (Q4 proxy),
#       - sum of S1..S2 (S2 proxy),
#       - sum of T1..T3 (T3 proxy),
#    and selects the best candidate based on priorities and relationship to the row’s
#    fiscal period (same-year vs prior-year).
# 6. Computes an annual PIT-based metric `AnnPITValue` and compares it to the “true”
#    annual value (`TrueValue`) to derive a percentage `AnnPITValue_Pct` for QC.
# 7. Performs quality checks:
#       - Ensures no period-date is after the PIT Date.
#       - Drops rows whose `AnnPITValue_Pct` is outside the range [50, 200] or infinite.
# 8. Keeps a curated set of columns, drops helper columns, and saves:
#       - a full output file
#       - a subset file with key columns for quick inspection.
# 9. Prints row-accounting stats and frees some memory.
#
# If `mixed_encoded` is not defined or is None, it simply prints a message and exits.

import pandas as pd
import numpy as np
import os
import gc
from datetime import timedelta            # <--- Added to fix NameError
from scipy.stats.mstats import winsorize  # <--- Added to fix NameError

# Enable pandas "copy-on-write" behavior to reduce unintended chained assignment effects
pd.options.mode.copy_on_write = True

# ---------- Helper: fast as-of (right.PIT <= left.PIT) ----------

def _key(fr, cols):
    """
    Helper function to build a string key from multiple columns.
    For each row, join the values of 'cols' with '||'.
    """
    return fr[cols].astype(str).agg('||'.join, axis=1)


def asof_numpy(left_df: pd.DataFrame, right_df: pd.DataFrame, by_cols: list[str]):
    """
    For each row in left_df, find the latest (as-of) Value from right_df
    with matching by_cols and right_df['PIT Date'] <= left_df['PIT Date'].

    This is a manual, NumPy-based implementation of an "as-of merge" grouped by `by_cols`.
    It:
    - filters out rows with missing required fields,
    - normalizes PIT Date to daily precision,
    - groups right_df by composite key of `by_cols`,
    - for each left row, binary-searches into the matching right group by PIT Date,
      picking the last date <= the left PIT Date,
    - returns two aligned arrays: (values, dates) for the left_df rows.
    """
    # Initialize outputs with NaNs and NaT for all left_df rows
    out_vals  = np.full(len(left_df), np.nan, dtype='float64')
    out_dates = np.full(len(left_df), 'NaT', dtype='datetime64[ns]')

    # Required columns on each side
    left_req  = by_cols + ['PIT Date']
    right_req = by_cols + ['PIT Date', 'Value']

    # Mask: rows that have all required fields non-null
    lmask = left_df[left_req].notna().all(axis=1)
    rmask = right_df[right_req].notna().all(axis=1)

    # If either side has no valid rows, return the default empty outputs
    if not lmask.any() or not rmask.any():
        return out_vals, out_dates

    # Work on filtered copies only (avoid side effects)
    l = left_df.loc[lmask, left_req].copy()
    r = right_df.loc[rmask, right_req].copy()

    # Normalize PIT Date columns to datetime at day precision
    l['PIT Date'] = pd.to_datetime(l['PIT Date'], errors='coerce').dt.floor('D')
    r['PIT Date'] = pd.to_datetime(r['PIT Date'], errors='coerce').dt.floor('D')

    # Build composite keys for group-level match (based on by_cols)
    l['__k'] = _key(l, by_cols)
    r['__k'] = _key(r, by_cols)

    # Sort right side by key and date to enable binary search per key
    r = r.sort_values(['__k', 'PIT Date']).reset_index(drop=True)

    # Extract NumPy arrays for right side
    rk   = r['__k'].to_numpy()
    rdt  = r['PIT Date'].to_numpy()
    rval = r['Value'].to_numpy()

    # Find unique keys and first index for each key in the sorted right side
    uniq, first = np.unique(rk, return_index=True)

    # Build a dict: key -> (dates array, values array) slice
    slices = {}
    for i, k in enumerate(uniq):
        s = first[i]  # start of this key
        e = first[i + 1] if i + 1 < len(first) else len(r)  # end of this key
        slices[k] = (rdt[s:e], rval[s:e])

    # Left side indices and arrays
    l_idx = l.index.to_numpy()
    lk    = l['__k'].to_numpy()
    ldt   = l['PIT Date'].to_numpy()

    # Sort left side by key (stable mergesort to preserve original row order within key)
    order = np.argsort(lk, kind='mergesort')
    sk, sd, sp = lk[order], ldt[order], l_idx[order]

    # i iterates over the sorted left rows
    i = 0
    n = len(sk)
    while i < n:
        k = sk[i]  # current composite key
        j = i + 1
        # Find contiguous block [i:j) for this key
        while j < n and sk[j] == k:
            j += 1

        # Dates and original positions of this key’s left rows
        block_dates = sd[i:j]
        block_pos   = sp[i:j]

        # If we have matching right-hand slices for this key, do the as-of search
        if k in slices:
            r_dates, r_vals = slices[k]
            # searchsorted(..., 'right') - 1 gives index of last r_date <= block_date
            pos   = np.searchsorted(r_dates, block_dates, side='right') - 1
            valid = pos >= 0  # positions where such a date exists
            if np.any(valid):
                # Fill outputs for left rows where we found a valid match
                out_vals[block_pos[valid]]  = r_vals[pos[valid]]
                out_dates[block_pos[valid]] = r_dates[pos[valid]]
        # Move to next key block
        i = j

    return out_vals, out_dates


# ---------- Small helpers ----------

def pctile(s, q):
    """
    Safe percentile calculation: returns quantile q of Series `s`,
    or NaN if quantile fails.
    """
    try:
        return s.quantile(q, interpolation='linear')
    except Exception:
        return np.nan


def summarize_pct(series: pd.Series):
    """
    Computes a summary of finite values of a series:
      - count
      - mean, median
      - 1% winsorized mean
      - selected deciles (p10, p20, ..., p90)
    Infinite values are treated as NaN and removed.
    If no finite values remain, returns an empty dict.
    """
    # Replace +/-inf with NaN and drop all NaNs
    s = series.replace([np.inf, -np.inf], np.nan).dropna()
    if s.empty:
        return {}
    
    # FIX APPLIED HERE: .to_numpy().copy() ensures the array is writable for winsorize
    w_mean = winsorize(s.to_numpy().copy(), limits=[0.01, 0.01]).mean()
    
    return {
        "finite_rows": len(s),
        "mean": s.mean(),
        "median": s.median(),
        "winsorized_mean_1pct": w_mean,
        "p10": pctile(s, 0.10), "p20": pctile(s, 0.20), "p30": pctile(s, 0.30),
        "p40": pctile(s, 0.40), "p50": pctile(s, 0.50), "p60": pctile(s, 0.60),
        "p70": pctile(s, 0.70), "p80": pctile(s, 0.80), "p90": pctile(s, 0.90)
    }


# ---------- Priority for full-year candidates ----------

# Fixed priority mapping for full-year candidates:
#   'A'  : actual annual value
#   'Q4' : annual proxy from four quarters
#   'T3' : annual proxy from three trimesters
#   'S2' : annual proxy from two semesters
_PERIOD_PRIORITY = {
    'A': 100,  # highest priority: actual annual
    'Q4': 90,  # then Q1+Q2+Q3+Q4
    'T3': 80,  # then T1+T2+T3
    'S2': 70,  # then S1+S2
}

# ============================ MAIN ============================

# Only run the main logic if a global DataFrame `mixed_encoded` exists and is not None
if 'mixed_encoded' in globals() and mixed_encoded is not None:
    # Count initial input rows
    input_rows = len(mixed_encoded)
    print(f"Input dataset contains {input_rows:,} rows before processing.\n")

    # Work on a copy of the input dataset
    working = mixed_encoded.copy()

    # -------------------------------------------------------------------------
    # Exclude certain frequencies (E/L/R/U)
    # -------------------------------------------------------------------------
    # Create mask of rows whose Frequency is one of E, L, R, U (case-insensitive)
    excl_mask = working['Frequency'].astype(str).str.upper().isin(['E', 'L', 'R', 'U'])
    # Count how many rows will be excluded
    excluded_rows = int(excl_mask.sum())
    # Keep only rows that are NOT in the exclusion set
    working = working.loc[~excl_mask].copy()

    # -------------------------------------------------------------------------
    # Set dtypes and normalize important columns
    # -------------------------------------------------------------------------
    # Convert PIT Date to datetime (coerce errors -> NaT), floor to day
    working['PIT Date']     = pd.to_datetime(working['PIT Date'], errors='coerce').dt.floor('D')
    # FiscalPeriod: numeric (e.g., 2021, 2022, ...)
    working['FiscalPeriod'] = pd.to_numeric(working['FiscalPeriod'], errors='coerce')
    # Value: numeric (float)
    working['Value']        = pd.to_numeric(working['Value'], errors='coerce')

    # Convert key ID / code columns to string to ensure consistency
    for c in ['ID', 'HistCurrency', 'ItemCode', 'Frequency', 'Str_FiscalPrd']:
        if c in working.columns:
            working[c] = working[c].astype(str)

    # -------------------------------------------------------------------------
    # Parse Q/S/T markers from Str_FiscalPrd (like 'Q1Y2023')
    # -------------------------------------------------------------------------
    # Extract quarter number Q1..Q4 from e.g. "Q1Y2023" into QNUM
    working['QNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^Q([1-4])Y', expand=False),
        errors='coerce'
    )
    # Extract semiannual number S1..S2 into SNUM
    working['SNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^S([1-2])Y', expand=False),
        errors='coerce'
    )
    # Extract trimester number T1..T3 into TNUM
    working['TNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^T([1-3])Y', expand=False),
        errors='coerce'
    )

    # -------------------------------------------------------------------------
    # Ensure period columns exist (Q1..Q4, S1..S2, T1..T3, A + their date cols)
    # -------------------------------------------------------------------------
    # Create value columns for Q1..Q4, S1..S2, T1..T3, A if they are missing
    for c in [*(f'Q{i}' for i in range(1, 5)),
              *(f'S{i}' for i in range(1, 3)),
              *(f'T{i}' for i in range(1, 4)),
              'A']:
        if c not in working.columns:
            working[c] = np.nan

    # Create corresponding *_Date columns if missing
    for c in [*(f'Q{i}_Date' for i in range(1, 5)),
              *(f'S{i}_Date' for i in range(1, 3)),
              *(f'T{i}_Date' for i in range(1, 4)),
              'A_Date']:
        if c not in working.columns:
            working[c] = pd.NaT

    # Base key for many of the as-of mappings
    base_keys = ['ID', 'HistCurrency', 'ItemCode', 'FiscalPeriod']

    # -------------------------------------------------------------------------
    # 1) Derive TrueValue from annuals (A/B frequencies)
    # -------------------------------------------------------------------------
    # Mask annual-like rows where Value is present
    mask_annual = working['Frequency'].isin(['A', 'B']) & working['Value'].notna()
    # annual_src: one row per (ID, FiscalPeriod, HistCurrency) with last PIT Date
    annual_src = (
        working.loc[mask_annual,
                    ['ID', 'FiscalPeriod', 'HistCurrency', 'PIT Date', 'Value']]
        .sort_values(['ID', 'FiscalPeriod', 'HistCurrency', 'PIT Date'])
        .drop_duplicates(['ID', 'FiscalPeriod', 'HistCurrency'], keep='last')
        .rename(columns={'Value': 'TrueValue', 'PIT Date': 'TrueValue_Date'})
    )
    # Left-join true annual value back onto working
    working = working.merge(
        annual_src,
        on=['ID', 'FiscalPeriod', 'HistCurrency'],
        how='left'
    )

    # -------------------------------------------------------------------------
    # 2) As-of mapping (same FiscalPeriod) for A/Q/S/T
    # -------------------------------------------------------------------------

    # ----- Annual -----
    # Source rows for annual frequencies A/B
    src_A = working.loc[
        working['Frequency'].isin(['A', 'B']) & working['Value'].notna(),
        base_keys + ['PIT Date', 'Value']
    ].copy()
    # As-of join: for each working row, get most recent annual value by PIT Date
    vA, dA = asof_numpy(working, src_A, by_cols=base_keys)
    working['A'], working['A_Date'] = vA, dA
    # Origin fiscal period of annual value (same as row's FiscalPeriod when present)
    working['A_OriginFP'] = np.where(
        working['A'].notna(), working['FiscalPeriod'], np.nan
    )

    # ----- Quarterly -----
    # Source rows for quarterly frequencies (Q/C) with valid QNUM
    src_Q = working.loc[
        working['Frequency'].isin(['Q', 'C']) & working['QNUM'].notna(),
        base_keys + ['QNUM', 'PIT Date', 'Value']
    ].copy()
    for q in (1, 2, 3, 4):
        # Restrict to a specific quarter q
        rv = src_Q[src_Q['QNUM'] == q].drop(columns=['QNUM'])
        # As-of join for that quarter
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'Q{q}', f'Q{q}_Date'
        working[col], working[dcol] = v, d
        # Origin FP column for that quarter
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        # Fill origin FP only where quarter value is non-null and origin not yet set
        m = working[col].notna() & working[ocol].isna()
        working.loc[m, ocol] = working.loc[m, 'FiscalPeriod']

    # ----- Semiannual -----
    # Source rows for semiannual frequencies (S/F) with valid SNUM
    src_S = working.loc[
        working['Frequency'].isin(['S', 'F']) & working['SNUM'].notna(),
        base_keys + ['SNUM', 'PIT Date', 'Value']
    ].copy()
    for s in (1, 2):
        rv = src_S[src_S['SNUM'] == s].drop(columns=['SNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'S{s}', f'S{s}_Date'
        working[col], working[dcol] = v, d
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        m = working[col].notna() & working[ocol].isna()
        working.loc[m, ocol] = working.loc[m, 'FiscalPeriod']

    # ----- Trimester -----
    # Source rows for trimester frequencies (T/K) with valid TNUM
    src_T = working.loc[
        working['Frequency'].isin(['T', 'K']) & working['TNUM'].notna(),
        base_keys + ['TNUM', 'PIT Date', 'Value']
    ].copy()
    for t in (1, 2, 3):
        rv = src_T[src_T['TNUM'] == t].drop(columns=['TNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'T{t}', f'T{t}_Date'
        working[col], working[dcol] = v, d
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        m = working[col].notna() & working[ocol].isna()
        working.loc[m, ocol] = working.loc[m, 'FiscalPeriod']

    # -------------------------------------------------------------------------
    # 3) Prepare labels & normalize dates (NO prev-year fill, NO forward-fill)
    # -------------------------------------------------------------------------
    # Sort working data consistently for downstream calculations
    working = working.sort_values(['ID', 'HistCurrency', 'FiscalPeriod', 'PIT Date'])

    # List of all period value columns
    value_cols_all  = [f'Q{i}' for i in range(1, 5)] + \
                      [f'S{i}' for i in range(1, 3)] + \
                      [f'T{i}' for i in range(1, 4)] + ['A']
    # Corresponding date columns
    date_cols_all   = [f'{c}_Date' for c in value_cols_all]
    # Corresponding origin FP columns
    origin_cols_all = [f'{c}_OriginFP' for c in value_cols_all]

    # Ensure that all date columns are proper datetimes (floor to day)
    # Note: explicitly no groupby-forward-fill here – only asof-filled values remain
    for c in date_cols_all:
        if c in working.columns:
            working[c] = pd.to_datetime(working[c], errors='coerce').dt.floor('D')

    # -------------------------------------------------------------------------
    # 4) Build full-year candidates from fixed sets (Q1–Q4, S1–S2, T1–T3)
    # -------------------------------------------------------------------------
    def full_year_from_fixed(row, labels, pit, cutoff):
        """
        Fixed full-year from given labels (e.g. Q1..Q4, S1..S2, T1..T3):

          - Requires ALL labels to have:
              value, date, origin_fp.
          - All dates must be within [cutoff, pit].
          - origin_fp = max(origin_fp_i)  (the 'newest year' across components).

        Returns (dt, val, origin_fp) or (NaT, NaN, NaN) if any requirement fails.
        """
        vals, dts, fps = [], [], []
        for lbl in labels:
            v = row.get(lbl, np.nan)
            d = row.get(f'{lbl}_Date', pd.NaT)
            o = row.get(f'{lbl}_OriginFP', np.nan)

            # Require non-null for value, date, origin FP
            if pd.isna(v) or pd.isna(d) or pd.isna(o):
                return pd.NaT, np.nan, np.nan

            # Normalize date and check it lies within [cutoff, pit]
            d = pd.to_datetime(d, errors='coerce')
            if pd.isna(d) or not (cutoff <= d <= pit):
                return pd.NaT, np.nan, np.nan

            vals.append(float(v))
            dts.append(d)
            fps.append(int(o))

        # Sum all component values, pick latest date and max origin FP
        total_val = float(np.nansum(vals))
        latest_dt = max(dts)
        origin_fp = max(fps)
        return latest_dt, total_val, origin_fp

    def pick_annpit_sum_with_origin(row):
        """
        For each row, pick the best annual PIT-based value (AnnPITValue) by:

        1) Building a time window [PIT - 365 days, PIT].
        2) Generating candidate annual values:
           - actual annual A (if within window and with origin FP),
           - Q4 proxy from Q1..Q4,
           - S2 proxy from S1..S2,
           - T3 proxy from T1..T3,
           each with associated priority and origin FP.
        3) Ranking candidates according to:
           - Same-year A (highest),
           - Same-year proxies (Q4/T3/S2),
           - Prior-year A,
           - Prior-year proxies,
           - Otherwise: any remaining candidate by (priority, latest date).
        4) Returning the chosen candidate's value (keeping zeros, but filtering NaNs).
        """
        pit = row['PIT Date']
        if pd.isna(pit):
            return np.nan
        cutoff = pit - timedelta(days=365)  # Now works because timedelta is imported

        # Current row's fiscal period, cast to int if possible
        fp = row.get('FiscalPeriod', np.nan)
        try:
            fp_int = int(fp) if not pd.isna(fp) else None
        except Exception:
            fp_int = None

        # Collect candidate tuples: (label, priority, date, value, origin_fp)
        candidates = []

        # --- Candidate A: actual annual (0 is allowed)
        A_val = row.get('A', np.nan)
        A_dt  = row.get('A_Date', pd.NaT)
        A_ofp = row.get('A_OriginFP', np.nan)
        if pd.notna(A_val) and pd.notna(A_dt) and not pd.isna(A_ofp):
            A_dt = pd.to_datetime(A_dt, errors='coerce')
            if pd.notna(A_dt) and (cutoff <= A_dt <= pit):
                candidates.append(('A', _PERIOD_PRIORITY['A'], A_dt, float(A_val), int(A_ofp)))

        # --- Candidate Q4: sum of Q1..Q4 (0 allowed)
        q4_dt, q4_val, q4_fp = full_year_from_fixed(
            row, [f'Q{i}' for i in range(1, 5)], pit, cutoff
        )
        if pd.notna(q4_val) and pd.notna(q4_dt) and not pd.isna(q4_fp):
            candidates.append(('Q4', _PERIOD_PRIORITY['Q4'], q4_dt, float(q4_val), int(q4_fp)))

        # --- Candidate S2: sum of S1..S2 (0 allowed)
        s2_dt, s2_val, s2_fp = full_year_from_fixed(
            row, [f'S{i}' for i in range(1, 3)], pit, cutoff
        )
        if pd.notna(s2_val) and pd.notna(s2_dt) and not pd.isna(s2_fp):
            candidates.append(('S2', _PERIOD_PRIORITY['S2'], s2_dt, float(s2_val), int(s2_fp)))

        # --- Candidate T3: sum of T1..T3 (0 allowed)
        t3_dt, t3_val, t3_fp = full_year_from_fixed(
            row, [f'T{i}' for i in range(1, 4)], pit, cutoff
        )
        if pd.notna(t3_val) and pd.notna(t3_dt) and not pd.isna(t3_fp):
            candidates.append(('T3', _PERIOD_PRIORITY['T3'], t3_dt, float(t3_val), int(t3_fp)))

        # If no candidates, return NaN
        if not candidates:
            return np.nan

        # Filter out NaN-valued candidates (keep 0-valued ones)
        def valid(seq):
            return [c for c in seq if not np.isnan(c[3])]

        # Relationship between row's fiscal period and candidate's origin FP:
        # same / prior / other / unknown
        def rel(c):
            _, _, _, _, ofp = c
            if fp_int is None or ofp is None:
                return 'unknown'
            if ofp == fp_int:
                return 'same'
            if ofp == fp_int - 1:
                return 'prior'
            return 'other'

        # 1) Same-year actual annual A (max by date)
        same_A = valid(c for c in candidates if c[0] == 'A' and rel(c) == 'same')
        if same_A:
            best = max(same_A, key=lambda x: x[2])
            return best[3]

        # 2) Same-year proxies (Q4/T3/S2), max by (priority, date)
        same_proxies = valid(
            c for c in candidates
            if c[0] in ('Q4', 'S2', 'T3') and rel(c) == 'same'
        )
        if same_proxies:
            best = max(same_proxies, key=lambda x: (x[1], x[2]))
            return best[3]

        # 3) Prior-year actual annual A
        prior_A = valid(c for c in candidates if c[0] == 'A' and rel(c) == 'prior')
        if prior_A:
            best = max(prior_A, key=lambda x: x[2])
            return best[3]

        # 4) Prior-year proxies (Q4/S2/T3) if no prior A
        prior_proxies = valid(
            c for c in candidates
            if c[0] in ('Q4', 'S2', 'T3') and rel(c) == 'prior'
        )
        if prior_proxies:
            best = max(prior_proxies, key=lambda x: (x[1], x[2]))
            return best[3]

        # 5) Fallback: any candidate (other/unknown) by (priority, date)
        others = valid(candidates)
        if others:
            best = max(others, key=lambda x: (x[1], x[2]))
            return best[3]

        # Final fallback: 0.0 (should rarely be reached)
        return 0.0

    # Apply the selection function row-wise to produce AnnPITValue
    working['AnnPITValue'] = working.apply(pick_annpit_sum_with_origin, axis=1)

    # -------------------------------------------------------------------------
    # 5) QC: Future-date check + PRE-DROP stats
    # -------------------------------------------------------------------------
    # Columns whose dates should not exceed PIT Date
    date_cols = [
        'A_Date',
        'Q1_Date', 'Q2_Date', 'Q3_Date', 'Q4_Date',
        'S1_Date', 'S2_Date',
        'T1_Date', 'T2_Date', 'T3_Date'
    ]
    # Restrict to ones actually present
    present = [c for c in date_cols if c in working.columns]

    viol_counts = {}  # per-label violation counts
    # Mask for rows with any future-dated period
    any_mask = pd.Series(False, index=working.index)

    for c in present:
        # A violation is when period date > PIT Date (both need to be non-null)
        m = (
            working[c].notna() &
            working['PIT Date'].notna() &
            (pd.to_datetime(working[c], errors='coerce') > working['PIT Date'])
        )
        viol_counts[c] = int(m.sum())
        any_mask |= m  # accumulate violations across columns

    total_future_viol = int(any_mask.sum())
    print("\n=== Future-date check (period dates > PIT Date) ===")
    print("Per-label violations:", viol_counts)
    print(f"Rows with ANY future-dated period value: {total_future_viol}")
    # Flagrows with at least one future-date error
    working['HasFutureDateError'] = any_mask

    # -------------------------------------------------------------------------
    # 6) AnnPITValue_Pct + quality drop
    # -------------------------------------------------------------------------
    # Compute AnnPITValue as % of TrueValue (only when TrueValue != 0)
    working['AnnPITValue_Pct'] = np.where(
        working['AnnPITValue'].notna() &
        working['TrueValue'].notna() &
        (working['TrueValue'] != 0),
        (working['AnnPITValue'] / working['TrueValue']) * 100,
        np.nan
    )

    # Stats before dropping low-quality rows
    pre_stats = summarize_pct(working['AnnPITValue_Pct'])
    print("\n=== AnnPITValue_Pct summary (finite only) — BEFORE quality drop ===")
    for k, v in pre_stats.items():
        print(f"{k:>20}: {v}")

    # Build masks for dropping:
    pct = working['AnnPITValue_Pct']
    is_inf = np.isinf(pct)  # infinite percentages
    is_finite = np.isfinite(pct)
    # Out-of-range if % > 200 or % < 50 (but finite)
    out_of_range = is_finite & ((pct > 200) | (pct < 50))
    # Rows to drop: infinite or out-of-range values
    to_drop_quality = is_inf | out_of_range

    dropped_quality_rows = int(to_drop_quality.sum())
    print(f"\nRows to drop due to AnnPITValue_Pct (±inf or >200 or <50): {dropped_quality_rows:,}")

    # Keep only rows that passed the quality filter
    working = working.loc[~to_drop_quality].copy()

    # Stats after dropping
    post_stats = summarize_pct(working['AnnPITValue_Pct'])
    print("\n=== AnnPITValue_Pct summary — AFTER quality drop ===")
    if post_stats:
        for k, v in post_stats.items():
            print(f"{k:>20}: {v}")
    else:
        print("No finite values remain after the quality drop.")

    # -------------------------------------------------------------------------
    # 7) Final columns & save
    # -------------------------------------------------------------------------
    # Base descriptive columns to keep (if present)
    base_cols = [
        'ID', 'CompanyName', 'ImplCountry', 'CurrentCurrency', 'HistCurrency',
        'PIT Date', 'Frequency', 'UpdateCode', 'FiscalPeriod', 'FYE Month',
        'ItemCode', 'Value', 'Str_FiscalPrd'
    ]

    # Build ordered list of period date/value columns
    freq_cols = []
    for i in range(1, 5):
        freq_cols += [f'Q{i}_Date', f'Q{i}']
    for i in range(1, 3):
        freq_cols += [f'S{i}_Date', f'S{i}']
    for i in range(1, 4):
        freq_cols += [f'T{i}_Date', f'T{i}']
    freq_cols += ['A_Date', 'A']

    # Final set of columns to keep in output
    keep_cols = (
        [c for c in base_cols if c in working.columns] +
        ['TrueValue', 'AnnPITValue', 'AnnPITValue_Pct', 'HasFutureDateError'] +
        [c for c in freq_cols if c in working.columns]
    )

    # Helper columns to drop before export
    drop_cols = ['QNUM', 'SNUM', 'TNUM', 'TrueValue_Date']
    # Also drop all *_OriginFP columns
    drop_cols += [c for c in working.columns if c.endswith('_OriginFP')]
    working.drop(columns=[c for c in drop_cols if c in working.columns],
                 inplace=True, errors='ignore')

    # Reorder and restrict columns to the final layout
    mixed_processed = working.reindex(columns=keep_cols)

    # Sanity checks: necessary globals must exist
    assert 'Temp_file_path_DP' in globals(), "Temp_file_path_DP not found."
    assert 'base_output_filename' in globals(), "base_output_filename not found (set in Cell 0)."

    # Build full output path and save pipe-delimited file
    out_full = os.path.join(Temp_file_path_DP, f"{base_output_filename}.txt")
    mixed_processed.to_csv(out_full, sep='|', index=False)
    print("\nSaved full:", out_full)

    # Create a subset for lighter inspection
    subset_cols = ["ID", "PIT Date", "CompanyName", "HistCurrency",
                   "FiscalPeriod", "AnnPITValue"]
    # Only keep subset columns that actually exist
    subset_cols_existing = [col for col in subset_cols if col in mixed_processed.columns]
    subset_df = mixed_processed[subset_cols_existing].copy()
    out_subset = os.path.join(Temp_file_path_DP,
                              f"{base_output_filename}_subset.txt")
    subset_df.to_csv(out_subset, sep='|', index=False)
    print("Saved subset:", out_subset)
    # Explicitly delete subset_df to free memory
    del subset_df

    # -------------------------------------------------------------------------
    # 8) Row-accounting overview
    # -------------------------------------------------------------------------
    output_rows = len(mixed_processed)
    print("\n=== Row Accounting ===")
    print(f"Input rows:                     {input_rows:,}")
    print(f"Excludedby Frequency (E/L/R/U): {excluded_rows:,}")
    print(f"Dropped by quality (Pct rules): {dropped_quality_rows:,}")
    print(f"Output rows (final):            {output_rows:,}")
    # Sum up excluded + dropped + remaining and check against original count
    check_total = excluded_rows + dropped_quality_rows + output_rows
    print(f"Check: excluded + dropped + output = {check_total:,}")
    if check_total == input_rows:
        print("Row counts reconcile exactly.")
    else:
        print(f"Mismatch of {input_rows - check_total:+,} rows. "
              "Investigate upstream filtering or unexpected drops.")

    # Trigger garbage collection as a final cleanup step
    gc.collect()

else:
    # If the main input dataset is not available, skip all processing
    print("mixed_encoded not found or None; skipping.")

Input dataset contains 4,253,913 rows before processing.


=== Future-date check (period dates > PIT Date) ===
Per-label violations: {'A_Date': 0, 'Q1_Date': 0, 'Q2_Date': 0, 'Q3_Date': 0, 'Q4_Date': 0, 'S1_Date': 0, 'S2_Date': 0, 'T1_Date': 0, 'T2_Date': 0, 'T3_Date': 0}
Rows with ANY future-dated period value: 0

=== AnnPITValue_Pct summary (finite only) — BEFORE quality drop ===
         finite_rows: 2589547
                mean: 23775.823858645716
              median: 100.0
winsorized_mean_1pct: 99.75021033963792
                 p10: 97.67441860465115
                 p20: 100.0
                 p30: 100.0
                 p40: 100.0
                 p50: 100.0
                 p60: 100.0
                 p70: 100.0
                 p80: 100.0
                 p90: 101.70941367530939

Rows to drop due to AnnPITValue_Pct (±inf or >200 or <50): 103,696

=== AnnPITValue_Pct summary — AFTER quality drop ===
         finite_rows: 2485851
                mean: 100.59589215405154
      

### Mixed 10

#### Set Index

In [94]:
# =====================================================================================
# SUMMARY
# =====================================================================================
# This code selects which Mixed_* dataset should be processed by choosing an index
# (e.g., Mixed_1, Mixed_2, ...). It then:
#
#   1. Ensures that a dictionary `mixed_vars` exists, mapping keys like "Mixed_1"
#      to item names.
#   2. Builds the key corresponding to the selected index and retrieves the
#      associated item name (`target_item_name`).
#   3. Prints which Mixed_* item was selected.
#   4. Constructs paths and filenames based on global variables and the selected item.
#   5. Ensures that the output directory exists by creating it if necessary.
#
# The goal is to centralize selection of a single Mixed_* dataset and prepare paths
# for downstream processing.


# === Select which Mixed_* item to run ===
mixed_index = 10  # Change this to process another dataset (e.g., 10)

# Validate that the dictionary of mixed item names exists
assert 'mixed_vars' in globals(), "mixed_vars dict not found in globals()."

# Build the key (e.g., "Mixed_1") and retrieve the associated item name
item_key = f"Mixed_{mixed_index}"
target_item_name = mixed_vars.get(item_key)
assert target_item_name, f"{item_key} not found in mixed_vars."

# Inform which item was selected
print(f"Selected: {item_key}  ->  ItemName: '{target_item_name}'")

# === Paths (reusing globals) ===
assert 'Temp_file_path_DP' in globals(), "Temp_file_path_DP not found."

# Construct the name of the input file for the selected item
file_name = f"work_subset_{target_item_name}.txt"
file_path = os.path.join(Temp_file_path_DP, file_name)

# Construct the base name for output files (will later be suffixed)
base_output_filename = f"Mixed_{target_item_name}_complete"

# Ensure the output directory exists; create it (including parent dirs) if needed
Path(Temp_file_path_DP).mkdir(parents=True, exist_ok=True)


Selected: Mixed_10  ->  ItemName: 'Sales_Per_Share'


#### Import relevant data



In [95]:
# =====================================================================================
# SUMMARY
# =====================================================================================
# This block:
#   1. Announces the import of a full dataset for the given `target_item_name`.
#   2. Checks whether the file at `file_path` exists.
#   3. If it exists, calls `import_file_to_dataframe(file_path)` to load the data
#      into `mixed_raw`.
#   4. If the loaded DataFrame is non-empty, prints a success message including
#      the number of rows and shows the first few rows (via display or fallback
#      to text printing).
#   5. If the load fails or returns an empty DataFrame, prints a warning and
#      creates an empty DataFrame.
#   6. If the file does not exist, prints an error message and sets `mixed_raw`
#      to an empty DataFrame.
#   7. Finally, it runs `gc.collect()` to trigger garbage collection and free
#      memory.
#
# Note: All previous emoji symbols in the print statements have been removed.

# Inform the user that we are starting the import for this item
print(f"\nImporting full dataset for Item: '{target_item_name}' ...")

# Check if the file path exists on disk
if os.path.exists(file_path):
    # If the file exists, attempt to import it into a DataFrame
    mixed_raw = import_file_to_dataframe(file_path)

    # If the DataFrame is successfully loaded and not empty
    if mixed_raw is not None and not mixed_raw.empty:
        # Print a success message including row count with thousands separator
        print(f"Full dataset loaded successfully: {len(mixed_raw):,} rows total.")

        try:
            # Try to display the first few rows (Jupyter / IPython display)
            display(mixed_raw.head())
        except Exception:
            # If display is not available, fall back to printing as plain text
            print(mixed_raw.head().to_string(index=False))
    else:
        # If DataFrame is None or empty, warn and create an empty DataFrame
        print("Dataset appears empty or could not be loaded.")
        mixed_raw = pd.DataFrame()
else:
    # If the file does not exist, report an error and set mixed_raw to empty
    print(f"File not found: {file_path}")
    mixed_raw = pd.DataFrame()

# Run garbage collection to free up memory after the load attempt
gc.collect()



Importing full dataset for Item: 'Sales_Per_Share' ...
Full dataset loaded successfully: 1,929,643 rows total.


Unnamed: 0,ID,CompanyName,ImplCountry,CurrentCurrency,HistCurrency,PIT Date,Frequency,UpdateCode,FiscalPeriod,FYE Month,ItemCode,Value
0,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1999-10-01,A,,1992,December,5508,15.29247
1,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1999-10-01,A,,1993,December,5508,17.2111
2,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1999-10-01,A,,1994,December,5508,20.27582
3,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1999-10-01,A,,1995,December,5508,12.96369
4,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1999-10-01,A,,1996,December,5508,10.90916


0

#### Encode Frequency Code (Check of output required!)

In [96]:
# =====================================================================================
# SUMMARY
# =====================================================================================
# This snippet provides:
#
# 1. A helper function `last2` that returns the last two digits of a number as a
#    zero-padded string (for building YY strings).
#
# 2. A function `add_str_fiscalprd(df)` which:
#    - Works on a copy of an input DataFrame containing financial periods.
#    - Normalizes the 'Frequency' (upper-case, no missing).
#    - Stores the original 'FiscalPeriod' and converts it to numeric.
#    - Creates a string representation 'Str_FiscalPrd' depending on the frequency:
#         - Q/C/E/R: quarter-based ("QnYyy")
#         - A/B: annual ("Yyy")
#         - F/S: semiannual ("SnYyy")
#         - K/T/L/U: trimester-like ("TnYyy")
#    - Derives an implied full-year integer 'ImplFiscPer_Calculated' from the
#      two-digit year (80–99 => 19xx, else 20xx).
#    - For annual rows (A/B), checks discrepancies between original
#      'FiscalPeriod' and implied full-year; prints a small preview & total count.
#    - Overwrites 'FiscalPeriod' with 'ImplFiscPer_Calculated' and drops helper
#      columns.
#
# 3. A small driver block that:
#    - Checks that `mixed_raw` exists and is non-empty.
#    - Applies `add_str_fiscalprd` to produce `mixed_encoded`.
#    - Displays a head preview or prints a message and sets `mixed_encoded = None`
#      if input is missing/empty.

def last2(n):
    """Return last two digits as zero-padded string, or None if NaN."""
    # If input is NaN (or pandas-style missing), return None (no digits)
    if pd.isna(n):
        return None
    # Convert to int, format as 4-digit zero-padded string, take last 2 chars
    # Example: n=2023 -> "2023"[-2:] -> "23"
    return f"{int(n):04d}"[-2:]


def add_str_fiscalprd(df):
    """
    Creates 'Str_FiscalPrd' and overwrites 'FiscalPeriod' with an implied full year.
    Also prints discrepancies for Annual (A,B) rows between original and calculated year.

    The mapping logic:
      - Quarterly (C, Q, E, R):   fp encodes a quarter index -> "Q{1-4}Y{yy}"
      - Annual   (A, B):         fp is the year itself      -> "Y{yy}"
      - Semiannual (F, S):       fp encodes half-year       -> "S{1-2}Y{yy}"
      - Trimester-like (K, T,
         L, U):                  fp encodes trimester       -> "T{1-3}Y{yy}"
    Then we re-infer the full year from the YY part with 19xx/20xx rule.
    """
    # Work on a copy to avoid mutating the original input DataFrame
    df = df.copy()

    # Normalize frequency codes:
    # - uppercase for consistency
    # - fill missing values with empty string
    df["Frequency"] = df["Frequency"].str.upper().fillna("")

    # Store original FiscalPeriod in a separate column for later comparison
    df['Original_FiscalPeriod'] = df['FiscalPeriod']

    # Convert FiscalPeriod to numeric, coerce errors to NaN
    fp = pd.to_numeric(df["FiscalPeriod"], errors="coerce")

    # Masks for different frequency groups
    m_quarter = df["Frequency"].isin(["C", "Q", "E", "R"])  # quarter-based
    m_AB      = df["Frequency"].isin(["A", "B"])            # annual
    m_FS      = df["Frequency"].isin(["F", "S"])            # semiannual
    m_KTLU    = df["Frequency"].isin(["K", "T", "L", "U"])  # trimester-like

    # Initialize Str_FiscalPrd as NaN; we will fill per frequency group
    df["Str_FiscalPrd"] = np.nan

    # -------------------------
    # Quarter-based encoding
    # -------------------------
    # Quarter number: (fp % 4) + 1 => yields 1..4 (if fp is integer-based encoding)
    q_part = ((fp % 4) + 1).where(m_quarter)
    # Year part for quarter: integer division by 4, then take last two digits
    q_year = (fp // 4).where(m_quarter).apply(last2)
    # Assign "Q{quarter}Y{yy}" for quarter frequencies
    df.loc[m_quarter, "Str_FiscalPrd"] = (
        "Q" +
        q_part.astype("Int64").astype(str) +
        "Y" +
        q_year.fillna('')
    )

    # -------------------------
    # Annual encoding (A, B)
    # -------------------------
    # Year part is fp itself for A/B; take last two digits via last2
    ab_year = fp.where(m_AB).apply(last2)
    # Assign "Y{yy}" for annual frequencies
    df.loc[m_AB, "Str_FiscalPrd"] = "Y" + ab_year.fillna('')

    # -------------------------
    # Semiannual encoding (F, S)
    # -------------------------
    # Semester number: (fp % 2) + 1 => 1 or 2
    fs_sem  = ((fp % 2) + 1).where(m_FS)
    # Year part: fp // 2, then last two digits
    fs_year = (fp // 2).where(m_FS).apply(last2)
    # Assign "S{sem}Y{yy}" for F/S
    df.loc[m_FS, "Str_FiscalPrd"] = (
        "S" +
        fs_sem.astype("Int64").astype(str) +
        "Y" +
        fs_year.fillna('')
    )

    # -------------------------
    # Trimester-like encoding (K, T, L, U)
    # -------------------------
    # Term number: (fp % 3) + 1 => 1..3
    t_term  = ((fp % 3) + 1).where(m_KTLU)
    # Year part: fp // 3, then last two digits
    t_year  = (fp // 3).where(m_KTLU).apply(last2)
    # Assign "T{term}Y{yy}" for K/T/L/U frequencies
    df.loc[m_KTLU, "Str_FiscalPrd"] = (
        "T" +
        t_term.astype("Int64").astype(str) +
        "Y" +
        t_year.fillna('')
    )

    # ---------------------------------------------------------------------
    # Implied full year from Str_FiscalPrd (19xx / 20xx reconstruction)
    # ---------------------------------------------------------------------
    # Extract two-digit year from "Yyy" pattern, e.g., "Q1Y23" -> "23"
    year_part = df['Str_FiscalPrd'].str.extract(r'Y(\d{2})', expand=False)
    # Convert that to numeric (NaN if not parseable)
    year_numeric = pd.to_numeric(year_part, errors='coerce')

    # Map two-digit year to full year:
    #  - 80–99 => 19xx
    #  - else  => 20xx
    df['ImplFiscPer_Calculated'] = year_numeric.apply(
        lambda x: int(f"19{int(x):02d}") if pd.notna(x) and x >= 80
        else (int(f"20{int(x):02d}") if pd.notna(x) else np.nan)
    )

    # ---------------------------------------------------------------------
    # Discrepancies check for Annual frequencies (A, B)
    # ---------------------------------------------------------------------
    # Work on a subset of annual frequencies only
    annual_rows_for_check = df[m_AB].copy()

    # Build boolean mask where implied full year does NOT match the original
    discrepancy_mask_annual = ~(
        # Either they match numerically...
        (annual_rows_for_check['ImplFiscPer_Calculated'] ==
         pd.to_numeric(annual_rows_for_check['Original_FiscalPeriod'], errors='coerce'))
        |
        # ...or both are NaN
        (annual_rows_for_check['ImplFiscPer_Calculated'].isna() &
         pd.to_numeric(annual_rows_for_check['Original_FiscalPeriod'], errors='coerce').isna())
    )

    # Subset rows with discrepancies
    discrepancy_rows = annual_rows_for_check[discrepancy_mask_annual].copy()

    # If we have any, show a short sample and the total count
    if not discrepancy_rows.empty:
        print("\nDiscrepancies found between original FiscalPeriod and calculated ImplFiscPer for Annual (A, B) frequencies:")
        display(
            discrepancy_rows[
                ['ID', 'Frequency', 'Original_FiscalPeriod', 'Str_FiscalPrd', 'ImplFiscPer_Calculated']
            ].head()
        )
        print(f"Total discrepancies found for Annual frequencies: {len(discrepancy_rows)}")
    else:
        print("\nNo discrepancies found between original FiscalPeriod and calculated ImplFiscPer for Annual (A, B) frequencies.")

    # ---------------------------------------------------------------------
    # Overwrite FiscalPeriod and drop temporary helper columns
    # ---------------------------------------------------------------------
    # Replace 'FiscalPeriod' with the implied full-year value we just calculated
    df['FiscalPeriod'] = df['ImplFiscPer_Calculated']
    # Drop helper columns not needed downstream
    df.drop(columns=['Original_FiscalPeriod', 'ImplFiscPer_Calculated'], inplace=True)

    # Return the modified DataFrame
    return df


# =============================================================================
# Driver: apply encoding to mixed_raw if present and non-empty
# =============================================================================
if 'mixed_raw' in globals() and mixed_raw is not None and not mixed_raw.empty:
    # Inform which item we are encoding for
    print(f"Applying encoding to Mixed dataset for '{target_item_name}' ...")
    # Apply fiscal period encoding
    mixed_encoded = add_str_fiscalprd(mixed_raw)
    # Show a preview of the encoded data
    display(mixed_encoded.head())
else:
    # If mixed_raw is missing or empty, we skip and mark mixed_encoded as None
    print("mixed_raw not found or empty. Cannot perform encoding.")
    mixed_encoded = None


Applying encoding to Mixed dataset for 'Sales_Per_Share' ...


  df.loc[m_AB, "Str_FiscalPrd"] = "Y" + ab_year.fillna('')



No discrepancies found between original FiscalPeriod and calculated ImplFiscPer for Annual (A, B) frequencies.


Unnamed: 0,ID,CompanyName,ImplCountry,CurrentCurrency,HistCurrency,PIT Date,Frequency,UpdateCode,FiscalPeriod,FYE Month,ItemCode,Value,Str_FiscalPrd
0,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1999-10-01,A,,1992,December,5508,15.29247,Y92
1,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1999-10-01,A,,1993,December,5508,17.2111,Y93
2,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1999-10-01,A,,1994,December,5508,20.27582,Y94
3,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1999-10-01,A,,1995,December,5508,12.96369,Y95
4,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1999-10-01,A,,1996,December,5508,10.90916,Y96


#### Annualize data with most recent information (Check of output required!)

In [97]:
# @title
# =====================================================================================
# SUMMARY
# =====================================================================================
# This script takes an input DataFrame `mixed_encoded` (if present in the global scope)
# that contains financial time-series data (per company, item, currency, fiscal period,
# and PIT Date). It then:
#
# 1. Cleans and standardizes key columns (dates, numeric types, string IDs).
# 2. Excludes rows with certain frequencies (E/L/R/U).
# 3. Parses fiscal period strings into quarter/semester/trimester indicators (QNUM/SNUM/TNUM).
# 4. Uses a custom, vectorized "as-of" join (`asof_numpy`) to attach the most recent
#    annual, quarterly, semiannual, and trimester values for each (ID, HistCurrency,
#    ItemCode, FiscalPeriod) up to each row’s PIT Date.
# 5. Builds "full-year" candidate values from:
#       - actual annuals (A),
#       - sum of Q1..Q4 (Q4 proxy),
#       - sum of S1..S2 (S2 proxy),
#       - sum of T1..T3 (T3 proxy),
#    and selects the best candidate based on priorities and relationship to the row’s
#    fiscal period (same-year vs prior-year).
# 6. Computes an annual PIT-based metric `AnnPITValue` and compares it to the “true”
#    annual value (`TrueValue`) to derive a percentage `AnnPITValue_Pct` for QC.
# 7. Performs quality checks:
#       - Ensures no period-date is after the PIT Date.
#       - Drops rows whose `AnnPITValue_Pct` is outside the range [50, 200] or infinite.
# 8. Keeps a curated set of columns, drops helper columns, and saves:
#       - a full output file
#       - a subset file with key columns for quick inspection.
# 9. Prints row-accounting stats and frees some memory.
#
# If `mixed_encoded` is not defined or is None, it simply prints a message and exits.

import pandas as pd
import numpy as np
import os
import gc
from datetime import timedelta            # <--- Added to fix NameError
from scipy.stats.mstats import winsorize  # <--- Added to fix NameError

# Enable pandas "copy-on-write" behavior to reduce unintended chained assignment effects
pd.options.mode.copy_on_write = True

# ---------- Helper: fast as-of (right.PIT <= left.PIT) ----------

def _key(fr, cols):
    """
    Helper function to build a string key from multiple columns.
    For each row, join the values of 'cols' with '||'.
    """
    return fr[cols].astype(str).agg('||'.join, axis=1)


def asof_numpy(left_df: pd.DataFrame, right_df: pd.DataFrame, by_cols: list[str]):
    """
    For each row in left_df, find the latest (as-of) Value from right_df
    with matching by_cols and right_df['PIT Date'] <= left_df['PIT Date'].

    This is a manual, NumPy-based implementation of an "as-of merge" grouped by `by_cols`.
    It:
    - filters out rows with missing required fields,
    - normalizes PIT Date to daily precision,
    - groups right_df by composite key of `by_cols`,
    - for each left row, binary-searches into the matching right group by PIT Date,
      picking the last date <= the left PIT Date,
    - returns two aligned arrays: (values, dates) for the left_df rows.
    """
    # Initialize outputs with NaNs and NaT for all left_df rows
    out_vals  = np.full(len(left_df), np.nan, dtype='float64')
    out_dates = np.full(len(left_df), 'NaT', dtype='datetime64[ns]')

    # Required columns on each side
    left_req  = by_cols + ['PIT Date']
    right_req = by_cols + ['PIT Date', 'Value']

    # Mask: rows that have all required fields non-null
    lmask = left_df[left_req].notna().all(axis=1)
    rmask = right_df[right_req].notna().all(axis=1)

    # If either side has no valid rows, return the default empty outputs
    if not lmask.any() or not rmask.any():
        return out_vals, out_dates

    # Work on filtered copies only (avoid side effects)
    l = left_df.loc[lmask, left_req].copy()
    r = right_df.loc[rmask, right_req].copy()

    # Normalize PIT Date columns to datetime at day precision
    l['PIT Date'] = pd.to_datetime(l['PIT Date'], errors='coerce').dt.floor('D')
    r['PIT Date'] = pd.to_datetime(r['PIT Date'], errors='coerce').dt.floor('D')

    # Build composite keys for group-level match (based on by_cols)
    l['__k'] = _key(l, by_cols)
    r['__k'] = _key(r, by_cols)

    # Sort right side by key and date to enable binary search per key
    r = r.sort_values(['__k', 'PIT Date']).reset_index(drop=True)

    # Extract NumPy arrays for right side
    rk   = r['__k'].to_numpy()
    rdt  = r['PIT Date'].to_numpy()
    rval = r['Value'].to_numpy()

    # Find unique keys and first index for each key in the sorted right side
    uniq, first = np.unique(rk, return_index=True)

    # Build a dict: key -> (dates array, values array) slice
    slices = {}
    for i, k in enumerate(uniq):
        s = first[i]  # start of this key
        e = first[i + 1] if i + 1 < len(first) else len(r)  # end of this key
        slices[k] = (rdt[s:e], rval[s:e])

    # Left side indices and arrays
    l_idx = l.index.to_numpy()
    lk    = l['__k'].to_numpy()
    ldt   = l['PIT Date'].to_numpy()

    # Sort left side by key (stable mergesort to preserve original row order within key)
    order = np.argsort(lk, kind='mergesort')
    sk, sd, sp = lk[order], ldt[order], l_idx[order]

    # i iterates over the sorted left rows
    i = 0
    n = len(sk)
    while i < n:
        k = sk[i]  # current composite key
        j = i + 1
        # Find contiguous block [i:j) for this key
        while j < n and sk[j] == k:
            j += 1

        # Dates and original positions of this key’s left rows
        block_dates = sd[i:j]
        block_pos   = sp[i:j]

        # If we have matching right-hand slices for this key, do the as-of search
        if k in slices:
            r_dates, r_vals = slices[k]
            # searchsorted(..., 'right') - 1 gives index of last r_date <= block_date
            pos   = np.searchsorted(r_dates, block_dates, side='right') - 1
            valid = pos >= 0  # positions where such a date exists
            if np.any(valid):
                # Fill outputs for left rows where we found a valid match
                out_vals[block_pos[valid]]  = r_vals[pos[valid]]
                out_dates[block_pos[valid]] = r_dates[pos[valid]]
        # Move to next key block
        i = j

    return out_vals, out_dates


# ---------- Small helpers ----------

def pctile(s, q):
    """
    Safe percentile calculation: returns quantile q of Series `s`,
    or NaN if quantile fails.
    """
    try:
        return s.quantile(q, interpolation='linear')
    except Exception:
        return np.nan


def summarize_pct(series: pd.Series):
    """
    Computes a summary of finite values of a series:
      - count
      - mean, median
      - 1% winsorized mean
      - selected deciles (p10, p20, ..., p90)
    Infinite values are treated as NaN and removed.
    If no finite values remain, returns an empty dict.
    """
    # Replace +/-inf with NaN and drop all NaNs
    s = series.replace([np.inf, -np.inf], np.nan).dropna()
    if s.empty:
        return {}
    
    # FIX APPLIED HERE: .to_numpy().copy() ensures the array is writable for winsorize
    w_mean = winsorize(s.to_numpy().copy(), limits=[0.01, 0.01]).mean()
    
    return {
        "finite_rows": len(s),
        "mean": s.mean(),
        "median": s.median(),
        "winsorized_mean_1pct": w_mean,
        "p10": pctile(s, 0.10), "p20": pctile(s, 0.20), "p30": pctile(s, 0.30),
        "p40": pctile(s, 0.40), "p50": pctile(s, 0.50), "p60": pctile(s, 0.60),
        "p70": pctile(s, 0.70), "p80": pctile(s, 0.80), "p90": pctile(s, 0.90)
    }


# ---------- Priority for full-year candidates ----------

# Fixed priority mapping for full-year candidates:
#   'A'  : actual annual value
#   'Q4' : annual proxy from four quarters
#   'T3' : annual proxy from three trimesters
#   'S2' : annual proxy from two semesters
_PERIOD_PRIORITY = {
    'A': 100,  # highest priority: actual annual
    'Q4': 90,  # then Q1+Q2+Q3+Q4
    'T3': 80,  # then T1+T2+T3
    'S2': 70,  # then S1+S2
}

# ============================ MAIN ============================

# Only run the main logic if a global DataFrame `mixed_encoded` exists and is not None
if 'mixed_encoded' in globals() and mixed_encoded is not None:
    # Count initial input rows
    input_rows = len(mixed_encoded)
    print(f"Input dataset contains {input_rows:,} rows before processing.\n")

    # Work on a copy of the input dataset
    working = mixed_encoded.copy()

    # -------------------------------------------------------------------------
    # Exclude certain frequencies (E/L/R/U)
    # -------------------------------------------------------------------------
    # Create mask of rows whose Frequency is one of E, L, R, U (case-insensitive)
    excl_mask = working['Frequency'].astype(str).str.upper().isin(['E', 'L', 'R', 'U'])
    # Count how many rows will be excluded
    excluded_rows = int(excl_mask.sum())
    # Keep only rows that are NOT in the exclusion set
    working = working.loc[~excl_mask].copy()

    # -------------------------------------------------------------------------
    # Set dtypes and normalize important columns
    # -------------------------------------------------------------------------
    # Convert PIT Date to datetime (coerce errors -> NaT), floor to day
    working['PIT Date']     = pd.to_datetime(working['PIT Date'], errors='coerce').dt.floor('D')
    # FiscalPeriod: numeric (e.g., 2021, 2022, ...)
    working['FiscalPeriod'] = pd.to_numeric(working['FiscalPeriod'], errors='coerce')
    # Value: numeric (float)
    working['Value']        = pd.to_numeric(working['Value'], errors='coerce')

    # Convert key ID / code columns to string to ensure consistency
    for c in ['ID', 'HistCurrency', 'ItemCode', 'Frequency', 'Str_FiscalPrd']:
        if c in working.columns:
            working[c] = working[c].astype(str)

    # -------------------------------------------------------------------------
    # Parse Q/S/T markers from Str_FiscalPrd (like 'Q1Y2023')
    # -------------------------------------------------------------------------
    # Extract quarter number Q1..Q4 from e.g. "Q1Y2023" into QNUM
    working['QNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^Q([1-4])Y', expand=False),
        errors='coerce'
    )
    # Extract semiannual number S1..S2 into SNUM
    working['SNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^S([1-2])Y', expand=False),
        errors='coerce'
    )
    # Extract trimester number T1..T3 into TNUM
    working['TNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^T([1-3])Y', expand=False),
        errors='coerce'
    )

    # -------------------------------------------------------------------------
    # Ensure period columns exist (Q1..Q4, S1..S2, T1..T3, A + their date cols)
    # -------------------------------------------------------------------------
    # Create value columns for Q1..Q4, S1..S2, T1..T3, A if they are missing
    for c in [*(f'Q{i}' for i in range(1, 5)),
              *(f'S{i}' for i in range(1, 3)),
              *(f'T{i}' for i in range(1, 4)),
              'A']:
        if c not in working.columns:
            working[c] = np.nan

    # Create corresponding *_Date columns if missing
    for c in [*(f'Q{i}_Date' for i in range(1, 5)),
              *(f'S{i}_Date' for i in range(1, 3)),
              *(f'T{i}_Date' for i in range(1, 4)),
              'A_Date']:
        if c not in working.columns:
            working[c] = pd.NaT

    # Base key for many of the as-of mappings
    base_keys = ['ID', 'HistCurrency', 'ItemCode', 'FiscalPeriod']

    # -------------------------------------------------------------------------
    # 1) Derive TrueValue from annuals (A/B frequencies)
    # -------------------------------------------------------------------------
    # Mask annual-like rows where Value is present
    mask_annual = working['Frequency'].isin(['A', 'B']) & working['Value'].notna()
    # annual_src: one row per (ID, FiscalPeriod, HistCurrency) with last PIT Date
    annual_src = (
        working.loc[mask_annual,
                    ['ID', 'FiscalPeriod', 'HistCurrency', 'PIT Date', 'Value']]
        .sort_values(['ID', 'FiscalPeriod', 'HistCurrency', 'PIT Date'])
        .drop_duplicates(['ID', 'FiscalPeriod', 'HistCurrency'], keep='last')
        .rename(columns={'Value': 'TrueValue', 'PIT Date': 'TrueValue_Date'})
    )
    # Left-join true annual value back onto working
    working = working.merge(
        annual_src,
        on=['ID', 'FiscalPeriod', 'HistCurrency'],
        how='left'
    )

    # -------------------------------------------------------------------------
    # 2) As-of mapping (same FiscalPeriod) for A/Q/S/T
    # -------------------------------------------------------------------------

    # ----- Annual -----
    # Source rows for annual frequencies A/B
    src_A = working.loc[
        working['Frequency'].isin(['A', 'B']) & working['Value'].notna(),
        base_keys + ['PIT Date', 'Value']
    ].copy()
    # As-of join: for each working row, get most recent annual value by PIT Date
    vA, dA = asof_numpy(working, src_A, by_cols=base_keys)
    working['A'], working['A_Date'] = vA, dA
    # Origin fiscal period of annual value (same as row's FiscalPeriod when present)
    working['A_OriginFP'] = np.where(
        working['A'].notna(), working['FiscalPeriod'], np.nan
    )

    # ----- Quarterly -----
    # Source rows for quarterly frequencies (Q/C) with valid QNUM
    src_Q = working.loc[
        working['Frequency'].isin(['Q', 'C']) & working['QNUM'].notna(),
        base_keys + ['QNUM', 'PIT Date', 'Value']
    ].copy()
    for q in (1, 2, 3, 4):
        # Restrict to a specific quarter q
        rv = src_Q[src_Q['QNUM'] == q].drop(columns=['QNUM'])
        # As-of join for that quarter
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'Q{q}', f'Q{q}_Date'
        working[col], working[dcol] = v, d
        # Origin FP column for that quarter
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        # Fill origin FP only where quarter value is non-null and origin not yet set
        m = working[col].notna() & working[ocol].isna()
        working.loc[m, ocol] = working.loc[m, 'FiscalPeriod']

    # ----- Semiannual -----
    # Source rows for semiannual frequencies (S/F) with valid SNUM
    src_S = working.loc[
        working['Frequency'].isin(['S', 'F']) & working['SNUM'].notna(),
        base_keys + ['SNUM', 'PIT Date', 'Value']
    ].copy()
    for s in (1, 2):
        rv = src_S[src_S['SNUM'] == s].drop(columns=['SNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'S{s}', f'S{s}_Date'
        working[col], working[dcol] = v, d
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        m = working[col].notna() & working[ocol].isna()
        working.loc[m, ocol] = working.loc[m, 'FiscalPeriod']

    # ----- Trimester -----
    # Source rows for trimester frequencies (T/K) with valid TNUM
    src_T = working.loc[
        working['Frequency'].isin(['T', 'K']) & working['TNUM'].notna(),
        base_keys + ['TNUM', 'PIT Date', 'Value']
    ].copy()
    for t in (1, 2, 3):
        rv = src_T[src_T['TNUM'] == t].drop(columns=['TNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'T{t}', f'T{t}_Date'
        working[col], working[dcol] = v, d
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        m = working[col].notna() & working[ocol].isna()
        working.loc[m, ocol] = working.loc[m, 'FiscalPeriod']

    # -------------------------------------------------------------------------
    # 3) Prepare labels & normalize dates (NO prev-year fill, NO forward-fill)
    # -------------------------------------------------------------------------
    # Sort working data consistently for downstream calculations
    working = working.sort_values(['ID', 'HistCurrency', 'FiscalPeriod', 'PIT Date'])

    # List of all period value columns
    value_cols_all  = [f'Q{i}' for i in range(1, 5)] + \
                      [f'S{i}' for i in range(1, 3)] + \
                      [f'T{i}' for i in range(1, 4)] + ['A']
    # Corresponding date columns
    date_cols_all   = [f'{c}_Date' for c in value_cols_all]
    # Corresponding origin FP columns
    origin_cols_all = [f'{c}_OriginFP' for c in value_cols_all]

    # Ensure that all date columns are proper datetimes (floor to day)
    # Note: explicitly no groupby-forward-fill here – only asof-filled values remain
    for c in date_cols_all:
        if c in working.columns:
            working[c] = pd.to_datetime(working[c], errors='coerce').dt.floor('D')

    # -------------------------------------------------------------------------
    # 4) Build full-year candidates from fixed sets (Q1–Q4, S1–S2, T1–T3)
    # -------------------------------------------------------------------------
    def full_year_from_fixed(row, labels, pit, cutoff):
        """
        Fixed full-year from given labels (e.g. Q1..Q4, S1..S2, T1..T3):

          - Requires ALL labels to have:
              value, date, origin_fp.
          - All dates must be within [cutoff, pit].
          - origin_fp = max(origin_fp_i)  (the 'newest year' across components).

        Returns (dt, val, origin_fp) or (NaT, NaN, NaN) if any requirement fails.
        """
        vals, dts, fps = [], [], []
        for lbl in labels:
            v = row.get(lbl, np.nan)
            d = row.get(f'{lbl}_Date', pd.NaT)
            o = row.get(f'{lbl}_OriginFP', np.nan)

            # Require non-null for value, date, origin FP
            if pd.isna(v) or pd.isna(d) or pd.isna(o):
                return pd.NaT, np.nan, np.nan

            # Normalize date and check it lies within [cutoff, pit]
            d = pd.to_datetime(d, errors='coerce')
            if pd.isna(d) or not (cutoff <= d <= pit):
                return pd.NaT, np.nan, np.nan

            vals.append(float(v))
            dts.append(d)
            fps.append(int(o))

        # Sum all component values, pick latest date and max origin FP
        total_val = float(np.nansum(vals))
        latest_dt = max(dts)
        origin_fp = max(fps)
        return latest_dt, total_val, origin_fp

    def pick_annpit_sum_with_origin(row):
        """
        For each row, pick the best annual PIT-based value (AnnPITValue) by:

        1) Building a time window [PIT - 365 days, PIT].
        2) Generating candidate annual values:
           - actual annual A (if within window and with origin FP),
           - Q4 proxy from Q1..Q4,
           - S2 proxy from S1..S2,
           - T3 proxy from T1..T3,
           each with associated priority and origin FP.
        3) Ranking candidates according to:
           - Same-year A (highest),
           - Same-year proxies (Q4/T3/S2),
           - Prior-year A,
           - Prior-year proxies,
           - Otherwise: any remaining candidate by (priority, latest date).
        4) Returning the chosen candidate's value (keeping zeros, but filtering NaNs).
        """
        pit = row['PIT Date']
        if pd.isna(pit):
            return np.nan
        cutoff = pit - timedelta(days=365)  # Now works because timedelta is imported

        # Current row's fiscal period, cast to int if possible
        fp = row.get('FiscalPeriod', np.nan)
        try:
            fp_int = int(fp) if not pd.isna(fp) else None
        except Exception:
            fp_int = None

        # Collect candidate tuples: (label, priority, date, value, origin_fp)
        candidates = []

        # --- Candidate A: actual annual (0 is allowed)
        A_val = row.get('A', np.nan)
        A_dt  = row.get('A_Date', pd.NaT)
        A_ofp = row.get('A_OriginFP', np.nan)
        if pd.notna(A_val) and pd.notna(A_dt) and not pd.isna(A_ofp):
            A_dt = pd.to_datetime(A_dt, errors='coerce')
            if pd.notna(A_dt) and (cutoff <= A_dt <= pit):
                candidates.append(('A', _PERIOD_PRIORITY['A'], A_dt, float(A_val), int(A_ofp)))

        # --- Candidate Q4: sum of Q1..Q4 (0 allowed)
        q4_dt, q4_val, q4_fp = full_year_from_fixed(
            row, [f'Q{i}' for i in range(1, 5)], pit, cutoff
        )
        if pd.notna(q4_val) and pd.notna(q4_dt) and not pd.isna(q4_fp):
            candidates.append(('Q4', _PERIOD_PRIORITY['Q4'], q4_dt, float(q4_val), int(q4_fp)))

        # --- Candidate S2: sum of S1..S2 (0 allowed)
        s2_dt, s2_val, s2_fp = full_year_from_fixed(
            row, [f'S{i}' for i in range(1, 3)], pit, cutoff
        )
        if pd.notna(s2_val) and pd.notna(s2_dt) and not pd.isna(s2_fp):
            candidates.append(('S2', _PERIOD_PRIORITY['S2'], s2_dt, float(s2_val), int(s2_fp)))

        # --- Candidate T3: sum of T1..T3 (0 allowed)
        t3_dt, t3_val, t3_fp = full_year_from_fixed(
            row, [f'T{i}' for i in range(1, 4)], pit, cutoff
        )
        if pd.notna(t3_val) and pd.notna(t3_dt) and not pd.isna(t3_fp):
            candidates.append(('T3', _PERIOD_PRIORITY['T3'], t3_dt, float(t3_val), int(t3_fp)))

        # If no candidates, return NaN
        if not candidates:
            return np.nan

        # Filter out NaN-valued candidates (keep 0-valued ones)
        def valid(seq):
            return [c for c in seq if not np.isnan(c[3])]

        # Relationship between row's fiscal period and candidate's origin FP:
        # same / prior / other / unknown
        def rel(c):
            _, _, _, _, ofp = c
            if fp_int is None or ofp is None:
                return 'unknown'
            if ofp == fp_int:
                return 'same'
            if ofp == fp_int - 1:
                return 'prior'
            return 'other'

        # 1) Same-year actual annual A (max by date)
        same_A = valid(c for c in candidates if c[0] == 'A' and rel(c) == 'same')
        if same_A:
            best = max(same_A, key=lambda x: x[2])
            return best[3]

        # 2) Same-year proxies (Q4/T3/S2), max by (priority, date)
        same_proxies = valid(
            c for c in candidates
            if c[0] in ('Q4', 'S2', 'T3') and rel(c) == 'same'
        )
        if same_proxies:
            best = max(same_proxies, key=lambda x: (x[1], x[2]))
            return best[3]

        # 3) Prior-year actual annual A
        prior_A = valid(c for c in candidates if c[0] == 'A' and rel(c) == 'prior')
        if prior_A:
            best = max(prior_A, key=lambda x: x[2])
            return best[3]

        # 4) Prior-year proxies (Q4/S2/T3) if no prior A
        prior_proxies = valid(
            c for c in candidates
            if c[0] in ('Q4', 'S2', 'T3') and rel(c) == 'prior'
        )
        if prior_proxies:
            best = max(prior_proxies, key=lambda x: (x[1], x[2]))
            return best[3]

        # 5) Fallback: any candidate (other/unknown) by (priority, date)
        others = valid(candidates)
        if others:
            best = max(others, key=lambda x: (x[1], x[2]))
            return best[3]

        # Final fallback: 0.0 (should rarely be reached)
        return 0.0

    # Apply the selection function row-wise to produce AnnPITValue
    working['AnnPITValue'] = working.apply(pick_annpit_sum_with_origin, axis=1)

    # -------------------------------------------------------------------------
    # 5) QC: Future-date check + PRE-DROP stats
    # -------------------------------------------------------------------------
    # Columns whose dates should not exceed PIT Date
    date_cols = [
        'A_Date',
        'Q1_Date', 'Q2_Date', 'Q3_Date', 'Q4_Date',
        'S1_Date', 'S2_Date',
        'T1_Date', 'T2_Date', 'T3_Date'
    ]
    # Restrict to ones actually present
    present = [c for c in date_cols if c in working.columns]

    viol_counts = {}  # per-label violation counts
    # Mask for rows with any future-dated period
    any_mask = pd.Series(False, index=working.index)

    for c in present:
        # A violation is when period date > PIT Date (both need to be non-null)
        m = (
            working[c].notna() &
            working['PIT Date'].notna() &
            (pd.to_datetime(working[c], errors='coerce') > working['PIT Date'])
        )
        viol_counts[c] = int(m.sum())
        any_mask |= m  # accumulate violations across columns

    total_future_viol = int(any_mask.sum())
    print("\n=== Future-date check (period dates > PIT Date) ===")
    print("Per-label violations:", viol_counts)
    print(f"Rows with ANY future-dated period value: {total_future_viol}")
    # Flagrows with at least one future-date error
    working['HasFutureDateError'] = any_mask

    # -------------------------------------------------------------------------
    # 6) AnnPITValue_Pct + quality drop
    # -------------------------------------------------------------------------
    # Compute AnnPITValue as % of TrueValue (only when TrueValue != 0)
    working['AnnPITValue_Pct'] = np.where(
        working['AnnPITValue'].notna() &
        working['TrueValue'].notna() &
        (working['TrueValue'] != 0),
        (working['AnnPITValue'] / working['TrueValue']) * 100,
        np.nan
    )

    # Stats before dropping low-quality rows
    pre_stats = summarize_pct(working['AnnPITValue_Pct'])
    print("\n=== AnnPITValue_Pct summary (finite only) — BEFORE quality drop ===")
    for k, v in pre_stats.items():
        print(f"{k:>20}: {v}")

    # Build masks for dropping:
    pct = working['AnnPITValue_Pct']
    is_inf = np.isinf(pct)  # infinite percentages
    is_finite = np.isfinite(pct)
    # Out-of-range if % > 200 or % < 50 (but finite)
    out_of_range = is_finite & ((pct > 200) | (pct < 50))
    # Rows to drop: infinite or out-of-range values
    to_drop_quality = is_inf | out_of_range

    dropped_quality_rows = int(to_drop_quality.sum())
    print(f"\nRows to drop due to AnnPITValue_Pct (±inf or >200 or <50): {dropped_quality_rows:,}")

    # Keep only rows that passed the quality filter
    working = working.loc[~to_drop_quality].copy()

    # Stats after dropping
    post_stats = summarize_pct(working['AnnPITValue_Pct'])
    print("\n=== AnnPITValue_Pct summary — AFTER quality drop ===")
    if post_stats:
        for k, v in post_stats.items():
            print(f"{k:>20}: {v}")
    else:
        print("No finite values remain after the quality drop.")

    # -------------------------------------------------------------------------
    # 7) Final columns & save
    # -------------------------------------------------------------------------
    # Base descriptive columns to keep (if present)
    base_cols = [
        'ID', 'CompanyName', 'ImplCountry', 'CurrentCurrency', 'HistCurrency',
        'PIT Date', 'Frequency', 'UpdateCode', 'FiscalPeriod', 'FYE Month',
        'ItemCode', 'Value', 'Str_FiscalPrd'
    ]

    # Build ordered list of period date/value columns
    freq_cols = []
    for i in range(1, 5):
        freq_cols += [f'Q{i}_Date', f'Q{i}']
    for i in range(1, 3):
        freq_cols += [f'S{i}_Date', f'S{i}']
    for i in range(1, 4):
        freq_cols += [f'T{i}_Date', f'T{i}']
    freq_cols += ['A_Date', 'A']

    # Final set of columns to keep in output
    keep_cols = (
        [c for c in base_cols if c in working.columns] +
        ['TrueValue', 'AnnPITValue', 'AnnPITValue_Pct', 'HasFutureDateError'] +
        [c for c in freq_cols if c in working.columns]
    )

    # Helper columns to drop before export
    drop_cols = ['QNUM', 'SNUM', 'TNUM', 'TrueValue_Date']
    # Also drop all *_OriginFP columns
    drop_cols += [c for c in working.columns if c.endswith('_OriginFP')]
    working.drop(columns=[c for c in drop_cols if c in working.columns],
                 inplace=True, errors='ignore')

    # Reorder and restrict columns to the final layout
    mixed_processed = working.reindex(columns=keep_cols)

    # Sanity checks: necessary globals must exist
    assert 'Temp_file_path_DP' in globals(), "Temp_file_path_DP not found."
    assert 'base_output_filename' in globals(), "base_output_filename not found (set in Cell 0)."

    # Build full output path and save pipe-delimited file
    out_full = os.path.join(Temp_file_path_DP, f"{base_output_filename}.txt")
    mixed_processed.to_csv(out_full, sep='|', index=False)
    print("\nSaved full:", out_full)

    # Create a subset for lighter inspection
    subset_cols = ["ID", "PIT Date", "CompanyName", "HistCurrency",
                   "FiscalPeriod", "AnnPITValue"]
    # Only keep subset columns that actually exist
    subset_cols_existing = [col for col in subset_cols if col in mixed_processed.columns]
    subset_df = mixed_processed[subset_cols_existing].copy()
    out_subset = os.path.join(Temp_file_path_DP,
                              f"{base_output_filename}_subset.txt")
    subset_df.to_csv(out_subset, sep='|', index=False)
    print("Saved subset:", out_subset)
    # Explicitly delete subset_df to free memory
    del subset_df

    # -------------------------------------------------------------------------
    # 8) Row-accounting overview
    # -------------------------------------------------------------------------
    output_rows = len(mixed_processed)
    print("\n=== Row Accounting ===")
    print(f"Input rows:                     {input_rows:,}")
    print(f"Excludedby Frequency (E/L/R/U): {excluded_rows:,}")
    print(f"Dropped by quality (Pct rules): {dropped_quality_rows:,}")
    print(f"Output rows (final):            {output_rows:,}")
    # Sum up excluded + dropped + remaining and check against original count
    check_total = excluded_rows + dropped_quality_rows + output_rows
    print(f"Check: excluded + dropped + output = {check_total:,}")
    if check_total == input_rows:
        print("Row counts reconcile exactly.")
    else:
        print(f"Mismatch of {input_rows - check_total:+,} rows. "
              "Investigate upstream filtering or unexpected drops.")

    # Trigger garbage collection as a final cleanup step
    gc.collect()

else:
    # If the main input dataset is not available, skip all processing
    print("mixed_encoded not found or None; skipping.")

Input dataset contains 1,929,643 rows before processing.


=== Future-date check (period dates > PIT Date) ===
Per-label violations: {'A_Date': 0, 'Q1_Date': 0, 'Q2_Date': 0, 'Q3_Date': 0, 'Q4_Date': 0, 'S1_Date': 0, 'S2_Date': 0, 'T1_Date': 0, 'T2_Date': 0, 'T3_Date': 0}
Rows with ANY future-dated period value: 0

=== AnnPITValue_Pct summary (finite only) — BEFORE quality drop ===
         finite_rows: 1890306
                mean: 523942.5563862871
              median: 100.0
winsorized_mean_1pct: 166.41964216695058
                 p10: 50.0
                 p20: 100.0
                 p30: 100.0
                 p40: 100.0
                 p50: 100.0
                 p60: 100.0
                 p70: 101.50428163399192
                 p80: 125.85865654948871
                 p90: 214.822120682574

Rows to drop due to AnnPITValue_Pct (±inf or >200 or <50): 399,754

=== AnnPITValue_Pct summary — AFTER quality drop ===
         finite_rows: 1490552
                mean: 107.2684367996

### Mixed 11

#### Set Index

In [98]:
# =====================================================================================
# SUMMARY
# =====================================================================================
# This code selects which Mixed_* dataset should be processed by choosing an index
# (e.g., Mixed_1, Mixed_2, ...). It then:
#
#   1. Ensures that a dictionary `mixed_vars` exists, mapping keys like "Mixed_1"
#      to item names.
#   2. Builds the key corresponding to the selected index and retrieves the
#      associated item name (`target_item_name`).
#   3. Prints which Mixed_* item was selected.
#   4. Constructs paths and filenames based on global variables and the selected item.
#   5. Ensures that the output directory exists by creating it if necessary.
#
# The goal is to centralize selection of a single Mixed_* dataset and prepare paths
# for downstream processing.


# === Select which Mixed_* item to run ===
mixed_index = 11  # Change this to process another dataset (e.g., 10)

# Validate that the dictionary of mixed item names exists
assert 'mixed_vars' in globals(), "mixed_vars dict not found in globals()."

# Build the key (e.g., "Mixed_1") and retrieve the associated item name
item_key = f"Mixed_{mixed_index}"
target_item_name = mixed_vars.get(item_key)
assert target_item_name, f"{item_key} not found in mixed_vars."

# Inform which item was selected
print(f"Selected: {item_key}  ->  ItemName: '{target_item_name}'")

# === Paths (reusing globals) ===
assert 'Temp_file_path_DP' in globals(), "Temp_file_path_DP not found."

# Construct the name of the input file for the selected item
file_name = f"work_subset_{target_item_name}.txt"
file_path = os.path.join(Temp_file_path_DP, file_name)

# Construct the base name for output files (will later be suffixed)
base_output_filename = f"Mixed_{target_item_name}_complete"

# Ensure the output directory exists; create it (including parent dirs) if needed
Path(Temp_file_path_DP).mkdir(parents=True, exist_ok=True)


Selected: Mixed_11  ->  ItemName: 'Selling_General__Administrative_Expenses'


#### Import relevant data



In [99]:
# =====================================================================================
# SUMMARY
# =====================================================================================
# This block:
#   1. Announces the import of a full dataset for the given `target_item_name`.
#   2. Checks whether the file at `file_path` exists.
#   3. If it exists, calls `import_file_to_dataframe(file_path)` to load the data
#      into `mixed_raw`.
#   4. If the loaded DataFrame is non-empty, prints a success message including
#      the number of rows and shows the first few rows (via display or fallback
#      to text printing).
#   5. If the load fails or returns an empty DataFrame, prints a warning and
#      creates an empty DataFrame.
#   6. If the file does not exist, prints an error message and sets `mixed_raw`
#      to an empty DataFrame.
#   7. Finally, it runs `gc.collect()` to trigger garbage collection and free
#      memory.
#
# Note: All previous emoji symbols in the print statements have been removed.

# Inform the user that we are starting the import for this item
print(f"\nImporting full dataset for Item: '{target_item_name}' ...")

# Check if the file path exists on disk
if os.path.exists(file_path):
    # If the file exists, attempt to import it into a DataFrame
    mixed_raw = import_file_to_dataframe(file_path)

    # If the DataFrame is successfully loaded and not empty
    if mixed_raw is not None and not mixed_raw.empty:
        # Print a success message including row count with thousands separator
        print(f"Full dataset loaded successfully: {len(mixed_raw):,} rows total.")

        try:
            # Try to display the first few rows (Jupyter / IPython display)
            display(mixed_raw.head())
        except Exception:
            # If display is not available, fall back to printing as plain text
            print(mixed_raw.head().to_string(index=False))
    else:
        # If DataFrame is None or empty, warn and create an empty DataFrame
        print("Dataset appears empty or could not be loaded.")
        mixed_raw = pd.DataFrame()
else:
    # If the file does not exist, report an error and set mixed_raw to empty
    print(f"File not found: {file_path}")
    mixed_raw = pd.DataFrame()

# Run garbage collection to free up memory after the load attempt
gc.collect()



Importing full dataset for Item: 'Selling_General__Administrative_Expenses' ...
Full dataset loaded successfully: 3,078,865 rows total.


Unnamed: 0,ID,CompanyName,ImplCountry,CurrentCurrency,HistCurrency,PIT Date,Frequency,UpdateCode,FiscalPeriod,FYE Month,ItemCode,Value
0,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1992,December,1101,171.617279
1,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1993,December,1101,189.549299
2,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1994,December,1101,129.874835
3,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1996-05-03,A,3,1995,December,1101,137.582823
4,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1998-07-03,A,3,1996,December,1101,98.561032


0

#### Encode Frequency Code (Check of output required!)

In [100]:
# =====================================================================================
# SUMMARY
# =====================================================================================
# This snippet provides:
#
# 1. A helper function `last2` that returns the last two digits of a number as a
#    zero-padded string (for building YY strings).
#
# 2. A function `add_str_fiscalprd(df)` which:
#    - Works on a copy of an input DataFrame containing financial periods.
#    - Normalizes the 'Frequency' (upper-case, no missing).
#    - Stores the original 'FiscalPeriod' and converts it to numeric.
#    - Creates a string representation 'Str_FiscalPrd' depending on the frequency:
#         - Q/C/E/R: quarter-based ("QnYyy")
#         - A/B: annual ("Yyy")
#         - F/S: semiannual ("SnYyy")
#         - K/T/L/U: trimester-like ("TnYyy")
#    - Derives an implied full-year integer 'ImplFiscPer_Calculated' from the
#      two-digit year (80–99 => 19xx, else 20xx).
#    - For annual rows (A/B), checks discrepancies between original
#      'FiscalPeriod' and implied full-year; prints a small preview & total count.
#    - Overwrites 'FiscalPeriod' with 'ImplFiscPer_Calculated' and drops helper
#      columns.
#
# 3. A small driver block that:
#    - Checks that `mixed_raw` exists and is non-empty.
#    - Applies `add_str_fiscalprd` to produce `mixed_encoded`.
#    - Displays a head preview or prints a message and sets `mixed_encoded = None`
#      if input is missing/empty.

def last2(n):
    """Return last two digits as zero-padded string, or None if NaN."""
    # If input is NaN (or pandas-style missing), return None (no digits)
    if pd.isna(n):
        return None
    # Convert to int, format as 4-digit zero-padded string, take last 2 chars
    # Example: n=2023 -> "2023"[-2:] -> "23"
    return f"{int(n):04d}"[-2:]


def add_str_fiscalprd(df):
    """
    Creates 'Str_FiscalPrd' and overwrites 'FiscalPeriod' with an implied full year.
    Also prints discrepancies for Annual (A,B) rows between original and calculated year.

    The mapping logic:
      - Quarterly (C, Q, E, R):   fp encodes a quarter index -> "Q{1-4}Y{yy}"
      - Annual   (A, B):         fp is the year itself      -> "Y{yy}"
      - Semiannual (F, S):       fp encodes half-year       -> "S{1-2}Y{yy}"
      - Trimester-like (K, T,
         L, U):                  fp encodes trimester       -> "T{1-3}Y{yy}"
    Then we re-infer the full year from the YY part with 19xx/20xx rule.
    """
    # Work on a copy to avoid mutating the original input DataFrame
    df = df.copy()

    # Normalize frequency codes:
    # - uppercase for consistency
    # - fill missing values with empty string
    df["Frequency"] = df["Frequency"].str.upper().fillna("")

    # Store original FiscalPeriod in a separate column for later comparison
    df['Original_FiscalPeriod'] = df['FiscalPeriod']

    # Convert FiscalPeriod to numeric, coerce errors to NaN
    fp = pd.to_numeric(df["FiscalPeriod"], errors="coerce")

    # Masks for different frequency groups
    m_quarter = df["Frequency"].isin(["C", "Q", "E", "R"])  # quarter-based
    m_AB      = df["Frequency"].isin(["A", "B"])            # annual
    m_FS      = df["Frequency"].isin(["F", "S"])            # semiannual
    m_KTLU    = df["Frequency"].isin(["K", "T", "L", "U"])  # trimester-like

    # Initialize Str_FiscalPrd as NaN; we will fill per frequency group
    df["Str_FiscalPrd"] = np.nan

    # -------------------------
    # Quarter-based encoding
    # -------------------------
    # Quarter number: (fp % 4) + 1 => yields 1..4 (if fp is integer-based encoding)
    q_part = ((fp % 4) + 1).where(m_quarter)
    # Year part for quarter: integer division by 4, then take last two digits
    q_year = (fp // 4).where(m_quarter).apply(last2)
    # Assign "Q{quarter}Y{yy}" for quarter frequencies
    df.loc[m_quarter, "Str_FiscalPrd"] = (
        "Q" +
        q_part.astype("Int64").astype(str) +
        "Y" +
        q_year.fillna('')
    )

    # -------------------------
    # Annual encoding (A, B)
    # -------------------------
    # Year part is fp itself for A/B; take last two digits via last2
    ab_year = fp.where(m_AB).apply(last2)
    # Assign "Y{yy}" for annual frequencies
    df.loc[m_AB, "Str_FiscalPrd"] = "Y" + ab_year.fillna('')

    # -------------------------
    # Semiannual encoding (F, S)
    # -------------------------
    # Semester number: (fp % 2) + 1 => 1 or 2
    fs_sem  = ((fp % 2) + 1).where(m_FS)
    # Year part: fp // 2, then last two digits
    fs_year = (fp // 2).where(m_FS).apply(last2)
    # Assign "S{sem}Y{yy}" for F/S
    df.loc[m_FS, "Str_FiscalPrd"] = (
        "S" +
        fs_sem.astype("Int64").astype(str) +
        "Y" +
        fs_year.fillna('')
    )

    # -------------------------
    # Trimester-like encoding (K, T, L, U)
    # -------------------------
    # Term number: (fp % 3) + 1 => 1..3
    t_term  = ((fp % 3) + 1).where(m_KTLU)
    # Year part: fp // 3, then last two digits
    t_year  = (fp // 3).where(m_KTLU).apply(last2)
    # Assign "T{term}Y{yy}" for K/T/L/U frequencies
    df.loc[m_KTLU, "Str_FiscalPrd"] = (
        "T" +
        t_term.astype("Int64").astype(str) +
        "Y" +
        t_year.fillna('')
    )

    # ---------------------------------------------------------------------
    # Implied full year from Str_FiscalPrd (19xx / 20xx reconstruction)
    # ---------------------------------------------------------------------
    # Extract two-digit year from "Yyy" pattern, e.g., "Q1Y23" -> "23"
    year_part = df['Str_FiscalPrd'].str.extract(r'Y(\d{2})', expand=False)
    # Convert that to numeric (NaN if not parseable)
    year_numeric = pd.to_numeric(year_part, errors='coerce')

    # Map two-digit year to full year:
    #  - 80–99 => 19xx
    #  - else  => 20xx
    df['ImplFiscPer_Calculated'] = year_numeric.apply(
        lambda x: int(f"19{int(x):02d}") if pd.notna(x) and x >= 80
        else (int(f"20{int(x):02d}") if pd.notna(x) else np.nan)
    )

    # ---------------------------------------------------------------------
    # Discrepancies check for Annual frequencies (A, B)
    # ---------------------------------------------------------------------
    # Work on a subset of annual frequencies only
    annual_rows_for_check = df[m_AB].copy()

    # Build boolean mask where implied full year does NOT match the original
    discrepancy_mask_annual = ~(
        # Either they match numerically...
        (annual_rows_for_check['ImplFiscPer_Calculated'] ==
         pd.to_numeric(annual_rows_for_check['Original_FiscalPeriod'], errors='coerce'))
        |
        # ...or both are NaN
        (annual_rows_for_check['ImplFiscPer_Calculated'].isna() &
         pd.to_numeric(annual_rows_for_check['Original_FiscalPeriod'], errors='coerce').isna())
    )

    # Subset rows with discrepancies
    discrepancy_rows = annual_rows_for_check[discrepancy_mask_annual].copy()

    # If we have any, show a short sample and the total count
    if not discrepancy_rows.empty:
        print("\nDiscrepancies found between original FiscalPeriod and calculated ImplFiscPer for Annual (A, B) frequencies:")
        display(
            discrepancy_rows[
                ['ID', 'Frequency', 'Original_FiscalPeriod', 'Str_FiscalPrd', 'ImplFiscPer_Calculated']
            ].head()
        )
        print(f"Total discrepancies found for Annual frequencies: {len(discrepancy_rows)}")
    else:
        print("\nNo discrepancies found between original FiscalPeriod and calculated ImplFiscPer for Annual (A, B) frequencies.")

    # ---------------------------------------------------------------------
    # Overwrite FiscalPeriod and drop temporary helper columns
    # ---------------------------------------------------------------------
    # Replace 'FiscalPeriod' with the implied full-year value we just calculated
    df['FiscalPeriod'] = df['ImplFiscPer_Calculated']
    # Drop helper columns not needed downstream
    df.drop(columns=['Original_FiscalPeriod', 'ImplFiscPer_Calculated'], inplace=True)

    # Return the modified DataFrame
    return df


# =============================================================================
# Driver: apply encoding to mixed_raw if present and non-empty
# =============================================================================
if 'mixed_raw' in globals() and mixed_raw is not None and not mixed_raw.empty:
    # Inform which item we are encoding for
    print(f"Applying encoding to Mixed dataset for '{target_item_name}' ...")
    # Apply fiscal period encoding
    mixed_encoded = add_str_fiscalprd(mixed_raw)
    # Show a preview of the encoded data
    display(mixed_encoded.head())
else:
    # If mixed_raw is missing or empty, we skip and mark mixed_encoded as None
    print("mixed_raw not found or empty. Cannot perform encoding.")
    mixed_encoded = None


Applying encoding to Mixed dataset for 'Selling_General__Administrative_Expenses' ...


  df.loc[m_quarter, "Str_FiscalPrd"] = (



No discrepancies found between original FiscalPeriod and calculated ImplFiscPer for Annual (A, B) frequencies.


Unnamed: 0,ID,CompanyName,ImplCountry,CurrentCurrency,HistCurrency,PIT Date,Frequency,UpdateCode,FiscalPeriod,FYE Month,ItemCode,Value,Str_FiscalPrd
0,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1992,December,1101,171.617279,Y92
1,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1993,December,1101,189.549299,Y93
2,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1995-12-29,A,3,1994,December,1101,129.874835,Y94
3,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1996-05-03,A,3,1995,December,1101,137.582823,Y95
4,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,Argentina,Ars,Ars,1998-07-03,A,3,1996,December,1101,98.561032,Y96


#### Annualize data with most recent information (Check of output required!)

In [101]:
# @title
# =====================================================================================
# SUMMARY
# =====================================================================================
# This script takes an input DataFrame `mixed_encoded` (if present in the global scope)
# that contains financial time-series data (per company, item, currency, fiscal period,
# and PIT Date). It then:
#
# 1. Cleans and standardizes key columns (dates, numeric types, string IDs).
# 2. Excludes rows with certain frequencies (E/L/R/U).
# 3. Parses fiscal period strings into quarter/semester/trimester indicators (QNUM/SNUM/TNUM).
# 4. Uses a custom, vectorized "as-of" join (`asof_numpy`) to attach the most recent
#    annual, quarterly, semiannual, and trimester values for each (ID, HistCurrency,
#    ItemCode, FiscalPeriod) up to each row’s PIT Date.
# 5. Builds "full-year" candidate values from:
#       - actual annuals (A),
#       - sum of Q1..Q4 (Q4 proxy),
#       - sum of S1..S2 (S2 proxy),
#       - sum of T1..T3 (T3 proxy),
#    and selects the best candidate based on priorities and relationship to the row’s
#    fiscal period (same-year vs prior-year).
# 6. Computes an annual PIT-based metric `AnnPITValue` and compares it to the “true”
#    annual value (`TrueValue`) to derive a percentage `AnnPITValue_Pct` for QC.
# 7. Performs quality checks:
#       - Ensures no period-date is after the PIT Date.
#       - Drops rows whose `AnnPITValue_Pct` is outside the range [50, 200] or infinite.
# 8. Keeps a curated set of columns, drops helper columns, and saves:
#       - a full output file
#       - a subset file with key columns for quick inspection.
# 9. Prints row-accounting stats and frees some memory.
#
# If `mixed_encoded` is not defined or is None, it simply prints a message and exits.

import pandas as pd
import numpy as np
import os
import gc
from datetime import timedelta            # <--- Added to fix NameError
from scipy.stats.mstats import winsorize  # <--- Added to fix NameError

# Enable pandas "copy-on-write" behavior to reduce unintended chained assignment effects
pd.options.mode.copy_on_write = True

# ---------- Helper: fast as-of (right.PIT <= left.PIT) ----------

def _key(fr, cols):
    """
    Helper function to build a string key from multiple columns.
    For each row, join the values of 'cols' with '||'.
    """
    return fr[cols].astype(str).agg('||'.join, axis=1)


def asof_numpy(left_df: pd.DataFrame, right_df: pd.DataFrame, by_cols: list[str]):
    """
    For each row in left_df, find the latest (as-of) Value from right_df
    with matching by_cols and right_df['PIT Date'] <= left_df['PIT Date'].

    This is a manual, NumPy-based implementation of an "as-of merge" grouped by `by_cols`.
    It:
    - filters out rows with missing required fields,
    - normalizes PIT Date to daily precision,
    - groups right_df by composite key of `by_cols`,
    - for each left row, binary-searches into the matching right group by PIT Date,
      picking the last date <= the left PIT Date,
    - returns two aligned arrays: (values, dates) for the left_df rows.
    """
    # Initialize outputs with NaNs and NaT for all left_df rows
    out_vals  = np.full(len(left_df), np.nan, dtype='float64')
    out_dates = np.full(len(left_df), 'NaT', dtype='datetime64[ns]')

    # Required columns on each side
    left_req  = by_cols + ['PIT Date']
    right_req = by_cols + ['PIT Date', 'Value']

    # Mask: rows that have all required fields non-null
    lmask = left_df[left_req].notna().all(axis=1)
    rmask = right_df[right_req].notna().all(axis=1)

    # If either side has no valid rows, return the default empty outputs
    if not lmask.any() or not rmask.any():
        return out_vals, out_dates

    # Work on filtered copies only (avoid side effects)
    l = left_df.loc[lmask, left_req].copy()
    r = right_df.loc[rmask, right_req].copy()

    # Normalize PIT Date columns to datetime at day precision
    l['PIT Date'] = pd.to_datetime(l['PIT Date'], errors='coerce').dt.floor('D')
    r['PIT Date'] = pd.to_datetime(r['PIT Date'], errors='coerce').dt.floor('D')

    # Build composite keys for group-level match (based on by_cols)
    l['__k'] = _key(l, by_cols)
    r['__k'] = _key(r, by_cols)

    # Sort right side by key and date to enable binary search per key
    r = r.sort_values(['__k', 'PIT Date']).reset_index(drop=True)

    # Extract NumPy arrays for right side
    rk   = r['__k'].to_numpy()
    rdt  = r['PIT Date'].to_numpy()
    rval = r['Value'].to_numpy()

    # Find unique keys and first index for each key in the sorted right side
    uniq, first = np.unique(rk, return_index=True)

    # Build a dict: key -> (dates array, values array) slice
    slices = {}
    for i, k in enumerate(uniq):
        s = first[i]  # start of this key
        e = first[i + 1] if i + 1 < len(first) else len(r)  # end of this key
        slices[k] = (rdt[s:e], rval[s:e])

    # Left side indices and arrays
    l_idx = l.index.to_numpy()
    lk    = l['__k'].to_numpy()
    ldt   = l['PIT Date'].to_numpy()

    # Sort left side by key (stable mergesort to preserve original row order within key)
    order = np.argsort(lk, kind='mergesort')
    sk, sd, sp = lk[order], ldt[order], l_idx[order]

    # i iterates over the sorted left rows
    i = 0
    n = len(sk)
    while i < n:
        k = sk[i]  # current composite key
        j = i + 1
        # Find contiguous block [i:j) for this key
        while j < n and sk[j] == k:
            j += 1

        # Dates and original positions of this key’s left rows
        block_dates = sd[i:j]
        block_pos   = sp[i:j]

        # If we have matching right-hand slices for this key, do the as-of search
        if k in slices:
            r_dates, r_vals = slices[k]
            # searchsorted(..., 'right') - 1 gives index of last r_date <= block_date
            pos   = np.searchsorted(r_dates, block_dates, side='right') - 1
            valid = pos >= 0  # positions where such a date exists
            if np.any(valid):
                # Fill outputs for left rows where we found a valid match
                out_vals[block_pos[valid]]  = r_vals[pos[valid]]
                out_dates[block_pos[valid]] = r_dates[pos[valid]]
        # Move to next key block
        i = j

    return out_vals, out_dates


# ---------- Small helpers ----------

def pctile(s, q):
    """
    Safe percentile calculation: returns quantile q of Series `s`,
    or NaN if quantile fails.
    """
    try:
        return s.quantile(q, interpolation='linear')
    except Exception:
        return np.nan


def summarize_pct(series: pd.Series):
    """
    Computes a summary of finite values of a series:
      - count
      - mean, median
      - 1% winsorized mean
      - selected deciles (p10, p20, ..., p90)
    Infinite values are treated as NaN and removed.
    If no finite values remain, returns an empty dict.
    """
    # Replace +/-inf with NaN and drop all NaNs
    s = series.replace([np.inf, -np.inf], np.nan).dropna()
    if s.empty:
        return {}
    
    # FIX APPLIED HERE: .to_numpy().copy() ensures the array is writable for winsorize
    w_mean = winsorize(s.to_numpy().copy(), limits=[0.01, 0.01]).mean()
    
    return {
        "finite_rows": len(s),
        "mean": s.mean(),
        "median": s.median(),
        "winsorized_mean_1pct": w_mean,
        "p10": pctile(s, 0.10), "p20": pctile(s, 0.20), "p30": pctile(s, 0.30),
        "p40": pctile(s, 0.40), "p50": pctile(s, 0.50), "p60": pctile(s, 0.60),
        "p70": pctile(s, 0.70), "p80": pctile(s, 0.80), "p90": pctile(s, 0.90)
    }


# ---------- Priority for full-year candidates ----------

# Fixed priority mapping for full-year candidates:
#   'A'  : actual annual value
#   'Q4' : annual proxy from four quarters
#   'T3' : annual proxy from three trimesters
#   'S2' : annual proxy from two semesters
_PERIOD_PRIORITY = {
    'A': 100,  # highest priority: actual annual
    'Q4': 90,  # then Q1+Q2+Q3+Q4
    'T3': 80,  # then T1+T2+T3
    'S2': 70,  # then S1+S2
}

# ============================ MAIN ============================

# Only run the main logic if a global DataFrame `mixed_encoded` exists and is not None
if 'mixed_encoded' in globals() and mixed_encoded is not None:
    # Count initial input rows
    input_rows = len(mixed_encoded)
    print(f"Input dataset contains {input_rows:,} rows before processing.\n")

    # Work on a copy of the input dataset
    working = mixed_encoded.copy()

    # -------------------------------------------------------------------------
    # Exclude certain frequencies (E/L/R/U)
    # -------------------------------------------------------------------------
    # Create mask of rows whose Frequency is one of E, L, R, U (case-insensitive)
    excl_mask = working['Frequency'].astype(str).str.upper().isin(['E', 'L', 'R', 'U'])
    # Count how many rows will be excluded
    excluded_rows = int(excl_mask.sum())
    # Keep only rows that are NOT in the exclusion set
    working = working.loc[~excl_mask].copy()

    # -------------------------------------------------------------------------
    # Set dtypes and normalize important columns
    # -------------------------------------------------------------------------
    # Convert PIT Date to datetime (coerce errors -> NaT), floor to day
    working['PIT Date']     = pd.to_datetime(working['PIT Date'], errors='coerce').dt.floor('D')
    # FiscalPeriod: numeric (e.g., 2021, 2022, ...)
    working['FiscalPeriod'] = pd.to_numeric(working['FiscalPeriod'], errors='coerce')
    # Value: numeric (float)
    working['Value']        = pd.to_numeric(working['Value'], errors='coerce')

    # Convert key ID / code columns to string to ensure consistency
    for c in ['ID', 'HistCurrency', 'ItemCode', 'Frequency', 'Str_FiscalPrd']:
        if c in working.columns:
            working[c] = working[c].astype(str)

    # -------------------------------------------------------------------------
    # Parse Q/S/T markers from Str_FiscalPrd (like 'Q1Y2023')
    # -------------------------------------------------------------------------
    # Extract quarter number Q1..Q4 from e.g. "Q1Y2023" into QNUM
    working['QNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^Q([1-4])Y', expand=False),
        errors='coerce'
    )
    # Extract semiannual number S1..S2 into SNUM
    working['SNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^S([1-2])Y', expand=False),
        errors='coerce'
    )
    # Extract trimester number T1..T3 into TNUM
    working['TNUM'] = pd.to_numeric(
        working['Str_FiscalPrd'].str.extract(r'^T([1-3])Y', expand=False),
        errors='coerce'
    )

    # -------------------------------------------------------------------------
    # Ensure period columns exist (Q1..Q4, S1..S2, T1..T3, A + their date cols)
    # -------------------------------------------------------------------------
    # Create value columns for Q1..Q4, S1..S2, T1..T3, A if they are missing
    for c in [*(f'Q{i}' for i in range(1, 5)),
              *(f'S{i}' for i in range(1, 3)),
              *(f'T{i}' for i in range(1, 4)),
              'A']:
        if c not in working.columns:
            working[c] = np.nan

    # Create corresponding *_Date columns if missing
    for c in [*(f'Q{i}_Date' for i in range(1, 5)),
              *(f'S{i}_Date' for i in range(1, 3)),
              *(f'T{i}_Date' for i in range(1, 4)),
              'A_Date']:
        if c not in working.columns:
            working[c] = pd.NaT

    # Base key for many of the as-of mappings
    base_keys = ['ID', 'HistCurrency', 'ItemCode', 'FiscalPeriod']

    # -------------------------------------------------------------------------
    # 1) Derive TrueValue from annuals (A/B frequencies)
    # -------------------------------------------------------------------------
    # Mask annual-like rows where Value is present
    mask_annual = working['Frequency'].isin(['A', 'B']) & working['Value'].notna()
    # annual_src: one row per (ID, FiscalPeriod, HistCurrency) with last PIT Date
    annual_src = (
        working.loc[mask_annual,
                    ['ID', 'FiscalPeriod', 'HistCurrency', 'PIT Date', 'Value']]
        .sort_values(['ID', 'FiscalPeriod', 'HistCurrency', 'PIT Date'])
        .drop_duplicates(['ID', 'FiscalPeriod', 'HistCurrency'], keep='last')
        .rename(columns={'Value': 'TrueValue', 'PIT Date': 'TrueValue_Date'})
    )
    # Left-join true annual value back onto working
    working = working.merge(
        annual_src,
        on=['ID', 'FiscalPeriod', 'HistCurrency'],
        how='left'
    )

    # -------------------------------------------------------------------------
    # 2) As-of mapping (same FiscalPeriod) for A/Q/S/T
    # -------------------------------------------------------------------------

    # ----- Annual -----
    # Source rows for annual frequencies A/B
    src_A = working.loc[
        working['Frequency'].isin(['A', 'B']) & working['Value'].notna(),
        base_keys + ['PIT Date', 'Value']
    ].copy()
    # As-of join: for each working row, get most recent annual value by PIT Date
    vA, dA = asof_numpy(working, src_A, by_cols=base_keys)
    working['A'], working['A_Date'] = vA, dA
    # Origin fiscal period of annual value (same as row's FiscalPeriod when present)
    working['A_OriginFP'] = np.where(
        working['A'].notna(), working['FiscalPeriod'], np.nan
    )

    # ----- Quarterly -----
    # Source rows for quarterly frequencies (Q/C) with valid QNUM
    src_Q = working.loc[
        working['Frequency'].isin(['Q', 'C']) & working['QNUM'].notna(),
        base_keys + ['QNUM', 'PIT Date', 'Value']
    ].copy()
    for q in (1, 2, 3, 4):
        # Restrict to a specific quarter q
        rv = src_Q[src_Q['QNUM'] == q].drop(columns=['QNUM'])
        # As-of join for that quarter
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'Q{q}', f'Q{q}_Date'
        working[col], working[dcol] = v, d
        # Origin FP column for that quarter
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        # Fill origin FP only where quarter value is non-null and origin not yet set
        m = working[col].notna() & working[ocol].isna()
        working.loc[m, ocol] = working.loc[m, 'FiscalPeriod']

    # ----- Semiannual -----
    # Source rows for semiannual frequencies (S/F) with valid SNUM
    src_S = working.loc[
        working['Frequency'].isin(['S', 'F']) & working['SNUM'].notna(),
        base_keys + ['SNUM', 'PIT Date', 'Value']
    ].copy()
    for s in (1, 2):
        rv = src_S[src_S['SNUM'] == s].drop(columns=['SNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'S{s}', f'S{s}_Date'
        working[col], working[dcol] = v, d
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        m = working[col].notna() & working[ocol].isna()
        working.loc[m, ocol] = working.loc[m, 'FiscalPeriod']

    # ----- Trimester -----
    # Source rows for trimester frequencies (T/K) with valid TNUM
    src_T = working.loc[
        working['Frequency'].isin(['T', 'K']) & working['TNUM'].notna(),
        base_keys + ['TNUM', 'PIT Date', 'Value']
    ].copy()
    for t in (1, 2, 3):
        rv = src_T[src_T['TNUM'] == t].drop(columns=['TNUM'])
        v, d = asof_numpy(working, rv, by_cols=base_keys)
        col, dcol = f'T{t}', f'T{t}_Date'
        working[col], working[dcol] = v, d
        ocol = f'{col}_OriginFP'
        if ocol not in working.columns:
            working[ocol] = np.nan
        m = working[col].notna() & working[ocol].isna()
        working.loc[m, ocol] = working.loc[m, 'FiscalPeriod']

    # -------------------------------------------------------------------------
    # 3) Prepare labels & normalize dates (NO prev-year fill, NO forward-fill)
    # -------------------------------------------------------------------------
    # Sort working data consistently for downstream calculations
    working = working.sort_values(['ID', 'HistCurrency', 'FiscalPeriod', 'PIT Date'])

    # List of all period value columns
    value_cols_all  = [f'Q{i}' for i in range(1, 5)] + \
                      [f'S{i}' for i in range(1, 3)] + \
                      [f'T{i}' for i in range(1, 4)] + ['A']
    # Corresponding date columns
    date_cols_all   = [f'{c}_Date' for c in value_cols_all]
    # Corresponding origin FP columns
    origin_cols_all = [f'{c}_OriginFP' for c in value_cols_all]

    # Ensure that all date columns are proper datetimes (floor to day)
    # Note: explicitly no groupby-forward-fill here – only asof-filled values remain
    for c in date_cols_all:
        if c in working.columns:
            working[c] = pd.to_datetime(working[c], errors='coerce').dt.floor('D')

    # -------------------------------------------------------------------------
    # 4) Build full-year candidates from fixed sets (Q1–Q4, S1–S2, T1–T3)
    # -------------------------------------------------------------------------
    def full_year_from_fixed(row, labels, pit, cutoff):
        """
        Fixed full-year from given labels (e.g. Q1..Q4, S1..S2, T1..T3):

          - Requires ALL labels to have:
              value, date, origin_fp.
          - All dates must be within [cutoff, pit].
          - origin_fp = max(origin_fp_i)  (the 'newest year' across components).

        Returns (dt, val, origin_fp) or (NaT, NaN, NaN) if any requirement fails.
        """
        vals, dts, fps = [], [], []
        for lbl in labels:
            v = row.get(lbl, np.nan)
            d = row.get(f'{lbl}_Date', pd.NaT)
            o = row.get(f'{lbl}_OriginFP', np.nan)

            # Require non-null for value, date, origin FP
            if pd.isna(v) or pd.isna(d) or pd.isna(o):
                return pd.NaT, np.nan, np.nan

            # Normalize date and check it lies within [cutoff, pit]
            d = pd.to_datetime(d, errors='coerce')
            if pd.isna(d) or not (cutoff <= d <= pit):
                return pd.NaT, np.nan, np.nan

            vals.append(float(v))
            dts.append(d)
            fps.append(int(o))

        # Sum all component values, pick latest date and max origin FP
        total_val = float(np.nansum(vals))
        latest_dt = max(dts)
        origin_fp = max(fps)
        return latest_dt, total_val, origin_fp

    def pick_annpit_sum_with_origin(row):
        """
        For each row, pick the best annual PIT-based value (AnnPITValue) by:

        1) Building a time window [PIT - 365 days, PIT].
        2) Generating candidate annual values:
           - actual annual A (if within window and with origin FP),
           - Q4 proxy from Q1..Q4,
           - S2 proxy from S1..S2,
           - T3 proxy from T1..T3,
           each with associated priority and origin FP.
        3) Ranking candidates according to:
           - Same-year A (highest),
           - Same-year proxies (Q4/T3/S2),
           - Prior-year A,
           - Prior-year proxies,
           - Otherwise: any remaining candidate by (priority, latest date).
        4) Returning the chosen candidate's value (keeping zeros, but filtering NaNs).
        """
        pit = row['PIT Date']
        if pd.isna(pit):
            return np.nan
        cutoff = pit - timedelta(days=365)  # Now works because timedelta is imported

        # Current row's fiscal period, cast to int if possible
        fp = row.get('FiscalPeriod', np.nan)
        try:
            fp_int = int(fp) if not pd.isna(fp) else None
        except Exception:
            fp_int = None

        # Collect candidate tuples: (label, priority, date, value, origin_fp)
        candidates = []

        # --- Candidate A: actual annual (0 is allowed)
        A_val = row.get('A', np.nan)
        A_dt  = row.get('A_Date', pd.NaT)
        A_ofp = row.get('A_OriginFP', np.nan)
        if pd.notna(A_val) and pd.notna(A_dt) and not pd.isna(A_ofp):
            A_dt = pd.to_datetime(A_dt, errors='coerce')
            if pd.notna(A_dt) and (cutoff <= A_dt <= pit):
                candidates.append(('A', _PERIOD_PRIORITY['A'], A_dt, float(A_val), int(A_ofp)))

        # --- Candidate Q4: sum of Q1..Q4 (0 allowed)
        q4_dt, q4_val, q4_fp = full_year_from_fixed(
            row, [f'Q{i}' for i in range(1, 5)], pit, cutoff
        )
        if pd.notna(q4_val) and pd.notna(q4_dt) and not pd.isna(q4_fp):
            candidates.append(('Q4', _PERIOD_PRIORITY['Q4'], q4_dt, float(q4_val), int(q4_fp)))

        # --- Candidate S2: sum of S1..S2 (0 allowed)
        s2_dt, s2_val, s2_fp = full_year_from_fixed(
            row, [f'S{i}' for i in range(1, 3)], pit, cutoff
        )
        if pd.notna(s2_val) and pd.notna(s2_dt) and not pd.isna(s2_fp):
            candidates.append(('S2', _PERIOD_PRIORITY['S2'], s2_dt, float(s2_val), int(s2_fp)))

        # --- Candidate T3: sum of T1..T3 (0 allowed)
        t3_dt, t3_val, t3_fp = full_year_from_fixed(
            row, [f'T{i}' for i in range(1, 4)], pit, cutoff
        )
        if pd.notna(t3_val) and pd.notna(t3_dt) and not pd.isna(t3_fp):
            candidates.append(('T3', _PERIOD_PRIORITY['T3'], t3_dt, float(t3_val), int(t3_fp)))

        # If no candidates, return NaN
        if not candidates:
            return np.nan

        # Filter out NaN-valued candidates (keep 0-valued ones)
        def valid(seq):
            return [c for c in seq if not np.isnan(c[3])]

        # Relationship between row's fiscal period and candidate's origin FP:
        # same / prior / other / unknown
        def rel(c):
            _, _, _, _, ofp = c
            if fp_int is None or ofp is None:
                return 'unknown'
            if ofp == fp_int:
                return 'same'
            if ofp == fp_int - 1:
                return 'prior'
            return 'other'

        # 1) Same-year actual annual A (max by date)
        same_A = valid(c for c in candidates if c[0] == 'A' and rel(c) == 'same')
        if same_A:
            best = max(same_A, key=lambda x: x[2])
            return best[3]

        # 2) Same-year proxies (Q4/T3/S2), max by (priority, date)
        same_proxies = valid(
            c for c in candidates
            if c[0] in ('Q4', 'S2', 'T3') and rel(c) == 'same'
        )
        if same_proxies:
            best = max(same_proxies, key=lambda x: (x[1], x[2]))
            return best[3]

        # 3) Prior-year actual annual A
        prior_A = valid(c for c in candidates if c[0] == 'A' and rel(c) == 'prior')
        if prior_A:
            best = max(prior_A, key=lambda x: x[2])
            return best[3]

        # 4) Prior-year proxies (Q4/S2/T3) if no prior A
        prior_proxies = valid(
            c for c in candidates
            if c[0] in ('Q4', 'S2', 'T3') and rel(c) == 'prior'
        )
        if prior_proxies:
            best = max(prior_proxies, key=lambda x: (x[1], x[2]))
            return best[3]

        # 5) Fallback: any candidate (other/unknown) by (priority, date)
        others = valid(candidates)
        if others:
            best = max(others, key=lambda x: (x[1], x[2]))
            return best[3]

        # Final fallback: 0.0 (should rarely be reached)
        return 0.0

    # Apply the selection function row-wise to produce AnnPITValue
    working['AnnPITValue'] = working.apply(pick_annpit_sum_with_origin, axis=1)

    # -------------------------------------------------------------------------
    # 5) QC: Future-date check + PRE-DROP stats
    # -------------------------------------------------------------------------
    # Columns whose dates should not exceed PIT Date
    date_cols = [
        'A_Date',
        'Q1_Date', 'Q2_Date', 'Q3_Date', 'Q4_Date',
        'S1_Date', 'S2_Date',
        'T1_Date', 'T2_Date', 'T3_Date'
    ]
    # Restrict to ones actually present
    present = [c for c in date_cols if c in working.columns]

    viol_counts = {}  # per-label violation counts
    # Mask for rows with any future-dated period
    any_mask = pd.Series(False, index=working.index)

    for c in present:
        # A violation is when period date > PIT Date (both need to be non-null)
        m = (
            working[c].notna() &
            working['PIT Date'].notna() &
            (pd.to_datetime(working[c], errors='coerce') > working['PIT Date'])
        )
        viol_counts[c] = int(m.sum())
        any_mask |= m  # accumulate violations across columns

    total_future_viol = int(any_mask.sum())
    print("\n=== Future-date check (period dates > PIT Date) ===")
    print("Per-label violations:", viol_counts)
    print(f"Rows with ANY future-dated period value: {total_future_viol}")
    # Flagrows with at least one future-date error
    working['HasFutureDateError'] = any_mask

    # -------------------------------------------------------------------------
    # 6) AnnPITValue_Pct + quality drop
    # -------------------------------------------------------------------------
    # Compute AnnPITValue as % of TrueValue (only when TrueValue != 0)
    working['AnnPITValue_Pct'] = np.where(
        working['AnnPITValue'].notna() &
        working['TrueValue'].notna() &
        (working['TrueValue'] != 0),
        (working['AnnPITValue'] / working['TrueValue']) * 100,
        np.nan
    )

    # Stats before dropping low-quality rows
    pre_stats = summarize_pct(working['AnnPITValue_Pct'])
    print("\n=== AnnPITValue_Pct summary (finite only) — BEFORE quality drop ===")
    for k, v in pre_stats.items():
        print(f"{k:>20}: {v}")

    # Build masks for dropping:
    pct = working['AnnPITValue_Pct']
    is_inf = np.isinf(pct)  # infinite percentages
    is_finite = np.isfinite(pct)
    # Out-of-range if % > 200 or % < 50 (but finite)
    out_of_range = is_finite & ((pct > 200) | (pct < 50))
    # Rows to drop: infinite or out-of-range values
    to_drop_quality = is_inf | out_of_range

    dropped_quality_rows = int(to_drop_quality.sum())
    print(f"\nRows to drop due to AnnPITValue_Pct (±inf or >200 or <50): {dropped_quality_rows:,}")

    # Keep only rows that passed the quality filter
    working = working.loc[~to_drop_quality].copy()

    # Stats after dropping
    post_stats = summarize_pct(working['AnnPITValue_Pct'])
    print("\n=== AnnPITValue_Pct summary — AFTER quality drop ===")
    if post_stats:
        for k, v in post_stats.items():
            print(f"{k:>20}: {v}")
    else:
        print("No finite values remain after the quality drop.")

    # -------------------------------------------------------------------------
    # 7) Final columns & save
    # -------------------------------------------------------------------------
    # Base descriptive columns to keep (if present)
    base_cols = [
        'ID', 'CompanyName', 'ImplCountry', 'CurrentCurrency', 'HistCurrency',
        'PIT Date', 'Frequency', 'UpdateCode', 'FiscalPeriod', 'FYE Month',
        'ItemCode', 'Value', 'Str_FiscalPrd'
    ]

    # Build ordered list of period date/value columns
    freq_cols = []
    for i in range(1, 5):
        freq_cols += [f'Q{i}_Date', f'Q{i}']
    for i in range(1, 3):
        freq_cols += [f'S{i}_Date', f'S{i}']
    for i in range(1, 4):
        freq_cols += [f'T{i}_Date', f'T{i}']
    freq_cols += ['A_Date', 'A']

    # Final set of columns to keep in output
    keep_cols = (
        [c for c in base_cols if c in working.columns] +
        ['TrueValue', 'AnnPITValue', 'AnnPITValue_Pct', 'HasFutureDateError'] +
        [c for c in freq_cols if c in working.columns]
    )

    # Helper columns to drop before export
    drop_cols = ['QNUM', 'SNUM', 'TNUM', 'TrueValue_Date']
    # Also drop all *_OriginFP columns
    drop_cols += [c for c in working.columns if c.endswith('_OriginFP')]
    working.drop(columns=[c for c in drop_cols if c in working.columns],
                 inplace=True, errors='ignore')

    # Reorder and restrict columns to the final layout
    mixed_processed = working.reindex(columns=keep_cols)

    # Sanity checks: necessary globals must exist
    assert 'Temp_file_path_DP' in globals(), "Temp_file_path_DP not found."
    assert 'base_output_filename' in globals(), "base_output_filename not found (set in Cell 0)."

    # Build full output path and save pipe-delimited file
    out_full = os.path.join(Temp_file_path_DP, f"{base_output_filename}.txt")
    mixed_processed.to_csv(out_full, sep='|', index=False)
    print("\nSaved full:", out_full)

    # Create a subset for lighter inspection
    subset_cols = ["ID", "PIT Date", "CompanyName", "HistCurrency",
                   "FiscalPeriod", "AnnPITValue"]
    # Only keep subset columns that actually exist
    subset_cols_existing = [col for col in subset_cols if col in mixed_processed.columns]
    subset_df = mixed_processed[subset_cols_existing].copy()
    out_subset = os.path.join(Temp_file_path_DP,
                              f"{base_output_filename}_subset.txt")
    subset_df.to_csv(out_subset, sep='|', index=False)
    print("Saved subset:", out_subset)
    # Explicitly delete subset_df to free memory
    del subset_df

    # -------------------------------------------------------------------------
    # 8) Row-accounting overview
    # -------------------------------------------------------------------------
    output_rows = len(mixed_processed)
    print("\n=== Row Accounting ===")
    print(f"Input rows:                     {input_rows:,}")
    print(f"Excludedby Frequency (E/L/R/U): {excluded_rows:,}")
    print(f"Dropped by quality (Pct rules): {dropped_quality_rows:,}")
    print(f"Output rows (final):            {output_rows:,}")
    # Sum up excluded + dropped + remaining and check against original count
    check_total = excluded_rows + dropped_quality_rows + output_rows
    print(f"Check: excluded + dropped + output = {check_total:,}")
    if check_total == input_rows:
        print("Row counts reconcile exactly.")
    else:
        print(f"Mismatch of {input_rows - check_total:+,} rows. "
              "Investigate upstream filtering or unexpected drops.")

    # Trigger garbage collection as a final cleanup step
    gc.collect()

else:
    # If the main input dataset is not available, skip all processing
    print("mixed_encoded not found or None; skipping.")

Input dataset contains 3,078,865 rows before processing.


=== Future-date check (period dates > PIT Date) ===
Per-label violations: {'A_Date': 0, 'Q1_Date': 0, 'Q2_Date': 0, 'Q3_Date': 0, 'Q4_Date': 0, 'S1_Date': 0, 'S2_Date': 0, 'T1_Date': 0, 'T2_Date': 0, 'T3_Date': 0}
Rows with ANY future-dated period value: 0

=== AnnPITValue_Pct summary (finite only) — BEFORE quality drop ===
         finite_rows: 1929431
                mean: 18990.9321407337
              median: 100.0
winsorized_mean_1pct: 100.04908826721622
                 p10: 99.30057821787052
                 p20: 100.0
                 p30: 100.0
                 p40: 100.0
                 p50: 100.0
                 p60: 100.0
                 p70: 100.0
                 p80: 100.0
                 p90: 101.93498320484426

Rows to drop due to AnnPITValue_Pct (±inf or >200 or <50): 54,319

=== AnnPITValue_Pct summary — AFTER quality drop ===
         finite_rows: 1875112
                mean: 100.34380370354451
        