### Mount Google Drive, Import Libraries and Define Paths

In [44]:
# =============================================================================
# ENVIRONMENT SETUP + PATH CONFIGURATION (SERVER / COLAB COMPATIBLE)
# =============================================================================

import os
import sys
import importlib
from pathlib import Path
import string
import re
import gc

# -----------------------------------------------------------------------------
# 0) HARD SAFETY: cap native thread usage (prevents pthread_create EAGAIN)
#    MUST be set before importing numpy / scipy / pandas
# -----------------------------------------------------------------------------
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["OPENBLAS_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["NUMEXPR_MAX_THREADS"] = "1"
os.environ["VECLIB_MAXIMUM_THREADS"] = "1"
os.environ["BLIS_NUM_THREADS"] = "1"

# -----------------------------------------------------------------------------
# 1) Detect environment
# -----------------------------------------------------------------------------
IN_COLAB = "google.colab" in sys.modules

# -----------------------------------------------------------------------------
# 2) (Colab only) Mount Google Drive
# -----------------------------------------------------------------------------
if IN_COLAB:
    from google.colab import drive
    drive.mount("/content/drive")
    BASE_PATH = "/content/drive/MyDrive/Colab Notebooks"
else:
    # Server base path (your target)
    BASE_PATH = "/home/jovyan/work/hpool1/pseidel/test"

print("IN_COLAB:", IN_COLAB)
print("BASE_PATH:", BASE_PATH)

# -----------------------------------------------------------------------------
# 3) Sanity checks: path exists + write permission
# -----------------------------------------------------------------------------
BASE = Path(BASE_PATH)
if not BASE.exists():
    raise FileNotFoundError(f"BASE_PATH does not exist: {BASE}")

# quick write test (fails fast if you don't have permissions)
test_file = BASE / ".write_test_tmp"
try:
    test_file.write_text("ok", encoding="utf-8")
    test_file.unlink()
except Exception as e:
    raise PermissionError(f"No write permission in {BASE}. Error: {e}")

# -----------------------------------------------------------------------------
# 4) Environment check: ensure required packages import cleanly
# -----------------------------------------------------------------------------
required_packages = ["numpy", "scipy", "pandas", "linearmodels", "xlsxwriter"]

for pkg in required_packages:
    print(f"Importing {pkg} ...")
    importlib.import_module(pkg)
    print(f"{pkg} OK")

# -----------------------------------------------------------------------------
# 5) Base paths and input/output locations
# -----------------------------------------------------------------------------
Input_file_path   = str(BASE / "Input")
Temp_file_path    = str(BASE / "Temp")
Output_file_path  = str(BASE / "Output")

Fundamentals_file_path = f"{Input_file_path}/WSFV_f_20250131.txt"
Current_file_path      = f"{Input_file_path}/WSCurrent_f_20250131.txt"
Calendar_file_path     = f"{Input_file_path}/WSCalendarPrd_f_20250131.txt"
Meta_file_path         = f"{Input_file_path}/WSMetaData_f_20250131.txt"
Excel_file_path        = f"{Input_file_path}/WS PIT Table Definitions V5 with start dates.xls"

MarketValues_file_path          = f"{Input_file_path}/Daily MV USD"
MarketValues_file_path_LC       = f"{Input_file_path}/Daily MV LC"
DailyTotalReturns_file_path     = f"{Input_file_path}/Daily Returns USD"
DailyIndexReturns_file_path     = f"{Input_file_path}/Daily Index Returns USD"
Constituents_file_path          = f"{Input_file_path}/Constituents.01.csv"
UniversalMatching_file_path     = f"{Input_file_path}/Universal Matching File"

Temp_file_path_GO  = f"{Temp_file_path}/TempGeneralOverview"
Temp_file_path_EoC = f"{Temp_file_path}/TempExtractionofCharacteristics"
Temp_file_path_DP  = f"{Temp_file_path}/TempDataPreparation"
Temp_file_path_A   = f"{Temp_file_path}/TempAnomalies"
Temp_file_path_R   = f"{Temp_file_path}/TempRegressionModel"

Relevant_items_path   = f"{Input_file_path}/RelevantItems.txt"
Relevant_items_path_A = f"{Input_file_path}/RelevantItems.txt"
Relevant_items_path_B = f"{Input_file_path}/RelevantItemsB.txt"
Relevant_items_path_C = f"{Input_file_path}/RelevantItemsC.txt"
Relevant_items_path_D = f"{Input_file_path}/RelevantItemsD.txt"

Subset_file_path = f"{Temp_file_path_GO}/Subsets"
Fundamentals_clean_file_path = f"{Temp_file_path_GO}/Fundamentals_clean.txt"
Current_clean_file_path      = f"{Temp_file_path_GO}/Current_clean.txt"
Calendar_clean_file_path     = f"{Temp_file_path_GO}/Input/Calendar_clean.txt"
Meta_clean_file_path         = f"{Temp_file_path_GO}/Input/Meta_clean.txt"

# -----------------------------------------------------------------------------
# 6) Ensure required directories exist
# -----------------------------------------------------------------------------
Path(Output_file_path).mkdir(parents=True, exist_ok=True)
Path(Temp_file_path_GO).mkdir(parents=True, exist_ok=True)
Path(Temp_file_path_EoC).mkdir(parents=True, exist_ok=True)
Path(Temp_file_path_DP).mkdir(parents=True, exist_ok=True)
Path(Temp_file_path_A).mkdir(parents=True, exist_ok=True)
Path(Temp_file_path_R).mkdir(parents=True, exist_ok=True)
Path(Subset_file_path).mkdir(parents=True, exist_ok=True)
Path(Path(Calendar_clean_file_path).parent).mkdir(parents=True, exist_ok=True)

# -----------------------------------------------------------------------------
# 7) Streaming / deduplication settings
# -----------------------------------------------------------------------------
CHUNK_SIZE = 2_000_000
DATE_COL = "PIT Date"
DEDUP_KEYS = ["ID", "ItemCode", DATE_COL]

print("Paths configured. Temp outputs ->", Temp_file_path_GO)
print("Example input path ->", Fundamentals_file_path)


IN_COLAB: False
BASE_PATH: /home/jovyan/work/hpool1/pseidel/test
Importing numpy ...
numpy OK
Importing scipy ...
scipy OK
Importing pandas ...
pandas OK
Importing linearmodels ...
linearmodels OK
Importing xlsxwriter ...
xlsxwriter OK
Paths configured. Temp outputs -> /home/jovyan/work/hpool1/pseidel/test/Temp/TempGeneralOverview
Example input path -> /home/jovyan/work/hpool1/pseidel/test/Input/WSFV_f_20250131.txt


# Worldscope PIT

## 1.0. Extraction of Relevant Data from the Different Input Files

### Clean Data

In [45]:
# =============================================================================
# IMPORT AND FILTER REQUIRED CODES FROM SUBSET FILES INTO FILTERED OUTPUTS
# =============================================================================
# This cell:
#   - Reads only a specific, predefined set of item codes from subset_XXXXX.txt
#     files in the Subset_file_path directory.
#   - Normalizes raw code strings to a consistent 5-digit numeric format so that
#     route definitions and file names align (e.g., "6027" vs "06027").
#   - For each required code, loads its corresponding subset file and:
#       * Detects whether the file has named columns (ID, PIT DATE, ItemCode,
#         Value, Frequency, FiscalPeriod) or uses positional columns.
#       * Selects and reorders the relevant columns into standardized 4-column
#         (Current/Meta-like) or 6-column (Calendar) structures.
#       * Optionally transforms the value column to title case for certain
#         attributes (e.g., Nation, EntityType) to standardize text formatting.
#   - Writes the cleaned, standardized data into filtered_*.txt files with
#     fixed headers in the Temp_file_path_EoC directory.
#   - Collects detailed per-code logging on success, missing subset files, or
#     errors, and reports summary statistics including rows written per code.
#
# Filtering / transformation focus:
#   - Filters the universe of available subset_XXXXX.txt files down to a
#     required_codes set derived from ROUTES and ROUTES_CALENDAR.
#   - For each loaded subset, either:
#       * Explicitly maps named columns (ID, PIT DATE, ItemCode, Value, etc.),
#         or
#       * Falls back to positional slicing of the first N columns to match
#         the expected 4- or 6-column output structure.
#   - Applies a text transformation (title casing) to selected value columns
#     when configured (do_title=True) to normalize categorical string values.
# =============================================================================

# High-level imports needed in this cell
import re
from pathlib import Path
import pandas as pd
import gc

# Subset_file_path: directory containing subset_XXXXX.txt files (one per code)
# Temp_file_path_EoC: directory where filtered_*.txt outputs will be written
subset_dir = Path(Subset_file_path)   # Base directory for subset input files
out_dir = Path(Temp_file_path_EoC)    # Output directory for filtered data
out_dir.mkdir(parents=True, exist_ok=True)  # Ensure output directory exists

def normalize_code(code: str) -> str:
    """
    Normalize a raw code to a 5-digit numeric string.
    - Strips whitespace.
    - Removes all non-digit characters.
    - If the remaining digits are 4 characters long, a leading '0' is added.
    - Otherwise returns the digits unchanged.
    """
    s = str(code).strip()                    # Convert to string and strip whitespace
    s_digits = re.sub(r"\D+", "", s)         # Keep only digits from the string
    if not s_digits:                         # If no digits found, return original string
        return s
    # Return normalized code:
    #   - exactly 5 digits if already length 5
    #   - prepend '0' if length 4
    #   - otherwise return as-is (e.g., longer codes)
    return s_digits if len(s_digits) == 5 else ("0" + s_digits if len(s_digits) == 4 else s_digits)

# --- ROUTES (Current/Meta-like; 4-column outputs) ---
# Each entry maps a code to:
#   (friendly output name, fixed header line, apply_title_case_to_value)
ROUTES = {
    "6027": ("NationCode",   "ID|PIT DATE|ItemCode|NationCode\n",   True),
    "6026": ("Nation",       "ID|PIT DATE|ItemCode|Nation\n",       True),
    "6001": ("CompanyName",  "ID|PIT DATE|ItemCode|Name\n",         False),
    "6100": ("EntityType",   "ID|PIT DATE|ItemCode|EntityType\n",   True),
    "11503":("ADRIndicator", "ID|PIT DATE|ItemCode|ADRIndicator\n", True),
    "56027":("CurrencyCode", "ID|PIT DATE|ItemCode|CurrencyCode\n", True),
    "5350": ("FYE",          "ID|PIT DATE|ItemCode|FYE\n",          True),
    "7021": ("SIC",          "ID|PIT DATE|ItemCode|SIC\n",          True),
}
# Normalize all route keys so that they are 5-digit codes
ROUTES = {normalize_code(k): v for k, v in ROUTES.items()}

# --- Calendar route (6-column output) ---
# Each entry maps a code to:
#   (friendly output name, fixed header line, apply_title_case_to_value)
ROUTES_CALENDAR = {
    "57034": ("UpdateCode", "ID|PIT Date|Frequency|FiscalPeriod|ItemCode|Value\n", True),
}
# Normalize calendar route keys as well
ROUTES_CALENDAR = {normalize_code(k): v for k, v in ROUTES_CALENDAR.items()}

# Combine all required codes from both four-column and calendar routes
required_codes = set(ROUTES.keys()) | set(ROUTES_CALENDAR.keys())

# Dictionaries to keep track of processing results and row counts per code
attempt_log = {}   # code -> {status, message, rows_out, outfile}
match_counts = {}  # code -> rows written

def _write_df_with_fixed_header(df: pd.DataFrame, cols_out: list, header_text: str,
                                out_path: Path, title_case_value: bool, value_col_name: str):
    """
    Write a DataFrame to a pipe-separated text file with:
      - A fixed header line (header_text) written manually.
      - A subset of columns (cols_out) written without column names.
    Optionally title-case the specified value column before writing.
    """
    df = df.copy()  # Work on a copy to avoid modifying the original DataFrame
    # If requested and the value column exists, convert its entries to title case
    if title_case_value and value_col_name in df.columns:
        df[value_col_name] = df[value_col_name].astype(str).map(
            lambda x: x.title() if x is not None else x
        )
    # First write the fixed header, then append data rows without header
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(header_text)
    df.to_csv(
        out_path,
        sep="|",
        index=False,
        header=False,
        mode="a",
        encoding="utf-8",
        lineterminator="\n",
        columns=cols_out,
    )

def _import_route_fourcol(code: str, friendly_name: str, header_text: str, do_title: bool):
    """
    Import a 4-column route (e.g., Current/Meta-like attributes).
    Supports input subset files with:
      - Named columns ID, PITDATE, ITEMCODE, VALUE, or
      - Generic positional columns (uses either 4 or 6 source columns).
    The function standardizes to a 4-column output and writes it to a
    filtered_<friendly_name>.txt file in out_dir.
    """
    subset_path = subset_dir / f"subset_{code}.txt"  # Expected input file path
    out_path = out_dir / f"filtered_{friendly_name}.txt"  # Output file path
    # If the subset file does not exist, log as missing and skip
    if not subset_path.exists():
        attempt_log[code] = {
            "status": "missing_subset",
            "message": f"Missing {subset_path}",
            "rows_out": 0,
            "outfile": str(out_path),
        }
        return
    try:
        # Read the subset file as a pipe-separated text file with all columns as strings
        df = pd.read_csv(subset_path, sep="|", dtype=str, engine="c", encoding="latin1")
        # Normalize column names by stripping spaces
        cols = [c.strip() for c in df.columns]
        # Build a mapping from normalized (lowercased, no spaces) names to original names
        colmap = {c.lower().replace(" ", ""): c for c in cols}
        # Check if the expected four logical columns are present
        if {"id", "pitdate", "itemcode", "value"}.issubset(colmap.keys()):
            # Use explicit named columns when available
            wanted = [
                colmap["id"],
                colmap["pitdate"],
                colmap["itemcode"],
                colmap["value"],
            ]
            _write_df_with_fixed_header(
                df,
                wanted,
                header_text,
                out_path,
                do_title,
                value_col_name=colmap["value"],
            )
            n = len(df)  # All rows are written out
        else:
            # Fallback: use positional selection when column names cannot be mapped
            if df.shape[1] >= 6:
                # If there are at least 6 columns, use columns [0,1,4,5]
                wanted_pos = [0, 1, 4, 5]
            else:
                # Otherwise, default to the first four columns
                wanted_pos = [0, 1, 2, 3]
            # Slice the DataFrame to keep only the selected columns
            df_pos = df.iloc[:, wanted_pos]
            valname = df_pos.columns[-1]  # Last column treated as value column
            _write_df_with_fixed_header(
                df_pos,
                list(df_pos.columns),
                header_text,
                out_path,
                do_title,
                value_col_name=valname,
            )
            n = len(df_pos)
        # Log successful processing and number of rows written
        attempt_log[code] = {
            "status": "ok",
            "message": "written",
            "rows_out": n,
            "outfile": str(out_path),
        }
        match_counts[code] = n
    except Exception as e:
        # Log any unexpected error during processing
        attempt_log[code] = {
            "status": "error",
            "message": str(e),
            "rows_out": 0,
            "outfile": str(out_path),
        }

def _import_route_sixcol(code: str, friendly_name: str, header_text: str, do_title: bool):
    """
    Import a 6-column Calendar route.
    Supports input subset files with:
      - Named columns ID, PITDATE, FREQUENCY, FISCALPERIOD, ITEMCODE, VALUE, or
      - Generic positional columns (first 6 columns used).
    The function standardizes to a 6-column output and writes it to a
    filtered_<friendly_name>.txt file in out_dir.
    """
    subset_path = subset_dir / f"subset_{code}.txt"  # Expected input file path
    out_path = out_dir / f"filtered_{friendly_name}.txt"  # Output file path
    # If the subset file does not exist, log as missing and skip
    if not subset_path.exists():
        attempt_log[code] = {
            "status": "missing_subset",
            "message": f"Missing {subset_path}",
            "rows_out": 0,
            "outfile": str(out_path),
        }
        return
    try:
        # Read the subset file as a pipe-separated text file with all columns as strings
        df = pd.read_csv(subset_path, sep="|", dtype=str, engine="c", encoding="latin1")
        # Normalize column names by stripping spaces
        cols = [c.strip() for c in df.columns]
        # Build a mapping from normalized (lowercased, no spaces) names to original names
        colmap = {c.lower().replace(" ", ""): c for c in cols}
        # Check if the expected six logical columns are present
        if {
            "id",
            "pitdate",
            "frequency",
            "fiscalperiod",
            "itemcode",
            "value",
        }.issubset(colmap.keys()):
            # Use explicit named columns when available
            wanted = [
                colmap["id"],
                colmap["pitdate"],
                colmap["frequency"],
                colmap["fiscalperiod"],
                colmap["itemcode"],
                colmap["value"],
            ]
            _write_df_with_fixed_header(
                df,
                wanted,
                header_text,
                out_path,
                do_title,
                value_col_name=colmap["value"],
            )
            n = len(df)
        else:
            # Fallback: require at least 6 columns to map positionally
            if df.shape[1] < 6:
                attempt_log[code] = {
                    "status": "error",
                    "message": f"Expected >=6 columns, found {df.shape[1]}",
                    "rows_out": 0,
                    "outfile": str(out_path),
                }
                return
            # Take the first six columns as the standardized output
            wanted_pos = [0, 1, 2, 3, 4, 5]
            df_pos = df.iloc[:, wanted_pos]
            valname = df_pos.columns[-1]  # Last column is treated as value column
            _write_df_with_fixed_header(
                df_pos,
                list(df_pos.columns),
                header_text,
                out_path,
                do_title,
                value_col_name=valname,
            )
            n = len(df_pos)
        # Log successful processing and number of rows written
        attempt_log[code] = {
            "status": "ok",
            "message": "written",
            "rows_out": n,
            "outfile": str(out_path),
        }
        match_counts[code] = n
    except Exception as e:
        # Log any unexpected error during processing
        attempt_log[code] = {
            "status": "error",
            "message": str(e),
            "rows_out": 0,
            "outfile": str(out_path),
        }

# Execute imports only for the required codes
for code in sorted(required_codes):
    # Decide whether to treat the code as a four-column or six-column route
    if code in ROUTES:
        friendly, header_text, do_title = ROUTES[code]
        _import_route_fourcol(code, friendly, header_text, do_title)
    elif code in ROUTES_CALENDAR:
        friendly, header_text, do_title = ROUTES_CALENDAR[code]
        _import_route_sixcol(code, friendly, header_text, do_title)
    gc.collect()  # Explicitly invoke garbage collection to free memory between iterations

# Summary statistics for all processed codes
total_ok = sum(1 for v in attempt_log.values() if v["status"] == "ok")
total_missing = sum(1 for v in attempt_log.values() if v["status"] == "missing_subset")
total_error = sum(1 for v in attempt_log.values() if v["status"] == "error")

print("Finished imports from subset files for required codes only.")
print(f"  Codes processed: {len(required_codes)}")
print(f"  OK: {total_ok}, Missing subset: {total_missing}, Errors: {total_error}")
print("Per-code results:")
for code in sorted(required_codes):
    info = attempt_log.get(code, {})
    print(
        f"  {code}: {info.get('status','n/a')} | "
        f"rows_out={info.get('rows_out',0)} | "
        f"out={info.get('outfile','-')} | {info.get('message','')}"
    )
print("\nAll filtered datasets saved in:")
print(f"  {out_dir}")


Finished imports from subset files for required codes only.
  Codes processed: 9
  OK: 9, Missing subset: 0, Errors: 0
Per-code results:
  05350: ok | rows_out=7162436 | out=/home/jovyan/work/hpool1/pseidel/test/Temp/TempExtractionofCharacteristics/filtered_FYE.txt | written
  06001: ok | rows_out=231806 | out=/home/jovyan/work/hpool1/pseidel/test/Temp/TempExtractionofCharacteristics/filtered_CompanyName.txt | written
  06026: ok | rows_out=131340 | out=/home/jovyan/work/hpool1/pseidel/test/Temp/TempExtractionofCharacteristics/filtered_Nation.txt | written
  06027: ok | rows_out=121852 | out=/home/jovyan/work/hpool1/pseidel/test/Temp/TempExtractionofCharacteristics/filtered_NationCode.txt | written
  06100: ok | rows_out=121531 | out=/home/jovyan/work/hpool1/pseidel/test/Temp/TempExtractionofCharacteristics/filtered_EntityType.txt | written
  07021: ok | rows_out=227315 | out=/home/jovyan/work/hpool1/pseidel/test/Temp/TempExtractionofCharacteristics/filtered_SIC.txt | written
  11503: 

### Support Excel-file

#### Current & Fundamentals tab

In [46]:
# =============================================================================
# BUILD VALUE-CODING LOOKUP FROM EXCEL DEFINITIONS AND EXPORT TO TEXT
# =============================================================================
# This cell:
#   - Reads selected sheets ("WS_Current" and "WS_FV") from the PIT table
#     definition Excel file.
#   - Extracts only the relevant columns (ItemCode and ItemName) from each sheet
#     and stacks them into a single combined DataFrame.
#   - Filters the combined DataFrame so that only rows with purely numeric
#     ItemCodes are retained (non-numeric codes are removed).
#   - Normalizes ItemCodes by adding a leading zero to 4-digit codes so that all
#     codes have a consistent 5-digit format.
#   - Derives a new categorical column "Source" based on ItemCode prefixes
#     (e.g., '01' -> IS, '02'/'03' -> BS, '04' -> CFS, '05' -> Market, '10' -> Other).
#   - Exports the resulting standardized value-coding table to a pipe-separated
#     text file with header [ItemCode|ItemName|Source] in Temp_file_path_EoC.
#
# Filtering / transformation focus:
#   - Filters out any rows where ItemCode is not strictly digits using a regex.
#   - Transforms 4-digit ItemCodes into 5-digit codes by prefixing with '0'.
#   - Transforms ItemCodes into a categorical "Source" variable via prefix-based
#     mapping, enabling downstream grouping or filtering by data source.
# =============================================================================

# --- Inputs/outputs ---
temp_file  = f"{Temp_file_path_EoC}/ValueCoding.txt"  # Target output text file path

# --- Which sheets do you want? ---
# List of sheet names from the Excel file that will be read and combined
sheets_to_export = ["WS_Current", "WS_FV"]

# --- Read the sheets into DataFrames (only col 1 + col 4) ---
dfs = []  # List to hold DataFrames for each sheet
for sh in sheets_to_export:
    # Read only the first and fourth columns from the specified sheet
    df = pd.read_excel(
        Excel_file_path,
        sheet_name=sh,
        dtype=str,   # Read all values as strings to preserve leading zeros
        usecols=[0, 3],  # Use column indices 0 and 3 (1st and 4th logical columns)
        engine="xlrd"
    )
    # Explicitly rename columns to standardized names
    df.columns = ["ItemCode", "ItemName"]
    dfs.append(df)  # Append each sheet's DataFrame to the list

# --- Combine them ---
# Vertically concatenate the DataFrames from all selected sheets into one
combined_df = pd.concat(dfs, ignore_index=True)

# --- Filter out rows that do not have a number in the first column (ItemCode) ---
# Keep only rows where ItemCode consists of digits only (no letters or symbols)
combined_df = combined_df[
    combined_df['ItemCode'].astype(str).str.match(r'^\d+$')
].copy()

# --- Add leading zero to 4-digit ItemCodes ---
# Normalize ItemCodes: if a code is a 4-character string, prepend a '0'
combined_df['ItemCode'] = combined_df['ItemCode'].apply(
    lambda x: f'0{x}' if isinstance(x, str) and len(x) == 4 else x
)

# --- Add a 'Source' column based on the first few characters of ItemCode ---
def assign_source(item_code):
    """
    Map an ItemCode to a high-level source category based on its prefix:
      - '01' -> 'IS' (Income Statement)
      - '02' or '03' -> 'BS' (Balance Sheet)
      - '04' -> 'CFS' (Cash Flow Statement)
      - '05' -> 'Market'
      - '10' -> 'Other'
      - Anything else or missing -> 'n.a.'
    """
    if pd.isna(item_code):       # Handle missing values
        return 'n.a.'
    item_code_str = str(item_code).strip()  # Normalize to trimmed string
    if item_code_str.startswith('01'):
        return 'IS'
    elif item_code_str.startswith('02') or item_code_str.startswith('03'):
        return 'BS'
    elif item_code_str.startswith('04'):
        return 'CFS'
    elif item_code_str.startswith('05'):
        return 'Market'
    elif item_code_str.startswith('10'):
        return 'Other'  # ItemCodes starting with '10' are categorized as 'Other'
    else:
        return 'n.a.'   # All other prefixes default to 'n.a.'

# Apply the source assignment function to derive the 'Source' column
combined_df['Source'] = combined_df['ItemCode'].apply(assign_source)

# --- Export to txt with header ---
# Write the final DataFrame to a pipe-separated text file, including the header
combined_df.to_csv(temp_file, sep="|", index=False, header=True, encoding="utf-8")

print(f"Exported {len(combined_df):,} rows with headers [ItemCode|ItemName|Source] to {temp_file}")
print("\n=== Preview of the exported data ===")
display(combined_df.head())


Exported 477 rows with headers [ItemCode|ItemName|Source] to /home/jovyan/work/hpool1/pseidel/test/Temp/TempExtractionofCharacteristics/ValueCoding.txt

=== Preview of the exported data ===


Unnamed: 0,ItemCode,ItemName,Source
2,5006,Market Price Current,Market
3,5007,Market Price YTD High Current,Market
4,5008,Market Price YTD Low Current,Market
5,5009,Date of Current Price,Market
6,5091,Market Price 52 Week High Current,Market


#### Only Fundamentals tab

In [47]:
# =============================================================================
# BUILD FUNDAMENTALS-CODING LIST FROM EXCEL AND EXPORT TO TEXT
# =============================================================================
# This cell:
#   - Reads ItemCodes from the "WS_FV" sheet of the PIT table definition
#     Excel file, using only the first column.
#   - Combines all selected sheets (here only "WS_FV") into a single DataFrame.
#   - Drops the first four rows, which typically contain metadata, titles, or
#     non-code content that should not be part of the fundamentals code list.
#   - Normalizes ItemCodes by adding a leading zero to 4-digit codes so that
#     they follow a consistent 5-digit format.
#   - Exports the resulting standardized fundamentals-coding list to a
#     pipe-separated text file with a header in Temp_file_path_EoC.
#
# Filtering / transformation focus:
#   - Structural filtering: removes the first four rows that are not part of
#     the actual code list.
#   - Data transformation: converts 4-character ItemCodes into 5-character
#     codes by prefixing them with '0', ensuring a consistent code format for
#     downstream joins, merges, or lookups.
# =============================================================================

# --- Inputs/outputs ---
temp_file  = f"{Temp_file_path_EoC}/FundamentalsCoding.txt"  # Target output text file path

# --- Which sheets do you want? ---
# List of sheet names from the Excel file that will be read and combined
sheets_to_export = ["WS_FV"]

# --- Read the sheets into DataFrames (only col 1) ---
dfs = []  # List to collect DataFrames for each sheet
for sh in sheets_to_export:
    # Read only the first column from the specified sheet
    df = pd.read_excel(
        Excel_file_path,
        sheet_name=sh,
        dtype=str,   # Read values as strings to preserve leading zeros
        usecols=[0],  # Only take column 1 (index 0)
        engine="xlrd"
    )
    # Explicitly rename the single column to a standardized name
    df.columns = ["ItemCode"]
    dfs.append(df)  # Append the sheet's DataFrame to the list

# --- Combine them ---
# Concatenate all DataFrames from selected sheets into a single DataFrame
combined_df = pd.concat(dfs, ignore_index=True)

# --- Remove the first 4 rows ---
# Drop the first four rows (e.g., header-like or metadata rows) and reset index
combined_df = combined_df.iloc[4:].reset_index(drop=True)

# --- Add leading zero to 4-digit ItemCodes ---
# Normalize ItemCodes: if a code is a 4-character string, prepend a '0'
combined_df['ItemCode'] = combined_df['ItemCode'].apply(
    lambda x: f'0{x}' if isinstance(x, str) and len(x) == 4 else x
)

# --- Export to txt with header ---
# Write the final list of fundamentals ItemCodes to a pipe-separated text file
combined_df.to_csv(temp_file, sep="|", index=False, header=True, encoding="utf-8")

print(f"Exported {len(combined_df):,} rows with headers [ItemCode|ItemName] to {temp_file}")


Exported 380 rows with headers [ItemCode|ItemName] to /home/jovyan/work/hpool1/pseidel/test/Temp/TempExtractionofCharacteristics/FundamentalsCoding.txt


## 2.0 General Summary Statistics

### NationCode

In [48]:
# =============================================================================
# QUALITY CHECKS ON FILTERED NATION CODE DATA
# =============================================================================
# This cell:
#   - Reads a pre-filtered NationCode dataset from Temp_file_path_EoC.
#   - Computes basic ID-level statistics:
#       * Number of unique IDs in the file.
#       * Number of IDs that appear more than once (indicating multiple records
#         per entity).
#   - Evaluates data quality for the NationCode column by:
#       * Normalizing values to strings and stripping whitespace.
#       * Flagging rows where NationCode is missing, empty, or contains
#         placeholder/null-like values such as "na", "nan", "null", etc.
#   - Reports the number of rows that fail this quality check so the extent of
#     invalid NationCode entries is visible.
#
# Filtering / transformation focus:
#   - Creates a boolean error_mask over rows based on transformed NationCode
#     (whitespace-stripped, lowercased), identifying missing or invalid entries.
#   - Uses this mask to count how many rows have problematic NationCode values,
#     which can be used to filter those rows out in downstream processing.
# =============================================================================

# Paths
FILE = f'{Temp_file_path_EoC}/filtered_NationCode.txt'  # Path to the filtered NationCode file

VALUE_COL = "NationCode"  # Name of the value column to check for data quality

# Read
df = pd.read_csv(
    FILE,
    sep="|",          # Use pipe as the column separator
    encoding="utf-8", # Assume UTF-8 encoding
    dtype=str,        # Read all columns as strings for consistent processing
    keep_default_na=True  # Let pandas interpret standard NA markers as NaN
)

# Quick preview of the loaded data for sanity check
print(f"\n=== Preview of {FILE} ===")
print(df.head(), "\n")

# Stats
# Count how many distinct entities (IDs) are present
unique_ids = df["ID"].nunique()

# Count how many IDs occur more than once (multiple records per ID)
multi_used_ids = (df["ID"].value_counts() > 1).sum()

# Work with the value column as pandas StringDtype for robust string operations
val = df[VALUE_COL].astype("string")

# Strip leading/trailing whitespace to normalize text values
val_stripped = val.str.strip()

# Build a mask of rows where NationCode is considered invalid:
#   - missing (NaN),
#   - empty after stripping,
#   - or equal (case-insensitive) to typical placeholder/null strings
error_mask = val.isna() | (
    (val_stripped == "") |
    val_stripped.str.lower().isin(
        {"na", "nan", "null", "none", "n/a", "#n/a", "n"}
    )
)

# Total number of rows with invalid NationCode values
error_rows = int(error_mask.sum())

# Print summary statistics about ID usage and NationCode quality
print(f"File: {FILE}")
print(f"Number of Unique IDs: {unique_ids}")
print(f"Number of IDs used multiple times: {multi_used_ids}")
print(f'Rows with error in "{VALUE_COL}": {error_rows}')



=== Preview of /home/jovyan/work/hpool1/pseidel/test/Temp/TempExtractionofCharacteristics/filtered_NationCode.txt ===
          ID    PIT DATE ItemCode NationCode
0  C00948205  2021-07-09     6027       S840
1  C02500770  1995-12-29     6027        S25
2  C0250077A  1999-10-01     6027        S25
3  C0250077B  1999-10-01     6027        S25
4  C0250077C  1999-10-01     6027        S25 

File: /home/jovyan/work/hpool1/pseidel/test/Temp/TempExtractionofCharacteristics/filtered_NationCode.txt
Number of Unique IDs: 121008
Number of IDs used multiple times: 449
Rows with error in "NationCode": 0


### Nation

In [49]:
# =============================================================================
# QUALITY CHECKS ON FILTERED NATION DATA
# =============================================================================
# This cell:
#   - Reads a pre-filtered Nation dataset from Temp_file_path_EoC.
#   - Computes basic ID-level statistics:
#       * Number of unique IDs present in the file.
#       * Number of IDs that occur more than once (indicating multiple records
#         per entity).
#   - Evaluates data quality for the Nation column by:
#       * Converting values to a pandas string type and stripping whitespace.
#       * Flagging rows where Nation is missing, empty, or matches common
#         placeholder/null-like strings (e.g., "na", "null", "n/a").
#   - Reports how many rows have invalid Nation values, providing an overview
#     of potential data quality issues in this attribute.
#
# Filtering / transformation focus:
#   - Constructs a boolean error_mask over rows based on normalized Nation
#     values (whitespace-stripped and lowercased).
#   - Uses this mask to count rows with missing or invalid Nation entries,
#     which can later be used to filter such records or impute values.
# =============================================================================

# Paths
FILE = f'{Temp_file_path_EoC}/filtered_Nation.txt'  # Path to the filtered Nation file

VALUE_COL = "Nation"  # Column whose content will be checked for quality issues

# Read the filtered Nation file into a DataFrame
df = pd.read_csv(
    FILE,
    sep="|",           # Pipe-separated file
    encoding="utf-8",  # UTF-8 encoding
    dtype=str,         # Read all columns as strings for consistent handling
    keep_default_na=True  # Interpret standard NA markers as NaN
)

# Display a small preview of the loaded data
print(f"\n=== Preview of {FILE} ===")
print(df.head(), "\n")

# Compute statistics on ID usage
unique_ids = df["ID"].nunique()                 # Number of distinct IDs
multi_used_ids = (df["ID"].value_counts() > 1).sum()  # IDs that appear more than once

# Work with the Nation column as a pandas StringDtype for robust string operations
val = df[VALUE_COL].astype("string")

# Strip leading and trailing whitespace to normalize Nation values
val_stripped = val.str.strip()

# Build a mask of rows where Nation is considered invalid:
#   - missing (NaN)
#   - empty string after stripping
#   - equal (case-insensitive) to typical placeholder/null markers
error_mask = val.isna() | (
    (val_stripped == "") |
    val_stripped.str.lower().isin(
        {"na", "nan", "null", "none", "n/a", "#n/a", "n"}
    )
)

# Count the number of rows with invalid Nation values
error_rows = int(error_mask.sum())

# Print summary statistics about ID usage and Nation data quality
print(f"File: {FILE}")
print(f"Number of Unique IDs: {unique_ids}")
print(f"Number of IDs used multiple times: {multi_used_ids}")
print(f'Rows with error in "{VALUE_COL}": {error_rows}')



=== Preview of /home/jovyan/work/hpool1/pseidel/test/Temp/TempExtractionofCharacteristics/filtered_Nation.txt ===
          ID    PIT DATE ItemCode      Nation
0  C00948205  2021-07-09     6026     'Canada
1  C02500770  1999-10-01     6026  'Argentina
2  C0250077A  1999-10-01     6026  'Argentina
3  C0250077B  1999-10-01     6026  'Argentina
4  C0250077C  1999-10-01     6026  'Argentina 

File: /home/jovyan/work/hpool1/pseidel/test/Temp/TempExtractionofCharacteristics/filtered_Nation.txt
Number of Unique IDs: 120964
Number of IDs used multiple times: 7282
Rows with error in "Nation": 0


### CompanyName

In [50]:
# =============================================================================
# QUALITY CHECKS ON FILTERED COMPANY NAME DATA
# =============================================================================
# This cell:
#   - Loads the filtered company name dataset from Temp_file_path_EoC.
#   - Calculates key ID-level statistics:
#       * Total number of unique IDs.
#       * Number of IDs that appear more than once (indicating multiple
#         company name records per entity).
#   - Performs a quality assessment of the "Name" column by:
#       * Converting to pandas' string type, trimming whitespace, and
#         standardizing text for comparison.
#       * Identifying rows where Name is missing, empty, or equal to
#         placeholder/null-like terms such as "na", "none", or "n/a".
#   - Reports the number of rows in which Name values fail these checks,
#     highlighting potential data quality issues for further cleaning.
#
# Filtering / transformation focus:
#   - Creates an error_mask based on standardized Name values, allowing the
#     dataset to be further filtered or remediated in subsequent steps.
# =============================================================================

# Paths
FILE = f'{Temp_file_path_EoC}/filtered_CompanyName.txt'  # Path to filtered company name file

VALUE_COL = "Name"  # Column to validate for completeness and correctness

# Load the dataset
df = pd.read_csv(
    FILE,
    sep="|",            # File is pipe-separated
    encoding="utf-8",   # Assume UTF-8 encoding
    dtype=str,          # Read all fields as strings for consistent processing
    keep_default_na=True  # Interpret standard NA-like tokens as NaN
)

# Display a small preview for verification
print(f"\n=== Preview of {FILE} ===")
print(df.head(), "\n")

# ID-level stats
unique_ids = df["ID"].nunique()                    # Number of distinct IDs
multi_used_ids = (df["ID"].value_counts() > 1).sum()  # IDs appearing more than once

# Prepare the Name column for validation
val = df[VALUE_COL].astype("string")       # Use pandas string type
val_stripped = val.str.strip()             # Remove surrounding whitespace

# Build a mask identifying rows with invalid or missing Name entries
error_mask = val.isna() | (
    (val_stripped == "") |
    val_stripped.str.lower().isin(
        {"na", "nan", "null", "none", "n/a", "#n/a", "n"}
    )
)

# Count problematic rows
error_rows = int(error_mask.sum())

# Output summary statistics
print(f"File: {FILE}")
print(f"Number of Unique IDs: {unique_ids}")
print(f"Number of IDs used multiple times: {multi_used_ids}")
print(f'Rows with error in "{VALUE_COL}": {error_rows}')



=== Preview of /home/jovyan/work/hpool1/pseidel/test/Temp/TempExtractionofCharacteristics/filtered_CompanyName.txt ===
          ID    PIT DATE ItemCode                             Name
0  C00948205  2021-07-09     6001   'AGRIFORCE GROWING SYSTEMS LTD
1  C02500770  1995-12-29     6001            'SEVEL ARGENTINA S.A.
2  C02500770  2000-06-02     6001  'PEUGEOT CITROEN ARGENTINA S.A.
3  C0250077A  1999-10-01     6001            'SEVEL ARGENTINA S.A.
4  C0250077A  2000-06-02     6001  'PEUGEOT CITROEN ARGENTINA S.A. 

File: /home/jovyan/work/hpool1/pseidel/test/Temp/TempExtractionofCharacteristics/filtered_CompanyName.txt
Number of Unique IDs: 121008
Number of IDs used multiple times: 54129
Rows with error in "Name": 0


### EntityType

In [51]:
# =============================================================================
# QUALITY CHECKS ON FILTERED ENTITY TYPE DATA
# =============================================================================
# This cell:
#   - Loads the filtered EntityType dataset from Temp_file_path_EoC.
#   - Computes two core ID-level metrics:
#       * Count of unique IDs present.
#       * Count of IDs that occur multiple times.
#   - Performs quality validation of the EntityType field by:
#       * Converting values to pandas' string type.
#       * Stripping whitespace to normalize text input.
#       * Marking rows as invalid if EntityType is missing, empty, or equal
#         to common placeholder/null-like tokens (e.g., "na", "null", "n/a").
#   - Reports the total number of rows where EntityType fails these checks.
#
# Filtering / transformation focus:
#   - Constructs a boolean mask (error_mask) based on transformed EntityType
#     values, enabling downstream filtering or remediation of invalid rows.
# =============================================================================

# Paths
FILE = f'{Temp_file_path_EoC}/filtered_EntityType.txt'  # Input file path for EntityType data

VALUE_COL = "EntityType"  # Column to evaluate for quality issues

# Load the dataset containing ID, PIT Date, ItemCode, and EntityType
df = pd.read_csv(
    FILE,
    sep="|",            # File is pipe-delimited
    encoding="utf-8",   # Assume UTF-8 encoding
    dtype=str,          # Load all fields as strings
    keep_default_na=True  # Treat standard NA tokens as NaN
)

# Display a preview of the loaded data
print(f"\n=== Preview of {FILE} ===")
print(df.head(), "\n")

# Compute ID-level usage stats
unique_ids = df["ID"].nunique()                     # Number of unique IDs
multi_used_ids = (df["ID"].value_counts() > 1).sum()  # IDs with more than one record

# Prepare the EntityType column for validation
val = df[VALUE_COL].astype("string")   # Convert to pandas string dtype
val_stripped = val.str.strip()         # Remove leading/trailing whitespace

# Build mask of invalid or missing EntityType entries
error_mask = val.isna() | (
    (val_stripped == "") |
    val_stripped.str.lower().isin(
        {"na", "nan", "null", "none", "n/a", "#n/a", "n"}
    )
)

# Count invalid rows
error_rows = int(error_mask.sum())

# Output summary statistics
print(f"File: {FILE}")
print(f"Number of Unique IDs: {unique_ids}")
print(f"Number of IDs used multiple times: {multi_used_ids}")
print(f'Rows with error in "{VALUE_COL}": {error_rows}')



=== Preview of /home/jovyan/work/hpool1/pseidel/test/Temp/TempExtractionofCharacteristics/filtered_EntityType.txt ===
          ID    PIT DATE ItemCode EntityType
0  C00948205  2021-07-09     6100         'C
1  C02500770  1999-10-01     6100         'C
2  C0250077A  1999-10-01     6100         'S
3  C0250077B  1999-10-01     6100         'S
4  C0250077C  1999-10-01     6100         'S 

File: /home/jovyan/work/hpool1/pseidel/test/Temp/TempExtractionofCharacteristics/filtered_EntityType.txt
Number of Unique IDs: 121000
Number of IDs used multiple times: 401
Rows with error in "EntityType": 0


### ADRIndicator

In [52]:
# =============================================================================
# QUALITY CHECKS ON FILTERED ADR INDICATOR DATA
# =============================================================================
# This cell:
#   - Loads the filtered ADRIndicator dataset from Temp_file_path_EoC.
#   - Computes two key ID-based metrics:
#       * Number of unique IDs present in the file.
#       * Number of IDs appearing more than once (multiple PIT records).
#   - Performs a quality validation of the ADRIndicator field by:
#       * Converting values to pandas' string type.
#       * Stripping whitespace to normalize the values.
#       * Identifying rows where ADRIndicator is missing, blank, or matches
#         common placeholder/null-like patterns such as "na", "null", "none".
#   - Counts the total number of invalid ADRIndicator entries and reports all
#     summary statistics.
#
# Filtering / transformation focus:
#   - Builds a boolean mask (error_mask) to identify invalid ADRIndicator values.
#   - Normalizes ADRIndicator through whitespace stripping and lowercasing
#     before applying validation conditions.
# =============================================================================

# Paths
FILE = f'{Temp_file_path_EoC}/filtered_ADRIndicator.txt'  # Path to the filtered ADRIndicator dataset

VALUE_COL = "ADRIndicator"  # Column being validated

# Load the dataset as a DataFrame
df = pd.read_csv(
    FILE,
    sep="|",            # Pipe-separated file
    encoding="utf-8",   # UTF-8 encoding
    dtype=str,          # Treat all fields as strings
    keep_default_na=True  # Convert standard NA tokens to NaN
)

# Display the first few rows for visual inspection
print(f"\n=== Preview of {FILE} ===")
print(df.head(), "\n")

# Compute ID-based usage statistics
unique_ids = df["ID"].nunique()                     # Number of distinct IDs
multi_used_ids = (df["ID"].value_counts() > 1).sum()  # Number of IDs with multiple rows

# Prepare ADRIndicator column for quality checks
val = df[VALUE_COL].astype("string")   # Convert to string dtype for uniform handling
val_stripped = val.str.strip()         # Strip whitespace for normalization

# Construct mask to identify invalid or missing ADRIndicator entries
error_mask = val.isna() | (
    (val_stripped == "") |
    val_stripped.str.lower().isin(
        {"na", "nan", "null", "none", "n/a", "#n/a", "n"}
    )
)

# Total number of problematic rows
error_rows = int(error_mask.sum())

# Output summary information
print(f"File: {FILE}")
print(f"Number of Unique IDs: {unique_ids}")
print(f"Number of IDs used multiple times: {multi_used_ids}")
print(f'Rows with error in "{VALUE_COL}": {error_rows}')



=== Preview of /home/jovyan/work/hpool1/pseidel/test/Temp/TempExtractionofCharacteristics/filtered_ADRIndicator.txt ===
          ID    PIT DATE ItemCode ADRIndicator
0  C036F63D0  2012-05-28    11503           'X
1  C036F63D0  2014-01-15    11503           N'
2  C056879S0  2018-04-23    11503           'X
3  C2461T100  2004-04-23    11503           'X
4  C2461T100  2005-06-10    11503           N' 

File: /home/jovyan/work/hpool1/pseidel/test/Temp/TempExtractionofCharacteristics/filtered_ADRIndicator.txt
Number of Unique IDs: 2297
Number of IDs used multiple times: 28
Rows with error in "ADRIndicator": 0


### Currency Code

In [53]:
# =============================================================================
# QUALITY CHECKS ON FILTERED CURRENCY CODE DATA
# =============================================================================
# This cell:
#   - Loads the filtered CurrencyCode dataset from Temp_file_path_EoC.
#   - Computes ID-level metrics:
#       * Total number of unique IDs.
#       * Number of IDs that appear in multiple rows.
#   - Performs quality validation of the CurrencyCode column by:
#       * Converting all values to pandas string type for consistent handling.
#       * Stripping whitespace to normalize the values.
#       * Flagging entries that are missing, empty, or equal to common
#         placeholder/null-like terms (e.g., "na", "null", "none").
#   - Reports how many CurrencyCode entries fail these validation rules.
#
# Filtering / transformation focus:
#   - Constructs a boolean quality mask (error_mask) from normalized text values.
#   - Enables downstream filtering or correction of invalid CurrencyCode entries.
# =============================================================================

# Paths
FILE = f'{Temp_file_path_EoC}/filtered_CurrencyCode.txt'  # Path to currency code file

VALUE_COL = "CurrencyCode"  # Column to evaluate

# Load dataset
df = pd.read_csv(
    FILE,
    sep="|",            # Pipe-delimited file
    encoding="utf-8",   # UTF-8 encoding
    dtype=str,          # Load all fields as strings
    keep_default_na=True  # Handle standard NA-like values as NaN
)

# Preview first rows of the dataset
print(f"\n=== Preview of {FILE} ===")
print(df.head(), "\n")

# Compute ID-level metrics
unique_ids = df["ID"].nunique()                     # Distinct IDs
multi_used_ids = (df["ID"].value_counts() > 1).sum()  # IDs with multiple rows

# Normalize value column for validation
val = df[VALUE_COL].astype("string")   # Convert to pandas string dtype
val_stripped = val.str.strip()         # Remove leading/trailing whitespace

# Identify missing or invalid CurrencyCode values
error_mask = val.isna() | (
    (val_stripped == "") |
    val_stripped.str.lower().isin(
        {"na", "nan", "null", "none", "n/a", "#n/a", "n"}
    )
)

# Count problematic rows
error_rows = int(error_mask.sum())

# Output summary
print(f"File: {FILE}")
print(f"Number of Unique IDs: {unique_ids}")
print(f"Number of IDs used multiple times: {multi_used_ids}")
print(f'Rows with error in "{VALUE_COL}": {error_rows}')



=== Preview of /home/jovyan/work/hpool1/pseidel/test/Temp/TempExtractionofCharacteristics/filtered_CurrencyCode.txt ===
          ID    PIT DATE ItemCode CurrencyCode
0  C00948205  2021-07-09    56027         'Usd
1  C02500770  1995-12-29    56027         'Ars
2  C0250077A  1999-10-01    56027         'Ars
3  C0250077B  1999-10-01    56027         'Ars
4  C0250077C  1999-10-01    56027         'Ars 

File: /home/jovyan/work/hpool1/pseidel/test/Temp/TempExtractionofCharacteristics/filtered_CurrencyCode.txt
Number of Unique IDs: 121007
Number of IDs used multiple times: 7395
Rows with error in "CurrencyCode": 0


### FYE

In [54]:
# =============================================================================
# QUALITY CHECKS AND ERROR HANDLING ON FILTERED FYE DATA
# =============================================================================
# This cell:
#   - Attempts to read a pre-filtered FYE dataset from Temp_file_path_EoC.
#   - Validates that the expected column (FYE) is present in the data.
#   - If available, computes:
#       * Number of unique IDs.
#       * Number of IDs with multiple rows.
#       * Number of rows where FYE is missing, empty, or a placeholder/null-like
#         value.
#   - Prints a concise summary of dataset size and FYE data quality.
#   - Handles missing file and unexpected errors with informative messages.
#
# Filtering / transformation focus:
#   - Builds an error_mask over rows based on normalized FYE values
#     (whitespace-stripped and lowercased) to identify problematic entries.
#   - Uses this mask to report how many rows have invalid FYE values, which
#     supports later filtering or correction steps.
# =============================================================================

# Paths
FILE = f'{Temp_file_path_EoC}/filtered_FYE.txt'  # Path to the filtered FYE file

VALUE_COL = "FYE"  # Name of the FYE column to validate and analyze

try:
    # Read the filtered FYE file into a DataFrame
    fye_df = pd.read_csv(
        FILE,
        sep="|",           # Pipe-separated text file
        encoding="utf-8",  # UTF-8 encoding
        dtype=str,         # Read all columns as strings
        keep_default_na=True  # Interpret default NA markers as NaN
    )

    # Show a preview of the loaded data
    print(f"\n=== Preview of {FILE} ===")
    print(fye_df.head(), "\n")

    # Check if the expected FYE column exists in the DataFrame
    if VALUE_COL not in fye_df.columns:
        print(f"Error: Column '{VALUE_COL}' not found in {FILE}.")
        print(f"Available columns are: {fye_df.columns.tolist()}")
        # No further processing if the required column is missing
    else:
        # Compute statistics on ID usage
        unique_ids = fye_df["ID"].nunique()
        multi_used_ids = (fye_df["ID"].value_counts() > 1).sum()

        # Work with the FYE column as pandas StringDtype
        val = fye_df[VALUE_COL].astype("string")

        # Strip leading and trailing whitespace from FYE values
        val_stripped = val.str.strip()

        # Build a mask for rows where FYE is considered invalid:
        #   - missing (NaN),
        #   - empty after stripping,
        #   - or equal (case-insensitive) to typical placeholder/null markers
        error_mask = val.isna() | (
            (val_stripped == "") |
            val_stripped.str.lower().isin(
                {"na", "nan", "null", "none", "n/a", "#n/a", "n"}
            )
        )

        # Count rows with invalid FYE values
        error_rows = int(error_mask.sum())

        # Print main statistics and FYE quality indicators
        print(f"File: {FILE}")
        print(f"Number of Unique IDs: {unique_ids}")
        print(f"Number of IDs used multiple times: {multi_used_ids}")
        print(f'Rows with error in "{VALUE_COL}": {error_rows}')

        # --- Summary overview ---
        # High-level summary of dataset size and FYE quality metrics
        print("\nSummary:")
        print(f"Total rows: {len(fye_df)}")
        print(f"Unique IDs: {unique_ids}")
        print(f"IDs with >1 row: {multi_used_ids}")
        print(f'Rows with initial error in "{VALUE_COL}": {error_rows}')

except FileNotFoundError:
    # Handle scenario where the FYE file has not been created or is missing
    print(f"Error: File not found at {FILE}. Please ensure the file was created in the previous steps.")
except Exception as e:
    # Catch any other unexpected exceptions and display the message
    print(f"An unexpected error occurred: {e}")



=== Preview of /home/jovyan/work/hpool1/pseidel/test/Temp/TempExtractionofCharacteristics/filtered_FYE.txt ===
          ID    PIT DATE ItemCode        FYE
0  C00948205  2021-07-09     5350  D20181231
1  C00948205  2021-07-09     5350  D20191231
2  C00948205  2021-07-09     5350  D20201231
3  C00948205  2021-07-09     5350  D20190930
4  C00948205  2021-07-09     5350  D20191231 

File: /home/jovyan/work/hpool1/pseidel/test/Temp/TempExtractionofCharacteristics/filtered_FYE.txt
Number of Unique IDs: 120948
Number of IDs used multiple times: 120564
Rows with error in "FYE": 0

Summary:
Total rows: 7162436
Unique IDs: 120948
IDs with >1 row: 120564
Rows with initial error in "FYE": 0


### Update Code

In [55]:
# =============================================================================
# QUALITY CHECKS AND UNIQUE VALUE OVERVIEW FOR UPDATE CODE DATA
# =============================================================================
# This cell:
#   - Loads the filtered UpdateCode dataset created earlier.
#   - Validates that the expected Value column (containing update codes) exists.
#   - Computes key dataset metrics:
#       * Number of unique IDs.
#       * Number of IDs appearing more than once.
#       * Number of rows with invalid or missing update code values.
#   - Extracts all unique update code values and produces a sorted list for
#     manual inspection.
#
# Filtering / transformation focus:
#   - Normalizes the Value column using pandas' string dtype and whitespace
#     stripping.
#   - Applies a validation mask that flags null-like or empty entries, enabling
#     downstream filtering or quality checks.
# =============================================================================

# Paths
FILE = f'{Temp_file_path_EoC}/filtered_UpdateCode.txt'  # Path to the filtered UpdateCode file

VALUE_COL = "Value"  # Column expected to hold the update code values

try:
    # Load the filtered update code dataset
    df = pd.read_csv(
        FILE,
        sep="|",            # File is pipe-delimited
        encoding="utf-8",   # Use UTF-8 encoding
        dtype=str,          # Load all columns as strings
        keep_default_na=True  # Interpret standard NA-like tokens as NaN
    )

    # Display a preview of the first few rows
    print(f"\n=== Preview of {FILE} ===")
    display(df.head(), "\n")

    # Ensure that the expected column exists
    if VALUE_COL not in df.columns:
        raise KeyError(
            f"Expected column '{VALUE_COL}' not found in {FILE}. "
            f"Available columns: {df.columns.tolist()}"
        )

    # Compute ID-level metrics
    unique_ids = df["ID"].nunique()                      # Number of unique entities
    multi_used_ids = (df["ID"].value_counts() > 1).sum() # Entities with multiple rows

    # Normalize the Value column for quality testing
    val = df[VALUE_COL].astype("string")  # Convert to pandas string dtype
    val_stripped = val.str.strip()        # Remove leading/trailing whitespace

    # Identify invalid update code entries:
    # - Missing values
    # - Empty strings
    # - Placeholder/null-like tokens
    error_mask = val.isna() | (
        (val_stripped == "") |
        val_stripped.str.lower().isin(
            {"na", "nan", "null", "none", "n/a", "#n/a", "n"}
        )
    )
    error_rows = int(error_mask.sum())

    # Output summary statistics
    print(f"File: {FILE}")
    print(f"Number of Unique IDs: {unique_ids}")
    print(f"Number of IDs used multiple times: {multi_used_ids}")
    print(f'Rows with error in "{VALUE_COL}": {error_rows}')

    # Extract list of unique update code values (excluding NaN)
    unique_values = df[VALUE_COL].dropna().unique().tolist()

    # Sort unique values alphabetically, placing None/NaN at the end
    unique_values.sort(key=lambda x: (x is None, x))

    print(f"\nUnique values in '{VALUE_COL}' ({len(unique_values)} total):")
    print(unique_values)

except FileNotFoundError:
    # File does not exist or was not created earlier
    print(f"Error: File not found at {FILE}.")
except Exception as e:
    # Handle any other unexpected errors
    print(f"An unexpected error occurred: {e}")



=== Preview of /home/jovyan/work/hpool1/pseidel/test/Temp/TempExtractionofCharacteristics/filtered_UpdateCode.txt ===


Unnamed: 0,ID,PIT Date,Frequency,FiscalPeriod,ItemCode,Value
0,C02500770,1995-12-29,A,1985,57034,S3
1,C02500770,1995-12-29,A,1986,57034,S3
2,C02500770,1995-12-29,A,1987,57034,S3
3,C02500770,1995-12-29,A,1988,57034,S3
4,C02500770,1995-12-29,A,1989,57034,S3


'\n'

File: /home/jovyan/work/hpool1/pseidel/test/Temp/TempExtractionofCharacteristics/filtered_UpdateCode.txt
Number of Unique IDs: 111895
Number of IDs used multiple times: 111464
Rows with error in "Value": 0

Unique values in 'Value' (9 total):
['Ns', 'S0', 'S1', 'S2', 'S3', 'S4', 'S5', 'S6', 'S7']


### SIC Code

In [56]:
# =============================================================================
# QUALITY CHECKS AND UNIQUE VALUE OVERVIEW FOR UPDATE CODE DATA
# =============================================================================
# This cell:
#   - Loads the filtered UpdateCode dataset created earlier.
#   - Validates that the expected Value column (containing update codes) exists,
#     or falls back to a suitable alternative (e.g., 'SIC').
#   - Computes key dataset metrics:
#       * Number of unique IDs.
#       * Number of IDs appearing more than once.
#       * Number of rows with invalid or missing update code values.
#   - Extracts all unique update code values and produces a sorted list for
#     manual inspection.
#
# Filtering / transformation focus:
#   - Normalizes the Value column using pandas' string dtype and whitespace
#     stripping.
#   - Applies a validation mask that flags null-like or empty entries, enabling
#     downstream filtering or quality checks.
# =============================================================================

# Paths
FILE = f'{Temp_file_path_EoC}/filtered_SIC.txt'  # Path to the filtered UpdateCode file

PREFERRED_VALUE_COL = "Value"  # Original expected column name
FALLBACK_VALUE_COLS = ["SIC", "ItemCode"]  # Fallbacks if 'Value' is not present

try:
    # Load the filtered update code dataset
    df = pd.read_csv(
        FILE,
        sep="|",              # File is pipe-delimited
        encoding="utf-8",     # Use UTF-8 encoding
        dtype=str,            # Load all columns as strings
        keep_default_na=True  # Interpret standard NA-like tokens as NaN
    )

    # Display a preview of the first few rows
    print(f"\n=== Preview of {FILE} ===")
    display(df.head(), "\n")

    # Determine which column to use as the "value" column
    if PREFERRED_VALUE_COL in df.columns:
        VALUE_COL = PREFERRED_VALUE_COL
    else:
        # Try fallbacks
        VALUE_COL = None
        for col in FALLBACK_VALUE_COLS:
            if col in df.columns:
                VALUE_COL = col
                print(
                    f"Info: Expected column '{PREFERRED_VALUE_COL}' not found. "
                    f"Using '{VALUE_COL}' as the value column instead."
                )
                break

        if VALUE_COL is None:
            # Nothing suitable found → raise a clear error
            raise KeyError(
                f"None of the candidate value columns "
                f"{[PREFERRED_VALUE_COL] + FALLBACK_VALUE_COLS} "
                f"were found in {FILE}. Available columns: {df.columns.tolist()}"
            )

    # Compute ID-level metrics
    unique_ids = df["ID"].nunique()                      # Number of unique entities
    multi_used_ids = (df["ID"].value_counts() > 1).sum() # Entities with multiple rows

    # Normalize the chosen value column for quality testing
    val = df[VALUE_COL].astype("string")  # Convert to pandas string dtype
    val_stripped = val.str.strip()        # Remove leading/trailing whitespace

    # Identify invalid update code entries:
    # - Missing values
    # - Empty strings
    # - Placeholder/null-like tokens
    error_mask = val.isna() | (
        (val_stripped == "") |
        val_stripped.str.lower().isin(
            {"na", "nan", "null", "none", "n/a", "#n/a", "n"}
        )
    )
    error_rows = int(error_mask.sum())

    # Output summary statistics
    print(f"File: {FILE}")
    print(f"Using column '{VALUE_COL}' as value column.")
    print(f"Number of Unique IDs: {unique_ids}")
    print(f"Number of IDs used multiple times: {multi_used_ids}")
    print(f'Rows with error in "{VALUE_COL}": {error_rows}')

    # Extract list of unique update code values (excluding NaN)
    unique_values = df[VALUE_COL].dropna().unique().tolist()

    # Sort unique values alphabetically, placing None/NaN at the end
    unique_values.sort(key=lambda x: (x is None, x))

    print(f"\nUnique values in '{VALUE_COL}' ({len(unique_values)} total):")
    print(unique_values)

except FileNotFoundError:
    # File does not exist or was not created earlier
    print(f"Error: File not found at {FILE}.")
except Exception as e:
    # Handle any other unexpected errors
    print(f"An unexpected error occurred: {e}")



=== Preview of /home/jovyan/work/hpool1/pseidel/test/Temp/TempExtractionofCharacteristics/filtered_SIC.txt ===


Unnamed: 0,ID,PIT DATE,ItemCode,SIC
0,C02500770,1995-12-29,7021,S3711
1,C02520200,1996-05-03,7021,S3312
2,C02520200,2001-11-30,7021,S3321
3,C02520200,2002-11-15,7021,S3317
4,C02520200,2005-04-15,7021,S3312


'\n'

Info: Expected column 'Value' not found. Using 'SIC' as the value column instead.
File: /home/jovyan/work/hpool1/pseidel/test/Temp/TempExtractionofCharacteristics/filtered_SIC.txt
Using column 'SIC' as value column.
Number of Unique IDs: 103693
Number of IDs used multiple times: 53756
Rows with error in "SIC": 0

Unique values in 'SIC' (1379 total):
['Ns', 'S0', 'S100', 'S1000', 'S1010', 'S1011', 'S1020', 'S1021', 'S1030', 'S1031', 'S1040', 'S1041', 'S1044', 'S1061', 'S1081', 'S1090', 'S1094', 'S1099', 'S110', 'S111', 'S112', 'S115', 'S116', 'S119', 'S1200', 'S1211', 'S1220', 'S1221', 'S1222', 'S1231', 'S1240', 'S1241', 'S1300', 'S131', 'S1310', 'S1311', 'S132', 'S1320', 'S1321', 'S133', 'S134', 'S1362', 'S1371', 'S1380', 'S1381', 'S1382', 'S1389', 'S139', 'S1400', 'S1410', 'S1411', 'S1420', 'S1422', 'S1423', 'S1429', 'S1440', 'S1442', 'S1446', 'S1450', 'S1455', 'S1459', 'S1470', 'S1474', 'S1475', 'S1476', 'S1479', 'S1481', 'S1490', 'S1491', 'S1499', 'S1500', 'S1520', 'S1521', 'S1522',

## 3.0 Adjustments and Cleaning

### NationCode

#### Drop ItemCode Column and Ensure Correct Spelling of NationCode

In [57]:
# Summary:
# This cell loads a pipe-delimited text file, removes and renames specific columns,
# applies targeted string cleaning to the 'NatCo' column (removing a leading 'S'
# and zero-padding when needed), sorts the dataset by ID and PIT DATE to ensure
# deterministic ordering, and finally writes the cleaned result to a new file.
# The focus of the transformation is on filtering out an unused column, renaming
# a key identifier, and standardizing the 'NatCo' values.

input_path  = f'{Temp_file_path_EoC}/filtered_NationCode.txt'
output_path_v2 = f'{Temp_file_path_EoC}/filtered_NationCode_v2.txt'

# Load the pipe-delimited input file into a DataFrame
df = pd.read_csv(input_path, sep="|", dtype=str, encoding="utf-8")

# Drop the ItemCode column if it exists, ignoring errors if it does not
if "ItemCode" in df.columns:
    df = df.drop(columns=["ItemCode"], errors="ignore")

# Rename 'NationCode' to 'NatCo' to standardize column naming,
# raising an error if the source column does not exist
if "NationCode" in df.columns:
    df = df.rename(columns={"NationCode": "NatCo"})
else:
    raise KeyError("Expected column 'NationCode' not found in input file.")

# Clean the 'NatCo' column by removing a leading 'S'
# and adding a leading zero only when the resulting string has length 2
df["NatCo"] = (
    df["NatCo"]
    .astype("string")
    .str.replace(r"^S", "", regex=True)               # Remove leading S
    .apply(lambda x: f"0{x}" if isinstance(x, str) and len(x) == 2 else x)  # Conditional zero-padding
)

# Sort the dataset by ID and PIT DATE using a stable merge sort for consistent ordering
df = df.sort_values(by=["ID", "PIT DATE"], ascending=[True, True], kind="mergesort")

# Write the cleaned and sorted DataFrame back to disk using pipe delimiters
df.to_csv(output_path_v2, sep="|", index=False, encoding="utf-8")

# Display confirmation and preview of the resulting data
print(f"Cleaned and sorted file saved to: {output_path_v2}")
print(df.head())


Cleaned and sorted file saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempExtractionofCharacteristics/filtered_NationCode_v2.txt
          ID    PIT DATE NatCo
0  C00948205  2021-07-09   840
1  C02500770  1995-12-29   025
2  C0250077A  1999-10-01   025
3  C0250077B  1999-10-01   025
4  C0250077C  1999-10-01   025


#### Get Overview of the Data Quality

In [58]:
# Summary:
# This cell reads the cleaned NationCode file and performs several analytical checks:
#   - calculates how many distinct NatCo values each ID has and identifies IDs with >1 NatCo
#   - detects IDs where NatCo is missing or contains typical invalid placeholders
#   - inspects and prints the set of unique NatCo values in the dataset
#   - prints an overall summary of row counts, unique IDs, multi-NatCo IDs, and missing NatCo IDs

input_path = f'{Temp_file_path_EoC}/filtered_NationCode_v2.txt'

# Load the cleaned, pipe-delimited file into a DataFrame, keeping all columns as strings
df = pd.read_csv(input_path, sep="|", dtype=str)

# Group by 'ID' and count how many distinct non-null NatCo values each ID has
NatCo_counts = (
    df.groupby("ID")["NatCo"]
    .nunique(dropna=True)  # count unique non-NaN NatCo values per ID
    .reset_index(name="UniqueNatCos")
)

# Filter to IDs that have more than one unique NatCo value
ids_with_multiple_countries = NatCo_counts[NatCo_counts["UniqueNatCos"] > 1]
num_multiple_NatCo_ids = len(ids_with_multiple_countries)

# Report how many IDs have multiple NatCo values and show a preview of these IDs
print(f"IDs with >1 unique NatCo: {num_multiple_NatCo_ids}")
print(ids_with_multiple_countries.head())

# Build a boolean mask to identify rows with missing or invalid NatCo values.
# The mask covers:
#   - NaN values
#   - empty strings (after stripping whitespace)
#   - common placeholder tokens like 'NA', 'NAN', 'NONE', and "n'"
missing_mask = (
    df["NatCo"].isna() |
    df["NatCo"].str.strip().eq("") |
    df["NatCo"].str.upper().isin(["NA", "NAN", "NONE", "n'"])
)

# Extract the unique IDs that have at least one row with missing or invalid NatCo
ids_with_missing_NatCo = df[missing_mask]["ID"].unique()
num_missing_NatCo_ids = len(ids_with_missing_NatCo)

# Report how many IDs are affected by missing or invalid NatCo values
print(f"\nIDs with missing/invalid NatCo: {num_missing_NatCo_ids}")

# Optionally, show some example rows where NatCo is missing or invalid
if num_missing_NatCo_ids > 0:
    print("\nExample missing NatCo entries:")
    print(df[missing_mask].head())

# Collect the set of unique non-null NatCo values in the dataset
unique_NatCo = df["NatCo"].dropna().unique()
print(f"\nUnique NatCo values ({len(unique_NatCo)} total):")
print(sorted(unique_NatCo))

# Print a compact summary of the dataset and the previous analyses
print("\nSummary:")
print(f"Total rows: {len(df)}")
print(f"Unique IDs: {df['ID'].nunique()}")
print(f"IDs with >1 NatCo: {num_multiple_NatCo_ids}")
print(f"IDs with missing NatCo: {num_missing_NatCo_ids}")
print(f"Unique NatCo values: {len(unique_NatCo)}")


IDs with >1 unique NatCo: 432
             ID  UniqueNatCos
255   C025L21C0             2
2825  C036F43X0             2
5584  C060E0000             2
5832  C07209430             2
5835  C07222660             2

IDs with missing/invalid NatCo: 0

Unique NatCo values (126 total):
['025', '036', '040', '044', '048', '050', '052', '056', '060', '068', '070', '072', '076', '092', '0Ns', '100', '116', '120', '124', '136', '152', '156', '175', '178', '182', '191', '196', '203', '208', '218', '220', '222', '233', '234', '242', '246', '250', '268', '275', '280', '288', '300', '320', '328', '340', '344', '350', '352', '356', '366', '369', '372', '376', '380', '388', '392', '398', '400', '404', '410', '414', '422', '428', '440', '442', '446', '454', '458', '470', '480', '484', '496', '499', '504', '516', '528', '554', '562', '566', '578', '582', '586', '591', '593', '597', '608', '617', '620', '634', '642', '643', '646', '682', '686', '688', '702', '703', '704', '705', '710', '724', '730', '736',

#### Get Summary Statistics of Frequency of Nations

In [59]:
# Summary:
# This cell loads the cleaned NationCode dataset and a reference table of country codes.
# It computes how frequently each NatCo value appears, merges these counts with descriptive
# country names, sorts the results in descending order of frequency, and writes a summary
# table to disk. The transformation steps focus on: counting occurrences, enriching the
# counts with metadata, and producing a sorted output for inspection.

data_path = f'{Temp_file_path_EoC}/filtered_NationCode_v2.txt'
codes_path = f'{Input_file_path}/CountryCodes.txt'
output_path = f'{Temp_file_path_EoC}/NationCode_summary.txt'

# Load the cleaned dataset containing NatCo and related fields
df = pd.read_csv(data_path, sep="|", dtype=str)

# Load the country code reference file providing descriptive names for each NatCo
df_codes = pd.read_csv(codes_path, sep="|", dtype=str)

# Count how often each NatCo value appears in the dataset,
# including NaN and empty values if present
summary = (
    df["NatCo"]
    .value_counts(dropna=False)       # count occurrences of each NatCo value
    .reset_index()                    # convert to DataFrame with column names
)
summary.columns = ["NatCo", "Count"]  # rename columns for clarity

# Merge the frequency table with the country codes reference data
# to append country names or descriptions for each NatCo
summary = summary.merge(df_codes, on="NatCo", how="left")

# Sort the merged summary so that the most frequent NatCo codes appear first
summary = summary.sort_values(by="Count", ascending=False)

# Save the summarized frequency table to disk as a pipe-delimited file
summary.to_csv(output_path, sep="|", index=False, encoding="utf-8")

# Display the output path and a preview (top 10) of the summary
print("Country summary saved to:", output_path)
print("\nSummary statistics (top 10):")
print(summary.head(10))

# Show how many distinct NatCo values exist in the summary table
print("\nTotal unique country codes:", len(summary))


Country summary saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempExtractionofCharacteristics/NationCode_summary.txt

Summary statistics (top 10):
  NatCo  Count     ImplCountry
0   840  33054   United States
1   124   8077          Canada
2   156   7701           China
3   826   6678  United Kingdom
4   392   6446           Japan
5   356   5843           India
6   036   3974       Australia
7   410   3759     South Korea
8   344   3285       Hong Kong
9   760   2905          Taiwan

Total unique country codes: 126


#### Keep Only Newest Nation and Remove "0N" Errors

In [60]:
# Summary:
# This cell loads the cleaned NationCode file, converts PIT DATE into datetime,
# sorts records so the newest entry per ID appears first, and selects exactly one
# record per ID according to the rule:
#   - Prefer the newest record where NatCo != "0Ns"
#   - If none exist, keep the newest "0Ns" record
# The cell also reports how many rows were filtered out during the selection.

input_path  = f'{Temp_file_path_EoC}/filtered_NationCode_v2.txt'
output_path = f'{Temp_file_path_EoC}/NationCodes_clean.txt'

# Load the dataset
df = pd.read_csv(input_path, sep="|", dtype=str)

# Convert PIT DATE to datetime for reliable sorting
df["PIT DATE"] = pd.to_datetime(df["PIT DATE"], errors="coerce")

# Sort newest→oldest within each ID
df = df.sort_values(by=["ID", "PIT DATE"], ascending=[True, False])

# Track initial row count
initial_row_count = len(df)

# Flag rows where NatCo is considered valid (anything except "0Ns")
df["_is_valid_natco"] = df["NatCo"] != "0Ns"

# Newest valid record per ID (if exists)
valid = df[df["_is_valid_natco"]]
valid_latest = (
    valid.groupby("ID", group_keys=False)
         .head(1)  # newest per ID
)

# Newest fallback "0Ns" record per ID (only those IDs that had no valid record)
fallback = df[~df["_is_valid_natco"]]
fallback = fallback[~fallback["ID"].isin(valid_latest["ID"])]
fallback_latest = (
    fallback.groupby("ID", group_keys=False)
            .head(1)
)

# Combine valid + fallback
df_clean = pd.concat([valid_latest, fallback_latest], ignore_index=True)

# Track counts
rows_after_selection = len(df_clean)
rows_removed = initial_row_count - rows_after_selection
valid_kept = len(valid_latest)
fallback_kept = len(fallback_latest)

# Drop helper columns
df_clean = df_clean.drop(columns=["PIT DATE", "_is_valid_natco"], errors="ignore")

# Sort output for readability
df_clean = df_clean.sort_values("ID").reset_index(drop=True)

# Save cleaned dataset
df_clean.to_csv(output_path, sep="|", index=False, encoding="utf-8")

# Display stats
print(f"Cleaned NatCo file saved to: {output_path}\n")
print("=== Filtering Statistics ===")
print(f"Total rows before filtering:   {initial_row_count:,}")
print(f"Rows kept as valid NatCo:      {valid_kept:,}")
print(f"Rows kept as fallback '0Ns':   {fallback_kept:,}")
print(f"Total rows in final output:    {rows_after_selection:,}")
print(f"Total rows removed:            {rows_removed:,}\n")

# Show sample output
print(df_clean.head())


Cleaned NatCo file saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempExtractionofCharacteristics/NationCodes_clean.txt

=== Filtering Statistics ===
Total rows before filtering:   121,852
Rows kept as valid NatCo:      121,008
Rows kept as fallback '0Ns':   0
Total rows in final output:    121,008
Total rows removed:            844

          ID NatCo
0  C00948205   840
1  C02500770   025
2  C0250077A   025
3  C0250077B   025
4  C0250077C   025


### CompanyName

#### Save only the most recent company name

In [61]:
# Summary:
# This cell loads company name data, validates required columns, and selects the most
# recent valid company name per ID. A name is considered invalid if it equals "n'"
# (case-insensitive, whitespace ignored). The logic keeps the latest non-invalid name
# based on PIT DATE, drops IDs where all names are invalid, cleans the company names,
# filters out IDs that end with a letter, and finally writes the cleaned result to file.

# Define input and output file paths
names_file       = f'{Temp_file_path_EoC}/filtered_CompanyName.txt'
names_clean_file = f'{Temp_file_path_EoC}/CompanyName_clean.txt'

# Load the names dataset using pipe delimiter; skip problematic lines instead of failing
names_df = pd.read_csv(names_file, sep="|", dtype=str, encoding="utf-8", on_bad_lines="skip")

# Validate that required columns exist; raise an error if any are missing
expected_cols = {"ID", "PIT DATE", "Name"}
missing = expected_cols - set(names_df.columns)
if missing:
    raise KeyError(f"Missing expected columns: {missing}. Found columns: {list(names_df.columns)}")

# Print the number of distinct IDs present in the raw data
initial_unique_ids = names_df["ID"].nunique()
print(f"Initial unique IDs in {names_file}: {initial_unique_ids:,}")

# Convert PIT DATE to a sortable numeric format (YYYYMMDD → integer); invalid values become NaN
names_df["PIT_DATE_num"] = pd.to_numeric(names_df["PIT DATE"], errors="coerce")

# Mark rows where Name is exactly "n'" (ignoring case and whitespace)
names_df["_is_bad_name"] = names_df["Name"].astype(str).str.strip().str.lower().eq("n'")

# Sort by ID and PIT_DATE_num so that within each ID the newest rows are last
names_df_sorted = names_df.sort_values(["ID", "PIT_DATE_num"], kind="mergesort")

# Keep only rows with valid names (i.e., Name != "n'")
valid_names = names_df_sorted[~names_df_sorted["_is_bad_name"]]

# For each ID, select the newest valid name (last row in each ID group)
names_latest = (
    valid_names
    .groupby("ID", as_index=False)
    .tail(1)
    .drop(columns=["PIT_DATE_num", "_is_bad_name"])
    .reset_index(drop=True)
)

# Print unique ID count after dropping invalid-only IDs and keeping newest valid names
unique_ids_after_latest_valid = names_latest["ID"].nunique()
print(f"Unique IDs after keeping latest valid name per ID: {unique_ids_after_latest_valid:,}")

# Rename the Name column for clarity
names_latest = names_latest.rename(columns={"Name": "CompanyName"})

# Clean the CompanyName column by removing leading apostrophes and stripping whitespace
names_latest["CompanyName"] = (
    names_latest["CompanyName"]
    .astype("string")
    .str.replace(r"^'+", "", regex=True)
    .str.strip()
)

# Retain only ID and CompanyName for the output dataset
names_latest_out = names_latest[["ID", "CompanyName"]].copy()

# Identify IDs that end with a letter and exclude them
initial_rows_before_id_filter = len(names_latest_out)
mask_ids_ending_with_letter = names_latest_out["ID"].astype(str).str[-1].str.isalpha()
names_latest_filtered_id = names_latest_out[~mask_ids_ending_with_letter].copy()
excluded_rows_id_filter = initial_rows_before_id_filter - len(names_latest_filtered_id)

print(f"Rows excluded due to ID ending with a letter: {excluded_rows_id_filter:,}")

# Write the cleaned and filtered dataset to file
names_latest_filtered_id.to_csv(names_clean_file, sep="|", index=False, encoding="utf-8")

# Show a preview and confirm save
print(names_latest_filtered_id.head())
print(f"Saved cleaned names to {names_clean_file} (rows: {len(names_latest_filtered_id):,})")


Initial unique IDs in /home/jovyan/work/hpool1/pseidel/test/Temp/TempExtractionofCharacteristics/filtered_CompanyName.txt: 121,008
Unique IDs after keeping latest valid name per ID: 121,008
Rows excluded due to ID ending with a letter: 16,947
          ID                               CompanyName
0  C00948205             AGRIFORCE GROWING SYSTEMS LTD
1  C02500770            PEUGEOT CITROEN ARGENTINA S.A.
5  C02520200  ACINDAR INDUSTRIA ARGENTINA DE ACEROS SA
8  C02520220                       ALPARGATAS S.A.I.C.
9  C02520230               ALUAR ALUMINIO ARGENTINO SA
Saved cleaned names to /home/jovyan/work/hpool1/pseidel/test/Temp/TempExtractionofCharacteristics/CompanyName_clean.txt (rows: 104,061)


### ADR

#### Merge ADRs to the Company Names to Understand DF

In [62]:
# This cell merges ADR indicator data with cleaned company names on ID.
# It also counts how many unique IDs exist inside the ADR dataset before merging.
# The full workflow includes:
# 1) Define input and output file paths for ADR indicator data and company name data.
# 2) Load both datasets as pipe-delimited text files with all columns as strings.
# 3) Compute how many unique IDs appear in the ADR dataset.
# 4) Perform a left join on "ID" to attach company names to each ADR record.
# 5) Remove the optional technical column "ItemCode" if present.
# 6) Export the cleaned merged dataset.
# 7) Print summary information including unique ADR ID count, row count, and preview.

# --- Paths ---
company_file = f'{Temp_file_path_EoC}/CompanyName_clean.txt'
adr_file     = f'{Temp_file_path_EoC}/filtered_ADRIndicator.txt'
merged_out   = f'{Temp_file_path_EoC}/filtered_ADRIndicator_v2.txt'

# --- Load datasets ---
company_df = pd.read_csv(company_file, sep="|", dtype=str, encoding="utf-8", on_bad_lines="skip")
adr_df     = pd.read_csv(adr_file,     sep="|", dtype=str, encoding="utf-8", on_bad_lines="skip")

# --- Count unique IDs in the ADR dataset ---
unique_adr_ids = adr_df['ID'].nunique()              # Number of distinct IDs in ADR data
print(f"Number of unique ADR IDs: {unique_adr_ids:,}")

# --- Merge on ID ---
test_merged_df = pd.merge(adr_df, company_df, on="ID", how="left")

# --- Drop unnecessary technical columns if they exist ---
cols_to_drop = ["ItemCode"]
existing_to_drop = [c for c in cols_to_drop if c in test_merged_df.columns]
if existing_to_drop:
    test_merged_df = test_merged_df.drop(columns=existing_to_drop)

# --- Export merged DataFrame ---
test_merged_df.to_csv(merged_out, sep="|", index=False, encoding="utf-8")

# --- Display result summary ---
print(f"Merged test dataframe saved to: {merged_out} (rows: {len(test_merged_df):,})")
print("=== Head of merged DataFrame ===")
print(test_merged_df.head(10))


Number of unique ADR IDs: 2,297
Merged test dataframe saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempExtractionofCharacteristics/filtered_ADRIndicator_v2.txt (rows: 2,337)
=== Head of merged DataFrame ===
          ID    PIT DATE ADRIndicator                  CompanyName
0  C036F63D0  2012-05-28           'X  BNK BANKING CORPORATION LTD
1  C036F63D0  2014-01-15           N'  BNK BANKING CORPORATION LTD
2  C056879S0  2018-04-23           'X      ACACIA PHARMA GROUP PLC
3  C2461T100  2004-04-23           'X        BIOTIE THERAPIES CORP
4  C2461T100  2005-06-10           N'        BIOTIE THERAPIES CORP
5  C2504O500  2010-06-30           'X   FRANCE LOCATION EQUIPEMENT
6  C2504O500  2011-06-29           N'   FRANCE LOCATION EQUIPEMENT
7  C250C9180  2010-10-20           'X                FASHION B AIR
8  C250C9180  2011-06-29           N'                FASHION B AIR
9  C344BU720  2002-08-23           'X            VIVA GOODS CO LTD


#### Keep only the newest row per ID in the merged df

In [63]:
# This cell loads a merged ADR/company dataset, parses the PIT DATE column into a comparable
# datetime representation using a flexible parser, and then selects the newest record per ID
# based on this parsed date. The result is a filtered dataset where each ID appears only once,
# representing its most recent PIT DATE entry, which is then saved to an output file.

# Paths
merged_in  = f'{Temp_file_path_EoC}/filtered_ADRIndicator_v2.txt'
latest_out = f'{Temp_file_path_EoC}/filtered_ADRIndicator_v3.txt'

# Load merged dataset as strings to avoid unwanted type inference
df = pd.read_csv(merged_in, sep="|", dtype=str, encoding="utf-8", on_bad_lines="skip")

# Function to parse PIT DATE values into datetime, handling multiple possible formats
def parse_pit_date(s: pd.Series) -> pd.Series:
    # Convert to string, strip whitespace to normalize the raw date values
    s = s.astype(str).str.strip()
    try:
        # Try using a flexible datetime parser that can handle mixed formats directly
        return pd.to_datetime(s, errors="coerce", format="mixed")
    except TypeError:
        # Fallback branch for environments/pandas versions without 'format="mixed"'
        # Create a mask for values in strict YYYYMMDD format
        mask_8 = s.str.fullmatch(r"\d{8}", na=False)
        # Initialize a datetime Series with NaT values
        dt = pd.to_datetime(pd.Series([None]*len(s)), errors="coerce")
        # Parse strictly formatted YYYYMMDD values
        dt.loc[mask_8] = pd.to_datetime(s.loc[mask_8], format="%Y%m%d", errors="coerce")
        # For remaining entries, try a more generic parser with inference
        rem = ~mask_8
        if rem.any():
            dt.loc[rem] = pd.to_datetime(s.loc[rem], errors="coerce", infer_datetime_format=True)
        # Return the combined datetime Series used for ordering by PIT DATE
        return dt

# Compute newest per ID:
# 1) Parse PIT DATE into a datetime helper column
df["_PIT_dt"] = parse_pit_date(df["PIT DATE"])

# 2) Sort by ID and PIT DATE so that within each ID, rows are ordered chronologically
df_sorted = df.sort_values(["ID", "_PIT_dt"], ascending=[True, True], kind="mergesort")

# 3) For each ID, keep the last row (newest PIT DATE) and drop the helper column
test_merged_latest_df = (
    df_sorted
    .groupby("ID", as_index=False)
    .tail(1)                          # newest row per ID due to ascending sort
    .drop(columns=["_PIT_dt"])        # remove helper datetime column
    .reset_index(drop=True)           # clean up index after filtering
)

# Save the per-ID newest-record dataset and print a small preview
test_merged_latest_df.to_csv(latest_out, sep="|", index=False, encoding="utf-8")
print(f"Saved newest-per-ID merged dataset to: {latest_out} (rows: {len(test_merged_latest_df):,})")
print(test_merged_latest_df.head(10))


Saved newest-per-ID merged dataset to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempExtractionofCharacteristics/filtered_ADRIndicator_v3.txt (rows: 2,297)
          ID    PIT DATE ADRIndicator                  CompanyName
0  C036F63D0  2014-01-15           N'  BNK BANKING CORPORATION LTD
1  C056879S0  2018-04-23           'X      ACACIA PHARMA GROUP PLC
2  C2461T100  2005-06-10           N'        BIOTIE THERAPIES CORP
3  C2504O500  2011-06-29           N'   FRANCE LOCATION EQUIPEMENT
4  C250C9180  2011-06-29           N'                FASHION B AIR
5  C344BU720  2002-10-04           N'            VIVA GOODS CO LTD
6  C344G3820  2002-10-04           N'       CHINA GAS HOLDINGS LTD
7  C392WZ700  2014-01-15           N'               MEDICINOVA INC
8  C410FVF00  2011-06-29           N'               SOFTCEN CO LTD
9  C484B3960  2009-07-23           N'      GRUPO ELEKTRA SAB DE CV


#### Clean "ADRIndicator" column and drop unnecessary columns

In [64]:
# This cell loads the latest ADRIndicator data, cleans and normalizes the ADRIndicator values,
# removes columns that are no longer needed, and then writes a compact ADR-only dataset.
#
# Main transformations and checks:
# - Reads the ADR file (pipe-delimited) as strings to preserve formats and avoid type coercion.
# - Verifies that the expected 'ADRIndicator' column exists and raises an error if missing.
# - Cleans the ADRIndicator column by:
#     * Stripping surrounding whitespace.
#     * Removing a leading apostrophe directly before an 'X' (e.g., "'X" → "X").
#     * Removing a trailing apostrophe directly after an 'N' (e.g., "N'" → "N").
# - Drops auxiliary columns ('PIT DATE', 'CompanyName') if present, keeping only the fields
#   necessary for the downstream ADR analysis.
# - Saves the cleaned dataset and prints basic diagnostics:
#     * First rows of the cleaned frame.
#     * Total count of IDs and the number of unique IDs.

# --- Paths ---
adr_in  = f'{Temp_file_path_EoC}/filtered_ADRIndicator_v3.txt'
adr_out = f'{Temp_file_path_EoC}/ADR_clean.txt'

# --- Load ADR dataset ---
# Read the ADRIndicator dataset from the pipe-delimited file, keeping all columns as strings.
# on_bad_lines="skip" ensures that malformed lines are skipped instead of raising an error.
df = pd.read_csv(adr_in, sep="|", dtype=str, encoding="utf-8", on_bad_lines="skip")

# --- Clean ADRIndicator column ---
# Ensure the required ADRIndicator column is available; abort with a clear error if not.
if "ADRIndicator" not in df.columns:
    raise KeyError("Expected column 'ADRIndicator' not found in the input file.")

# Normalize ADRIndicator values by fixing common formatting artifacts.
df["ADRIndicator"] = (
    df["ADRIndicator"]
    .astype("string")                     # work with pandas' string dtype for vectorized operations
    .str.strip()                          # remove leading and trailing whitespace
    .str.replace(r"^'+X", "X", regex=True)   # remove one or more leading apostrophes directly before X (e.g. "'X" -> "X")
    .str.replace(r"N'+$", "N", regex=True)   # remove one or more trailing apostrophes directly after N (e.g. "N'" -> "N")
)

# --- Drop unnecessary columns if they exist ---
# Define columns that are not needed anymore for the ADR-specific output.
cols_to_drop = ["PIT DATE", "CompanyName"]
# Keep only those columns from the list that are actually present in the DataFrame.
existing_to_drop = [c for c in cols_to_drop if c in df.columns]
# Drop the identified columns if there are any to remove.
if existing_to_drop:
    df = df.drop(columns=existing_to_drop)

# --- Save cleaned dataset ---
# Write the cleaned ADR dataset back to disk as a pipe-delimited file without the index column.
df.to_csv(adr_out, sep="|", index=False, encoding="utf-8")

print(f"Cleaned ADRIndicator values saved to: {adr_out}")
print(df.head(10))

# --- Count IDs ---
# Compute the total number of ID records and the number of distinct IDs in the cleaned dataset.
count_ids = len(df["ID"])
count_unique_ids = df["ID"].nunique()

# --- Print results ---
# Provide a brief summary of the ID distribution after cleaning.
print(f"Count IDs: {count_ids:,}")
print(f"Count unique IDs: {count_unique_ids:,}")


Cleaned ADRIndicator values saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempExtractionofCharacteristics/ADR_clean.txt
          ID ADRIndicator
0  C036F63D0            N
1  C056879S0            X
2  C2461T100            N
3  C2504O500            N
4  C250C9180            N
5  C344BU720            N
6  C344G3820            N
7  C392WZ700            N
8  C410FVF00            N
9  C484B3960            N
Count IDs: 2,297
Count unique IDs: 2,297


### SIC

#### Clean SIC Row, i.e., Remove Ns and the "S" Prefix

In [65]:
import pandas as pd

# =============================================================================
# SUMMARY — FILTERED SIC PROCESSING
# =============================================================================
# This script:
#   1. Loads the file: filtered_SIC.txt
#   2. Cleans the SIC column by:
#        - Removing leading 'S' (e.g., S1000 → 1000)
#        - Removing rows where SIC is:
#            * "Ns"
#            * Empty
#            * Non-numeric
#   3. Drops the ItemCode column entirely
#   4. Saves the cleaned dataset as: filtered_SIC_v2.txt
#   5. Prints:
#        - Before/after row counts
#        - Number of removed rows
#        - Number of unique IDs in the final dataset
#        - A preview of the cleaned data
# =============================================================================


# Input / Output paths
INPUT_FILE = f'{Temp_file_path_EoC}/filtered_SIC.txt'
OUTPUT_FILE = f'{Temp_file_path_EoC}/filtered_SIC_v2.txt'

# -----------------------------------------------------------------------------
# Load the file
# -----------------------------------------------------------------------------
df = pd.read_csv(
    INPUT_FILE,
    sep="|",
    encoding="utf-8",
    dtype=str,
    keep_default_na=True
)

print("\n=== Original Preview ===")
display(df.head())

# -----------------------------------------------------------------------------
# 1. Remove leading 'S' from SIC column
# -----------------------------------------------------------------------------
df["SIC"] = df["SIC"].astype("string").str.strip()
df["SIC"] = df["SIC"].str.lstrip("S")

# -----------------------------------------------------------------------------
# 2. Remove invalid SIC values:
#    - "Ns"
#    - Empty
#    - Non-numeric
# -----------------------------------------------------------------------------
valid_mask = (
    df["SIC"].notna() &
    (df["SIC"].str.strip() != "") &
    (df["SIC"].str.lower() != "ns") &
    (df["SIC"].str.isnumeric())
)

df_clean = df[valid_mask].copy()

# -----------------------------------------------------------------------------
# 3. Drop ItemCode column
# -----------------------------------------------------------------------------
if "ItemCode" in df_clean.columns:
    df_clean.drop(columns=["ItemCode"], inplace=True)
    print("Column 'ItemCode' dropped.")
else:
    print("Column 'ItemCode' not found — nothing to drop.")

# -----------------------------------------------------------------------------
# 4. Save cleaned data
# -----------------------------------------------------------------------------
df_clean.to_csv(
    OUTPUT_FILE,
    sep="|",
    index=False,
    encoding="utf-8"
)

# -----------------------------------------------------------------------------
# 5. Summary + Unique ID count
# -----------------------------------------------------------------------------
rows_before = len(df)
rows_after = len(df_clean)
removed_rows = rows_before - rows_after
unique_ids_final = df_clean["ID"].nunique()

print(f"\nRows before cleaning: {rows_before}")
print(f"Rows after cleaning:  {rows_after}")
print(f"Removed rows:         {removed_rows}")
print(f"Unique IDs (final):   {unique_ids_final}")
print(f"Saved to: {OUTPUT_FILE}")

print("\n=== Cleaned Preview ===")
display(df_clean.head())



=== Original Preview ===


Unnamed: 0,ID,PIT DATE,ItemCode,SIC
0,C02500770,1995-12-29,7021,S3711
1,C02520200,1996-05-03,7021,S3312
2,C02520200,2001-11-30,7021,S3321
3,C02520200,2002-11-15,7021,S3317
4,C02520200,2005-04-15,7021,S3312


Column 'ItemCode' dropped.

Rows before cleaning: 227315
Rows after cleaning:  226463
Removed rows:         852
Unique IDs (final):   103693
Saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempExtractionofCharacteristics/filtered_SIC_v2.txt

=== Cleaned Preview ===


Unnamed: 0,ID,PIT DATE,SIC
0,C02500770,1995-12-29,3711
1,C02520200,1996-05-03,3312
2,C02520200,2001-11-30,3321
3,C02520200,2002-11-15,3317
4,C02520200,2005-04-15,3312


#### Remove Duplicates Entirely When SIC Between 6000-7000 (Financial Firms) or >= 9000 (Public Administration & Non-classifiable)

In [66]:
import pandas as pd

# =============================================================================
# SUMMARY — FILTERED SIC PROCESSING (VERSION 3)
# =============================================================================
# This script:
#   1. Loads the file: filtered_SIC_v2.txt
#   2. Converts SIC to numeric (safely).
#   3. Identifies "bad" SIC values:
#        - 6000 <= SIC <= 7000  (inclusive)
#        - SIC >= 9000
#   4. Finds all IDs that have at least one "bad" SIC value.
#   5. Removes *all rows* for those IDs from the dataset.
#   6. Saves:
#        - Cleaned dataset as: filtered_SIC_v3.txt
#        - Dropped IDs + their rows as: Dropped_SIC.txt
#   7. Prints:
#        - Rows before/after
#        - Unique IDs before/after
#        - Number of unique IDs dropped due to the SIC rule
# =============================================================================

# Input / Output paths
INPUT_FILE = f'{Temp_file_path_EoC}/filtered_SIC_v2.txt'
OUTPUT_FILE = f'{Temp_file_path_EoC}/filtered_SIC_v3.txt'
DROPPED_FILE = f'{Temp_file_path_EoC}/Dropped_SIC.txt'

# -----------------------------------------------------------------------------
# 1. Load v2
# -----------------------------------------------------------------------------
df_v2 = pd.read_csv(
    INPUT_FILE,
    sep="|",
    encoding="utf-8",
    dtype=str,
    keep_default_na=True
)

print("\n=== Preview of filtered_SIC_v2.txt ===")
display(df_v2.head())

rows_before = len(df_v2)
unique_ids_before = df_v2["ID"].nunique()

# -----------------------------------------------------------------------------
# 2. Ensure SIC is numeric
# -----------------------------------------------------------------------------
sic_numeric = pd.to_numeric(df_v2["SIC"], errors="coerce")

# -----------------------------------------------------------------------------
# 3. Row-level mask: identify "bad" SIC values
#    - 6000 <= SIC <= 7000
#    - SIC >= 9000
# -----------------------------------------------------------------------------
bad_sic_mask = ((sic_numeric >= 6000) & (sic_numeric <= 7000)) | (sic_numeric >= 9000)

# IDs that have at least one "bad" SIC
ids_with_bad_sic = df_v2.loc[bad_sic_mask, "ID"].unique()
num_ids_dropped_due_to_sic = len(ids_with_bad_sic)

# -----------------------------------------------------------------------------
# 4. Create dropped dataset (FULL rows for dropped IDs)
# -----------------------------------------------------------------------------
df_dropped = df_v2[df_v2["ID"].isin(ids_with_bad_sic)].copy()

# -----------------------------------------------------------------------------
# 5. Remove ALL rows for those IDs from main dataset
# -----------------------------------------------------------------------------
df_v3 = df_v2[~df_v2["ID"].isin(ids_with_bad_sic)].copy()

rows_after = len(df_v3)
unique_ids_after = df_v3["ID"].nunique()

# -----------------------------------------------------------------------------
# 6. Save outputs
# -----------------------------------------------------------------------------
df_v3.to_csv(
    OUTPUT_FILE,
    sep="|",
    index=False,
    encoding="utf-8"
)

df_dropped.to_csv(
    DROPPED_FILE,
    sep="|",
    index=False,
    encoding="utf-8"
)

# -----------------------------------------------------------------------------
# 7. Summary
# -----------------------------------------------------------------------------
print("\n=== FILTERING SUMMARY (V3) ===")
print(f"Rows before filtering:              {rows_before}")
print(f"Rows after filtering:               {rows_after}")
print(f"Rows removed:                       {rows_before - rows_after}")
print(f"Unique IDs before filtering:        {unique_ids_before}")
print(f"Unique IDs after filtering:         {unique_ids_after}")
print(f"Unique IDs dropped (SIC rule):      {num_ids_dropped_due_to_sic}")
print(f"Saved cleaned file to:              {OUTPUT_FILE}")
print(f"Saved dropped IDs file to:          {DROPPED_FILE}")

print("\n=== Preview of filtered_SIC_v3.txt ===")
display(df_v3.head())

print("\n=== Preview of Dropped_SIC.txt ===")
display(df_dropped.head())



=== Preview of filtered_SIC_v2.txt ===


Unnamed: 0,ID,PIT DATE,SIC
0,C02500770,1995-12-29,3711
1,C02520200,1996-05-03,3312
2,C02520200,2001-11-30,3321
3,C02520200,2002-11-15,3317
4,C02520200,2005-04-15,3312



=== FILTERING SUMMARY (V3) ===
Rows before filtering:              226463
Rows after filtering:               160946
Rows removed:                       65517
Unique IDs before filtering:        103693
Unique IDs after filtering:         77573
Unique IDs dropped (SIC rule):      26120
Saved cleaned file to:              /home/jovyan/work/hpool1/pseidel/test/Temp/TempExtractionofCharacteristics/filtered_SIC_v3.txt
Saved dropped IDs file to:          /home/jovyan/work/hpool1/pseidel/test/Temp/TempExtractionofCharacteristics/Dropped_SIC.txt

=== Preview of filtered_SIC_v3.txt ===


Unnamed: 0,ID,PIT DATE,SIC
0,C02500770,1995-12-29,3711
1,C02520200,1996-05-03,3312
2,C02520200,2001-11-30,3321
3,C02520200,2002-11-15,3317
4,C02520200,2005-04-15,3312



=== Preview of Dropped_SIC.txt ===


Unnamed: 0,ID,PIT DATE,SIC
12,C02520250,1997-10-31,6021
13,C02520250,2010-08-12,6029
14,C02520260,1997-08-29,6021
15,C02520260,2010-06-18,6029
23,C02520290,1995-12-29,2911


#### Get Unique IDs from SIC Filter

In [67]:
import pandas as pd

# =============================================================================
# SUMMARY — CREATE CLEAN UNIQUE SIC ID FILE
# =============================================================================
# This script:
#   1. Loads the file: filtered_SIC_v3.txt
#   2. Extracts all UNIQUE IDs only
#   3. Sorts the IDs alphabetically
#   4. Drops all other columns
#   5. Saves the result as: SIC_clean.txt
#   6. Prints a short summary and preview
# =============================================================================

# Input / Output paths
INPUT_FILE = f'{Temp_file_path_EoC}/filtered_SIC_v3.txt'
OUTPUT_FILE = f'{Temp_file_path_EoC}/SIC_clean.txt'

# -----------------------------------------------------------------------------
# 1. Load v3
# -----------------------------------------------------------------------------
df_v3 = pd.read_csv(
    INPUT_FILE,
    sep="|",
    encoding="utf-8",
    dtype=str,
    keep_default_na=True
)

print("\n=== Preview of filtered_SIC_v3.txt ===")
display(df_v3.head())

rows_before = len(df_v3)
unique_ids_before = df_v3["ID"].nunique()

# -----------------------------------------------------------------------------
# 2. Extract, deduplicate, and sort unique IDs
# -----------------------------------------------------------------------------
df_clean_ids = (
    df_v3[["ID"]]
    .drop_duplicates()
    .sort_values(by="ID")
    .reset_index(drop=True)
)

rows_after = len(df_clean_ids)

# -----------------------------------------------------------------------------
# 3. Save clean ID file
# -----------------------------------------------------------------------------
df_clean_ids.to_csv(
    OUTPUT_FILE,
    sep="|",
    index=False,
    encoding="utf-8"
)

# -----------------------------------------------------------------------------
# 4. Summary
# -----------------------------------------------------------------------------
print("\n=== SIC CLEAN FILE SUMMARY ===")
print(f"Rows in v3 file:          {rows_before}")
print(f"Unique IDs in v3 file:   {unique_ids_before}")
print(f"Rows written to clean:   {rows_after}")
print(f"Saved clean ID file to:  {OUTPUT_FILE}")

print("\n=== Preview of SIC_clean.txt ===")
display(df_clean_ids.head())



=== Preview of filtered_SIC_v3.txt ===


Unnamed: 0,ID,PIT DATE,SIC
0,C02500770,1995-12-29,3711
1,C02520200,1996-05-03,3312
2,C02520200,2001-11-30,3321
3,C02520200,2002-11-15,3317
4,C02520200,2005-04-15,3312



=== SIC CLEAN FILE SUMMARY ===
Rows in v3 file:          160946
Unique IDs in v3 file:   77573
Rows written to clean:   77573
Saved clean ID file to:  /home/jovyan/work/hpool1/pseidel/test/Temp/TempExtractionofCharacteristics/SIC_clean.txt

=== Preview of SIC_clean.txt ===


Unnamed: 0,ID
0,C02500770
1,C02520200
2,C02520220
3,C02520230
4,C02520240


### ID

#### Overview of Unique IDs

In [68]:
# This cell reads the large fundamentals file in chunks and extracts all unique IDs.
# It performs a filtering operation by selecting only the "ID" column during loading,
# reducing memory footprint and ensuring the code processes only relevant data.
# The transformation involves accumulating unique IDs across all chunks and
# writing the deduplicated list into a text file for downstream processing.

output_path = f'{Temp_file_path_EoC}/filtered_ids.txt'

# Define column names so the CSV parser assigns correct labels
column_names = ["ID", "PIT Date", "Frequency", "FiscalPeriod", "ItemCode", "Value"]

# Number of rows to load per chunk. This helps handle large files efficiently.
chunk_size = 1_000_000

# Set to store unique IDs without duplicates
unique_ids = set()

# Read the large dataset in chunks, selecting only the ID column for efficiency
for chunk in pd.read_csv(
    Fundamentals_clean_file_path,
    sep="|",                   # Pipe-separated file
    header=0,                 # Use the first row as header
    names=column_names,       # Assign predefined column names
    usecols=["ID"],           # Load only the ID column to reduce memory usage
    dtype={"ID": "string"},   # Ensure IDs are treated as strings
    chunksize=chunk_size,     # Load in controlled chunk sizes
    engine="python"           # Use Python engine for flexibility with separators
):
    # Extract unique IDs in the current chunk, drop missing values, and update the global set
    unique_ids.update(chunk["ID"].dropna().unique())

# Print the number of unique IDs collected
print("Unique IDs:", len(unique_ids))

# Write all unique IDs into the output text file in sorted order
with open(output_path, "w") as f:
    for uid in sorted(unique_ids):
        f.write(f"{uid}\n")

print(f"Unique IDs saved to: {output_path}")


Unique IDs: 120815
Unique IDs saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempExtractionofCharacteristics/filtered_ids.txt


#### Check Invalid IDs => Only 1 => Fixed Manually and Considered as Company (WSID = C00948205, i.e., CUSIP of the Company for Some Reason)

In [69]:
# This cell validates structural quality of previously extracted IDs.
# It performs filtering logic by checking each ID against multiple rules:
# - must be alphanumeric
# - must be exactly 9 characters long
# - must start with the letter 'C'
# - must not end with digits 1–9
# The transformation step assigns error categories to IDs that fail any rule,
# aggregates these issues, produces summary statistics, and writes all invalid
# IDs with error reasons to a text file for downstream analysis.

# Define the path to the file containing unique IDs and the output file for invalid IDs
input_path = f'{Temp_file_path_EoC}/filtered_ids.txt'
output_path = f"{Temp_file_path_EoC}/error_ids.txt"

# Initialize counters for different types of errors found during validation
count_ending_1to9 = 0        # IDs ending with digits 1–9
count_wrong_length = 0       # IDs not exactly 9 characters long
count_not_starting_C = 0     # IDs not starting with 'C'
count_non_alnum = 0          # IDs containing non-alphanumeric characters
total_count = 0              # Total number of IDs processed

# List to store invalid IDs together with the reason(s) they failed validation
error_ids = []

# Read and validate each ID from the input file
with open(input_path, "r", encoding="utf-8") as f:
    for line in f:
        id_str = line.strip()  # Remove newline and surrounding whitespace

        # Skip empty or whitespace-only lines
        if not id_str:
            continue

        # Remove potential UTF-8 BOM prefix if present
        if id_str.startswith("\ufeff"):
            id_str = id_str.lstrip("\ufeff")

        # Count every processed ID
        total_count += 1

        # Apply validation checks to the current ID
        has_non_alnum = not id_str.isalnum()                    # Contains characters other than letters/digits
        wrong_length = (len(id_str) != 9)                       # Must be exactly 9 characters long
        not_starting_C = not id_str.startswith("C")             # Must begin with 'C'
        ends_with_1to9 = id_str[-1].isdigit() and id_str[-1] != "0"  # Ending digit must not be 1–9

        # Collect error reasons for this ID
        reasons = []

        # Record each failed rule and increment corresponding count
        if has_non_alnum:
            reasons.append("non-alphanumeric chars")
            count_non_alnum += 1
        if wrong_length:
            reasons.append(f"length {len(id_str)} ≠ 9")
            count_wrong_length += 1
        if not_starting_C:
            reasons.append("does not start with 'C'")
            count_not_starting_C += 1
        if ends_with_1to9:
            reasons.append("ends with 1–9")
            count_ending_1to9 += 1

        # If there are any validation issues, store this ID and its reason(s)
        if reasons:
            error_ids.append((id_str, ", ".join(reasons)))

# Print aggregated validation statistics for quick overview
print(f"Total IDs: {total_count}")
print(f"Non-alphanumeric: {count_non_alnum}")
print(f"Wrong length (!= 9): {count_wrong_length}")
print(f"Not starting with 'C': {count_not_starting_C}")
print(f"Ending with 1–9: {count_ending_1to9}")
print(f"Total error IDs: {len(error_ids)}")
print(f"Percentage: {len(error_ids) / total_count * 100:.2f}%")

# Save all invalid IDs including error reasons into a separate output file
with open(output_path, "w", encoding="utf-8") as out_file:
    for uid, reason in error_ids:
        out_file.write(f"{uid}\t{reason}\n")

print(f"Saved {len(error_ids)} error IDs to: {output_path}")

# Show a few example invalid IDs for inspection
for uid, reason in error_ids[:10]:
    print(f"Example → {uid} : {reason}")


Total IDs: 120815
Non-alphanumeric: 0
Wrong length (!= 9): 0
Not starting with 'C': 0
Ending with 1–9: 1
Total error IDs: 1
Percentage: 0.00%
Saved 1 error IDs to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempExtractionofCharacteristics/error_ids.txt
Example → C00948205 : ends with 1–9


#### Get Number of Companies (Ending with 0 or 5 - 5 is an Error Name but Data is True)

In [70]:
# This cell filters the list of previously extracted IDs based on a specific pattern:
# it selects only those IDs that end with either '0' or '5'. This acts as a
# targeted filtering step to isolate a subset of IDs with a particular suffix rule.
# The transformation consists of collecting matching IDs, counting them, reporting
# proportions, and writing the filtered list to a new text file.

# Define input file containing all unique IDs and output file for filtered results
input_path = f'{Temp_file_path_EoC}/filtered_ids.txt'
output_path = f'{Temp_file_path_EoC}/filtered_company_ids.txt'

# Initialize counters
count_ending_0_or_5 = 0     # Number of IDs ending with 0 or 5
total_count = 0             # Total number of IDs processed

# List to store IDs that match the filter condition
ids_with_0_or_5 = []

# Read and evaluate each ID from the input file
with open(input_path, "r") as f:
    for line in f:
        id_str = line.strip()          # Remove whitespace and newline

        # Skip empty lines
        if id_str:
            total_count += 1           # Count each valid line

            # Check whether the ID ends with '0' or '5'
            if id_str.endswith(("0", "5")):
                count_ending_0_or_5 += 1
                ids_with_0_or_5.append(id_str)

# Print summary statistics of the filtering operation
print(f"Total IDs: {total_count}")
print(f"IDs ending with '0' or '5': {count_ending_0_or_5}")
print(f"Percentage: {count_ending_0_or_5 / total_count * 100:.2f}%")

# Save the filtered set of IDs into the output file
with open(output_path, "w") as out_file:
    for uid in ids_with_0_or_5:
        out_file.write(uid + "\n")


Total IDs: 120815
IDs ending with '0' or '5': 103963
Percentage: 86.05%


#### Get Number of Securities (Ending with Letter)

In [71]:
# This cell filters the list of IDs by selecting only those whose final character
# is an alphabetic letter (A–Z or a–z). This filtering step isolates IDs that
# follow a letter-suffix pattern and separates them into a dedicated output file.
# The transformation includes scanning each ID, checking its ending character,
# counting how many match the rule, calculating proportions, and writing all
# matching IDs to disk for downstream categorization.

# Define input path with all IDs and output path for the filtered subset
input_path = f'{Temp_file_path_EoC}/filtered_ids.txt'
output_path = f"{Temp_file_path_EoC}/filtered_security_ids.txt"

# Precompute the full set of alphabetic characters for quick membership checks
letters = set(string.ascii_letters)

# Initialize counters for summary statistics
total_count = 0               # Total number of IDs processed
count_ending_letter = 0       # Number of IDs ending with a letter

# Store IDs that meet the condition
ids_with_letter = []

# Process the input file line by line
with open(input_path, "r") as f:
    for line in f:
        id_str = line.strip()             # Remove surrounding whitespace and newline

        # Skip blank or whitespace-only lines
        if id_str:
            total_count += 1              # Count each valid ID

            # Check whether the last character is a letter
            if id_str[-1] in letters:
                count_ending_letter += 1
                ids_with_letter.append(id_str)

# Print summary of the filtering operation
print(f"Total IDs: {total_count}")
print(f"IDs ending with a letter: {count_ending_letter}")
print(f"Percentage: {count_ending_letter / total_count * 100:.2f}%")

# Write all matching IDs to the designated output file
with open(output_path, "w") as out_file:
    for uid in ids_with_letter:
        out_file.write(uid + "\n")


Total IDs: 120815
IDs ending with a letter: 16852
Percentage: 13.95%


#### Check, for Completeness

In [72]:
# This line performs a consistency check ensuring that all IDs have been
# fully partitioned into three categories:
# - IDs with validation errors
# - IDs ending with letters
# - IDs ending with 0 or 5
# The transformation here is a logical verification: it checks whether the sum
# of these three groups equals the total number of processed IDs. If the result
# is True, all IDs were accounted for without overlap or omission.

# Compare total items in all categorized groups against the total processed count
len(error_ids) + count_ending_letter + count_ending_0_or_5 == total_count


False

In [73]:
# This line performs a consistency check ensuring that all IDs have been
# fully partitioned into three categories:
# - IDs with validation errors
# - IDs ending with letters
# - IDs ending with 0 or 5
# The transformation here is a logical verification: it checks whether the sum
# of these three groups equals the total number of processed IDs. If the result
# is True, all IDs were accounted for without overlap or omission.

count_ending_letter + count_ending_0_or_5 == total_count

True

#### Verification of Implied Nation Code from the WSID through Match with True Country. Nation Code used for True Country as Country Name from the Current File is not Clean, i.e., Values like "Germany" and "Germayn" are Mixed up. Securities are Excluded.

##### Extract Implied Nation Code (ImplNatCo)

In [74]:
# This cell enriches the list of company-related IDs by extracting a specific
# substring (characters 2–4) from each ID. This acts as a transformation step
# that derives a new feature called "ImplNatCo". The code reads the IDs,
# performs substring extraction, builds a DataFrame pairing each ID with its
# derived value, and writes the result to a delimited text file for further
# analysis or categorization.

# Paths for input file (raw IDs) and output file (enhanced with substring field)
FILE = f'{Temp_file_path_EoC}/filtered_company_ids.txt'
OUTPUT = f'{Temp_file_path_EoC}/filtered_company_ids_v2.txt'

# Read all non-empty lines from the input file, stripping whitespace
with open(FILE, "r", encoding="utf-8") as f:
    ids = [line.strip() for line in f if line.strip()]

# Extract characters at index positions 1–3 (2nd to 4th character)
# If the ID is too short, return an empty string as fallback
impl_nat_co = [id_str[1:4] if len(id_str) >= 4 else "" for id_str in ids]

# Create DataFrame linking each ID with its derived substring
df = pd.DataFrame({
    "ID": ids,
    "ImplNatCo": impl_nat_co
})

# Save the result as a pipe-separated file for clarity and compatibility
df.to_csv(OUTPUT, sep="|", index=False, encoding="utf-8")

# Print output location and show the first few rows for inspection
print(f"File saved to: {OUTPUT}")
print("Preview:")
print(df.head())


File saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempExtractionofCharacteristics/filtered_company_ids_v2.txt
Preview:
          ID ImplNatCo
0  C00948205       009
1  C02500770       025
2  C02520200       025
3  C02520220       025
4  C02520230       025


#### Merge with Official Nation Codes from Current File
#### Comment:
#### NatCo 009 Wrong => Checked and Changed to Canada (Place of Listing)
#### NatCo 288 Wrong => Checked and Changed to Ghana (Place of Listing)
#### NatCo 466 Wrong => Checked and Changed to Macau (Place of Listing)

In [75]:
# This cell combines two datasets to validate whether the derived ImplNatCo codes
# match the official NatCo values. It performs several data transformations:
# - Cleans column names and ID fields
# - Merges the two datasets on the ID column
# - Compares the ImplNatCo substring with the NatCo column
# - Classifies each row as match or mismatch
# - Outputs both the full merged data and a separate file containing only mismatches
# This workflow applies filtering by selecting mismatched rows and transformation
# by deriving new comparison fields.

# Define input and output file paths
unique_ids_path = f'{Temp_file_path_EoC}/filtered_company_ids_v2.txt'
natco_path = f"{Temp_file_path_EoC}/NationCodes_clean.txt"
output_path = f'{Temp_file_path_EoC}/filtered_company_ids_v3.txt'
mismatch_output = f'{Temp_file_path_EoC}/filtered_company_NatCoMissmatch_ids.txt'

# Load the ID dataset and the NatCo reference data
df_unique = pd.read_csv(unique_ids_path, sep="|", dtype=str)
df_natco_country = pd.read_csv(natco_path, sep="|", dtype=str)

# Clean column names and values in both DataFrames
for df in [df_unique, df_natco_country]:
    df.columns = df.columns.str.strip().str.replace("\ufeff", "")   # Normalize column names
    df["ID"] = df["ID"].astype(str).str.strip()                     # Ensure clean, string-based IDs

# Merge the two datasets on the ID column using a left join
df_merged = df_unique.merge(df_natco_country, on="ID", how="left")

# Add a column indicating whether ImplNatCo matches NatCo after trimming
df_merged["NatCo_Match"] = df_merged.apply(
    lambda row: "Yes" if str(row.get("ImplNatCo", "")).strip() == str(row.get("NatCo", "")).strip() else "No",
    axis=1
)

# Compute summary counts for matching and non-matching NatCo values
match_count = (df_merged["NatCo_Match"] == "Yes").sum()
mismatch_count = (df_merged["NatCo_Match"] == "No").sum()

# Print summary statistics
print("Merge complete.")
print(f"Matching NatCo values: {match_count}")
print(f"Non-matching NatCo values: {mismatch_count}")
print(f"Total rows checked: {len(df_merged)}")

# Filter rows where the ImplNatCo and NatCo do not match
df_mismatch = df_merged[df_merged["NatCo_Match"] == "No"][["ID", "ImplNatCo", "NatCo"]]

# Save mismatched IDs to a separate file
df_mismatch.to_csv(mismatch_output, sep="|", index=False, encoding="utf-8")
print(f"Mismatched NatCo IDs saved to: {mismatch_output}")

# Save the full merged DataFrame including comparison results
df_merged.to_csv(output_path, sep="|", index=False, encoding="utf-8")
print(f"\nFull merged file saved to: {output_path}")

# Show the first few rows of the merged dataset for review
print(df_merged.head())


Merge complete.
Matching NatCo values: 103962
Non-matching NatCo values: 1
Total rows checked: 103963
Mismatched NatCo IDs saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempExtractionofCharacteristics/filtered_company_NatCoMissmatch_ids.txt

Full merged file saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempExtractionofCharacteristics/filtered_company_ids_v3.txt
          ID ImplNatCo NatCo NatCo_Match
0  C00948205       009   840          No
1  C02500770       025   025         Yes
2  C02520200       025   025         Yes
3  C02520220       025   025         Yes
4  C02520230       025   025         Yes


#### Drop Unnecessary Columns

In [76]:
# This cell cleans the merged dataset by removing columns related to the
# ImplNatCo–NatCo comparison. It performs a filtering operation that drops
# the columns "ImplNatCo", "NatCo", and "NatCo_Match" if they exist.
# The transformation step produces a simplified DataFrame containing only
# the essential ID fields, which is then written to a new file.

# Paths for the input dataset (v3) and the cleaned output dataset (v4)
unique_ids_file = f'{Temp_file_path_EoC}/filtered_company_ids_v3.txt'
output_file     = f'{Temp_file_path_EoC}/filtered_company_ids_v4.txt'

# Load the dataset containing enriched ID information
unique_ids_v3 = pd.read_csv(unique_ids_file, sep="|", dtype=str, encoding="utf-8", on_bad_lines="skip")

# Columns intended for removal, typically derived fields from earlier processing
cols_to_drop = ["ImplNatCo", "NatCo", "NatCo_Match"]

# Identify which of these columns are actually present in the DataFrame
existing_cols = [c for c in cols_to_drop if c in unique_ids_v3.columns]

# Drop the existing unwanted columns and create a clean copy
ID_clean = unique_ids_v3.drop(columns=existing_cols, errors="ignore").copy()

# Save the cleaned DataFrame as a pipe-separated file
ID_clean.to_csv(output_file, sep="|", index=False, encoding="utf-8")

# Print summary information and preview of the cleaned dataset
print(f"DataFrame saved to: {output_file} (rows: {len(ID_clean):,}, columns dropped: {existing_cols})")
print("=== Head of DF ===")
print(ID_clean.head(10))


DataFrame saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempExtractionofCharacteristics/filtered_company_ids_v4.txt (rows: 103,963, columns dropped: ['ImplNatCo', 'NatCo', 'NatCo_Match'])
=== Head of DF ===
          ID
0  C00948205
1  C02500770
2  C02520200
3  C02520220
4  C02520230
5  C02520240
6  C02520250
7  C02520260
8  C02520280
9  C02520290


#### Extract IDs from Datastream to Identify Overlap/Feasible Mapping

In [77]:
# =============================================================================
# CELL SUMMARY
# -----------------------------------------------------------------------------
# This cell extracts all unique IDs found across three cleaned datasets located
# in Temp_file_path_GO:
#   - ID_mapping_clean.txt
#   - MV_clean.txt
#   - TRI_clean.txt
#
# It computes the INTERSECTION (only IDs present in ALL THREE FILES),
# not the union, and writes the result as Mappaple_DS_IDs.txt
# into Temp_file_path_EoC.
# =============================================================================

from pathlib import Path
import pandas as pd

# -----------------------------------------------------------------------------
# 1) Use your existing paths without overwriting them
# -----------------------------------------------------------------------------

# Input files in GO
id_mapping_path = Path(Temp_file_path_GO) / "ID_mapping_clean.txt"
mv_clean_path   = Path(Temp_file_path_GO) / "MV_clean.txt"
tri_clean_path  = Path(Temp_file_path_GO) / "TRI_clean.txt"

# Output file goes to EoC
mappable_ids_path = Path(Temp_file_path_EoC) / "Mappaple_DS_IDs.txt"


# -----------------------------------------------------------------------------
# 2) Helper (RAM-light)
# -----------------------------------------------------------------------------
def get_unique_ids(file_path: Path) -> set:
    if not file_path.exists():
        print(f"Warning: missing file {file_path}")
        return set()
    try:
        df = pd.read_csv(
            file_path,
            sep="|",
            usecols=["ID"],
            dtype=str,
            low_memory=True
        )
        return set(df["ID"].dropna())
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return set()


# -----------------------------------------------------------------------------
# 3) Load ID sets from the three files
# -----------------------------------------------------------------------------
ids_mapping = get_unique_ids(id_mapping_path)
ids_mv      = get_unique_ids(mv_clean_path)
ids_tri     = get_unique_ids(tri_clean_path)

print(f"Unique IDs (mapping): {len(ids_mapping):,}")
print(f"Unique IDs (MV):      {len(ids_mv):,}")
print(f"Unique IDs (TRI):     {len(ids_tri):,}")

# -----------------------------------------------------------------------------
# 4) Compute the INTERSECTION instead of the union
# -----------------------------------------------------------------------------
all_ids = ids_mapping & ids_mv & ids_tri

print(f"\nTotal unique IDs IN ALL THREE files (intersection): {len(all_ids):,}")

# -----------------------------------------------------------------------------
# 5) Save without creating a big DataFrame (RAM friendly)
# -----------------------------------------------------------------------------
with mappable_ids_path.open("w", encoding="utf-8") as f_out:
    f_out.write("ID\n")
    for id_val in all_ids:
        if pd.notna(id_val):
            f_out.write(f"{id_val}\n")

print(f"\nSaved intersection IDs to:\n  {mappable_ids_path}")

print("\nPreview:")
for sample in list(all_ids)[:5]:
    print(" ", sample)


Unique IDs (mapping): 95,104
Unique IDs (MV):      72,079
Unique IDs (TRI):     85,918

Total unique IDs IN ALL THREE files (intersection): 71,972

Saved intersection IDs to:
  /home/jovyan/work/hpool1/pseidel/test/Temp/TempExtractionofCharacteristics/Mappaple_DS_IDs.txt

Preview:
  C41009760
  C39237370
  C840R5140
  C344GB500
  C380K8100


#### Remove ADR IDs, Non-Mappable & False SIC IDs and Save

In [78]:
from pathlib import Path
import pandas as pd

# =============================================================================
# CELL SUMMARY
# -----------------------------------------------------------------------------
# This cell filters the company IDs in Temp_file_path_EoC/filtered_company_ids_v4.txt
# using three constraints based on four files in Temp_file_path_EoC:
#
#   1) Keep only IDs that are present in ALL of:
#        - filtered_company_ids_v4.txt   (this file, "source")
#        - Mappaple_DS_IDs.txt           (intersection from the GO files)
#        - SIC_clean.txt                 (intersection from the SIC pipeline)
#
#   2) Remove all ADR IDs found in:
#        - ADR_clean.txt
#
# It also tracks how many unique IDs are removed at each step:
#   - Dropped because they are not in Mappaple_DS_IDs.txt
#   - Dropped because they are not in SIC_clean.txt
#   - Dropped because they are ADRs
#
# The final filtered IDs are saved as Path(Temp_file_path_EoC)/ID_clean.txt.
# =============================================================================

# -----------------------------------------------------------------------------
# 1) Configure folder (EDIT this to your actual EoC path before running)
# -----------------------------------------------------------------------------

# Main input file: in EoC
source_file = Path(Temp_file_path_EoC) / "filtered_company_ids_v4.txt"

# ADR list (to remove): in EoC
adr_file = Path(Temp_file_path_EoC) / "ADR_clean.txt"

# Mapping list from GO files: in EoC
mappable_file = Path(Temp_file_path_EoC) / "Mappaple_DS_IDs.txt"

# SIC-based ID list: in EoC
sic_file = Path(Temp_file_path_EoC) / "SIC_clean.txt"

# Final output: ID_clean.txt in EoC
output_file = Path(Temp_file_path_EoC) / "ID_clean.txt"


# -----------------------------------------------------------------------------
# 2) Load main dataset (filtered_company_ids_v4.txt)
# -----------------------------------------------------------------------------
df = pd.read_csv(
    source_file,
    sep="|",
    dtype=str,
    encoding="utf-8",
    on_bad_lines="skip"
)

source_ids = set(df["ID"].dropna().tolist())

print(f"Rows in source file:                     {len(df):,}")
print(f"Unique IDs in source file:               {len(source_ids):,}")


# -----------------------------------------------------------------------------
# 3) Load ADR IDs (ADR_clean.txt)
# -----------------------------------------------------------------------------
ids_adr = set()
if adr_file.exists():
    with open(adr_file, "r", encoding="utf-8") as f:
        for line in f:
            id_str = line.strip().split("|")[0]
            if id_str:
                ids_adr.add(id_str)
    print(f"Unique ADR IDs loaded:                   {len(ids_adr):,}")
else:
    print(f"Warning: ADR file not found:             {adr_file}")


# -----------------------------------------------------------------------------
# 4) Helper: load ID sets (Mappaple_DS_IDs, SIC_clean)
# -----------------------------------------------------------------------------
def load_id_set(path: Path, label: str) -> set:
    if not path.exists():
        print(f"Warning: {label} not found:              {path}")
        return set()
    try:
        df_ids = pd.read_csv(path, sep="|", usecols=["ID"], dtype=str)
        out = set(df_ids["ID"].dropna().tolist())
        print(f"Unique IDs in {label}:                  {len(out):,}")
        return out
    except Exception as e:
        print(f"Error loading {label} ({path}): {e}")
        return set()

ids_mappable = load_id_set(mappable_file, "Mappaple_DS_IDs")
ids_sic      = load_id_set(sic_file, "SIC_clean")


# -----------------------------------------------------------------------------
# 5) Intersection filtering step 1:
#    Keep only IDs that are in BOTH source and Mappaple_DS_IDs
# -----------------------------------------------------------------------------
ids_in_source_and_mapping = source_ids & ids_mappable
dropped_not_in_mapping = len(source_ids) - len(ids_in_source_and_mapping)

print(f"\nIDs present in source ∩ mapping:         {len(ids_in_source_and_mapping):,}")
print(f"IDs dropped (not in mapping):            {dropped_not_in_mapping:,}")


# -----------------------------------------------------------------------------
# 6) Intersection filtering step 2:
#    From that set, keep only IDs that are also in SIC_clean
# -----------------------------------------------------------------------------
ids_in_all = ids_in_source_and_mapping & ids_sic
dropped_not_in_sic = len(ids_in_source_and_mapping) - len(ids_in_all)

print(f"IDs present in source ∩ mapping ∩ SIC:   {len(ids_in_all):,}")
print(f"IDs dropped (not in SIC_clean):          {dropped_not_in_sic:,}")

df_step = df[df["ID"].isin(ids_in_all)].copy()
print(f"Rows after intersection filtering:       {len(df_step):,}")


# -----------------------------------------------------------------------------
# 7) Remove ADRs
# -----------------------------------------------------------------------------
adr_in_step = ids_in_all & ids_adr
print(f"IDs dropped because ADR:                 {len(adr_in_step):,}")

df_cleaned = df_step[~df_step["ID"].isin(ids_adr)].copy()

final_ids = set(df_cleaned["ID"].dropna().tolist())
print(f"Final unique IDs:                        {len(final_ids):,}")
print(f"Final row count:                         {len(df_cleaned):,}")


# -----------------------------------------------------------------------------
# 8) Save output — overwrite ID_clean.txt in EoC
# -----------------------------------------------------------------------------
df_cleaned.to_csv(output_file, sep="|", index=False, encoding="utf-8")

print(f"\nCleaned ID file saved to:\n  {output_file}")
print("=== Head of df_cleaned ===")
print(df_cleaned.head())


Rows in source file:                     103,963
Unique IDs in source file:               103,963
Unique ADR IDs loaded:                   2,298
Unique IDs in Mappaple_DS_IDs:                  71,972
Unique IDs in SIC_clean:                  77,573

IDs present in source ∩ mapping:         71,933
IDs dropped (not in mapping):            32,030
IDs present in source ∩ mapping ∩ SIC:   55,389
IDs dropped (not in SIC_clean):          16,544
Rows after intersection filtering:       55,389
IDs dropped because ADR:                 22
Final unique IDs:                        55,367
Final row count:                         55,367

Cleaned ID file saved to:
  /home/jovyan/work/hpool1/pseidel/test/Temp/TempExtractionofCharacteristics/ID_clean.txt
=== Head of df_cleaned ===
          ID
1  C02500770
2  C02520200
3  C02520220
4  C02520230
5  C02520240


### Currency Code

#### Load Initial Currency Data


In [79]:
"""
This cell loads a currency code file, performs initial cleanup, and prepares data for further processing.
Key operations include:
- Reading a pipe-separated text file into a DataFrame.
- Cleaning and transforming the 'CurrencyCode' column by removing leading characters and trimming whitespace.
- Converting the 'PIT DATE' column into datetime objects to enable downstream filtering or sorting.
- Displaying previews before and after transformations to validate data integrity.
"""

# Construct full file path for the input dataset
file_path = f'{Temp_file_path_EoC}/filtered_CurrencyCode.txt'

# Load the file into a DataFrame, ensuring all columns are read as strings for consistent cleaning
df_currency = pd.read_csv(file_path, sep="|", dtype=str)

# Print confirmation that the file was successfully read
print(f"Successfully loaded data from {file_path}.")

# Display a preview of the raw, uncleaned data
print("\n=== Preview of loaded data ===")
display(df_currency.head())

# Initial Cleaning Section

# Check whether the 'CurrencyCode' column exists before applying string cleaning
if 'CurrencyCode' in df_currency.columns:
    # Convert to string, remove leading single quotes using regex, and trim surrounding whitespace
    df_currency['CurrencyCode'] = (
        df_currency['CurrencyCode']
        .astype(str)
        .str.replace(r"^'", "", regex=True)
        .str.strip()
    )
    print("\nCleaned 'CurrencyCode' column (removed leading single quote and stripped whitespace).")
else:
    # Notify if the expected column is missing
    print("\nWarning: 'CurrencyCode' column not found for cleaning.")

# Convert 'PIT DATE' column to datetime if present
if 'PIT DATE' in df_currency.columns:
    # Convert strings to datetime objects, coercing invalid entries into NaT
    df_currency['PIT DATE_dt'] = pd.to_datetime(
        df_currency['PIT DATE'],
        errors='coerce'
    )
    print("Converted 'PIT DATE' to datetime objects.")
else:
    # Notify when the column is missing
    print("Warning: 'PIT DATE' column not found. Cannot convert to datetime.")

# Display preview after cleaning and conversion steps
print("\n=== Preview after initial cleaning ===")
display(df_currency.head())


Successfully loaded data from /home/jovyan/work/hpool1/pseidel/test/Temp/TempExtractionofCharacteristics/filtered_CurrencyCode.txt.

=== Preview of loaded data ===


Unnamed: 0,ID,PIT DATE,ItemCode,CurrencyCode
0,C00948205,2021-07-09,56027,'Usd
1,C02500770,1995-12-29,56027,'Ars
2,C0250077A,1999-10-01,56027,'Ars
3,C0250077B,1999-10-01,56027,'Ars
4,C0250077C,1999-10-01,56027,'Ars



Cleaned 'CurrencyCode' column (removed leading single quote and stripped whitespace).
Converted 'PIT DATE' to datetime objects.

=== Preview after initial cleaning ===


Unnamed: 0,ID,PIT DATE,ItemCode,CurrencyCode,PIT DATE_dt
0,C00948205,2021-07-09,56027,Usd,2021-07-09
1,C02500770,1995-12-29,56027,Ars,1995-12-29
2,C0250077A,1999-10-01,56027,Ars,1999-10-01
3,C0250077B,1999-10-01,56027,Ars,1999-10-01
4,C0250077C,1999-10-01,56027,Ars,1999-10-01


#### Sort Currency Data

In [80]:
"""
This cell validates the presence of required data, then sorts the currency DataFrame by two key fields.
Main operations:
- Verifying that the DataFrame and necessary columns ('ID' and 'PIT DATE_dt') exist.
- Sorting the data to ensure chronological ordering within each ID group.
- Creating a sorted copy for downstream transformations or filtering.
- Displaying a preview to confirm correct ordering.
"""

# Check whether the DataFrame and the required columns exist before proceeding
if 'df_currency' in locals() and 'ID' in df_currency.columns and 'PIT DATE_dt' in df_currency.columns:

    # Print status message indicating the start of the sorting process
    print("--- Sorting data by ID and PIT DATE ---")

    # Sort DataFrame first by ID, then by the cleaned datetime column to ensure chronological ordering
    df_currency_sorted = df_currency.sort_values(
        by=['ID', 'PIT DATE_dt'],
        ascending=[True, True]
    ).copy()

    # Confirmation message after sorting completes
    print("\nSuccessfully sorted data.")

    # Display a preview of the sorted DataFrame for validation
    print("\n=== Preview of sorted data ===")
    display(df_currency_sorted.head())

else:
    # Inform the user that the required data or columns are missing
    print("Error: `df_currency` DataFrame or required columns ('ID', 'PIT DATE_dt') not found. Please ensure previous steps were executed successfully.")


--- Sorting data by ID and PIT DATE ---

Successfully sorted data.

=== Preview of sorted data ===


Unnamed: 0,ID,PIT DATE,ItemCode,CurrencyCode,PIT DATE_dt
0,C00948205,2021-07-09,56027,Usd,2021-07-09
1,C02500770,1995-12-29,56027,Ars,1995-12-29
2,C0250077A,1999-10-01,56027,Ars,1999-10-01
3,C0250077B,1999-10-01,56027,Ars,1999-10-01
4,C0250077C,1999-10-01,56027,Ars,1999-10-01


#### Identify Currency Switches

In [81]:
"""
This cell analyzes the sorted currency data to detect IDs whose currency code changes over time.
Key operations:
- Validating that the sorted DataFrame and required columns exist.
- Grouping records by ID to count how many distinct currency codes appear for each entity.
- Identifying IDs that have more than one unique currency code, indicating a currency switch.
- Creating both a DataFrame and a Python list of these IDs for downstream filtering or investigation.
"""

# Ensure required data and columns are available before processing
if 'df_currency_sorted' in locals() and 'ID' in df_currency_sorted.columns and 'CurrencyCode' in df_currency_sorted.columns:
    print("--- Identifying IDs with Currency Switches ---")

    # Group by ID and compute the number of unique currency codes for each ID
    currency_counts = (
        df_currency_sorted.groupby('ID')['CurrencyCode']
        .nunique(dropna=True)  # Count unique non-null currency codes
        .reset_index(name='UniqueCurrencyCount')
    )

    # Select IDs that have more than one distinct currency code
    ids_with_switches_df = currency_counts[currency_counts['UniqueCurrencyCount'] > 1].copy()

    # Print the number of IDs detected with currency changes
    print(f"\nFound {len(ids_with_switches_df):,} IDs with currency switches.")

    # Display a preview of IDs exhibiting currency transitions
    print("\n=== Preview of IDs with Currency Switches ===")
    display(ids_with_switches_df.head())

    # Extract a list of the affected IDs for later use
    ids_with_switches_list = ids_with_switches_df['ID'].tolist()

else:
    # Provide an error message if required data or columns are missing
    print("Error: `df_currency_sorted` DataFrame or required columns ('ID', 'CurrencyCode') not found. Please ensure previous steps were executed successfully.")

    # Create an empty list to avoid errors in subsequent steps
    ids_with_switches_list = []


--- Identifying IDs with Currency Switches ---

Found 7,395 IDs with currency switches.

=== Preview of IDs with Currency Switches ===


Unnamed: 0,ID,UniqueCurrencyCount
10,C02520240,2
19,C02520300,2
22,C02520310,2
26,C02520330,2
4303,C04000090,2


#### Extract Currency Switch Information




In [82]:
# This cell constructs a detailed currency-change history for every ID in the sorted dataset.
# It iterates over all IDs in df_currency_sorted and, within each group, ensures the rows are
# ordered chronologically by PIT DATE_dt. For each ID, it builds an ordered sequence of
# currency "states": it records the first occurrence of a currency and any subsequent points
# in time where the currency changes. The result is a list of dictionaries, where each entry
# contains an ID and the sequence of currency states and their corresponding switch dates.
# This captures both IDs that switch currency and those that remain constant over time.

if 'df_currency_sorted' in locals() and 'ID' in df_currency_sorted.columns and 'CurrencyCode' in df_currency_sorted.columns:
    # Indicate the start of the process for extracting currency information for all IDs
    print("--- Extracting Currency Information for All IDs ---")

    # Work on a copy of the sorted DataFrame so the original remains unchanged
    df_all_ids = df_currency_sorted.copy()

    # Initialize a list to hold the currency-switch sequences for each ID
    currency_info_all_ids = []

    # Iterate over the data grouped by ID to process each entity separately
    for id, group in df_all_ids.groupby('ID'):
        # Ensure that, within each ID, rows are ordered chronologically by PIT DATE_dt
        group_sorted = group.sort_values(by='PIT DATE_dt', ascending=True)

        # Initialize a list to store the ordered sequence of currency states for the current ID
        switch_sequence = []
        # Track the last seen currency to detect changes over time
        last_currency = None

        # Iterate through each row of the sorted group to build the currency-change sequence
        for index, row in group_sorted.iterrows():
            # Current row's currency code
            current_currency = row['CurrencyCode']
            # Current row's PIT DATE as datetime
            current_date = row['PIT DATE_dt']

            # Record the first currency for this ID, and then only record entries when the currency changes
            if current_currency != last_currency:
                switch_sequence.append({'Currency': current_currency, 'SwitchDate': current_date})
                # Update the last seen currency to the current one
                last_currency = current_currency

        # Append the constructed sequence for this ID to the overall list,
        # even if there were no changes (i.e., only one currency throughout)
        currency_info_all_ids.append({'ID': id, 'Switches': switch_sequence})

    # Report how many IDs have been processed and included in the result
    print(f"\nExtracted currency information for {len(currency_info_all_ids):,} IDs (including those without switches).")
    print("\n=== Preview of Extracted Currency Information (including non-switchers) ===")
    # Print a small sample of the resulting structures to inspect the format
    for i, info in enumerate(currency_info_all_ids[:10]):  # Show up to the first 10 IDs
        print(f"ID: {info['ID']}, Info: {info['Switches']}")

else:
    # If the required DataFrame or columns are missing, log a message and initialize an empty result list
    print("Required data (df_currency_sorted) not available. Skipping extraction.")
    currency_info_all_ids = []  # Initialize empty list when prerequisites are not met


--- Extracting Currency Information for All IDs ---

Extracted currency information for 121,007 IDs (including those without switches).

=== Preview of Extracted Currency Information (including non-switchers) ===
ID: C00948205, Info: [{'Currency': 'Usd', 'SwitchDate': Timestamp('2021-07-09 00:00:00')}]
ID: C02500770, Info: [{'Currency': 'Ars', 'SwitchDate': Timestamp('1995-12-29 00:00:00')}]
ID: C0250077A, Info: [{'Currency': 'Ars', 'SwitchDate': Timestamp('1999-10-01 00:00:00')}]
ID: C0250077B, Info: [{'Currency': 'Ars', 'SwitchDate': Timestamp('1999-10-01 00:00:00')}]
ID: C0250077C, Info: [{'Currency': 'Ars', 'SwitchDate': Timestamp('1999-10-01 00:00:00')}]
ID: C02520200, Info: [{'Currency': 'Ars', 'SwitchDate': Timestamp('1996-05-03 00:00:00')}]
ID: C0252020A, Info: [{'Currency': 'Ars', 'SwitchDate': Timestamp('1999-10-01 00:00:00')}]
ID: C0252020B, Info: [{'Currency': 'Ars', 'SwitchDate': Timestamp('1999-10-01 00:00:00')}]
ID: C02520220, Info: [{'Currency': 'Ars', 'SwitchDate': Tim

#### Restructure Currency Switch Data

In [83]:
"""
This cell restructures the extracted currency-switch sequence data for every ID into a wide,
table-friendly DataFrame. Each ID becomes one row. For each possible switch position, the
currency and the date of its first appearance are placed into dedicated columns
(CurrencyCode1, SwitchDate1, CurrencyCode2, SwitchDate2, etc.). IDs with fewer switches
than the maximum are padded with None / NaT. This transformation allows consistent comparison,
filtering, exporting, or merging with other datasets.
"""

# Desired fixed number of switch positions (based on full dataset)
FIXED_MAX_SWITCHES = 3

# Check that the currency_info_all_ids list exists and contains data before processing
if 'currency_info_all_ids' in locals() and currency_info_all_ids:
    print("--- Restructuring Currency Information for All IDs ---")

    # List that will store row-wise dictionaries representing the final table
    restructured_data_all_ids = []

    # Determine how many switch positions exist at most across all IDs
    # and enforce a minimum of FIXED_MAX_SWITCHES
    max_switches = 0
    if currency_info_all_ids:
        detected_max = max(len(item['Switches']) for item in currency_info_all_ids)
        max_switches = max(detected_max, FIXED_MAX_SWITCHES)

    # Iterate over each ID’s switch sequence data
    for item in currency_info_all_ids:
        id = item['ID']
        switches = item['Switches']

        # Initialize row with the ID
        row_data = {'ID': id}

        # Add currency and date columns for each switch position
        for i in range(max_switches):
            if i < len(switches):
                # Populate with actual switch data if available
                row_data[f'CurrencyCode{i+1}'] = switches[i]['Currency']
                row_data[f'SwitchDate{i+1}'] = switches[i]['SwitchDate']
            else:
                # Fill with proper empty values
                row_data[f'CurrencyCode{i+1}'] = None
                row_data[f'SwitchDate{i+1}'] = pd.NaT   # <-- important fix

        # Add the completed row dictionary to the final list
        restructured_data_all_ids.append(row_data)

    # Convert the list of dictionaries into a DataFrame
    df_restructured_all_ids = pd.DataFrame(restructured_data_all_ids)

    print("\nSuccessfully restructured data for all IDs.")
    print("\n=== Preview of Restructured Data (All IDs) ===")
    display(df_restructured_all_ids.head())

else:
    # Fallback when prerequisite data is not available
    print("No currency information found or required data not available. Skipping restructuring.")
    df_restructured_all_ids = pd.DataFrame()


--- Restructuring Currency Information for All IDs ---

Successfully restructured data for all IDs.

=== Preview of Restructured Data (All IDs) ===


Unnamed: 0,ID,CurrencyCode1,SwitchDate1,CurrencyCode2,SwitchDate2,CurrencyCode3,SwitchDate3
0,C00948205,Usd,2021-07-09,,NaT,,NaT
1,C02500770,Ars,1995-12-29,,NaT,,NaT
2,C0250077A,Ars,1999-10-01,,NaT,,NaT
3,C0250077B,Ars,1999-10-01,,NaT,,NaT
4,C0250077C,Ars,1999-10-01,,NaT,,NaT


#### Add CurrentCurrency Column and Finalize Data + Add missing currency (Pat in Macau)

In [84]:
"""
This cell finalizes the restructured currency-switch dataset.
It determines each ID's most recent currency using the CurrencyCode columns
and adds a 'CurrentCurrency' column to the DataFrame. The finalized dataset
is then saved to disk.
"""

output_path_final = f'{Temp_file_path_EoC}/CurrencyCodes_clean.txt'

# Proceed only if the restructured DataFrame exists and is not empty
if 'df_restructured_all_ids' in locals() and not df_restructured_all_ids.empty:
    print("--- Adding 'CurrentCurrency' Column and Finalizing Data (using df_restructured_all_ids) ---")

    # Identify all CurrencyCode columns (e.g., CurrencyCode1, CurrencyCode2, ...)
    currency_code_cols = [
        col for col in df_restructured_all_ids.columns
        if col.startswith('CurrencyCode')
    ]

    if currency_code_cols:

        # Function to determine the last non-null currency for each row
        def get_last_currency(row):
            # Iterate columns in reverse so the first non-null value is the most recent currency
            for col in reversed(currency_code_cols):
                if pd.notna(row[col]):
                    return row[col]
            return None

        # Add the 'CurrentCurrency' column using the helper function
        df_restructured_all_ids['CurrentCurrency'] = (
            df_restructured_all_ids.apply(get_last_currency, axis=1)
        )

        # Make a final copy for clarity
        df_final_currency = df_restructured_all_ids.copy()

        # Display a preview of the finalized data
        print("\nSuccessfully added 'CurrentCurrency' column.")
        print("\n=== Preview of Finalized Data ===")
        display(df_final_currency.head())

        # Save finalized dataset
        df_final_currency.to_csv(output_path_final, sep="|", index=False, encoding="utf-8")
        print(f"\nFinal currency switch data saved to: {output_path_final}")

    else:
        # When no CurrencyCode columns are found, continue with the DataFrame as-is
        print("Warning: No 'CurrencyCode' columns found to process.")
        df_final_currency = df_restructured_all_ids.copy()

        fallback_path = Path(Temp_file_path) / "CurrencyCodes_clean.txt"
        df_final_currency.to_csv(fallback_path, sep="|", index=False, encoding="utf-8")
        print(f"\nFinal currency data (without CurrentCurrency) saved to: {fallback_path}")

else:
    # If the DataFrame is missing or empty, skip processing
    print("DataFrame from previous step (df_restructured_all_ids) not found or is empty. Skipping finalization.")
    df_final_currency = pd.DataFrame()


--- Adding 'CurrentCurrency' Column and Finalizing Data (using df_restructured_all_ids) ---

Successfully added 'CurrentCurrency' column.

=== Preview of Finalized Data ===


Unnamed: 0,ID,CurrencyCode1,SwitchDate1,CurrencyCode2,SwitchDate2,CurrencyCode3,SwitchDate3,CurrentCurrency
0,C00948205,Usd,2021-07-09,,NaT,,NaT,Usd
1,C02500770,Ars,1995-12-29,,NaT,,NaT,Ars
2,C0250077A,Ars,1999-10-01,,NaT,,NaT,Ars
3,C0250077B,Ars,1999-10-01,,NaT,,NaT,Ars
4,C0250077C,Ars,1999-10-01,,NaT,,NaT,Ars



Final currency switch data saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempExtractionofCharacteristics/CurrencyCodes_clean.txt


### FYE

#### Checking "D" Prefix, FYE Month Distribution, and Switches

In [85]:
# Summary:
# This cell loads a filtered financial year-end (FYE) dataset from disk, validates that
# required columns ("FYE" and "ID") are present and non-empty, and reconstructs
# several derived variables from the freshly loaded data so the analysis is independent
# of any previous cell state.
# It then:
# - Cleans and standardizes the FYE column (string conversion and whitespace stripping).
# - Builds an initial error mask that flags missing, empty, and placeholder FYE values
#   (e.g., "na", "null", etc.).
# - Parses the cleaned FYE values (assumed format DYYYYMMDD) into actual dates and
#   derives the corresponding month names, marking invalid or unparsable dates.
# - Aggregates FYE by month to show the distribution of firms across months (absolute
#   counts and percentages).
# - Groups data by firm ID to detect how many firms have more than one unique FYE month,
#   which indicates potential FYE switches over time.
# - Summarizes invalid date formats and the fraction of firms that appear to have
#   changed their FYE month.

# Paths
FILE = f'{Temp_file_path_EoC}/filtered_FYE.txt'  # Construct full path to the filtered FYE file
VALUE_COL = "FYE"  # Name of the column that stores financial year-end information

# Load the file and perform initial checks to ensure necessary data is available
# This reloads the data into fye_df, making it independent of prior cell execution state
try:
    # Read the FYE file as a pipe-separated text file, keeping all values as strings
    # and preserving NA-like values instead of automatically converting them
    fye_df = pd.read_csv(FILE, sep="|", encoding="utf-8", dtype=str, keep_default_na=True)

    # Check that the dataframe is non-empty and contains the required columns
    if fye_df.empty or VALUE_COL not in fye_df.columns or 'ID' not in fye_df.columns:
         raise ValueError(f"Could not load data or missing required columns in {FILE}")

    # Recompute the FYE value series as a string-based Series
    val = fye_df[VALUE_COL].astype("string")

    # Strip leading and trailing whitespace from FYE values to standardize formatting
    val_stripped = val.str.strip()

    # Build the initial error mask, flagging rows where FYE is missing, empty,
    # or contains placeholder values indicating missingness or non-applicability
    error_mask_initial = val.isna() | (val_stripped == "") | val_stripped.str.lower().isin(
        {"na", "nan", "null", "none", "n/a", "#n/a", "n"}
    )

    # Compute the number of unique firm IDs to use later when calculating percentages
    unique_ids = fye_df["ID"].nunique()

# Handle file not found or validation errors
except (FileNotFoundError, ValueError) as e:
    print(f"Error loading or processing data: {e}")
    # Re-raise the exception to stop execution of this cell if loading fails
    raise  # Re-raise the exception to stop execution

# Proceed with additional analysis after successful data loading and validation
print("\n--- Additional FYE Analysis ---")

# 1. Check if all FYE values start with "D"
# This verifies a structural expectation on the FYE format (e.g., "DYYYYMMDD")
starts_with_D = val_stripped.str.startswith("D").all()
print(f'\nDoes the "{VALUE_COL}" column start with "D" for all non-empty rows? {starts_with_D}')

# Extract month from FYE (assuming YYYYMMDD format after removing the leading 'D')
# First remove the 'D' prefix, then parse the remaining string as a date
fye_dates = pd.to_datetime(val_stripped.str.replace('D', '', regex=False), format='%Y%m%d', errors='coerce')

# Convert parsed dates to month names; mark invalid or missing dates with a placeholder label
fye_months = fye_dates.dt.month_name().fillna("Invalid/Missing FYE Date")

# Identify rows where the date parsing failed but which are not already flagged as initial errors
invalid_date_mask = fye_dates.isna() & ~error_mask_initial  # Exclude those already captured by the initial error mask

# 2.i) How many firms have their FYE in which month (absolute and in %)
# Count how often each month label occurs across all rows
fye_month_counts = fye_months.value_counts().reset_index()

# Rename the columns to meaningful labels for reporting
fye_month_counts.columns = ["FYE Month", "Count"]

# Compute the percentage share of each month relative to all rows
fye_month_counts["Percentage"] = (fye_month_counts["Count"] / len(fye_months)) * 100

# Sort months by frequency in descending order to highlight the most common FYE months
fye_month_counts = fye_month_counts.sort_values(by="Count", ascending=False)

print(f"\nCompany count by {VALUE_COL} Month:")
display(fye_month_counts)

# 2.ii) How many firms switched their FYE (unique ID-Month combinations per company)
# Construct a temporary DataFrame that pairs each row's ID with its derived FYE month
id_month_counts = (
    pd.DataFrame({'ID': fye_df['ID'], 'FYE_Month': fye_months})
    # Group by firm ID and count how many distinct FYE months each firm has
    .groupby("ID")["FYE_Month"]
    .nunique(dropna=True)  # Count unique non-NaN months per ID
    .reset_index(name="UniqueFYEMonths")  # Store the count in a new column
)

# Filter to firms that have more than one unique FYE month, indicating a potential FYE switch
ids_with_multiple_fye_months = id_month_counts[id_month_counts["UniqueFYEMonths"] > 1]

# Count how many firms exhibit FYE switching behavior
num_multiple_fye_months_ids = len(ids_with_multiple_fye_months)

# Compute the percentage of such firms relative to all unique IDs
percent_multiple_fye_months_ids = (num_multiple_fye_months_ids / unique_ids) * 100 if unique_ids > 0 else 0

print(f"\nIDs with >1 unique {VALUE_COL} month (indicating a potential switch): {num_multiple_fye_months_ids} ({percent_multiple_fye_months_ids:.2f}%)")

# Show a sample of IDs with multiple FYE months, if any exist, for manual inspection
if num_multiple_fye_months_ids > 0:
    print(f"Example IDs with multiple {VALUE_COL} months:")
    display(ids_with_multiple_fye_months.head())

# --- Summary overview of additional analysis ---
print("\nAdditional Analysis Summary:")

# Report the number of rows with invalid date formats in the FYE column (excluding initially flagged errors)
print(f'Rows with invalid date format in "{VALUE_COL}": {invalid_date_mask.sum()}')

# Report how many firms appear to have changed their FYE month and the corresponding percentage
print(f"IDs with >1 unique FYE month: {num_multiple_fye_months_ids} ({percent_multiple_fye_months_ids:.2f}%)")



--- Additional FYE Analysis ---

Does the "FYE" column start with "D" for all non-empty rows? False

Company count by FYE Month:


Unnamed: 0,FYE Month,Count,Percentage
0,December,2540373,35.468003
1,June,1406872,19.642367
2,March,1382692,19.304773
3,September,1131818,15.802138
4,Invalid/Missing FYE Date,154599,2.15847
5,January,74396,1.038697
6,April,73159,1.021426
7,February,71803,1.002494
8,October,70763,0.987974
9,July,69398,0.968916



IDs with >1 unique FYE month (indicating a potential switch): 111891 (92.51%)
Example IDs with multiple FYE months:


Unnamed: 0,ID,UniqueFYEMonths
0,C00948205,4
1,C02500770,2
2,C0250077A,2
3,C0250077B,2
4,C0250077C,2



Additional Analysis Summary:
Rows with invalid date format in "FYE": 154599
IDs with >1 unique FYE month: 111891 (92.51%)


#### Displaying Rows with Invalid FYE Date Format

In [86]:
# Summary:
# This cell extracts and displays all rows whose FYE values could not be parsed into
# valid dates, based on the `invalid_date_mask` produced earlier. It ensures that the
# required variables (`fye_df` and `invalid_date_mask`) exist, initializes an empty
# fallback DataFrame, and then filters the original dataset to produce a subset
# containing only invalid FYE date-format rows. If such rows exist, the cell prints
# their count and shows the first records for inspection; otherwise, it prints that
# none were found.

# Initialize invalid_fye_format_rows as an empty DataFrame in case dependencies are missing
invalid_fye_format_rows = pd.DataFrame(columns=fye_df.columns if 'fye_df' in locals() else [])

# Check that required variables exist before proceeding
if 'fye_df' in locals() and 'invalid_date_mask' in locals():

    # Apply the mask to filter rows where the parsed FYE date was invalid
    invalid_fye_format_rows = fye_df[invalid_date_mask]

    print("\n=== Rows with Invalid Date Format in FYE ===")

    # If invalid rows exist, print their count and show examples
    if not invalid_fye_format_rows.empty:
        print(f"Number of rows with invalid date format: {len(invalid_fye_format_rows)}")
        display(invalid_fye_format_rows.head())  # Show first few rows for inspection

    # If no invalid rows were found, inform the user
    else:
        print("No rows found with invalid date format.")

# If prerequisites for analysis are missing, notify the user
else:
    print("Required data not found. Ensure the FYE summary analysis has been executed first.")



=== Rows with Invalid Date Format in FYE ===
Number of rows with invalid date format: 154599


Unnamed: 0,ID,PIT DATE,ItemCode,FYE
129,C02520200,2009-09-29,5350,Nd
130,C02520200,2009-09-29,5350,Nd
131,C02520200,2009-09-29,5350,Nd
132,C02520200,2009-09-29,5350,Nd
195,C0252020A,2009-09-29,5350,Nd


#### Filtering and Analyzing FYE Data

In [87]:
# Summary:
# This cell performs a complete filtering, cleaning, and restructuring of FYE data.
# Key filtering steps include:
# - Removing rows where the FYE value equals "Nd".
# - Stripping whitespace, standardizing to string, and removing a leading "D".
# - Parsing cleaned FYE strings as dates in YYYYMMDD format.
# - Parsing PIT DATE in YYYY-MM-DD format.
# - Dropping rows where either parsed date is invalid.
# - Grouping by (ID, FYE_Year) and selecting the entry with the latest PIT DATE.
# Key transformations include:
# - Deriving the FYE_Year and FYE_Month.
# - Computing distributions of FYE months per year.
# - Saving a subset of cleaned columns to a pipe-separated file.

import pandas as pd

# --- Paths ---
output_path_v2 = f'{Temp_file_path_EoC}/filtered_FYE_v2.txt'  # Output file for final cleaned data

# --- Main Logic ---
if 'fye_df' in locals() and VALUE_COL in fye_df.columns:

    # 1) Filter out rows where FYE equals "Nd"
    initial_rows = len(fye_df)  # Track dataset size before filtering
    fye_df_filtered = (
        fye_df[
            fye_df[VALUE_COL]
            .astype(str)            # Ensure string type
            .str.strip()            # Remove whitespace
            .str.lower() != 'nd'    # Exclude placeholder value "nd"
        ].copy()
    )
    rows_after_filtering_nd = len(fye_df_filtered)  # Size after filtering

    print(f"\n--- After filtering 'Nd' from {VALUE_COL} column ---")
    print(f"Initial rows: {initial_rows:,}")
    print(f"Rows after filtering 'Nd': {rows_after_filtering_nd:,}")
    try:
        display(fye_df_filtered.head())  # Preview filtered data
    except NameError:
        print(fye_df_filtered.head())

    # 2) Clean FYE by removing a leading "D" if present
    fye_df_filtered['FYE_cleaned'] = (
        fye_df_filtered[VALUE_COL]
        .astype(str)                # Ensure string type
        .str.strip()                # Remove whitespace
        .str.replace(r'^D', '', regex=True)  # Remove leading "D"
    )

    # 3) Parse FYE_cleaned as datetime (YYYYMMDD)
    fye_df_filtered['FYE_dt'] = pd.to_datetime(
        fye_df_filtered['FYE_cleaned'],
        format='%Y%m%d',
        errors='coerce'             # Invalid formats become NaT
    )

    # Parse PIT DATE column (YYYY-MM-DD)
    if 'PIT DATE' in fye_df_filtered.columns:
        fye_df_filtered['PIT_DATE_dt'] = pd.to_datetime(
            fye_df_filtered['PIT DATE'],
            format='%Y-%m-%d',
            errors='coerce'         # Invalid formats become NaT
        )
    else:
        print("\nError: 'PIT DATE' column not found in fye_df.")
        raise KeyError("Missing 'PIT DATE' column")

    # 4) Drop rows where parsed dates are invalid
    initial_filtered_rows = len(fye_df_filtered)  # Count before date-cleaning
    fye_df_cleaned_dates = (
        fye_df_filtered
        .dropna(subset=['FYE_dt', 'PIT_DATE_dt'])  # Require both valid dates
        .copy()
    )
    rows_after_date_parsing = len(fye_df_cleaned_dates)  # Count after dropping invalids

    print(f"\n--- After cleaning and parsing dates ---")
    print(f"Rows before parsing: {initial_filtered_rows:,}")
    print(f"Rows after parsing and dropping NaT: {rows_after_date_parsing:,}")
    try:
        display(fye_df_cleaned_dates.head())  # Preview valid rows
    except NameError:
        print(fye_df_cleaned_dates.head())

    # 5) Extract FYE_Year and select latest PIT DATE per (ID, FYE_Year)
    fye_df_cleaned_dates['FYE_Year'] = fye_df_cleaned_dates['FYE_dt'].dt.year  # Extract year

    # Sort to ensure tail(1) selects the latest PIT DATE
    fye_df_sorted = fye_df_cleaned_dates.sort_values(
        by=['ID', 'FYE_Year', 'PIT_DATE_dt'],
        ascending=[True, True, True]
    )

    # Group by ID and FYE_Year, keeping only the latest entry
    fye_df_latest_per_year = (
        fye_df_sorted
        .groupby(['ID', 'FYE_Year'], as_index=False)
        .tail(1)
        .reset_index(drop=True)
        .copy()
    )
    rows_after_latest_selection = len(fye_df_latest_per_year)

    print(f"\n--- After selecting latest entry per ID and FYE Year ---")
    print(f"Rows before selection: {rows_after_date_parsing:,}")
    print(f"Rows after selection: {rows_after_latest_selection:,}")
    try:
        display(fye_df_latest_per_year.head())  # Preview final firm-year dataset
    except NameError:
        print(fye_df_latest_per_year.head())

    # 6) Add FYE_Month from the parsed FYE date
    fye_df_latest_per_year['FYE_Month'] = (
        fye_df_latest_per_year['FYE_dt'].dt.month_name()
    )

    print("\n=== Preview of data with FYE Month and Year ===")
    try:
        display(fye_df_latest_per_year.head())
    except NameError:
        print(fye_df_latest_per_year.head())

    # 7) Compute distribution of FYE months per year
    fye_monthly_distribution_per_year = (
        fye_df_latest_per_year
        .groupby(['FYE_Year', 'FYE_Month'])
        .size()
        .reset_index(name='Count')
    )

    # Define month ordering for sorted output
    month_order = [
        'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December'
    ]

    # Enforce categorical ordering
    fye_monthly_distribution_per_year['FYE_Month'] = pd.Categorical(
        fye_monthly_distribution_per_year['FYE_Month'],
        categories=month_order,
        ordered=True
    )

    # Sort by year, then month
    fye_monthly_distribution_per_year = (
        fye_monthly_distribution_per_year
        .sort_values(['FYE_Year', 'FYE_Month'])
        .reset_index(drop=True)
    )

    print("\n--- FYE Month Distribution per Year ---")
    try:
        display(fye_monthly_distribution_per_year)
    except NameError:
        print(fye_monthly_distribution_per_year)

    # 8) Check: do remaining original (non-empty) FYE values start with "D"?
    col = fye_df_filtered[VALUE_COL].astype(str).str.strip()  # Standardize formatting
    non_empty = col[col != ""]                               # Exclude empty strings
    starts_with_D_filtered = non_empty.str.startswith("D").all()  # Check prefix condition

    print(
        f'\nDoes the original "{VALUE_COL}" column in the filtered data '
        f'start with "D" for all non-empty rows? {starts_with_D_filtered}'
    )

    # 9) Save selected subset with original FYE values
    output_cols = ['ID', 'PIT DATE', 'ItemCode', VALUE_COL]  # Columns to export
    missing_cols = [c for c in output_cols if c not in fye_df_latest_per_year.columns]

    if missing_cols:
        print(f"\nMissing columns in final DataFrame, cannot save: {missing_cols}")
    else:
        fye_df_latest_per_year[output_cols].to_csv(
            output_path_v2,
            sep="|",
            index=False,
            encoding="utf-8"
        )
        print(
            f"\nFiltered FYE data saved to: {output_path_v2} "
            f"(rows: {len(fye_df_latest_per_year):,})"
        )

    # 10) Summary of cleaning steps applied
    print("\n--- Task Summary ---")
    print(f"- Filtered out rows where '{VALUE_COL}' was 'Nd'.")
    print("- Parsed FYE into datetime format.")
    print("- Parsed PIT DATE into datetime format.")
    print("- Selected latest PIT DATE per (ID, FYE_Year).")
    print("- Derived FYE month distribution.")
    print("- Verified whether original FYE values start with 'D'.")
    print(f"- Saved cleaned subset to {output_path_v2}.")

else:
    print(
        f"Error: 'fye_df' DataFrame or '{VALUE_COL}' column not found. "
        f"Please ensure the loading/preparation cell was executed."
    )



--- After filtering 'Nd' from FYE column ---
Initial rows: 7,162,436
Rows after filtering 'Nd': 7,007,837


Unnamed: 0,ID,PIT DATE,ItemCode,FYE
0,C00948205,2021-07-09,5350,D20181231
1,C00948205,2021-07-09,5350,D20191231
2,C00948205,2021-07-09,5350,D20201231
3,C00948205,2021-07-09,5350,D20190930
4,C00948205,2021-07-09,5350,D20191231



--- After cleaning and parsing dates ---
Rows before parsing: 7,007,837
Rows after parsing and dropping NaT: 7,007,837


Unnamed: 0,ID,PIT DATE,ItemCode,FYE,FYE_cleaned,FYE_dt,PIT_DATE_dt
0,C00948205,2021-07-09,5350,D20181231,20181231,2018-12-31,2021-07-09
1,C00948205,2021-07-09,5350,D20191231,20191231,2019-12-31,2021-07-09
2,C00948205,2021-07-09,5350,D20201231,20201231,2020-12-31,2021-07-09
3,C00948205,2021-07-09,5350,D20190930,20190930,2019-09-30,2021-07-09
4,C00948205,2021-07-09,5350,D20191231,20191231,2019-12-31,2021-07-09



--- After selecting latest entry per ID and FYE Year ---
Rows before selection: 7,007,837
Rows after selection: 1,842,027


Unnamed: 0,ID,PIT DATE,ItemCode,FYE,FYE_cleaned,FYE_dt,PIT_DATE_dt,FYE_Year
0,C00948205,2021-07-09,5350,D20181231,20181231,2018-12-31,2021-07-09,2018
1,C00948205,2021-07-09,5350,D20191231,20191231,2019-12-31,2021-07-09,2019
2,C00948205,2021-07-28,5350,D20200331,20200331,2020-03-31,2021-07-28,2020
3,C00948205,2022-04-06,5350,D20211231,20211231,2021-12-31,2022-04-06,2021
4,C00948205,2023-03-21,5350,D20221231,20221231,2022-12-31,2023-03-21,2022



=== Preview of data with FYE Month and Year ===


Unnamed: 0,ID,PIT DATE,ItemCode,FYE,FYE_cleaned,FYE_dt,PIT_DATE_dt,FYE_Year,FYE_Month
0,C00948205,2021-07-09,5350,D20181231,20181231,2018-12-31,2021-07-09,2018,December
1,C00948205,2021-07-09,5350,D20191231,20191231,2019-12-31,2021-07-09,2019,December
2,C00948205,2021-07-28,5350,D20200331,20200331,2020-03-31,2021-07-28,2020,March
3,C00948205,2022-04-06,5350,D20211231,20211231,2021-12-31,2022-04-06,2021,December
4,C00948205,2023-03-21,5350,D20221231,20221231,2022-12-31,2023-03-21,2022,December



--- FYE Month Distribution per Year ---


Unnamed: 0,FYE_Year,FYE_Month,Count
0,1973,September,1
1,1974,September,1
2,1975,February,1
3,1975,July,1
4,1975,September,1
...,...,...,...
588,2024,September,39819
589,2024,October,1127
590,2024,November,1077
591,2024,December,3474



Does the original "FYE" column in the filtered data start with "D" for all non-empty rows? True

Filtered FYE data saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempExtractionofCharacteristics/filtered_FYE_v2.txt (rows: 1,842,027)

--- Task Summary ---
- Filtered out rows where 'FYE' was 'Nd'.
- Parsed FYE into datetime format.
- Parsed PIT DATE into datetime format.
- Selected latest PIT DATE per (ID, FYE_Year).
- Derived FYE month distribution.
- Verified whether original FYE values start with 'D'.
- Saved cleaned subset to /home/jovyan/work/hpool1/pseidel/test/Temp/TempExtractionofCharacteristics/filtered_FYE_v2.txt.


#### Pivoted FYE Month Distribution Table

In [88]:
# Summary:
# This cell reshapes the FYE month distribution data into a pivot table where months
# form the rows and years form the columns, with values representing counts of firms.
# Missing month-year combinations are filled with zero, and a total row is added.
# The resulting pivot table is displayed and exported to an Excel file.

output_path_pivot_excel = f'{Temp_file_path_EoC}/Overview-Company_per_year_FYE_per_month_per_year.xlsx'  # Output file path

# Create a pivot table with months as rows, years as columns, and counts as values
fye_monthly_distribution_pivot = fye_monthly_distribution_per_year.pivot(
    index='FYE_Month',     # Row axis: month names
    columns='FYE_Year',    # Column axis: years
    values='Count'         # Cell values: number of entries per month-year
).fillna(0)                # Replace missing combinations with 0

# Add a summary row that sums counts across all years for each month
fye_monthly_distribution_pivot.loc['Sum'] = fye_monthly_distribution_pivot.sum()

print("\n--- Pivoted FYE Month Distribution (Years as Columns, Months as Rows) ---")
display(fye_monthly_distribution_pivot)  # Show the pivot table

# Save the pivot table to an Excel file
fye_monthly_distribution_pivot.to_excel(output_path_pivot_excel)

print(f"\nPivoted FYE month distribution table saved to: {output_path_pivot_excel}")



--- Pivoted FYE Month Distribution (Years as Columns, Months as Rows) ---


FYE_Year,1973,1974,1975,1976,1977,1978,1979,1980,1981,1982,...,2016,2017,2018,2019,2020,2021,2022,2023,2024,2025
FYE_Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
January,0.0,0.0,0.0,0.0,8.0,15.0,34.0,79.0,179.0,250.0,...,173.0,119.0,133.0,128.0,100.0,101.0,102.0,82.0,37.0,1.0
February,0.0,0.0,1.0,2.0,7.0,10.0,13.0,91.0,94.0,123.0,...,132.0,149.0,156.0,246.0,120.0,81.0,103.0,101.0,45.0,0.0
March,0.0,0.0,0.0,5.0,24.0,41.0,55.0,536.0,560.0,1118.0,...,4057.0,4575.0,4871.0,4270.0,3896.0,3699.0,3590.0,3445.0,887.0,0.0
April,0.0,0.0,0.0,0.0,5.0,6.0,10.0,93.0,95.0,227.0,...,187.0,160.0,159.0,116.0,103.0,111.0,112.0,113.0,90.0,0.0
May,0.0,0.0,0.0,5.0,7.0,9.0,16.0,82.0,90.0,160.0,...,112.0,109.0,185.0,100.0,110.0,107.0,89.0,83.0,103.0,0.0
June,0.0,0.0,0.0,8.0,23.0,42.0,64.0,311.0,370.0,624.0,...,2928.0,2810.0,3195.0,2925.0,3021.0,2892.0,2842.0,3255.0,10125.0,0.0
July,0.0,0.0,1.0,5.0,11.0,19.0,23.0,75.0,86.0,167.0,...,290.0,310.0,295.0,221.0,240.0,274.0,272.0,259.0,298.0,0.0
August,0.0,0.0,0.0,7.0,9.0,16.0,23.0,88.0,103.0,154.0,...,344.0,325.0,422.0,333.0,335.0,329.0,287.0,282.0,479.0,0.0
September,1.0,1.0,1.0,9.0,27.0,44.0,57.0,287.0,316.0,568.0,...,3860.0,4639.0,4571.0,4290.0,3839.0,3678.0,4250.0,5067.0,39819.0,0.0
October,0.0,0.0,0.0,2.0,7.0,12.0,16.0,113.0,145.0,251.0,...,1331.0,1292.0,1129.0,1236.0,1399.0,1544.0,1478.0,1328.0,1127.0,0.0



Pivoted FYE month distribution table saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempExtractionofCharacteristics/Overview-Company_per_year_FYE_per_month_per_year.xlsx


#### Check for Multiple FYE Entries per Company per Year

In [89]:
# Summary:
# This cell verifies that the dataset containing the latest PIT DATE per (ID, FYE_Year)
# truly has only one row per company per year. It groups the data by ID and FYE_Year,
# counts the number of rows in each group, and checks whether any company appears
# more than once in a given year. If duplicates are found, examples are shown.

# Proceed only if the final per-year dataset exists
if 'fye_df_latest_per_year' in locals():

    # Group by company ID and FYE year, counting how many entries exist per group
    id_year_counts = (
        fye_df_latest_per_year
        .groupby(['ID', 'FYE_Year'])
        .size()
        .reset_index(name='Count')
    )

    # Identify groups where a company appears more than once in the same year
    multiple_entries = id_year_counts[id_year_counts['Count'] > 1]

    print("\n--- Check for multiple entries per ID per FYE Year ---")

    # Report whether duplicates exist
    if multiple_entries.empty:
        print("Each company (ID) appears only once per FYE Year in the fye_df_latest_per_year DataFrame.")
    else:
        print("Found IDs with multiple entries for the same FYE Year in the fye_df_latest_per_year DataFrame.")
        print("Examples of IDs with multiple entries:")
        display(multiple_entries.head())

# Handle missing dataset case
else:
    print("Error: fye_df_latest_per_year DataFrame not found. Ensure previous processing steps were executed.")



--- Check for multiple entries per ID per FYE Year ---
Each company (ID) appears only once per FYE Year in the fye_df_latest_per_year DataFrame.


#### Further Processing and Cleaning of Filtered FYE Data

In [90]:
# Summary:
# This cell loads a pre-filtered FYE dataset from disk, performs column-level cleanup,
# derives year and date/month information from the FYE variable, and outputs two cleaned files.
# Concretely, it:
# - Reads the input file containing FYE data into a DataFrame.
# - Optionally drops the "ItemCode" and "PIT DATE" columns if they are present.
# - Cleans and transforms the "FYE" column by removing a leading "D", parsing the
#   remaining value as a date, and extracting:
#     * the corresponding fiscal year (FY)
#     * the month name ("FYE Month") for FYE_clean
#     * the full date ("FYE Date") for FYED_clean
# - Removes the original "FYE" column once derived fields are created.
# - Sorts the resulting data by company identifier ("ID") and fiscal year ("FY")
#   for a chronological view per company.
# - Saves:
#     * FYE_clean.txt  – with FY and FYE Month
#     * FYED_clean.txt – with FY and FYE Date
# - Handles missing files and unexpected errors via try/except.

# Paths
input_path = f'{Temp_file_path_EoC}/filtered_FYE_v2.txt'   # Path to the pre-filtered FYE input file
output_path = f'{Temp_file_path_EoC}/FYE_clean.txt'        # Path where the cleaned FYE (month) output will be saved
output_path_fyed = f'{Temp_file_path_EoC}/FYED_clean.txt'  # Path where the cleaned FYE (full date) output will be saved

try:
    # Load the data from the input file into a DataFrame
    # All columns are read as strings to preserve formatting and avoid unintended type coercion
    df_fye_v2 = pd.read_csv(input_path, sep="|", dtype=str, encoding="utf-8")

    # Print basic information about the loaded dataset, including row count
    print(f"Successfully loaded data from {input_path}. Initial rows: {len(df_fye_v2):,}")
    print("\n=== Preview of loaded data ===")
    display(df_fye_v2.head())

    # Define which columns should be dropped from the dataset
    cols_to_drop = ["ItemCode", "PIT DATE"]

    # Identify which of the specified columns actually exist in the loaded DataFrame
    existing_cols_to_drop = [col for col in cols_to_drop if col in df_fye_v2.columns]

    # Drop only those columns that are present, and create a working copy of the DataFrame
    if existing_cols_to_drop:
        df_fye_base = df_fye_v2.drop(columns=existing_cols_to_drop, errors="ignore").copy()
        print(f"\nDropped columns: {existing_cols_to_drop}")
    else:
        # If none of the specified columns exist, simply continue with a copy of the original DataFrame
        df_fye_base = df_fye_v2.copy()
        print("\nNo columns to drop found among the specified list.")

    # Initialize the two output DataFrames to None
    df_fye_processed = None       # For FYE_clean (with FY + FYE Month)
    df_fyed_processed = None      # For FYED_clean (with FY + FYE Date)

    # Extract year, month, and full date information from the FYE column, if it exists
    if "FYE" in df_fye_base.columns:
        # Clean the 'FYE' column by:
        # - Converting to string
        # - Removing a leading 'D' (if present)
        # - Stripping surrounding whitespace
        # Then parse the resulting string as a date with format YYYYMMDD
        fye_dates = pd.to_datetime(
            df_fye_base["FYE"]
            .astype(str)
            .str.replace('D', '', regex=False)
            .str.strip(),
            format='%Y%m%d',
            errors='coerce'
        )

        # 1) DataFrame for FYE_clean: FY + FYE Month
        df_fye_processed = df_fye_base.copy()
        df_fye_processed["FY"] = fye_dates.dt.year
        df_fye_processed["FYE Month"] = fye_dates.dt.month_name().fillna("Invalid/Missing FYE Date")
        df_fye_processed = df_fye_processed.drop(columns=["FYE"], errors="ignore")

        # 2) DataFrame for FYED_clean: FY + full FYE Date
        df_fyed_processed = df_fye_base.copy()
        df_fyed_processed["FY"] = fye_dates.dt.year
        # Store the full date as a string YYYY-MM-DD; invalid dates become NaN
        df_fyed_processed["FYE Date"] = fye_dates.dt.strftime('%Y-%m-%d')
        df_fyed_processed = df_fyed_processed.drop(columns=["FYE"], errors="ignore")

        print("\nExtracted Year ('FY') and created:")
        print(" - 'FYE Month' for FYE_clean")
        print(" - 'FYE Date' for FYED_clean")
        print("Dropped original 'FYE' column in both outputs.")

    else:
        # If 'FYE' is not available, the date-based transformation cannot be performed
        print("\nWarning: 'FYE' column not found. Cannot extract year, month, or date.")
        # In this case, both processed DataFrames are simply the base DataFrame copies
        df_fye_processed = df_fye_base.copy()
        df_fyed_processed = df_fye_base.copy()

    # Sort both processed DataFrames by company ID and fiscal year if both columns are present
    if "ID" in df_fye_processed.columns and "FY" in df_fye_processed.columns:
        df_fye_processed = df_fye_processed.sort_values(
            by=["ID", "FY"],
            ascending=[True, True],
            kind="mergesort"  # Stable sort to preserve existing order within groups
        ).reset_index(drop=True)

        df_fyed_processed = df_fyed_processed.sort_values(
            by=["ID", "FY"],
            ascending=[True, True],
            kind="mergesort"
        ).reset_index(drop=True)

        print("\nSorted both FYE_clean and FYED_clean DataFrames by ID and FY.")
    else:
        print("\nWarning: 'ID' or 'FY' column not found. Cannot sort DataFrames.")

    # Show a preview of the transformed DataFrames after cleaning and sorting
    print("\n=== Preview of processed data for FYE_clean (FY + FYE Month) ===")
    display(df_fye_processed.head())

    print("\n=== Preview of processed data for FYED_clean (FY + FYE Date) ===")
    display(df_fyed_processed.head())

    # Save the processed DataFrames to the specified output files in pipe-separated format
    df_fye_processed.to_csv(output_path, sep="|", index=False, encoding="utf-8")
    print(f"\nProcessed data with FY and FYE Month saved to: {output_path}")

    df_fyed_processed.to_csv(output_path_fyed, sep="|", index=False, encoding="utf-8")
    print(f"Processed data with FY and full FYE Date saved to: {output_path_fyed}")

# Handle the case where the input file is missing
except FileNotFoundError:
    print(f"Error: Input file not found at {input_path}. Please ensure 'filtered_FYE_v2.txt' was created in the previous steps.")

# Handle any other unexpected errors during processing
except Exception as e:
    print(f"An error occurred during processing: {e}")


Successfully loaded data from /home/jovyan/work/hpool1/pseidel/test/Temp/TempExtractionofCharacteristics/filtered_FYE_v2.txt. Initial rows: 1,842,027

=== Preview of loaded data ===


Unnamed: 0,ID,PIT DATE,ItemCode,FYE
0,C00948205,2021-07-09,5350,D20181231
1,C00948205,2021-07-09,5350,D20191231
2,C00948205,2021-07-28,5350,D20200331
3,C00948205,2022-04-06,5350,D20211231
4,C00948205,2023-03-21,5350,D20221231



Dropped columns: ['ItemCode', 'PIT DATE']

Extracted Year ('FY') and created:
 - 'FYE Month' for FYE_clean
 - 'FYE Date' for FYED_clean
Dropped original 'FYE' column in both outputs.

Sorted both FYE_clean and FYED_clean DataFrames by ID and FY.

=== Preview of processed data for FYE_clean (FY + FYE Month) ===


Unnamed: 0,ID,FY,FYE Month
0,C00948205,2018,December
1,C00948205,2019,December
2,C00948205,2020,March
3,C00948205,2021,December
4,C00948205,2022,December



=== Preview of processed data for FYED_clean (FY + FYE Date) ===


Unnamed: 0,ID,FY,FYE Date
0,C00948205,2018,2018-12-31
1,C00948205,2019,2019-12-31
2,C00948205,2020,2020-03-31
3,C00948205,2021,2021-12-31
4,C00948205,2022,2022-12-31



Processed data with FY and FYE Month saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempExtractionofCharacteristics/FYE_clean.txt
Processed data with FY and full FYE Date saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempExtractionofCharacteristics/FYED_clean.txt


### Update Code

#### Import

In [91]:
# Summary:
# This cell loads an Update Code dataset from a pipe-separated file, validates that the
# expected "Value" column is present, and performs several cleaning and transformation steps.
# Specifically, it:
# - Reads the dataset with string-typed columns and confirms the header row structure.
# - Identifies and counts missing or invalid entries in the "Value" column, treating NaN,
#   empty strings, and specific placeholder values ("Ns", "N") as missing.
# - Cleans the "Value" column by removing a leading "S" (if present) and trimming whitespace.
# - Renames the cleaned "Value" column to "UpdateCode" to reflect its semantic meaning.
# - Saves the transformed dataset to a new file, preserving all other columns.
# The cell includes basic error handling for missing input files and unexpected issues.

# Paths
input_path = f'{Temp_file_path_EoC}/filtered_UpdateCode.txt'  # Input file containing raw Update Code data
output_path_v2 = f'{Temp_file_path_EoC}/filtered_UpdateCode_v2.txt'  # Output path for the cleaned and renamed data

# The column containing Update Code, will be renamed to "UpdateCode"
VALUE_COL_ORIGINAL = "Value"       # Name of the original column containing the raw update codes
VALUE_COL_RENAMED = "UpdateCode"   # Desired name for the cleaned update code column

try:
    # Load the data, expecting the first row to be the header
    df_updatecode = pd.read_csv(
        input_path,
        sep="|",           # Use pipe as the column separator
        encoding="utf-8",  # Assume UTF-8 encoding for the text file
        dtype=str,         # Read all columns as strings to avoid unintended type inference
        header=0,          # Explicitly specify that the first row contains column names
        keep_default_na=True  # Let pandas assign NaN to recognized missing value patterns
    )

    # Report successful load and basic row count for validation
    print(f"Successfully loaded data from {input_path}. Initial rows: {len(df_updatecode):,}")
    print("\n=== Preview of loaded data ===")
    display(df_updatecode.head())

    # Print columns read by pandas to verify header and column names
    print(f"Columns read by pandas: {df_updatecode.columns.tolist()}")

    # Ensure the original value column exists
    if VALUE_COL_ORIGINAL not in df_updatecode.columns:
        # If the expected "Value" column is missing, raise a clear error
        raise KeyError(
            f"Expected column '{VALUE_COL_ORIGINAL}' not found in DataFrame after reading. "
            f"Available columns: {df_updatecode.columns.tolist()}"
        )

    # --- Explicitly count missing values including "Ns" and "N" in the original value column ---
    # Normalize the original value column to stripped lowercase strings for missingness checks
    val_original = df_updatecode[VALUE_COL_ORIGINAL].astype(str).str.strip()
    # Build a mask that flags NaN, empty strings, and specific placeholder values as missing
    missing_mask_original = (
        val_original.isna()
        | (val_original == "")
        | val_original.str.lower().isin(["ns", "n"])
    )

    # Count how many entries in the original value column are considered missing
    missing_values_count = missing_mask_original.sum()
    print(f"\nNumber of missing values (NaN, empty, 'Ns', 'N') in '{VALUE_COL_ORIGINAL}': {missing_values_count:,}")

    # Remove the leading "S" from the original Value column
    df_updatecode[VALUE_COL_ORIGINAL] = (
        df_updatecode[VALUE_COL_ORIGINAL]
        .astype(str)                         # Ensure string type for consistent string operations
        .str.replace(r"^S", "", regex=True)  # Remove a leading "S" character using a regex pattern
        .str.strip()                         # Strip leading and trailing whitespace
    )

    print(f"\n--- After removing leading 'S' from '{VALUE_COL_ORIGINAL}' column ---")
    display(df_updatecode.head())

    # --- Rename the original 'Value' column to 'UpdateCode' ---
    df_updatecode = df_updatecode.rename(columns={VALUE_COL_ORIGINAL: VALUE_COL_RENAMED})
    print(f"\nRenamed '{VALUE_COL_ORIGINAL}' column to '{VALUE_COL_RENAMED}'.")

    print("\n=== Preview after renaming column ===")
    display(df_updatecode.head())

    # Save the resulting dataset as v2 (header=True by default)
    df_updatecode.to_csv(output_path_v2, sep="|", index=False)

    # Confirm where the processed file has been written
    print(f"\nProcessed data saved to: {output_path_v2}")

# Handle case where the input file cannot be found at the specified path
except FileNotFoundError:
    print(
        f"Error: Input file not found at {input_path}. "
        f"Please ensure the file was created in the previous steps."
    )

# Catch any other unexpected exceptions and report them
except Exception as e:
    print(f"An unexpected error occurred during processing: {e}")


Successfully loaded data from /home/jovyan/work/hpool1/pseidel/test/Temp/TempExtractionofCharacteristics/filtered_UpdateCode.txt. Initial rows: 8,418,018

=== Preview of loaded data ===


Unnamed: 0,ID,PIT Date,Frequency,FiscalPeriod,ItemCode,Value
0,C02500770,1995-12-29,A,1985,57034,S3
1,C02500770,1995-12-29,A,1986,57034,S3
2,C02500770,1995-12-29,A,1987,57034,S3
3,C02500770,1995-12-29,A,1988,57034,S3
4,C02500770,1995-12-29,A,1989,57034,S3


Columns read by pandas: ['ID', 'PIT Date', 'Frequency', 'FiscalPeriod', 'ItemCode', 'Value']

Number of missing values (NaN, empty, 'Ns', 'N') in 'Value': 2,873

--- After removing leading 'S' from 'Value' column ---


Unnamed: 0,ID,PIT Date,Frequency,FiscalPeriod,ItemCode,Value
0,C02500770,1995-12-29,A,1985,57034,3
1,C02500770,1995-12-29,A,1986,57034,3
2,C02500770,1995-12-29,A,1987,57034,3
3,C02500770,1995-12-29,A,1988,57034,3
4,C02500770,1995-12-29,A,1989,57034,3



Renamed 'Value' column to 'UpdateCode'.

=== Preview after renaming column ===


Unnamed: 0,ID,PIT Date,Frequency,FiscalPeriod,ItemCode,UpdateCode
0,C02500770,1995-12-29,A,1985,57034,3
1,C02500770,1995-12-29,A,1986,57034,3
2,C02500770,1995-12-29,A,1987,57034,3
3,C02500770,1995-12-29,A,1988,57034,3
4,C02500770,1995-12-29,A,1989,57034,3



Processed data saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempExtractionofCharacteristics/filtered_UpdateCode_v2.txt


#### Filtering Empty Values and Final Cleaning of Update Code Data

In [92]:
# Summary:
# This cell loads a preprocessed Update Code dataset from disk, validates that the
# expected "UpdateCode" column is present, and then performs a series of cleaning
# and filtering steps before saving a final, cleaned version:
# - It identifies and removes rows where the UpdateCode value is missing, empty,
#   or represents missingness via special codes such as "Ns" or "N".
# - It drops the "ItemCode" column (if present), as it is not needed for the final
#   output structure.
# - It preserves all other columns and writes the cleaned, filtered data to a new
#   pipe-separated text file.
# The focus is on ensuring that only valid, non-missing UpdateCode entries remain
# and that the resulting file is ready for downstream analysis or merging.

# Paths
input_path = f'{Temp_file_path_EoC}/filtered_UpdateCode_v2.txt'  # Path to the intermediate UpdateCode file
output_path = f'{Temp_file_path_EoC}/UpdateCodes_clean.txt'      # Path for the final cleaned UpdateCode output

# Assuming the v2 file has a header row and the column is named 'UpdateCode'

try:
    # Load the v2 data as a pipe-separated text file.
    # The first row is treated as a header and all columns are read as strings.
    df_updatecode_v2 = pd.read_csv(
        input_path,
        sep="|",
        encoding="utf-8",
        dtype=str,
        header=0,          # Interpret the first row as column names
        keep_default_na=True  # Use pandas' default NA parsing for standard missing markers
    )

    # Report successful load and show basic information about the dataset size
    print(f"Successfully loaded data from {input_path}. Initial rows: {len(df_updatecode_v2):,}")

    # Display a preview of the loaded data for visual inspection of structure and values
    print("\n=== Preview of loaded data ===")
    display(df_updatecode_v2.head())

    # Ensure the 'UpdateCode' column exists before continuing with transformations
    VALUE_COL_CLEANED = "UpdateCode"
    if VALUE_COL_CLEANED not in df_updatecode_v2.columns:
         raise KeyError(
             f"Expected column '{VALUE_COL_CLEANED}' not found in DataFrame. "
             f"Available columns: {df_updatecode_v2.columns.tolist()}"
         )

    print("\n--- Filtering empty/missing values, removing 'ItemCode' column, and saving ---")

    # Standardize the UpdateCode values as strings and strip leading/trailing whitespace
    val_cleaned = df_updatecode_v2[VALUE_COL_CLEANED].astype(str).str.strip()

    # Build a mask for rows where UpdateCode is considered missing:
    # - NaN values
    # - Empty strings
    # - Special codes "Ns" or "N" in a case-insensitive manner
    missing_mask_cleaned = (
        val_cleaned.isna()
        | (val_cleaned == "")
        | val_cleaned.str.lower().isin(["ns", "n"])
    )

    # Filter out rows with missing UpdateCode by inverting the missing mask
    initial_rows = len(df_updatecode_v2)
    df_updatecode_filtered = df_updatecode_v2[~missing_mask_cleaned].copy()
    rows_after_filtering = len(df_updatecode_filtered)

    # Report how many rows remain after removing missing or invalid UpdateCode entries
    print(f"Initial rows: {initial_rows:,}")
    print(f"Rows after filtering missing values: {rows_after_filtering:,}")

    # Drop the ItemCode column, if present, since it is not needed in the final output
    if "ItemCode" in df_updatecode_filtered.columns:
        df_updatecode_clean = df_updatecode_filtered.drop(columns=["ItemCode"], errors="ignore").copy()
        print("Dropped 'ItemCode' column.")
    else:
        # If ItemCode does not exist, just propagate the filtered DataFrame unchanged
        df_updatecode_clean = df_updatecode_filtered.copy()
        print("Warning: 'ItemCode' column not found to drop.")

    # Show a small sample of the cleaned data to verify the structure and content
    print("\n=== Preview of cleaned data ===")
    display(df_updatecode_clean.head())

    # Save the cleaned DataFrame as a pipe-separated text file without an index
    df_updatecode_clean.to_csv(output_path, sep="|", index=False, encoding="utf-8")

    # Confirm successful save and report the output path
    print(f"\nCleaned Update Code data saved to: {output_path}")

# Handle the case where the expected input file does not exist
except FileNotFoundError:
    print(
        f"Error: Input file not found at {input_path}. "
        f"Please ensure the previous processing steps were executed successfully."
    )

# Handle any other unexpected exceptions and print the error message
except Exception as e:
    print(f"An error occurred during processing: {e}")


Successfully loaded data from /home/jovyan/work/hpool1/pseidel/test/Temp/TempExtractionofCharacteristics/filtered_UpdateCode_v2.txt. Initial rows: 8,418,018

=== Preview of loaded data ===


Unnamed: 0,ID,PIT Date,Frequency,FiscalPeriod,ItemCode,UpdateCode
0,C02500770,1995-12-29,A,1985,57034,3
1,C02500770,1995-12-29,A,1986,57034,3
2,C02500770,1995-12-29,A,1987,57034,3
3,C02500770,1995-12-29,A,1988,57034,3
4,C02500770,1995-12-29,A,1989,57034,3



--- Filtering empty/missing values, removing 'ItemCode' column, and saving ---
Initial rows: 8,418,018
Rows after filtering missing values: 8,415,145
Dropped 'ItemCode' column.

=== Preview of cleaned data ===


Unnamed: 0,ID,PIT Date,Frequency,FiscalPeriod,UpdateCode
0,C02500770,1995-12-29,A,1985,3
1,C02500770,1995-12-29,A,1986,3
2,C02500770,1995-12-29,A,1987,3
3,C02500770,1995-12-29,A,1988,3
4,C02500770,1995-12-29,A,1989,3



Cleaned Update Code data saved to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempExtractionofCharacteristics/UpdateCodes_clean.txt
