### Mount Google Drive, Import Libraries and Define Paths

In [93]:
# =============================================================================
# ENVIRONMENT SETUP + PATH CONFIGURATION (SERVER / COLAB COMPATIBLE)
# =============================================================================

import os
import sys
import importlib
from pathlib import Path
import string
import re
import gc

# -----------------------------------------------------------------------------
# 0) HARD SAFETY: cap native thread usage (prevents pthread_create EAGAIN)
#    MUST be set before importing numpy / scipy / pandas
# -----------------------------------------------------------------------------
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["OPENBLAS_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["NUMEXPR_MAX_THREADS"] = "1"
os.environ["VECLIB_MAXIMUM_THREADS"] = "1"
os.environ["BLIS_NUM_THREADS"] = "1"

# -----------------------------------------------------------------------------
# 1) Detect environment
# -----------------------------------------------------------------------------
IN_COLAB = "google.colab" in sys.modules

# -----------------------------------------------------------------------------
# 2) (Colab only) Mount Google Drive
# -----------------------------------------------------------------------------
if IN_COLAB:
    from google.colab import drive
    drive.mount("/content/drive")
    BASE_PATH = "/content/drive/MyDrive/Colab Notebooks"
else:
    # Server base path (your target)
    BASE_PATH = "/home/jovyan/work/hpool1/pseidel/test"

print("IN_COLAB:", IN_COLAB)
print("BASE_PATH:", BASE_PATH)

# -----------------------------------------------------------------------------
# 3) Sanity checks: path exists + write permission
# -----------------------------------------------------------------------------
BASE = Path(BASE_PATH)
if not BASE.exists():
    raise FileNotFoundError(f"BASE_PATH does not exist: {BASE}")

# quick write test (fails fast if you don't have permissions)
test_file = BASE / ".write_test_tmp"
try:
    test_file.write_text("ok", encoding="utf-8")
    test_file.unlink()
except Exception as e:
    raise PermissionError(f"No write permission in {BASE}. Error: {e}")

# -----------------------------------------------------------------------------
# 4) Environment check: ensure required packages import cleanly
# -----------------------------------------------------------------------------
required_packages = ["numpy", "scipy", "pandas", "linearmodels", "xlsxwriter"]

for pkg in required_packages:
    print(f"Importing {pkg} ...")
    importlib.import_module(pkg)
    print(f"{pkg} OK")

# -----------------------------------------------------------------------------
# 5) Base paths and input/output locations
# -----------------------------------------------------------------------------
Input_file_path   = str(BASE / "Input")
Temp_file_path    = str(BASE / "Temp")
Output_file_path  = str(BASE / "Output")

Fundamentals_file_path = f"{Input_file_path}/WSFV_f_20250131.txt"
Current_file_path      = f"{Input_file_path}/WSCurrent_f_20250131.txt"
Calendar_file_path     = f"{Input_file_path}/WSCalendarPrd_f_20250131.txt"
Meta_file_path         = f"{Input_file_path}/WSMetaData_f_20250131.txt"
Excel_file_path        = f"{Input_file_path}/WS PIT Table Definitions V5 with start dates.xls"

MarketValues_file_path          = f"{Input_file_path}/Daily MV USD"
MarketValues_file_path_LC       = f"{Input_file_path}/Daily MV LC"
DailyTotalReturns_file_path     = f"{Input_file_path}/Daily Returns USD"
DailyIndexReturns_file_path     = f"{Input_file_path}/Daily Index Returns USD"
Constituents_file_path          = f"{Input_file_path}/Constituents.01.csv"
UniversalMatching_file_path     = f"{Input_file_path}/Universal Matching File"

Temp_file_path_GO  = f"{Temp_file_path}/TempGeneralOverview"
Temp_file_path_EoC = f"{Temp_file_path}/TempExtractionofCharacteristics"
Temp_file_path_DP  = f"{Temp_file_path}/TempDataPreparation"
Temp_file_path_A   = f"{Temp_file_path}/TempAnomalies"
Temp_file_path_R   = f"{Temp_file_path}/TempRegressionModel"

Relevant_items_path   = f"{Input_file_path}/RelevantItems.txt"
Relevant_items_path_A = f"{Input_file_path}/RelevantItems.txt"
Relevant_items_path_B = f"{Input_file_path}/RelevantItemsB.txt"
Relevant_items_path_C = f"{Input_file_path}/RelevantItemsC.txt"
Relevant_items_path_D = f"{Input_file_path}/RelevantItemsD.txt"

Subset_file_path = f"{Temp_file_path_GO}/Subsets"
Fundamentals_clean_file_path = f"{Temp_file_path_GO}/Fundamentals_clean.txt"
Current_clean_file_path      = f"{Temp_file_path_GO}/Current_clean.txt"
Calendar_clean_file_path     = f"{Temp_file_path_GO}/Input/Calendar_clean.txt"
Meta_clean_file_path         = f"{Temp_file_path_GO}/Input/Meta_clean.txt"

# -----------------------------------------------------------------------------
# 6) Ensure required directories exist
# -----------------------------------------------------------------------------
Path(Output_file_path).mkdir(parents=True, exist_ok=True)
Path(Temp_file_path_GO).mkdir(parents=True, exist_ok=True)
Path(Temp_file_path_EoC).mkdir(parents=True, exist_ok=True)
Path(Temp_file_path_DP).mkdir(parents=True, exist_ok=True)
Path(Temp_file_path_A).mkdir(parents=True, exist_ok=True)
Path(Temp_file_path_R).mkdir(parents=True, exist_ok=True)
Path(Subset_file_path).mkdir(parents=True, exist_ok=True)
Path(Path(Calendar_clean_file_path).parent).mkdir(parents=True, exist_ok=True)

# -----------------------------------------------------------------------------
# 7) Streaming / deduplication settings
# -----------------------------------------------------------------------------
CHUNK_SIZE = 2_000_000
DATE_COL = "PIT Date"
DEDUP_KEYS = ["ID", "ItemCode", DATE_COL]

print("Paths configured. Temp outputs ->", Temp_file_path_GO)
print("Example input path ->", Fundamentals_file_path)


IN_COLAB: False
BASE_PATH: /home/jovyan/work/hpool1/pseidel/test
Importing numpy ...
numpy OK
Importing scipy ...
scipy OK
Importing pandas ...
pandas OK
Importing linearmodels ...
linearmodels OK
Importing xlsxwriter ...
xlsxwriter OK
Paths configured. Temp outputs -> /home/jovyan/work/hpool1/pseidel/test/Temp/TempGeneralOverview
Example input path -> /home/jovyan/work/hpool1/pseidel/test/Input/WSFV_f_20250131.txt


In [94]:
!free -h

               total        used        free      shared  buff/cache   available
Mem:           754Gi       196Gi       292Gi        55Mi       274Gi       558Gi
Swap:             0B          0B          0B


# 1.0. Import Data for Characteristics Dataset

### Import Data Files to DataFrames

In [95]:
# =====================================================================================
# SUMMARY
# =====================================================================================
# This cell:
#
#   1. Defines a helper function `import_file_to_dataframe` that reads a pipe-delimited
#      text file into a pandas DataFrame (all columns as string; returns None on error).
#   2. Imports a list of "input" files from Input_file_path into DataFrames
#      (RelevantItems, CountryCodes, ...), storing them in globals() by filename.
#   3. Imports a list of "temp" files from Temp_file_path_EoC into DataFrames
#      (ADR_clean, CompanyName_clean, CurrencyCodes_clean, FYE_clean, ID_clean,
#       UpdateCodes_clean, ValueCoding), also stored in globals().
#   4. Identifies which subset_*.txt files exist in Subset_file_path based on the IDs
#      listed in RelevantItems.txt, and records their names (without .txt) in
#      `successful_subset_names`.
#
# No actual subset data is loaded here; that is deferred to later steps to keep
# memory usage under control.


# Function to import a file and return a pandas DataFrame
def import_file_to_dataframe(file_path):
    """
    Import a pipe-separated text file as a pandas DataFrame.

    - Uses sep='|' to read pipe-delimited files.
    - Reads all columns as strings (dtype=str), which helps preserve things like
      leading zeros in codes (e.g., NatCo, ItemCode).
    - Returns None on failure and prints a brief error message.
    """
    try:
        df = pd.read_csv(file_path, sep='|', dtype=str)
        return df
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        return None
    except Exception as e:
        print(f"Error importing file {file_path}: {e}")
        return None


# -------------------------------------------------------------------------
# Import files from Input directory
# -------------------------------------------------------------------------
input_files_to_import = ["RelevantItems.txt", "CountryCodes.txt"]

for file_name in input_files_to_import:
    file_path = os.path.join(Input_file_path, file_name)
    var_name = file_name.replace(".txt", "")  # e.g. "RelevantItems"
    globals()[var_name] = import_file_to_dataframe(file_path)

    if globals()[var_name] is not None:
        print(f"\nImported {file_name} as DataFrame '{var_name}'")
        print(f"Preview of '{var_name}':")
        print(globals()[var_name].head(), "\n")


# -------------------------------------------------------------------------
# Import files from Temp directory (end-of-cleaning stage)
# -------------------------------------------------------------------------
temp_files_to_import = [
    "ADR_clean.txt",
    "CompanyName_clean.txt",
    "CurrencyCodes_clean.txt",
    "FYE_clean.txt",
    "ID_clean.txt",
    "UpdateCodes_clean.txt",
    "ValueCoding.txt"
]

for file_name in temp_files_to_import:
    file_path = os.path.join(Temp_file_path_EoC, file_name)
    var_name = file_name.replace(".txt", "")  # e.g. "ADR_clean"
    globals()[var_name] = import_file_to_dataframe(file_path)

    if globals()[var_name] is not None:
        print(f"\nImported {file_name} as DataFrame '{var_name}'")
        print(f"Preview of '{var_name}':")
        print(globals()[var_name].head(), "\n")


# -------------------------------------------------------------------------
# Identify subset files that exist for the relevant items
# -------------------------------------------------------------------------
successful_subset_names = []

if 'RelevantItems' in globals() and RelevantItems is not None:
    # Assume first column of RelevantItems holds the item IDs used in subset filenames
    relevant_ids = RelevantItems.iloc[:, 0].astype(str).tolist()

    print("\nIdentifying subset files to process...")
    for item_id in relevant_ids:
        file_name = f"subset_{item_id}.txt"
        file_path = os.path.join(Subset_file_path, file_name)

        # Check the existence of each candidate subset file
        if os.path.exists(file_path):
            successful_subset_names.append(f"subset_{item_id}")
            print(f"  Found {file_name}")
        else:
            print(f"  File not found: {file_name}. Skipping.")

    print(f"\nIdentified {len(successful_subset_names)} subset files for processing.")
else:
    print("RelevantItems DataFrame not found or is empty. Cannot identify subset files.")

# Note: actual loading and processing of subset files happens later, in
# batch-based steps, to manage memory usage.



Imported RelevantItems.txt as DataFrame 'RelevantItems'
Preview of 'RelevantItems':
  ItemCode
0    01001
1    01051
2    01075
3    01101
4    01151 


Imported CountryCodes.txt as DataFrame 'CountryCodes'
Preview of 'CountryCodes':
  NatCo ImplCountry
0   012     Algeria
1   440   Lithuania
2   025   Argentina
3   442  Luxembourg
4   036   Australia 


Imported ADR_clean.txt as DataFrame 'ADR_clean'
Preview of 'ADR_clean':
          ID ADRIndicator
0  C036F63D0            N
1  C056879S0            X
2  C2461T100            N
3  C2504O500            N
4  C250C9180            N 


Imported CompanyName_clean.txt as DataFrame 'CompanyName_clean'
Preview of 'CompanyName_clean':
          ID                               CompanyName
0  C00948205             AGRIFORCE GROWING SYSTEMS LTD
1  C02500770            PEUGEOT CITROEN ARGENTINA S.A.
2  C02520200  ACINDAR INDUSTRIA ARGENTINA DE ACEROS SA
3  C02520220                       ALPARGATAS S.A.I.C.
4  C02520230               ALUAR ALUMINI

### Check Unique IDs in DataFrames (Takes long, just Overview)

In [96]:
# =====================================================================================
# SUMMARY
# =====================================================================================
# This diagnostic cell is intended for emergency checks only.
#
# It does two things:
#   1) For a set of key DataFrames (CompanyName_clean, CurrencyCodes_clean, FYE_clean,
#      ID_clean, UpdateCodes_clean, ADR_clean, RelevantItems, ValueCoding), it prints
#      how many unique IDs each one contains.
#   2) For each subset_X file listed in `successful_subset_names`, it loads the file,
#      counts the number of unique IDs, prints that count, and then immediately frees
#      the DataFrame from memory.
#
# The goal is to compare ID coverage across all relevant datasets and subsets while
# keeping memory usage low by only loading one subset at a time.

# Nur im Notfall laufen lassen
# (Only run this for debugging / sanity checks; it can be slow on large datasets.)

# List of DataFrames to check for unique IDs
dataframes_to_check = {
    'CompanyName_clean': CompanyName_clean,
    'CurrencyCodes_clean': CurrencyCodes_clean,
    'FYE_clean': FYE_clean,
    'ID_clean': ID_clean,
    'UpdateCodes_clean': UpdateCodes_clean,
    'ADR_clean': ADR_clean,
    'RelevantItems': RelevantItems,
    'ValueCoding': ValueCoding,
}

print("Number of unique IDs in each relevant DataFrame:")

for df_name, df in dataframes_to_check.items():
    if df is not None:
        # If an 'ID' column exists, use it; otherwise fall back to the first column
        id_column = 'ID' if 'ID' in df.columns else df.columns[0]
        if id_column in df.columns:
            unique_ids_count = df[id_column].nunique()
            print(f"- {df_name}: {unique_ids_count}")
        else:
            print(f"- {df_name}: Does not contain an 'ID' column or similar.")
    else:
        print(f"- {df_name}: DataFrame is None (not imported or empty).")

print("\nNumber of unique IDs in each subset DataFrame:")

# Process subset files one by one to get summary statistics and manage memory
if 'successful_subset_names' in globals() and successful_subset_names:
    for subset_name in successful_subset_names:
        # Derive raw item_id from subset_X name
        item_id = subset_name.replace("subset_", "")
        file_name = f"subset_{item_id}.txt"
        file_path = os.path.join(Subset_file_path, file_name)

        # Import the subset file
        subset_df = import_file_to_dataframe(file_path)

        if subset_df is not None:
            # Again, use 'ID' if present, otherwise the first column
            id_column = 'ID' if 'ID' in subset_df.columns else subset_df.columns[0]
            if id_column in subset_df.columns:
                unique_ids_count = subset_df[id_column].nunique()
                print(f"- {subset_name}: {unique_ids_count}")
            else:
                print(f"- {subset_name}: Does not contain an 'ID' column or similar.")

            # Explicitly delete the subset_df to free memory
            del subset_df
            gc.collect()
        else:
            print(f"- {subset_name}: Could not be imported.")
else:
    print("No successful subset file names found to process.")


Number of unique IDs in each relevant DataFrame:
- CompanyName_clean: 104061
- CurrencyCodes_clean: 121007
- FYE_clean: 120948
- ID_clean: 55367
- UpdateCodes_clean: 111895
- ADR_clean: 2297
- RelevantItems: 49
- ValueCoding: 477

Number of unique IDs in each subset DataFrame:
- subset_01001: 103974
- subset_01051: 95686
- subset_01075: 31347
- subset_01101: 93284
- subset_01151: 99264
- subset_01250: 103792
- subset_01451: 103210
- subset_01551: 104010
- subset_01706: 104003
- subset_02001: 98243
- subset_02051: 97864
- subset_02101: 97742
- subset_02149: 90304
- subset_02201: 90853
- subset_02250: 96558
- subset_02256: 101294
- subset_02257: 2367
- subset_02258: 82485
- subset_02263: 3934
- subset_02501: 101374
- subset_02652: 103894
- subset_02999: 103985
- subset_03040: 88043
- subset_03051: 103339
- subset_03063: 70004
- subset_03066: 89836
- subset_03101: 90852
- subset_03251: 103886
- subset_03263: 90934
- subset_03273: 103941
- subset_03351: 103982
- subset_03426: 103286
- subs

# 2.0. Create Characteristics DF

### Initialize Characteristics DataFrame

In [97]:
# =====================================================================================
# SUMMARY
# =====================================================================================
# This block initializes a characteristics_df DataFrame that will hold core
# company-level attributes.
#
# Steps:
#   1. Check that ID_clean and CompanyName_clean exist, are not None, and are non-empty.
#   2. Normalize the 'ID' column in both DataFrames to string (for safe merging).
#   3. Build characteristics_df from the unique IDs in ID_clean.
#   4. Merge the corresponding 'CompanyName' from CompanyName_clean onto characteristics_df.
#   5. Print basic info (row count) and show a sample.
#   6. If prerequisites are missing, create an empty characteristics_df instead.
#   7. Trigger garbage collection at the end.


if (
    'ID_clean' in globals() and ID_clean is not None and not ID_clean.empty and
    'CompanyName_clean' in globals() and CompanyName_clean is not None and not CompanyName_clean.empty
):
    print("Initializing characteristics_df with ID_clean and merging CompanyName_clean...")

    # Ensure IDs are comparable between ID_clean and CompanyName_clean
    ID_clean['ID'] = ID_clean['ID'].astype(str)
    CompanyName_clean['ID'] = CompanyName_clean['ID'].astype(str)

    # Start from unique IDs only to avoid duplicates in the base characteristics table
    characteristics_df = ID_clean[['ID']].drop_duplicates().copy()

    # Merge CompanyName from CompanyName_clean onto characteristics_df by ID
    characteristics_df = characteristics_df.merge(
        CompanyName_clean[['ID', 'CompanyName']].drop_duplicates(),
        on='ID',
        how='left'
    )

    # Column already named 'CompanyName'; rename is effectively a no-op but kept for clarity
    characteristics_df.rename(columns={'CompanyName': 'CompanyName'}, inplace=True)

    print(f"Initial characteristics_df created with {len(characteristics_df):,} rows.")
    display(characteristics_df.head())

else:
    # If either ID_clean or CompanyName_clean is missing or empty, initialize an empty DataFrame
    print("ID_clean or CompanyName_clean not found or empty. Cannot initialize characteristics_df.")
    characteristics_df = pd.DataFrame()

# Run garbage collection to clean up any unused objects
gc.collect()


Initializing characteristics_df with ID_clean and merging CompanyName_clean...
Initial characteristics_df created with 55,367 rows.


Unnamed: 0,ID,CompanyName
0,C02500770,PEUGEOT CITROEN ARGENTINA S.A.
1,C02520200,ACINDAR INDUSTRIA ARGENTINA DE ACEROS SA
2,C02520220,ALPARGATAS S.A.I.C.
3,C02520230,ALUAR ALUMINIO ARGENTINO SA
4,C02520240,ASTRA COMPANIA ARGENTINA DE PETROLEO SA


0

### Extract and Merge Implied Country

In [98]:
# Extract the 2nd to 4th digit from the 'ID' column and create 'ImplNatCo'
characteristics_df['ImplNatCo'] = characteristics_df['ID'].str[1:4]

# Convert 'NatCo' in CountryCodes to string type to ensure consistent data types for merging
CountryCodes['NatCo'] = CountryCodes['NatCo'].astype(str)

# Merge characteristics_df with CountryCodes
characteristics_df = pd.merge(characteristics_df, CountryCodes, left_on='ImplNatCo', right_on='NatCo', how='left')

# Drop the redundant 'NatCo' column after merging
characteristics_df.drop('NatCo', axis=1, inplace=True)

# Display the first few rows of the updated DataFrame
display(characteristics_df.head())

# Check for empty values per column
empty_values_per_column = characteristics_df.isnull().sum()

# Display the results
print("Number of empty values per column:")
display(empty_values_per_column)

# Print the number of unique IDs in characteristics_df
print(f"\nNumber of unique IDs in characteristics_df: {characteristics_df['ID'].nunique()}")

Unnamed: 0,ID,CompanyName,ImplNatCo,ImplCountry
0,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,25,Argentina
1,C02520200,ACINDAR INDUSTRIA ARGENTINA DE ACEROS SA,25,Argentina
2,C02520220,ALPARGATAS S.A.I.C.,25,Argentina
3,C02520230,ALUAR ALUMINIO ARGENTINO SA,25,Argentina
4,C02520240,ASTRA COMPANIA ARGENTINA DE PETROLEO SA,25,Argentina


Number of empty values per column:


ID             0
CompanyName    0
ImplNatCo      0
ImplCountry    0
dtype: int64


Number of unique IDs in characteristics_df: 55367


### Merge Current Currency Data

In [99]:
# =====================================================================================
# SUMMARY
# =====================================================================================
# This block enriches characteristics_df by adding each entity's CurrentCurrency
# from CurrencyCodes_clean.
#
# Steps:
#   1. Select only the necessary columns (ID, CurrentCurrency) from CurrencyCodes_clean.
#   2. Perform a left-merge onto characteristics_df using ID as the join key.
#   3. Display a preview of the updated characteristics_df.
#   4. Count how many missing (NaN) values appear in each column after the merge.
#   5. Report the number of unique IDs present.


# Extract the minimal set of columns needed for the merge
currency_subset = CurrencyCodes_clean[['ID', 'CurrentCurrency']].copy()

# Merge currency data onto characteristics_df by ID
characteristics_df = pd.merge(characteristics_df, currency_subset, on='ID', how='left')

# Show a sample of the enriched DataFrame
display(characteristics_df.head())

# Count missing values across all columns
empty_values_per_column = characteristics_df.isnull().sum()

print("Number of empty values per column:")
display(empty_values_per_column)

# Report the number of unique IDs
print(f"\nNumber of unique IDs in characteristics_df: {characteristics_df['ID'].nunique()}")


Unnamed: 0,ID,CompanyName,ImplNatCo,ImplCountry,CurrentCurrency
0,C02500770,PEUGEOT CITROEN ARGENTINA S.A.,25,Argentina,Ars
1,C02520200,ACINDAR INDUSTRIA ARGENTINA DE ACEROS SA,25,Argentina,Ars
2,C02520220,ALPARGATAS S.A.I.C.,25,Argentina,Ars
3,C02520230,ALUAR ALUMINIO ARGENTINO SA,25,Argentina,Ars
4,C02520240,ASTRA COMPANIA ARGENTINA DE PETROLEO SA,25,Argentina,Ars


Number of empty values per column:


ID                 0
CompanyName        0
ImplNatCo          0
ImplCountry        0
CurrentCurrency    0
dtype: int64


Number of unique IDs in characteristics_df: 55367


### Save Cleaned Characteristics DataFrame

In [100]:
# =====================================================================================
# SUMMARY
# =====================================================================================
# This block writes the finalized characteristics_df to disk as
# Characteristics_clean.txt.
#
# Steps:
#   1. Construct the output file path inside Temp_file_path_DP.
#   2. Remove the ImplNatCo column before saving.
#   3. Save the cleaned DataFrame as a pipe-delimited text file.
#   4. Confirm completion.


# Define the output path for the characteristics file
output_file = os.path.join(Temp_file_path_DP, "Characteristics_clean.txt")

# Remove ImplNatCo prior to saving (ADRIndicator was already absent or removed)
characteristics_df_to_save = characteristics_df.drop(columns=['ImplNatCo'])

# Save characteristics_df as a pipe-delimited text file
characteristics_df_to_save.to_csv(output_file, sep='|', index=False)

print(f"DataFrame saved successfully to {output_file}")


DataFrame saved successfully to /home/jovyan/work/hpool1/pseidel/test/Temp/TempDataPreparation/Characteristics_clean.txt



# 3.0. Preparation of Main Data

## Worldscope PIT

### Merge Subset Data with Characteristics (Batch Processing) and Removal of Rows (e.g. ADR/Security Rows Which are still in DFs but not in ID_clean)

In [101]:
# =====================================================================================
# SUMMARY
# =====================================================================================
# This cell merges many subset_*.txt files with the master characteristics_clean_df.
# Imports and library loading have been intentionally removed (you said you will add
# them at the beginning).
#
# The workflow:
#   1. Load Characteristics_clean.txt into characteristics_clean_df.
#   2. Determine which subset files should be processed (successful_subset_names).
#   3. Extract valid IDs from characteristics_clean_df to filter subset files.
#   4. Process subset files in batches to avoid high memory usage:
#         - Read subset file from disk
#         - Filter rows by valid IDs
#         - Merge with characteristics_clean_df (left merge on "ID")
#         - Reorder columns so characteristics appear first
#         - Save result as work_subset_<name>.txt in Temp_file_path_DP
#         - Delete temporary DataFrames to free memory
#   5. Print progress messages and summary at the end.
#
# No merged DataFrames are kept in memory — everything is streamed batch-wise.


# Iterate and merge for all subset DataFrames in batches to manage memory
merged_subset_files = {}
batch_size = 10  # Adjust batch size as needed

# Path of Characteristics_clean dataset
characteristics_clean_file_path = os.path.join(Temp_file_path_DP, "Characteristics_clean.txt")

# Load Characteristics_clean.txt
characteristics_clean_df = import_file_to_dataframe(characteristics_clean_file_path)

if characteristics_clean_df is not None and not characteristics_clean_df.empty:
    print(f"Loaded characteristics data from {characteristics_clean_file_path}")

    # Save list of characteristics columns so we can reorder merged files later
    characteristics_cols = characteristics_clean_df.columns.tolist()

    # Retrieve set of subset names from earlier processing
    if 'successful_subset_names' in globals() and successful_subset_names:
        subset_names_to_process = successful_subset_names
        print(f"Identified {len(subset_names_to_process)} subset files to process.")
    else:
        subset_names_to_process = []
        print("No successful subset file names found to process.")

    # Ensure target directory exists
    os.makedirs(Temp_file_path_DP, exist_ok=True)

    # Extract the list of valid IDs from the characteristics file
    valid_ids = []
    if 'ID' in characteristics_clean_df.columns:
        valid_ids = characteristics_clean_df['ID'].tolist()
        print(f"\nFiltering subset files to include only IDs present in characteristics_clean_df ({len(valid_ids)} valid IDs).")
    else:
        print("\ncharacteristics_clean_df has no 'ID' column. Skipping ID filtering.")

    # ----------------------------------------------------------------------
    # Process subset files in batches
    # ----------------------------------------------------------------------
    for i in range(0, len(subset_names_to_process), batch_size):
        batch_names = subset_names_to_process[i:i + batch_size]
        print(f"Processing batch {i//batch_size + 1} of subsets: {batch_names}")

        for subset_name in batch_names:

            # Reconstruct file name on disk
            item_id = subset_name.replace("subset_", "")
            file_name = f"subset_{item_id}.txt"
            file_path = os.path.join(Subset_file_path, file_name)

            print(f"  Reading {file_name} from disk...")
            subset_df = import_file_to_dataframe(file_path)

            if subset_df is not None:
                print(f"  Processing {subset_name} (shape: {subset_df.shape})")

                # ---------------------------
                # Apply ID filtering
                # ---------------------------
                if valid_ids and 'ID' in subset_df.columns:
                    initial_rows = len(subset_df)
                    subset_df = subset_df[subset_df['ID'].isin(valid_ids)].copy()
                    removed = initial_rows - len(subset_df)
                    print(f"    Filtered {subset_name}: {removed} rows removed.")
                elif not valid_ids:
                    print(f"    No valid IDs available — skipping ID filtering.")
                else:
                    print(f"    'ID' column missing — skipping ID filtering.")

                # ---------------------------
                # Merge with characteristics_clean_df
                # ---------------------------
                print(f"  Merging filtered {subset_name} (shape: {subset_df.shape}) with characteristics_clean_df")
                merged_df = pd.merge(subset_df, characteristics_clean_df, on='ID', how='left')

                # ---------------------------
                # Reorder columns: characteristics first
                # ---------------------------
                subset_cols = [c for c in merged_df.columns if c not in characteristics_cols]
                merged_df = merged_df[characteristics_cols + subset_cols]

                # ---------------------------
                # Save merged result
                # ---------------------------
                work_subset_name = subset_name.replace("subset_", "work_subset_")
                output_file = os.path.join(Temp_file_path_DP, f"{work_subset_name}.txt")
                merged_df.to_csv(output_file, sep='|', index=False)
                print(f"    Saved {work_subset_name} to {output_file}")

                # Free memory immediately
                del subset_df
                del merged_df
                gc.collect()

    print("\nMerging complete. All valid subset DataFrames were merged with characteristics_clean_df and saved as work_subset_X.")
else:
    print(f"Error: Could not load characteristics data from {characteristics_clean_file_path}. Skipping subset merging.")


Loaded characteristics data from /home/jovyan/work/hpool1/pseidel/test/Temp/TempDataPreparation/Characteristics_clean.txt
Identified 49 subset files to process.

Filtering subset files to include only IDs present in characteristics_clean_df (55367 valid IDs).
Processing batch 1 of subsets: ['subset_01001', 'subset_01051', 'subset_01075', 'subset_01101', 'subset_01151', 'subset_01250', 'subset_01451', 'subset_01551', 'subset_01706', 'subset_02001']
  Reading subset_01001.txt from disk...
  Processing subset_01001 (shape: (6805130, 6))
    Filtered subset_01001: 2802001 rows removed.
  Merging filtered subset_01001 (shape: (4003129, 6)) with characteristics_clean_df
    Saved work_subset_01001 to /home/jovyan/work/hpool1/pseidel/test/Temp/TempDataPreparation/work_subset_01001.txt
  Reading subset_01051.txt from disk...
  Processing subset_01051 (shape: (6188353, 6))
    Filtered subset_01051: 1961350 rows removed.
  Merging filtered subset_01051 (shape: (4227003, 6)) with characteristics

### Check for Missing Values (Should be 0) and Delete

In [102]:
# =====================================================================================
# SUMMARY
# =====================================================================================
# This cell removes all rows where the column "Value" contains the string "n"
# from each work_subset_*.txt file in Temp_file_path_DP.
#
# Steps:
#   1. Identify all work_subset files.
#   2. Load each file.
#   3. If the column "Value" exists, remove rows where Value == "n".
#   4. Overwrite the file with the cleaned version.
#   5. Record how many rows were removed per file.
#   6. Print a summary at the end.


print("Dropping rows with 'n' in Value column from work_subset files...")

# Find all work_subset files in the Temp directory
work_subset_file_names = [
    f for f in os.listdir(Temp_file_path_DP)
    if f.startswith('work_subset_') and f.endswith('.txt')
]

# Dictionary to store the count of dropped rows for each file
dropped_rows_summary_work_subsets = {}

# Process each work_subset file
for file_name in work_subset_file_names:
    ws_name = file_name.replace(".txt", "")
    file_path = os.path.join(Temp_file_path_DP, file_name)

    print(f"\n--- Processing {ws_name} ---")

    ws_df = import_file_to_dataframe(file_path)

    if ws_df is not None and 'Value' in ws_df.columns:
        rows_before_drop = len(ws_df)

        # Remove rows where Value == "n"
        ws_df_cleaned = ws_df[ws_df['Value'] != 'n'].copy()

        rows_after_drop = len(ws_df_cleaned)
        rows_dropped = rows_before_drop - rows_after_drop
        dropped_rows_summary_work_subsets[ws_name] = rows_dropped

        if rows_dropped > 0:
            print(f"  Dropped {rows_dropped} rows with 'n' in Value column.")
        else:
            print("  No rows with 'n' in Value column found.")

        # Save cleaned file
        ws_df_cleaned.to_csv(file_path, sep='|', index=False)
        print(f"  Saved cleaned {ws_name} back to {file_path}")

        del ws_df
        del ws_df_cleaned
        gc.collect()

    else:
        # Skip files without a Value column
        print(f"  'Value' column not found in {ws_name}. Skipping.")
        dropped_rows_summary_work_subsets[ws_name] = 'N/A - No Value column'


# Summary
print("\nSummary of rows dropped due to 'n' in Value column from work_subset files:")
for ws_name, count in dropped_rows_summary_work_subsets.items():
    print(f"- {ws_name}: {count} rows dropped")

print("\nDropping rows with 'n' from work_subsets complete.")


Dropping rows with 'n' in Value column from work_subset files...

--- Processing work_subset_01101 ---
  No rows with 'n' in Value column found.
  Saved cleaned work_subset_01101 back to /home/jovyan/work/hpool1/pseidel/test/Temp/TempDataPreparation/work_subset_01101.txt

--- Processing work_subset_03351 ---
  No rows with 'n' in Value column found.
  Saved cleaned work_subset_03351 back to /home/jovyan/work/hpool1/pseidel/test/Temp/TempDataPreparation/work_subset_03351.txt

--- Processing work_subset_04751 ---
  No rows with 'n' in Value column found.
  Saved cleaned work_subset_04751 back to /home/jovyan/work/hpool1/pseidel/test/Temp/TempDataPreparation/work_subset_04751.txt

--- Processing work_subset_03263 ---
  No rows with 'n' in Value column found.
  Saved cleaned work_subset_03263 back to /home/jovyan/work/hpool1/pseidel/test/Temp/TempDataPreparation/work_subset_03263.txt

--- Processing work_subset_04890 ---
  No rows with 'n' in Value column found.
  Saved cleaned work_subset

### Rename Work Subset Files

In [103]:
# =====================================================================================
# SUMMARY
# =====================================================================================
# This cell renames work_subset_*.txt files that still use a numeric ItemCode
# in their filename (e.g. work_subset_1051.txt) to use the corresponding
# textual ItemName instead (e.g. work_subset_Revenue.txt).
#
# Steps:
#   1. Build a mapping from ItemCode -> ItemName from the ValueCoding DataFrame
#      (using the first column as ItemCode, second as ItemName).
#   2. Scan Temp_file_path_DP for work_subset_*.txt files whose names contain
#      only a numeric code.
#   3. For each such file:
#        - Look up its ItemCode in the mapping.
#        - If found, sanitize the ItemName into a filesystem-safe string.
#        - Build a new filename of the form work_subset_<sanitized_name>.txt.
#        - If a file with that new name already exists, delete it first.
#        - Rename the original numeric file to the new, name-based filename.
#        - Track counts for renamed, skipped, and error cases.
#   4. Print a final summary of the renaming process.


# Ensure ValueCoding DataFrame is available and usable
if 'ValueCoding' in globals() and ValueCoding is not None and not ValueCoding.empty:
    # Ensure there are at least two columns: first = ItemCode, second = ItemName
    if ValueCoding.shape[1] >= 2:
        # Build a mapping ItemCode (as string) -> ItemName
        # ValueCoding.iloc[:, 0]  -> first column (ItemCode)
        # ValueCoding.iloc[:, 1]  -> second column (ItemName)
        item_code_to_name = pd.Series(
            ValueCoding.iloc[:, 1].values,
            index=ValueCoding.iloc[:, 0].astype(str)  # convert to str for consistent lookup
        ).to_dict()
        print("Created ItemCode to ItemName mapping from ValueCoding.")
    else:
        item_code_to_name = {}
        print("ValueCoding DataFrame does not have enough columns for mapping.")
else:
    # Fallback: empty mapping if ValueCoding is not available
    item_code_to_name = {}
    print("ValueCoding DataFrame not found or is empty. Cannot rename subset files.")


# Get all files in the Temp directory
temp_files = os.listdir(Temp_file_path_DP)

# Select files that still follow the numeric naming pattern:
#   work_subset_<digits>.txt
work_subset_files_numerical_ids = [
    f for f in temp_files
    if re.match(r'work_subset_(\d+)\.txt$', f)
]

print(f"\nFound {len(temp_files)} files in Temp directory.")
print(f"Identified {len(work_subset_files_numerical_ids)} files with numerical IDs for potential renaming.")


# Counters for statistics
renamed_count = 0
deleted_existing_renamed_count = 0
skipped_count = 0
error_count = 0

# Iterate over each numerically-named work_subset file
for file_name in work_subset_files_numerical_ids:
    old_file_path = os.path.join(Temp_file_path_DP, file_name)

    # Extract the numeric ItemCode from the filename via regex
    match = re.match(r'work_subset_(\d+)\.txt$', file_name)
    if match:
        item_code = match.group(1)  # e.g. "1051"

        # Look up the corresponding ItemName in the mapping
        item_name = item_code_to_name.get(item_code)

        if item_name:
            # Sanitize the ItemName for use in filenames:
            #   - replace common separators with underscore
            #   - then remove any remaining disallowed characters
            sanitized_item_name = (
                item_name
                .replace(' ', '_')
                .replace('-', '_')
                .replace('/', '_')
                .replace('\\', '_')
                .replace(':', '_')
                .replace('*', '_')
                .replace('?', '_')
                .replace('"', '_')
                .replace('<', '_')
                .replace('>', '_')
                .replace('|', '_')
            )
            sanitized_item_name = re.sub(r'[^\w.-]', '', sanitized_item_name)

            # Build the new filename using the sanitized name
            new_file_name = f"work_subset_{sanitized_item_name}.txt"
            new_file_path = os.path.join(Temp_file_path_DP, new_file_name)

            # If the target name already exists, remove it so we can rename safely
            if os.path.exists(new_file_path):
                print(f"  Target renamed file '{new_file_name}' already exists. Deleting it before renaming.")
                try:
                    os.remove(new_file_path)
                    deleted_existing_renamed_count += 1
                except Exception as e:
                    # If deletion fails, record an error and skip this file
                    print(f"  Error deleting existing renamed file '{new_file_path}': {e}. Skipping rename for this file.")
                    error_count += 1
                    continue

            # Perform the actual rename from numeric to name-based filename
            try:
                os.rename(old_file_path, new_file_path)
                print(f"  Renamed '{file_name}' to '{new_file_name}'.")
                renamed_count += 1
            except OSError as e:
                print(f"  OSError renaming '{file_name}' to '{new_file_path}': {e}. Skipping rename.")
                error_count += 1
            except Exception as e:
                print(f"  Error renaming '{file_name}' to '{new_file_path}': {e}. Skipping rename.")
                error_count += 1

        else:
            # No mapping found for this ItemCode, so we cannot rename meaningfully
            print(f"  Could not find ItemName for ItemCode '{item_code}' in ValueCoding. Skipping processing for '{file_name}'.")
            skipped_count += 1
    else:
        # Should not happen due to the initial regex filter, kept as safety net
        print(f"  Filename format not as expected for '{file_name}'. Skipping processing.")
        skipped_count += 1

# Final summary of the renaming process
print(
    f"\nRenaming and cleanup process complete. "
    f"Successfully renamed: {renamed_count}, "
    f"Deleted existing renamed files: {deleted_existing_renamed_count}, "
    f"Skipped: {skipped_count}, "
    f"Errors: {error_count}"
)



Created ItemCode to ItemName mapping from ValueCoding.

Found 52 files in Temp directory.
Identified 49 files with numerical IDs for potential renaming.
  Renamed 'work_subset_01101.txt' to 'work_subset_Selling_General__Administrative_Expenses.txt'.
  Renamed 'work_subset_03351.txt' to 'work_subset_Total_Liabilities.txt'.
  Renamed 'work_subset_04751.txt' to 'work_subset_Com_Pfd_Redeemed_Retired_Converted_Etc..txt'.
  Renamed 'work_subset_03263.txt' to 'work_subset_Deferred_Taxes.txt'.
  Renamed 'work_subset_04890.txt' to 'work_subset_Net_Cash_Flow___Financing.txt'.
  Renamed 'work_subset_03101.txt' to 'work_subset_Current_Liabilities___Total.txt'.
  Renamed 'work_subset_01001.txt' to 'work_subset_Net_Sales_or_Revenues.txt'.
  Renamed 'work_subset_04401.txt' to 'work_subset_Long_Term_Borrowings.txt'.
  Renamed 'work_subset_03251.txt' to 'work_subset_Long_Term_Debt.txt'.
  Renamed 'work_subset_03040.txt' to 'work_subset_Accounts_Payable.txt'.
  Renamed 'work_subset_03273.txt' to 'work_s

### Merge Currency, Update Codes, and FYE Data

In [104]:
import numpy as np

# =====================================================================================
# SUMMARY
# =====================================================================================
# This cell enriches all work_subset_*.txt files in Temp_file_path_DP with:
#
#   1. Historical currency (HistCurrency) computed from CurrencyCodes_clean and PIT Date.
#   2. Update codes from UpdateCodes_clean (matched by ID, PIT Date, Frequency, FiscalPeriod).
#   3. Fiscal year-end information (FYE Month) from FYE_clean (optional).
#
# Main steps:
#   - Prepare a slimmed-down CurrencyCodes_clean with numeric switch dates and renamed
#     columns (Currency1/2/3, Date1/2/3).
#   - Prepare a slimmed-down UpdateCodes_clean with normalized keys.
#   - Optionally prepare FYE_clean if present and valid.
#   - For each work_subset file:
#       * Load it, normalize key columns (PIT Date, Frequency, FiscalPeriod).
#       * Merge currency info and compute HistCurrency based on PIT Date vs Date1/2/3.
#       * Merge update codes.
#       * Merge FYE data (if available).
#       * Reorder columns (keeping all existing columns).
#       * Persist back to disk with PIT Date formatted as 'YYYY-MM-DD'.
#       * Clean up intermediate DataFrames to manage memory.


# --------------------------------------------------
# 1) Prepare CurrencyCodes_clean (currency + switch dates)
# --------------------------------------------------

# Select only the needed columns from CurrencyCodes_clean
currency_cols_to_merge = CurrencyCodes_clean[
    ['ID', 'CurrencyCode1', 'SwitchDate1',
     'CurrencyCode2', 'SwitchDate2',
     'CurrencyCode3', 'SwitchDate3']
].copy()

# Parse all SwitchDate* columns to datetime, assuming 'YYYY-MM-DD' format
for col in ['SwitchDate1', 'SwitchDate2', 'SwitchDate3']:
    currency_cols_to_merge[col] = pd.to_datetime(
        currency_cols_to_merge[col],
        format='%Y-%m-%d',
        errors='coerce'
    )

# Rename columns for clarity and to avoid name clashes during merge
currency_cols_to_merge.rename(columns={
    'CurrencyCode1': 'Currency1',
    'SwitchDate1': 'Date1',
    'CurrencyCode2': 'Currency2',
    'SwitchDate2': 'Date2',
    'CurrencyCode3': 'Currency3',
    'SwitchDate3': 'Date3'
}, inplace=True)

print("Prepared CurrencyCodes_clean for merging.")

# --------------------------------------------------
# 2) Prepare UpdateCodes_clean
# --------------------------------------------------

# Select only the columns needed for joining in UpdateCodes_clean
update_codes_to_merge = UpdateCodes_clean[
    ['ID', 'PIT Date', 'Frequency', 'FiscalPeriod', 'UpdateCode']
].copy()

# Parse PIT Date as datetime (format 'YYYY-MM-DD')
update_codes_to_merge['PIT Date'] = pd.to_datetime(
    update_codes_to_merge['PIT Date'],
    format='%Y-%m-%d',
    errors='coerce'
)

# Normalize Frequency to stripped string for consistent joins
update_codes_to_merge['Frequency'] = (
    update_codes_to_merge['Frequency'].astype(str).str.strip()
)

# Normalize FiscalPeriod to stripped string for consistent joins
update_codes_to_merge['FiscalPeriod'] = (
    update_codes_to_merge['FiscalPeriod'].astype(str).str.strip()
)

print("Prepared UpdateCodes_clean for merging.")

# --------------------------------------------------
# 3) Prepare FYE_clean (optional)
# --------------------------------------------------

# Check presence and validity of FYE_clean before using it
if (
    'FYE_clean' in globals()
    and FYE_clean is not None
    and all(col in FYE_clean.columns for col in ['ID', 'FY', 'FYE Month'])
):
    # Keep only relevant columns and rename FY -> FiscalPeriod
    fye_cols_to_merge = FYE_clean[['ID', 'FY', 'FYE Month']].copy()
    fye_cols_to_merge.rename(columns={'FY': 'FiscalPeriod'}, inplace=True)

    # Normalize FiscalPeriod to stripped string for joins
    fye_cols_to_merge['FiscalPeriod'] = (
        fye_cols_to_merge['FiscalPeriod'].astype(str).str.strip()
    )

    print("Prepared FYE_clean for merging.")
else:
    # If not available or invalid, skip FYE merge later
    fye_cols_to_merge = None
    print("FYE_clean DataFrame not found or invalid. FYE data will not be merged.")

# --------------------------------------------------
# 4) Process work_subset files
# --------------------------------------------------

summary_data = []  # Placeholder if you want to collect per-file stats later

print("Processing work_subset DataFrames: merging currency, update codes, FYE data, adding HistCurrency...")

# Find all work_subset_*.txt files in the temp directory
work_subset_file_names = [
    f for f in os.listdir(Temp_file_path_DP)
    if f.startswith('work_subset_') and f.endswith('.txt')
]

# Iterate over each work_subset file
for ws_file_name in work_subset_file_names:
    ws_name = ws_file_name.replace(".txt", "")  # e.g. work_subset_Revenue
    file_path = os.path.join(Temp_file_path_DP, ws_file_name)

    print(f"\n--- Processing {ws_name} ---")

    # Read the current work_subset file from disk
    ws_df = import_file_to_dataframe(file_path)

    if ws_df is None:
        # If loading fails, skip this file
        print(f"--- Could not load {ws_name} from disk. Skipping processing. ---")
        continue

    # --------------------------------------------------
    # 4.1 Normalize PIT Date & key columns in ws_df
    # --------------------------------------------------

    # Ensure PIT Date is parsed as datetime
    if 'PIT Date' in ws_df.columns:
        ws_df['PIT Date'] = pd.to_datetime(
            ws_df['PIT Date'],
            format='%Y-%m-%d',
            errors='coerce'
        )
    else:
        print(f"  Warning: 'PIT Date' column missing in {ws_name}.")

    # Normalize Frequency as stripped string
    if 'Frequency' in ws_df.columns:
        ws_df['Frequency'] = ws_df['Frequency'].astype(str).str.strip()

    # Normalize FiscalPeriod as stripped string
    if 'FiscalPeriod' in ws_df.columns:
        ws_df['FiscalPeriod'] = ws_df['FiscalPeriod'].astype(str).str.strip()

    # --------------------------------------------------
    # 4.2 Merge with CurrencyCodes_clean & compute HistCurrency
    # --------------------------------------------------

    # Merge currency switch information onto the subset by ID
    merged_df = pd.merge(ws_df, currency_cols_to_merge, on='ID', how='left')

    # PIT Date and Date1/2/3 are already datetime at this point

    # Build conditions for choosing HistCurrency based on PIT Date
    conditions = [
        # Case 1: Only first currency known (no Date2/Date3) and PIT >= Date1
        (merged_df['PIT Date'] >= merged_df['Date1']) &
        (merged_df['Date2'].isna()) &
        (merged_df['Date3'].isna()),

        # Case 2: between Date1 and Date2
        (merged_df['PIT Date'] >= merged_df['Date1']) &
        (merged_df['PIT Date'] < merged_df['Date2']),

        # Case 3: between Date2 and Date3 (or Date3 missing -> open-ended)
        (merged_df['PIT Date'] >= merged_df['Date2']) &
        (
            (merged_df['PIT Date'] < merged_df['Date3']) |
            (merged_df['Date3'].isna())
        ),

        # Case 4: on or after Date3
        (merged_df['PIT Date'] >= merged_df['Date3']) &
        (merged_df['Date3'].notna())
    ]

    # Corresponding currency choices for each condition above
    choices = [
        merged_df['Currency1'],  # Case 1
        merged_df['Currency1'],  # Case 2 (still Currency1)
        merged_df['Currency2'],  # Case 3
        merged_df['Currency3']   # Case 4
    ]

    # Assign HistCurrency using np.select over the conditions
    merged_df['HistCurrency'] = np.select(
        conditions,
        choices,
        default=np.nan
    )

    # If PIT Date is before Date1 but Date1 exists, assume Currency1 as well
    mask_before_first = (
        merged_df['PIT Date'].notna() &
        merged_df['Date1'].notna() &
        (merged_df['PIT Date'] < merged_df['Date1'])
    )
    merged_df.loc[mask_before_first, 'HistCurrency'] = merged_df.loc[mask_before_first, 'Currency1']

    # Drop currency helper columns (keep only HistCurrency from this merge)
    merged_df = merged_df.drop(
        columns=['Currency1', 'Date1', 'Currency2', 'Date2', 'Currency3', 'Date3'],
        errors='ignore'
    )

    # --------------------------------------------------
    # 4.3 Merge with UpdateCodes_clean
    # --------------------------------------------------

    # Normalize key columns again (in case merges changed types)
    if 'Frequency' in merged_df.columns:
        merged_df['Frequency'] = merged_df['Frequency'].astype(str).str.strip()
    if 'FiscalPeriod' in merged_df.columns:
        merged_df['FiscalPeriod'] = merged_df['FiscalPeriod'].astype(str).str.strip()

    # Merge update codes on the full key
    merged_df = pd.merge(
        merged_df,
        update_codes_to_merge,
        on=['ID', 'PIT Date', 'Frequency', 'FiscalPeriod'],
        how='left'
    )

    # --------------------------------------------------
    # 4.4 Merge with FYE_clean (if available)
    # --------------------------------------------------

    if fye_cols_to_merge is not None:
        # Normalize FiscalPeriod for the join
        if 'FiscalPeriod' in merged_df.columns:
            merged_df['FiscalPeriod'] = merged_df['FiscalPeriod'].astype(str).str.strip()

        # Merge FYE data on ID + FiscalPeriod
        merged_df = pd.merge(
            merged_df,
            fye_cols_to_merge,
            on=['ID', 'FiscalPeriod'],
            how='left'
        )
        print(f"  Merged FYE data with {ws_name}.")
    else:
        print(f"  Skipping merge with FYE_clean for {ws_name}.")

    # --------------------------------------------------
    # 4.5 Reorder columns but KEEP everything
    # --------------------------------------------------

    # Preferred order for key columns in the final file
    desired_column_order = [
        'ID',
        'CompanyName',
        'ImplCountry',
        'CurrentCurrency',
        'HistCurrency',
        'PIT Date',
        'Frequency',
        'UpdateCode',
        'FiscalPeriod',
        'FYE Month',
        'ItemCode',
        'Value'
    ]

    # Columns that exist in merged_df and appear in the desired order
    cols_in_df = [c for c in desired_column_order if c in merged_df.columns]
    # All remaining columns that were not explicitly ordered
    other_cols = [c for c in merged_df.columns if c not in cols_in_df]

    # Reorder DataFrame to have key columns first, then everything else
    merged_df = merged_df[cols_in_df + other_cols]

    # --------------------------------------------------
    # 4.6 Keep date format as 'YYYY-MM-DD' in output
    # --------------------------------------------------

    if 'PIT Date' in merged_df.columns:
        # Format PIT Date as string with 'YYYY-MM-DD'
        merged_df['PIT Date'] = merged_df['PIT Date'].dt.strftime('%Y-%m-%d')

    # Note: Currency switch dates remain only in source tables; not stored here.

    # --------------------------------------------------
    # 4.7 Save back to file
    # --------------------------------------------------

    # Overwrite the original work_subset file with the enriched data
    merged_df.to_csv(file_path, sep='|', index=False)
    print(f"    Saved updated {ws_name} to {file_path}")

    # Clean up to free memory before moving on to the next file
    del ws_df
    del merged_df
    gc.collect()

print("\nProcessing complete (currency, update codes, FYE, HistCurrency).")


Prepared CurrencyCodes_clean for merging.
Prepared UpdateCodes_clean for merging.
Prepared FYE_clean for merging.
Processing work_subset DataFrames: merging currency, update codes, FYE data, adding HistCurrency...

--- Processing work_subset_Extraordinary_Items ---
  Merged FYE data with work_subset_Extraordinary_Items.
    Saved updated work_subset_Extraordinary_Items to /home/jovyan/work/hpool1/pseidel/test/Temp/TempDataPreparation/work_subset_Extraordinary_Items.txt

--- Processing work_subset_Com_Pfd_Redeemed_Retired_Converted_Etc. ---
  Merged FYE data with work_subset_Com_Pfd_Redeemed_Retired_Converted_Etc..
    Saved updated work_subset_Com_Pfd_Redeemed_Retired_Converted_Etc. to /home/jovyan/work/hpool1/pseidel/test/Temp/TempDataPreparation/work_subset_Com_Pfd_Redeemed_Retired_Converted_Etc..txt

--- Processing work_subset_Net_Income_Used_to_Calculate_Basic_EPS ---
  Merged FYE data with work_subset_Net_Income_Used_to_Calculate_Basic_EPS.
    Saved updated work_subset_Net_Income

## Datastream

### Calculate Returns

In [105]:
# =============================================================================
# Compute daily returns for the full TRI dataset in memory
#   - Reads TRI_clean.txt from Temp_file_path_GO (on Google Drive).
#   - Loads the full file into memory (no chunking).
#   - Sorts by ID and DayDate, computes returns from TRI per ID.
#   - Stores returns in column 'ret_bps' in basis points with 4 decimals.
#       ret_bps_t = (TRI_t / TRI_{t-1} - 1) * 10,000
#   - Drops TRI column and writes the result to Returns_clean.txt
#     in Temp_file_path_DP with pipe separation.
#   - Ensures the output directory exists.
# =============================================================================

import os
from pathlib import Path
import pandas as pd

# -------------------------------------------------------------------------
# Path configuration: use the paths you defined in your setup cell
# -------------------------------------------------------------------------
# IMPORTANT: this must be the ORIGINAL TRI file, NOT the returns file
src_file = f"{Temp_file_path_GO}/TRI_clean.txt"   # original TRI file
dst_dir  = Temp_file_path_DP                      # output directory
Path(dst_dir).mkdir(parents=True, exist_ok=True)  # ensure output dir exists
dst_file = f"{dst_dir}/Returns_clean.txt"         # final output file path

print(f"Source file:      {src_file}")
print(f"Destination file: {dst_file}")
print("Source exists:", os.path.exists(src_file))

# -------------------------------------------------------------------------
# If an old Returns_clean.txt exists at the destination, remove it
# so we can write a fresh file.
# -------------------------------------------------------------------------
if os.path.exists(dst_file):
    os.remove(dst_file)
    print("Existing destination file removed.")
else:
    print("No existing destination file – creating a new one.")

# -------------------------------------------------------------------------
# Read full TRI file into memory
# -------------------------------------------------------------------------
df = pd.read_csv(
    src_file,
    sep="|",                  # pipe-separated input
    dtype={"ID": str},        # ensure ID is handled as string
    parse_dates=["DayDate"],  # parse DayDate as datetime
)

# -------------------------------------------------------------------------
# Sort by ID and DayDate so pct_change computes correct consecutive returns
# -------------------------------------------------------------------------
df = df.sort_values(["ID", "DayDate"])

# -------------------------------------------------------------------------
# Compute returns per ID using pct_change
# ret_bps_t = (TRI_t / TRI_{t-1} - 1) * 10,000
# -------------------------------------------------------------------------
df["ret_bps"] = (
    df.groupby("ID")["TRI"]
      .pct_change() * 10000
)

# -------------------------------------------------------------------------
# Round returns to 4 decimals as requested
# -------------------------------------------------------------------------
df["ret_bps"] = df["ret_bps"].round(4)

# -------------------------------------------------------------------------
# Drop TRI column after computing returns
# -------------------------------------------------------------------------
df = df.drop(columns=["TRI"])

# -------------------------------------------------------------------------
# Ensure that the destination directory exists right now
# (in case Drive was remounted / folder disappeared)
# -------------------------------------------------------------------------
Path(dst_dir).mkdir(parents=True, exist_ok=True)

# -------------------------------------------------------------------------
# Write full result to destination file (single shot, no append)
# -------------------------------------------------------------------------
df.to_csv(
    dst_file,
    sep="|",          # pipe-separated output
    index=False,      # no index column
    mode="w",         # overwrite
    header=True,      # write header
)

print("Finished computing returns file:")
print(dst_file)


Source file:      /home/jovyan/work/hpool1/pseidel/test/Temp/TempGeneralOverview/TRI_clean.txt
Destination file: /home/jovyan/work/hpool1/pseidel/test/Temp/TempDataPreparation/Returns_clean.txt
Source exists: True
No existing destination file – creating a new one.
Finished computing returns file:
/home/jovyan/work/hpool1/pseidel/test/Temp/TempDataPreparation/Returns_clean.txt


### Merge with LC_MV & USD MV

In [106]:
import os
import gc
import pandas as pd
from pathlib import Path

# =============================================================================
# SUMMARY OF THIS SCRIPT (Market Data Merge - Step 1)
# -----------------------------------------------------------------------------
# This script performs the first phase of merging disparate market data sources
# into a unified dataset.
#
# LOGIC:
# 1. Loads the Valid ID List: ensures only relevant companies are processed.
# 2. Loads Local Currency Market Values (LC_MV):
#    - Renames 'MV' column to 'MV_LC' to distinguish it from USD values later.
#    - Filters for valid IDs and standardizes dates.
# 3. Loads Stock Returns:
#    - Filters for valid IDs and standardizes dates.
# 4. Performs an INNER JOIN on [ID, DayDate]:
#    - Creates a dataset containing only rows where BOTH Local Currency Market Value
#      and Stock Return data exist for that specific day.
# 5. Saves the intermediate result to disk ("MarketData_Step1.txt") to free up RAM
#    before the next merge step.
# =============================================================================

# =============================================================================
# CONFIGURATION
# =============================================================================
# Inputs
lc_mv_file  = f"{Temp_file_path_GO}/LC_MV_clean.txt"          
ret_file    = f"{Temp_file_path_DP}/Returns_clean.txt"     
id_file     = f"{Temp_file_path_EoC}/ID_clean.txt"         

# Output
dst_dir     = Temp_file_path_DP                            
Path(dst_dir).mkdir(parents=True, exist_ok=True)
intermediate_file = f"{dst_dir}/MarketData_Step1.txt"        

# =============================================================================
# 1. LOAD IDs
# =============================================================================
print("1. Loading ID list...")
id_df = pd.read_csv(id_file, sep="|", dtype=str, usecols=["ID"])
valid_ids = set(id_df["ID"])
del id_df
gc.collect()

# =============================================================================
# 2. LOAD LC MV FILE (Rename MV -> MV_LC)
# =============================================================================
print("2. Loading LC MV file...")
mv_lc_df = pd.read_csv(
    lc_mv_file, 
    sep="|", 
    dtype={"ID": "string", "MV": "float32", "PCUR": "string"},
    usecols=["ID", "DayDate", "MV", "PCUR"]
)

# Rename MV to MV_LC immediately
mv_lc_df.rename(columns={"MV": "MV_LC"}, inplace=True)

# Filter & Convert
mv_lc_df = mv_lc_df[mv_lc_df["ID"].isin(valid_ids)]
mv_lc_df["DayDate"] = pd.to_datetime(mv_lc_df["DayDate"], format="mixed", errors="coerce")

# =============================================================================
# 3. LOAD RETURNS FILE
# =============================================================================
print("3. Loading Returns file...")
ret_df = pd.read_csv(
    ret_file, 
    sep="|", 
    dtype={"ID": "string", "ret_bps": "float32"}, 
    usecols=["ID", "DayDate", "ret_bps"]
)

# Filter & Convert
ret_df = ret_df[ret_df["ID"].isin(valid_ids)]
ret_df["DayDate"] = pd.to_datetime(ret_df["DayDate"], format="mixed", errors="coerce")

# =============================================================================
# 4. MERGE STEP 1
# =============================================================================
print("4. Merging LC MV + Returns...")
step1_df = pd.merge(mv_lc_df, ret_df, on=["ID", "DayDate"], how="inner")

print(f"   Step 1 Rows: {len(step1_df):,}")

# Save intermediate result
step1_df.to_csv(intermediate_file, sep="|", index=False)
print(f"   Saved intermediate file to: {intermediate_file}")

# Clean up
del mv_lc_df, ret_df, step1_df
gc.collect()

1. Loading ID list...
2. Loading LC MV file...
3. Loading Returns file...
4. Merging LC MV + Returns...
   Step 1 Rows: 207,708,087
   Saved intermediate file to: /home/jovyan/work/hpool1/pseidel/test/Temp/TempDataPreparation/MarketData_Step1.txt


0

In [107]:
import os
import gc
import pandas as pd

# =============================================================================
# SUMMARY OF THIS SCRIPT (Market Data Merge - Step 2)
# -----------------------------------------------------------------------------
# This script performs the final phase of the data merge.
#
# LOGIC:
# 1. Loads the Intermediate Dataset (LC MV + Returns) from Step 1.
# 2. Loads USD Market Values (MV_USD).
# 3. Performs a LEFT JOIN to add USD data.
# 4. [NEW] FILTERS CLEAN: Removes ANY row that has a missing value in:
#      - MV_LC (Local Currency Market Value)
#      - ret_bps (Return in basis points)
#      - MV_USD (USD Market Value)
#    This ensures the final dataset is "complete" (no empty cells in key columns).
# 5. Saves the final consolidated file ("MarketData_pre_clean.txt").
# =============================================================================

# =============================================================================
# CONFIGURATION
# =============================================================================
# Inputs
usd_mv_file = f"{Temp_file_path_GO}/MV_clean.txt"
input_file  = f"{Temp_file_path_DP}/MarketData_Step1.txt" # Result from Cell 1

# Output
final_file  = f"{Temp_file_path_DP}/MarketData_pre_clean.txt"

# =============================================================================
# 1. LOAD STEP 1 DATASET
# =============================================================================
print("1. Loading Step 1 Dataset...")
main_df = pd.read_csv(
    input_file,
    sep="|",
    # Explicit types to keep memory usage low
    dtype={"ID": "string", "MV_LC": "float32", "ret_bps": "float32", "PCUR": "string"}
)
main_df["DayDate"] = pd.to_datetime(main_df["DayDate"], format="mixed", errors="coerce")
print(f"   Loaded rows: {len(main_df):,}")

# =============================================================================
# 2. LOAD USD MV FILE (Rename MV -> MV_USD)
# =============================================================================
print("2. Loading USD MV file...")
mv_usd_df = pd.read_csv(
    usd_mv_file, 
    sep="|", 
    dtype={"ID": "string", "MV": "float32"},
    usecols=["ID", "DayDate", "MV"]
)

# Rename MV to MV_USD immediately
mv_usd_df.rename(columns={"MV": "MV_USD"}, inplace=True)

# Convert Date
mv_usd_df["DayDate"] = pd.to_datetime(mv_usd_df["DayDate"], format="mixed", errors="coerce")

# =============================================================================
# 3. MERGE STEP 2 (LEFT JOIN)
# =============================================================================
print("3. Merging with USD Data (Left Join)...")

final_df = pd.merge(main_df, mv_usd_df, on=["ID", "DayDate"], how="left")
print(f"   Rows after merge: {len(final_df):,}")

# =============================================================================
# 4. [NEW] FILTER MISSING VALUES
# =============================================================================
print("4. Removing rows with missing values...")

rows_before = len(final_df)

# Drop rows where ANY of these specific columns are NaN
cols_to_check = ["MV_LC", "ret_bps", "MV_USD"]
final_df.dropna(subset=cols_to_check, inplace=True)

rows_after = len(final_df)
dropped_count = rows_before - rows_after

print(f"   - Rows before filter: {rows_before:,}")
print(f"   - Rows dropped:       {dropped_count:,}")
print(f"   - Rows remaining:     {rows_after:,}")

# =============================================================================
# 5. SAVE FINAL DATASET
# =============================================================================
if not final_df.empty:
    print(f"5. Saving Final Dataset ({len(final_df):,} rows)...")
    final_df.to_csv(final_file, sep="|", index=False)
    
    # Preview
    print("\nPreview of Final Columns:")
    print(final_df.columns.tolist())
    print(final_df.head())
else:
    print("Warning: All rows were dropped! Check if the datasets actually overlap.")

# Clean up
del main_df, mv_usd_df, final_df
gc.collect()

1. Loading Step 1 Dataset...
   Loaded rows: 207,708,087
2. Loading USD MV file...
3. Merging with USD Data (Left Join)...
   Rows after merge: 208,304,865
4. Removing rows with missing values...
   - Rows before filter: 208,304,865
   - Rows dropped:       57,637,159
   - Rows remaining:     150,667,706
5. Saving Final Dataset (150,667,706 rows)...

Preview of Final Columns:
['ID', 'PCUR', 'DayDate', 'MV_LC', 'ret_bps', 'MV_USD']
             ID PCUR    DayDate      MV_LC  ret_bps  MV_USD
2723  C025L1660   AP 2003-12-31  81.379997   0.0671   27.75
2724  C025L1660   AP 2003-12-31  81.379997   0.0000   27.75
2725  C025L1660   AP 2003-12-31  81.379997   0.0671   27.75
2726  C025L1660   AP 2003-12-31  81.379997   0.0000   27.75
2727  C025L1660   AP 2004-01-01  81.379997   0.0000   27.75


0

### Get Summary of Data for Table 2 from the Original Paper (Indication, not Final!)

In [108]:
# =============================================================================
# CELL 1: SUMMARY STATISTICS (WITH MV AND RETURN THRESHOLD FILTERING)
# -----------------------------------------------------------------------------
# 1) Loads the full merged MarketData_pre_clean.txt file.
# 2) **FILTERING STEP**:
#       - Identifies rows where MV > 5,000,000.
#       - Identifies rows where Return > 1000% (converted to 100,000 bps).
#       - Excludes these rows from ALL subsequent statistics.
#       - Tracks the count of exclusions for both criteria.
# 3) Computes statistics (Mean, Std, Median, Percentiles) on the filtered data.
# 4) Performs ID-level winsorization (configurable).
# 5) Saves Summary and Distribution tables to Excel.
# =============================================================================

import numpy as np
import pandas as pd
import os

# =============================================================================
# CONFIGURATION
# =============================================================================

# Paths
merged_file_path       = f"{Temp_file_path_DP}/MarketData_pre_clean.txt"
stats_output_dir       = Temp_file_path_DP
stats_output_name_xlsx = "MarketData_summary.xlsx"
stats_output_path_xlsx = f"{stats_output_dir}/{stats_output_name_xlsx}"

# Filtering Thresholds
mv_filter_threshold          = 5_000_000   # Exclude if MV > 5,000,000
ret_filter_threshold_percent = 1000        # Exclude if Return > 1000%
ret_filter_threshold_bps     = ret_filter_threshold_percent * 100  # Convert % to bps

# Winsorization configuration
winsorize_data   = True      # set to False to skip winsorization
winsor_alpha     = 0.01      # 1% winsorization in each tail
winsor_id_column = "ID"      # column name for ID-level winsorization

# Distribution quantiles configuration (in percent)
distribution_percentiles = [0, 1, 5, 10, 20, 30, 40, 50,
                            60, 70, 80, 90, 95, 99, 100]

# Create index labels for rows (min, p01, ... max)
dist_index_labels = []
for p in distribution_percentiles:
    if p == 0:
        dist_index_labels.append("min")
    elif p == 100:
        dist_index_labels.append("max")
    else:
        dist_index_labels.append(f"p{int(p):02d}")

# =============================================================================
# SAFETY CHECK
# =============================================================================

if not os.path.exists(merged_file_path):
    raise FileNotFoundError(f"Merged file not found at: {merged_file_path}")

print("Using merged file:")
print(merged_file_path)


# =============================================================================
# DATA LOADING AND FILTERING
# =============================================================================

print("\n--- Loading Data and Applying Filters ---")

cols_to_load = ["MV_USD", "ret_bps"]
if winsorize_data:
    cols_to_load.append(winsor_id_column)
cols_to_load = list(set(cols_to_load))

df_full = pd.read_csv(
    merged_file_path,
    sep="|",
    usecols=cols_to_load
)

total_rows_loaded = len(df_full)
print(f"Total rows loaded: {total_rows_loaded:,}")

# 1. Identify High MV rows
high_mv_mask = df_full["MV_USD"] > mv_filter_threshold
count_high_mv = high_mv_mask.sum()

# 2. Identify High Return rows (> 1000% -> > 100,000 bps)
high_ret_mask = df_full["ret_bps"] > ret_filter_threshold_bps
count_high_ret = high_ret_mask.sum()

# 3. Combine filters (Exclude if EITHER is true)
combined_exclusion_mask = high_mv_mask | high_ret_mask
total_dropped = combined_exclusion_mask.sum()

# Apply Filter
df_working = df_full[~combined_exclusion_mask].copy()
remaining_rows = len(df_working)

print(f"Filter Thresholds: MV_USD > {mv_filter_threshold:,.0f} | Return > {ret_filter_threshold_bps:,} bps ({ret_filter_threshold_percent}%)")
print(f" - Rows with High MV: {count_high_mv:,}")
print(f" - Rows with High Return: {count_high_ret:,}")
print(f" - Total Unique Rows Dropped: {total_dropped:,}")
print(f"Rows remaining for analysis: {remaining_rows:,}")


# =============================================================================
# HELPER FUNCTION: Analyze a single numeric column (on filtered data)
# =============================================================================

def analyze_column(df_in: pd.DataFrame, col_name: str):
    print(f"\n--- Analyzing column: {col_name} ---")

    # Clean numeric column: replace +/- inf with NaN
    series_data = df_in[col_name].replace([np.inf, -np.inf], np.nan)

    if winsorize_data:
        temp_df = pd.DataFrame({
            col_name: series_data,
            winsor_id_column: df_in[winsor_id_column]
        })
        valid = temp_df.dropna(subset=[col_name])
    else:
        valid = series_data.dropna().to_frame(name=col_name)

    if valid.empty:
        print(f"WARNING: No valid observations found in {col_name} after cleaning.")
        return {k: np.nan for k in ["count", "mean_raw", "std_raw", "median_raw", "winsor_mean"]} | {"raw_q": None, "winsor_q": None}

    values_all = valid[col_name].to_numpy()

    # Raw statistics
    count = values_all.size
    mean_raw = values_all.mean()
    std_raw = values_all.std(ddof=1) if count > 1 else np.nan
    median_raw = np.median(values_all)
    raw_q = np.percentile(values_all, distribution_percentiles)

    # Winsorized statistics
    winsor_mean = np.nan
    winsor_q = None

    if winsorize_data:
        print(f"  Performing ID-level winsorization for {col_name}...")
        lower_q = valid.groupby(winsor_id_column)[col_name].transform(lambda x: x.quantile(winsor_alpha))
        upper_q = valid.groupby(winsor_id_column)[col_name].transform(lambda x: x.quantile(1.0 - winsor_alpha))
        winsor_vals = valid[col_name].clip(lower=lower_q, upper=upper_q)
        winsor_mean = winsor_vals.mean()
        winsor_q = np.percentile(winsor_vals.to_numpy(), distribution_percentiles)
    else:
        print(f"  Winsorization disabled for {col_name}.")

    return {
        "count": count,
        "mean_raw": mean_raw,
        "std_raw": std_raw,
        "median_raw": median_raw,
        "raw_q": raw_q,
        "winsor_mean": winsor_mean,
        "winsor_q": winsor_q
    }


# =============================================================================
# RUN ANALYSIS
# =============================================================================

mv_results = analyze_column(df_working, "MV_USD")
ret_results = analyze_column(df_working, "ret_bps")


# =============================================================================
# BUILD SUMMARY DATAFRAME
# =============================================================================

stats_df = pd.DataFrame({
    "Column": ["MV_USD", "ret_bps"],
    "Observations_Used": [mv_results["count"], ret_results["count"]],
    "Excluded_High_MV_USD": [count_high_mv, count_high_mv],
    "Excluded_High_Ret": [count_high_ret, count_high_ret],
    "Total_Rows_Dropped": [total_dropped, total_dropped],
    "Mean_Raw": [mv_results["mean_raw"], ret_results["mean_raw"]],
    "Std_Dev_Raw": [mv_results["std_raw"], ret_results["std_raw"]],
    "Median_Raw": [mv_results["median_raw"], ret_results["median_raw"]],
    "Mean_Winsorized": [mv_results["winsor_mean"], ret_results["winsor_mean"]],
    "Winsorization_Alpha": [winsor_alpha if winsorize_data else np.nan] * 2
})


# =============================================================================
# BUILD DISTRIBUTION TABLES
# =============================================================================

if mv_results["raw_q"] is not None and ret_results["raw_q"] is not None:
    raw_distribution_df = pd.DataFrame(
        {"MV_USD_raw": mv_results["raw_q"], "ret_bps_raw": ret_results["raw_q"]},
        index=dist_index_labels
    )
else:
    raw_distribution_df = pd.DataFrame()

winsor_distribution_df = None
if winsorize_data and (mv_results["winsor_q"] is not None) and (ret_results["winsor_q"] is not None):
    winsor_distribution_df = pd.DataFrame(
        {"MV_USD_winsorized": mv_results["winsor_q"], "ret_bps_winsorized": ret_results["winsor_q"]},
        index=dist_index_labels
    )


# =============================================================================
# SAVE RESULTS TO EXCEL
# =============================================================================

os.makedirs(stats_output_dir, exist_ok=True)

with pd.ExcelWriter(stats_output_path_xlsx) as writer:
    stats_df.to_excel(writer, sheet_name="Summary", index=False)
    raw_distribution_df.to_excel(writer, sheet_name="Raw_Distribution", index=True)
    if winsor_distribution_df is not None:
        winsor_distribution_df.to_excel(writer, sheet_name="Winsorized_Distribution", index=True)

print("\n=== SUMMARY STATISTICS (Post-Filtering) ===")
print(stats_df)
print("\nExcel summary successfully saved to:")
print(stats_output_path_xlsx)

Using merged file:
/home/jovyan/work/hpool1/pseidel/test/Temp/TempDataPreparation/MarketData_pre_clean.txt

--- Loading Data and Applying Filters ---
Total rows loaded: 150,667,706
Filter Thresholds: MV_USD > 5,000,000 | Return > 100,000 bps (1000%)
 - Rows with High MV: 3,289
 - Rows with High Return: 12,898
 - Total Unique Rows Dropped: 16,187
Rows remaining for analysis: 150,651,519

--- Analyzing column: MV_USD ---
  Performing ID-level winsorization for MV_USD...

--- Analyzing column: ret_bps ---
  Performing ID-level winsorization for ret_bps...

=== SUMMARY STATISTICS (Post-Filtering) ===
    Column  Observations_Used  Excluded_High_MV_USD  Excluded_High_Ret  \
0   MV_USD          150651519                  3289              12898   
1  ret_bps          150651519                  3289              12898   

   Total_Rows_Dropped     Mean_Raw   Std_Dev_Raw  Median_Raw  Mean_Winsorized  \
0               16187  1911.139135  17954.744053      157.05      1908.180445   
1          

### Winsorize Returns (1% per ID)

In [109]:
# =============================================================================
# CREATE FULLY CLEANED & WINSORIZED DATASET + SUMMARY
# -----------------------------------------------------------------------------
# 1) Loads the entire MarketData_pre_clean.txt file into memory.
# 2) **FILTERING STEP**:
#       - Removes rows where MV > 5,000,000.
#       - Removes rows where Return > 1000% (converted to basis points).
#       - These rows are physically deleted from the dataframe.
# 3) Performs ID-level winsorization at 1% in each tail for MV and ret_bps
#    on the remaining data.
# 4) Writes the fully cleaned & winsorized dataset to a new text file.
# 5) Computes summary statistics and distribution tables for the winsorized
#    columns only.
# 6) Saves these results into an Excel file.
# =============================================================================

import numpy as np
import pandas as pd
import os

# -----------------------------------------------------------------------------
# CONFIGURATION
# -----------------------------------------------------------------------------

# Input file: uses the same path as in Cell 1
input_file_path = merged_file_path  # relies on variable defined in Cell 1

# Output file for the full cleaned & winsorized dataset
winsorized_file_path = f"{Temp_file_path_DP}/MarketData_clean.txt"

# Excel summary for the winsorized data
winsor_stats_output_dir       = Temp_file_path_DP
winsor_stats_output_name_xlsx = "MarketData_winsorized_summary.xlsx"
winsor_stats_output_path_xlsx = f"{winsor_stats_output_dir}/{winsor_stats_output_name_xlsx}"

# Filtering Thresholds (Must match previous cell logic)
mv_filter_threshold          = 5_000_000   # Exclude if MV > 5,000,000
ret_filter_threshold_percent = 1000        # Exclude if Return > 1000%
ret_filter_threshold_bps     = ret_filter_threshold_percent * 100

# Winsorization configuration
winsor_alpha     = 0.01                 # 1% in each tail
winsor_id_column = "ID"                 # ID-level winsorization
target_columns   = ["MV_USD", "ret_bps"]    # columns to winsorize

# Distribution quantiles configuration (in percent)
distribution_percentiles = [0, 1, 5, 10, 20, 30, 40, 50,
                            60, 70, 80, 90, 95, 99, 100]

# Create index labels for rows
dist_index_labels = []
for p in distribution_percentiles:
    if p == 0:
        dist_index_labels.append("min")
    elif p == 100:
        dist_index_labels.append("max")
    else:
        dist_index_labels.append(f"p{int(p):02d}")

# -----------------------------------------------------------------------------
# SAFETY CHECK
# -----------------------------------------------------------------------------

if not os.path.exists(input_file_path):
    raise FileNotFoundError(f"Input file not found at: {input_file_path}")

print("Processing input file:")
print(input_file_path)

os.makedirs(winsor_stats_output_dir, exist_ok=True)

# =============================================================================
# STEP 1: LOAD FULL DATASET
# =============================================================================

df = pd.read_csv(
    input_file_path,
    sep="|"
)
original_row_count = len(df)
print(f"Original row count: {original_row_count:,}")

# =============================================================================
# STEP 1.5: APPLY FILTERS AND REMOVE ROWS
# =============================================================================

print("\n--- Applying Exclusion Filters ---")

# Identify rows to drop
# We use fillna(False) so we don't accidentally drop NaNs here (NaNs handled in winsorization)
high_mv_mask = df["MV_USD"] > mv_filter_threshold
high_ret_mask = df["ret_bps"] > ret_filter_threshold_bps

# Combine masks
rows_to_drop_mask = high_mv_mask | high_ret_mask
drop_count = rows_to_drop_mask.sum()

# Perform the drop
df = df[~rows_to_drop_mask].copy()
new_row_count = len(df)

print(f"Thresholds: MV_USD > {mv_filter_threshold:,.0f} | Return > {ret_filter_threshold_bps:,} bps")
print(f"Rows dropped: {drop_count:,}")
print(f"Rows remaining: {new_row_count:,}")

# =============================================================================
# STEP 2: APPLY ID-LEVEL WINSORIZATION TO TARGET COLUMNS
# =============================================================================

for col in target_columns:
    if col not in df.columns:
        print(f"Column {col} not found in input data. Skipping.")
        continue

    print(f"\n--- Winsorizing column: {col} ---")

    # Clean numeric column: replace +/- inf with NaN
    df[col] = df[col].replace([np.inf, -np.inf], np.nan)

    # Compute lower and upper quantiles per ID on valid values only
    lower_q = df.groupby(winsor_id_column)[col].transform(
        lambda x: x.quantile(winsor_alpha)
    )
    upper_q = df.groupby(winsor_id_column)[col].transform(
        lambda x: x.quantile(1.0 - winsor_alpha)
    )

    # Winsorize within [lower_q, upper_q] per ID.
    mask_valid_vals = df[col].notna()
    bounds_defined = mask_valid_vals & lower_q.notna() & upper_q.notna()

    # Initialize with original values
    winsor_col = df[col].copy()

    # Apply clipping only where both bounds are defined
    winsor_col.loc[bounds_defined] = winsor_col.loc[bounds_defined].clip(
        lower=lower_q.loc[bounds_defined],
        upper=upper_q.loc[bounds_defined]
    )

    df[col] = winsor_col

print("\nFull winsorized dataset created in memory.")

# =============================================================================
# STEP 3: SAVE THE FULL CLEANED & WINSORIZED DATASET
# =============================================================================

df.to_csv(
    winsorized_file_path,
    sep="|",
    index=False
)

print("Cleaned and winsorized dataset saved to:")
print(winsorized_file_path)

# =============================================================================
# STEP 4: COMPUTE SUMMARY STATISTICS AND DISTRIBUTION FOR WINSORIZED DATA
# =============================================================================

summary_rows = []
dist_data = {}

for col in target_columns:
    if col not in df.columns:
        continue

    print(f"\n--- Computing winsorized statistics for: {col} ---")

    valid_vals = df[col].dropna()
    if valid_vals.empty:
        # If filtering removed everything, handle gracefully
        print(f"Warning: No valid observations left for {col}.")
        continue

    count      = valid_vals.size
    mean_w     = valid_vals.mean()
    std_w      = valid_vals.std(ddof=1) if count > 1 else np.nan
    median_w   = valid_vals.median()
    q_vals     = np.percentile(valid_vals.to_numpy(), distribution_percentiles)

    summary_rows.append({
        "Column": col,
        "Observations": count,
        "Rows_Dropped_Filter": drop_count,
        "Mean_Winsorized": mean_w,
        "Std_Dev_Winsorized": std_w,
        "Median_Winsorized": median_w,
        "Winsorization_Alpha": winsor_alpha,
        "Winsorization_By_ID_Column": winsor_id_column
    })

    dist_data[f"{col}_winsorized"] = q_vals

# Summary DataFrame
winsor_stats_df = pd.DataFrame(summary_rows)

# Distribution DataFrame
if dist_data:
    winsor_distribution_df = pd.DataFrame(
        dist_data,
        index=dist_index_labels
    )
else:
    winsor_distribution_df = pd.DataFrame()

# =============================================================================
# STEP 5: SAVE SUMMARY TO EXCEL
# =============================================================================

with pd.ExcelWriter(winsor_stats_output_path_xlsx) as writer:
    winsor_stats_df.to_excel(writer, sheet_name="Summary_Winsorized", index=False)
    winsor_distribution_df.to_excel(writer, sheet_name="Distribution_Winsorized", index=True)

print("\n=== WINSORIZED SUMMARY STATISTICS ===")
print(winsor_stats_df)

print("\n=== WINSORIZED DISTRIBUTION ===")
print(winsor_distribution_df)

print("\nWinsorized summary Excel successfully saved to:")
print(winsor_stats_output_path_xlsx)

Processing input file:
/home/jovyan/work/hpool1/pseidel/test/Temp/TempDataPreparation/MarketData_pre_clean.txt
Original row count: 150,667,706

--- Applying Exclusion Filters ---
Thresholds: MV_USD > 5,000,000 | Return > 100,000 bps
Rows dropped: 16,187
Rows remaining: 150,651,519

--- Winsorizing column: MV_USD ---

--- Winsorizing column: ret_bps ---

Full winsorized dataset created in memory.
Cleaned and winsorized dataset saved to:
/home/jovyan/work/hpool1/pseidel/test/Temp/TempDataPreparation/MarketData_clean.txt

--- Computing winsorized statistics for: MV_USD ---

--- Computing winsorized statistics for: ret_bps ---

=== WINSORIZED SUMMARY STATISTICS ===
    Column  Observations  Rows_Dropped_Filter  Mean_Winsorized  \
0   MV_USD     150651519                16187      1908.180445   
1  ret_bps     150651519                16187         5.417396   

   Std_Dev_Winsorized  Median_Winsorized  Winsorization_Alpha  \
0        17876.494586             157.01                 0.01   
1

### Check ID Difference

 => Comment: Difference due to non-overlapping Dates in Returns_clean and MV_clean

In [110]:
import pandas as pd
import os

mv_file  = f"{Temp_file_path_GO}/MV_clean.txt"
ret_file = f"{Temp_file_path_DP}/Returns_clean.txt"
md_file  = f"{Temp_file_path_DP}/MarketData_clean.txt"
id_file  = f"{Temp_file_path_EoC}/ID_clean.txt"

# All valid IDs from ID_clean
ids_ref = set(
    pd.read_csv(id_file, sep="|", usecols=["ID"], dtype={"ID": "string"})["ID"]
)

# IDs in the merged MarketData_clean
ids_md = set(
    pd.read_csv(md_file, sep="|", usecols=["ID"], dtype={"ID": "string"})["ID"]
)

missing_ids = ids_ref - ids_md
print(len(missing_ids))
print(sorted(missing_ids))

764
['C036000A0', 'C036001A0', 'C036001G0', 'C036001I0', 'C036001T0', 'C036002V0', 'C03601380', 'C03607230', 'C03607C00', 'C03617890', 'C0361C100', 'C03622200', 'C03627960', 'C03640640', 'C03642D00', 'C03655920', 'C03656110', 'C03656130', 'C03659670', 'C0365F500', 'C0365LF00', 'C03661390', 'C03665920', 'C03666170', 'C03667450', 'C03668570', 'C0367RB00', 'C0367S800', 'C03682060', 'C0368V500', 'C03694900', 'C036A1720', 'C036A2A00', 'C036BT320', 'C036BY900', 'C036CA510', 'C036CD530', 'C036CH450', 'C036CT200', 'C036E0280', 'C036EB500', 'C036EHB00', 'C036F00E0', 'C036F03W0', 'C036F05N0', 'C036F13G0', 'C036F4090', 'C036F41K0', 'C036F4520', 'C036F46B0', 'C036F49L0', 'C036F5160', 'C036F5170', 'C036F60T0', 'C036F6170', 'C036F6230', 'C036F6250', 'C036F62O0', 'C036F6780', 'C036F67J0', 'C036F6840', 'C036F68Q0', 'C036F69H0', 'C036F7220', 'C036F7440', 'C036F7660', 'C036F7840', 'C036F7930', 'C036F7D00', 'C036F8310', 'C036F8380', 'C036F8530', 'C036F8590', 'C036F9280', 'C036F9360', 'C036F9740', 'C036FH

In [111]:
# =============================================================================
# SUMMARY
# -------
# This debugging snippet helps you investigate *why* certain IDs appear in
# `missing_ids` — meaning they were found in MV_clean.txt but not in
# Returns_clean.txt.
#
# What it does:
#   1) Picks one example ID from the set `missing_ids`
#   2) Loads only the necessary columns from MV and Returns
#   3) Filters both datasets to this single ID
#   4) Prints the sorted list of dates available for:
#        - MV (DayDate)
#        - Returns (DayDate)
#
# This makes it easy to check whether:
#   - The ID truly has no matching dates,
#   - There are mismatched date formats,
#   - Or Returns simply doesn't contain this ID at all.
# =============================================================================

# Only proceed if there are actual missing IDs to debug
if missing_ids:
    # Pick one example ID from the set of IDs that exist in MV but do not match Returns
    example_id = next(iter(missing_ids))

    # Load selected columns from MV file (keeps memory usage low)
    mv_example = pd.read_csv(
        mv_file,
        sep="|",
        usecols=["ID", "DayDate", "MV"],
        dtype={"ID": "string", "DayDate": "string"},
    )

    # Filter MV data to only the chosen ID
    mv_example = mv_example[mv_example["ID"] == example_id]

    # Load selected columns from Returns file
    ret_example = pd.read_csv(
        ret_file,
        sep="|",
        usecols=["ID", "DayDate", "ret_bps"],
        dtype={"ID": "string", "DayDate": "string"},
    )

    # Filter Returns data to the same ID
    ret_example = ret_example[ret_example["ID"] == example_id]

    # Print available date values for this ID in both datasets
    print("MV dates:", sorted(mv_example["DayDate"].unique()))
    print("Returns dates:", sorted(ret_example["DayDate"].unique()))
else:
    print("No missing IDs found to debug. The 'missing_ids' set is empty.")


MV dates: ['2024-11-20', '2024-11-21', '2024-11-22', '2024-11-25', '2024-11-26', '2024-11-27', '2024-11-28', '2024-11-29', '2024-12-02', '2024-12-03', '2024-12-04', '2024-12-05', '2024-12-06', '2024-12-09', '2024-12-10', '2024-12-11', '2024-12-12', '2024-12-13', '2024-12-16', '2024-12-17', '2024-12-18', '2024-12-19', '2024-12-20', '2024-12-23', '2024-12-24', '2024-12-25', '2024-12-26', '2024-12-27', '2024-12-30', '2024-12-31', '2025-01-01', '2025-01-02', '2025-01-03', '2025-01-06', '2025-01-07', '2025-01-08', '2025-01-09', '2025-01-10', '2025-01-13', '2025-01-14', '2025-01-15', '2025-01-16', '2025-01-17', '2025-01-20', '2025-01-21', '2025-01-22', '2025-01-23', '2025-01-24', '2025-01-27', '2025-01-28', '2025-01-29', '2025-01-30', '2025-01-31', '2025-02-03', '2025-02-04', '2025-02-05', '2025-02-06', '2025-02-07', '2025-02-10', '2025-02-11', '2025-02-12', '2025-02-13', '2025-02-14', '2025-02-17']
Returns dates: ['2024-11-20', '2024-11-21', '2024-11-22', '2024-11-25', '2024-11-26', '2024-1

### Track IDs Dropping

In [112]:
# SUMMARY:
# This script takes multiple '|'-separated text files, extracts ONLY the ID column
# (for efficiency), combines all unique IDs across all datasets, and produces an
# Excel file showing which ID appears in which dataset. The Excel contains one row
# per unique ID and one column per input file, marked with "x" if the ID exists in it.

import os
import pandas as pd

# --------- INPUT SECTION ---------
# List all your input txt files here
file_paths = [
    os.path.join(Temp_file_path_DP, "MarketData_clean.txt"),
    os.path.join(Temp_file_path_DP, "Returns_clean.txt"),
    os.path.join(Temp_file_path_GO, "TRI_clean.txt"),
    os.path.join(Temp_file_path_GO, "MV_clean.txt"),
    os.path.join(Temp_file_path_GO, "LC_MV_clean.txt"),
    os.path.join(Temp_file_path_GO, "ID_mapping_clean.txt"),
    os.path.join(Temp_file_path_EoC, "filtered_ids.txt"),
    os.path.join(Temp_file_path_EoC, "ID_clean.txt"),
    os.path.join(Temp_file_path_EoC, "ADR_clean.txt"),
]

id_col = "ID"  # <-- change this if your column name is different
output_excel_path = os.path.join(Temp_file_path_DP, "ID_Drop_Tracking.xlsx")
# ---------------------------------


# 1) Read only the ID column from each file and store as sets
id_sets = {}  # key: dataset name, value: set of IDs

for path in file_paths:
    dataset_name = os.path.splitext(os.path.basename(path))[0]
    print(f"Processing {dataset_name} from {path}...")

    try:
        # Attempt to read with specified ID column name
        ids = pd.read_csv(path, sep="|", usecols=[id_col])[id_col]
        print(f"  Successfully read '{id_col}' column.")
    except ValueError as e:
        # If 'ID' column not found, try reading without header and assume first column is ID
        print(f"  Warning: '{id_col}' column not found in '{path}'. Attempting to read as single column without header.")
        try:
            temp_df = pd.read_csv(path, sep="|", header=None)
            # Assuming the first column (index 0) contains the IDs
            if not temp_df.empty:
                ids = temp_df.iloc[:, 0]
                print(f"  Successfully read first column as IDs.")
            else:
                ids = pd.Series([]) # Empty Series if file is empty
                print(f"  File '{path}' is empty, no IDs to process.")
        except Exception as inner_e:
            print(f"  Error reading '{path}' as single column: {inner_e}. Skipping this file.")
            continue # Skip to the next file if even this fallback fails

    id_sets[dataset_name] = set(ids.dropna())  # drop NaNs just in case
    print(f"  Loaded {len(id_sets[dataset_name])} unique IDs for {dataset_name}.")


# 2) Compute the union of all IDs
all_ids = sorted(set().union(*id_sets.values()))

# 3) Build presence matrix: rows = IDs, columns = datasets, values = "x"/""
presence_df = pd.DataFrame(index=all_ids)

for dataset_name, ids in id_sets.items():
    presence_df[dataset_name] = presence_df.index.to_series().isin(ids).map(
        lambda present: "x" if present else ""
    )

# Move ID from index to a proper column for nicer Excel output
presence_df.index.name = id_col
presence_df.reset_index(inplace=True)

# 4) Save to Excel
presence_df.to_excel(output_excel_path, index=False)

print(f"Done. Overview saved to: {output_excel_path}")


Processing MarketData_clean from /home/jovyan/work/hpool1/pseidel/test/Temp/TempDataPreparation/MarketData_clean.txt...
  Successfully read 'ID' column.
  Loaded 54603 unique IDs for MarketData_clean.
Processing Returns_clean from /home/jovyan/work/hpool1/pseidel/test/Temp/TempDataPreparation/Returns_clean.txt...
  Successfully read 'ID' column.
  Loaded 85918 unique IDs for Returns_clean.
Processing TRI_clean from /home/jovyan/work/hpool1/pseidel/test/Temp/TempGeneralOverview/TRI_clean.txt...
  Successfully read 'ID' column.
  Loaded 85918 unique IDs for TRI_clean.
Processing MV_clean from /home/jovyan/work/hpool1/pseidel/test/Temp/TempGeneralOverview/MV_clean.txt...
  Successfully read 'ID' column.
  Loaded 72079 unique IDs for MV_clean.
Processing LC_MV_clean from /home/jovyan/work/hpool1/pseidel/test/Temp/TempGeneralOverview/LC_MV_clean.txt...
  Successfully read 'ID' column.
  Loaded 85049 unique IDs for LC_MV_clean.
Processing ID_mapping_clean from /home/jovyan/work/hpool1/pseide

### Merge with US One Month Risk Free Rate (Daily)

In [254]:
# =============================================================================
# NON-BUCKETED MERGE (Risk-Free Rate Only) WITH ID FILTERING:
#   MarketData_clean.txt (pipe, has DayDate = YYYY-MM-DD)
#   FF CSV (comma, has a date column "date" and "rf")
#
# Result: FF_Benchmark_data_clean.txt (pipe-separated)
#
# Features:
#   - No bucketing, full in-memory merge.
#   - ONLY the 'rf' column is merged from the FF file.
#   - 'rf' is forward-filled for all Market dates (using merge_asof).
#   - ADDS 'Country' column derived from 'ID' (2nd to 4th digit).
#   - FILTERS: Drops countries with 30 or fewer unique IDs (logs dropped countries).
#   - Final output does NOT contain the FF "date" column.
# =============================================================================

import os
import gc
import csv
import pandas as pd

# =====================================================================
# CONFIGURATION
# =====================================================================
ff_csv_file = f'{Input_file_path}/FF_5F+Mom+RrR.csv'  # FF Factors
dst_dir     = Temp_file_path_DP                       # same as in your benchmark script
market_file = f"{dst_dir}/MarketData_clean.txt"       # output of your benchmark script
out_file    = f"{dst_dir}/FF_Benchmark_data_clean.txt"      # final merged file

FF_DATE_COL = "date"  # correct column name in FF CSV
TARGET_COL  = "rf"    # We only want to merge this column

print(f"FF CSV file:    {ff_csv_file}")
print(f"Market file:    {market_file}")
print(f"Output file:    {out_file}")
print("FF CSV exists:", os.path.exists(ff_csv_file))
print("Market exists:", os.path.exists(market_file))

# =====================================================================
# DATE NORMALISATION
# =====================================================================
def normalize_date(s: str):
    """
    Normalize date strings to YYYY-MM-DD if possible.
    Handles:
      - YYYY-MM-DD (unchanged)
      - YYYYMMDD  (FF style) -> YYYY-MM-DD
    """
    if s is None:
        return None
    s = s.strip()
    if not s:
        return None

    # Already looks like YYYY-MM-DD
    if "-" in s and len(s) == 10:
        return s

    # Plain YYYYMMDD digits
    if len(s) == 8 and s.isdigit():
        return f"{s[0:4]}-{s[4:6]}-{s[6:8]}"

    # Fallback: return as-is
    return s

# =====================================================================
# STEP 1: Read headers of both files (for logging)
# =====================================================================
with open(market_file, "r", newline="") as f_mkt:
    mkt_reader = csv.reader(f_mkt, delimiter="|")
    mkt_header = next(mkt_reader)

with open(ff_csv_file, "r", newline="") as f_ff:
    ff_reader = csv.reader(f_ff, delimiter=",")
    ff_header = next(ff_reader)

print("\nMarketData header:", mkt_header)
print("FF CSV header:", ff_header)

if FF_DATE_COL not in ff_header:
    raise ValueError(f"FF_DATE_COL = {FF_DATE_COL!r} not found in FF CSV header!")
if TARGET_COL not in ff_header:
    raise ValueError(f"Target column {TARGET_COL!r} not found in FF CSV header!")

print(f"Using FF date column: {FF_DATE_COL!r}")
print(f"Merging ONLY column:  {TARGET_COL!r}")

# =====================================================================
# STEP 2: Load full MarketData_clean.txt
# =====================================================================
print("\nLoading full MarketData_clean.txt into memory...")

mkt_df = pd.read_csv(
    market_file,
    sep="|",
    dtype="string",
    engine="c"
)

print(f"Rows in MarketData_clean.txt before cleaning: {len(mkt_df):,}")

# Normalize DayDate and drop rows without a valid date
mkt_df["DayDate"] = mkt_df["DayDate"].map(normalize_date)
mkt_df = mkt_df.dropna(subset=["DayDate"])

print(f"Rows in MarketData_clean.txt after dropping rows without DayDate: {len(mkt_df):,}")

# Collect unique market dates
mkt_dates = set(mkt_df["DayDate"].unique())
print(f"Unique DayDate values in MarketData: {len(mkt_dates):,}")

# =====================================================================
# STEP 3: Load FF CSV and build forward-filled 'rf' for all Market dates
# =====================================================================
print(f"\nBuilding FF '{TARGET_COL}' with forward-fill for all Market dates...")

# Load full FF CSV
ff_raw = pd.read_csv(
    ff_csv_file,
    sep=",",
    dtype="string",
    engine="c"
)

# Normalize FF date column
ff_raw[FF_DATE_COL] = ff_raw[FF_DATE_COL].map(normalize_date)
ff_raw = ff_raw.dropna(subset=[FF_DATE_COL])

# --- CRITICAL CHANGE: Keep ONLY Date and 'rf' ---
ff_raw = ff_raw[[FF_DATE_COL, TARGET_COL]].copy()

# Set of FF dates for statistics
ff_dates = set(ff_raw[FF_DATE_COL])

# Drop duplicate dates, keep last entry per date
ff_raw = ff_raw.drop_duplicates(subset=[FF_DATE_COL], keep="last")

# Convert to datetime and sort
ff_raw["_date_dt"] = pd.to_datetime(ff_raw[FF_DATE_COL])
ff_raw = ff_raw.sort_values("_date_dt")

# DataFrame with all Market dates
mkt_dates_sorted = sorted(mkt_dates)
dates_df = pd.DataFrame({"DayDate": mkt_dates_sorted})
dates_df["_date_dt"] = pd.to_datetime(dates_df["DayDate"])

# As-of merge: for each Market date, take last available FF row <= that date
ff_full = pd.merge_asof(
    dates_df.sort_values("_date_dt"),
    ff_raw,
    left_on="_date_dt",
    right_on="_date_dt",
    direction="backward"
)

# Set index to DayDate for fast join later
ff_full = ff_full.set_index("DayDate")

# Drop helper datetime and FF date columns from ff_full
# We only want to keep 'rf'
cols_to_drop = ["_date_dt", FF_DATE_COL, "date"]
existing_drop_cols = [c for c in cols_to_drop if c in ff_full.columns]
if existing_drop_cols:
    ff_full = ff_full.drop(columns=existing_drop_cols)

print(f"FF '{TARGET_COL}' factor with forward-fill built for {len(ff_full)} unique Market dates.")

# =====================================================================
# STEP 4: Quick sanity check – intersection of dates
# =====================================================================
intersection_size = len(mkt_dates & ff_dates)
print("\nDate intersection size (Market ∩ FF original dates):", intersection_size)
if intersection_size == 0:
    print("WARNING: No common dates after normalization – all FF values (if any) will be NaN!")

# =====================================================================
# STEP 5: Merge full MarketData with pre-built forward-filled 'rf'
# =====================================================================
if os.path.exists(out_file):
    os.remove(out_file)

print(f"\nMerging full MarketData with '{TARGET_COL}'...")

merged = mkt_df.merge(
    ff_full, # Contains only index (DayDate) and 'rf'
    left_on="DayDate",
    right_index=True,
    how="left"
)

# =====================================================================
# NEW STEP: Derive 'Country' Column
# =====================================================================
print("\nDeriving 'Country' column from 'ID' (digits 2-4)...")
# Logic: ID "C28011490" -> index 1:4 ("280")
if "ID" in merged.columns:
    merged["Country"] = merged["ID"].str[1:4]
else:
    print("WARNING: 'ID' column not found, skipping Country derivation.")

print("\nChecking unique IDs per Country and filtering...")

if "Country" in merged.columns and "ID" in merged.columns:
    # 1. Count unique IDs for each Country
    country_id_counts = merged.groupby("Country")["ID"].nunique()

    # 2. Identify which countries have <= 30 IDs (to drop)
    #    and which have > 30 IDs (to keep)
    countries_to_drop = country_id_counts[country_id_counts <= 30]
    valid_countries = country_id_counts[country_id_counts > 30].index

    # 3. Track/Log the countries being dropped
    print(f"Found {len(countries_to_drop)} countries with <= 30 unique IDs.")
    if not countries_to_drop.empty:
        print("Dropping the following countries (Count of Unique IDs):")
        # Sorting just for cleaner log output
        for ctry, count in countries_to_drop.sort_values(ascending=False).items():
            print(f"   Country: {ctry} | Unique IDs: {count}")

    # 4. Perform the filter
    rows_before = len(merged)
    merged = merged[merged["Country"].isin(valid_countries)]
    rows_after = len(merged)
    
    print(f"Rows dropped: {rows_before - rows_after:,}")
    print(f"Rows remaining: {rows_after:,}")
else:
    print("WARNING: Skipping filtering because 'Country' or 'ID' column is missing.")    

total_merged_rows = len(merged)

# Write merged rows (all columns from Market + 'rf' + 'Country')
merged.to_csv(
    out_file,
    sep="|",
    index=False,
    mode="w",
    header=True
)

print("\n=== MERGE STATS ===")
print(f"Total merged rows (Market rows with forward-filled '{TARGET_COL}'): {total_merged_rows:,}")
if "Country" in merged.columns:
    print(f"Unique Countries derived: {merged['Country'].nunique()}")

# =====================================================================
# STEP 6: Dates in MarketData not covered by original FF CSV (before forward-fill)
# =====================================================================
missing_in_ff = sorted(mkt_dates - ff_dates)
print("\n=== DATES IN MARKETDATA NOT PRESENT IN ORIGINAL FF CSV (now forward-filled) ===")
print(f"Number of such dates: {len(missing_in_ff):,}")
if missing_in_ff:
    print(f"Examples: {missing_in_ff[:5]}")

# =====================================================================
# STEP 7: Show head (first 50 rows) of FF_Benchmark_data.txt
# =====================================================================
print("\nHead of FF_Benchmark_data.txt (first 50 rows):\n")

if os.path.exists(out_file):
    head_df = pd.read_csv(out_file, sep="|", nrows=50, dtype="string")
    with pd.option_context("display.max_columns", None,
                           "display.width", 200,
                           "display.max_colwidth", 50):
        print(head_df.to_string(index=False))
else:
    print("Output file does not exist; no head to display.")

print("\nFinished creating FF benchmark merge file:")
print(out_file)

# =====================================================================
# STEP 8: Cleanup – explicitly delete large variables to free memory
# =====================================================================
try:
    del ff_full, ff_raw, dates_df, head_df, mkt_dates, ff_dates, missing_in_ff, mkt_df, merged
except NameError:
    pass

gc.collect()

FF CSV file:    /home/jovyan/work/hpool1/pseidel/test/Input/FF_5F+Mom+RrR.csv
Market file:    /home/jovyan/work/hpool1/pseidel/test/Temp/TempDataPreparation/MarketData_clean.txt
Output file:    /home/jovyan/work/hpool1/pseidel/test/Temp/TempDataPreparation/FF_Benchmark_data_clean.txt


FF CSV exists: True
Market exists: True

MarketData header: ['ID', 'PCUR', 'DayDate', 'MV_LC', 'ret_bps', 'MV_USD']
FF CSV header: ['date', 'mktrf', 'smb', 'hml', 'rmw', 'cma', 'rf', 'umd']
Using FF date column: 'date'
Merging ONLY column:  'rf'

Loading full MarketData_clean.txt into memory...
Rows in MarketData_clean.txt before cleaning: 150,651,519
Rows in MarketData_clean.txt after dropping rows without DayDate: 150,651,519
Unique DayDate values in MarketData: 8,642

Building FF 'rf' with forward-fill for all Market dates...
FF 'rf' factor with forward-fill built for 8642 unique Market dates.

Date intersection size (Market ∩ FF original dates): 8340

Merging full MarketData with 'rf'...

Deriving 'Country' column from 'ID' (digits 2-4)...

Checking unique IDs per Country and filtering...
Found 43 countries with <= 30 unique IDs.
Dropping the following countries (Count of Unique IDs):
   Country: 705 | Unique IDs: 30
   Country: 634 | Unique IDs: 27
   Country: 070 | Unique IDs: 27

0