In [20]:
import os
import pickle
from collections import defaultdict
from pathlib import Path

import pandas as pd

In [None]:
# Filesystem layout: every input lives under DATA_DIR (relative to the notebook).
DATA_DIR = Path("../data")
PROCESSED_DIR = DATA_DIR.joinpath("processed")

# Input CSVs read by the loading cell below.
RMS_FUNDAMENTAL_SCORE_FILE = DATA_DIR.joinpath("rms_with_fundamental_score.csv")
RMS_ISIN_LINK_FILE = DATA_DIR.joinpath("isin_rms_link.csv")

In [22]:
def _load_csv(csv_path):
    """
    Read a CSV file into a DataFrame and print a one-line status message.

    Parameters:
        csv_path (Path or str): Location of the CSV file to read.

    Returns:
        pd.DataFrame: The parsed file contents.

    Raises:
        FileNotFoundError: If ``csv_path`` does not exist. A "File not found"
            message is printed first, then the original exception is re-raised
            so the notebook stops instead of continuing without data.
    """
    try:
        frame = pd.read_csv(csv_path)
    except FileNotFoundError:
        print(f"File not found: {csv_path}")
        raise
    print(f"Loaded {csv_path}")
    return frame


# Load the RMS fundamental score data
df_rms_fundamental_score = _load_csv(RMS_FUNDAMENTAL_SCORE_FILE)

# Load the ISIN links data
df_rms_isin = _load_csv(RMS_ISIN_LINK_FILE)

Loaded ../data/rms_with_fundamental_score.csv
Loaded ../data/isin_rms_link.csv


In [5]:
# Count the distinct RmsId values present in each of the two input tables.
unique_rms_fundamental = df_rms_fundamental_score['RmsId'].nunique()
unique_rms_isin = df_rms_isin['RmsId'].nunique()

# Emit both counts in a single print call, one per line.
print(
    f"Unique RmsIds in Fundamental Scores: {unique_rms_fundamental}",
    f"Unique RmsIds in ISIN Links: {unique_rms_isin}",
    sep="\n",
)

Unique RmsIds in Fundamental Scores: 631
Unique RmsIds in ISIN Links: 398


In [6]:
def find_missing_prospectuses(df, processed_dir):
    """
    Identify RmsIds with missing or empty prospectus directories.

    For every distinct value in ``df['RmsId']`` the folder
    ``processed_dir / str(rms_id) / "as_expected"`` is checked. The RmsId is
    reported when that path is not a directory (absent, or a regular file)
    or when the directory contains no entries.

    Parameters:
        df (pd.DataFrame): DataFrame containing 'RmsId'.
        processed_dir (Path or str): Path to the processed data directory.

    Returns:
        list: RmsIds with missing prospectuses, in order of first appearance
            in ``df['RmsId']``.
    """
    processed_dir = Path(processed_dir)
    missing_files = []
    for rms_id in df['RmsId'].unique():
        folder_path = processed_dir / str(rms_id) / "as_expected"
        # is_dir() rather than exists(): a stray regular file at this path
        # would pass exists() and then make iterdir() raise NotADirectoryError.
        if not folder_path.is_dir():
            missing_files.append(rms_id)
            continue
        # Only the first entry is needed to know the folder is non-empty.
        if next(folder_path.iterdir(), None) is None:
            missing_files.append(rms_id)
    return missing_files

# Check every RmsId in the fundamental-score table against the processed folder tree.
missing_files = find_missing_prospectuses(
    df=df_rms_fundamental_score,
    processed_dir=PROCESSED_DIR,
)
print(f"Number of RmsIds with missing Prospectuses: {len(missing_files)}")

Number of RmsIds with missing Prospectuses: 508


In [8]:
# Left-join the score table onto the ISIN links: every ISIN link row is kept,
# and the `_merge` indicator column flags rows that found no matching score.
result_df = df_rms_isin.merge(
    df_rms_fundamental_score,
    how="left",
    on="RmsId",
    indicator=True,
)

In [12]:
# Normalise the date columns, order rows by how close each issue date is to the
# scoring date within an (RmsId, ScoringDate) pair, then discard the helper column.
result_df = (
    result_df
    .assign(
        IssueDate=lambda frame: pd.to_datetime(frame['IssueDate']),
        ScoringDate=lambda frame: pd.to_datetime(frame['ScoringDate']),
    )
    .assign(
        # Temporary sort key: absolute gap between issue and scoring dates
        DateDifference=lambda frame: (frame['IssueDate'] - frame['ScoringDate']).abs(),
    )
    .sort_values(by=['RmsId', 'ScoringDate', 'DateDifference'])
    .reset_index(drop=True)
    .drop(columns=['DateDifference'])
)

# Rows without a ScoringDate cannot be grouped later, so they are removed here.
df_cleaned = result_df.dropna(subset=['ScoringDate']).copy()

print(f"Data after cleaning: {df_cleaned.shape[0]} rows")


Data after cleaning: 9748 rows


In [None]:
def group_isins(df):
    """
    Collect, per RmsId, the ISINs attached to each ScoringDate.

    Rows are grouped on ('RmsId', 'ScoringDate'). Within each group the
    'ISIN' values are de-duplicated while keeping the order in which they
    appear in ``df``.

    Parameters:
        df (pd.DataFrame): Frame with 'RmsId', 'ScoringDate' and 'ISIN' columns.

    Returns:
        list: One ``[RmsId, [[ScoringDate, ISINs], ...]]`` entry per RmsId.
    """
    per_rms = {}

    for key, rows in df.groupby(['RmsId', 'ScoringDate']):
        rms_id, scoring_date = key

        # Order-preserving de-duplication of the group's ISINs
        seen = set()
        unique_isins = []
        for isin in rows['ISIN'].tolist():
            if isin not in seen:
                seen.add(isin)
                unique_isins.append(isin)

        per_rms.setdefault(rms_id, []).append([scoring_date, unique_isins])

    # Flatten the mapping into [RmsId, entries] pairs, in insertion order
    return [list(item) for item in per_rms.items()]

# Build the nested [RmsId, [[ScoringDate, ISINs], ...]] structure from the cleaned frame.
final_result = group_isins(df_cleaned)

# Optional: persist the grouped result. Left disabled; uncomment to write the
# pickle (`pickle` is imported with the other imports at the top of the notebook).
# with open(DATA_DIR / "grouped_isins.pkl", "wb") as f:
#     pickle.dump(final_result, f)

# Preview the first three RmsId entries.
final_result[:3]

[[np.int64(5),
  [[Timestamp('2022-06-29 00:00:00'),
    ['DE000A3LF6J0',
     'XS2336188029',
     'XS2283225477',
     'XS2283224231',
     'XS2248826294',
     'XS2010029663',
     'DE000A3L3AG9',
     'DE000A3L3AE4',
     'DE000A3L3AD6',
     'DE000A3L3AJ3',
     'DE000A3L3AH7',
     'XS1843441491',
     'DE000A2RUD79',
     'XS1713464441',
     'XS1713464524',
     'XS1731858392',
     'XS1731858715',
     'XS1652965085']],
   [Timestamp('2022-08-10 00:00:00'),
    ['DE000A3LF6J0',
     'XS2336188029',
     'XS2283225477',
     'XS2283224231',
     'XS2248826294',
     'XS2010029663',
     'DE000A3L3AG9',
     'DE000A3L3AE4',
     'DE000A3L3AD6',
     'DE000A3L3AJ3',
     'DE000A3L3AH7',
     'XS1843441491',
     'DE000A2RUD79',
     'XS1713464441',
     'XS1713464524',
     'XS1731858392',
     'XS1731858715',
     'XS1652965085']],
   [Timestamp('2023-02-24 00:00:00'),
    ['DE000A3LF6J0',
     'DE000A3L3AG9',
     'DE000A3L3AE4',
     'DE000A3L3AD6',
     'DE000A3L3AJ3',
     '