In [None]:
import os
import pandas as pd
#import re
import numpy as np
from datetime import datetime
from openpyxl import load_workbook
import warnings
import pathlib
import threading
warnings.filterwarnings("ignore", category=UserWarning, module='openpyxl')

# MANUAL input to change
your_ing_id = "PY40DL"
date = datetime.strptime('2024', "%Y")
update_final_overview = True
index = 0 # counter to see progression of the pearl library reading. If loop gets stuck or for e.g code crashes then you can set the index to where it stopped (update_final_overview = False)
edition = "August 2025"
########################################################################################################################

# DO NOT CHANGE ANYTHING BELOW - only when developing code
# determining operating system
is_windows = os.name == 'nt'
if is_windows:
    base_dir = os.path.join("C:", "\\Users", your_ing_id, "ING")
else:
    base_dir = pathlib.Path(f"/Users/{your_ing_id}/Library/CloudStorage/OneDrive-SharedLibraries-ING")

# Define paths dynamically
# directory_path0: working directory of the notebook (outputs are written below it)
# directory_path : locally synced PEARL repository (input workbooks)
# directory_path2: locally synced MI Dashboard library (input dashboard)
# directory_path3: per-edition folder for the intermediate CSV results
# directory_path4: folder for the final overview CSV
directory_path0 = os.getcwd()
directory_path = os.path.join(base_dir, "Product Evaluation and Risk Assessment Library (PEARL) - PEARL_Repository")
directory_path2 = os.path.join(base_dir, "Product Evaluation and Risk Assessment Library (PEARL) - MI Dashboard")
directory_path3 = os.path.join(directory_path0, "Intermediate results", edition)
directory_path4 = os.path.join(directory_path0, "Final overview")

# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race
os.makedirs(directory_path3, exist_ok=True)
os.makedirs(directory_path4, exist_ok=True)

# Print paths
print(f"Operating System: {'Windows' if is_windows else 'Mac/Linux'}")
print(f"PEARL Repository Path: {directory_path}")
print(f"MI Dashboard Path: {directory_path2}")
print(f"Intermediate Results Path: {directory_path3}")


In [None]:
# All the needed functions for this code

# Function to list directories with a timeout mechanism
def list_dir_with_timeout(path, timeout=10):
    """List the entries of ``path``, giving up after ``timeout`` seconds.

    ``os.listdir`` is executed in a worker thread so that a call that does not
    return cannot block the caller for longer than ``timeout``.

    Args:
        path: Directory to list.
        timeout: Maximum number of seconds to wait for the listing.

    Returns:
        The list of entry names, or an empty list when the listing raised an
        exception (the error is printed) or did not finish within ``timeout``.
    """
    files = []

    def target():
        nonlocal files
        try:
            files = os.listdir(path)
        except Exception as e:
            print(f"Error accessing {path}: {e}")

    # daemon=True: a listing that never returns must not keep the interpreter
    # alive at shutdown. Python threads cannot be killed, so on timeout the
    # worker is simply abandoned and whatever it produces later is ignored.
    thread = threading.Thread(target=target, daemon=True)
    thread.start()
    thread.join(timeout)  # Wait for the thread to finish within the timeout

    if thread.is_alive():
        print(f"Timeout reached for {path}, terminating operation")
        return []

    return files


def find_matching_strings(strings):
    """Return, in input order, the names that look like risk-journey workbooks.

    A name qualifies when, ignoring case, it starts with 'risk', contains
    'journey' and ends with '.xlsm' or '.xlsx'.
    """
    excel_suffixes = ('.xlsm', '.xlsx')

    def is_risk_journey_workbook(name):
        lowered = name.lower()
        if not lowered.startswith('risk'):
            return False
        if 'journey' not in lowered:
            return False
        return lowered.endswith(excel_suffixes)

    return [candidate for candidate in strings if is_risk_journey_workbook(candidate)]

In [None]:
# folder and ID mapping
# Reads the PEARL list export from the notebook's working directory and keeps
# one row per list item with its repository-relative folder and its ID.
file_name = 'Pearl List.xlsx'
overview = pd.read_excel(file_name, engine="openpyxl")

# '#$@' is a literal placeholder for a space. regex=False is passed explicitly:
# the default of `regex` differs between pandas versions, and read as a regular
# expression '#$@' contains the end-of-string anchor '$' and never matches.
overview['Folder_URL_txt'] = overview['Folder_URL_txt'].str.replace('#$@', ' ', regex=False)

# Strip the SharePoint prefix (it occurs with two different casings) so that
# 'Folder' is relative to the synced PEARL repository folder.
overview['Folder'] = overview['Folder_URL_txt'].str.replace(
    'https://ing.sharepoint.com/sites/PEARL_cs/Shared Documents/Pearl_Repository/', '', regex=False)
overview['Folder'] = overview['Folder'].str.replace(
    'https://ing.sharepoint.com/sites/PEARL_cs/Shared Documents/PEARL_Repository/', '', regex=False)

cols_overview = ['Folder', 'ID']
# .copy() so the assignment below modifies an independent frame (no SettingWithCopyWarning)
overview = overview[cols_overview].copy()
overview['ID'] = overview['ID'].astype(str)
overview

In [None]:
# reading MI dashboard
# Build the full path of the MI dashboard workbook inside the MI Dashboard
# folder (directory_path2); the following cell reads its sheets from file_path.
file_name = 'MI Dashboard.xlsm'
file_path = os.path.join(directory_path2, file_name)

In [None]:
def _read_mi_sheet(workbook_path, sheet_name, column_renames, type_label):
    """Load one 'Management Info' sheet of the MI dashboard in a common layout.

    The first two rows of the sheet are skipped, columns are renamed according
    to ``column_renames``, the '(blank)' placeholders in 'Start Date' and
    'End Date' are turned into missing values before both columns are parsed
    as datetimes, and a constant 'Type' column holding ``type_label`` is added.
    """
    frame = pd.read_excel(workbook_path, sheet_name=sheet_name, engine="openpyxl", skiprows=2)
    frame = frame.rename(columns=column_renames)
    for date_column in ('End Date', 'Start Date'):
        is_blank = frame[date_column] == '(blank)'
        frame[date_column] = pd.to_datetime(frame[date_column].mask(is_blank))
    frame['Type'] = type_label
    return frame


# Changes
mi_dash_changes = _read_mi_sheet(
    file_path,
    "Management Info Changes",
    {'PEARL ID': 'ID'},
    'Change',
)

# Reviews: renamed to the same column names as the changes sheet so both
# frames can be stacked later on
mi_dash_review = _read_mi_sheet(
    file_path,
    "Management Info Reviews",
    {'RJT Review PEARL ID': 'ID',
     'RJT Review Start Date': 'Start Date',
     'RJT Review Approval Date': 'End Date',
     'RJT Review Status': 'Status'},
    'Review',
)

In [None]:
# creating overview so we know in which risk assesments we are interested

# The overview is written to / read back from one CSV per edition.
final_overview_csv = os.path.join(directory_path4, 'final_overview ' + edition +'.csv')

if update_final_overview:

    final_overview = pd.concat([mi_dash_changes, mi_dash_review])

    # Cut-off taken from the manual input `date` instead of a second hard-coded
    # '2024'. pd.Timestamp accepts both the datetime from the input cell and the
    # string this line leaves behind, so the cell can be re-run.
    date = pd.Timestamp(date).strftime('%Y-%m-%d')

    # keep items that ended on/after the cut-off or have no end date, and that have an ID
    cond = ((final_overview['End Date'] >= date) | (final_overview['End Date'].isna())) & (~final_overview['ID'].isna())
    final_overview = final_overview[cond]
    cols = ['ID', 'Start Date', 'End Date', 'Process Category', 'Tribe', 'Status', 'Type', 'Duration']
    final_overview = final_overview[cols]
    # NOTE(review): overview['ID'] is cast to str; this merge assumes the dashboard IDs are
    # strings in the same format - confirm, otherwise no folder is matched.
    final_overview = final_overview.merge(overview, how = 'left', on = 'ID')

    final_overview['End Year'] = final_overview['End Date'].dt.year

    final_overview['Process Category'] = final_overview['Process Category'].fillna('empty')

    # number of items per process category, end year and type (not written to disk in this cell)
    final_overview_grouped = final_overview.groupby([
        'Process Category',
        'End Year',
        'Type']).agg({'ID': 'count'}).reset_index()

    final_overview.to_csv(final_overview_csv, sep=';', index = False)


# Always continue from the CSV so a resumed run (update_final_overview = False)
# iterates over exactly the same folder list as the run that created it.
final_overview = pd.read_csv(final_overview_csv, sep=';')
folders_to_check = final_overview.loc[:, 'Folder'].dropna()


In [None]:
# for testing and checking a specific folder ########
item_to_find = r'Hypotheken - WUB/WUB Hypotheek met Beleggingsrekening/2024-04-15 Intermediate Review product'

# Position (not index label) of the first folder equal to item_to_find.
# The counter previously never advanced, so the first folder was always selected.
temp_index = 0
for position, f in enumerate(folders_to_check):
    if f == item_to_find:
        temp_index = position
        break
else:
    # not found: fall back to the first folder instead of indexing past the end
    print(f"Folder not found in folders_to_check, using the first folder instead: {item_to_find}")

folder = folders_to_check.iloc[temp_index] # for testing


#folder = folders_to_check.iloc[1] # for testing

In [None]:
# Resume support: drop the first `index` folders (by position) so the loop
# below continues where an earlier run stopped. Running this cell again
# slices the already shortened series once more.
n = len(folders_to_check)
folders_to_check = folders_to_check.iloc[index:n]

In [None]:
def _first_matching_row(df, column, label):
    """Return the position of the first row whose cell in ``column`` equals ``label``.

    Returns None when no cell matches. The sheets are parsed with the default
    integer index, so the index label of a match can be used with ``.iloc``.
    """
    matches = df.index[df.iloc[:, column] == label]
    return None if matches.empty else matches.tolist()[0]


def _cell_text(df, row, columns):
    """Return ``str()`` of the cells in ``row`` at each position in ``columns``."""
    return [str(df.iloc[row, column]) for column in columns]


def _save_result(data, csv_name):
    """Write ``data`` as a ';'-separated CSV called ``csv_name`` into directory_path3."""
    print("Data before saving:", data)  # Debug output
    csv_path = os.path.join(directory_path3, csv_name)
    data.to_csv(csv_path, sep=";", index=False)
    print(f"Results saved: {csv_path}")  # Confirmation


# Walk over every folder of interest, open each risk-journey workbook in it and
# export the relevant cells of each known sheet to its own CSV. The CSV names
# start with "<folder counter>_<file counter>" so results never overwrite each other.
for folder in folders_to_check:
    if is_windows:
        folder = folder.replace("/", "\\")
    else:
        folder = folder.replace("\\", "/")

    complete_folder_path = os.path.join(directory_path, folder) # Proper path handling
    index += 1  # progress counter; also the resume position for a later run
    print(f'Folder {index} {folder}')

    if not os.path.isdir(complete_folder_path):
        continue

    files = find_matching_strings(list_dir_with_timeout(complete_folder_path, timeout=100))

    index_temp = 0

    for f in files:  # Iterate through filtered files
        index_temp += 1
        # Not called file_path, so the MI dashboard path stored under that name is not overwritten.
        workbook_path = os.path.join(complete_folder_path, f)
        csv_prefix = f"{index}_{index_temp}"

        print(f"Processing file: {workbook_path}")  # Debug output

        try:
            modification_time = os.path.getmtime(workbook_path)
            modification_time = datetime.fromtimestamp(modification_time).strftime('%Y-%m-%d %H:%M:%S')
            xls = pd.ExcelFile(workbook_path, engine="openpyxl")
        except Exception as e:
            print(f'File not opened {workbook_path}. Error: {e}')
            continue

        # The workbook is opened once, every sheet is parsed from that handle,
        # and the handle is closed when the block is left.
        with xls:
            sheet_names = xls.sheet_names
            print(f"Sheet names found: {sheet_names}")  # Debug output

            if "Process & Module Selection" in sheet_names:
                try:
                    df = xls.parse("Process & Module Selection")
                    row_index = _first_matching_row(df, 1, 'Risk entity')

                    print(f"Row index found: {row_index}")  # Debug output

                    if row_index is not None:
                        # ORM is read 1 row and IRM 4 rows below the 'Risk entity' header
                        orm, orm_value = _cell_text(df, row_index + 1, (1, 5))
                        irm, irm_value = _cell_text(df, row_index + 4, (1, 5))

                        data = pd.DataFrame({
                            'File': [f.lower()], 'ORM_check': [orm], 'ORM': [orm_value],
                            'IRM_check': [irm], 'IRM': [irm_value], 'Folder': [folder],
                            'File date': [modification_time]
                        })
                        _save_result(data, f"{csv_prefix}_process_module_selection.csv")

                except Exception as e:
                    print(f"Process & Module Selection NOT opened in {workbook_path}. Error: {e}")

            if "Risk Summary & Approval" in sheet_names:
                try:
                    df = xls.parse("Risk Summary & Approval")
                    row_index = _first_matching_row(
                        df, 1, 'Step 4 -  Select Risk Entities to inform or to invite for challenge and agreement')

                    print(f"Row index found for Risk Summary: {row_index}")  # Debug output

                    if row_index is not None:
                        # ORM is read 3 rows and IRM 6 rows below the 'Step 4' header
                        orm, orm_value = _cell_text(df, row_index + 3, (1, 4))
                        irm, irm_value = _cell_text(df, row_index + 6, (1, 4))

                        data = pd.DataFrame({
                            'File': [f.lower()], 'ORM_check': [orm], 'ORM_invited_or_challange': [orm_value],
                            'IRM_check': [irm], 'IRM_invited_or_challange': [irm_value], 'Folder': [folder],
                            'File date': [modification_time]
                        })
                        _save_result(data, f"{csv_prefix}_risk_summary_approval.csv")

                except Exception as e:
                    print(f"Risk Summary & Approval NOT opened in {workbook_path}. Error: {e}")

            if "General Risk Identification" in sheet_names:
                try:
                    df = xls.parse("General Risk Identification")

                    # Part 1: invitation table
                    row_index = _first_matching_row(df, 6, 'Must be invited?')
                    if row_index is not None:
                        orm, orm_value, orm_value2 = _cell_text(df, row_index + 1, (1, 6, 9))
                        irm, irm_value, irm_value2 = _cell_text(df, row_index + 4, (1, 6, 9))

                        data = pd.DataFrame({
                            'File': [f.lower()], 'ORM_check': [orm], 'ORM_invited': [orm_value],
                            'ORM_part_of_risk_asses': [orm_value2],
                            'IRM_check': [irm], 'IRM_invited': [irm_value],
                            'IRM_part_of_risk_asses': [irm_value2], 'Folder': [folder]
                        })
                        data['File date'] = modification_time
                        _save_result(data, f"{csv_prefix}_general_risk_ident_1.csv")

                    # Part 2: person / opinion / challenge table
                    row_index = _first_matching_row(
                        df, 6, 'Fill in your name (type over email address) if not already indicated')
                    if row_index is not None:
                        orm, orm_value, orm_value2, orm_value3 = _cell_text(df, row_index + 1, (1, 6, 8, 10))
                        irm, irm_value, irm_value2, irm_value3 = _cell_text(df, row_index + 4, (1, 6, 8, 10))

                        data = pd.DataFrame({
                            'File': [f.lower()], 'ORM_check': [orm], 'ORM_person': [orm_value],
                            'ORM_opinion': [orm_value2], 'ORM_challenge': [orm_value3],
                            'IRM_check': [irm], 'IRM_person': [irm_value],
                            'IRM_opinion': [irm_value2], 'IRM_challenge': [irm_value3], 'Folder': [folder]
                        })
                        data['File date'] = modification_time
                        _save_result(data, f"{csv_prefix}_general_risk_ident_2.csv")

                except Exception as e:
                    print(f"General Risk Identification NOT opened {workbook_path}. Error: {e}")

            # The two sheets below were indented under the "General Risk Identification"
            # check and therefore skipped for workbooks without that sheet; they are
            # independent of it and are now handled at the same level as the others.
            if "Risk Journey Log" in sheet_names:
                try:
                    df = xls.parse("Risk Journey Log")

                    # Module list: rows from 'Customer Suitability' up to (excluding) the row
                    # before 'Other PEARL list fields'. Both markers must be present.
                    row_index = _first_matching_row(df, 1, 'Customer Suitability')
                    row_index2 = _first_matching_row(df, 0, 'Other PEARL list fields')

                    if row_index is not None and row_index2 is not None:
                        module_rows = slice(row_index, row_index2 - 1)
                        data = pd.DataFrame({
                            'Module Title': df.iloc[module_rows, 1].tolist(),
                            'Applicable': df.iloc[module_rows, 3].tolist()
                        })
                        # keep only rows whose 'Applicable' cell holds a boolean value
                        cond = (data['Applicable'] == True) | (data['Applicable'] == False)
                        data = data[cond].copy()
                        data['File'] = f.lower()
                        data['Folder'] = folder
                        data['File date'] = modification_time
                        _save_result(data, f"{csv_prefix}_module_selected.csv")

                    # Assessment stage: rows from the top of the sheet up to two rows above
                    # 'Module Info Log'. The old check called `.empty` on the integer start
                    # row, which always raised, so this CSV was never written.
                    row_index2 = _first_matching_row(df, 0, 'Module Info Log')

                    if row_index2 is not None:
                        stage_rows = slice(0, row_index2 - 2)
                        data = pd.DataFrame({
                            'Action': df.iloc[stage_rows, 2].tolist(),
                            'value': df.iloc[stage_rows, 3].tolist()
                        })
                        data['File'] = f.lower()
                        data['Folder'] = folder
                        data['File date'] = modification_time
                        _save_result(data, f"{csv_prefix}_assesment_stage.csv")

                except Exception as e:
                    print(f"Risk Journey Log NOT opened in {workbook_path}. Error: {e}")

            if "Param_macro" in sheet_names:
                try:
                    # the whole sheet is exported as-is, plus the provenance columns
                    data = xls.parse("Param_macro")
                    data['File'] = f.lower()
                    data['Folder'] = folder
                    data['File date'] = modification_time
                    _save_result(data, f"{csv_prefix}_journey_summary.csv")

                except Exception as e:
                    print(f"Param_macro NOT opened in {workbook_path}. Error: {e}")

