In [1]:
import os
import pandas as pd
import re
import numpy as np
from datetime import datetime
from openpyxl import load_workbook
import warnings


# Silence the UserWarnings that openpyxl emits while workbooks are being read.
warnings.filterwarnings("ignore", category=UserWarning, module='openpyxl')

import os
import pathlib

# os.name is 'nt' on Windows; anything else is treated as Mac/Linux below.
is_windows = os.name == 'nt'

# User id that appears in the OneDrive sync location of the shared libraries.
your_ing_id = "XO21BM"
# NOTE(review): `date` is rebound to a 'YYYY-MM-DD' string in the cell that
# builds `final_overview`; this datetime value is not used before that.
date = datetime.strptime('2024', "%Y")

if is_windows:
    base_dir = pathlib.Path(f"C:/Users/{your_ing_id}/OneDrive - SharedLibraries - ING")
else:
    base_dir = pathlib.Path(f"/Users/{your_ing_id}/Library/CloudStorage/OneDrive-SharedLibraries-ING")

# Define paths dynamically
directory_path0 = os.getcwd()
directory_path = base_dir / "Product Evaluation and Risk Assessment Library (PEARL) - PEARL_Repository"
directory_path2 = base_dir / "Product Evaluation and Risk Assessment Library (PEARL) - MI Dashboard"
# Kept as a plain string (with trailing slash) so existing string-based uses keep working.
directory_path3 = directory_path0 + '/Intermediate results/'

# The per-folder result CSVs are written into this folder later on. Create it
# up front so those writes do not fail on a fresh checkout.
os.makedirs(directory_path3, exist_ok=True)

# Print paths
print(f"Operating System: {'Windows' if is_windows else 'Mac/Linux'}")
print(f"PEARL Repository Path: {directory_path}")
print(f"MI Dashboard Path: {directory_path2}")
print(f"Intermediate Results Path: {directory_path3}")


Operating System: Mac/Linux
PEARL Repository Path: /Users/XO21BM/Library/CloudStorage/OneDrive-SharedLibraries-ING/Product Evaluation and Risk Assessment Library (PEARL) - PEARL_Repository
MI Dashboard Path: /Users/XO21BM/Library/CloudStorage/OneDrive-SharedLibraries-ING/Product Evaluation and Risk Assessment Library (PEARL) - MI Dashboard
Intermediate Results Path: /Users/xo21bm/Documents/NFR/Intermediate results/


In [2]:
# Folder and ID mapping: one row per PEARL ID with its repository-relative folder.
overview = pd.read_csv('PEARL List (1).csv', sep=',', low_memory=False)

# '#$@' is replaced by a space. regex=False is explicit because the default of
# `regex` differs between pandas versions, and as a regular expression this
# pattern ('#', end-of-string, '@') can never match.
overview['Folder_URL_txt'] = overview['Folder_URL_txt'].str.replace('#$@', ' ', regex=False)

# Strip the SharePoint prefix. Both spellings of the repository folder
# ('Pearl_Repository' and 'PEARL_Repository') are removed, as literal text.
overview['Folder'] = overview['Folder_URL_txt'].str.replace(
    'https://ing.sharepoint.com/sites/PEARL_cs/Shared Documents/Pearl_Repository/', '', regex=False)
overview['Folder'] = overview['Folder'].str.replace(
    'https://ing.sharepoint.com/sites/PEARL_cs/Shared Documents/PEARL_Repository/', '', regex=False)

cols_overview = ['Folder', 'ID']
# .copy() so the ID assignment below writes to an independent frame instead of
# a slice of the full CSV frame (avoids SettingWithCopyWarning).
overview = overview[cols_overview].copy()
overview['ID'] = overview['ID'].astype(str)

In [3]:
# Show the Folder/ID mapping built in the previous cell.
overview

Unnamed: 0,Folder,ID
0,Sparen/Oranje Spaarrekening/2021-10-29 NIR com...,184
1,Current accounts/Betaalrekening/2021-11-02 LIT...,185
2,Hypotheken - WUB/Z_Other/2021-11-04 Vervallen ...,188
3,,189
4,Current accounts/Zakelijke rekening/2021-11-03...,190
...,...,...
2997,Payments_Giraal betvk/SEPA Direct Debit (credi...,3883
2998,Beleggingsrekeningen/Eenvoudig Beleggen/2025-0...,3884
2999,Verzekeren/New Product/2025-04-16 ING Income P...,3885
3000,KYC/Process change/2025-04-16 STP 2.0,3886


In [4]:
# Location of the MI dashboard workbook inside the MI Dashboard library folder.
# NOTE(review): the next cell repeats these two assignments.
file_name = 'MI Dashboard.xlsm'
file_path = os.path.join(directory_path2, file_name)

In [5]:
# Changes: load the "Management Info Changes" sheet of the MI dashboard.
file_name = 'MI Dashboard.xlsm'
file_path = os.path.join(directory_path2, file_name)

# The first two rows of the sheet are skipped before the header row is read.
mi_dash_changes = pd.read_excel(
    file_path,
    sheet_name="Management Info Changes",
    engine="openpyxl",
    skiprows=2,
).rename(columns={'PEARL ID': 'ID'})

# '(blank)' cells in the date columns are turned into NaN first, so the
# conversion to datetime below only sees real values or missing ones.
for date_column in ('End Date', 'Start Date'):
    is_blank = mi_dash_changes[date_column] == '(blank)'
    mi_dash_changes.loc[is_blank, date_column] = np.nan

for date_column in ('End Date', 'Start Date'):
    mi_dash_changes[date_column] = pd.to_datetime(mi_dash_changes[date_column])

# Tag the origin of these rows.
mi_dash_changes['Type'] = 'Change'

In [6]:
# Show the loaded "Management Info Changes" rows.
mi_dash_changes

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Module Count,Risk Indicator,Risk Score Category,Capacity \nIndicator ORM,ORM Score Category,Capacity \nIndicator IRM,IRM Score Category,count PEARL Id,Outlier Duration,Type
0,,,,,,,,,,,...,,,,,,,,Count of PEARL Id,,Change
1,,,,,,,,,,,...,1,4.0,Low,4.0,Low,0,,1,,Change
2,,,,,,,,,,,...,4,10.0,Medium,8.0,Medium,0,,1,,Change
3,,,,,,,,,,,...,7,15.0,Medium,11.0,High,13,High,1,,Change
4,,,,,,,,,,,...,1,2.0,Low,0.0,,0,,1,,Change
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2803,,,,,,,,,,,...,(blank),0.0,,0.0,,(blank),,1,,Change
2804,,,,,,,,,,,...,(blank),0.0,,0.0,,(blank),,1,,Change
2805,,,,,,,,,,,...,(blank),0.0,,0.0,,(blank),,1,,Change
2806,,,,,,,,,,,...,(blank),0.0,,0.0,,(blank),,1,,Change


In [7]:
# Reviews: load the "Management Info Reviews" sheet of the MI dashboard.
# The dashboard path is resolved here instead of reusing the global
# `file_path`: the folder-scanning loop further down rebinds `file_path` to
# individual risk-journey workbooks, so re-running this cell after that loop
# would otherwise read the wrong file.
mi_dashboard_path = os.path.join(directory_path2, file_name)

mi_dash_review = pd.read_excel(
    mi_dashboard_path,
    sheet_name="Management Info Reviews",
    engine="openpyxl",
    skiprows=2,
)
# Align the review column names with the ones used for changes.
mi_dash_review = mi_dash_review.rename(columns={'RJT Review PEARL ID': 'ID',
                                                'RJT Review Start Date': 'Start Date',
                                                'RJT Review Approval Date': 'End Date',
                                                'RJT Review Status': 'Status'})

# '(blank)' cells are turned into NaN before the date columns are converted.
mi_dash_review.loc[mi_dash_review['End Date'] == '(blank)', 'End Date'] = np.nan
mi_dash_review.loc[mi_dash_review['Start Date'] == '(blank)', 'Start Date'] = np.nan
mi_dash_review['End Date'] = pd.to_datetime(mi_dash_review['End Date'])
mi_dash_review['Start Date'] = pd.to_datetime(mi_dash_review['Start Date'])

# Tag the origin of these rows.
mi_dash_review['Type'] = 'Review'

In [8]:
#mi_risk_module = pd.read_excel(file_path, sheet_name="Risk Modules", engine="openpyxl", skiprows=2)

# Stack changes and reviews into one frame listing the risk assessments of interest.
final_overview = pd.concat([mi_dash_changes, mi_dash_review])

# Cut-off as a 'YYYY-MM-DD' string (first day of 2024).
date = datetime.strptime('2024', "%Y").strftime('%Y-%m-%d')

# Keep rows that have an ID and whose End Date is not before the cut-off.
# Rows without an End Date are kept: the "<" comparison is False for them.
cond = ~((final_overview['End Date'] < date) | final_overview['ID'].isna())
cols = ['ID', 'Start Date', 'End Date', 'Process Category', 'Tribe', 'Status', 'Type', 'Duration']
final_overview = final_overview.loc[cond, cols].merge(overview, how='left', on='ID')

final_overview['End Year'] = final_overview['End Date'].dt.year

# Missing process categories get the explicit label 'empty'.
cond = final_overview['Process Category'].isna()
final_overview.loc[cond, 'Process Category'] = 'empty'

# Number of IDs per process category, end year and type.
final_overview_grouped = (
    final_overview
    .groupby(['Process Category', 'End Year', 'Type'])['ID']
    .count()
    .reset_index()
)

# Write the overview to disk and load it back from the CSV.
final_overview.to_csv('final_overview.csv', sep=';', index=False)
final_overview = pd.read_csv('final_overview.csv', sep=';')

# prep for the loop
# result1 = pd.DataFrame()
# result2 = pd.DataFrame()
# result3 = pd.DataFrame()
# result4 = pd.DataFrame()

In [9]:
def find_matching_strings(strings):
    """Return the names that look like risk-journey macro workbooks.

    A name is kept when, ignoring case, it starts with 'risk', contains
    'journey' and ends with 'xlsm'. The input order is preserved.
    """
    def _is_risk_journey_workbook(name):
        lowered = name.lower()
        return (
            lowered.startswith('risk')
            and 'journey' in lowered
            and lowered.endswith('xlsm')
        )

    return [name for name in strings if _is_risk_journey_workbook(name)]

folders_to_check = final_overview.loc[:, 'Folder']

# for testing and checking a specific folder ########
item_to_find = r'Hypotheken - WUB/WUB Hypotheek met Beleggingsrekening/2024-04-15 Intermediate Review product'

# Position of the first folder equal to `item_to_find`; stays 0 when it is not
# present. (The previous loop never advanced its counter, so it always gave 0.)
temp_index = 0
for position, f in enumerate(folders_to_check):
    if f == item_to_find:
        temp_index = position
        break

folder = folders_to_check.iloc[temp_index] # for testing
# NOTE(review): this second assignment overrides the lookup result above.
folder = folders_to_check.iloc[1] # for testing



In [10]:
# Progress counter for the folder loop; also the first position that is kept below.
index = 0
# Drop entries without a folder.
folders_to_check = folders_to_check.dropna()
n = folders_to_check.shape[0]
# Keep positions index..n-1.
folders_to_check = folders_to_check.iloc[index:n]
# Second remaining folder.
folder = folders_to_check.iloc[1]


In [11]:
import threading
import os

# Function to list directories with a timeout mechanism
def list_dir_with_timeout(path, timeout=10):
    """List the entries of ``path``, giving up after ``timeout`` seconds.

    The listing runs in a worker thread so that a call to ``os.listdir`` that
    does not return cannot block the caller indefinitely.

    Args:
        path: Directory to list.
        timeout: Maximum number of seconds to wait for the listing.

    Returns:
        The list of entry names, or an empty list when the listing raised an
        error (which is printed) or did not finish within ``timeout``.
    """
    files = []

    def target():
        nonlocal files
        try:
            files = os.listdir(path)
        except Exception as e:
            print(f"Error accessing {path}: {e}")

    # daemon=True: a listing that never returns must not keep the interpreter
    # alive once the main thread is done.
    thread = threading.Thread(target=target, daemon=True)
    thread.start()
    thread.join(timeout)  # Wait for the thread to finish within the timeout

    if thread.is_alive():
        # The worker cannot be stopped from here; it is left running in the
        # background and its result is ignored.
        print(f"Timeout reached for {path}, terminating operation")
        return []

    return files


In [None]:
from pathlib import Path
import pandas as pd
import os
from datetime import datetime


def _entity_cells(sheet_df, anchor_label, orm_offset, irm_offset, value_col, log_label):
    """Read the ORM/IRM label and value cells located below an anchor row.

    The anchor is the first row whose second column equals ``anchor_label``.
    The candidate rows are printed, prefixed with ``log_label``.

    Returns:
        ``(orm, orm_value, irm, irm_value)`` as strings, or ``None`` when the
        anchor label does not occur in the sheet.
    """
    anchor_rows = sheet_df.index[sheet_df.iloc[:, 1] == anchor_label]
    print(f"{log_label}: {anchor_rows}")  # Debug output
    if anchor_rows.empty:
        return None

    anchor = anchor_rows.tolist()[0]
    orm_row = anchor + orm_offset
    irm_row = anchor + irm_offset
    return (
        str(sheet_df.iloc[orm_row, 1]), str(sheet_df.iloc[orm_row, value_col]),
        str(sheet_df.iloc[irm_row, 1]), str(sheet_df.iloc[irm_row, value_col]),
    )


# Walk every folder, open each risk-journey workbook found in it and write the
# ORM/IRM cells of two sheets to one-row CSV files in the intermediate folder.
# NOTE(review): result files are named after the folder counter only, so a
# folder with several matching workbooks overwrites its earlier result.
for folder in folders_to_check:
    folder = folder.replace("\\", "/")
    complete_folder_path = Path(directory_path) / folder  # Proper path handling
    index += 1
    print(f'Folder {index} {folder}')

    if not complete_folder_path.is_dir():
        continue

    files = list_dir_with_timeout(complete_folder_path, timeout=10)
    files = find_matching_strings(files)

    for f in files:  # Iterate through filtered files
        file_path = complete_folder_path / f  # Correct path joining

        # Validate file existence
        if not file_path.exists():
            print(f"File does not exist: {file_path}")
            continue
        # Compare the suffix case-insensitively, in line with the
        # case-insensitive name filter above ('.XLSM' is an Excel file too).
        if file_path.suffix.lower() not in [".xlsx", ".xlsm"]:
            print(f"Skipping non-Excel file: {file_path}")
            continue

        print(f"Processing file: {file_path}")  # Debug output

        try:
            modification_time = os.path.getmtime(file_path)
            modification_time = datetime.fromtimestamp(modification_time).strftime('%Y-%m-%d %H:%M:%S')
            xls = pd.ExcelFile(file_path, engine="openpyxl")
        except Exception as e:
            print(f'File not opened {file_path}. Error: {e}')
            continue

        # The workbook is opened once, shared by both sheet reads and closed
        # when the block is left.
        with xls:
            sheet_names = xls.sheet_names
            print(f"Sheet names found: {sheet_names}")  # Debug output

            if "Process & Module Selection" in sheet_names:
                try:
                    df = pd.read_excel(xls, sheet_name="Process & Module Selection")
                    cells = _entity_cells(df, 'Risk entity', 1, 4, 5, "Row index found")

                    if cells is not None:
                        orm, orm_value, irm, irm_value = cells
                        data = pd.DataFrame({
                            'File': [f.lower()], 'ORM_check': [orm], 'ORM': [orm_value],
                            'IRM_check': [irm], 'IRM': [irm_value], 'Folder': [folder],
                            'File date': [modification_time]
                        })

                        print("Data before saving:", data)  # Debug output
                        csv_path = Path(directory_path3) / f"{index}_result1.csv"
                        data.to_csv(csv_path, sep=";", index=False)
                        print(f"Results saved: {csv_path}")  # Confirmation

                except Exception as e:
                    print(f"Process & Module Selection NOT opened in {file_path}. Error: {e}")

            if "Risk Summary & Approval" in sheet_names:
                try:
                    df = pd.read_excel(xls, sheet_name="Risk Summary & Approval")
                    cells = _entity_cells(
                        df,
                        'Step 4 -  Select Risk Entities to inform or to invite for challenge and agreement',
                        3, 6, 4,
                        "Row index found for Risk Summary",
                    )

                    if cells is not None:
                        orm, orm_value, irm, irm_value = cells
                        data = pd.DataFrame({
                            'File': [f.lower()], 'ORM_check': [orm], 'ORM_invited_or_challange': [orm_value],
                            'IRM_check': [irm], 'IRM_invited_or_challange': [irm_value], 'Folder': [folder],
                            'File date': [modification_time]
                        })

                        print("Data before saving:", data)  # Debug output
                        csv_path = Path(directory_path3) / f"{index}_result3.csv"
                        data.to_csv(csv_path, sep=";", index=False)
                        print(f"Results saved: {csv_path}")  # Confirmation

                except Exception as e:
                    print(f"Risk Summary & Approval NOT opened in {file_path}. Error: {e}")

Row index found for Risk Summary: Index([], dtype='int64')
Folder 3598 Payments Cards Issuing/Betaalpas zakelijk (cards issuing)/2022-10-04 Soft block business (AppMINGZ)
Processing file: /Users/XO21BM/Library/CloudStorage/OneDrive-SharedLibraries-ING/Product Evaluation and Risk Assessment Library (PEARL) - PEARL_Repository/Payments Cards Issuing/Betaalpas zakelijk (cards issuing)/2022-10-04 Soft block business (AppMINGZ)/RiskJourneyTool Change.xlsm
Sheet names found: ['Instructions', 'Risk journey', 'Change', 'Process & Module Selection', 'ModuleProgression', 'Risk Journey Log', 'Risk Summary & Approval', 'Customer Suitability', 'Scenario Analysis', 'Pricing', 'Personal Data', 'Contract', 'Third-Party Data Processing', 'IT', 'Stakeholder Assessment', 'Inform Supervisory Authority', 'Finance', 'Model Risk', 'Communication to Customers', 'End User Computing', 'Int. Procedure or Policy Change', 'Deposit Guarantee Scheme', 'General Risk Identification', 'Finish & Registration', 'Cancel Jo

In [None]:
# Show the folder path that the loop above handled last.
complete_folder_path