In [None]:
import os
import pandas as pd
#import re
import numpy as np
from datetime import datetime
from openpyxl import load_workbook
import warnings
import pathlib 
from pathlib import Path 
import threading
from tqdm import tqdm

# Suppress UserWarning messages originating from the openpyxl module.
warnings.filterwarnings("ignore", category=UserWarning, module='openpyxl')

# ---------------------------------------------------------------------------
# MANUAL input to change
# ---------------------------------------------------------------------------
your_ing_id = "XO21BM"
date = datetime.strptime('2024', "%Y")
update_final_overview = True

# ---------------------------------------------------------------------------
# DO NOT CHANGE ANYTHING BELOW - only when developing code
# ---------------------------------------------------------------------------
# The root of the synced library depends on the operating system:
# a plain string path on Windows, a pathlib.Path otherwise.
is_windows = (os.name == 'nt')
base_dir = (
    os.path.join("C:", "\\Users", your_ing_id, "ING")
    if is_windows
    else pathlib.Path(f"/Users/{your_ing_id}/Library/CloudStorage/OneDrive-SharedLibraries-ING")
)

# directory_path0 is the current working directory; the intermediate results
# live below it, the two PEARL folders live below base_dir.
directory_path0 = os.getcwd()
directory_path3 = os.path.join(directory_path0, "Intermediate results")
directory_path = os.path.join(base_dir, "Product Evaluation and Risk Assessment Library (PEARL) - PEARL_Repository")
directory_path2 = os.path.join(base_dir, "Product Evaluation and Risk Assessment Library (PEARL) - MI Dashboard")

# Overview table; the file name is resolved relative to the working directory.
final_overview = pd.read_csv('final_overview.csv', sep=';')

# Report the resolved configuration, one item per line.
print(
    "\n".join(
        [
            f"Operating System: {'Windows' if is_windows else 'Mac/Linux'}",
            f"PEARL Repository Path: {directory_path}",
            f"MI Dashboard Path: {directory_path2}",
            f"Intermediate Results Path: {directory_path3}",
        ]
    )
)

In [None]:
# Load the intermediate CSV files and collect them per report type.
# Keys are the substrings looked for in a file name, listed in priority order
# (the first substring found in the name decides where the file goes); values
# collect the frames read for that report type. Frames are concatenated once
# per type after the loop instead of growing a DataFrame inside the loop.
frames_by_pattern = {
    'process_module_selection': [],   # -> dt1
    'risk_summary_approval': [],      # -> dt2
    'general_risk_ident_1': [],       # -> dt3
    'general_risk_ident_2': [],       # -> dt4
    'module_selected': [],            # -> dt5
    'assesment_stage': [],            # -> dt6
    'journey_summary': [],            # -> dt7
}

# Check if directory is empty
all_files = os.listdir(directory_path3)
if not all_files:
    raise ValueError("Error: directory_path3 is empty!")

# Process files
for f in tqdm(all_files):
    file_path = os.path.join(directory_path3, f)

    # Ensure it's a file, not a directory
    if not os.path.isfile(file_path):
        continue

    # Files whose name matches none of the report types are not used anywhere,
    # so they are skipped without being read.
    pattern = next((p for p in frames_by_pattern if p in f), None)
    if pattern is None:
        continue

    try:
        frames_by_pattern[pattern].append(
            pd.read_csv(file_path, sep=';', encoding='utf-8')
        )
    except Exception as e:
        # Best effort: report the unreadable file and carry on with the rest.
        print(f"Error processing file {f}: {e}")

# One concat per report type; a type without any file stays an empty DataFrame.
dt1, dt2, dt3, dt4, dt5, dt6, dt7 = [
    pd.concat(frames) if frames else pd.DataFrame()
    for frames in frames_by_pattern.values()
]

# Create backup copies before modification
dataframes = [dt1, dt2, dt3, dt4, dt5, dt6, dt7]
dataframes_backup = [df.copy() for df in dataframes]

# Normalize folder paths (Mac-safe). Only string values are passed through
# Path(); missing values are left as they are because Path(NaN) raises TypeError.
for frame in dataframes:
    if 'Folder' in frame.columns:
        frame['Folder'] = frame['Folder'].apply(
            lambda x: str(Path(x)) if isinstance(x, str) else x
        )

In [None]:
# Left-join the final_overview columns onto each of the first five frames,
# matching on "Folder"; dt6 and dt7 are not merged here.
dt1, dt2, dt3, dt4, dt5 = (
    frame.merge(final_overview, on="Folder", how="left")
    for frame in (dt1, dt2, dt3, dt4, dt5)
)

In [None]:
### data manipulation for each individual df process_module_selection

# A row without a 'File' value is flagged as missing data.
dt1["Missing data"] = dt1['File'].isna()

# A row counts as correct only when both check columns hold their expected label.
dt1["Data correct"] = (dt1['ORM_check'] == 'ORM') & (dt1['IRM_check'] == 'IRM')

# Encode the answers as integers: 'Yes' -> 1, anything else (including missing) -> 0.
# The value 1 is accepted as well so that re-running this cell on already
# encoded columns keeps the flags instead of resetting every row to 0.
for flag_col in ('IRM', 'ORM'):
    dt1[flag_col] = dt1[flag_col].isin(['Yes', 1]).astype('int64')

In [None]:
# IRM & ORM dt2 risk summary approval

# A row without a 'File' value is flagged as missing data.
dt2['Missing data'] = dt2['File'].isna()

# A row counts as correct only when both check columns hold their expected label.
dt2['Data correct'] = (dt2['ORM_check'] == 'ORM') & (dt2['IRM_check'] == 'IRM')

# Encode the answers as integers: the expected text -> 1, anything else
# (including missing) -> 0. The value 1 is accepted as well so that re-running
# this cell on already encoded columns keeps the flags instead of zeroing them.
dt2['ORM_invited_or_challange'] = (
    dt2['ORM_invited_or_challange']
    .isin(['To be invited for challenge', 1])
    .astype('int64')
)
dt2['IRM_invited_or_challange'] = (
    dt2['IRM_invited_or_challange']
    .isin(['To be informed', 1])
    .astype('int64')
)

In [None]:
# general_risk_ident_1

# A row without a 'File' value is flagged as missing data.
dt3['Missing data'] = dt3['File'].isna()

# A row counts as correct only when both check columns hold their expected label.
dt3['Data correct'] = (dt3['ORM_check'] == 'ORM') & (dt3['IRM_check'] == 'IRM')

# Encode the answers as integers: 'Yes' -> 1, anything else (including missing) -> 0.
# The value 1 is accepted as well so that re-running this cell on already
# encoded columns keeps the flags instead of resetting every row to 0.
for flag_col in (
    'ORM_invited',
    'IRM_invited',
    'ORM_part_of_risk_asses',
    'IRM_part_of_risk_asses',
):
    dt3[flag_col] = dt3[flag_col].isin(['Yes', 1]).astype('int64')

In [None]:
# general_risk_ident_2

# A row without a 'File' value is flagged as missing data.
dt4['Missing data'] = dt4['File'].isna()

# A row counts as correct only when both check columns hold their expected label.
dt4['Data correct'] = (dt4['ORM_check'] == 'ORM') & (dt4['IRM_check'] == 'IRM')

# Encode the answers as integers: 'Not applicable' -> 0, anything else -> 1.
# The value 0 is treated like 'Not applicable' so that re-running this cell on
# already encoded columns keeps the flags instead of turning every row into 1.
# NOTE(review): a missing value is not 'Not applicable' and therefore becomes 1,
# exactly as before this rewrite - confirm that this is the intended counting.
for flag_col in (
    'ORM_person',
    'IRM_person',
    'ORM_opinion',
    'IRM_opinion',
    'ORM_challenge',
    'IRM_challenge',
):
    dt4[flag_col] = (~dt4[flag_col].isin(['Not applicable', 0])).astype('int64')

In [None]:
# Replace missing End Date values in dt1 with today's date (as a midnight Timestamp).
# NOTE(review): in this notebook 'End Date' is only passed through pd.to_datetime
# in a later cell, so the Timestamp is inserted among the still-unconverted values -
# confirm the later conversion handles that mix as intended.
dt1['End Date'] = dt1['End Date'].fillna(pd.Timestamp(datetime.today().date()))

In [None]:
### first data analysis dt1 used for the main reporting in performance wall
### creating categories to report days open 

# The run date, taken once so every derived column uses the same day.
# today_ts is the same day as a midnight Timestamp, which is what a datetime64
# column can be filled with and compared against.
today = datetime.today().date()
today_ts = pd.Timestamp(today)

# dt1
dt1['Start Date'] = pd.to_datetime(dt1['Start Date'])
dt1["End Date"] = pd.to_datetime(dt1["End Date"])
dt1["Year"] = dt1["End Date"].dt.year

# Quarter as text (e.g. "2024Q1"); rows without an End Date get "Unknown".
# The label is applied after the string conversion because a Period column
# cannot hold an arbitrary string.
dt1["Quarter"] = (
    dt1["End Date"].dt.to_period("Q").astype(str)
    .where(dt1["End Date"].notna(), "Unknown")
)

dt1["YTD"] = today

# Items without an end date are measured up to today.
dt1["End Date"] = dt1["End Date"].fillna(today_ts)

# Flag the rows whose End Date equals today's fill value. The comparison uses a
# Timestamp: comparing the datetime64 column with datetime.date objects (the
# "YTD" column) is never equal in pandas >= 2.0, which left this flag False on
# every row.
dt1["No_End_date"] = dt1["End Date"] == today_ts

# calculate duration (whole days between start and end)
dt1["Duration"] = (dt1["End Date"] - dt1["Start Date"]).dt.days

# create categories
def categorize_days(x):
    """Map a duration in days to its reporting bucket label.

    Parameters
    ----------
    x : number
        Duration in days. May be missing (NaN / None).

    Returns
    -------
    str
        One of "<30 days", "30-60 days", "60-90 days", "90-120 days",
        "120-180 days" or ">180 days". Each bucket includes its lower bound
        and excludes its upper bound. "Unknown" is returned for a missing
        duration.
    """
    if pd.isna(x):
        # Every comparison with NaN is False, so without this guard a missing
        # duration would fall through to the ">180 days" bucket.
        return "Unknown"

    # (exclusive upper bound, label) pairs in ascending order; first match wins.
    for upper_bound, label in (
        (30, "<30 days"),
        (60, "30-60 days"),
        (90, "60-90 days"),
        (120, "90-120 days"),
        (180, "120-180 days"),
    ):
        if x < upper_bound:
            return label
    return ">180 days"

# Bucket every duration into its reporting category.
dt1['Category'] = dt1['Duration'].map(categorize_days)

# just to check categorisation
#count_table = dt1['Category'].value_counts().reset_index()
#count_table.columns = ['Unique Value', 'Count']
#print(count_table)

# Final filter for dt1: keep only the two non-PAP process categories.
filtered_data = dt1.loc[
    dt1["Process Category"].isin(
        ["Risk assessment (non-PAP)", "Other change (non-PAP)"]
    )
]

# filtered_data is also the input for the continued analysis on dt1

In [None]:
### PAP output: same aggregation as the overall dt1 output, but restricted to the PAP process categories
# NOTE(review): the second label ends with a trailing space - presumably it has
# to match the source data exactly; verify before touching it.
filtered_data_PAP = dt1.loc[
    dt1["Process Category"].isin(
        [
            "New financial product/-service/channel (PAP)",
            "Significant change financial product/-service/channel (PAP) ",
            "Termination of financial product/-service/channel (PAP)",
        ]
    )
]
#filtered_data_PAP

# Per group: sum of the ORM and IRM flags and the number of Folder entries.
dt1_grouped_PAP = (
    filtered_data_PAP
    .groupby(['Missing data', 'Data correct', 'Type', 'Quarter'])
    .agg(
        ORM=('ORM', 'sum'),
        IRM=('IRM', 'sum'),
        Folder=('Folder', 'count'),
    )
)
#dt1_grouped_PAP

In [None]:
### pivot for overall dt1 output
# Per group: sum of the ORM and IRM flags and the number of Folder entries.
dt1_grouped = (
    filtered_data
    .groupby(['Missing data', 'Data correct', 'Type', 'Quarter'])
    .agg(
        ORM=('ORM', 'sum'),
        IRM=('IRM', 'sum'),
        Folder=('Folder', 'count'),
    )
)
#dt1_grouped

In [None]:
# Percentage view of dt1_grouped: the ORM and IRM sums of each group expressed
# as a share (in %, two decimals) of that group's Folder count.
cols_to_percent = ["ORM", "IRM"]
dt1_percent = dt1_grouped.copy()
dt1_percent[cols_to_percent] = (
    dt1_percent[cols_to_percent]
    .div(dt1_percent["Folder"], axis=0)
    .mul(100)
    .round(2)
)
#dt1_percent

In [None]:
### first data analysis for dt2 to report # invited_challenges

# dt2
dt2["End Date"] = pd.to_datetime(dt2["End Date"])  # Convert to datetime
dt2["Year"] = dt2["End Date"].dt.year

# Quarter as text (e.g. "2024Q1"); rows without an End Date get "Unknown".
# The label is applied after the string conversion because a Period column
# cannot hold an arbitrary string.
dt2["Quarter"] = (
    dt2["End Date"].dt.to_period("Q").astype(str)
    .where(dt2["End Date"].notna(), "Unknown")
)

# for dt2: left out the filter on process category as it is empty in almost all cases
# Per group: sum of the two encoded flags and the number of Folder entries.
dt2_grouped = dt2.groupby(['Missing data',
                           'Data correct',
                           'Quarter']).agg({'ORM_invited_or_challange': 'sum',
                                            'IRM_invited_or_challange': 'sum',
                                            'Folder': 'count'})
#dt2_grouped

In [None]:
# Percentage view of dt2_grouped: both flag sums of each group expressed as a
# share (in %, two decimals) of that group's Folder count.
cols_to_percent = ["ORM_invited_or_challange", "IRM_invited_or_challange"]
dt2_percent = dt2_grouped.copy()
dt2_percent[cols_to_percent] = (
    dt2_percent[cols_to_percent]
    .div(dt2_percent["Folder"], axis=0)
    .mul(100)
    .round(2)
)
#dt2_percent

In [None]:
### first data analysis for dt3

# dt3
dt3["End Date"] = pd.to_datetime(dt3["End Date"])  # Convert to datetime
dt3["Year"] = dt3["End Date"].dt.year

# Quarter as text (e.g. "2024Q1"); rows without an End Date get "Unknown".
# The label is applied after the string conversion because a Period column
# cannot hold an arbitrary string.
dt3["Quarter"] = (
    dt3["End Date"].dt.to_period("Q").astype(str)
    .where(dt3["End Date"].notna(), "Unknown")
)

# Non-PAP subset of dt3. It gets its own name so that `filtered_data`, which
# the dt1 analysis cells read, is not rebound to a dt3 subset.
dt3_non_pap = dt3[dt3["Process Category"].isin(["Risk assessment (non-PAP)", "Other change (non-PAP)"])]

# Per group: sum of the four encoded flags and the number of Folder entries.
dt3_grouped = dt3_non_pap.groupby(['Missing data',
                                   'Data correct',
                                   'Quarter']).agg({'ORM_invited': 'sum',
                                                    'IRM_invited': 'sum',
                                                    'ORM_part_of_risk_asses': 'sum',
                                                    'IRM_part_of_risk_asses': 'sum',
                                                    'Folder': 'count'})
#dt3_grouped

In [None]:
# Percentage view of dt3_grouped: the four flag sums of each group expressed as
# a share (in %, two decimals) of that group's Folder count.
cols_to_percent = ["ORM_invited", "IRM_invited", "ORM_part_of_risk_asses", "IRM_part_of_risk_asses"]
dt3_percent = dt3_grouped.copy()
dt3_percent[cols_to_percent] = (
    dt3_percent[cols_to_percent]
    .div(dt3_percent["Folder"], axis=0)
    .mul(100)
    .round(2)
)
#dt3_percent

In [None]:
### first data analysis dt4

# dt4
dt4["End Date"] = pd.to_datetime(dt4["End Date"])  # Convert to datetime
dt4["Year"] = dt4["End Date"].dt.year

# Quarter as text (e.g. "2024Q1"); rows without an End Date get "Unknown".
# The label is applied after the string conversion because a Period column
# cannot hold an arbitrary string.
dt4["Quarter"] = (
    dt4["End Date"].dt.to_period("Q").astype(str)
    .where(dt4["End Date"].notna(), "Unknown")
)

# Non-PAP subset of dt4. It gets its own name so that `filtered_data`, which
# the dt1 analysis cells read, is not rebound to a dt4 subset.
dt4_non_pap = dt4[dt4["Process Category"].isin(["Risk assessment (non-PAP)", "Other change (non-PAP)"])]

# Per group: sum of the six encoded flags and the number of Folder entries.
dt4_grouped = dt4_non_pap.groupby(['Missing data',
                                   'Data correct',
                                   'Quarter']).agg({'ORM_person': 'sum',
                                                    'IRM_person': 'sum',
                                                    'ORM_opinion': 'sum',
                                                    'IRM_opinion': 'sum',
                                                    'ORM_challenge': 'sum',
                                                    'IRM_challenge': 'sum',
                                                    'Folder': 'count'})

#dt4_grouped

In [None]:
# Percentage view of dt4_grouped: the six flag sums of each group expressed as
# a share (in %, two decimals) of that group's Folder count.
cols_to_percent = ["ORM_person", "IRM_person", "ORM_opinion", "IRM_opinion", "ORM_challenge", "IRM_challenge"]
dt4_percent = dt4_grouped.copy()
dt4_percent[cols_to_percent] = (
    dt4_percent[cols_to_percent]
    .div(dt4_percent["Folder"], axis=0)
    .mul(100)
    .round(2)
)
#dt4_percent

In [None]:
### module_selected

# Unparseable End Date values become NaT (errors="coerce").
dt5["End Date"] = pd.to_datetime(dt5["End Date"], errors="coerce")
dt5["Year"] = dt5["End Date"].dt.year

# Quarter as text (e.g. "2024Q1"); rows without a usable End Date get "Unknown".
# The label is applied after the string conversion because a Period column
# cannot hold an arbitrary string.
dt5["Quarter"] = (
    dt5["End Date"].dt.to_period("Q").astype(str)
    .where(dt5["End Date"].notna(), "Unknown")
)

# add column with sum # modules selected:
# one row per Folder / File / Quarter holding the sum of "Applicable".
df_5_module_selected = (
    dt5.groupby(["Folder", "File", "Quarter"])
    .agg({"Applicable": lambda x: x.sum()})
    .reset_index()
    .rename(columns={"Applicable": "Applicable_modules_selected"})
)

### dt7 -> needs to do the same with this one

In [None]:
### first data analysis dt5 with # modules selected
# One row per Quarter, one column per distinct number of selected modules;
# each cell counts the File entries, combinations that do not occur are 0.
# The index is reset so that Quarter is a regular column.
df_wide = (
    df_5_module_selected
    .pivot_table(
        index="Quarter",
        columns="Applicable_modules_selected",
        values="File",
        aggfunc="count",
    )
    .fillna(0)
    .reset_index()
)
#df_wide

In [None]:
# Row-wise percentages of df_wide: every module-count column is divided by the
# row total over all module-count columns (Quarter itself is excluded).
df_percent = df_wide.copy()
module_cols = df_percent.columns.difference(["Quarter"])
df_percent[module_cols] = (
    df_percent[module_cols]
    .div(df_percent[module_cols].sum(axis=1), axis=0)
    .mul(100)
    .round(2)
)
#df_percent

In [None]:
# save the manipulated versions: each frame goes to its own CSV file in the
# working directory, without the index column.
for csv_name, frame_to_save in (
    ("dt1.csv", dt1),
    ("dt2.csv", dt2),
    ("dt3.csv", dt3),
    ("dt4.csv", dt4),
    ("df_5_module_selected.csv", df_5_module_selected),
):
    frame_to_save.to_csv(csv_name, index=False)

In [None]:
# table with duration of RA's per category (Richard's request)

# Build the non-PAP subset directly from dt1, so this cell does not depend on
# which earlier cell last assigned `filtered_data`.
filtered_data = dt1.loc[
    dt1["Process Category"].isin(["Risk assessment (non-PAP)", "Other change (non-PAP)"])
]

# Number of Folder entries per duration category and quarter.
dt1_grouped1 = (
    filtered_data
    .groupby(['Missing data', 'Data correct', 'Type', 'Category', 'Quarter'])
    .agg({'Folder': 'count'})
    .reset_index()
)

# Sort by the natural order of the duration buckets instead of alphabetically.
# "Unknown" is listed last so that a row carrying that label would sort after
# the real buckets instead of being turned into NaN.
category_order = ["<30 days", "30-60 days", "60-90 days", "90-120 days", "120-180 days", ">180 days", "Unknown"]
dt1_grouped1['Category'] = pd.Categorical(dt1_grouped1['Category'], categories=category_order, ordered=True)
dt1_grouped1 = dt1_grouped1.sort_values(by='Category')
#dt1_grouped1

In [None]:
## pivot duration categories * quarters
# Quarters become columns; combinations without data are shown as 0.
dt1_pivoted = pd.pivot_table(
    dt1_grouped1,
    values=['Category', 'Folder'],
    index=['Missing data', 'Data correct', 'Type', 'Category'],
    columns='Quarter',
    fill_value=0,
)
#dt1_pivoted

In [None]:
# now the same for the ones still open (Richard's request)

# Select the open non-PAP items directly from dt1, so this cell does not depend
# on which earlier cell last assigned `filtered_data`.
is_non_pap = dt1["Process Category"].isin(["Risk assessment (non-PAP)", "Other change (non-PAP)"])
filtered_dt1 = dt1.loc[is_non_pap & (dt1["No_End_date"] == True)]

# Number of Folder entries per duration category and quarter.
dt1_grouped2 = (
    filtered_dt1
    .groupby(['Missing data', 'Data correct', 'Type', 'No_End_date', 'Category', 'Quarter'])
    .agg({'Folder': 'count'})
    .reset_index()
)

# Sort by the natural order of the duration buckets instead of alphabetically.
# "Unknown" is listed last so that a row carrying that label would sort after
# the real buckets instead of being turned into NaN.
category_order = ["<30 days", "30-60 days", "60-90 days", "90-120 days", "120-180 days", ">180 days", "Unknown"]
dt1_grouped2['Category'] = pd.Categorical(dt1_grouped2['Category'], categories=category_order, ordered=True)
dt1_grouped2 = dt1_grouped2.sort_values(by='Category')

# Quarters become columns; combinations without data are shown as 0.
dt1_pivoted2 = dt1_grouped2.pivot_table(
    index=['Missing data', 'Data correct', 'Type', 'Category'],
    columns='Quarter',
    values=['Category', 'Folder'],
    fill_value=0
)
#dt1_pivoted2

In [None]:
# Export filtered_dt1 (rows selected on No_End_date) without the index column.
# Note that the output file is named after filtered_data, not filtered_dt1.
filtered_dt1.to_csv(path_or_buf="filtered_data.csv", index=False)

In [None]:
## analysis of the average duration in days per tribe, to see whether some tribes take longer.
## Results do not show very clear differences.

# Non-PAP subset of dt1.
filtered_data = dt1.loc[
    dt1["Process Category"].isin(
        ["Risk assessment (non-PAP)", "Other change (non-PAP)"]
    )
]

# Per tribe and quarter: mean Duration and number of Folder entries.
dt1_grouped2 = (
    filtered_data
    .groupby(['Missing data', 'Data correct', 'Type', 'Tribe', 'Quarter'])
    .agg(
        Duration=('Duration', 'mean'),
        Folder=('Folder', 'count'),
    )
)
#dt1_grouped2

In [None]:
## analysis duration in days average per tribe pivot
# Quarters become columns holding the Duration values; missing combinations are 0.
dt1_pivoted2 = pd.pivot_table(
    dt1_grouped2,
    values=['Duration'],
    index=['Missing data', 'Data correct', 'Type', 'Tribe'],
    columns=['Quarter'],
    fill_value=0,
)
#dt1_pivoted2

In [None]:
## to interpret the mean durations: get the totals per tribe

# Non-PAP subset of dt1.
filtered_data = dt1.loc[
    dt1["Process Category"].isin(
        ["Risk assessment (non-PAP)", "Other change (non-PAP)"]
    )
]

# Number of Folder entries per tribe and quarter.
dt1_grouped3 = (
    filtered_data
    .groupby(['Missing data', 'Data correct', 'Type', 'Tribe', 'Quarter'])
    .agg(Folder=('Folder', 'count'))
)
#dt1_grouped3

In [None]:
## to interpret the mean durations: totals per tribe, quarters as columns
# Missing tribe/quarter combinations are shown as 0.
dt1_pivoted3 = pd.pivot_table(
    dt1_grouped3,
    values=['Folder'],
    index=['Missing data', 'Data correct', 'Type', 'Tribe'],
    columns=['Quarter'],
    fill_value=0,
)
#dt1_pivoted3

In [None]:
## trying to connect dt3 & dt4 to have the information from sheet risk_identification in one file

# Inner join on Folder and File. Columns present in both frames keep the dt3
# version; the dt4 copies get the '_dup' suffix and are dropped again.
merged_df = (
    dt3.merge(dt4, on=['Folder', 'File'], how='inner', suffixes=('', '_dup'))
    .pipe(lambda frame: frame.loc[:, ~frame.columns.str.endswith('_dup')])
)

# Non-PAP rows only.
merged_df2 = merged_df.loc[
    merged_df["Process Category"].isin(
        ["Risk assessment (non-PAP)", "Other change (non-PAP)"]
    )
]

# Number of Folder entries per combination of the flags below
# ('Type' and 'Quarter' are deliberately left out of the grouping).
merged_df_pivot = (
    merged_df2
    .groupby([
        'Missing data',
        'Data correct',
        'ORM_invited',
        'IRM_invited',
        'ORM_part_of_risk_asses',
        'IRM_part_of_risk_asses',
        'ORM_person',
        'IRM_person',
    ])
    .agg(Folder=('Folder', 'count'))
)

merged_df_pivot

### does not really add up to something meaningful.