In [None]:
!pip install pm4py

import pandas as pd
import pm4py
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

pd.set_option('display.max_columns', None)

Collecting pm4py
  Downloading pm4py-2.7.19.3-py3-none-any.whl.metadata (4.7 kB)
Downloading pm4py-2.7.19.3-py3-none-any.whl (2.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m26.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pm4py
Successfully installed pm4py-2.7.19.3




In [None]:
# 1. Load the raw event log
filepath = '/content/dataset_for_exam.csv'

# 2. Map the source column names onto the XES standard names
xes_column_names = {
    'stay_id': 'case:concept:name',   # case identifier
    'activity': 'concept:name',       # activity label
    'time': 'time:timestamp',         # event timestamp
}
df = pd.read_csv(filepath).rename(columns=xes_column_names)

# 3. Normalise dtypes: case ids as strings, timestamps as datetimes
df['case:concept:name'] = df['case:concept:name'].astype(str)
df['time:timestamp'] = pd.to_datetime(df['time:timestamp'])

# 4. Order the events by case and, within each case, by time
df = df.sort_values(by=['case:concept:name', 'time:timestamp'])

print("Dataset caricato e rinominato. Prime righe:")
df.head()

Dataset caricato e rinominato. Prime righe:


Unnamed: 0,case:concept:name,time:timestamp,concept:name,gender,race,arrival_transport,disposition,diagnosis_sequence,diagnosis_code,diagnosis_description,temperature,heartrate,resprate,o2sat,sbp,dbp,pain,acuity,chiefcomplaint,rhythm,drug_name,generic_drug_code,national_drug_code,reconciliation_nurse_id,drug_class_code,drug_class_classification,administering_nurse_id
0,30005370,2110-06-25 05:10:00,Enter the ED,F,WHITE,AMBULANCE,,,,,,,,,,,,,,,,,,,,,
1,30005370,2110-06-25 05:10:01,Triage in the ED,,,,,,,,98.0,90.0,15.0,95.0,130.0,58.0,4.0,3.0,SBO,,,,,,,,
2,30005370,2110-06-25 05:28:00,Medicine reconciliation,,,,,,,,,,,,,,,,,,ondansetron HCl [Zofran],16392.0,16590050000.0,1.0,457.0,Antiemetic - Selective Serotonin 5-HT3 Antagon...,
3,30005370,2110-06-25 05:28:00,Medicine reconciliation,,,,,,,,,,,,,,,,,,omeprazole [Prilosec],43137.0,16714070000.0,1.0,445.0,Gastric Acid Secretion Reducing Agents - Proto...,
4,30005370,2110-06-25 05:28:00,Medicine reconciliation,,,,,,,,,,,,,,,,,,acetaminophen [Acetaminophen Extra Strength],4490.0,10003010000.0,1.0,577.0,Analgesic or Antipyretic Non-Opioid,


In [None]:
# Sanity check: which activities open and close the cases, and how often?
case_id_column = "case:concept:name"
start_activities = pm4py.get_start_activities(df, case_id_key=case_id_column)
end_activities = pm4py.get_end_activities(df, case_id_key=case_id_column)

print("\nAttività iniziali:", start_activities)
print("Attività finali:", end_activities)

# Display the whole (truncated) frame as the cell output
df


Attività iniziali: {'Enter the ED': 1820}
Attività finali: {'Discharge from the ED': 1820}


Unnamed: 0,case:concept:name,time:timestamp,concept:name,gender,race,arrival_transport,disposition,diagnosis_sequence,diagnosis_code,diagnosis_description,temperature,heartrate,resprate,o2sat,sbp,dbp,pain,acuity,chiefcomplaint,rhythm,drug_name,generic_drug_code,national_drug_code,reconciliation_nurse_id,drug_class_code,drug_class_classification,administering_nurse_id
0,30005370,2110-06-25 05:10:00,Enter the ED,F,WHITE,AMBULANCE,,,,,,,,,,,,,,,,,,,,,
1,30005370,2110-06-25 05:10:01,Triage in the ED,,,,,,,,98.0,90.0,15.0,95.0,130.0,58.0,4,3.0,SBO,,,,,,,,
2,30005370,2110-06-25 05:28:00,Medicine reconciliation,,,,,,,,,,,,,,,,,,ondansetron HCl [Zofran],16392.0,1.659005e+10,1.0,457.0,Antiemetic - Selective Serotonin 5-HT3 Antagon...,
3,30005370,2110-06-25 05:28:00,Medicine reconciliation,,,,,,,,,,,,,,,,,,omeprazole [Prilosec],43137.0,1.671407e+10,1.0,445.0,Gastric Acid Secretion Reducing Agents - Proto...,
4,30005370,2110-06-25 05:28:00,Medicine reconciliation,,,,,,,,,,,,,,,,,,acetaminophen [Acetaminophen Extra Strength],4490.0,1.000301e+10,1.0,577.0,Analgesic or Antipyretic Non-Opioid,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25110,39999292,2110-08-15 00:48:00,Vital sign check,,,,,,,,97.9,117.0,18.0,99.0,162.0,84.0,0,,,,,,,,,,
25111,39999292,2110-08-15 00:48:01,Triage in the ED,,,,,,,,97.9,117.0,18.0,99.0,162.0,84.0,0,3.0,Cough,,,,,,,,
25112,39999292,2110-08-15 01:23:00,Medicine reconciliation,,,,,,,,,,,,,,,,,,Prozac,46213.0,1.054406e+10,1.0,530.0,Antidepressant - Selective Serotonin Reuptake ...,
25113,39999292,2110-08-15 02:57:00,Vital sign check,,,,,,,,98.0,87.0,18.0,99.0,136.0,82.0,0,,,,,,,,,,


In [None]:
# Row count before cleaning, used below to report how many rows were dropped
initial_len = len(df)

df = (
    df
    # 1. An event without timestamp, case id or activity cannot be placed in a trace
    .dropna(subset=['time:timestamp', 'case:concept:name', 'concept:name'])
    # 2. Collapse events that share case, timestamp and activity into a single
    #    row (the first one is kept). Rows that differ only in the remaining
    #    columns are dropped as well, so those attribute values are lost.
    .drop_duplicates(subset=['case:concept:name', 'time:timestamp', 'concept:name'], keep='first')
)

print(f"Righe rimosse (missing o duplicati): {initial_len - len(df)}")

# Re-check start and end activities on the cleaned log
case_id_column = "case:concept:name"
start_activities = pm4py.get_start_activities(df, case_id_key=case_id_column)
end_activities = pm4py.get_end_activities(df, case_id_key=case_id_column)

print("\nAttività iniziali:", start_activities)
print("Attività finali:", end_activities)

Righe rimosse (missing o duplicati): 8289

Attività iniziali: {'Enter the ED': 1820}
Attività finali: {'Discharge from the ED': 1820}


In [None]:
# Per-case time features; every duration is expressed in minutes.
timestamps_by_case = df.groupby('case:concept:name')['time:timestamp']

# Earliest timestamp of each case, broadcast to all of its events
case_start = timestamps_by_case.transform('min')

# Minutes between the case start and each event
minutes_since_start = (df['time:timestamp'] - case_start).dt.total_seconds() / 60

# Minutes between each event and the previous row of the same case
# (relies on the frame already being ordered by time); first event of a case -> 0
minutes_since_previous = (timestamps_by_case.diff().dt.total_seconds() / 60).fillna(0)

df['start_timestamp'] = case_start
df['Elapsed_Time_Mins'] = minutes_since_start
df['Processing_Time'] = minutes_since_previous

# Lead time of the case = largest elapsed time among its events
df['lead_time'] = df.groupby('case:concept:name')['Elapsed_Time_Mins'].transform('max')

df

Unnamed: 0,case:concept:name,time:timestamp,concept:name,gender,race,arrival_transport,disposition,diagnosis_sequence,diagnosis_code,diagnosis_description,temperature,heartrate,resprate,o2sat,sbp,dbp,pain,acuity,chiefcomplaint,rhythm,drug_name,generic_drug_code,national_drug_code,reconciliation_nurse_id,drug_class_code,drug_class_classification,administering_nurse_id,start_timestamp,Elapsed_Time_Mins,Processing_Time,lead_time
0,30005370,2110-06-25 05:10:00,Enter the ED,F,WHITE,AMBULANCE,,,,,,,,,,,,,,,,,,,,,,2110-06-25 05:10:00,0.000000,0.000000,272.0
1,30005370,2110-06-25 05:10:01,Triage in the ED,,,,,,,,98.0,90.0,15.0,95.0,130.0,58.0,4,3.0,SBO,,,,,,,,,2110-06-25 05:10:00,0.016667,0.016667,272.0
2,30005370,2110-06-25 05:28:00,Medicine reconciliation,,,,,,,,,,,,,,,,,,ondansetron HCl [Zofran],16392.0,1.659005e+10,1.0,457.0,Antiemetic - Selective Serotonin 5-HT3 Antagon...,,2110-06-25 05:10:00,18.000000,17.983333,272.0
11,30005370,2110-06-25 05:29:00,Medicine reconciliation,,,,,,,,,,,,,,,,,,Lovenox,27993.0,3.535600e+10,1.0,805.0,Low Molecular Weight Heparins,,2110-06-25 05:10:00,19.000000,1.000000,272.0
13,30005370,2110-06-25 05:31:00,Medicine dispensations,,,,,,,,,,,,,,,,,,Lidocaine Jelly 2% (Urojet),38861.0,,,,,1.0,2110-06-25 05:10:00,21.000000,2.000000,272.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25110,39999292,2110-08-15 00:48:00,Vital sign check,,,,,,,,97.9,117.0,18.0,99.0,162.0,84.0,0,,,,,,,,,,,2110-08-15 00:48:00,0.000000,0.000000,134.0
25111,39999292,2110-08-15 00:48:01,Triage in the ED,,,,,,,,97.9,117.0,18.0,99.0,162.0,84.0,0,3.0,Cough,,,,,,,,,2110-08-15 00:48:00,0.016667,0.016667,134.0
25112,39999292,2110-08-15 01:23:00,Medicine reconciliation,,,,,,,,,,,,,,,,,,Prozac,46213.0,1.054406e+10,1.0,530.0,Antidepressant - Selective Serotonin Reuptake ...,,2110-08-15 00:48:00,35.000000,34.983333,134.0
25113,39999292,2110-08-15 02:57:00,Vital sign check,,,,,,,,98.0,87.0,18.0,99.0,136.0,82.0,0,,,,,,,,,,,2110-08-15 00:48:00,129.000000,94.000000,134.0


In [None]:
# FILTER INCOMPLETE CASES
# A case is considered complete only when its chronologically last event is
# END_ACTIVITY; every other case is reported and removed from `df`.

# 1. Closing activity every complete case must end with
END_ACTIVITY = "Discharge from the ED"

# 2. Last activity of every case.
#    The sort is explicitly stable: events sharing the same timestamp keep the
#    order they already have in `df`, so the "last" event is well defined
#    (the default sort algorithm gives no such guarantee on ties).
grouped = df.sort_values("time:timestamp", kind="stable").groupby("case:concept:name")
last_activity_by_case = grouped["concept:name"].agg(lambda activities: activities.iloc[-1])

is_complete = last_activity_by_case == END_ACTIVITY

# Same shapes as before: a list of case ids and a list of {case_id, last_activity}
# dicts, both in sorted case-id order.
valid_cases = is_complete.index[is_complete].tolist()
violations = [
    {"case_id": case_id, "last_activity": last_activity}
    for case_id, last_activity in last_activity_by_case[~is_complete].items()
]

# 3. Summary statistics
num_violations = len(violations)
total_cases = len(df['case:concept:name'].unique())
# Guard against an empty log, which would otherwise raise ZeroDivisionError
perc_violations = (num_violations / total_cases) * 100 if total_cases else 0.0

print(f"Totale casi analizzati: {total_cases}")
print(f"Casi Incompleti (Non finiscono con '{END_ACTIVITY}'): {num_violations}")
print(f"Percentuale casi incompleti: {perc_violations:.2f}%")

if num_violations > 0:
    print(pd.DataFrame(violations).head(5))

# 4. APPLY THE FILTER
# Keep only the events of complete cases. The explicit copy makes `df_clean` an
# independent frame, so later column assignments do not act on a slice of `df`.
df_clean = df[df['case:concept:name'].isin(valid_cases)].copy()

print(f"\n--- RISULTATO FILTRO ---")
print(f"Casi rimasti nel dataset: {len(df_clean['case:concept:name'].unique())}")
df = df_clean


Totale casi analizzati: 1820
Casi Incompleti (Non finiscono con 'Discharge from the ED'): 0
Percentuale casi incompleti: 0.00%

--- RISULTATO FILTRO ---
Casi rimasti nel dataset: 1820


# scarico il dataset filtrato completo

In [None]:
# Persist the current log to CSV; the DataFrame index is not written
output_filename = '18_12_Filtered_Log_100%.csv'
df.to_csv(path_or_buf=output_filename, index=False)
print(f"Dataset pronto salvato come: '{output_filename}'")

Dataset pronto salvato come: '18_12_Filtered_Log_100%.csv'


In [None]:
# Variant-based noise filter: keep the most frequent variants whose cumulative
# share of cases stays within THRESHOLD percent, and drop the cases of the rest.
THRESHOLD = 99.5

try:
    # Used below as a mapping: variant -> number of cases following it
    variants_count = pm4py.get_variants(df, case_id_key="case:concept:name")

    # Rank the variants from most to least frequent.
    # The index is reset so that row labels equal ranks. Without the reset the
    # sorted frame keeps its pre-sort labels, and a label-based cut
    # (index.max() followed by .loc[:label]) lands at an arbitrary rank and
    # discards far more variants than the threshold asks for.
    variants_df = (
        pd.DataFrame(list(variants_count.items()), columns=['variant', 'count'])
        .sort_values(by='count', ascending=False, kind='stable')
        .reset_index(drop=True)
    )

    # Cumulative share of cases covered by the top-k variants
    variants_df['cumulative_sum'] = variants_df['count'].cumsum()
    variants_df['cumulative_perc'] = 100 * variants_df['cumulative_sum'] / variants_df['count'].sum()

    # cumulative_perc never decreases, so the rows within the threshold form a
    # prefix of the ranked frame; its length gives the cut position.
    n_within_threshold = int((variants_df['cumulative_perc'] <= THRESHOLD).sum())

    # Fallback kept from the original logic: when not even the most frequent
    # variant fits within the threshold, keep every variant (no filtering).
    if n_within_threshold > 0:
        threshold_index = n_within_threshold - 1
    else:
        threshold_index = len(variants_df) - 1

    # Variants to keep (positional slice, inclusive of threshold_index)
    top_variants = variants_df.iloc[:threshold_index + 1]['variant'].tolist()

    # Actual filtering of the event log
    filtered_df = pm4py.filter_variants(df, top_variants)

    if len(filtered_df) > 0 and len(filtered_df) < len(df):
        print(f"Filtro Rumore Applicato (Soglia {THRESHOLD}%).")
        print(f"Casi originali: {len(df['case:concept:name'].unique())}")
        print(f"Casi dopo filtro: {len(filtered_df['case:concept:name'].unique())}")
        df = filtered_df
    else:
        print(f"Il filtro al {THRESHOLD}% non ha rimosso nulla (tutte le varianti rientrano nella soglia).")

except Exception as e:
    # Best effort: on any failure the unfiltered log is kept and the error is reported
    print(f"Errore nel filtering varianti: {e}. Mantengo il dataset completo.")

final_cases = len(df['case:concept:name'].unique())
print(f"\n--- NUMERO CASI FINALI DA ANALIZZARE: {final_cases} ---")

df

Filtro Rumore Applicato (Soglia 99.5%).
Casi originali: 1820
Casi dopo filtro: 1428

--- NUMERO CASI FINALI DA ANALIZZARE: 1428 ---


Unnamed: 0,case:concept:name,time:timestamp,concept:name,gender,race,arrival_transport,disposition,diagnosis_sequence,diagnosis_code,diagnosis_description,temperature,heartrate,resprate,o2sat,sbp,dbp,pain,acuity,chiefcomplaint,rhythm,drug_name,generic_drug_code,national_drug_code,reconciliation_nurse_id,drug_class_code,drug_class_classification,administering_nurse_id
0,30005370,2110-06-25 05:10:00,Enter the ED,F,WHITE,AMBULANCE,,,,,,,,,,,,,,,,,,,,,
1,30005370,2110-06-25 05:10:01,Triage in the ED,,,,,,,,98.0,90.0,15.0,95.0,130.0,58.0,4,3.0,SBO,,,,,,,,
2,30005370,2110-06-25 05:28:00,Medicine reconciliation,,,,,,,,,,,,,,,,,,ondansetron HCl [Zofran],16392.0,1.659005e+10,1.0,457.0,Antiemetic - Selective Serotonin 5-HT3 Antagon...,
3,30005370,2110-06-25 05:28:00,Medicine reconciliation,,,,,,,,,,,,,,,,,,omeprazole [Prilosec],43137.0,1.671407e+10,1.0,445.0,Gastric Acid Secretion Reducing Agents - Proto...,
4,30005370,2110-06-25 05:28:00,Medicine reconciliation,,,,,,,,,,,,,,,,,,acetaminophen [Acetaminophen Extra Strength],4490.0,1.000301e+10,1.0,577.0,Analgesic or Antipyretic Non-Opioid,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25110,39999292,2110-08-15 00:48:00,Vital sign check,,,,,,,,97.9,117.0,18.0,99.0,162.0,84.0,0,,,,,,,,,,
25111,39999292,2110-08-15 00:48:01,Triage in the ED,,,,,,,,97.9,117.0,18.0,99.0,162.0,84.0,0,3.0,Cough,,,,,,,,
25112,39999292,2110-08-15 01:23:00,Medicine reconciliation,,,,,,,,,,,,,,,,,,Prozac,46213.0,1.054406e+10,1.0,530.0,Antidepressant - Selective Serotonin Reuptake ...,
25113,39999292,2110-08-15 02:57:00,Vital sign check,,,,,,,,98.0,87.0,18.0,99.0,136.0,82.0,0,,,,,,,,,,


In [None]:
# Persist the current log to CSV; the DataFrame index is not written
output_filename = '18_12_Filtered_Log_99_5%.csv'
df.to_csv(path_or_buf=output_filename, index=False)
print(f"Dataset pronto salvato come: '{output_filename}'")

Dataset pronto salvato come: '18_12_Filtered_Log_99_5%.csv'


# rimozione rumore 98% scartata (troppi casi rimossi, circa il 38%)

In [None]:
# Variant-based noise filter with a 98% cumulative-coverage threshold.
# NOTE(review): this cell reassigns `df`. When the notebook is run top to bottom
# it therefore filters whatever `df` currently holds (including the result of an
# earlier variant filter) and replaces it - confirm this is intended.
try:
    # Used below as a mapping: variant -> number of cases following it
    variants_count = pm4py.get_variants(df, case_id_key="case:concept:name")

    # Rank the variants from most to least frequent.
    # The index is reset so that row labels equal ranks. Without the reset the
    # sorted frame keeps its pre-sort labels, and a label-based cut
    # (index.max() followed by .loc[:label]) lands at an arbitrary rank and
    # discards far more variants than the threshold asks for.
    variants_df = (
        pd.DataFrame(list(variants_count.items()), columns=['variant', 'count'])
        .sort_values(by='count', ascending=False, kind='stable')
        .reset_index(drop=True)
    )

    # Cumulative share of cases covered by the top-k variants
    variants_df['cumulative_sum'] = variants_df['count'].cumsum()
    variants_df['cumulative_perc'] = 100 * variants_df['cumulative_sum'] / variants_df['count'].sum()

    # cumulative_perc never decreases, so the rows within 98% form a prefix of
    # the ranked frame; its length gives the cut position.
    n_within_threshold = int((variants_df['cumulative_perc'] <= 98).sum())

    # Fallback kept from the original logic: when not even the most frequent
    # variant fits within the threshold, keep every variant (no filtering).
    if n_within_threshold > 0:
        threshold_index = n_within_threshold - 1
    else:
        threshold_index = len(variants_df) - 1

    # Variants to keep (positional slice, inclusive of threshold_index)
    top_98_variants = variants_df.iloc[:threshold_index + 1]['variant'].tolist()

    # Actual filtering of the event log
    filtered_df = pm4py.filter_variants(df, top_98_variants)

    if len(filtered_df) > 0 and len(filtered_df) < len(df):
        print("Filtro Rumore Applicato.")
        print(f"Casi originali: {len(df['case:concept:name'].unique())}")
        print(f"Casi dopo filtro: {len(filtered_df['case:concept:name'].unique())}")
        df = filtered_df
    else:
        print("Il filtro non ha rimosso nulla (tutte le varianti rientrano nel 98% o dataset piccolo).")

except Exception as e:
    # Best effort: on any failure the unfiltered log is kept and the error is reported
    print(f"Errore nel filtering varianti: {e}. Mantengo il dataset completo.")

final_cases = len(df['case:concept:name'].unique())
print(f"\n--- NUMERO CASI FINALI DA ANALIZZARE: {final_cases} ---")

df

Filtro Rumore Applicato.
Casi originali: 1820
Casi dopo filtro: 1127

--- NUMERO CASI FINALI DA ANALIZZARE: 1127 ---


Unnamed: 0,case:concept:name,time:timestamp,concept:name,gender,race,arrival_transport,disposition,diagnosis_sequence,diagnosis_code,diagnosis_description,temperature,heartrate,resprate,o2sat,sbp,dbp,pain,acuity,chiefcomplaint,rhythm,drug_name,generic_drug_code,national_drug_code,reconciliation_nurse_id,drug_class_code,drug_class_classification,administering_nurse_id,start_timestamp,Elapsed_Time_Mins,Processing_Time,lead_time
0,30005370,2110-06-25 05:10:00,Enter the ED,F,WHITE,AMBULANCE,,,,,,,,,,,,,,,,,,,,,,2110-06-25 05:10:00,0.000000,0.000000,272.0
1,30005370,2110-06-25 05:10:01,Triage in the ED,,,,,,,,98.0,90.0,15.0,95.0,130.0,58.0,4,3.0,SBO,,,,,,,,,2110-06-25 05:10:00,0.016667,0.016667,272.0
2,30005370,2110-06-25 05:28:00,Medicine reconciliation,,,,,,,,,,,,,,,,,,ondansetron HCl [Zofran],16392.0,1.659005e+10,1.0,457.0,Antiemetic - Selective Serotonin 5-HT3 Antagon...,,2110-06-25 05:10:00,18.000000,17.983333,272.0
11,30005370,2110-06-25 05:29:00,Medicine reconciliation,,,,,,,,,,,,,,,,,,Lovenox,27993.0,3.535600e+10,1.0,805.0,Low Molecular Weight Heparins,,2110-06-25 05:10:00,19.000000,1.000000,272.0
13,30005370,2110-06-25 05:31:00,Medicine dispensations,,,,,,,,,,,,,,,,,,Lidocaine Jelly 2% (Urojet),38861.0,,,,,1.0,2110-06-25 05:10:00,21.000000,2.000000,272.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25110,39999292,2110-08-15 00:48:00,Vital sign check,,,,,,,,97.9,117.0,18.0,99.0,162.0,84.0,0,,,,,,,,,,,2110-08-15 00:48:00,0.000000,0.000000,134.0
25111,39999292,2110-08-15 00:48:01,Triage in the ED,,,,,,,,97.9,117.0,18.0,99.0,162.0,84.0,0,3.0,Cough,,,,,,,,,2110-08-15 00:48:00,0.016667,0.016667,134.0
25112,39999292,2110-08-15 01:23:00,Medicine reconciliation,,,,,,,,,,,,,,,,,,Prozac,46213.0,1.054406e+10,1.0,530.0,Antidepressant - Selective Serotonin Reuptake ...,,2110-08-15 00:48:00,35.000000,34.983333,134.0
25113,39999292,2110-08-15 02:57:00,Vital sign check,,,,,,,,98.0,87.0,18.0,99.0,136.0,82.0,0,,,,,,,,,,,2110-08-15 00:48:00,129.000000,94.000000,134.0


# dataset del 98% scaricato in precedenza ma non usato
