# Import packages

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader, random_split
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from catboost import CatBoostClassifier, Pool

In [None]:
# Import MFF Toolkit
from ecg.data_processing import load_raw_ecg_data, load_and_process_ptb_xl_data, get_NORM_AF_data, oversample_data
from ecg.training import train_model, extract_feature_vectors, ptb_xl_ecg_labeling
from ecg.models import ECGModel, EGMModel
from ablation.excel_processing import process_excel_files
from ablation.data_loading import load_ablation_data, summarise_ablation_data
from synthetic.synthetic_data import generate_synthetic_data, generate_synthetic_ecg_from_ptb_xl, generate_synthetic_egm, create_labels
from integration.data_fusion import normalise_and_OHE_data, combine_data
from evaluation.test_model_performance import cross_validate_model, compute_confidence_interval, plot_roc_and_pr_curves

# 1. ECG data import and pre-processing

## 1.1 Define paths and global variables

In [None]:
# Sampling rate handed to load_raw_ecg_data (unit presumably Hz — TODO confirm).
sampling_rate_ecg = 400

# Filesystem locations. All are empty placeholders and must be filled in
# before the cells below can run.
path_to_ptb_xl = ""  # passed to the PTB-XL loading helpers
input_folder_ablation_data = ""  # first argument of process_excel_files
output_folder_ablation_data = ""  # second argument of process_excel_files; read by load_ablation_data

## 1.2 Load ECG data from the PTB-XL dataset

In [None]:
# Load the PTB-XL table from the dataset root given by path_to_ptb_xl.
Y = load_and_process_ptb_xl_data(path_to_ptb_xl)
# Split it into the two subsets used for pre-training (presumably the records
# labelled NORM and AF respectively — confirm against get_NORM_AF_data).
Y_norm, Y_af = get_NORM_AF_data(Y)


In [None]:
# Build the pre-training table: stack the NORM and AF rows, shuffle them with
# a fixed seed, and renumber the index from zero.
Y_ptb_xl = (
    pd.concat([Y_norm, Y_af])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Load the 12-lead ECG signals that belong to the shuffled rows.
X_ptb_xl = load_raw_ecg_data(Y_ptb_xl, sampling_rate_ecg, path_to_ptb_xl)

# Resample signals and labels together for pre-training.
# NOTE(review): this runs before the train/validation split made further
# down; if oversample_data repeats or synthesises samples, copies may land in
# both partitions — confirm this is intended.
ptb_xl_X_res, ptb_xl_y_res = oversample_data(X_ptb_xl, Y_ptb_xl["Label"])

# Report the shapes after resampling, one per line.
print(ptb_xl_X_res.shape, ptb_xl_y_res.shape, sep="\n")

# 2. Create and pre-train ECG and EGM model

## 2.1 Load models and define DataLoader

In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
ecg_model = ECGModel().to(device)
# NOTE(review): nn.BCELoss works on floating-point targets, while the labels
# below are created as torch.long — presumably train_model casts them; confirm.
criterion = nn.BCELoss()
optimizer = optim.Adam(ecg_model.parameters(), lr=0.001)

# Wrap the resampled signals and their encoded labels in one dataset.
ptb_xl_X_tensor = torch.tensor(ptb_xl_X_res, dtype=torch.float32)
ptb_xl_y_tensor = torch.tensor(ptb_xl_ecg_labeling(ptb_xl_y_res), dtype=torch.long)
ptb_xl_dataset = TensorDataset(ptb_xl_X_tensor, ptb_xl_y_tensor)

train_size = int(0.8 * len(ptb_xl_dataset))  # 80% train, 20% val
val_size = len(ptb_xl_dataset) - train_size
# Seed the split so the train/validation partition is the same on every run,
# in line with the fixed seed (42) used by the other random steps here.
ptb_xl_train_dataset, ptb_xl_val_dataset = random_split(
    ptb_xl_dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(42),
)

# Only the training loader shuffles; validation keeps a fixed order.
ptb_xl_train_dataloader = DataLoader(ptb_xl_train_dataset, batch_size=32, shuffle=True)
ptb_xl_val_dataloader = DataLoader(ptb_xl_val_dataset, batch_size=32, shuffle=False)

## 2.2 Pre-train ECG model

In [None]:
# Pre-train the ECG model on the PTB-XL split for at most 25 epochs.
# patience=3 is forwarded as-is (presumably early stopping — confirm in train_model).
train_model(
    ecg_model,
    device,
    ptb_xl_train_dataloader,
    ptb_xl_val_dataloader,
    criterion,
    optimizer,
    num_epochs=25,
    patience=3,
)

## 2.3 Pre-train EGM model

In [None]:
# TODO: Load the EGM model here

# 3. Create ablation data

## 3.1 Preprocessing ablation data

In [None]:
# Process the Excel files found in the input folder; the second argument is
# the output folder. Decimal commas are replaced with decimal points.
process_excel_files(input_folder_ablation_data, output_folder_ablation_data)

In [None]:
# Read the ablation recordings back from the converted files.
# window, polyorder and delta are forwarded unchanged (presumably smoothing
# settings and a threshold — TODO confirm in load_ablation_data).
temp_data, pressure_data, flow_data = load_ablation_data(
    output_folder_ablation_data,
    window=11,
    polyorder=2,
    delta=0.5,
)

In [None]:
# Summarise the temperature recordings.
sum_data = summarise_ablation_data(temp_data)

# Generate 499 additional synthetic patients from the summary, with a fixed seed.
synthetic_data = generate_synthetic_data(
    sum_data,
    num_patients=499,
    random_state=42,
)

# Real rows first, synthetic rows after, with a fresh running index.
ablation_data = pd.concat(
    [sum_data, synthetic_data],
    ignore_index=True,
)

In [None]:
# Show the combined ablation table as the cell output.
ablation_data

# 4. Create synthetic data

## 4.1 Generate synthetic surface ECG data by selecting some AF cases from PTB-XL (for testing only)

In [None]:
# Request one PTB-XL AF entry per row of the ablation table.
df_synt_ptb_xl_ecgs = generate_synthetic_ecg_from_ptb_xl(
    Y_af,
    len(ablation_data),
)

# Load the raw ECG signals for the selected entries.
X_synt_ptb_xl_ecgs = load_raw_ecg_data(
    df_synt_ptb_xl_ecgs,
    sampling_rate_ecg,
    path_to_ptb_xl,
)

## 4.2 Generate synthetic intracardiac electrogram (EGM) data (for testing only)

In [None]:
# Generate synthetic EGM signals from the ablation table.
# All settings are forwarded unchanged; their units and meaning are defined
# by generate_synthetic_egm (not visible here).
synt_egm_singals = generate_synthetic_egm(
    ablation_data,
    total_time=250,
    sampling_rate=1000,
    no_effect_ratio=0.5,
    min_effect=0.1,
    max_effect=0.9,
)

## 4.3 Create labels

In [None]:
# Derive the training labels from the synthetic EGM signals and the ablation
# table, requesting one-hot encoded output.
ablation_labels = create_labels(
    synt_egm_singals,
    ablation_data,
    one_hot_encode=True,
)

# 5. Train models

## 5.1 Train ECG model and extract features

In [None]:
# Pair the selected PTB-XL ECG signals with the ablation labels.
# NOTE(review): the labels are stored as torch.long while the loss in use is
# nn.BCELoss — presumably train_model casts them; confirm.
ptb_xl_synt_X_tensor = torch.tensor(X_synt_ptb_xl_ecgs, dtype=torch.float32)
ptb_xl_synt_y_tensor = torch.tensor(ablation_labels, dtype=torch.long)
ptb_xl_synt_dataset = TensorDataset(ptb_xl_synt_X_tensor, ptb_xl_synt_y_tensor)

train_size = int(0.8 * len(ptb_xl_synt_dataset))  # 80% train, 20% val
val_size = len(ptb_xl_synt_dataset) - train_size
# Seed the split so the train/validation partition is the same on every run,
# in line with the fixed seed (42) used by the other random steps here.
ptb_xl_synt_train_dataset, ptb_xl_synt_val_dataset = random_split(
    ptb_xl_synt_dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(42),
)

# Only the training loader shuffles; validation keeps a fixed order.
ptb_xl_synt_train_dataloader = DataLoader(ptb_xl_synt_train_dataset, batch_size=32, shuffle=True)
ptb_xl_synt_val_dataloader = DataLoader(ptb_xl_synt_val_dataset, batch_size=32, shuffle=False)

In [None]:
# Continue training the ECG model on the ablation-labelled ECG split.
# NOTE(review): `criterion` and `optimizer` are notebook globals that the EGM
# setup cell rebinds to the EGM model; run this cell before that one, or the
# optimizer passed here will not hold ecg_model's parameters.
train_model(
    ecg_model,
    device,
    ptb_xl_synt_train_dataloader,
    ptb_xl_synt_val_dataloader,
    criterion,
    optimizer,
    num_epochs=25,
    patience=3,
)

In [None]:
# ECG feature vectors: iterate over the complete dataset (train and validation
# parts together) without shuffling, so the batch order follows the dataset order.
ptb_xl_synt_dataloader_vec = DataLoader(
    ptb_xl_synt_dataset,
    batch_size=32,
    shuffle=False,
)
feat_vector_ecg = extract_feature_vectors(
    ecg_model,
    device,
    ptb_xl_synt_dataloader_vec,
)

## 5.2 Train EGM model and extract features

In [None]:
# Reshape EGM data for the EGM model.
# Take element [1] of every synthetic EGM entry and stack them into one array.
# The variable keeps its `ecg_` name because a later cell reads it, although
# it holds EGM signals.
ecg_signals_array = np.array([signal[1] for signal in synt_egm_singals])
# One row per ablation entry plus a trailing axis of size 1. The signal length
# is inferred (-1) instead of being hard-coded as 250000, so this keeps working
# when the length of the generated signals changes.
ecg_signals_array = ecg_signals_array.reshape(len(ablation_data), -1, 1)

In [None]:
# Fresh EGM model with its own loss and optimizer.
# NOTE: this rebinds the notebook-global `criterion` and `optimizer` that the
# ECG cells also use.
egm_model = EGMModel().to(device)
# NOTE(review): nn.BCELoss works on floating-point targets, while the labels
# below are created as torch.long — presumably train_model casts them; confirm.
criterion = nn.BCELoss()
optimizer = optim.Adam(egm_model.parameters(), lr=0.001)

# Pair the reshaped EGM signals with the ablation labels.
X_egm_tensor = torch.tensor(ecg_signals_array, dtype=torch.float32)
y_egm_tensor = torch.tensor(ablation_labels, dtype=torch.long)
y_egm_dataset = TensorDataset(X_egm_tensor, y_egm_tensor)

train_size = int(0.8 * len(y_egm_dataset))  # 80% train, 20% val
val_size = len(y_egm_dataset) - train_size
# Seed the split so the train/validation partition is the same on every run,
# in line with the fixed seed (42) used by the other random steps here.
egm_train_dataset, egm_val_dataset = random_split(
    y_egm_dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(42),
)

# The `emg_` spelling of the loader names is kept because the training cell
# refers to them. Only the training loader shuffles.
emg_train_dataloader = DataLoader(egm_train_dataset, batch_size=32, shuffle=True)
emg_val_dataloader = DataLoader(egm_val_dataset, batch_size=32, shuffle=False)

In [None]:
# Train the EGM model on the synthetic EGM split for at most 25 epochs.
# patience=3 is forwarded as-is (presumably early stopping — confirm in train_model).
train_model(
    egm_model,
    device,
    emg_train_dataloader,
    emg_val_dataloader,
    criterion,
    optimizer,
    num_epochs=25,
    patience=3,
)

In [None]:
# EGM feature vectors: iterate over the complete dataset (train and validation
# parts together) without shuffling, so the batch order follows the dataset order.
synt_egm_dataloader_vec = DataLoader(
    y_egm_dataset,
    batch_size=32,
    shuffle=False,
)
feat_vector_egm = extract_feature_vectors(
    egm_model,
    device,
    synt_egm_dataloader_vec,
)

# 6. Fusion model

## 6.1 Normalise data

In [None]:
# Columns handed to normalise_and_OHE_data as continuous variables:
# the ablation count plus mean/median/variance of three quantities.
continuous_vars = [
    'AblationCount',
    'NadirTemperature_mean', 'NadirTemperature_median', 'NadirTemperature_var',
    'AblationTime_mean', 'AblationTime_median', 'AblationTime_var',
    't_end_mean', 't_end_median', 't_end_var',
]

# Columns handed to normalise_and_OHE_data as categorical variables.
categorical_vars = [
    'Gender',
    'RSPV', 'LIPV', 'LSPV', 'RIPV',
    'Diagnosis',
]

norm_ablation_data = normalise_and_OHE_data(
    ablation_data,
    continuous_vars,
    categorical_vars,
)

## 6.2 Combine all data

In [None]:
# Fuse the modalities: ECG features, EGM features, an unused third slot (None)
# and the normalised ablation table without its PatientId column.
final_df = combine_data(
    pd.DataFrame(feat_vector_ecg),
    pd.DataFrame(feat_vector_egm),
    None,
    norm_ablation_data.drop(columns=["PatientId"]),
    axis=1,
)

## 6.3 Train CatBoost model and perform cross validation

In [None]:
# 10-split cross-validation on the fused features. The one-hot labels are
# collapsed to class indices with argmax along axis 1.
(
    labels,
    preds,
    pred_probs,
    roc_aucs,
    all_confusion_matrices,
    cm_scores,
) = cross_validate_model(
    np.array(final_df),
    np.array(np.argmax(ablation_labels, axis=1)),
    n_splits=10,
)

## 6.4 Feature Importance

In [None]:
# Single 80/20 hold-out split (seed 42) of the fused features. The one-hot
# labels are collapsed to class indices with argmax along axis 1.
X_fi_train, X_fi_test, y_fi_train, y_fi_test = train_test_split(
    np.array(final_df),
    np.array(np.argmax(ablation_labels, axis=1)),
    test_size=0.2,
    random_state=42,
)

# Wrap both partitions in CatBoost pools.
train_pool = Pool(data=X_fi_train, label=y_fi_train)
test_pool = Pool(data=X_fi_test, label=y_fi_test)

# Fit the classifier on the training pool only.
catboost_model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.1,
    depth=2,
    verbose=10,
)
catboost_model.fit(train_pool)

# Predicted classes, and column 1 of the predicted probabilities, for the
# held-out pool.
y_pred = catboost_model.predict(test_pool)
y_pred_prob = catboost_model.predict_proba(test_pool)[:, 1]

# Hold-out accuracy and the per-class report.
print("Accuracy:", accuracy_score(y_fi_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_fi_test, y_pred))

In [None]:
# Feature importances of the fitted CatBoost model, computed on the training pool.
feature_importances = catboost_model.get_feature_importance(train_pool)

# Column labels of the fused table, converted to strings.
feature_names = np.array(final_df.columns, dtype=str)

# Column ranges of the three groups.
# NOTE(review): the boundaries 128 and 256 are hard-coded; they presumably
# match the widths of the ECG and EGM feature vectors — confirm against the
# models if those change.
group_1_indices = np.arange(128)
group_2_indices = np.arange(128, 256)
other_indices = np.arange(256, len(feature_names))

# Collapse each of the first two groups into one summed importance.
group_1_importance = feature_importances[group_1_indices].sum()
group_2_importance = feature_importances[group_2_indices].sum()

# The remaining columns keep their individual importances.
other_importances = feature_importances[other_indices]

# Bar labels and values: the two aggregated groups first, then the rest.
final_feature_names = ["ECG", "EGM", *feature_names[other_indices]]
final_importances = [group_1_importance, group_2_importance, *other_importances]

# Horizontal bar chart of the aggregated importances.
plt.figure(figsize=(5, 6))
plt.barh(final_feature_names, final_importances, align='center')
plt.xlabel("Feature Importance")

plt.show()