In [None]:
import os
import re
import numpy as np
import torch
import pandas as pd
from skimage.transform import resize

# --- Load the AIRC database and build the mapping (Cell 1) ---
# Absolute path to the AIRC clinical CSV. It is machine-specific, so edit it
# before running the notebook on another machine.
mapping_csv = '/home/user/Projects/thesis/entropic_project/DB_AIRCl.csv'
# Load the CSV with pandas defaults (no rows are skipped by this call).
# NOTE(review): the previous comment claimed the first 2 rows are skipped
# (data starting at row 3), but no `skiprows` argument is passed — confirm
# against the actual file which of the two is intended.
df = pd.read_csv(mapping_csv)


# Assume:
# Column 0: Patient ID, Column 1: Full patient name.
def extract_last_name(full_name):
    """Return the lower-cased last-name key derived from a full patient name.

    If the name contains a comma, the text before the first comma is used
    (e.g. "Rossi, Mario" -> "rossi"). Otherwise the first
    whitespace-separated token is used (e.g. "Bianchi Luca" -> "bianchi").

    Parameters
    ----------
    full_name : str
        Full patient name as read from the clinical table.

    Returns
    -------
    str
        The stripped, lower-cased key; an empty string when the name is
        empty or contains only whitespace.
    """
    if ',' in full_name:
        return full_name.split(',')[0].strip().lower()
    tokens = full_name.split()
    # An empty / whitespace-only name has no tokens; indexing [0] would raise
    # IndexError, so return an empty key instead.
    return tokens[0].lower() if tokens else ""

# Normalise the two columns used for matching (column 0 = patient ID,
# column 1 = full patient name) and derive the last-name key.
df['patient_id'] = df.iloc[:, 0].astype(str).str.strip()
df['patient_name'] = df.iloc[:, 1].astype(str).str.strip()
df['last_name'] = df['patient_name'].apply(extract_last_name)

print("Patient Mapping DataFrame:")
print(df[['patient_id', 'patient_name', 'last_name']].head())
# Only report the table size: dumping the whole frame floods the output and
# writes every patient name into the saved notebook.
print(f"Clinical table shape: {df.shape}")

# Create primary mapping: last_name -> patient_id.
# If two rows share a last name, the later row wins.
last_name_to_id = dict(zip(df['last_name'], df['patient_id']))

# --- Parameters ---
kernel_radius_values = [1, 2, 3, 4]
base_extraction_dir = '/home/user/Projects/thesis/entropic_project/extraction_results_partial/extraction_results_kernelRadius_'
downscale_factor = 4.5  # 4.5 is the limit for my CPU


def _resolve_patient_id(file_name, clinical_df, name_to_id):
    """Map an extraction file name to a patient ID, or "Unknown".

    The first run of ASCII letters in the file name (extension excluded) is
    lower-cased and looked up in ``name_to_id``. If that fails, the
    ``patient_name`` column of ``clinical_df`` is searched for a word that
    starts with the extracted string and the first matching row is used.
    """
    # Search the stem only: with the extension included the pattern always
    # matches at least "npy", so a file without any name would never be
    # reported as such.
    stem = os.path.splitext(file_name)[0]
    m = re.search(r'([a-zA-Z]+)', stem)
    if m is None:
        print(f"File '{file_name}': could not extract a name.")
        return "Unknown"

    extracted_name = m.group(1).lower()

    # First, try primary mapping.
    if extracted_name in name_to_id:
        return name_to_id[extracted_name]

    # Fallback: search the full patient_name column for a match.
    candidates = clinical_df[
        clinical_df['patient_name'].str.lower().str.contains(r'\b' + re.escape(extracted_name))
    ]
    if candidates.empty:
        print(f"File '{file_name}': extracted '{extracted_name}' not found in full names.")
        return "Unknown"

    pid = candidates.iloc[0]['patient_id']
    print(f"File '{file_name}': extracted '{extracted_name}' matched full name '{candidates.iloc[0]['patient_name']}' with id {pid}")
    return pid


# --- Process files in an extraction folder ---
# (Example for one kernelRadius; you can loop over kernel_radius_values as needed.)
extraction_folder = f"{base_extraction_dir}1"  # For example, kernelRadius=1
print(f"\n=== Processing extraction folder: {extraction_folder} ===")

tensors = []
patient_ids = []

# sorted() makes the file order (and so the row order of every later result)
# deterministic.
for file_name in sorted(os.listdir(extraction_folder)):
    if not file_name.endswith('.npy'):
        continue
    file_path = os.path.join(extraction_folder, file_name)

    patient_ids.append(_resolve_patient_id(file_name, df, last_name_to_id))

    # Load the tensor.
    tensor = np.load(file_path)
    # Downscale the tensor. `dim // 4.5` is a float, so cast to int, and keep
    # every axis at least 1 long so a short axis cannot collapse to size 0.
    new_shape = tuple(max(1, int(dim // downscale_factor)) for dim in tensor.shape)
    tensor_resized = resize(tensor, new_shape, mode='constant', preserve_range=True, anti_aliasing=True)
    tensors.append(torch.tensor(tensor_resized, dtype=torch.float32))

print(f"Loaded {len(tensors)} tensors for {len(patient_ids)} patients.")
print("Assigned Patient IDs:")
print(patient_ids)

FileNotFoundError: [Errno 2] No such file or directory: '/home/user/Projects/thesis/entropic_project/DB_AIRCl.csv'

In [None]:
import numpy as np
import pandas as pd
from lifelines.statistics import multivariate_logrank_test
import torch
import torch.nn.functional as F
from sklearn.metrics import pairwise_distances
from sklearn.cluster import AgglomerativeClustering, DBSCAN, KMeans
from sklearn_extra.cluster import KMedoids
import os
import math
from scipy.linalg import eig

# --- Clustering Parameters ---
# Gaussian widths passed to IMED(); one distance matrix is built per value.
sigma_values = [1, 2, 3]
# Target cluster counts tried for Agglomerative, KMeans and KMedoids.
num_clusters_list = [2, 3, 4, 5]
# DBSCAN settings; eps is compared against the precomputed distance matrix.
dbscan_eps = 1
dbscan_min_samples = 3

# Directory that receives the distance matrices and clustering tables.
clustering_save_dir = "/home/user/Projects/thesis/entropic_project/clustering_results"
# exist_ok=True creates the directory only when missing, without the
# check-then-create gap of a separate os.path.exists() test.
os.makedirs(clustering_save_dir, exist_ok=True)

# Helper functions: TensorProduct and IMED (if not already defined)
def TensorProduct(A, B, C):
    """Return the Kronecker product ``A (x) B (x) C`` of three arrays."""
    return np.kron(np.kron(A, B), C)

def _gaussian_gram(length, square_sigma):
    """Return the (length x length) matrix exp(-(i - j)^2 / (2 * square_sigma))."""
    idx = np.arange(length)
    return np.exp(-np.square(idx[:, None] - idx) / (2 * square_sigma))

def _symmetric_sqrt(G):
    """Return the symmetric square root of a symmetric PSD matrix ``G``."""
    # eigh is the solver for symmetric matrices: real eigenvalues and
    # orthonormal eigenvectors, so V diag(sqrt(w)) V.T really is the root.
    vals, vecs = np.linalg.eigh(G)
    # Eigenvalues that are zero in exact arithmetic can come out slightly
    # negative; clip them so the square root does not produce NaN.
    vals = np.clip(vals, 0.0, None)
    return (vecs * np.sqrt(vals)) @ vecs.T

def IMED(l1, l2, l3, sigma):
    """Build the Gaussian metric matrix for an (l1, l2, l3) grid and its root.

    One Gaussian matrix ``G_k[i, j] = exp(-(i - j)^2 / (2 * sigma^2))`` is
    built per axis and the three are combined as ``G3 (x) G1 (x) G2``, scaled
    by ``1 / (2 * pi * sigma^2)``. With that ordering the matrices act on
    vectors whose slowest-varying index is axis 3, then axis 1, then axis 2.

    Parameters
    ----------
    l1, l2, l3 : int
        Grid size along each of the three axes.
    sigma : float
        Width of the Gaussian; must be non-zero.

    Returns
    -------
    Gtensor : numpy.ndarray, shape (l1*l2*l3, l1*l2*l3)
        The scaled Kronecker product of the per-axis Gaussian matrices.
    root_G : numpy.ndarray, shape (l1*l2*l3, l1*l2*l3)
        Symmetric matrix with ``root_G @ root_G`` equal to ``Gtensor`` up to
        rounding.
    """
    square_sigma = sigma ** 2
    scale = 1 / (2 * math.pi * square_sigma)

    G1 = _gaussian_gram(l1, square_sigma)
    G2 = _gaussian_gram(l2, square_sigma)
    G3 = _gaussian_gram(l3, square_sigma)
    Gtensor = TensorProduct(G3, G1, G2) * scale

    # The square root of a Kronecker product is the Kronecker product of the
    # square roots, so take the root of each small factor instead of
    # decomposing and multiplying full-size matrices.
    root_G = math.sqrt(scale) * TensorProduct(
        _symmetric_sqrt(G3), _symmetric_sqrt(G1), _symmetric_sqrt(G2)
    )
    return Gtensor, root_G

# Create a list to store statistical tests results
# (nothing is appended to it in this block).
statistical_tests_results = []

# --- Main Clustering Loop ---
for kr in kernel_radius_values:
    # NOTE(review): `tensors` and `patient_ids` are whatever the loading cell
    # left in memory. Nothing here reloads them per kernel radius, so every
    # `kr` iteration clusters the same data and only the output file names
    # differ. Load the matching extraction folder here if per-radius results
    # are intended.
    print(f"\nKernelRadius {kr}: Loaded {len(tensors)} tensors for {len(patient_ids)} patients.")

    # Pad tensors with zeros (at the end of each axis) to a uniform shape.
    max_shape = [
        int(size)
        for size in torch.tensor([tuple(t.shape) for t in tensors]).max(dim=0).values
    ]
    padded_tensors = []
    for tensor in tensors:
        # F.pad expects (before, after) pairs starting from the LAST axis.
        pad_spec = []
        for axis in reversed(range(tensor.dim())):
            pad_spec.extend([0, max_shape[axis] - tensor.shape[axis]])
        padded_tensors.append(F.pad(tensor, pad_spec, mode="constant", value=0))
    print(f"Padded tensor shape: {padded_tensors[0].shape}")

    # Flatten padded tensors into a 2D matrix (one row per tensor).
    # IMED() builds its matrices as G3 (x) G1 (x) G2, i.e. for vectors whose
    # slowest index is axis 3, then axis 1, then axis 2. A plain flatten()
    # orders the entries axis 1, 2, 3 and only agrees with that when all
    # three sizes are equal, so reorder the axes before flattening.
    n_paz = len(padded_tensors)
    l1, l2, l3 = padded_tensors[0].shape
    v = torch.stack(
        [t.permute(2, 0, 1).reshape(-1) for t in padded_tensors]
    ).to(torch.float32)
    vector_length = v.shape[1]

    # Loop over sigma values.
    for sigma in sigma_values:
        print(f"  --- Sigma: {sigma} ---")
        # rootG is a dense (vector_length x vector_length) matrix.
        _, rootG = IMED(l1, l2, l3, sigma)
        rootG = torch.tensor(rootG, dtype=torch.float32)

        # Standardize: row i of u is rootG @ v[i], done as one matrix product.
        u = v @ rootG.T
        u_np = u.numpy()

        # Compute Euclidean distance matrix.
        distance_matrix = pairwise_distances(u_np, metric="euclidean")
        print(f"Distance matrix shape: {distance_matrix.shape}")
        np.save(
            os.path.join(clustering_save_dir, f"distance_matrix_kRad_{kr}_sigma_{sigma}.npy"),
            distance_matrix,
        )
        print("Distance matrix saved.")

        # Run DBSCAN (once per (kr, sigma)).
        dbscan_model = DBSCAN(eps=dbscan_eps, min_samples=dbscan_min_samples, metric='precomputed')
        labels_dbscan = dbscan_model.fit_predict(distance_matrix)

        # Loop over target number of clusters.
        for n_clusters in num_clusters_list:
            print(f"    >> Clusters: {n_clusters}")
            # Agglomerative Clustering.
            agglo = AgglomerativeClustering(n_clusters=n_clusters, metric='precomputed', linkage='average')
            labels_agglo = agglo.fit_predict(distance_matrix)

            # KMeans Clustering.
            # KMeans has no precomputed-distance mode: it treats its input as
            # a feature matrix. Fit it on the transformed vectors, whose
            # Euclidean distances are exactly `distance_matrix`, rather than
            # on the distance matrix itself.
            kmeans = KMeans(n_clusters=n_clusters, n_init=30, random_state=42)
            labels_kmeans = kmeans.fit_predict(u_np)

            # KMedoids Clustering.
            kmedoids = KMedoids(n_clusters=n_clusters, metric='precomputed', random_state=42)
            labels_kmedoids = kmedoids.fit_predict(distance_matrix)

            # Aggregate clustering results including patient IDs.
            clustering_df = pd.DataFrame({
                "patient_id": patient_ids,
                "Agglomerative": labels_agglo,
                "KMeans": labels_kmeans,
                "KMedoids": labels_kmedoids,
                "DBSCAN": labels_dbscan,  # DBSCAN labels remain constant for this (kr, sigma)
            })
            print("Clustering DataFrame:")
            print(clustering_df)

            # Merge with the clinical DataFrame. The default inner join drops
            # files whose patient_id has no row in `df` (e.g. "Unknown").
            merged_df = df.merge(clustering_df, on="patient_id")
            # Report sizes only: the full tables contain patient names and
            # would be printed once per (kr, sigma, n_clusters) combination.
            print(f"Clinical rows: {len(df)}, merged rows: {len(merged_df)}")

            result_stem = f"clustering_results_kRad_{kr}_sigma_{sigma}_nClusters_{n_clusters}"
            result_filepath = os.path.join(clustering_save_dir, f"{result_stem}.npy")
            # NOTE(review): np.save on a DataFrame stores the values without
            # column names; with mixed column types this becomes a pickled
            # object array that needs allow_pickle=True to load. The CSV
            # below keeps the column names.
            np.save(result_filepath, merged_df)
            merged_df.to_csv(os.path.join(clustering_save_dir, f"{result_stem}.csv"), index=False)
            print(f"      Saved clustering result: {result_filepath}")

PermissionError: [Errno 13] Permission denied: '/home/user'