In [None]:
import os
from locpix_points.scripts.visualise import visualise_torch_geometric
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import mplcursors
import numpy as np
import open3d as o3d
import pandas as pd
import polars as pl
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
import umap
import yaml

In [None]:
def generate_umap_embedding(X, min_dist, n_neighbours, random_state=None):
    """Fit UMAP to X and return the resulting embedding

    Args:
        X (array): Array to fit to (one row per sample)
        min_dist (float): ``min_dist`` parameter passed to UMAP
        n_neighbours (int): ``n_neighbors`` parameter passed to UMAP
        random_state (int, optional): Seed forwarded to UMAP so that the
            embedding can be reproduced between runs. Defaults to None,
            which leaves UMAP unseeded (the previous behaviour).

    Returns:
        embedding (array): Output of ``fit_transform`` on X"""

    reducer = umap.UMAP(
        min_dist=min_dist,
        n_neighbors=n_neighbours,
        random_state=random_state,
    )

    return reducer.fit_transform(X)

def visualise_umap_embedding(embedding, df, label_map):
    """Visualise UMAP results
    
    Args:
        embedding (array): UMAP embedding
        df (dataframe): Dataframe with data in
        label_map (dict): Map from numbers to concepts"""

    # Plot UMAP - per cluster
    plt.close('all')
    %matplotlib widget
    plt.scatter(
        embedding[:, 0],
        embedding[:, 1],
        c=[sns.color_palette()[x] for x in df.type.map(label_map)],
        label=[x for x in df.type.map(label_map)],
        s=5,
    )
    num_classes = len(label_map.keys())
    patches = [
        mpatches.Patch(color=sns.color_palette()[i], label=list(label_map.keys())[i])
        for i in range(num_classes)
    ]
    cursor = mplcursors.cursor(hover=False)
    cursor.connect(
        "add", lambda sel: sel.annotation.set_text(f"{df.file_name[sel.index]}")
    )
    plt.legend(handles=patches)
    plt.gca().set_aspect("equal", "datalim")
    plt.title("UMAP projection of the dataset", fontsize=24)
    plt.show()


In [None]:
def generate_pca_embedding(X, n_components):
    """Project X onto its leading principal components

    Args:
        X (array): Array to fit to
        n_components (int): Number of principal components to keep

    Returns:
        reduced_data (array): X transformed by the fitted PCA"""

    pca = PCA(n_components=n_components)
    return pca.fit_transform(X)

def visualise_pca_embedding(pca_embedding, df, label_map):
    """Show a PCA embedding as a coloured Open3D point cloud

    Args:
        pca_embedding (array): PCA embedded data with 2 or 3 columns
        df (dataframe): Pandas dataframe with a `type` column; row i must
            correspond to pca_embedding[i]
        label_map (dict): Map from the values in df.type to integer labels.
            The integer is used as the index into the seaborn colour palette

    Raises:
        ValueError: If the embedding does not have 2 or 3 columns"""

    n_dims = pca_embedding.shape[1]
    if n_dims not in (2, 3):
        raise ValueError(
            f"PCA embedding must have 2 or 3 components to be plotted, got {n_dims}"
        )

    # Open3D needs 3D points: place a 2D embedding on the plane z = 1
    if n_dims == 2:
        z = np.ones((pca_embedding.shape[0], 1))
        pca_embedding = np.concatenate([pca_embedding, z], axis=1)

    # colour points according to class
    palette = sns.color_palette()
    classes = df.type.map(label_map).to_numpy()
    colors = np.zeros((len(pca_embedding), 3))
    for class_label, cls in sorted(label_map.items(), key=lambda item: item[1]):
        # a boolean mask on a plain numpy array selects the rows of this class
        colors[classes == cls] = palette[cls]
        print(f"Class {class_label} is RGB colour: {palette[cls]}", flush=True)

    # plot clusters in o3d
    point_cloud = o3d.geometry.PointCloud()
    point_cloud.points = o3d.utility.Vector3dVector(pca_embedding)
    point_cloud.colors = o3d.utility.Vector3dVector(colors)

    # visualise
    o3d.visualization.draw_geometries([point_cloud])


In [None]:
def k_means_fn(X, df, label_map, random_state=None):
    """Run KMeans and report how well the clusters recover the classes

    KMeans is run twice: on a 2D PCA reduction of X and on X itself. The ids
    KMeans gives its clusters are arbitrary, so before each classification
    report is printed the clusters are matched to the true classes.

    Args:
        X (array): Array to fit to
        df (dataframe): Pandas dataframe with a `type` column; row i must
            correspond to X[i]
        label_map (dict): Map from the values in df.type to integer labels
        random_state (int, optional): Seed forwarded to PCA and KMeans so the
            reports can be reproduced. Defaults to None (unseeded)."""

    n_clusters = len(label_map.keys())
    y_true = df.type.map(label_map).to_numpy()

    reduced_data = PCA(n_components=2, random_state=random_state).fit_transform(X)

    runs = (
        ("--- K means report (with PCA reduction to 2D) ---", reduced_data),
        ("--- K means report (NO PCA reduction) ---", X),
    )
    for header, data in runs:
        kmeans = KMeans(
            init="k-means++", n_clusters=n_clusters, random_state=random_state
        )
        kmeans.fit(data)
        y_pred = _match_clusters_to_classes(y_true, kmeans.labels_)

        print(header)
        print(classification_report(y_true, y_pred))


def _match_clusters_to_classes(y_true, cluster_ids):
    """Relabel each cluster id with the class it best corresponds to

    Args:
        y_true (array): True integer label per sample
        cluster_ids (array): Cluster id per sample, as returned by KMeans

    Returns:
        y_pred (array): cluster_ids translated into class labels"""

    from scipy.optimize import linear_sum_assignment

    classes = np.unique(y_true)
    clusters = np.unique(cluster_ids)

    # contingency[i, j] = number of samples in cluster i with true class j
    contingency = np.array(
        [
            [np.sum((cluster_ids == cluster) & (y_true == cls)) for cls in classes]
            for cluster in clusters
        ]
    )

    # one-to-one assignment of clusters to classes that maximises agreement
    rows, cols = linear_sum_assignment(contingency, maximize=True)
    mapping = {clusters[r]: classes[c] for r, c in zip(rows, cols)}

    # if there are more clusters than classes present, the clusters left over
    # take the class that is most common among their members
    for i, cluster in enumerate(clusters):
        mapping.setdefault(cluster, classes[np.argmax(contingency[i])])

    return np.array([mapping[cluster] for cluster in cluster_ids])

## Analysis notebook

In [None]:
# Project root: the parent of the directory the notebook is run from
project_directory = ".."
# Load the two YAML configs (manual features and neural net features)
with open(os.path.join(project_directory, "config/featanalyse_manual.yaml"), "r") as ymlfile:
    config_manual = yaml.safe_load(ymlfile)
with open(os.path.join(project_directory, "config/featanalyse_nn.yaml"), "r") as ymlfile:
    config_nn = yaml.safe_load(ymlfile)
# The same label map is used for the manual and the nn analysis below,
# so the two configs have to agree on it
label_map = config_manual["label_map"]
assert label_map == config_nn["label_map"]
# Names of the manual features selected for analysis
manual_features = config_manual["features"]

### Parameters

In [None]:
# If True the test set is loaded and analysed as well as the train set
final_test = False
# n_neighbours and min_dist arguments passed to generate_umap_embedding
umap_n_neighbours = 20
umap_min_dist = 0.5
# n_components argument passed to generate_pca_embedding
pca_n_components = 2

### Analyse the manual features

In [None]:
# Manual features are read with polars; a pandas copy is kept for the
# plotting and sklearn code
train_df = pl.read_csv(os.path.join(project_directory, "output/train_df_manual.csv"))
train_df_pd = train_df.to_pandas()

if final_test:
    test_df = pl.read_csv(os.path.join(project_directory, "output/test_df_manual.csv"))
    test_df_pd = test_df.to_pandas()

#### Compare PCA vs Convex hull

In [None]:

# One figure per dataset: train always, test only for the final test
frames_to_plot = [train_df_pd]
if final_test:
    frames_to_plot.append(test_df_pd)

for frame in frames_to_plot:
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
    # left panel: length, right panel: area (PCA on x, convex hull on y)
    panels = (
        (ax1, "length_pca", "length_convex_hull"),
        (ax2, "area_pca", "area_convex_hull"),
    )
    for ax, x_col, y_col in panels:
        sns.scatterplot(data=frame, x=x_col, y=y_col, s=5, ax=ax)
    plt.show()

#### Cluster features boxplots

In [None]:
def plot_feature_boxplots(df_boxplot, features, n_cols=4):
    """Draw one boxplot per feature, grouped by class

    Args:
        df_boxplot (dataframe): Pandas dataframe with a `type` column and
            one column per feature
        features (list): Names of the feature columns to plot
        n_cols (int): Number of plots per row. Defaults to 4."""

    # as many rows as are needed to fit every feature (ceiling division)
    n_rows = max(1, -(-len(features) // n_cols))
    fig, axs = plt.subplots(
        n_rows, n_cols, figsize=(5 * n_cols, 4 * n_rows), squeeze=False
    )
    axes = axs.ravel()
    for ax, feat in zip(axes, features):
        sns.boxplot(x='type', y=feat, data=df_boxplot, color='k', fill=False, flierprops=dict(marker='x', markersize=5), ax=ax)
    # hide the axes left over when the grid has more cells than features
    for ax in axes[len(features):]:
        ax.set_visible(False)
    plt.show()


# number of clusters per FOV, cluster type, ...
train_cluster_counts = train_df["file_name"].value_counts()
print("Number of clusters per FOV:", train_cluster_counts)
# number of clusters in each class
train_type_counts = train_df["type"].value_counts()
print("Number of clusters in each class:", train_type_counts)

# selected features plus the columns identifying each cluster
train_df_boxplot = train_df[manual_features + ["type", "file_name"]].to_pandas()
plot_feature_boxplots(train_df_boxplot, manual_features)


if final_test:
    # number of clusters per FOV, cluster type, ...
    test_cluster_counts = test_df["file_name"].value_counts()
    print("Number of clusters per FOV:", test_cluster_counts)
    # number of clusters in each class
    test_type_counts = test_df["type"].value_counts()
    print("Number of clusters in each class:", test_type_counts)

    # selected features plus the columns identifying each cluster
    test_df_boxplot = test_df[manual_features + ["type", "file_name"]].to_pandas()
    plot_feature_boxplots(test_df_boxplot, manual_features)

#### Set-up UMAP/PCA/K-means

In [None]:
# columns that identify or label a cluster rather than describe it
not_features = ["clusterID", "x_mean", "y_mean", "type", "file_name"]
available_features = [col for col in train_df.columns if col not in not_features]

# keep only the features selected by the user, report the ones dropped
removed_features = [col for col in available_features if col not in manual_features]
print("Removed features: ", removed_features)
features = [col for col in available_features if col in manual_features]
print("Features analysed: ", features)

# feature vectors
train_data_feats = train_df_pd[features].values
if final_test:
    test_data_feats = test_df_pd[features].values

num_features = len(train_data_feats[0])
print("Num features: ", num_features)
############ WARNING ##############
# Be careful if analysing neural net features:
# is this the number of features you expect?
# Did this task use manual features as well?

# the scaler is fitted on the train features only and reused for the test set
scaler = StandardScaler()
scaler.fit(train_data_feats)
X_train = scaler.transform(train_data_feats)
if final_test:
    X_test = scaler.transform(test_data_feats)

#### UMAP

In [None]:
# UMAP is fitted separately to the train and (optionally) the test features
train_umap_embedding = generate_umap_embedding(
    X=X_train, min_dist=umap_min_dist, n_neighbours=umap_n_neighbours
)
if final_test:
    test_umap_embedding = generate_umap_embedding(
        X=X_test, min_dist=umap_min_dist, n_neighbours=umap_n_neighbours
    )

In [None]:
# interactive scatter of the manual-feature UMAP embedding
visualise_umap_embedding(
    embedding=train_umap_embedding, df=train_df_pd, label_map=label_map
)
if final_test:
    visualise_umap_embedding(
        embedding=test_umap_embedding, df=test_df_pd, label_map=label_map
    )

#### PCA

In [None]:
# PCA is fitted separately to the train and (optionally) the test features
train_pca_embedding = generate_pca_embedding(
    X=X_train, n_components=pca_n_components
)
if final_test:
    test_pca_embedding = generate_pca_embedding(
        X=X_test, n_components=pca_n_components
    )


In [None]:
# point cloud of the manual-feature PCA embedding (shown via Open3D)
visualise_pca_embedding(
    pca_embedding=train_pca_embedding, df=train_df_pd, label_map=label_map
)
if final_test:
    visualise_pca_embedding(
        pca_embedding=test_pca_embedding, df=test_df_pd, label_map=label_map
    )

#### K-means

In [None]:
# K-means reports for the scaled manual features
k_means_fn(X=X_train, df=train_df_pd, label_map=label_map)
if final_test:
    k_means_fn(X=X_test, df=test_df_pd, label_map=label_map)

### Analyse the nn features

In [None]:
# Neural net features are saved in one csv per encoder (loc, cluster, fov)
train_df_nn_loc = pd.read_csv(
    os.path.join(project_directory, "output/train_df_nn_loc.csv")
)
train_df_nn_cluster = pd.read_csv(
    os.path.join(project_directory, "output/train_df_nn_cluster.csv")
)
train_df_nn_fov = pd.read_csv(
    os.path.join(project_directory, "output/train_df_nn_fov.csv")
)

if not final_test:
    # keep the names defined so the cells below can pass them on unchanged
    test_df_nn_loc = None
    test_df_nn_cluster = None
    test_df_nn_fov = None
else:
    test_df_nn_loc = pd.read_csv(
        os.path.join(project_directory, "output/test_df_nn_loc.csv")
    )
    test_df_nn_cluster = pd.read_csv(
        os.path.join(project_directory, "output/test_df_nn_cluster.csv")
    )
    test_df_nn_fov = pd.read_csv(
        os.path.join(project_directory, "output/test_df_nn_fov.csv")
    )


#### Set-up UMAP/PCA/K-means

In [None]:
def prep_features(train_df, test_df):
    """Extract and standardise the feature columns of the nn dataframes

    Every column other than `type` and `file_name` is treated as a feature.
    The scaler is fitted on the train features only and then applied to both
    the train and the test features.

    Args:
        train_df (dataframe): Pandas dataframe with the train features
        test_df (dataframe): Pandas dataframe with the test features, or
            None if there is no test set to prepare

    Returns:
        X_train_nn (array): Standardised train features
        X_test_nn (array): Standardised test features, None if test_df is
            None"""

    # get features present in the dataframe
    not_features = ["type", "file_name"]
    features = [x for x in train_df.columns.to_list() if x not in not_features]

    # feature vector
    train_data_feats_nn = train_df[features].values

    num_features = len(train_data_feats_nn[0])
    print("Num features: ", num_features)
    ############ WARNING ##############
    # Be careful if analysing neural net features:
    # is this the number of features you expect?
    # Did this task use manual features as well?

    scaler = StandardScaler().fit(train_data_feats_nn)
    X_train_nn = scaler.transform(train_data_feats_nn)

    # decide from the argument itself, not from a notebook-level flag
    if test_df is None:
        return X_train_nn, None

    X_test_nn = scaler.transform(test_df[features].values)
    return X_train_nn, X_test_nn

# standardised features for each of the three encoders
X_train_nn_loc, X_test_nn_loc = prep_features(
    train_df=train_df_nn_loc, test_df=test_df_nn_loc
)
X_train_nn_cluster, X_test_nn_cluster = prep_features(
    train_df=train_df_nn_cluster, test_df=test_df_nn_cluster
)
X_train_nn_fov, X_test_nn_fov = prep_features(
    train_df=train_df_nn_fov, test_df=test_df_nn_fov
)

#### UMAP

In [None]:
# the same UMAP settings are used for every encoder
umap_kwargs = dict(min_dist=umap_min_dist, n_neighbours=umap_n_neighbours)

train_umap_embedding_nn_loc = generate_umap_embedding(X_train_nn_loc, **umap_kwargs)
train_umap_embedding_nn_cluster = generate_umap_embedding(X_train_nn_cluster, **umap_kwargs)
train_umap_embedding_nn_fov = generate_umap_embedding(X_train_nn_fov, **umap_kwargs)
if final_test:
    test_umap_embedding_nn_loc = generate_umap_embedding(X_test_nn_loc, **umap_kwargs)
    test_umap_embedding_nn_cluster = generate_umap_embedding(X_test_nn_cluster, **umap_kwargs)
    test_umap_embedding_nn_fov = generate_umap_embedding(X_test_nn_fov, **umap_kwargs)

In [None]:
# UMAP of the loc encoder features
print("------ LOC ENCODER -------")
visualise_umap_embedding(
    embedding=train_umap_embedding_nn_loc, df=train_df_nn_loc, label_map=label_map
)
if final_test:
    visualise_umap_embedding(
        embedding=test_umap_embedding_nn_loc, df=test_df_nn_loc, label_map=label_map
    )


In [None]:
# UMAP of the cluster encoder features
print("------ CLUSTER ENCODER -------")
visualise_umap_embedding(
    embedding=train_umap_embedding_nn_cluster, df=train_df_nn_cluster, label_map=label_map
)
if final_test:
    visualise_umap_embedding(
        embedding=test_umap_embedding_nn_cluster, df=test_df_nn_cluster, label_map=label_map
    )


In [None]:
# UMAP of the fov encoder features
print("------ FOV ENCODER -------")
visualise_umap_embedding(
    embedding=train_umap_embedding_nn_fov, df=train_df_nn_fov, label_map=label_map
)
if final_test:
    visualise_umap_embedding(
        embedding=test_umap_embedding_nn_fov, df=test_df_nn_fov, label_map=label_map
    )

#### PCA

In [None]:
# PCA embedding of the features of each encoder
train_pca_embedding_nn_loc = generate_pca_embedding(
    X=X_train_nn_loc, n_components=pca_n_components
)
train_pca_embedding_nn_fov = generate_pca_embedding(
    X=X_train_nn_fov, n_components=pca_n_components
)
train_pca_embedding_nn_cluster = generate_pca_embedding(
    X=X_train_nn_cluster, n_components=pca_n_components
)
if final_test:
    test_pca_embedding_nn_loc = generate_pca_embedding(
        X=X_test_nn_loc, n_components=pca_n_components
    )
    test_pca_embedding_nn_fov = generate_pca_embedding(
        X=X_test_nn_fov, n_components=pca_n_components
    )
    test_pca_embedding_nn_cluster = generate_pca_embedding(
        X=X_test_nn_cluster, n_components=pca_n_components
    )

In [None]:
# PCA of the loc encoder features
print("------ LOC ENCODER -------")
visualise_pca_embedding(
    pca_embedding=train_pca_embedding_nn_loc, df=train_df_nn_loc, label_map=label_map
)
if final_test:
    visualise_pca_embedding(
        pca_embedding=test_pca_embedding_nn_loc, df=test_df_nn_loc, label_map=label_map
    )

In [None]:
# PCA of the cluster encoder features
print("------ CLUSTER ENCODER -------")
visualise_pca_embedding(
    pca_embedding=train_pca_embedding_nn_cluster, df=train_df_nn_cluster, label_map=label_map
)
if final_test:
    visualise_pca_embedding(
        pca_embedding=test_pca_embedding_nn_cluster, df=test_df_nn_cluster, label_map=label_map
    )


In [None]:
# PCA of the fov encoder features
print("------ FOV ENCODER -------")
visualise_pca_embedding(
    pca_embedding=train_pca_embedding_nn_fov, df=train_df_nn_fov, label_map=label_map
)
if final_test:
    visualise_pca_embedding(
        pca_embedding=test_pca_embedding_nn_fov, df=test_df_nn_fov, label_map=label_map
    )

#### K-means

In [None]:
# (header, train features, train dataframe, test features, test dataframe)
# for each encoder; the test entries are only used for the final test
kmeans_runs = (
    ("----- LOC ------", X_train_nn_loc, train_df_nn_loc, X_test_nn_loc, test_df_nn_loc),
    ("----- CLUSTER ------", X_train_nn_cluster, train_df_nn_cluster, X_test_nn_cluster, test_df_nn_cluster),
    ("----- FOV ------", X_train_nn_fov, train_df_nn_fov, X_test_nn_fov, test_df_nn_fov),
)

for header, X_tr, df_tr, X_te, df_te in kmeans_runs:
    print(header)
    k_means_fn(X_tr, df_tr, label_map)
    if final_test:
        k_means_fn(X_te, df_te, label_map)


#### SubgraphX

#### PgEx

#### Attention

### Statistical tests

### Visualise file

In [None]:
file_name = "wo_3356"
file_folder = "fold_0"

# Look the file up in the file map of each split. The splits are checked in
# the order train, val, test and the last split containing the file wins.
folder = None
file_idx = None
for split in ("train", "val", "test"):
    file_map_path = os.path.join(project_directory, f"processed/{file_folder}/{split}/file_map.csv")
    file_map = pd.read_csv(file_map_path)
    matches = file_map[file_map["file_name"] == file_name]
    if len(matches) > 0:
        folder = split
        # processed files are saved under their index, not their name
        file_idx = matches["idx"].values[0]

# fail with a clear message rather than a NameError further down
if folder is None:
    raise ValueError(
        f"File {file_name} was not found in the train, val or test file map of {file_folder}"
    )

# visualise it
file_loc = os.path.join(project_directory, f"processed/{file_folder}/{folder}/{file_idx}.pt")
visualise_torch_geometric(file_loc)


## To do

Wed
0. Check through analysis notebook and commit changes 
1. Run new model again
2. Test feat analysis and the analysis notebook
3. Interactive UMAP - check it works for the nn features with the good neural net
4. Push branch to GitHub

Thurs
1. Explainability stuff - implement in notebook where appropriate
2. Test this explainability stuff on task 1
3. Check this notebook and featanalyse are correct

Friday
1. Copy into correct scripts folder
2. Stat tests
3. Test on final test