In [None]:
import sys
path = '/gpfs/commons/groups/gursoy_lab/mstoll/'
sys.path.append(path)

import pandas as pd
import numpy as np 
import torch
import tensorboard
import os
import pickle

from torch.utils.tensorboard import SummaryWriter
from sklearn.cluster import DBSCAN
from sklearn.manifold import TSNE



from codes.models.data_form.DataForm import DataTransfo_1SNP, PatientList, get_paths_pheno_dicts
import matplotlib.pyplot as plt

In [None]:
### Data constants: most of these are forwarded as keyword arguments to the
### DataTransfo_1SNP(...) call in a later cell.
CHR = 1  # chromosome; also used in the tensorboard log path
SNP = 'rs673604'  # SNP identifier; also used in the tensorboard log path
pheno_method = 'Abby' # options noted by the author: 'Paul', 'Abby'
rollup_depth = 4
Classes_nb = 2 # number of classes related to a SNP (here 0 or 1); not referenced in the cells shown here
vocab_size = None # to be defined from the data; not referenced in the cells shown here
padding_token = 0  # forwarded as pad_token
prop_train_test = 0.8
load_data = False
save_data = False
remove_none = True
decorelate = False
equalize_label = False
threshold_corr = 0.9
threshold_rare = 50
remove_rare = 'all' # options noted by the author: None, 'all', 'one_class'
compute_features = True
padding = True
list_env_features = ['age', 'sex']
### Data format
batch_size = 20  # not referenced in the cells shown here
data_share = 1

In [None]:
# Build the single-SNP data transformer from the configuration constants.
# Every argument is taken from the configuration cell so that changing a flag
# there actually changes what is loaded.
dataT = DataTransfo_1SNP(
    SNP=SNP,
    CHR=CHR,
    method=pheno_method,
    padding=padding,
    pad_token=padding_token,
    load_data=load_data,
    save_data=save_data,
    compute_features=compute_features,
    prop_train_test=prop_train_test,
    remove_none=remove_none,  # previously a literal True, which ignored the config flag
    equalize_label=equalize_label,
    rollup_depth=rollup_depth,
    decorelate=decorelate,
    threshold_corr=threshold_corr,
    threshold_rare=threshold_rare,
    remove_rare=remove_rare,
    list_env_features=list_env_features,
    data_share=data_share,
)
#patient_list = dataT.get_patientlist()


In [None]:
# Fetch per-patient data and labels from the transformer, without environmental
# features (with_env=False). get_tree_data is project code not shown here;
# indices_env and name_envs are not used by the cells shown in this notebook.
data, labels, indices_env, name_envs = dataT.get_tree_data(with_env=False)

In [None]:
# Keep only the first `size` patients so clustering and t-SNE stay tractable.
size = 10_000
data_final, labels_final = data[:size], labels[:size]

In [None]:
# Embedding table saved by an earlier training run (presumably one row per
# disease token, judging by the file name -- TODO confirm).
embeddings_file = '/gpfs/commons/groups/gursoy_lab/mstoll/codes/Data_Files/Embeddings/Abby/embedding_abby_no_1_diseases.pth'
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a trusted location.
# map_location='cpu' lets a checkpoint saved from a GPU run load on a CPU-only
# node and keeps the tensors convertible to NumPy in the cells below.
embeddings = torch.load(embeddings_file, map_location='cpu')
# Destination for tensorboard logs; not used by the cells shown in this notebook.
log_tensorboard_path= f'/gpfs/commons/groups/gursoy_lab/mstoll/codes/logs/plots/patients/tensorboard/patients_{CHR}_{SNP}'

In [None]:
# Drop row 0 of the embedding table -- presumably the padding-token embedding
# (padding_token = 0).
# NOTE(review): after this slice, id k selects original row k + 1; confirm the
# ids in data_final are offset accordingly and contain no padding entries.
embeddings_not_padded = embeddings[1:]

# One vector per patient: the mean of the embeddings of that patient's diseases.
patient_data_list = [embeddings_not_padded[list_diseases].mean(axis=0) for list_diseases in data_final]

# Convert each vector explicitly before stacking: np.array() over a list of
# tensors fails for tensors that track gradients or live on a GPU, and is slow
# because NumPy walks the tensors element by element.
patient_data_list_final = np.stack(
    [np.asarray(torch.as_tensor(patient_vector).detach().cpu()) for patient_vector in patient_data_list]
)

In [None]:
# Density-based clustering of the per-patient mean embeddings.
eps = 0.0027  # neighbourhood radius
min_samples = 10  # points required in a neighbourhood to form a core point
dbscan = DBSCAN(min_samples=min_samples, eps=eps)
# fit() returns the estimator itself; labels_ holds the same per-sample
# assignments that fit_predict() returns.
clusters = dbscan.fit(patient_data_list_final).labels_

In [None]:
# Number of distinct values in `clusters`. scikit-learn's DBSCAN labels noise
# as -1, so this count includes the noise label whenever noise points exist.
nb_clusters = np.unique(clusters).size

In [None]:
# Project the patient embeddings to 2-D with t-SNE (fixed seed for repeatability).
tsne = TSNE(perplexity=5, random_state=42, n_components=2)
reduced_data = tsne.fit_transform(patient_data_list_final)

# Scatter the two t-SNE components, coloured by each patient's label.
plt.figure(figsize=(8, 6))
plt.scatter(*reduced_data.T, c=labels_final, cmap='viridis', marker='o', edgecolors='k')
plt.gca().set(
    title='t-SNE Visualization',
    xlabel='t-SNE Component 1',
    ylabel='t-SNE Component 2',
)
plt.show()

In [None]:
# Ratio (#label-0 patients / #label-1 patients) for every DBSCAN cluster.
#
# Cluster ids are read from the labels themselves and the noise label (-1) is
# excluded explicitly. Looping over range(nb_clusters - 1) is only right when
# noise points exist; without noise it silently drops the last cluster.
# DBSCAN numbers clusters 0..k-1, so prop_clusters[i] still belongs to cluster i.
labels_array = np.asarray(labels_final)  # boolean masks need an array, not a list
cluster_ids = [cluster_id for cluster_id in np.unique(clusters) if cluster_id != -1]

prop_clusters = []
for cluster in cluster_ids:
    labels_cluster = labels_array[clusters == cluster]
    n_zero = int(np.sum(labels_cluster == 0))
    n_one = int(np.sum(labels_cluster == 1))
    if n_one:
        prop = n_zero / n_one
    else:
        # No label-1 patient in this cluster: the ratio is unbounded (or
        # undefined when there is no label-0 patient either). These are the
        # values the NumPy division produced, minus the RuntimeWarning.
        prop = np.inf if n_zero else np.nan
    prop_clusters.append(prop)
prop_clusters = np.array(prop_clusters)

In [None]:
# Display the per-cluster label-0 / label-1 ratios computed in the previous cell.
prop_clusters

In [None]:
# NOTE(review): this repeats the display of prop_clusters from the previous cell.
prop_clusters

In [None]:
# Isolate one DBSCAN cluster for closer inspection.
cluster_id = 23  # cluster to inspect; change here only (should be an id present in `clusters`)
cluster_mask = clusters == cluster_id  # computed once, shared by data and labels

data_cluster = patient_data_list_final[cluster_mask]
labels_cluster = np.asarray(labels_final)[cluster_mask]

# Ratio of label-0 to label-1 patients inside the cluster.
n_zero = int(np.sum(labels_cluster == 0))
n_one = int(np.sum(labels_cluster == 1))
if n_one:
    prop = n_zero / n_one
else:
    # Same values the NumPy division gives (inf, or nan for 0/0), without the RuntimeWarning.
    prop = np.inf if n_zero else np.nan

In [None]:
# Number of rows of patient_data_list_final that fall in the selected cluster.
len(data_cluster)

In [None]:
# Project only the selected cluster's embeddings to 2-D with t-SNE (fixed seed).
tsne = TSNE(perplexity=4, random_state=42, n_components=2)
reduced_data = tsne.fit_transform(data_cluster)

# Scatter the two t-SNE components, coloured by each patient's label.
plt.figure(figsize=(8, 6))
plt.scatter(*reduced_data.T, c=labels_cluster, cmap='viridis', marker='o', edgecolors='k')
plt.gca().set(
    title='t-SNE Visualization',
    xlabel='t-SNE Component 1',
    ylabel='t-SNE Component 2',
)
plt.show()

In [None]:
# Plot the DBSCAN result: first two embedding dimensions of every patient,
# coloured by the cluster id DBSCAN assigned (noise points carry the label -1).
#
# This cell used to read `data` and `labels`. In a top-to-bottom run those are
# the raw inputs returned by dataT.get_tree_data and the SNP labels, not the
# clustering output, so the figure did not match its title unless the synthetic
# demo cell further down had been executed first.
plt.scatter(
    patient_data_list_final[:, 0],
    patient_data_list_final[:, 1],
    c=clusters,
    cmap='viridis',
    marker='o',
    edgecolors='k',
)
plt.title('DBSCAN Clustering')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs

# Toy example: DBSCAN on three synthetic Gaussian blobs.
# NOTE(review): this cell rebinds `data`, `labels`, `eps`, `min_samples` and
# `dbscan`, which earlier cells use for the real patient data; re-running those
# cells afterwards without a restart will pick up the synthetic values.
data, _ = make_blobs(random_state=42, centers=3, n_samples=300)

eps = 0.5  # neighbourhood radius
min_samples = 5  # points required in a neighbourhood to form a core point
dbscan = DBSCAN(min_samples=min_samples, eps=eps)
# fit() returns the estimator; labels_ is what fit_predict() would return.
labels = dbscan.fit(data).labels_



In [None]:
# Display `data`; in a top-to-bottom run this is the synthetic blob array
# created in the previous cell, not the patient data loaded earlier.
data