In [None]:
import pandas as pd 
import numpy as np
import readfcs
import scanpy as sc

import os 

from parc import PARC

In [None]:
# Read the pickled event table that the rest of the notebook works on.
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
data = pd.read_pickle(filepath_or_buffer='../cytofdata/uncorrected_batch_1_4.pkl')
data

In [None]:
# Remove every row whose file_id is one of the excluded ids.
excluded_file_ids = ['RT', 'RJ']
data = data.loc[~data['file_id'].isin(excluded_file_ids)]

In [None]:
# file_id values that are handled as donor samples; every other file_id is
# handled as a patient sample below.
donors = ['HDBM_184',
          'HDBM_263',
          'HDBM_446',
          'HDBM_436',
          'HDBM_525',
          'HDBM_532',
          'HDBM_578',
          'HDBM_630',
          'HDBM_663',
          'HDPB_259',
          'HDPB_416',
          'HDPB_424',
          'HDPB_433',
          'HDPB_439',
          'HDPB_518',
          'HDPB_522',
          'HDPB_597',
          'HDPB_625',
          'HDPB_656',
          'HDBM_180'
        ]

is_donor = data['file_id'].isin(donors)

# Explicit copies: the column assignments below must not target slices of
# `data` (that triggers SettingWithCopyWarning and is ambiguous to readers).
data_donors = data[is_donor].copy()
data_patients = data[~is_donor].copy()

# Donor rows carry no time point; their file_id is used as the patient number.
data_donors['test_time'] = None
data_donors['patient_number'] = data_donors['file_id']

# Patient file_ids are split at the first '_' into patient number and time
# point. The split is done on the patient rows themselves (no reliance on index
# alignment with `data`), and n=1 guarantees at most two resulting columns.
data_patients[['patient_number', 'test_time']] = (
    data_patients['file_id'].str.split('_', n=1, expand=True)
)
# Strip the 't' from the time point so it becomes a plain string such as '0'.
data_patients['test_time'] = data_patients['test_time'].str.replace('t', '').astype(str)

# Recombine and drop file_id, which is now encoded in the two new columns.
data = pd.concat([data_patients, data_donors]).drop('file_id', axis=1)

data

In [None]:

# Full marker panel, in the order used throughout the notebook.
internal_and_external = ['89Y_CD45',
                         '111Cd_CD3',
                         '112Cd_CD34',
                         '113Cd_CD123',
                         '114Cd_CD66b',
                         '116Cd_HLA-DR',
                         '141Pr_CD38',
                         '142Nd_cCaspase3',
                         '143Nd_pCRKL Y207',
                         '144Nd_pTyr',
                         '145Nd_CD4',
                         '146Nd_CD49d',
                         '147Sm_CD20',
                         '148Nd_CD16',
                         '149Sm_CD25',
                         '150Nd_pSTAT5 Y694',
                         '151Eu_pSTAT3 S727',
                         '152Sm_CD13',
                         '153Eu_pSTAT1 Y701',
                         '154Sm_CD45RA',
                         '155Gd_CD27',
                         '156Gd_pp38 T180Y182',
                         '157Gd_CD8a',
                         '158Gd_pSTAT3 Y705',
                         '159Tb_pMAPKAPK T334',
                         '160Gd_CD14',
                         '161Dy_CD26',
                         '162Dy_FoxP3',
                         '163Dy_CD56',
                         '164Dy_CD15',
                         '165Ho_pCREB S133',
                         '166Er_MPO',
                         '167Er_IL1-RAP',
                         '168Er_CD117',
                         '169Tm_CD33',
                         '170Er_pSRC Y418',
                         '171Yb_pERK T202Y204',
                         '172Yb_pS6 S235S236',
                         '173Yb_STAT3tot',
                         '174Yb_CD11c',
                         '175Lu_CXCR4',
                         '176Yb_pS6 S240244',
                         '195Pt_mBC2',
                         '209Bi_CD11b']


# Subset of the panel listed as "external" markers.
external = ['89Y_CD45',
            '111Cd_CD3',
            '112Cd_CD34',
            '113Cd_CD123',
            '114Cd_CD66b',
            '116Cd_HLA-DR',
            '141Pr_CD38',
            '145Nd_CD4',
            '146Nd_CD49d',
            '147Sm_CD20',
            '148Nd_CD16',
            '149Sm_CD25',
            '152Sm_CD13',
            '154Sm_CD45RA',
            '155Gd_CD27',
            '157Gd_CD8a',
            '160Gd_CD14',
            '161Dy_CD26',
            '162Dy_FoxP3',
            '163Dy_CD56',
            '164Dy_CD15',
            '166Er_MPO',
            '167Er_IL1-RAP',
            '168Er_CD117',
            '169Tm_CD33',
            '174Yb_CD11c',
            '175Lu_CXCR4',
            '195Pt_mBC2',
            '209Bi_CD11b']

# Markers of the full panel that are not external. Filtering the ordered list
# (instead of converting a set difference to a list) keeps the panel order and
# makes the result identical on every kernel restart.
_external_lookup = set(external)
internal = [marker for marker in internal_and_external if marker not in _external_lookup]

In [None]:
from scipy.spatial.distance import cdist

def assign_clusters(data, centroids):
    """Find the nearest centroid for each row of ``data``.

    Args:
        data: 2-D array of points, one point per row.
        centroids: 2-D array of centroids, one centroid per row, with the same
            number of columns as ``data``.

    Returns:
        1-D integer array with one entry per row of ``data``: the row position
        in ``centroids`` of the centroid with the smallest Euclidean distance.
    """
    return cdist(data, centroids, 'euclidean').argmin(axis=1)


In [None]:
from sklearn.model_selection import train_test_split

# Cache for the clustered events. Delete this file to force PARC to run again.
cluster_file_path = 'clusters/data_clusters_t0_batch_1_4_075.pkl'

if os.path.exists(cluster_file_path):
    print('Cluster found, loading from file')

    data_with_clusters = pd.read_pickle(cluster_file_path)

else:
    print('No saved cluster found, running PARC')

    # Patient events: time point '0' is clustered, time points '1' and '2' are
    # mapped onto those clusters. Rows with any other test_time value are not
    # part of either set.
    cluster_data_patients = data[data['test_time'] == '0']
    mapping_data_patients = data[data['test_time'].isin(['1', '2'])]

    # Donor events (missing test_time): one half is clustered, the other half
    # is mapped. random_state keeps the split reproducible.
    data_donors = data[data['test_time'].isnull()]
    cluster_donor_data, mapping_donor_data = train_test_split(data_donors, train_size=0.5, random_state=42)

    cluster_data = pd.concat([cluster_data_patients, cluster_donor_data])
    mapping_data = pd.concat([mapping_data_patients, mapping_donor_data])

    # Only the external markers are used for clustering and mapping.
    cluster_values_external = cluster_data[external].values
    mapping_values_external = mapping_data[external].values

    parc = PARC(cluster_values_external,
                resolution_parameter=0.75,
                knn = 30,
                small_pop = 100,
                hnsw_param_ef_construction = 2000)

    parc.run_PARC()

    # Force an ndarray so the `== label` masks below are element-wise regardless
    # of the container type PARC uses for its labels.
    cluster_data_labels = np.asarray(parc.labels)

    # Sorted unique labels: row i of `centroids` is the mean of cluster
    # unique_labels[i], so positions and labels can be translated explicitly.
    unique_labels = np.unique(cluster_data_labels)
    centroids = np.array([
        cluster_values_external[cluster_data_labels == label].mean(axis=0)
        for label in unique_labels
    ])

    # assign_clusters returns row positions in `centroids`; convert them back
    # to PARC labels so both data sets share the same cluster ids even when the
    # labels are not exactly 0..k-1.
    nearest_centroid = assign_clusters(mapping_values_external, centroids)
    mapping_data_labels = unique_labels[nearest_centroid]

    cluster_data['cluster'] = cluster_data_labels
    mapping_data['cluster'] = mapping_data_labels

    data_with_clusters = pd.concat([cluster_data, mapping_data])

    # Make sure the cache directory exists so the result of the expensive run
    # is not lost to a failing write.
    os.makedirs(os.path.dirname(cluster_file_path), exist_ok=True)
    data_with_clusters.to_pickle(cluster_file_path)

data_with_clusters

In [None]:
import plotly.express as px

# External marker columns plus the cluster label (this list is used again by
# the annotated-cluster heatmap further down).
columns_heatmap = [*external, 'cluster']

# Median of every external marker within each cluster.
data_external_markers = data_with_clusters[columns_heatmap]
data_heatmap = data_external_markers.groupby(['cluster']).median()

marker_axis = data_heatmap.columns
cluster_axis = data_heatmap.index

fig = px.imshow(
    data_heatmap,
    x=marker_axis,
    y=cluster_axis,
    labels={'x': "Marker", 'y': "Cluster", 'color': "Marker Expression"},
    color_continuous_scale='Turbo',
    aspect='auto',
    width=1200,
    height=900,
)

# Show a tick for every marker and every cluster, with enlarged fonts.
fig.update_layout(
    title="Median Marker Intensity by Cluster",
    title_font_size=24,
    xaxis={
        'tickmode': 'array',
        'tickvals': marker_axis,
        'ticktext': marker_axis,
        'title_font_size': 18,
        'tickfont_size': 16,
    },
    yaxis={
        'tickmode': 'array',
        'tickvals': cluster_axis,
        'ticktext': cluster_axis,
        'title_font_size': 18,
        'tickfont_size': 16,
    },
    coloraxis_colorbar={
        'title': 'Marker Expression',
        'title_font_size': 18,
        'tickfont_size': 16,
    },
)

fig.show()


In [None]:
# Write the per-cluster median table to CSV.
data_heatmap.to_csv(path_or_buf='heatmap_t0_batch_1_4.csv')

In [None]:
# Donor rows are the ones with a missing test_time. Comparing a Series with
# `== None` / `!= None` does not test for missing values in pandas (it yields
# all-False / all-True), so notna()/isna() are used to build the masks.
has_time_point = data_with_clusters['test_time'].notna()

data_patients = data_with_clusters[has_time_point]
data_donors = data_with_clusters[~has_time_point]

data_patients.to_pickle('clusters/data_patients_t0_batch_1_4.pkl')
data_donors.to_pickle('clusters/data_donors_t0_batch_1_4.pkl')

In [None]:
# Cluster id -> annotation name. Several cluster ids share one name, which
# merges those clusters once the mapping is applied.
annotaded_clusters = dict([
    (27, 'PreNeu1'),
    (26, 'pDC'),
    (25, 'PreNeu3'),
    (24, 'NeutrophilsCD25'),
    (23, 'PreNeu2'),
    (22, 'NeutrophilsHLA'),
    (21, 'LinNeg'),
    (20, 'Doublets'),
    (19, 'Neutrophils'),
    (18, 'Bcells'),
    (17, 'HSPCs'),
    (16, 'NKcells'),
    (15, 'Tc'),
    (14, 'Th'),
    (13, 'Monocytes'),
    (12, 'Basophils'),
    (11, 'Neutrophils'),
    (10, 'Eosinophils'),
    (9, 'NeutrophilCD16neg'),
    (8, 'NeutrophilCD16neg'),
    (7, 'NeutrophilCD16neg'),
    (6, 'NeutrophilCD16neg'),
    (5, 'NeutrophilCD16neg'),
    (4, 'NeutrophilCD16neg'),
    (3, 'PreNeu'),
    (2, 'Neutrophils'),
    (1, 'Neutrophils'),
    (0, 'Neutrophils'),
])

In [None]:
# New frame in which the 'cluster' column holds the annotation names from
# annotaded_clusters; data_with_clusters itself is left unchanged.
annotated_cluster_column = data_with_clusters['cluster'].replace(annotaded_clusters)
data_with_annotated_clusters = data_with_clusters.assign(cluster=annotated_cluster_column)

data_with_annotated_clusters.to_pickle('../cytofdata/pool1_with_annotated_clusters')

In [None]:
import plotly.express as px

# Median of every external marker within each annotated cluster.
data_external_markers = data_with_annotated_clusters[columns_heatmap]
data_heatmap = data_external_markers.groupby(['cluster']).median()

marker_axis = data_heatmap.columns
cluster_axis = data_heatmap.index

# Font sizes shared by both axes and the colour bar.
font_sizes = dict(title_font_size=16, tickfont_size=14)

fig = px.imshow(
    data_heatmap,
    x=marker_axis,
    y=cluster_axis,
    labels={'x': "Marker", 'y': "Cluster", 'color': "Marker Expression"},
    color_continuous_scale='Turbo',
    aspect='auto',
    width=1200,
    height=900,
)

# Show a tick for every marker and every annotated cluster.
fig.update_layout(
    title="Median Marker Intensity by Merged Cluster",
    title_font_size=20,
    xaxis=dict(tickmode='array', tickvals=marker_axis, ticktext=marker_axis, **font_sizes),
    yaxis=dict(tickmode='array', tickvals=cluster_axis, ticktext=cluster_axis, **font_sizes),
    coloraxis_colorbar=dict(title='Marker Expr.', **font_sizes),
)

fig.show()
