In [1]:
!pip install pandas numpy scikit-learn scipy seaborn matplotlib phenograph

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting phenograph
  Downloading PhenoGraph-1.5.7-py3-none-any.whl (159 kB)
[K     |████████████████████████████████| 159 kB 1.7 MB/s eta 0:00:01
Collecting leidenalg>=0.8.2
  Downloading leidenalg-0.10.2-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.0 MB)
[K     |████████████████████████████████| 2.0 MB 10.7 MB/s eta 0:00:01
[?25hCollecting igraph<0.12,>=0.10.0
  Downloading igraph-0.11.8-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 39.3 MB/s eta 0:00:01
[?25hCollecting texttable>=1.6.2
  Downloading texttable-1.7.0-py2.py3-none-any.whl (10 kB)
Installing collected packages: texttable, igraph, leidenalg, phenograph
Successfully installed igraph-0.11.8 leidenalg-0.10.2 phenograph-1.5.7 texttable-1.7.0
You should consider upgrading via the '/usr/bin/python -m pip install --upgrade pip' command.[0m


# ACDC on the Bodenmiller dataset

Key infos on the dataset:
* Total samples: 16
* Paired replicates: 8 pairs (stimulated and reference conditions for each patient)
* 24 markers in total. Markers are classified as type markers (cell identity) or state markers (activation or signaling state).


Input to ACDC:

* FCS data: Flow or mass cytometry data files (e.g., PBMC8_30min_patient1_BCR-XL.fcs)
* Cell type-marker table: Specifies which markers define each cell type
* Metadata (to link samples, conditions, and patients)


Output from ACDC:

* Classified cell populations: Each cell is annotated with a cell type.
* Unknown populations: Cells that do not fit predefined markers.
* Visualizations such as:
1) UMAP or t-SNE plots of cell populations.
2) Heatmaps of marker expressions.
3) Comparison of cell frequencies between conditions.

In [11]:
import pandas as pd
import numpy as np 
from collections import Counter
import seaborn as sns
import matplotlib.pyplot as plt
import phenograph
from sklearn.manifold import TSNE
import sys
sys.path.append('/dss/dsshome1/0F/di93quv/Systems_biomedicine/acdc/')
from ACDC.cell_type_annotation import *
import ACDC
import matplotlib as mpl
import matplotlib.patches as mpatches
%matplotlib inline 
import umap.umap_ as umap
from matplotlib import cm

In [2]:
def load_data(data_path, panel_data_path):
    """Load the main dataset and the panel data."""
    # Load panel data
    panel_data = pd.DataFrame({
        "fcs_colname": ["CD3(110:114)Dd", "CD45(In115)Dd", "pNFkB(Nd142)Dd", "pp38(Nd144)Dd", "CD4(Nd145)Dd",
                        "CD20(Sm147)Dd", "CD33(Nd148)Dd", "pStat5(Nd150)Dd", "CD123(Eu151)Dd", "pAkt(Sm152)Dd",
                        "pStat1(Eu153)Dd", "pSHP2(Sm154)Dd", "pZap70(Gd156)Dd", "pStat3(Gd158)Dd", "CD14(Gd160)Dd",
                        "pSlp76(Dy164)Dd", "pBtk(Er166)Dd", "pPlcg2(Er167)Dd", "pErk(Er168)Dd", "pLat(Er170)Dd",
                        "IgM(Yb171)Dd", "pS6(Yb172)Dd", "HLA-DR(Yb174)Dd", "CD7(Yb176)Dd"],
        "antigen": ["CD3", "CD45", "pNFkB", "pp38", "CD4", "CD20", "CD33", "pStat5", "CD123", "pAkt",
                    "pStat1", "pSHP2", "pZap70", "pStat3", "CD14", "pSlp76", "pBtk", "pPlcg2", "pErk",
                    "pLat", "IgM", "pS6", "HLA-DR", "CD7"],
        "marker_class": ["type", "type", "state", "state", "type", "type", "type", "state", "type", "state",
                         "state", "state", "state", "state", "type", "state", "state", "state", "state",
                         "state", "type", "state", "type", "type"]
    })

    # Load main dataset
    df = pd.read_csv(data_path, sep=',')
    fcs_to_antigen = dict(zip(panel_data["fcs_colname"], panel_data["antigen"]))

    # Rename columns
    df = df.rename(columns=fcs_to_antigen)
    return df, panel_data

In [3]:
def preprocess_data(df, desired_antigens, available_channels):
    """Subset and preprocess data."""
    # Subset columns
    subset_columns = [col for col in desired_antigens if col in df.columns]
    df = df[subset_columns]

    # ArcSinh transformation
    df[available_channels] = np.arcsinh((df[available_channels].values - 1.0) / 5.0)
    return df

In [4]:
def compute_cell_type_scores(df, table, threshold):
    """Compute cell type scores."""
    # Extract features
    X0 = df.to_numpy()

    # Compute scores
    mk_model = compute_marker_model(df, table, 0.0)
    score0 = get_score_mat(X0, [], table, [], mk_model)
    score0 = np.concatenate([score0, 1.0 - score0.max(axis=1)[:, np.newaxis]], axis=1)

    # Compute unique index for clusters
    ct_index0 = get_unique_index(X0, score0, table, threshold)
    return X0, score0, ct_index0

In [5]:
def plot_heatmap(feature_matrix, table):
    """Plot heatmap of average marker expression."""
    sns.set()
    fig = plt.figure(figsize=(6, 4))
    feature_mat_df = pd.DataFrame(feature_matrix, columns=table.columns, index=table.index)
    ax = sns.heatmap(feature_mat_df, vmin=0.0, vmax=8.5, cmap="YlGnBu")
    plt.xticks(rotation=90)
    plt.show()

In [6]:
def plot_dim_reduction(X_plot, y_plot, idx2ct, landmark_label, method='tsne'):
    """Plot t-SNE or UMAP projections."""
    if method == 'tsne':
        reducer = TSNE(n_components=2, random_state=0)
    elif method == 'umap':
        reducer = umap.UMAP(n_components=2, random_state=0)
    else:
        raise ValueError("Invalid method. Choose 'tsne' or 'umap'.")

    Xre = reducer.fit_transform(X_plot)

    # Plot projections
    cmap = cm.Accent
    fig = plt.figure(figsize=(16, 12))

    for idx, key in enumerate(idx2ct):
        indices = np.where(y_plot == key)[0]
        plt.plot(Xre[indices, 0], Xre[indices, 1], '.', color=cmap(idx / len(idx2ct)), alpha=0.6, label=key)

    # Plot landmark points
    for idx, key in enumerate(landmark_label):
        ax, ay = Xre[idx, 0], Xre[idx, 1]
        plt.plot(ax, ay, 'o', color=cmap(idx2ct.index(key) / len(idx2ct)), alpha=1, markersize=10, markeredgewidth=0.5)

    # Add legend
    plt.legend(handles=[mpatches.Patch(color=cmap(idx / len(idx2ct)), label=key) for idx, key in enumerate(idx2ct)],
               bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., prop={'size': 16})
    plt.xticks([])
    plt.yticks([])
    plt.title(f"{method.upper()} Projection of Data", fontsize=20)
    plt.show()

In [7]:
def plot_histogram(df, marker):
    """Plot histogram of marker expression."""
    sns.histplot(df[marker], kde=True, bins=30, color='blue')
    plt.title(f"Histogram of {marker} Expression")
    plt.xlabel(marker)
    plt.ylabel("Frequency")
    plt.show()

In [8]:
def plot_cell_type_distribution(cell_type_counts):
    """Plot cell type distribution as a pie chart."""
    plt.figure(figsize=(8, 8))
    plt.pie(cell_type_counts.values(), labels=cell_type_counts.keys(), autopct='%1.1f%%', startangle=140, colors=cm.Accent.colors)
    plt.title("Cell Type Distribution")
    plt.show()

In [12]:
def main():
    # Paths
    data_path = '/dss/dsshome1/0F/di93quv/Systems_biomedicine/acdc/data/CyToF_data_Bodenmiller/csv_files/PBMC8_30min_patient1_BCR-XL.csv'
    panel_data_path = None  # Not used here

    # Load and preprocess data
    df, panel_data = load_data(data_path, panel_data_path)
    desired_antigens = ['LD', 'CD19', 'CD161', 'CD4', 'CD8', 'HLA-DR', 'CD14', 'CD3', 'CD11c', 'CD69', 'CD56', 'CD16', 'VA72', 'GDTCR']
    available_channels = ['CD4', 'HLA-DR', 'CD14', 'CD3']
    df = preprocess_data(df, desired_antigens, available_channels)

    # Define cell type-marker table
    table = pd.read_csv("../data/CyToF_data_Bodenmiller/ACDC_compatible_classification_matrix.csv").set_index('Cell Type')

    # Compute scores
    threshold = 0.5
    X0, score0, ct_index0 = compute_cell_type_scores(df, table, threshold)

    # Plotting
    plot_histogram(df, 'CD4')  # Example marker
    cell_type_counts = {'Type1': 50, 'Type2': 100}  # Replace with actual counts
    plot_cell_type_distribution(cell_type_counts)
    plot_heatmap(score0, table)
    plot_dim_reduction(X0, [], list(table.index), [], method='umap')

In [13]:
if __name__ == "__main__":
    main()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[available_channels] = np.arcsinh((df[available_channels].values - 1.0) / 5.0)


KeyError: 'LD'