In [1]:
import matplotlib as mpl
import pandas as pd
import scanpy as sc
import seaborn as sns
import pathlib
%matplotlib inline
from matplotlib import pyplot as plt
# Keyword arguments for figure-saving calls (300 dpi, tight bounding box,
# transparent background). Not referenced in the visible cells of this notebook.
savefig_args = {
    "dpi": 300,
    "bbox_inches": "tight",
    "pad_inches": 0.1,
    "transparent": True,
}

# Set the matplotlib "savefig" rc group to 300 dpi.
mpl.rc("savefig", dpi=300)

# Suffix and file extensions, presumably for naming saved figures -- TODO confirm;
# neither is referenced in the visible cells.
output_suffix = ""
output_formats = [".png", ".pdf"]

# scanpy figure defaults.
# NOTE(review): dpi_save=150 here differs from the savefig dpi of 300 configured
# above -- confirm which value is meant to apply to saved figures.
sc.set_figure_params(dpi_save=150, frameon=False,
 vector_friendly=True, fontsize=6, figsize=(3,3), format='pdf', transparent=True)

# Limit how much of a DataFrame pandas prints.
pd.set_option("display.max_rows", 50)
pd.set_option("display.max_columns", 20)
pd.set_option("display.width", 100)
# Enable the autoreload extension (mode 2).
%load_ext autoreload
%autoreload 2
# Execute the shared helper script. Names used later but not defined in this
# notebook (e.g. IGH_switched) presumably come from it -- verify.
%run ../helper.py
sns.set_style("ticks")
# Project matplotlib style sheet; applied after the settings above, so the order
# of these calls matters if the style sheet touches the same rc parameters.
plt.style.use('../bursa.mplstyle')

## Load the Data

In [2]:
# configuration:

# --- input / output paths ---
# GEX AnnData file read with sc.read_h5ad
h5ad = '../../../data/bcells.h5ad.gz'
# per-assembly VDJ table read with pd.read_table
vdj_infile = '../../../data/integrated_cell_calls_ambient_annotated.tsv.gz'
# where the sharing labels are written to, and later read back from
sharing_outfile = "../annotate/outputs/sharing_labels_gex.tsv.gz"

# --- switches ---
# keep only donor TBd6 when True
restrict_to_TBd6 = True
# Flag to control the sharing computation, takes a while only needs done once
compute_sharing = True

# column that identifies a sequence when grouping for sharing
seq_identifier = 'vdj_sequence'

# Dictionary to map tissue_combo to DataFrame columns.
# Every column is named "shared_<tissue_combo>"; add more combos to the tuple as needed.
tissue_combo_to_column = {
    tissue_combo: f"shared_{tissue_combo}"
    for tissue_combo in ('LN_LN', 'LN_SP', 'LN_PB', 'BM_PB', 'SP_PB', 'SP_BM', 'LN_BM')
}

In [3]:
# Open the GEX object in backed mode ('r'); it is pulled into memory with
# to_memory() in the next cell, after the optional donor subset.
adata = sc.read_h5ad(h5ad, backed="r")


In [4]:
# subset to donor 6, the only one with multiple lymph nodes
if restrict_to_TBd6:
    adata = adata[adata.obs.donor == 'TBd6']
# bring the (possibly subsetted) object into memory; it was opened with backed='r'
adata = adata.to_memory()
# map whether the IGHC is switched
# IGH_switched is not defined in this notebook; presumably it comes from
# ../helper.py (run in the setup cell) and returns a mapping for c_call -- verify
adata.obs.loc[:,"switched"] = adata.obs.c_call.map(IGH_switched())
# remove GEX profiles without VDJ sequences
# NOTE(review): this compares against the string 'nan', so it assumes missing
# sequences are stored as text rather than as real NaN -- TODO confirm
adata = adata[adata.obs.vdj_sequence != 'nan']
# use only a subset of the data that is relevant for merge
# ('cb' and 'vdj_sequence' are the keys of the merge performed in a later cell)
adata.obs = adata.obs[['cb', 'vdj_sequence', 'celltypist', 'switched']]


In [5]:
# Function to check if two specific locations are shared within a group
def shared_locations(group, subanatomical, loc1, loc2):
    """Report whether both `loc1` and `loc2` occur among a group's locations.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of one group (here: all rows carrying the same sequence identifier).
    subanatomical : bool
        If equal to False, locations are read from the 'tissue' column;
        otherwise from the 'subanatomical_location' column.
    loc1, loc2 : str
        The two location labels that must both be present.

    Returns
    -------
    bool or float
        True if both labels are present, False otherwise, NaN for an empty group.
    """
    if group.empty:
        # float("nan") is the same value as numpy's nan, but does not depend on
        # `np` being in scope (numpy is never imported in this notebook).
        return float("nan")
    # `== False` is kept on purpose so that only a value equal to False selects 'tissue'.
    column = 'tissue' if subanatomical == False else 'subanatomical_location'
    locations = group[column].values
    return (loc1 in locations) and (loc2 in locations)

if compute_sharing:
    # Load only the VDJ columns needed for the sharing annotation
    df = pd.read_table(vdj_infile, index_col=0, usecols=["vdj_sequence", "lineage_id", 'v_mismatch', 'donor', 'probable_hq_single_b_cell', 'subanatomical_location', 'tissue', 'locus', 'c_call', 'sample_uid', 'cb'])

    # Drop rows with missing 'locus'
    df = df.dropna(subset=['locus'])

    # Subset to LNs
    df_LN = df[df.tissue == 'LN']

    # Print value counts for subanatomical locations within LNs
    print(df_LN.subanatomical_location.value_counts())

    # Number of distinct lymph-node sites each sequence was observed in
    # (computed once and reused for both flags below)
    n_LN_sites = df_LN.groupby(seq_identifier)['subanatomical_location'].nunique()

    # Identify sequences shared between LNs
    shared_LNs = n_LN_sites > 1
    df['shared_LN_LN'] = df[seq_identifier].map(shared_LNs)

    # Identify sequences found in a single LN
    found_single_LN = n_LN_sites == 1
    df['found_in_single_LN'] = df[seq_identifier].map(found_single_LN)

    # Dictionary to hold shared categories: output column -> the two tissues
    # that must both contain the sequence.
    shared_categories = {
        'shared_LN_SP': {'subanatomical': False, 'loc1': 'LN', 'loc2': 'SP'},
        'shared_SP_PB': {'subanatomical': False, 'loc1': 'SP', 'loc2': 'PB'},
        'shared_BM_PB': {'subanatomical': False, 'loc1': 'BM', 'loc2': 'PB'},
        'shared_LN_PB': {'subanatomical': False, 'loc1': 'LN', 'loc2': 'PB'},
        'shared_LN_BM': {'subanatomical': False, 'loc1': 'LN', 'loc2': 'BM'},
        # Fixed: loc1 was 'LN', which made this column a copy of shared_LN_BM.
        'shared_SP_BM': {'subanatomical': False, 'loc1': 'SP', 'loc2': 'BM'},
    }
    # Iterate through the dictionary to compute shared categories
    for category, params in shared_categories.items():
        # One boolean per sequence: are both locations present in its group?
        shared_by_sequence = df.groupby(seq_identifier).apply(shared_locations, params['subanatomical'], params['loc1'], params['loc2'])
        # Map the per-sequence result back onto every row of the DataFrame
        df[category] = df[seq_identifier].map(shared_by_sequence)

    # Write the DataFrame to a file; reset_index() turns the current index into
    # a regular column so it is kept in the output.
    df.reset_index().to_csv(sharing_outfile, sep="\t")
    
# load output
shared = pd.read_table(sharing_outfile, index_col = 0)
print(shared.shape, "antibody assemblies to analyze for sharing")
# keep only rows flagged as probable high-quality single B cells
shared = shared[shared.probable_hq_single_b_cell == True]
print(shared.shape, "number of hq single B cells transcriptomes to detect GEX differences amongst shared cells")

# Left-merge the sharing labels onto adata.obs by (cb, vdj_sequence).
# Columns present on both sides get the suffix "_left" on the adata.obs copy,
# while the copy from `shared` keeps its plain name.
merged_df = pd.merge(adata.obs, shared, left_on=['cb', 'vdj_sequence'], right_on=['cb', 'vdj_sequence'], suffixes=('_left', ''), how='left')

# Drop the duplicated adata.obs-side columns. Match the suffix only, so that a
# column which merely contains "_left" somewhere in its name is not removed.
cols_to_remove = [col for col in merged_df.columns if col.endswith('_left')]
merged_df = merged_df.drop(columns=cols_to_remove)

# A left merge adds rows when a key matches more than one row in `shared`;
# the index assignment below is only valid if the row count is unchanged.
if len(merged_df) != len(adata.obs.index):
    raise ValueError(
        f"merge changed the number of rows ({len(adata.obs.index)} -> {len(merged_df)}); "
        "duplicate (cb, vdj_sequence) keys in the sharing table?"
    )
merged_df.index = adata.obs.index
adata.obs = merged_df
# keep only cells that received sharing annotations (sample_uid comes from `shared`)
adata = adata[~adata.obs.sample_uid.isna()]

  df = pd.read_table(vdj_infile, index_col=0, usecols=["vdj_sequence", "lineage_id", 'v_mismatch', 'donor', 'probable_hq_single_b_cell', 'subanatomical_location', 'tissue', 'locus', 'c_call', 'sample_uid', 'cb'])


subanatomical_location
SDLN1    146273
SDLN3     56045
SDLN2     46962
MELN1     33967
MELN      26779
Name: count, dtype: int64


  shared = pd.read_table(sharing_outfile, index_col = 0)


(1160737, 19) antibody assemblies to analyze for sharing
(202489, 19) number of hq single B cells transcriptomes to detect GEX differences amongst shared cells


In [None]:
# Function to check if two specific locations are shared within a group
# NOTE: this redefinition replaces the shared_locations defined in an earlier
# cell, so it keeps the same empty-group behaviour as that definition.
def shared_locations(group, subanatomical, loc1, loc2):
    """Report whether both `loc1` and `loc2` occur among a group's locations.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of one group (here: all rows carrying the same sequence identifier).
    subanatomical : bool
        If equal to False, locations are read from the 'tissue' column;
        otherwise from the 'subanatomical_location' column.
    loc1, loc2 : str
        The two location labels that must both be present.

    Returns
    -------
    bool or float
        True if both labels are present, False otherwise, NaN for an empty group.
    """
    if group.empty:
        # Same guard as the earlier definition; float("nan") avoids needing `np`.
        return float("nan")
    if subanatomical == False:
        return (loc1 in group['tissue'].values) and (loc2 in group['tissue'].values)
    else:
        return (loc1 in group['subanatomical_location'].values) and (loc2 in group['subanatomical_location'].values)

if compute_sharing:
    # Load only the VDJ columns needed for the sharing annotation
    df = pd.read_table(vdj_infile, index_col=0, usecols=["vdj_sequence", "lineage_id", 'v_mismatch', 'donor', 'probable_hq_single_b_cell', 'subanatomical_location', 'tissue', 'locus', 'c_call', 'sample_uid', 'cb'])

    # Drop rows with missing 'locus'
    df = df.dropna(subset=['locus'])

    # Subset to LNs
    df_LN = df[df.tissue == 'LN']

    # Print value counts for subanatomical locations within LNs
    print(df_LN.subanatomical_location.value_counts())

    # Number of distinct lymph-node sites each sequence was observed in
    # (computed once and reused for both flags below)
    n_LN_sites = df_LN.groupby(seq_identifier)['subanatomical_location'].nunique()

    # Identify sequences shared between LNs
    shared_LNs = n_LN_sites > 1
    df['shared_LN_LN'] = df[seq_identifier].map(shared_LNs)

    # Identify sequences found in a single LN
    found_single_LN = n_LN_sites == 1
    df['found_in_single_LN'] = df[seq_identifier].map(found_single_LN)

    # Dictionary to hold shared categories: output column -> the two tissues
    # that must both contain the sequence.
    shared_categories = {
        'shared_LN_SP': {'subanatomical': False, 'loc1': 'LN', 'loc2': 'SP'},
        'shared_SP_PB': {'subanatomical': False, 'loc1': 'SP', 'loc2': 'PB'},
        'shared_BM_PB': {'subanatomical': False, 'loc1': 'BM', 'loc2': 'PB'},
        'shared_LN_PB': {'subanatomical': False, 'loc1': 'LN', 'loc2': 'PB'},
        'shared_LN_BM': {'subanatomical': False, 'loc1': 'LN', 'loc2': 'BM'},
        # Fixed: loc1 was 'LN', which made this column a copy of shared_LN_BM.
        'shared_SP_BM': {'subanatomical': False, 'loc1': 'SP', 'loc2': 'BM'},
    }
    # Iterate through the dictionary to compute shared categories
    for category, params in shared_categories.items():
        # One boolean per sequence: are both locations present in its group?
        shared_by_sequence = df.groupby(seq_identifier).apply(shared_locations, params['subanatomical'], params['loc1'], params['loc2'])
        # Map the per-sequence result back onto every row of the DataFrame
        df[category] = df[seq_identifier].map(shared_by_sequence)
    # Write the DataFrame to a file; reset_index() turns the current index into
    # a regular column so it is kept in the output.
    df.reset_index().to_csv(sharing_outfile, sep="\t")
# load output
shared = pd.read_table(sharing_outfile, index_col = 0)
print(shared.shape, "antibody assemblies to analyze for sharing")
# keep only rows flagged as probable high-quality single B cells
shared = shared[shared.probable_hq_single_b_cell == True]
print(shared.shape, "number of hq single B cells transcriptomes to detect GEX differences amongst shared cells")

# Left-merge the sharing labels onto adata.obs by (cb, vdj_sequence).
# Columns present on both sides get the suffix "_left" on the adata.obs copy,
# while the copy from `shared` keeps its plain name.
merged_df = pd.merge(adata.obs, shared, left_on=['cb', 'vdj_sequence'], right_on=['cb', 'vdj_sequence'], suffixes=('_left', ''), how='left')

# Drop the duplicated adata.obs-side columns. Match the suffix only, so that a
# column which merely contains "_left" somewhere in its name is not removed.
cols_to_remove = [col for col in merged_df.columns if col.endswith('_left')]
merged_df = merged_df.drop(columns=cols_to_remove)

# A left merge adds rows when a key matches more than one row in `shared`;
# the index assignment below is only valid if the row count is unchanged.
if len(merged_df) != len(adata.obs.index):
    raise ValueError(
        f"merge changed the number of rows ({len(adata.obs.index)} -> {len(merged_df)}); "
        "duplicate (cb, vdj_sequence) keys in the sharing table?"
    )
merged_df.index = adata.obs.index
adata.obs = merged_df
# keep only cells that received sharing annotations (sample_uid comes from `shared`)
adata = adata[~adata.obs.sample_uid.isna()]

  df = pd.read_table(vdj_infile, index_col=0, usecols=["vdj_sequence", "lineage_id", 'v_mismatch', 'donor', 'probable_hq_single_b_cell', 'subanatomical_location', 'tissue', 'locus', 'c_call', 'sample_uid', 'cb'])


subanatomical_location
SDLN1    146273
SDLN3     56045
SDLN2     46962
MELN1     33967
MELN      26779
Name: count, dtype: int64


  shared = pd.read_table(sharing_outfile, index_col = 0)


(1160737, 19) antibody assemblies to analyze for sharing
(202489, 19) number of hq single B cells transcriptomes to detect GEX differences amongst shared cells


# Function to check if two specific locations are shared within a group
# Walks the grouped object directly instead of going through apply.
def shared_locations_optimized(grouped_df, subanatomical_column, loc1, loc2):
    """Map each group name to whether both `loc1` and `loc2` occur in it.

    Parameters
    ----------
    grouped_df : iterable of (name, DataFrame) pairs
        Typically the result of ``DataFrame.groupby(...)``.
    subanatomical_column : str
        Name of the column holding the location labels to inspect.
    loc1, loc2 : str
        The two location labels that must both be present.

    Returns
    -------
    dict
        ``{group name: bool}``; True when both labels were seen in the group.
    """
    shared_dict = {}
    for group_name, rows in grouped_df:
        # unique location labels observed for this group
        seen_locations = set(rows[subanatomical_column])
        shared_dict[group_name] = loc1 in seen_locations and loc2 in seen_locations
    return shared_dict

# Dictionary to hold shared categories: output column -> the two tissues that
# must both contain the sequence.
shared_categories = {
    'shared_LN_SP': {'subanatomical': False, 'loc1': 'LN', 'loc2': 'SP'},
    'shared_SP_PB': {'subanatomical': False, 'loc1': 'SP', 'loc2': 'PB'},
    'shared_BM_PB': {'subanatomical': False, 'loc1': 'BM', 'loc2': 'PB'},
    'shared_LN_PB': {'subanatomical': False, 'loc1': 'LN', 'loc2': 'PB'},
    'shared_LN_BM': {'subanatomical': False, 'loc1': 'LN', 'loc2': 'BM'},
    # Fixed: loc1 was 'LN', which made this column a copy of shared_LN_BM.
    'shared_SP_BM': {'subanatomical': False, 'loc1': 'SP', 'loc2': 'BM'},
}

# Iterate through the dictionary to compute shared categories.
# NOTE: `df` is only assigned inside the `if compute_sharing:` block of an
# earlier cell, so this loop requires compute_sharing to be True.
for category, params in shared_categories.items():
    # pick the column that holds the location labels for this category
    subanatomical_column = 'subanatomical_location' if params['subanatomical'] else 'tissue'
    grouped_df = df.groupby(seq_identifier)
    shared_dict = shared_locations_optimized(grouped_df, subanatomical_column, params['loc1'], params['loc2'])

    # Map the shared information back to the DataFrame
    df[category] = df[seq_identifier].map(shared_dict)


In [7]:
# Cast every non-numeric obs column to str before writing
obs_columns = adata.obs.columns
for column_name in obs_columns:
    if pd.api.types.is_numeric_dtype(adata.obs[column_name]):
        continue
    adata.obs[column_name] = adata.obs[column_name].astype(str)

# Write the annotated object; the file name depends on whether the data were
# restricted to donor TBd6
output_h5ad = "TBd6_sharing.h5ad.gz" if restrict_to_TBd6 else "all_sharing.h5ad.gz"
adata.write_h5ad(output_h5ad, compression="gzip")

  adata.obs[col] = adata.obs[col].astype(str)
