# Load library

In [None]:
import scanpy as sc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn as skl
import anndata as ann
import random, os
from scipy.stats import pearsonr as pr
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score as f1
from sklearn.metrics import precision_recall_curve as prc
from sklearn.metrics import silhouette_score as sil
from sklearn.metrics import auc
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import precision_score, recall_score, average_precision_score
from sklearn.metrics import silhouette_score
from torch_geometric.nn import TransformerConv
from torch_geometric.data import Data
import psutil
import os, sys
import gc
import scipy.sparse as sp
from harmony import harmonize
from tqdm import tqdm
import h5py

In [None]:
# Use 200 dpi for the scanpy figures drawn in this notebook.
sc.set_figure_params(dpi=200)

# General processing functions

In [None]:
def whats_memory_eater():
    """Print the ten largest objects currently tracked by the garbage collector.

    Sizes come from ``sys.getsizeof`` and are therefore shallow: the memory of
    objects referenced by a container is not included in the container's size.
    Names are looked up among this module's globals; an object that is not
    bound to a global name is reported with ``Name: None``.

    Returns:
        None. One line per object is written to stdout.
    """
    # Reverse map of object id -> variable name from globals
    name_map = {id(obj): name for name, obj in globals().items()}

    # Record only (size, object) here. Building a repr() for every tracked
    # object is far more expensive than measuring it, so previews are produced
    # later and only for the objects that are actually printed.
    sized_objects = []
    for obj in gc.get_objects():
        try:
            sized_objects.append((sys.getsizeof(obj), obj))
        except Exception:
            # Objects whose size cannot be determined are skipped.
            continue

    sized_objects.sort(reverse=True, key=lambda entry: entry[0])

    for size, obj in sized_objects[:10]:
        try:
            preview = repr(obj)[:100]
        except Exception:
            # A failing __repr__ should not hide a large object from the report.
            preview = "<repr failed>"
        name = name_map.get(id(obj), None)
        print(f"Size: {size / 1024**3} GB | Type: {type(obj)} | Name: {name} | Object: {preview}")


In [None]:
def memory_usgae():
    """Run a garbage-collection pass, then print this process's resident set size in GB."""
    gc.collect()
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    usage_gb = rss_bytes / 1024**3  # bytes -> GB
    print(f"Current memory usage: {usage_gb:.2f} GB")

In [None]:
def pca_and_umap(adata):
    """Run PCA (ARPACK solver), build the neighbor graph and compute a UMAP for ``adata``.

    The neighbor graph and UMAP use scanpy's default settings. Nothing is
    returned; the scanpy calls operate directly on the object passed in.
    """
    sc.tl.pca(adata, svd_solver="arpack")
    sc.pp.neighbors(adata)
    sc.tl.umap(adata)

In [None]:
def _find_project_files(base_path, file_prefix=None):
    """Locate the matrix, barcode, feature and (optional) metadata files in ``base_path``.

    Without ``file_prefix`` the first directory entry matching each pattern is
    used. With ``file_prefix`` only entries starting with that prefix are
    considered and the last matching entry wins.

    Returns:
        dict with keys 'mtx', 'barcodes', 'features', 'metadata'; 'metadata' is
        None when no metadata file is present.

    Raises:
        FileNotFoundError: if the matrix, barcode or feature file is missing.
    """
    files = os.listdir(base_path)
    found = {'mtx': None, 'barcodes': None, 'features': None, 'metadata': None}

    if file_prefix is None:
        def first_match(predicate):
            return next((os.path.join(base_path, f) for f in files if predicate(f)), None)

        found['mtx'] = first_match(lambda f: f.endswith('.mtx'))
        found['barcodes'] = first_match(lambda f: 'barcode' in f)
        # Feature tables are named either "*feature*" or "*genes*".
        found['features'] = (first_match(lambda f: 'feature' in f)
                             or first_match(lambda f: 'genes' in f))
        found['metadata'] = first_match(lambda f: 'meta' in f)
    else:
        for f in files:
            if not f.startswith(file_prefix):
                continue
            path = os.path.join(base_path, f)
            if f.endswith('mtx'):
                found['mtx'] = path
            elif 'barcode' in f:
                found['barcodes'] = path
            elif 'feature' in f or 'genes' in f:
                found['features'] = path
            elif 'meta' in f:
                found['metadata'] = path

    missing = [kind for kind in ('mtx', 'barcodes', 'features') if found[kind] is None]
    if missing:
        raise FileNotFoundError(
            f"Could not find {', '.join(missing)} file(s) in {base_path!r}"
            + (f" with prefix {file_prefix!r}" if file_prefix is not None else "")
        )
    return found


def _read_barcode_or_feature_table(path):
    """Read a barcode/feature table: headerless tab-separated for *tsv, pandas defaults otherwise."""
    if path.endswith('tsv'):
        return pd.read_csv(path, sep='\t', header=None)
    return pd.read_csv(path)


def _names_from_table(table):
    """Return the second column as strings when the table has several columns, else the first."""
    column = 1 if table.shape[1] > 1 else 0
    return table.iloc[:, column].astype(str).values


def load_and_preprocess_project(base_path, Project_ID, metadata_idx_key='Cell', Primary_or_Metastatic = 'Primary', further_pre = False, file_prefix= None):
    """
    Load and preprocess a single scRNA-seq project with standard filtering and UMAP.

    Assumes the base_path contains:
        - One .mtx file (count matrix, genes x cells; it is transposed on load)
        - One barcodes file (name contains 'barcode')
        - One features file (name contains 'feature' or 'genes')
        - Optionally one metadata file (name contains 'meta', .csv or .tsv)

    Parameters:
    - base_path: directory holding the files listed above
    - Project_ID: value written to ``adata.obs['Project_ID']``
    - metadata_idx_key: metadata column holding the cell identifiers used to
      join the metadata onto ``adata.obs``
    - Primary_or_Metastatic: value written to ``adata.obs['Primary_or_Metastatic']``
    - further_pre: if True, also normalize, log1p, scale and compute
      PCA / neighbors / UMAP
    - file_prefix: if given, only files whose name starts with this prefix are used

    Returns:
    - AnnData with cells filtered to 200 <= n_genes_by_counts <= 5000 and
      pct_counts_mt <= 20. ``.raw`` holds a copy of the filtered matrix taken
      before any normalization.

    Raises:
    - FileNotFoundError: if the matrix, barcode or feature file cannot be found.
    """
    paths = _find_project_files(base_path, file_prefix)
    mtx_file = paths['mtx']
    barcodes_file = paths['barcodes']
    features_file = paths['features']
    metadata_file = paths['metadata']
    for path in (mtx_file, barcodes_file, features_file, metadata_file):
        if path is not None:
            print(path)
    print(f"Loading: {mtx_file}")

    # Load matrix
    adata = sc.read_mtx(mtx_file)
    adata = adata.transpose()  # Important: make cells as rows, genes as columns

    # Load barcodes and features
    barcodes = _read_barcode_or_feature_table(barcodes_file)
    display(barcodes)

    genes = _read_barcode_or_feature_table(features_file)
    display(genes)

    # Assign barcodes and gene names (convert to string)
    adata.obs_names = _names_from_table(barcodes)
    adata.var_names = _names_from_table(genes)
    display(adata.to_df())

    # Load and merge metadata. This step is best-effort: the project is still
    # returned when metadata is absent or unusable, but the reason is reported
    # instead of being swallowed silently.
    if metadata_file is None:
        print(f"Warning: no metadata file found in {base_path}; continuing without cell annotations.")
    elif not metadata_file.endswith(('tsv', 'csv')):
        print(f"Warning: unsupported metadata file type, skipped: {metadata_file}")
    else:
        try:
            separator = '\t' if metadata_file.endswith('tsv') else ','
            metadata = pd.read_csv(metadata_file, sep=separator)
            metadata.index = metadata[metadata_idx_key]
            adata.obs = adata.obs.join(metadata, how='left')
        except (KeyError, ValueError, OSError) as err:
            print(f"Warning: could not attach metadata from {metadata_file}: {err!r}")

    # ======== Standard preprocessing ========

    # Calculate QC metrics. The mitochondrial prefix test is case-insensitive,
    # matching the other reprocessing functions in this file.
    adata.var['mt'] = adata.var_names.str.upper().str.startswith('MT-')
    sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)

    # Standard cell filtering
    adata = adata[(adata.obs['n_genes_by_counts'] >= 200) &
                  (adata.obs['n_genes_by_counts'] <= 5000) &
                  (adata.obs['pct_counts_mt'] <= 20)].copy()

    adata.obs['Project_ID'] = Project_ID
    adata.obs['Primary_or_Metastatic'] = Primary_or_Metastatic

    # Keep the filtered, un-normalized matrix available through .raw
    adata.raw = adata.copy()

    if further_pre:
        # Normalize and log transform
        sc.pp.normalize_total(adata, target_sum=1e4)
        sc.pp.log1p(adata)

        # Scale (no highly-variable-gene selection is applied)
        sc.pp.scale(adata, max_value=10)

        # PCA
        sc.tl.pca(adata, svd_solver='arpack')

        # Neighbors
        sc.pp.neighbors(adata, n_neighbors=15, n_pcs=40)

        # UMAP
        sc.tl.umap(adata)

    print(f"Finished processing {Project_ID}. Shape: {adata.shape}")

    return adata


In [None]:
def filter_and_recompute(adata, celltype_col, celltypes_to_keep, further_pre = False):
    """
    Filters an AnnData object to keep only specified cell types and,
    optionally, recalculates PCA, neighbors, and UMAP on the result.

    Parameters:
    - adata: AnnData object
    - celltype_col: str, the column in adata.obs containing cell type annotations
    - celltypes_to_keep: list of str, the cell types you want to keep
    - further_pre: bool, if True recompute PCA, neighbors and UMAP on the
      filtered object (assumes the data is already normalized and scaled);
      if False (default) the filtered copy is returned without recomputation

    Returns:
    - a filtered copy of the AnnData object (the input is not modified)
    """
    # Step 1: Filter cells
    print(f"Original shape: {adata.shape}")
    keep_mask = adata.obs[celltype_col].isin(set(celltypes_to_keep))
    adata_filtered = adata[keep_mask].copy()
    print(f"Filtered shape: {adata_filtered.shape}")

    # Step 2: Recalculate PCA and UMAP, only on request
    if further_pre:
        sc.tl.pca(adata_filtered, svd_solver='arpack')
        sc.pp.neighbors(adata_filtered, n_neighbors=15, n_pcs=40)
        sc.tl.umap(adata_filtered)
        # Report the recomputation only when it actually happened.
        print("Recalculated PCA and UMAP.")

    return adata_filtered

In [None]:
def recompute_pca_and_umap(adata):
    """
    Recalculates PCA, neighbors, and UMAP on an AnnData object.

    No cells are filtered here. PCA uses the ARPACK solver and the neighbor
    graph is built with n_neighbors=15 and n_pcs=40.

    Parameters:
    - adata: AnnData object

    Returns:
    - the same AnnData object that was passed in
    """
    sc.tl.pca(adata, svd_solver='arpack')
    sc.pp.neighbors(adata, n_neighbors=15, n_pcs=40)
    sc.tl.umap(adata)

    print("Recalculated PCA and UMAP.")
    return adata

In [None]:
def reprocess_from_raw_layer(adata, Project_ID, Primary_or_Metastatic = 'Primary', further_pre = False):
    """
    Rebuild an AnnData object from the matrix stored in its ``.raw`` attribute
    (e.g. for a published .h5ad), apply the standard QC filter and tag the
    cells with project labels.

    Parameters:
    - adata: AnnData object, must have .raw set. Its X, var and var_names are
      overwritten with the raw values and QC columns are added to it before
      the filtered copy is made.
    - Project_ID: value written to obs['Project_ID']
    - Primary_or_Metastatic: value written to obs['Primary_or_Metastatic']
    - further_pre: if True, additionally normalize, log1p, scale and compute
      PCA, neighbors and UMAP (no HVG selection is applied)

    Returns:
    - A new, filtered AnnData object (a copy, not the object passed in)

    Raises:
    - ValueError: if adata.raw is None
    """
    if adata.raw is None:
        raise ValueError("AnnData object has no .raw attribute. Cannot proceed with reprocessing.")

    # Restore the raw matrix and its gene annotation onto the main object
    raw = adata.raw
    adata.X = raw.X.copy()
    adata.var = raw.var.copy()
    adata.var_names = raw.var_names.copy()

    # Flag mitochondrial genes (case-insensitive 'MT-' prefix) and compute QC metrics
    is_mito = adata.var_names.str.upper().str.startswith('MT-')
    adata.var['mt'] = is_mito
    sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)

    # Keep cells with 200-5000 detected genes and at most 20% mitochondrial counts
    genes_per_cell = adata.obs['n_genes_by_counts']
    passes_qc = (genes_per_cell >= 200) & (genes_per_cell <= 5000) & (adata.obs['pct_counts_mt'] <= 20)
    adata = adata[passes_qc].copy()

    adata.obs['Project_ID'] = Project_ID
    adata.obs['Primary_or_Metastatic'] = Primary_or_Metastatic

    if further_pre:
        # Library-size normalization followed by log1p
        sc.pp.normalize_total(adata, target_sum=1e4)
        sc.pp.log1p(adata)

        # Scale each gene, clipping at 10
        sc.pp.scale(adata, max_value=10)

        # Embedding: PCA -> neighbor graph -> UMAP
        sc.tl.pca(adata, svd_solver='arpack')
        sc.pp.neighbors(adata, n_neighbors=15, n_pcs=40)
        sc.tl.umap(adata)

    print(f"Reprocessed dataset. Final shape: {adata.shape}")
    return adata


In [None]:
def reprocess_all(adata, further_pre = True):
    """
    Rebuild an AnnData object from the matrix stored in its ``.raw`` attribute
    and apply the standard QC filter, without touching project labels.

    Parameters:
    - adata: AnnData object, must have .raw set. Its X, var and var_names are
      overwritten with the raw values and QC columns are added to it before
      the filtered copy is made.
    - further_pre: if True (default), additionally normalize, log1p and compute
      PCA (zero_center=False), neighbors and UMAP. Scaling and HVG selection
      are not applied.

    Returns:
    - A new, filtered AnnData object (a copy, not the object passed in)

    Raises:
    - ValueError: if adata.raw is None
    """
    if adata.raw is None:
        raise ValueError("AnnData object has no .raw attribute. Cannot proceed with reprocessing.")

    # Restore the raw matrix and its gene annotation onto the main object
    raw = adata.raw
    adata.X = raw.X.copy()
    adata.var = raw.var.copy()
    adata.var_names = raw.var_names.copy()

    # Flag mitochondrial genes (case-insensitive 'MT-' prefix) and compute QC metrics
    is_mito = adata.var_names.str.upper().str.startswith('MT-')
    adata.var['mt'] = is_mito
    sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)

    print('Standard filtering...')
    # Keep cells with 200-5000 detected genes and at most 20% mitochondrial counts
    genes_per_cell = adata.obs['n_genes_by_counts']
    passes_qc = (genes_per_cell >= 200) & (genes_per_cell <= 5000) & (adata.obs['pct_counts_mt'] <= 20)
    adata = adata[passes_qc].copy()

    if further_pre:
        print('Normalizing...')
        sc.pp.normalize_total(adata, target_sum=1e4)
        sc.pp.log1p(adata)

        # Disabled alternatives kept for reference:
        #   sc.pp.highly_variable_genes(adata, flavor="seurat_v3", n_top_genes=2000,
        #                               batch_key="Final_sample_id")
        #   adata = adata[:, adata.var.highly_variable].copy()
        #   sc.pp.scale(adata, max_value=10)

        print('Computing PCA...')
        sc.tl.pca(adata, zero_center=False)

        print('Computing neighbors...')
        sc.pp.neighbors(adata, n_neighbors=15, n_pcs=40)

        print('Computing UMAP...')
        sc.tl.umap(adata)

    print(f"Reprocessed dataset. Final shape: {adata.shape}")
    return adata


# Breast Cancer (BRCA)

## A multi-modal single-cell and spatial expression map of metastatic breast cancer biopsies across clinicopathological features

Paper: https://www.nature.com/articles/s41591-024-03215-z#Fig1

Data downloaded from: https://singlecell.broadinstitute.org/single_cell/study/SCP2702/htapp-mbc

https://cellxgene.cziscience.com/collections/a96133de-e951-4e2d-ace6-59db8b3bfb1d

Link: 
- Matrix: https://datasets.cellxgene.cziscience.com/9dff3651-e629-4519-aaab-dbd21b6b02b1.h5ad
- Patient clinical data: https://static-content.springer.com/esm/art%3A10.1038%2Fs41591-024-03215-z/MediaObjects/41591_2024_3215_MOESM3_ESM.xlsx

In [None]:
# Read the metastatic breast cancer scRNA-seq .h5ad file and show its summary.
ad = sc.read_h5ad('./Data/BRCA/Multi_modal_breast_cancer/Multi_modal_breast_cancer.scRNAseq.h5ad')
ad

In [None]:
# Shape of the matrix stored in .raw.
ad.raw.shape

In [None]:
# Re-process from .raw: QC filter, label the project as metastatic, and
# (further_pre=True) normalize, scale and compute PCA / neighbors / UMAP.
ad = reprocess_from_raw_layer(ad, 
                              Project_ID='Multi_modal_breast_cancer', 
                              Primary_or_Metastatic='Metastatic',
                              further_pre=True)

In [None]:
# One UMAP per annotation: sample, author cell type, tissue, breast histology.
for _obs_key in ('sampleid', 'author_cell_type', 'tissue', 'histology_breast'):
    sc.pl.umap(ad, color=[_obs_key])


In [None]:
# keep only cancer cells
# filter_and_recompute is called with its default further_pre=False, so the
# embedding is recomputed explicitly with pca_and_umap afterwards.
ad = filter_and_recompute(ad, celltype_col='author_cell_type', celltypes_to_keep=['MBC', 'MBC_stem-like', 'MBC_chondroid'])
pca_and_umap(ad)
ad

In [None]:
# UMAPs of the cancer-cell subset, one per annotation.
for _obs_key in ('donor_id', 'author_cell_type', 'tissue'):
    sc.pl.umap(ad, color=[_obs_key])
for _obs_key in ('histology_breast', 'histology_biopsy'):
    sc.pl.umap(ad, color = _obs_key)


In [None]:
# Shape of .raw after restricting to cancer cells.
ad.raw.shape

In [None]:
# Display the AnnData summary.
ad

In [None]:
# Harmonised "Final_*" annotation columns for this dataset.
ad.obs['Final_cancer_type'] = 'Breast Cancer'
for _final_col, _source_col in (
    ('Final_histological_subtype', 'histology_biopsy'),
    ('Final_molecular_subtype', 'receptors_primary'),
    ('Final_tissue', 'tissue'),
    ('Final_sample_id', 'donor_id'),
):
    ad.obs[_final_col] = ad.obs[_source_col]

In [None]:
# Patient clinical table, restricted to rows profiled by scRNAseq.
patient_clinical_df = (
    pd.read_csv('./Data/BRCA/Multi_modal_breast_cancer/Patient_clinical.txt', sep='\t')
    .loc[lambda table: table['Profiling method'] == 'scRNAseq']
)
patient_clinical_df

In [None]:
# Step 1: Build the join key by rewriting each sample id into the clinical
# table's 'HTAN ID' format (drop 'SMP-', '-' -> '_', 'HTAPP' -> 'HTA1').
ad.obs['tmp_donor_id'] = [
    sample_id.replace('SMP-', '').replace('-', '_').replace('HTAPP', 'HTA1')
    for sample_id in ad.obs['sampleid']
]

# Step 2: Left-join the clinical table onto the per-cell annotations. The
# index is reset first so the cell ids travel through the merge as a column
# (assumes the obs index is named 'cellid').
merged = pd.merge(
    ad.obs.reset_index(),
    patient_clinical_df,
    how='left',
    left_on='tmp_donor_id',
    right_on='HTAN ID',
)

# Step 3: Restore the cell ids as the index and assign back
merged = merged.set_index('cellid')
ad.obs = merged
ad

In [None]:
# add more clinical information
# Age is the integer before the first '-' in development_stage
# (presumably values look like "55-year-old ..." — TODO confirm).
ad.obs['Final_patient_age'] = [int(i.split('-')[0]) for i in ad.obs['development_stage']]
ad.obs['Final_patient_stage'] = ad.obs['stage_at_diagnosis']
# Treatment label = most recent treatment class + ' ' + treatment status.
ad.obs['Final_patient_treatment'] = ad.obs['Treatment most recent class']+' '+ad.obs['Treatment status']

In [None]:
# Save the annotated cancer-cell object (gzip-compressed).
ad.write_h5ad('./Data/Cancer_cell_data/Multi_modal_breast_cancer.BRCA.h5ad', compression='gzip')

In [None]:
# Display the AnnData summary.
ad

## A single-cell and spatially resolved atlas of human breast cancers

Paper: https://www.nature.com/articles/s41588-021-00911-1#Fig1

Data downloaded from: https://cellxgene.cziscience.com/collections/65db5560-7aeb-4c66-b150-5bd914480eb8

Link: 
- Matrix: https://datasets.cellxgene.cziscience.com/36ab5d3d-158e-4988-9700-e14fc7102fe4.h5ad
- Patient clinical: https://static-content.springer.com/esm/art%3A10.1038%2Fs41588-021-00911-1/MediaObjects/41588_2021_911_MOESM4_ESM.xlsx

In [None]:
# Read the Wu et al. 2021 breast cancer .h5ad file and show its summary.
ad = sc.read_h5ad('./Data/BRCA/Wu_etal_2021_BRCA/Wu_etal_2021_BRCA.h5ad')
ad

In [None]:
# Shape of the matrix stored in .raw.
ad.raw.shape

In [None]:
# Re-process from .raw: QC filter, label the project as primary, and
# (further_pre=True) normalize, scale and compute PCA / neighbors / UMAP.
ad = reprocess_from_raw_layer(ad, 
                              Project_ID='Wu_etal_2021_BRCA', 
                              Primary_or_Metastatic='Primary',
                              further_pre=True)

In [None]:
# UMAP of all cells coloured by major cell type.
sc.pl.umap(ad, color='celltype_major')

In [None]:
# Number of cells per major cell type.
ad.obs.celltype_major.value_counts()

In [None]:
# Keep only the 'Cancer Epithelial' cells and recompute PCA / neighbors / UMAP on them.
ad = filter_and_recompute(ad, 
                          celltype_col='celltype_major', 
                          celltypes_to_keep=['Cancer Epithelial'],
                          further_pre=True)
ad

In [None]:
# UMAPs of the cancer-cell subset, one per annotation.
for _obs_key in ('celltype_minor', 'disease', 'donor_id'):
    sc.pl.umap(ad, color=_obs_key)

In [None]:
# Load the tab-separated patient clinical table for this study.
patient_clinical_df = pd.read_csv('./Data/BRCA/Wu_etal_2021_BRCA/Patient_clinical.txt', sep='\t')
patient_clinical_df

In [None]:
# Number of cells per donor.
ad.obs['donor_id'].value_counts()

In [None]:
# Step 1: Build matching join keys on both tables.
# Cell side: remove 'CID' and every 'A' / 'N' character from donor_id.
ad.obs['tmp_donor_id'] = [i.replace('CID', '').replace('A', '').replace('N', '') for i in ad.obs['donor_id']]
# Clinical side: remove the dashes from 'Case ID'.
patient_clinical_df['Case ID'] = patient_clinical_df['Case ID'].str.replace('-', '')


In [None]:
# Left-join the clinical table onto the per-cell annotations. The index is
# reset first so the cell barcodes travel through the merge as a column
# (assumes the obs index is unnamed, so that column is called 'index').
merged = pd.merge(
    ad.obs.reset_index(),
    patient_clinical_df,
    how='left',
    left_on='tmp_donor_id',
    right_on='Case ID',
)

# Restore the cell barcodes as the index and assign back
merged = merged.set_index('index')
ad.obs = merged
ad

In [None]:
# Harmonised "Final_*" annotation columns for this dataset.
ad.obs['Final_cancer_type'] = 'Breast Cancer'
ad.obs['Final_histological_subtype'] = ad.obs.disease
ad.obs['Final_molecular_subtype'] = ad.obs['subtype_by_IHC']
ad.obs['Final_tissue'] = 'Breast'
ad.obs['Final_sample_id'] = ad.obs.donor_id

In [None]:
# Number of cells per value of the 'Cancer Type' column.
ad.obs['Cancer Type'].value_counts()

In [None]:
# NOTE(review): this assigns the column to itself and therefore changes nothing;
# the intended source column (if any) is not evident here — confirm or remove.
ad.obs['Primary_or_Metastatic'] = ad.obs['Primary_or_Metastatic']

In [None]:
# add more clinical information
ad.obs['Final_patient_age'] = ad.obs['Age'].astype(int)
ad.obs['Final_patient_stage'] = ad.obs['Stage']
# Both parts are cast to str first, so missing values become literal text in the label.
ad.obs['Final_patient_treatment'] = ad.obs['treatment_status'].astype(str) +' '+ad.obs['treatment_details'].astype(str)

In [None]:
# Shape of .raw for the cancer-cell subset.
ad.raw.shape

In [None]:
# Save the annotated cancer-cell object (gzip-compressed).
ad.write_h5ad('./Data/Cancer_cell_data/Wu_etal_2021_BRCA.BRCA.h5ad', compression='gzip')

In [None]:
# Display the AnnData summary.
ad

## A pan-cancer blueprint of the heterogeneous tumor microenvironment revealed by single-cell profiling


Paper: https://www.nature.com/articles/s41422-020-0355-0#Fig3

Data downloaded from: https://lambrechtslab.sites.vib.be/en/pan-cancer-blueprint-tumour-microenvironment-0

Link: 
- Matrix: https://lambrechtslab.sites.vib.be/en/pan-cancer-blueprint-tumour-microenvironment-0(Breast cancer - Counts Matrix)
- Patient metadata: https://static-content.springer.com/esm/art%3A10.1038%2Fs41422-020-0355-0/MediaObjects/41422_2020_355_MOESM13_ESM.pdf
- Sequencing quality: https://www.nature.com/
https://static-content.springer.com/esm/art%3A10.1038%2Fs41422-020-0355-0/MediaObjects/41422_2020_355_MOESM14_ESM.pdf

In [None]:
# Set the project directory
project_dir = "./Data/BRCA/2102-Breastcancer/2102-Breastcancer_counts/"  # <- change this for each project

# Load the count matrix and metadata, QC-filter, and (further_pre=True)
# normalize, scale and compute PCA / neighbors / UMAP.
ad = load_and_preprocess_project(project_dir, 
                                 Project_ID='2102-Breastcancer', 
                                 Primary_or_Metastatic='Primary',
                                 further_pre=True)

In [None]:
# Shape of the matrix stored in .raw.
ad.raw.shape

In [None]:
# Store patient numbers as strings (they are later used as string keys for a mapping).
ad.obs['PatientNumber'] = ad.obs['PatientNumber'].astype(str)

In [None]:
# UMAPs of all cells coloured by cell type and by patient.
sc.pl.umap(ad, color=['CellType'])
sc.pl.umap(ad, color=['PatientNumber'])
# Further annotations, currently disabled:
# sc.pl.umap(ad, color=['CellFromTumor'])
# sc.pl.umap(ad, color=['TumorSite'])
# sc.pl.umap(ad, color=['Project'])


In [None]:
# Keep only cells annotated as 'Cancer' and recompute PCA / neighbors / UMAP on them.
ad = filter_and_recompute(adata=ad, 
                          celltype_col='CellType', 
                          celltypes_to_keep=['Cancer'],
                          further_pre=True)
ad

In [None]:
# UMAP of the cancer cells coloured by tumour type and by patient.
sc.pl.umap(ad, color=['TumorType', 'PatientNumber'])


In [None]:
# Cells per patient in the full metadata table, as {PatientNumber: cell count}.
patient_cell_number = (
    pd.read_csv("./Data/BRCA/2102-Breastcancer/2102-Breastcancer_counts/2103-Breastcancer_metadata.csv")
    ['PatientNumber']
    .value_counts()
    .to_dict()
)
patient_cell_number

In [None]:
# Sequencing-quality table, restricted to the breast cancer ('BC') rows.
patient_seuqncing_meta_df = (
    pd.read_csv('./Data/BRCA/2102-Breastcancer/Sequencing_quality_S2.txt', sep='\t')
    .loc[lambda table: table['Cancer type'] == 'BC']
)
patient_seuqncing_meta_df

In [None]:
# Map each reported cell count ('Cells') to the patient label in the same row.
# If two rows share a cell count, the later row overwrites the earlier one.
cells_to_bc_label = dict(zip(patient_seuqncing_meta_df['Cells'], patient_seuqncing_meta_df['Patient number']))
cells_to_bc_label

In [None]:
# Link each metadata patient number (as str) to its BC patient label by
# matching on the number of cells; a count absent from cells_to_bc_label
# raises KeyError.
patient_number_to_BC_id = {
    str(number): cells_to_bc_label[cell_count]
    for number, cell_count in patient_cell_number.items()
}
patient_number_to_BC_id

In [None]:
# Translate each cell's PatientNumber into the BC patient label; numbers missing from the mapping become NaN.
ad.obs['BC_PatientID'] = ad.obs['PatientNumber'].map(patient_number_to_BC_id)
ad.obs

In [None]:
# Patient metadata table, restricted to the breast cancer ('BC') rows.
patient_meta_df = (
    pd.read_csv('./Data/BRCA/2102-Breastcancer/Patient_metadata_S1.txt', sep='\t')
    .loc[lambda table: table['Tumor_type'] == 'BC']
)
# meta_subset = patient_meta_df[['Patient_number', 'Pathological_subtype', 'Molecular_status']]
patient_meta_df

In [None]:
# Attach patient-level metadata to every cell.
# A column-on-column DataFrame.merge discards the left index and returns a
# fresh 0..n-1 index, which would replace the cell barcodes once assigned to
# ad.obs. The barcodes are therefore saved first and restored afterwards.
# validate='many_to_one' guarantees one output row per cell, in the original
# order, so restoring the index by position is safe.
cell_barcodes = ad.obs_names.copy()
merged = ad.obs.merge(
    patient_meta_df,
    left_on='BC_PatientID',
    right_on='Patient_number',
    how='left',
    validate='many_to_one',
)
merged.index = cell_barcodes
ad.obs = merged
ad.obs

In [None]:
# Harmonised "Final_*" annotation columns for this dataset.
ad.obs['Final_cancer_type'] = 'Breast Cancer'
ad.obs['Final_histological_subtype'] = ad.obs.Pathological_subtype
ad.obs['Final_molecular_subtype'] = ad.obs['Molecular_status']
ad.obs['Final_tissue'] = 'Breast'
ad.obs['Final_sample_id'] = ad.obs['BC_PatientID']

In [None]:
# add more clinical information
# Age is copied from the 'Age_range' column as-is (not converted to a number).
ad.obs['Final_patient_age'] = ad.obs['Age_range']
ad.obs['Final_patient_stage'] = ad.obs['TNM']
# Every cell in this dataset gets the same treatment label.
ad.obs['Final_patient_treatment'] = 'Naïve'

In [None]:
# Shape of .raw for the cancer-cell subset.
ad.raw.shape

In [None]:
# Display the AnnData summary.
ad

In [None]:
# Save the annotated cancer-cell object (gzip-compressed).
ad.write_h5ad('./Data/Cancer_cell_data/2102-Breastcancer.BRCA.h5ad', compression='gzip')

## Single cell profiling of primary and paired metastatic lymph node tumors in breast cancer patients

Paper: https://www.nature.com/articles/s41467-022-34581-2#data-availability

Data downloaded from: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE167036

Link: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE167036



In [None]:
# Set the project directory
project_dir = "./Data/BRCA/GSE167036/"  # <- change this for each project

# Load and preprocess; metadata rows are keyed by their 'cell_id' column.
# Primary_or_Metastatic is not passed, so the loader's default 'Primary' is used here.
ad = load_and_preprocess_project(project_dir, 
                                 metadata_idx_key='cell_id', 
                                 Project_ID='GSE167036',
                                 further_pre=True)
ad

In [None]:
# One UMAP of all cells per annotation column.
for _obs_key in ('sample_type', 'patient_id', 'orig.ident', 'cell_label', 'main_label'):
    sc.pl.umap(ad, color=[_obs_key])


In [None]:
# Keep only cells labelled 'Epithelial Cells' and recompute PCA / neighbors / UMAP on them.
ad = filter_and_recompute(adata=ad, 
                          celltype_col='main_label', 
                          celltypes_to_keep=['Epithelial Cells'],
                          further_pre=True)

In [None]:
# Same annotation UMAPs as before, now on the epithelial subset.
for _obs_key in ('sample_type', 'patient_id', 'orig.ident', 'cell_label', 'main_label'):
    sc.pl.umap(ad, color=[_obs_key])


In [None]:
# Display the AnnData summary.
ad

In [None]:
# sc.pl.umap(ad, color=['ER_statu'])
# sc.pl.umap(ad, color=['Her2_statu'])


In [None]:
def combined_label(row):
    """Combine a row's ER and HER2 status into a single receptor label.

    Reads ``row['ER_statu']`` and ``row['Her2_statu']`` and returns one of
    "HER2+/ER+", "ER+", "HER2+" or "HER2-/ER-". Any other pair of values
    (including missing ones) yields ``np.nan``.

    NOTE(review): matching is case-sensitive and expects lowercase "positive"
    but capitalised "Negative" — confirm this is how the metadata spells them.
    """
    # (ER status, HER2 status) -> combined label
    label_by_status = {
        ("positive", "positive"): "HER2+/ER+",
        ("positive", "Negative"): "ER+",
        ("Negative", "positive"): "HER2+",
        ("Negative", "Negative"): "HER2-/ER-",
    }
    status_pair = (row['ER_statu'], row['Her2_statu'])
    return label_by_status.get(status_pair, np.nan)


In [None]:
# Inspect the sample_type annotation of each cell.
ad.obs['sample_type']

In [None]:
# Translate this dataset's sample_type labels into tissue names.
# NOTE(review): 'Lymph node' is written here with a lowercase 'n', whereas the
# GSE161529 cells use 'Lymph Node' — confirm which spelling downstream code expects.
sample_type_to_tissue = {'Lymph Node': 'Lymph node', 'Tumor': 'Breast'}

# Fail fast on an unexpected label. Skipping it would leave new_bc_tissue
# shorter than the number of cells and break the later column assignment.
unexpected_sample_types = [
    label for label in ad.obs['sample_type'].unique()
    if label not in sample_type_to_tissue
]
if unexpected_sample_types:
    raise ValueError(f"Unexpected sample_type value(s): {unexpected_sample_types}")

new_bc_tissue = [sample_type_to_tissue[tissue] for tissue in ad.obs['sample_type']]
            


In [None]:
# Harmonised "Final_*" annotation columns for this dataset.
ad.obs['Final_cancer_type'] = 'Breast Cancer'
ad.obs['Final_histological_subtype'] = 'Invasive ductal carcinoma'
# Receptor label derived row by row from ER_statu / Her2_statu.
ad.obs['Final_molecular_subtype'] = ad.obs.apply(combined_label, axis=1)
# new_bc_tissue must hold exactly one entry per cell, in obs order.
ad.obs['Final_tissue'] = new_bc_tissue
ad.obs['Final_sample_id'] = ad.obs['patient_id']

In [None]:
# Label every cell of this dataset as metastatic (replaces the 'Primary' default set at load time).
ad.obs['Primary_or_Metastatic'] = 'Metastatic'

In [None]:
# add more clinical information
# Age and stage are filled with the placeholder 'Unknown' for every cell.
ad.obs['Final_patient_age'] = 'Unknown'
ad.obs['Final_patient_stage'] = 'Unknown'
ad.obs['Final_patient_treatment'] = 'Naïve'

In [None]:
# Shape of .raw for the epithelial subset.
ad.raw.shape

In [None]:
# Save the annotated object (gzip-compressed).
ad.write_h5ad('./Data/Cancer_cell_data/GSE167036.BRCA.h5ad', compression='gzip')

## A single‐cell RNA expression atlas of normal, preneoplastic and tumorigenic states in the human breast


Paper: https://www.embopress.org/doi/full/10.15252/embj.2020107333

Data downloaded from: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE161529

Link: https://figshare.com/articles/dataset/Data_R_code_and_output_Seurat_Objects_for_single_cell_RNA-seq_analysis_of_human_breast_tissues/17058077?file=31544843 (HumanBreast10X.zip)

https://figshare.com/articles/dataset/Data_R_code_and_output_Seurat_Objects_for_single_cell_RNA-seq_analysis_of_human_breast_tissues/17058077?file=31546559

In [None]:
data_dir = './Data/BRCA/GSE161529/GSE161529_RDS/'
# Sorted names of every entry in data_dir whose name ends with 'ad'.
all_h5_files = sorted(entry for entry in os.listdir(data_dir) if entry.endswith('ad'))
all_h5_files

In [None]:
ad_list = []
for f in all_h5_files:
    # Only files without 'Tum' in their name are processed here; for each one
    # the companion '...Tum.h5ad' file is read below to select the tumor cells.
    if 'Tum' in f:
        continue

    print(data_dir+f)
    ad = sc.read_h5ad(data_dir+f)
    print(ad)
    ad.raw.var.index = ad.raw.var['_index']

    # Last '_'-separated part of the file name, without the extension
    # (presumably something like 'ERTotal' — TODO confirm against the file list).
    subtype_token = f.split('_')[-1].split('.')[0]
    subtype_name = subtype_token.replace('Total', '')

    ad.obs['Project_ID'] = 'GSE161529_'+subtype_name
    ad.obs['Primary_or_Metastatic'] = 'Primary'
    ad.obs['seurat_clusters'] = ad.obs['seurat_clusters'].astype(str)

    tum_ad = sc.read_h5ad(data_dir+f.replace('.h5ad', 'Tum.h5ad'))
    print(tum_ad)

    # keep only tumor cells; copy so that the .obs assignments below act on a
    # standalone object rather than on a view of the full one
    ad = ad[ad.obs_names.isin(set(tum_ad.obs_names))].copy()
    print(ad)

    ad.obs['Final_cancer_type'] = 'Breast Cancer'
    ad.obs['Final_histological_subtype'] = 'N/A'
    ad.obs['Final_molecular_subtype'] = subtype_token.replace('Total', '+')
    ad.obs['Final_tissue'] = 'Breast'
    ad.obs['Final_sample_id'] =  ad.obs.group
    print(ad.obs.group)
    print(subtype_name)

    ad_list.append(ad)
    print(ad)
    print(ad.raw.shape)
        

In [None]:
# process the lymph node samples
for f in all_h5_files:
    if 'TumLN' not in f:
        continue

    print(data_dir+f)
    ad = sc.read_h5ad(data_dir+f)
    print(ad)
    ad.raw.var.index = ad.raw.var['_index']

    ad.obs['Project_ID'] = 'GSE161529_LN'
    ad.obs['Primary_or_Metastatic'] = 'Metastatic'
    ad.obs['seurat_clusters'] = ad.obs['seurat_clusters'].astype(str)

    ad.obs['Final_cancer_type'] = 'Breast Cancer'
    ad.obs['Final_histological_subtype'] = 'N/A'
    ad.obs['Final_molecular_subtype'] = 'ER+'
    ad.obs['Final_sample_id'] =  ad.obs.group

    # Derive the tissue from the group suffix: '_T' -> breast, '_LN' -> lymph node.
    tissues = []
    unmatched_groups = set()
    for group in ad.obs.group:
        if group.endswith('_T'):
            tissues.append('Breast')
        elif group.endswith('_LN'):
            tissues.append('Lymph Node')
        else:
            unmatched_groups.add(group)
    # An unmatched group would leave `tissues` shorter than the number of
    # cells and make the assignment below fail with a length mismatch, so
    # report the offending groups explicitly instead.
    if unmatched_groups:
        raise ValueError(f"Cannot derive tissue for group(s): {sorted(unmatched_groups)}")
    ad.obs['Final_tissue'] = tissues

    print(f.split('_')[-1].split('.')[0].replace('Total', ''))

    ad_list.append(ad)
    print(ad)
    print(ad.raw.shape)

In [None]:
# Total number of cells (n_obs) across all collected AnnData objects.
# Note: the loop rebinds the notebook-level name `ad` to the last list element.
all_sample_num = 0
for ad in ad_list:
    all_sample_num += ad.n_obs
all_sample_num

In [None]:
# Stack all objects along the cell axis; join="inner" keeps only the variables shared by every object.
combined_ad = ann.concat(ad_list, join="inner", axis=0)
combined_ad

In [None]:
import numpy as np
import pandas as pd
from scipy.sparse import issparse

# Step 1: Collect the obs_names that repeat an earlier entry
is_repeat = combined_ad.obs_names.duplicated()
duplicated_obs = combined_ad.obs_names[is_repeat]

if len(duplicated_obs) != 0:
    print(f"🔍 Found {len(duplicated_obs)} duplicated obs_names.\n")
    # Disabled check, kept for reference: compare the expression rows that
    # share a duplicated obs_name and print the names whose rows differ.
    #
    # for obs_name in tqdm(duplicated_obs.unique()):
    #     indices = np.where(combined_ad.obs_names == obs_name)[0]
    #     exprs = combined_ad.X[indices]
    #
    #     if issparse(exprs):
    #         exprs_array = exprs.toarray()
    #     else:
    #         exprs_array = exprs
    #
    #     # Compare all rows to the first row
    #     all_equal = np.all(exprs_array == exprs_array[0], axis=1).all()
    #     if not all_equal:
    #         print(obs_name)
else:
    print("✅ No duplicated obs_names found.")

In [None]:
import pandas as pd

# Resolve duplicated cell barcodes: keep one row per original barcode,
# preferring the copy labelled 'Metastatic' when both labels exist.

# Step 1: Backup the original obs_names
original_obs_names = combined_ad.obs_names.copy()

# Step 2: Assign temporary unique index to avoid conflict during filtering
combined_ad.obs_names = pd.Index([f"{i}_{name}" for i, name in enumerate(original_obs_names)])

# Step 3: Add original name into .obs for filtering
combined_ad.obs["original_obs_name"] = original_obs_names

# Step 4: Sort so 'Metastatic' comes first ('Metastatic' < 'Primary' alphabetically).
# A stable sort keeps the original row order inside each label, so that
# keep="first" below always selects the earliest copy instead of an arbitrary
# one (the default quicksort does not preserve the order of ties).
obs_df = combined_ad.obs.sort_values(
    by="Primary_or_Metastatic", ascending=True, kind="stable"
)

# Step 5: Drop duplicates on the original obs name (keeping 'Metastatic' if it exists)
obs_dedup = obs_df.drop_duplicates(subset="original_obs_name", keep="first")

obs_dedup

In [None]:
# Step 6: Subset the AnnData object using unique (temporary) obs_names
# The print shows the object before filtering so the cell count can be
# compared with the deduplicated result displayed at the end of the cell.
print(combined_ad)
# Selecting by obs_dedup.index keeps only the surviving rows; .copy() turns the
# resulting view into an independent object.
combined_ad = combined_ad[obs_dedup.index].copy()

# Step 7: Restore the original obs_names (if needed)
# NOTE(review): the index presumably keeps the name 'original_obs_name' after
# this assignment; the later clinical merge calls
# set_index('original_obs_name') after reset_index() and relies on that.
combined_ad.obs_names = combined_ad.obs["original_obs_name"]
# The helper column is no longer needed once the barcodes are back in the index.
combined_ad.obs.drop(columns="original_obs_name", inplace=True)
combined_ad

In [None]:
patient_clinical_df = pd.read_csv('./Data/BRCA/GSE161529/Patient_clinical.txt', sep='\t')
patient_clinical_df

In [None]:
np.unique(combined_ad.obs.Final_sample_id)

In [None]:
# Build normalised join keys on both tables (lower-case, '_' replaced by '-')
# so that per-cell sample ids can be matched against the clinical sheet:
#   - cells:    'Final_sample_id' -> 'tmp_donor_id'
#   - clinical: 'Specimen ID'     -> 'Case ID'
combined_ad.obs['tmp_donor_id'] = [i.lower().replace('_', '-') for i in combined_ad.obs.Final_sample_id]
patient_clinical_df['Case ID'] = [i.lower().replace('_', '-') for i in patient_clinical_df['Specimen ID']]


In [None]:
combined_ad.obs

In [None]:
# Attach the per-specimen clinical annotations to every cell.

# Step 1: Remember the cell barcodes, then left-join on the normalised ids.
# A left merge keeps the rows of the left frame in their original order, so
# the saved index can be re-attached positionally. This avoids depending on
# the name that reset_index() would give the barcode column.
cell_index = combined_ad.obs.index
merged = combined_ad.obs.merge(
    patient_clinical_df,
    left_on='tmp_donor_id',
    right_on='Case ID',
    how='left',
)
if len(merged) != len(cell_index):
    # Duplicated keys in the clinical table would silently multiply cells.
    raise ValueError(
        "Clinical merge changed the number of rows "
        f"({len(cell_index)} -> {len(merged)}); 'Case ID' is not unique."
    )

# Step 2: Restore original index (cell barcodes)
merged.index = cell_index

# Step 3: Assign back
combined_ad.obs = merged
combined_ad

In [None]:
combined_ad.obs

In [None]:
# Collect the tumour samples ('_T') whose matching lymph-node sample ('_LN')
# is also present in the dataset; the result is a sorted list.
all_sample_id = np.unique(combined_ad.obs['Final_sample_id'])
mets_sample = set()
for sample_id in all_sample_id:
    has_paired_ln = '_T' in sample_id and sample_id.replace('_T', '_LN') in all_sample_id
    if not has_paired_ln:
        continue
    print(sample_id)
    mets_sample.add(sample_id)
mets_sample = sorted(mets_sample)

In [None]:
mets_sample

In [None]:
# For every paired tumour sample, print the labels it carries so the
# Primary/Metastatic and tissue annotations can be checked by eye.
primary_or_metastatic = []
for sample in mets_sample:
    sample_obs = combined_ad[combined_ad.obs['Final_sample_id']==sample].obs
    print(sample)
    print(np.unique(sample_obs['Primary_or_Metastatic']))
    print(np.unique(sample_obs['Final_tissue']))

In [None]:
# add more clinical information
# Age is copied from the 'Patient Age' column (presumably brought in by the
# clinical merge above); stage and treatment are constants for every cell.
combined_ad.obs['Final_patient_age'] = combined_ad.obs['Patient Age']
combined_ad.obs['Final_patient_stage'] = 'Unknown'
# NOTE(review): spelled 'Naïve' here but 'Naive' in the GSE225600 section of
# this notebook; the two spellings become separate categories after
# integration. Confirm which one is intended.
combined_ad.obs['Final_patient_treatment'] = 'Naïve'

In [None]:
combined_ad.obs['Project_ID'] = 'GSE161529'

In [None]:
combined_ad.write_h5ad('./Data/Cancer_cell_data/GSE161529.BRCA.h5ad', compression='gzip')

In [None]:
del ad_list, combined_ad

## A single-cell map of intratumoral changes during anti-PD1 treatment of patients with breast cancer


Paper: https://www.nature.com/articles/s41591-021-01323-8#Fig1

Data downloaded from: https://lambrechtslab.sites.vib.be/en/single-cell

Link: 
- Matrix: https://vib.blob.core.windows.net/vib-forms/Documents/1867-counts_cells_cohort2.rds?sv=2019-07-07&sr=b&sig=mFBRJlKUM8OQDNfD6WKStiVGs3BTa7O3wepBAv%2FHalQ%3D&se=2025-07-10T17%3A04%3A47Z&sp=r
- Patient metadata: https://static-content.springer.com/esm/art%3A10.1038%2Fs41591-021-01323-8/MediaObjects/41591_2021_1323_MOESM1_ESM.pdf

In [None]:
memory_usgae()

In [None]:
# Set the project directory
project_dir = "./Data/BRCA/Single_cell_map_anti-PD1//"  # <- change this for each project

# Load and preprocess
ad = load_and_preprocess_project(project_dir, 
                                 metadata_idx_key='Cell', 
                                 Project_ID='Single_cell_map_anti-PD1',
                                 further_pre=False)
ad

In [None]:
memory_usgae()

In [None]:
file_path = './Data/BRCA/Single_cell_map_anti-PD1/1867-counts_cells_cohort2.csv'

# Step 1: Read just the first line to get cell names
with open(file_path) as f:
    # strip() removes the trailing newline. The previous strip('') removed
    # nothing, which left '"\n' glued to the last cell name.
    header = f.readline().strip().split(',')
# Skip the gene column and drop the CSV quoting around each barcode.
cell_names = [name.strip('"') for name in header[1:]]

print(cell_names[:10])
# Step 2: Read file in chunks and store rows
gene_names = []
data = []

In [None]:
# Parse the genes x cells count table row by row: the first field of each
# row is the (quoted) gene name, the remaining fields are that gene's counts.
gene_names = []
data = []
with open(file_path) as f:
    next(f)  # skip header
    for line in tqdm(f, desc="Reading rows"):
        gene, *values = line.strip().split(',')
        counts = np.array(values, dtype=np.float32)
        gene_names.append(gene)
        data.append(counts)

# Drop the CSV quoting around the gene names.
gene_names = [quoted.strip('"') for quoted in gene_names]
print(gene_names[:10])

# Step 3: Stack into matrix and transpose
dense_matrix = np.vstack(data)  # shape: (genes, cells)
transposed = sp.csr_matrix(dense_matrix.T)  # shape: (cells, genes)

# Step 4: Create AnnData
ad_2 = ann.AnnData(
    X=transposed,
    obs=pd.DataFrame(index=cell_names),
    var=pd.DataFrame(index=gene_names),
)
ad_2

In [None]:
combined_ad = ann.concat([ad, ad_2], join="inner", axis=0)

In [None]:
del ad, ad_2
memory_usgae()

In [None]:
meta_1 = pd.read_csv('./Data/BRCA/Single_cell_map_anti-PD1/1872-BIOKEY_metaData_cohort1_web.csv', index_col=0)
meta_1

In [None]:
meta_2 = pd.read_csv('./Data/BRCA/Single_cell_map_anti-PD1/1871-BIOKEY_metaData_cohort2_web.csv', index_col=0)
meta_2

In [None]:
memory_usgae()

In [None]:
meta_df = pd.concat([meta_1, meta_2])
meta_df

In [None]:
# Normalise the barcodes: trim surrounding whitespace first, then the
# double quotes left over from the CSV header.
combined_ad.obs_names = combined_ad.obs_names.str.strip().str.strip('"')

combined_ad.obs_names

In [None]:
combined_ad.obs = meta_df.loc[combined_ad.obs_names]

In [None]:
memory_usgae()
combined_ad

In [None]:
combined_ad.obs['cellType'].value_counts()

In [None]:
combined_ad = filter_and_recompute(adata=combined_ad, 
                          celltype_col='cellType', 
                          celltypes_to_keep=['Cancer_cell'],
                          further_pre=True)
combined_ad

In [None]:
sc.pl.umap(combined_ad, color=['cohort', 'BC_type', 'patient_id', 'timepoint'])

In [None]:
patient_clinical_df = pd.read_csv('./Data/BRCA/Single_cell_map_anti-PD1/patient_clinical.txt', sep='\t')
patient_clinical_df

In [None]:
combined_ad.obs['new_id'] = [int(i.split('_')[-1]) for i in combined_ad.obs['patient_id']]

In [None]:
# Attach the per-patient clinical annotations to every cell.

# Step 1: Remember the cell barcodes, then left-join on the numeric patient id.
# A left merge keeps the rows of the left frame in their original order, so
# the saved index can be re-attached positionally. The previous
# reset_index()/set_index('index') round trip only works when the obs index
# has no name.
cell_index = combined_ad.obs.index
merged = combined_ad.obs.merge(
    patient_clinical_df,
    left_on='new_id',
    right_on='ID',
    how='left',
)
if len(merged) != len(cell_index):
    # Duplicated ids in the clinical table would silently multiply cells.
    raise ValueError(
        "Clinical merge changed the number of rows "
        f"({len(cell_index)} -> {len(merged)}); 'ID' is not unique."
    )

# Step 2: Restore original index (cell barcodes)
merged.index = cell_index

# Step 3: Assign back
combined_ad.obs = merged
combined_ad.obs

In [None]:
# add more clinical information
# Each pair is (harmonised column, value). The value is either a constant for
# the whole dataset or an existing obs column copied under the new name. The
# pairs are applied in order so the new columns keep this order.
for final_col, value in [
    ('Final_cancer_type', 'Breast Cancer'),
    ('Final_histological_subtype', combined_ad.obs['Histological_type']),
    ('Final_molecular_subtype', combined_ad.obs['Type']),
    ('Final_tissue', 'Breast'),
    ('Final_sample_id', combined_ad.obs['patient_id']),
    ('Final_patient_age', combined_ad.obs['Age_category']),
    ('Final_patient_stage', combined_ad.obs['pTNM']),
    ('Final_patient_treatment', combined_ad.obs['cohort']),
]:
    combined_ad.obs[final_col] = value

In [None]:
combined_ad.obs['Project_ID'] = 'Single_cell_map_anti-PD1'
combined_ad.obs['Primary_or_Metastatic'] = 'Primary'

In [None]:
combined_ad.raw = combined_ad.copy()

In [None]:
combined_ad.write_h5ad('./Data/Cancer_cell_data/Single_cell_map_anti-PD1.BRCA.h5ad', compression='gzip')

In [None]:
combined_ad.raw.shape

## Combined Single-Cell and Spatial Transcriptomics Reveal the Metabolic Evolvement of Breast Cancer during Early Dissemination

Paper: https://advanced.onlinelibrary.wiley.com/doi/10.1002/advs.202205395

Data downloaded from: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE225600

Link: 
- Matrix: https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE225600&format=file&file=GSE225600%5Fsc%5Fmatrix%2Emtx%2Egz
- Annotation: https://advanced.onlinelibrary.wiley.com/action/downloadSupplement?doi=10.1002%2Fadvs.202205395&file=advs5030-sup-0003-DatasetS2.xlsx
- Patient clinical: https://advanced.onlinelibrary.wiley.com/action/downloadSupplement?doi=10.1002%2Fadvs.202205395&file=advs5030-sup-0002-DatasetS1.xlsx

In [None]:
# Set the project directory
project_dir = "./Data/BRCA/GSE225600/"  # <- change this for each project

# Load and preprocess
ad = load_and_preprocess_project(project_dir, 
                                 metadata_idx_key='Barcode', 
                                 Project_ID='GSE225600',
                                 Primary_or_Metastatic='Metastatic',
                                 further_pre=False)
ad

In [None]:
# Load the cell annotation and index it by '<barcode>-<sampleid>':
#   1. Barcode_clean: the barcode without its trailing '-<digits>' suffix
#   2. new_name:      Barcode_clean + '-' + sampleid, used as the index
meta_df = (
    pd.read_csv('./Data/BRCA/GSE225600/metadta.tsv', sep='\t')
    .assign(Barcode_clean=lambda df: df['Barcode'].str.replace(r'-\d+$', '', regex=True))
    .assign(new_name=lambda df: df['Barcode_clean'] + '-' + df['sampleid'])
    .set_index('new_name')
)
meta_df

In [None]:
# Cells present in both the expression object and the annotation table.
# ad.obs_names is the same index that ad.to_df() would expose, but reading it
# does not build a dense cells x genes DataFrame.
shared_cells = sorted(set(ad.obs_names) & set(meta_df.index))
len(shared_cells)

In [None]:
# Keep only the annotated cells. The explicit copy makes `ad` a real object
# instead of a view, so assigning .obs below does not trigger an implicit
# copy-on-write.
ad = ad[shared_cells].copy()
# Align the annotation rows to the cell order of `ad`. obs_names is used
# instead of ad.to_df().index to avoid densifying the expression matrix.
ad.obs = meta_df.loc[ad.obs_names]
ad.obs

In [None]:
ad.obs.celltype.value_counts()

In [None]:
len(set(ad.var_names))

In [None]:
from collections import Counter

# Tally every gene name, then list the ones that occur more than once
# (in order of first appearance).
counts = Counter(ad.var_names)
duplicates = [gene for gene, n_seen in counts.items() if n_seen > 1]
print("Duplicated gene names:", duplicates)


In [None]:
ad.var_names_make_unique()
ad

In [None]:
ad.to_df().max(axis=1)

In [None]:
ad.raw = ad.copy()
ad.raw.shape

In [None]:
len(set(ad.raw.var_names))

In [None]:
ad = filter_and_recompute(adata=ad, 
                          celltype_col='celltype', 
                          celltypes_to_keep=['Epithelial cell'],
                          further_pre=True)
ad

In [None]:
sc.pl.umap(ad, color=['sampleid', 'clusters', 'group'])

In [None]:
len(set(ad.var_names))

In [None]:
len(set(ad.raw.var_names))

In [None]:
patient_clinical_df = pd.read_csv('./Data/BRCA/GSE225600/patient_clinical.txt', sep='\t')
patient_clinical_df

In [None]:
# add more clinical information
# All values below are dataset-wide constants except Final_sample_id, which is
# copied from the 'sampleid' column; nothing is taken from patient_clinical_df.
ad.obs['Project_ID'] = 'GSE225600'
# NOTE(review): every cell is labelled 'Metastatic' while Final_tissue below is
# 'Breast'. Confirm this is intended for all samples.
ad.obs['Primary_or_Metastatic'] = 'Metastatic'
ad.obs['Final_cancer_type'] = 'Breast Cancer'
ad.obs['Final_histological_subtype'] = 'Invasive ductal carcinoma'
ad.obs['Final_molecular_subtype'] = 'Unknown'
ad.obs['Final_tissue'] = 'Breast'
ad.obs['Final_sample_id'] = ad.obs['sampleid']
ad.obs['Final_patient_age'] = 'Unknown'
ad.obs['Final_patient_stage'] = 'Unknown'
# NOTE(review): spelled 'Naive' here but 'Naïve' in the GSE161529 section of
# this notebook; the two spellings become separate categories after
# integration. Confirm which one is intended.
ad.obs['Final_patient_treatment'] = 'Naive'

In [None]:
ad

In [None]:
ad.write_h5ad('./Data/Cancer_cell_data/GSE225600.BRCA.h5ad', compression='gzip')

## Integrate the data

In [None]:
# Alphabetically sorted names of everything stored in the per-project output folder.
data_dir = './Data/Cancer_cell_data/'
all_h5_files = sorted(os.listdir(data_dir))

all_h5_files

In [None]:
# Load every per-project BRCA file and reset X to the values stored in .raw
# so that all datasets enter the integration from the same representation.
cancer_ad_list = []
for h5 in all_h5_files:
    # Skip integrated outputs and files of other cancer types.
    if 'ntegrated' in h5 or 'BRCA' not in h5:
        continue
    print(h5)
    tmp_ad = sc.read_h5ad(data_dir+h5)

    # This project keeps its cell identifiers in the 'Cell' column.
    if '2102-Breastcancer' in h5:
        tmp_ad.obs_names = tmp_ad.obs['Cell']

    # Datasets indexed by Ensembl ids are switched to the names in
    # var.feature_name, for both the main and the raw gene tables.
    if tmp_ad.var_names[0].startswith('ENSG'):
        tmp_ad.var_names = tmp_ad.var.feature_name
        tmp_ad.raw.var.index = tmp_ad.var.feature_name

    cancer_ad_list.append(tmp_ad)
    tmp_ad.X = tmp_ad.raw.X.copy()
    print(cancer_ad_list[-1])
    display(tmp_ad.to_df())

In [None]:
# First pass: print the shape of X and of .raw for every loaded dataset.
for ad in cancer_ad_list:
    print(ad.shape)
    print(ad.raw.shape)

# Second pass: flag datasets whose raw gene names are not unique.
for i, ad in enumerate(cancer_ad_list):
    if ad.raw is None or ad.raw.var_names.is_unique:
        continue
    print(f"AnnData {i} has duplicate raw.var_names!")
    print(ad.obs['Project_ID'])


In [None]:
memory_usgae()

In [None]:
combined_ad = ann.concat(cancer_ad_list, join="inner", axis=0)
combined_ad

In [None]:
combined_ad.raw.shape

In [None]:
combined_ad = reprocess_all(combined_ad)

In [None]:
combined_ad

In [None]:
combined_ad.obs["Final_histological_subtype"].value_counts()

In [None]:
combined_ad.obs["Final_histological_subtype_backup"] = combined_ad.obs["Final_histological_subtype"].copy()


In [None]:
def unify_histological_subtype(subtype):
    """Map a free-text histological subtype to a harmonised label.

    Matching is case-insensitive and ignores surrounding whitespace. Missing
    values are recognised first; the keyword rules are then tried in order and
    the first hit wins, which is why the mixed ductal/lobular rule precedes
    the single-type rules.

    Parameters
    ----------
    subtype : object
        Raw annotation. It is converted with ``str()``, so NaN and None are
        accepted.

    Returns
    -------
    str
        A "BRCA: ..." label when a rule matches, "BRCA: Unspecified" for
        missing-value markers (empty, NaN, None, <NA>, NA, N/A, unknown),
        otherwise the stripped input with only its first letter capitalised.
    """
    label = str(subtype).strip().lower()

    # Missing-value markers, using the same notion of "missing" as the
    # molecular-subtype mapper ("unknown", "none"), compared after stripping.
    if label in {"", "nan", "none", "<na>", "na", "n/a", "unknown"}:
        return "BRCA: Unspecified"

    if "ductal" in label and "lobular" in label:
        return "BRCA: Mixed ductal/lobular carcinoma"
    if "ibc" in label:
        return "BRCA: Invasive breast carcinoma"
    # "innvasive" is a misspelling that occurs in the source annotations.
    if "invasive ductal" in label or "innvasive ductal" in label:
        return "BRCA: Invasive ductal carcinoma"
    if "lobular" in label or "ilc" in label:
        return "BRCA: Invasive lobular carcinoma"
    if "mucinous" in label:
        return "BRCA: Invasive mucinous carcinoma"
    if "metaplastic" in label:
        return "BRCA: Metaplastic carcinoma"
    if "apocrine" in label:
        return "BRCA: Invasive apocrine carcinoma"
    if "malignant neoplasm" in label:
        return "BRCA: Malignant neoplasm (unspecified)"
    if "breast carcinoma" in label:
        return "BRCA: Breast carcinoma (unspecified)"
    if "metastatic" in label:
        return "BRCA: Metastatic carcinoma"

    # Unrecognised values are kept, only normalised in case.
    return label.capitalize()

combined_ad.obs["Final_histological_subtype"] = combined_ad.obs["Final_histological_subtype_backup"].apply(unify_histological_subtype)
combined_ad.obs['Final_histological_subtype'].value_counts()

In [None]:
combined_ad.obs['Final_molecular_subtype'].value_counts()

In [None]:
combined_ad.obs["Final_molecular_subtype_backup"] = combined_ad.obs["Final_molecular_subtype"].copy()


In [None]:
# Lookup table from normalised raw labels (stripped, lower-case) to the
# harmonised molecular subtype names.
_MOLECULAR_SUBTYPE_LABELS = {
    # triple negative
    "tnbc": "BRCA: Triple Negative",
    "triple negative": "BRCA: Triple Negative",
    "er-/pr-/her2-": "BRCA: Triple Negative",
    # HER2 enriched
    "her2": "BRCA: HER2+",
    "her2+": "BRCA: HER2+",
    "her2 positive": "BRCA: HER2+",
    "er-/pr-/her2+": "BRCA: HER2+",
    # hormone-receptor positive, HER2 negative
    "er+/pr+/her2-": "BRCA: ER+/PR+/HER2-",
    "luminal a-like": "BRCA: ER+/PR+/HER2-",
    # hormone-receptor positive, HER2 positive
    "luminal b-like": "BRCA: ER+/PR+/HER2+",
    "luminal-her2+": "BRCA: ER+/PR+/HER2+",
    "er+/pr+/her2+": "BRCA: ER+/PR+/HER2+",
    "her2+/er+": "BRCA: ER+/PR+/HER2+",
    # ER positive, remaining receptors unknown or negative
    "er+/pr+/": "BRCA: ER+",
    "er+": "BRCA: ER+",
    "er+/pr-/her2-": "BRCA: ER+",
    # missing-value markers; "n/a", "na" and "<na>" are included so that an
    # 'N/A' annotation is treated the same way as for the histological subtype
    "": "BRCA: Unspecified",
    "nan": "BRCA: Unspecified",
    "none": "BRCA: Unspecified",
    "<na>": "BRCA: Unspecified",
    "na": "BRCA: Unspecified",
    "n/a": "BRCA: Unspecified",
    "unknown": "BRCA: Unspecified",
}


def unify_molecular_subtype(val):
    """Map a raw molecular subtype annotation to a harmonised label.

    Parameters
    ----------
    val : object
        Raw annotation. It is converted with ``str()``, stripped and
        lower-cased before the lookup, so NaN and None are accepted.

    Returns
    -------
    str
        The "BRCA: ..." label from the lookup table, or the normalised input
        with only its first letter capitalised when no entry exists.
    """
    key = str(val).strip().lower()
    return _MOLECULAR_SUBTYPE_LABELS.get(key, key.capitalize())

# Apply mapping
combined_ad.obs['Final_molecular_subtype'] = combined_ad.obs['Final_molecular_subtype_backup'].apply(unify_molecular_subtype)

# Optional: check new values
print(combined_ad.obs['Final_molecular_subtype'].value_counts())


In [None]:
combined_ad.obs['Final_tissue'] .value_counts()

In [None]:
combined_ad.obs['Final_tissue'] = combined_ad.obs['Final_tissue'].astype(str).str.capitalize()

In [None]:
combined_ad.obs['Final_tissue'] .value_counts()

In [None]:
combined_ad.obs["Final_patient_age_backup"] = combined_ad.obs["Final_patient_age"]


In [None]:

def clean_patient_age(age):
    """Convert a raw age annotation to an integer number of years.

    Parameters
    ----------
    age : object
        A number, a numeric string ("63", "63.0"), a range such as "46-50",
        the word "unknown" (any case) or a missing value.

    Returns
    -------
    int or float
        The age truncated to an integer. Ranges are replaced by the truncated
        midpoint of their first two bounds. ``np.nan`` is returned for missing,
        "unknown" or unparseable input.
    """
    if pd.isna(age):
        return np.nan

    text = str(age).strip()
    if text.lower() == "unknown":
        return np.nan

    try:
        if "-" in text:
            # Convert age ranges like '46-50' to their midpoint
            low, high = text.split("-")[:2]
            return int((float(low) + float(high)) / 2)
        # Parse through float so values that became floats (52.0 / "52.0",
        # e.g. a numeric column that also holds NaN) are not discarded.
        return int(float(text))
    except (ValueError, OverflowError):
        # ValueError: not a number; OverflowError: int() of an infinite value.
        return np.nan

# Apply cleaning
combined_ad.obs["Final_patient_age"] = combined_ad.obs["Final_patient_age_backup"].apply(clean_patient_age)


In [None]:
combined_ad

In [None]:
# One UMAP per harmonised annotation column (embedding before Harmony).
umap_color_keys = (
    'Project_ID',
    'Primary_or_Metastatic',
    'Final_cancer_type',
    'Final_histological_subtype',
    'Final_molecular_subtype',
    'Final_tissue',
    'Final_patient_age',
    'Final_patient_stage',
    'Final_patient_treatment',
)
for obs in umap_color_keys:
    sc.pl.umap(combined_ad, color=obs)

### Harmony integration

In [None]:
combined_ad

In [None]:
Z = harmonize(combined_ad.obsm['X_pca'], combined_ad.obs, batch_key = ['Project_ID'])


In [None]:
combined_ad.obsm['X_pca_harmony'] = Z


In [None]:
sc.pp.neighbors(combined_ad, n_neighbors=15, use_rep='X_pca_harmony')
sc.tl.umap(combined_ad)

In [None]:
# One UMAP per harmonised annotation column (embedding recomputed on X_pca_harmony).
harmony_color_keys = (
    'Project_ID',
    'Primary_or_Metastatic',
    'Final_cancer_type',
    'Final_histological_subtype',
    'Final_molecular_subtype',
    'Final_tissue',
    'Final_patient_age',
    'Final_patient_stage',
    'Final_patient_treatment',
)
for obs in harmony_color_keys:
    sc.pl.umap(combined_ad, color=obs)

In [None]:
combined_ad.obs['Final_patient_age_backup'] = combined_ad.obs['Final_patient_age_backup'].astype(str)


In [None]:

combined_ad.write_h5ad('./Data/Cancer_cell_data/BRCA_integrated.harmony.h5ad', compression='gzip')
