# Load libraries

In [None]:
import scanpy as sc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn as skl
import anndata as ann
import random, os
from scipy.stats import pearsonr as pr
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score as f1
from sklearn.metrics import precision_recall_curve as prc
from sklearn.metrics import silhouette_score as sil
from sklearn.metrics import auc
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import precision_score, recall_score, average_precision_score
from sklearn.metrics import silhouette_score
from torch_geometric.nn import TransformerConv
from torch_geometric.data import Data
import psutil
import os, sys
import gc
import scipy.sparse as sp
from harmony import harmonize
from tqdm import tqdm
import h5py

In [None]:
sc.set_figure_params(dpi=200)

# General processing functions

In [None]:
def whats_memory_eater():
    """Print the ten largest garbage-collector-tracked objects.

    Sizes come from ``sys.getsizeof`` and are therefore shallow: the memory of
    objects referenced by a container is not included. Objects that are bound
    to a module-level (global) name are reported with that name; all others
    are reported with ``Name: None``.

    Returns:
    - None; the report is printed to stdout.
    """
    # Reverse map of object id -> variable name, built from this module's globals.
    name_map = {id(obj): name for name, obj in globals().items()}

    # Snapshot the tracked objects before allocating the result list.
    all_objects = gc.get_objects()

    sized = []
    for obj in all_objects:
        try:
            sized.append((sys.getsizeof(obj), obj))
        except Exception:
            # Some objects raise from __sizeof__; they cannot be ranked.
            continue

    sized.sort(reverse=True, key=lambda pair: pair[0])

    # repr() can be very expensive on large objects, so it is only computed
    # for the ten entries that are actually printed.
    for size, obj in sized[:10]:
        try:
            preview = repr(obj)[:100]
        except Exception:
            preview = "<repr failed>"
        print(f"Size: {size / 1024**3} GB | Type: {type(obj)} | Name: {name_map.get(id(obj))} | Object: {preview}")


In [None]:
def memory_usage():
    """Print the resident set size (RSS) of the current process.

    A garbage-collection pass is run first so that collectable objects do not
    inflate the reported number.

    Returns:
    - None; the value is printed to stdout.
    """
    gc.collect()
    process = psutil.Process(os.getpid())
    memory_gb = process.memory_info().rss / 1024**3  # bytes -> 2**30-byte units

    print(f"Current memory usage: {memory_gb:.2f} GB")


# Backward-compatible alias: the helper was originally defined under this
# misspelled name and existing notebook cells still call it.
memory_usgae = memory_usage

In [None]:
def pca_and_umap(adata):
    """Run PCA, the neighbor graph and UMAP on ``adata`` with scanpy.

    PCA uses the ARPACK solver; the neighbor graph and UMAP use scanpy's
    default settings. Nothing is returned: the scanpy calls are made directly
    on ``adata``.
    """
    sc.tl.pca(adata, svd_solver="arpack")
    sc.pp.neighbors(adata)
    sc.tl.umap(adata)

In [None]:
def _find_project_files(base_path, file_prefix=None):
    """Locate the matrix, barcode, feature and metadata files in ``base_path``.

    Returns a tuple ``(mtx_file, barcodes_file, features_file, metadata_file)``
    of full paths; ``metadata_file`` is None when no metadata file exists.
    Raises FileNotFoundError when the matrix, barcode or feature file is missing.
    """
    files = os.listdir(base_path)
    found = {'mtx': None, 'barcodes': None, 'features': None, 'metadata': None}

    if file_prefix is None:
        def first_match(predicate):
            return next((f for f in files if predicate(f)), None)

        found['mtx'] = first_match(lambda f: f.endswith('.mtx'))
        found['barcodes'] = first_match(lambda f: 'barcode' in f)
        # Prefer a "feature" file and fall back to the "genes" naming.
        found['features'] = (first_match(lambda f: 'feature' in f)
                             or first_match(lambda f: 'genes' in f))
        found['metadata'] = first_match(lambda f: 'meta' in f)
    else:
        # Only files carrying the prefix are considered; each file is assigned
        # to exactly one role and a later match replaces an earlier one.
        for f in files:
            if not f.startswith(file_prefix):
                continue
            if f.endswith('mtx'):
                found['mtx'] = f
            elif 'barcode' in f:
                found['barcodes'] = f
            elif 'feature' in f or 'genes' in f:
                found['features'] = f
            elif 'meta' in f:
                found['metadata'] = f

    missing = [role for role in ('mtx', 'barcodes', 'features') if found[role] is None]
    if missing:
        prefix_note = '' if file_prefix is None else f" with prefix {file_prefix!r}"
        raise FileNotFoundError(
            f"Could not find {', '.join(missing)} file(s) in {base_path!r}{prefix_note}"
        )

    paths = []
    for role in ('mtx', 'barcodes', 'features', 'metadata'):
        path = None if found[role] is None else os.path.join(base_path, found[role])
        print(path)
        paths.append(path)
    return tuple(paths)


def _read_table(path):
    """Read a barcode/feature table: header-less for .tsv, header row otherwise."""
    if path.endswith('tsv'):
        return pd.read_csv(path, sep='\t', header=None)
    return pd.read_csv(path)


def _names_from_table(table):
    """Return identifiers as strings: the second column if present, else the first."""
    column = 1 if table.shape[1] > 1 else 0
    return table.iloc[:, column].astype(str).values


def _attach_metadata(adata, metadata_file, metadata_idx_key):
    """Left-join the metadata table onto ``adata.obs`` (best effort, never raises on bad metadata)."""
    if metadata_file is None:
        print("No metadata file found; continuing without metadata.")
        return
    if metadata_file.endswith('tsv'):
        sep = '\t'
    elif metadata_file.endswith('csv'):
        sep = ','
    else:
        print(f"Warning: unsupported metadata file type, skipping: {metadata_file}")
        return

    try:
        metadata = pd.read_csv(metadata_file, sep=sep)
        metadata.index = metadata[metadata_idx_key]
        adata.obs = adata.obs.join(metadata, how='left')
    except (KeyError, ValueError, TypeError, OSError) as err:
        # Metadata is optional: report the problem instead of hiding it,
        # then continue with the unannotated object.
        print(f"Warning: could not merge metadata from {metadata_file}: {err!r}")


def load_and_preprocess_project(base_path, Project_ID, metadata_idx_key='Cell', Primary_or_Metastatic = 'Primary', further_pre = False, file_prefix= None):
    """
    Load a single scRNA-seq project from a matrix-market directory, apply
    standard cell filtering and optionally normalize and embed it.

    Assumes the base_path contains:
        - One .mtx file (count matrix, stored genes x cells; it is transposed on load)
        - One barcode file (name contains 'barcode')
        - One feature file (name contains 'feature' or 'genes')
        - Optionally one metadata file (name contains 'meta', .csv or .tsv)

    Parameters:
    - base_path: str, directory holding the project files
    - Project_ID: value written to adata.obs['Project_ID']
    - metadata_idx_key: str, metadata column used as index when joining onto adata.obs
    - Primary_or_Metastatic: value written to adata.obs['Primary_or_Metastatic']
    - further_pre: bool, when True also run normalize_total (target_sum=1e4), log1p,
      scale (max_value=10), PCA, neighbors (15 neighbors, 40 PCs) and UMAP
    - file_prefix: str or None, when given only files starting with this prefix are used

    Returns:
    - AnnData restricted to cells with 200-5000 detected genes and at most 20%
      mitochondrial counts; the filtered counts are stored in .raw before any
      normalization

    Raises:
    - FileNotFoundError: if the matrix, barcode or feature file cannot be found
    """
    mtx_file, barcodes_file, features_file, metadata_file = _find_project_files(base_path, file_prefix)
    print(f"Loading: {mtx_file}")

    # Load matrix
    adata = sc.read_mtx(mtx_file)
    adata = adata.transpose()  # Important: make cells as rows, genes as columns

    # Load barcodes and features.
    # NOTE: `display` is not imported in this file; presumably it is supplied by
    # the IPython/Jupyter session this notebook runs in.
    barcodes = _read_table(barcodes_file)
    display(barcodes)
    genes = _read_table(features_file)
    display(genes)

    # Assign barcodes and gene names (converted to string)
    adata.obs_names = _names_from_table(barcodes)
    adata.var_names = _names_from_table(genes)
    display(adata.to_df())

    # Load and merge metadata
    _attach_metadata(adata, metadata_file, metadata_idx_key)

    # ======== Standard preprocessing ========

    # Calculate QC metrics
    adata.var['mt'] = adata.var_names.str.startswith('MT-')
    sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)

    # Standard cell filtering
    n_genes = adata.obs['n_genes_by_counts']
    passes_qc = (n_genes >= 200) & (n_genes <= 5000) & (adata.obs['pct_counts_mt'] <= 20)
    adata = adata[passes_qc].copy()

    adata.obs['Project_ID'] = Project_ID
    adata.obs['Primary_or_Metastatic'] = Primary_or_Metastatic

    # Keep the filtered, un-normalized counts in .raw
    adata.raw = adata.copy()

    if further_pre:
        # Normalize and log transform
        sc.pp.normalize_total(adata, target_sum=1e4)
        sc.pp.log1p(adata)

        # Scale (no highly-variable-gene selection is applied)
        sc.pp.scale(adata, max_value=10)

        # PCA
        sc.tl.pca(adata, svd_solver='arpack')

        # Neighbors
        sc.pp.neighbors(adata, n_neighbors=15, n_pcs=40)

        # UMAP
        sc.tl.umap(adata)

    print(f"Finished processing {Project_ID}. Shape: {adata.shape}")

    return adata


In [None]:
def filter_and_recompute(adata, celltype_col, celltypes_to_keep, further_pre = False):
    """
    Filters an AnnData object to keep only specified cell types and,
    optionally, recalculates PCA, neighbors, and UMAP on the subset.

    Parameters:
    - adata: AnnData object
    - celltype_col: str, the column in adata.obs containing cell type annotations
    - celltypes_to_keep: list of str, the cell types you want to keep
      (a single string is treated as a one-element list)
    - further_pre: bool, when True recompute PCA, neighbors (15 neighbors, 40 PCs)
      and UMAP on the filtered object; assumes the data is already normalized
      and scaled

    Returns:
    - a filtered copy of the AnnData object (the input is not modified)
    """
    # A bare string would otherwise be split into its characters by set().
    if isinstance(celltypes_to_keep, str):
        celltypes_to_keep = [celltypes_to_keep]

    # Step 1: Filter cells
    print(f"Original shape: {adata.shape}")
    keep_mask = adata.obs[celltype_col].isin(set(celltypes_to_keep))
    adata_filtered = adata[keep_mask].copy()
    print(f"Filtered shape: {adata_filtered.shape}")

    # Step 2: Recalculate PCA and UMAP only when requested
    if further_pre:
        sc.tl.pca(adata_filtered, svd_solver='arpack')
        sc.pp.neighbors(adata_filtered, n_neighbors=15, n_pcs=40)
        sc.tl.umap(adata_filtered)
        # Only reported when the embedding was actually recomputed.
        print("Recalculated PCA and UMAP.")

    return adata_filtered

In [None]:
def recompute_pca_and_umap(adata):
    """
    Recalculates PCA, neighbors, and UMAP for an AnnData object.
    No cells are filtered.

    Parameters:
    - adata: AnnData object (presumably already normalized; this is not checked here)

    Returns:
    - the same AnnData object that was passed in
    """
    # PCA with the ARPACK solver, then a 15-neighbor graph on 40 PCs, then UMAP.
    sc.tl.pca(adata, svd_solver='arpack')
    sc.pp.neighbors(adata, n_pcs=40, n_neighbors=15)
    sc.tl.umap(adata)
    print("Recalculated PCA and UMAP.")

    return adata

In [None]:
def reprocess_from_raw_layer(adata, Project_ID, Primary_or_Metastatic = 'Primary', further_pre = False):
    """
    Reset an AnnData object (e.g., from a published .h5ad) to its .raw counts,
    apply standard cell filtering and tag the cells with project labels.

    Parameters:
    - adata: AnnData object, must have .raw set; its X, var and var_names are
      overwritten with the raw values before filtering
    - Project_ID: value written to adata.obs['Project_ID']
    - Primary_or_Metastatic: value written to adata.obs['Primary_or_Metastatic']
    - further_pre: bool, when True also run normalize_total (target_sum=1e4),
      log1p, scale (max_value=10), PCA, neighbors (15 neighbors, 40 PCs) and
      UMAP; no highly-variable-gene selection is performed

    Returns:
    - a filtered copy of the AnnData object (cells with 200-5000 detected genes
      and at most 20% mitochondrial counts)

    Raises:
    - ValueError: if adata.raw is None
    """
    raw = adata.raw
    if raw is None:
        raise ValueError("AnnData object has no .raw attribute. Cannot proceed with reprocessing.")

    # Put the raw counts and their gene annotation back into the main slots.
    adata.X = raw.X.copy()
    adata.var = raw.var.copy()
    adata.var_names = raw.var_names.copy()

    # Mitochondrial genes are matched case-insensitively on the "MT-" prefix.
    adata.var['mt'] = adata.var_names.str.upper().str.startswith('MT-')
    sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)

    # Standard cell filtering
    n_genes = adata.obs['n_genes_by_counts']
    passes_qc = (n_genes >= 200) & (n_genes <= 5000) & (adata.obs['pct_counts_mt'] <= 20)
    adata = adata[passes_qc].copy()

    adata.obs['Project_ID'] = Project_ID
    adata.obs['Primary_or_Metastatic'] = Primary_or_Metastatic

    if further_pre:
        # Normalize, log transform and scale
        sc.pp.normalize_total(adata, target_sum=1e4)
        sc.pp.log1p(adata)
        sc.pp.scale(adata, max_value=10)

        # Embed: PCA -> neighbors -> UMAP
        sc.tl.pca(adata, svd_solver='arpack')
        sc.pp.neighbors(adata, n_neighbors=15, n_pcs=40)
        sc.tl.umap(adata)

    print(f"Reprocessed dataset. Final shape: {adata.shape}")
    return adata


In [None]:
def reprocess_all(adata, further_pre = True):
    """
    Reset an AnnData object to its .raw counts, apply standard cell filtering
    and, optionally, normalize and embed it.

    Parameters:
    - adata: AnnData object, must have .raw set; its X, var and var_names are
      overwritten with the raw values before filtering
    - further_pre: bool, when True run normalize_total (target_sum=1e4), log1p,
      PCA without zero-centering, neighbors (15 neighbors, 40 PCs) and UMAP;
      highly-variable-gene selection and scaling are not performed

    Returns:
    - a filtered copy of the AnnData object (cells with 200-5000 detected genes
      and at most 20% mitochondrial counts)

    Raises:
    - ValueError: if adata.raw is None
    """
    raw = adata.raw
    if raw is None:
        raise ValueError("AnnData object has no .raw attribute. Cannot proceed with reprocessing.")

    # Put the raw counts and their gene annotation back into the main slots.
    adata.X = raw.X.copy()
    adata.var = raw.var.copy()
    adata.var_names = raw.var_names.copy()

    # Mitochondrial genes are matched case-insensitively on the "MT-" prefix.
    adata.var['mt'] = adata.var_names.str.upper().str.startswith('MT-')
    sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)

    print('Standard filtering...')
    n_genes = adata.obs['n_genes_by_counts']
    passes_qc = (n_genes >= 200) & (n_genes <= 5000) & (adata.obs['pct_counts_mt'] <= 20)
    adata = adata[passes_qc].copy()

    if further_pre:
        print('Normalizing...')
        sc.pp.normalize_total(adata, target_sum=1e4)
        sc.pp.log1p(adata)

        # HVG selection (e.g. flavor="seurat_v3", n_top_genes=2000,
        # batch_key="Final_sample_id") and scaling are deliberately left out;
        # PCA therefore runs on all genes, without zero-centering.
        print('Computing PCA...')
        sc.tl.pca(adata, zero_center=False)

        print('Computing neighbors...')
        sc.pp.neighbors(adata, n_neighbors=15, n_pcs=40)

        print('Computing UMAP...')
        sc.tl.umap(adata)

    print(f"Reprocessed dataset. Final shape: {adata.shape}")
    return adata


# Lung Cancer (LUCA)

## High-resolution single-cell atlas reveals diversity and plasticity of tissue-resident neutrophils in non-small cell lung cancer

Paper: https://www.sciencedirect.com/science/article/pii/S1535610822004998?via%3Dihub

Data downloaded from: https://cellxgene.cziscience.com/collections/edb893ee-4066-4128-9aec-5eb2b03f8287

Link: 
- Anndata (extended): https://datasets.cellxgene.cziscience.com/80b57568-2621-4911-b4b1-4f2cf5087962.h5ad
- Anndata (core): https://datasets.cellxgene.cziscience.com/f27535dc-7902-456d-b94f-024dfe8791c0.h5ad


In [None]:
memory_usgae()

In [None]:
ad = sc.read_h5ad('./Data/LUAD/High-resolution_single-cell_atlas.extended.h5ad')
ad

In [None]:
ad.obs.cell_type.value_counts()

In [None]:
ad = ad[ad.obs.disease!= 'normal']
ad

In [None]:
ad = filter_and_recompute(ad, 
                          celltype_col='cell_type', 
                          celltypes_to_keep=['malignant cell'],
                          further_pre=True)
ad

In [None]:
# sc.pl.umap(ad, color='is_primary_data')
sc.pl.umap(ad, color='disease')

sc.pl.umap(ad, color='tissue')
sc.pl.umap(ad, color='cell_type_tumor')
sc.pl.umap(ad, color='study')

In [None]:
ad.obs['study'].value_counts()

In [None]:
ad.obs['uicc_stage'].value_counts()

In [None]:
ad[ad.obs.uicc_stage == 'III'].obs['study'].value_counts()

In [None]:
# Slicing returns a view; copy it so that the columns added to ad.obs in the
# following cells are written to a real object rather than to a view.
ad = ad[ad.obs.study != 'Wu_Zhou_2021'].copy()

In [None]:
# Classify every cell in one vectorized pass (row-wise iteration over obs is
# very slow on large tables). Rules are checked in order, first match wins:
#   1. tissue other than 'lung'      -> 'Metastatic'
#   2. stage text contains 'IV'      -> 'Metastatic'
#   3. stage is exactly 'III'        -> 'Locally advanced'
#   4. everything else               -> 'Primary'
tissue = ad.obs['tissue'].astype(str)
stage = ad.obs['uicc_stage'].astype(str)

primary_or_metastatic = np.select(
    [
        tissue != 'lung',
        stage.str.contains('IV', regex=False),
        stage == 'III',
    ],
    ['Metastatic', 'Metastatic', 'Locally advanced'],
    default='Primary',
)

ad.obs['Primary_or_Metastatic'] = primary_or_metastatic


In [None]:
def assign_lung_subtype(row):
    """Return the LUCA molecular-subtype label for one row of mutation flags.

    The mutation columns are checked in a fixed priority order and the first
    one whose value equals "mutated" decides the label. TP53 is checked last
    and is reported as "no driver"; rows without any mutated column are
    labelled "LUCA: Unspecified".
    """
    priority = (
        ("EGFR_mutation", "LUCA: EGFR-mutant"),
        ("KRAS_mutation", "LUCA: KRAS-mutant"),
        ("ALK_mutation", "LUCA: ALK-rearranged"),
        ("ROS_mutation", "LUCA: ROS1-rearranged"),
        ("BRAF_mutation", "LUCA: BRAF-mutant"),
        ("ERBB2_mutation", "LUCA: ERBB2-mutant"),
        ("TP53_mutation", "LUCA: TP53-mutant (no driver)"),
    )
    for column, label in priority:
        if row[column] == "mutated":
            return label
    return "LUCA: Unspecified"

In [None]:
ad.obs['Project_ID'] = ad.obs['study']

ad.obs['Final_cancer_type']  = 'Lung Cancer'
# The histological subtype is the third space-separated token of
# 'cell_type_tumor'. Using .str[2] gives NaN for missing or shorter labels
# instead of raising on them.
ad.obs['Final_histological_subtype'] = ad.obs['cell_type_tumor'].str.split(' ').str[2]
ad.obs['Final_molecular_subtype'] = ad.obs.apply(assign_lung_subtype, axis=1)
ad.obs['Final_tissue'] = ad.obs['tissue'].astype(str).str.capitalize()
ad.obs['Final_sample_id'] = ad.obs.donor_id

In [None]:
ad.obs['development_stage']

In [None]:
import numpy as np

def parse_age(val):
    """Extract an integer age from a free-text value.

    The value is stringified, stripped and lower-cased. "unknown" and the
    empty string give NaN. If the text contains a dash, only the part before
    the first dash is parsed (so "65-year-old ..." gives 65). Anything that is
    not an integer literal gives NaN.
    """
    text = str(val).strip().lower()
    if text in ("unknown", ""):
        return np.nan

    candidate = text.split("-")[0] if "-" in text else text
    try:
        return int(candidate)
    except ValueError:
        return np.nan

ad.obs['Final_patient_age'] = ad.obs['development_stage'].apply(parse_age)


In [None]:
# add more clinical information
ad.obs['Final_patient_stage'] = ad.obs['uicc_stage']
# ad.obs['Final_patient_treatment'] = ad.obs['Treatment most recent class']+' '+ad.obs['Treatment status']

In [None]:
ad.obs['Project_ID'].value_counts()

In [None]:
# add more clinical information

ad.obs['Final_patient_treatment'] = 'Naïve'

In [None]:
ad.write_h5ad('./Data/Cancer_cell_data/High-resolution_single-cell_atlas.LUCA.h5ad', compression='gzip')

## Signatures of plasticity, metastasis, and immunosuppression in an atlas of human small cell lung cancer

Paper: https://www.sciencedirect.com/science/article/pii/S1535610821004979?via%3Dihub

Data downloaded from: https://cellxgene.cziscience.com/collections/62e8f058-9c37-48bc-9200-e767f318a8ec

Link: 
- Anndata: https://datasets.cellxgene.cziscience.com/b9175586-5875-4346-afa2-5f23e15fe16d.h5ad

In [None]:
ad = sc.read_h5ad('./Data/LUAD/HTAN_MSK_SCLC.h5ad')

In [None]:
ad

In [None]:
# re-process the adata
ad = reprocess_from_raw_layer(ad, 
                              Project_ID='HTAN_MSK_SCLC', 
                              Primary_or_Metastatic='Primary',
                              further_pre=False)

In [None]:
for obs in ['batch', 'cell_type_coarse', 'cell_type_fine', 'cell_type_general']:
    sc.pl.umap(ad, color=obs)

In [None]:
ad = filter_and_recompute(ad, 
                          celltype_col='cell_type_fine', 
                          celltypes_to_keep=['SCLC-A', 'SCLC-N', 'SCLC-P'],
                          further_pre=True)
ad

In [None]:
for obs in ['batch', 'disease', 'histo']:
    sc.pl.umap(ad, color=obs)

In [None]:
patient_clinical_df = pd.read_csv('./Data/LUAD/HTAN_MSK_SCLC.clinical.txt', sep='\t')
# patient_clinical_df = patient_clinical_df[patient_clinical_df['Profiling method'] == 'scRNAseq']
patient_clinical_df['Lab ID'] = patient_clinical_df['Lab ID'].str.upper()
patient_clinical_df.head()

In [None]:
ad.obs['donor_id'] = ad.obs['donor_id'].str.upper()

In [None]:
# Step 1: Left-merge the clinical table onto the per-cell obs table.
# DataFrame.merge discards the index, so the cell barcodes are kept aside and
# restored afterwards; this does not depend on what the obs index is called.
obs_index = ad.obs.index
merged = ad.obs.merge(
    patient_clinical_df,
    left_on='donor_id',
    right_on='Lab ID',
    how='left'
)

# A left merge keeps one row per cell only if each donor matches at most one
# clinical row; fail with a clear message otherwise.
if len(merged) != len(obs_index):
    raise ValueError(
        f"Clinical merge changed the number of cells from {len(obs_index)} to {len(merged)}; "
        "'Lab ID' is probably duplicated in the clinical table."
    )

# Step 2: Restore original index (cell barcodes); a left merge preserves row order
merged.index = obs_index

# Step 3: Assign back
ad.obs = merged
ad

In [None]:
ad.obs[ad.obs.tissue == 'lung']['Stage at Dx'].value_counts()

In [None]:
# Cells sampled from the lung are labelled 'Primary'; every other tissue is
# labelled 'Metastatic'.
primary_or_metastatic = [
    'Primary' if site == 'lung' else 'Metastatic'
    for site in ad.obs.tissue
]
ad.obs['Primary_or_Metastatic'] = primary_or_metastatic

ad.obs['Project_ID'] = 'HTAN_MSK_SCLC'

ad.obs['Final_cancer_type']  = 'Lung Cancer'
ad.obs['Final_histological_subtype'] = 'SCLC'
ad.obs['Final_molecular_subtype'] = 'LUCA: Unspecified'
ad.obs['Final_tissue'] = ad.obs['tissue'].astype(str).str.capitalize()
ad.obs['Final_sample_id'] = ad.obs.donor_id

ad

In [None]:
def parse_age(val):
    """Extract an integer age from a free-text value, or NaN if there is none.

    Only the text before the first dash is parsed, after stripping and
    lower-casing; "unknown", the empty string and non-integer text give NaN.
    """
    cleaned = str(val).strip().lower()
    if not cleaned or cleaned == "unknown":
        return np.nan

    # partition() returns the whole string as head when there is no dash.
    head, _, _ = cleaned.partition("-")
    try:
        return int(head)
    except ValueError:
        return np.nan

ad.obs['Final_patient_age'] = ad.obs['development_stage'].apply(parse_age)


In [None]:
# add more clinical information
# ad.obs['Final_patient_age'] = 'Unknown'
ad.obs['Final_patient_stage'] = ad.obs['Stage at Dx']
ad.obs['Final_patient_treatment'] = ad.obs['treatment']

In [None]:
ad.write_h5ad('./Data/Cancer_cell_data/Signature_atlas_SCLC.LUCA.h5ad', compression='gzip')

In [None]:
ad

## A pan-cancer blueprint of the heterogeneous tumor microenvironment revealed by single-cell profiling


Paper: https://www.nature.com/articles/s41422-020-0355-0#Fig3

Data downloaded from: https://lambrechtslab.sites.vib.be/en/pan-cancer-blueprint-tumour-microenvironment-0

Link: 
- Matrix: https://lambrechtslab.sites.vib.be/en/pan-cancer-blueprint-tumour-microenvironment-0 (Lung cancer - Counts Matrix)
- Patient metadata: https://static-content.springer.com/esm/art%3A10.1038%2Fs41422-020-0355-0/MediaObjects/41422_2020_355_MOESM13_ESM.pdf
- Sequencing quality: https://www.nature.com/
https://static-content.springer.com/esm/art%3A10.1038%2Fs41422-020-0355-0/MediaObjects/41422_2020_355_MOESM14_ESM.pdf

In [None]:
# Set the project directory
project_dir = "./Data/LUAD/2096-Lungcancer/"  # <- change this for each project

# Load and preprocess
ad = load_and_preprocess_project(project_dir, 
                                 Project_ID='2096-Lungcancer', 
                                 Primary_or_Metastatic='Primary',
                                 further_pre=True)

In [None]:
ad.raw.shape

In [None]:
ad.obs['PatientNumber'] = ad.obs['PatientNumber'].astype(str)

In [None]:
sc.pl.umap(ad, color=['CellType'])
sc.pl.umap(ad, color=['PatientNumber'])
# sc.pl.umap(ad, color=['CellFromTumor'])
# sc.pl.umap(ad, color=['TumorSite'])
# sc.pl.umap(ad, color=['Project'])


In [None]:
ad = filter_and_recompute(adata=ad, 
                          celltype_col='CellType', 
                          celltypes_to_keep=['Cancer'],
                          further_pre=True)
ad

In [None]:
sc.pl.umap(ad, color=['TumorType', 'PatientNumber'])


In [None]:
patient_cell_number = pd.read_csv("./Data/LUAD/2096-Lungcancer/2097-Lungcancer_metadata.csv")['PatientNumber'].value_counts()
patient_cell_number = patient_cell_number.to_dict()
patient_cell_number

In [None]:
pd.read_csv('./Data/BRCA/2102-Breastcancer/Sequencing_quality_S2.txt', sep='\t')['Cancer type'].value_counts()

In [None]:
patient_seuqncing_meta_df = pd.read_csv('./Data/BRCA/2102-Breastcancer/Sequencing_quality_S2.txt', sep='\t')
patient_seuqncing_meta_df = patient_seuqncing_meta_df[patient_seuqncing_meta_df['Cancer type'] == 'LC']
patient_seuqncing_meta_df

In [None]:
# Group by 'Patient number' and sum the 'Cells' column
cells_to_lc_label = patient_seuqncing_meta_df.groupby("Patient number")["Cells"].sum().to_dict()

# Flip the dict so it's {cell_sum: patient_id}
cells_to_lc_label = {v: k for k, v in cells_to_lc_label.items()}
cells_to_lc_label


In [None]:
# Match each patient number to its LC label through the patient's total cell
# count. Patients whose count has no entry in cells_to_lc_label are printed
# so they can be assigned by hand.
patient_number_to_LC_id = dict()
for patient_number, n_cells in patient_cell_number.items():
    if n_cells not in cells_to_lc_label:
        print(patient_number)
        continue
    patient_number_to_LC_id[str(patient_number)] = cells_to_lc_label[n_cells]
patient_number_to_LC_id

In [None]:
patient_number_to_LC_id['8'] = 'LC_8'

In [None]:
ad.obs['BC_PatientID'] = ad.obs['PatientNumber'].map(patient_number_to_LC_id)
ad.obs

In [None]:
patient_meta_df = pd.read_csv('./Data/BRCA/2102-Breastcancer/Patient_metadata_S1.txt', sep='\t')
patient_meta_df = patient_meta_df[patient_meta_df['Tumor_type'] == 'LC']
meta_subset = patient_meta_df
meta_subset

In [None]:
# DataFrame.merge returns a fresh 0..n-1 index, which would replace the cell
# barcodes in ad.obs. Keep the barcodes aside and restore them after the merge.
obs_index = ad.obs.index
merged_obs = ad.obs.merge(meta_subset, left_on='BC_PatientID', right_on='Patient_number', how='left')

# A left merge keeps one row per cell only if 'Patient_number' is unique.
if len(merged_obs) != len(obs_index):
    raise ValueError(
        f"Metadata merge changed the number of cells from {len(obs_index)} to {len(merged_obs)}; "
        "'Patient_number' is probably duplicated in the patient metadata."
    )

merged_obs.index = obs_index  # a left merge preserves the row order of ad.obs
ad.obs = merged_obs
ad.obs

In [None]:
ad

In [None]:
ad.obs = ad.obs.drop(columns=['Molecular_status'])


In [None]:
ad.obs['Final_cancer_type'] = 'Lung Cancer'
ad.obs['Final_histological_subtype'] = ad.obs.Pathological_subtype
ad.obs['Final_molecular_subtype'] = 'LUCA: Unspecified'
ad.obs['Final_tissue'] = 'Lung'
ad.obs['Final_sample_id'] = ad.obs['BC_PatientID']

In [None]:
ad

In [None]:
# add more clinical information
ad.obs['Final_patient_age'] = ad.obs['Age_range']
ad.obs['Final_patient_stage'] = ad.obs['TNM']
ad.obs['Final_patient_treatment'] = 'Naïve'

In [None]:
ad.raw.shape

In [None]:
ad

In [None]:
ad.write_h5ad('./Data/Cancer_cell_data/2096-Lungcancer.LUCA.h5ad', compression='gzip')

## Integrate the data

In [None]:
data_dir = './Data/Cancer_cell_data/'
all_h5_files = os.listdir(data_dir)
all_h5_files.sort()

all_h5_files

In [None]:
from collections import defaultdict

cancer_ad_list = []

for h5 in all_h5_files:
    # Only the per-study LUCA files are loaded; integrated outputs are skipped.
    if 'ntegrated' in h5 or 'LUCA' not in h5:
        continue

    print(h5)
    ad = sc.read_h5ad(data_dir + h5)

    # Use the 'Cell' column as cell names for the 2096 dataset; the other two
    # datasets keep their existing obs index.
    if '2096-Lungcancer' in h5:
        ad.obs_names = ad.obs['Cell']
    elif 'resolution_single' in h5 or '_SCLC' in h5:
        ad.obs_names = ad.obs.index

    # Fix .var_names: keep the part before the first underscore, taken from
    # 'feature_name' when the var index starts with an ENSG identifier.
    if ad.var_names[0].startswith('ENSG'):
        new_names = [i.split('_')[0] for i in ad.var.feature_name]
    elif 'ENSG' in ad.var_names[0]:
        new_names = [i.split('_')[0] for i in ad.var_names]
    else:
        new_names = list(ad.var_names)

    # Assign new names
    ad.var_names = new_names
    ad.var_names_make_unique()

    # Fix raw.var names
    # NOTE(review): this writes to the private Raw._var attribute and assumes
    # .raw has the same genes, in the same order, as .var.
    if ad.raw is not None:
        # Ensure uniqueness: repeated names get a "_<k>" suffix
        seen = defaultdict(int)
        unique_names = []
        for name in pd.Index(new_names).astype(str):
            if seen[name]:
                unique_names.append(f"{name}_{seen[name]}")
            else:
                unique_names.append(name)
            seen[name] += 1
        ad.raw._var.index = pd.Index(unique_names)

    # Clean obs + var
    ad.obs_names_make_unique()
    ad.var_names_make_unique()

    cancer_ad_list.append(ad)
    display(ad.to_df())

    # Densify the raw matrix once and reuse it for both the table and the
    # per-cell maxima, instead of building the dense DataFrame twice.
    raw_df = ad.raw.to_adata().to_df()
    display(raw_df)
    print(raw_df.max(axis=1))
    del raw_df


In [None]:
for ad in cancer_ad_list:
    print(ad.raw.shape)

In [None]:
memory_usgae()

In [None]:
combined_ad = ann.concat(cancer_ad_list, join="inner", axis=0)
combined_ad

In [None]:
combined_ad.raw.shape

In [None]:
combined_ad = reprocess_all(combined_ad)

In [None]:
combined_ad

In [None]:
combined_ad.obs["Final_histological_subtype"].value_counts()

In [None]:
combined_ad.obs["Final_histological_subtype_backup"] = combined_ad.obs["Final_histological_subtype"].copy()


In [None]:
def unify_histological_subtype(value):
    """Map a histological-subtype label onto the unified "LUCA: ..." vocabulary.

    The value is stringified, stripped and lower-cased before the lookup;
    labels that are not listed map to "LUCA: Unspecified".
    """
    unified = {
        "luad": "LUCA: Lung adenocarcinoma",
        "adenocarcinoma": "LUCA: Lung adenocarcinoma",
        "lusc": "LUCA: Lung squamous carcinoma",
        "squamous cell carcinoma": "LUCA: Lung squamous carcinoma",
        "nsclc": "LUCA: NSCLC",
        "sclc": "LUCA: Small cell lung cancer",
        "large cell carcinoma": "LUCA: Large cell carcinoma",
        "pleiomorphic carcinoma": "LUCA: Pleiomorphic carcinoma",
    }
    key = str(value).strip().lower()
    return unified.get(key, "LUCA: Unspecified")

combined_ad.obs["Final_histological_subtype"] = combined_ad.obs["Final_histological_subtype_backup"].apply(unify_histological_subtype)
combined_ad.obs['Final_histological_subtype'].value_counts()

In [None]:
combined_ad.obs['Final_molecular_subtype'].value_counts()

In [None]:
def unify_tissue(tissue):
    """Normalize a tissue label.

    Any label containing "adrenal" becomes "Adrenal"; every other label is
    stripped, lower-cased and then capitalized.
    """
    normalized = str(tissue).strip().lower()
    # add more rules here as needed
    return "Adrenal" if "adrenal" in normalized else normalized.capitalize()

combined_ad.obs["Final_tissue"] = combined_ad.obs["Final_tissue"].apply(unify_tissue)

In [None]:
combined_ad

In [None]:
for obs in ['Project_ID', 'Primary_or_Metastatic', 'Final_cancer_type', 'Final_histological_subtype', 'Final_molecular_subtype', 'Final_tissue']:
    sc.pl.umap(combined_ad, color=obs)

### Harmony integration

In [None]:
combined_ad

In [None]:
Z = harmonize(combined_ad.obsm['X_pca'], combined_ad.obs, batch_key = ['Project_ID'])


In [None]:
combined_ad.obsm['X_pca_harmony'] = Z


In [None]:
sc.pp.neighbors(combined_ad, n_neighbors=15, use_rep='X_pca_harmony')
sc.tl.umap(combined_ad)

In [None]:
for obs in ['Project_ID', 'Primary_or_Metastatic', 'Final_cancer_type', 'Final_histological_subtype', 'Final_molecular_subtype', 'Final_tissue']:
    sc.pl.umap(combined_ad, color=obs)

In [None]:
combined_ad.obs["Final_patient_age_backup"] = combined_ad.obs["Final_patient_age"]

def clean_patient_age(age):
    """Convert a patient-age entry to an integer, or NaN when it is unusable.

    Accepted inputs:
    - numbers (65, 65.0): truncated to int; after datasets are concatenated,
      ages that were stored in a float column arrive here as floats
    - numeric strings ("65", "65.0")
    - ranges such as "46-50": replaced by their midpoint (48)

    Missing values, "unknown" and anything unparsable give NaN.
    """
    if pd.isna(age):
        return np.nan

    # Numeric values need no text parsing. Previously 65.0 was stringified to
    # "65.0", which int() rejects, so such ages were silently turned into NaN.
    if isinstance(age, (int, float, np.integer, np.floating)):
        try:
            return int(age)
        except (ValueError, OverflowError):
            # infinities cannot be represented as int
            return np.nan

    age = str(age).strip()
    if age.lower() == "unknown":
        return np.nan

    if "-" in age:
        # Convert age ranges like '46-50' to their midpoint
        parts = age.split("-")
        try:
            return int((int(parts[0]) + int(parts[1])) / 2)
        except ValueError:
            return np.nan

    try:
        # Parse through float so that strings such as "65.0" are accepted too.
        return int(float(age))
    except (ValueError, OverflowError):
        return np.nan

# Apply cleaning
combined_ad.obs["Final_patient_age"] = combined_ad.obs["Final_patient_age_backup"].apply(clean_patient_age)
combined_ad.obs["Final_patient_age_backup"] =combined_ad.obs["Final_patient_age_backup"].astype(str)

In [None]:

combined_ad.write_h5ad('./Data/Cancer_cell_data/LUCA_integrated.harmony.h5ad', compression='gzip')
