In [None]:
# Mount Google Drive so files under /content/drive are readable from this
# Colab runtime.
from google.colab import drive

drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Locations of the input files. `datasets` is the shared Drive folder (it
# keeps its trailing slash so relative parts can be appended directly).
datasets = "/content/drive/MyDrive/Datasets/"
breast_atlas = datasets + "Breast_Atlas_ST_8/ST_8.h5ad"
tnbc = datasets + "Breast_Cancer_TNBC/TNBC_CID44971_ST.h5ad"

# Bare file name (not under `datasets`); resolved relative to the working
# directory wherever it is used.
cell2location = "output_Cell2Location.csv"

In [None]:
!pip install anndata
!pip install scanpy



In [None]:
import numpy as np
import pandas as pd
import anndata as ad
import scanpy as sc
from scipy.sparse import csr_matrix
# Record the anndata version in the notebook output for reproducibility.
anndata_version = ad.__version__
print(anndata_version)

0.12.1


In [None]:
# Read both .h5ad files into AnnData objects: the breast atlas first, then
# the TNBC sample (same order as the original two reads).
df_breast_atlas, df_tnbc = (
    sc.read_h5ad(h5ad_path) for h5ad_path in (breast_atlas, tnbc)
)

In [None]:
# Display the AnnData summary (dimensions and annotation fields) of the
# breast atlas object as loaded from disk.
df_breast_atlas

AnnData object with n_obs × n_vars = 2801 × 36503
    obs: 'mapped_reference_assembly', 'mapped_reference_annotation', 'alignment_software', 'donor_id', 'self_reported_ethnicity_ontology_term_id', 'donor_living_at_sample_collection', 'donor_menopausal_status', 'organism_ontology_term_id', 'sample_uuid', 'sample_preservation_method', 'tissue_ontology_term_id', 'development_stage_ontology_term_id', 'sample_derivation_process', 'sample_source', 'donor_BMI_at_collection', 'tissue_section_uuid', 'tissue_section_thickness', 'library_uuid', 'assay_ontology_term_id', 'sequencing_platform', 'is_primary_data', 'cell_type_ontology_term_id', 'author_cell_type', 'disease_ontology_term_id', 'sex_ontology_term_id', 'nCount_Spatial', 'nFeature_Spatial', 'nCount_SCT', 'nFeature_SCT', 'suspension_type', 'tissue_type', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'observation_joinid'
    var: 'feature_is_filtered', 'feature_name', 'feature_

In [None]:
# Display the AnnData summary (dimensions and annotation fields) of the
# TNBC object as loaded from disk.
df_tnbc

AnnData object with n_obs × n_vars = 1162 × 19237
    obs: 'new_x', 'new_y', 'pixel_x', 'pixel_y'
    uns: 'spatial'
    obsm: 'spatial'

In [None]:
# Work on independent copies so the objects loaded from disk stay untouched
# by the clean-up steps that follow.
df_breast_copy, df_tnbc_copy = df_breast_atlas.copy(), df_tnbc.copy()

In [None]:
# Display the summary of the breast atlas working copy.
df_breast_copy

AnnData object with n_obs × n_vars = 2801 × 36503
    obs: 'mapped_reference_assembly', 'mapped_reference_annotation', 'alignment_software', 'donor_id', 'self_reported_ethnicity_ontology_term_id', 'donor_living_at_sample_collection', 'donor_menopausal_status', 'organism_ontology_term_id', 'sample_uuid', 'sample_preservation_method', 'tissue_ontology_term_id', 'development_stage_ontology_term_id', 'sample_derivation_process', 'sample_source', 'donor_BMI_at_collection', 'tissue_section_uuid', 'tissue_section_thickness', 'library_uuid', 'assay_ontology_term_id', 'sequencing_platform', 'is_primary_data', 'cell_type_ontology_term_id', 'author_cell_type', 'disease_ontology_term_id', 'sex_ontology_term_id', 'nCount_Spatial', 'nFeature_Spatial', 'nCount_SCT', 'nFeature_SCT', 'suspension_type', 'tissue_type', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'observation_joinid'
    var: 'feature_is_filtered', 'feature_name', 'feature_

In [None]:
# Display the summary of the TNBC working copy.
df_tnbc_copy

AnnData object with n_obs × n_vars = 1162 × 19237
    obs: 'new_x', 'new_y', 'pixel_x', 'pixel_y'
    uns: 'spatial'
    obsm: 'spatial'

In [None]:
# Drop every per-observation annotation column from both working copies,
# keeping only the observation names as the index.
for adata in (df_breast_copy, df_tnbc_copy):
    adata.obs = pd.DataFrame(index=adata.obs_names)

In [None]:
# Step 1: Use the gene symbols in "feature_name" as the var index.
# "feature_name" is a categorical column, so `.values` alone would hand
# AnnData a Categorical and trigger the ".var.index to contain strings ...
# Inferred to be: categorical" warning. Cast to plain str first.
df_breast_copy.var.index = df_breast_copy.var["feature_name"].astype(str).values

# Step 2: Remove all other columns, keeping only the (now string) index
df_breast_copy.var = pd.DataFrame(index=df_breast_copy.var.index)

AnnData expects .var.index to contain strings, but got values like:
    ['MIR1302-2HG', 'FAM138A', 'OR4F5', 'RP11-34P13.7', 'RP11-34P13.8']

    Inferred to be: categorical

  value_idx = self._prep_dim_index(value.index, attr)


In [None]:
# Empty the unstructured (.uns) and multi-dimensional observation (.obsm)
# slots on both working copies; each object gets its own fresh dicts.
for adata in (df_breast_copy, df_tnbc_copy):
    adata.uns = {}
    adata.obsm = {}

In [None]:
# Genes present in both datasets, in sorted order so the two objects end up
# with identical, deterministic column ordering.
common_genes = sorted(
    df_breast_copy.var_names.intersection(df_tnbc_copy.var_names)
)

# Restrict both AnnData objects to the shared genes. `.copy()` materialises
# each view into an independent object.
df_breast_copy, df_tnbc_copy = (
    adata[:, common_genes].copy() for adata in (df_breast_copy, df_tnbc_copy)
)


In [None]:
# Attach the same three label columns to each dataset:
#   batch        - "0" for the breast atlas, "1" for the TNBC sample
#   condition 1  - "healthy" / "diseased"
#   annos        - constant placeholder "CellType" for every observation
label_table = (
    (df_breast_copy, "0", "healthy"),
    (df_tnbc_copy, "1", "diseased"),
)
for adata, batch_id, condition in label_table:
    adata.obs["batch"] = batch_id
    adata.obs["condition 1"] = condition
    adata.obs["annos"] = "CellType"


In [None]:
# Stack the two gene-aligned datasets along the observation axis. The
# `label` argument is deliberately not used: the "batch" column was already
# added to each object's obs before this point.
adatas_to_merge = [df_breast_copy, df_tnbc_copy]
df_combined = ad.concat(adatas_to_merge, join="inner", axis=0)

  df_combined = ad.concat([df_breast_copy, df_tnbc_copy], join="inner", axis=0)


In [None]:
# Display the summary of the concatenated object (observations from both
# datasets, shared genes only).
df_combined

AnnData object with n_obs × n_vars = 3963 × 16498
    obs: 'batch', 'condition 1', 'annos'

In [None]:
!pip install scikit-misc



In [None]:
# Flag highly variable genes on the combined object (adds the
# 'highly_variable' column to df_combined.var).
# NOTE(review): scanpy documents flavor="seurat_v3" as expecting raw counts
# in .X - confirm both inputs hold counts rather than normalised values.
hvg_settings = dict(
    flavor="seurat_v3",   # alternatives: 'cell_ranger' / 'seurat'
    n_top_genes=2000,     # number of genes to keep; tune as needed
    batch_key="batch",    # important if the data has batch effects
)
sc.pp.highly_variable_genes(df_combined, **hvg_settings)

# Keep only the flagged genes, as an independent object rather than a view.
hvg_mask = df_combined.var['highly_variable']
adata_hvg = df_combined[:, hvg_mask].copy()

# Sanity check: report how many genes survived the filter.
print("Number of HVGs:", adata_hvg.n_vars)

Number of HVGs: 2000


In [None]:
# Display the per-observation metadata table of the HVG-filtered object
# (the batch / condition 1 / annos label columns).
adata_hvg.obs

Unnamed: 0,batch,condition 1,annos
AAACAGCTTTCAGAAG-1_10,0,healthy,CellType
AAACAGGGTCTATATT-1_10,0,healthy,CellType
AAACAGTGTTCCTGGG-1_10,0,healthy,CellType
AAACATGGTGAGAGGA-1_10,0,healthy,CellType
AAACATTTCCCGGATT-1_10,0,healthy,CellType
...,...,...,...
TGCAGTGGTAGGGAAC-1,1,diseased,CellType
AGCGAGACGTGAAGGC-1,1,diseased,CellType
CAGTGTTAATCTCTCA-1,1,diseased,CellType
GATCGCTGTGGTGCGT-1,1,diseased,CellType


In [None]:
# Export the per-observation metadata (index plus label columns) as CSV.
meta_csv_path = '/content/drive/MyDrive/Datasets/scDisInFact/meta_cells_with_annos.csv'
adata_hvg.obs.to_csv(meta_csv_path)

In [None]:
# Save the HVG-filtered AnnData object as a gzip-compressed .h5ad file.
hvg_h5ad_path = '/content/drive/MyDrive/Datasets/scDisInFact/breast_hvgs.h5ad'
adata_hvg.write(hvg_h5ad_path, compression="gzip")

In [None]:
# Verify the file that was just written. The `h5ls` command-line tool is not
# installed on this runtime ("h5ls: command not found"), so re-open the file
# with anndata instead; the displayed summary shows its dimensions and the
# obs/var fields that were stored.
ad.read_h5ad('/content/drive/MyDrive/Datasets/scDisInFact/breast_hvgs.h5ad')

/bin/bash: line 1: h5ls: command not found
