In [5]:
import utils
import anndata
import numpy as np
import pandas as pd
import requests
import scanpy as sc
import scanpy.external as sce
import scanorama
import os
import anndata
from typing import List, Optional

In [6]:
path = os.getcwd()[:-3]

In [7]:
processed_612 = sc.read_h5ad(path+'data/processed_612_data.h5ad')
processed_613 = sc.read_h5ad(path+'data/processed_613_data.h5ad')
processed_352 = sc.read_h5ad(path+'data/processed_352_data.h5ad')

In [8]:
processed_data = [processed_352, processed_612, processed_613]

In [9]:
def gene_overlap_check(adata_list: list[sc.AnnData]):
    """Quick check of gene overlap between datasets"""
    gene_sets = [set(adata.var.index) for adata in adata_list]
    common_genes = set.intersection(*gene_sets)
    all_genes = set.union(*gene_sets)

    print(f"Gene overlap summary:")
    print(f"  Genes common to ALL datasets: {len(common_genes)}")
    print(f"  Total unique genes: {len(all_genes)}")
    print(f"  Overlap percentage: {len(common_genes) / len(all_genes) * 100:.1f}%")
    return common_genes

def merge_anndata_on_shared_vars(
        adata_list: List[sc.AnnData],
        output_path: Optional[str] = None
) -> sc.AnnData:
    """
    Merge multiple AnnData objects on shared var indices (genes).
    Assumes each dataset already has a 'dataset' column in .obs.

    Parameters:
    -----------
    adata_list : List[sc.AnnData]
        List of AnnData objects to merge
    output_path : Optional[str]
        Path to save the merged object as h5ad file

    Returns:
    --------
    sc.AnnData
        Merged AnnData object with only shared genes
    """

    # Validate inputs
    if len(adata_list) < 2:
        raise ValueError("Need at least 2 AnnData objects to merge")

    print(f"Merging {len(adata_list)} AnnData objects on shared variables...")

    # Print basic info about each dataset
    for i, adata in enumerate(adata_list):
        dataset_info = adata.obs['dataset'].value_counts()
        print(f"Dataset {i}: {adata.n_obs} cells × {adata.n_vars} genes")
        print(f"  Dataset labels: {list(dataset_info.index)}")

    # Merge datasets using inner join (only shared genes)
    merged_adata = sc.concat(
        adata_list,
        join='inner',  # Only keep genes present in ALL datasets
        index_unique=None
    )

    print(f"\nMerged dataset: {merged_adata.n_obs} cells × {merged_adata.n_vars} genes")
    print(f"Genes kept: {merged_adata.n_vars} (shared across all datasets)")

    # Show dataset distribution in merged object
    print(f"\nDataset distribution in merged object:")
    dataset_counts = merged_adata.obs['dataset'].value_counts()
    print(dataset_counts)

    # Save if output path provided
    if output_path:
        merged_adata.write(output_path)
        print(f"\nSaved merged dataset to: {output_path}")

    return merged_adata

In [23]:
gene_overlap_check(processed_data)

Gene overlap summary:
  Genes common to ALL datasets: 21543
  Total unique genes: 25639
  Overlap percentage: 84.0%


{'Krtcap3',
 'Pkd1',
 'Rad52',
 '1700028M03Rik',
 'Mtx1',
 'Angptl6',
 'Ptcd3',
 'Gm47814',
 'Suds3',
 'Zfp771',
 'Rfc3',
 '4932412D23Rik',
 'Pcmtd1',
 'Mbd3l2',
 'Endov',
 'Sh2d1a',
 'Rnaset2a',
 'Spocd1',
 'Gabarap',
 'Kctd8',
 'Dpp7',
 'Naa30',
 'Fcor',
 'Gm6741',
 'Usp27x',
 'B230104I21Rik',
 '4933412O06Rik',
 'Gm38604',
 'Fbxo18',
 'Smtn',
 'Rab10',
 'Lsm5',
 'Gatsl3',
 'Angptl7',
 'Myh4',
 'Magee2',
 'Gm47990',
 'BC030500',
 'Ccdc157',
 'Ndufs8',
 'Adamts10',
 'Atp11a',
 'Unc5b',
 'Retreg3',
 'Cnnm2',
 'Smo',
 'Jph2',
 'Tmem57',
 'Ank3',
 'AC124561.1',
 'Gm20512',
 'Lrrc8a',
 'Vmn1r188',
 'Dxo',
 'Prrx2',
 'Zfp202',
 'Cadps',
 'Chia1',
 'Zpbp',
 'Gimap6',
 'Gm11872',
 'Gm30411',
 'Irx5',
 'A230083G16Rik',
 'Rpain',
 'Dclk1',
 'F9',
 'Haus7',
 'AA986860',
 'Tom1',
 'Slc31a2',
 'Mvb12a',
 'Cftr',
 'Gm19412',
 'C030018K13Rik',
 'Zfp629',
 'Pcgf3',
 'Klc2',
 'Btf3',
 'Dmrta2',
 'Gm33037',
 'Epha8',
 'Gm16240',
 'Drc1',
 'Kif5a',
 'Pla2g1b',
 'Zfp709',
 'Gm12472',
 'Snn',
 'Nbr1',
 'A

In [10]:
merge_anndata_on_shared_vars(processed_data, path+'data/unbatch_corrected_data.h5ad')

Merging 3 AnnData objects on shared variables...
Dataset 0: 17563 cells × 21635 genes
  Dataset labels: ['352']
Dataset 1: 78687 cells × 24643 genes
  Dataset labels: ['612']
Dataset 2: 116403 cells × 25323 genes
  Dataset labels: ['613']


  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")



Merged dataset: 212653 cells × 21543 genes
Genes kept: 21543 (shared across all datasets)

Dataset distribution in merged object:
dataset
613    116403
612     78687
352     17563
Name: count, dtype: int64

Saved merged dataset to: C:\Creations\Research\NASASLSTP25\AI4LS\AI4LS\CVAE\data/unbatch_corrected_data.h5ad


AnnData object with n_obs × n_vars = 212653 × 21543
    obs: 'batch', 'n_genes', 'dataset', 'Strain', 'Sex', 'Age at Launch', 'Duration', 'Flight'

In [11]:
unbatched = sc.read_h5ad(path + 'data/unbatch_corrected_data.h5ad')
batched = sc.read_h5ad(path + 'data/corrected_data.h5ad')

  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


In [15]:
print(unbatched.shape)
print(batched.shape)

(212653, 21543)
(212653, 21543)


In [18]:
print(unbatched.X[(212652, 94)])

1.7066735


In [20]:
print(batched.X[(212652, 94)])

0.0
