In [None]:
import os
from pathlib import Path
import pandas as pd
from dotenv import load_dotenv
import scanpy as sc
import anndata

# Pull variables from a local .env file (if any) into the process environment.
load_dotenv()

# Root directory for all input data, taken from the DATA_PATH environment variable.
# Fail early with a clear message: Path(None) would raise an opaque TypeError and
# an empty string would silently resolve to the current directory.
_data_path_env = os.getenv('DATA_PATH')
if not _data_path_env:
    raise RuntimeError(
        "DATA_PATH is not set; define it in the environment or in a .env file"
    )
DATA_PATH = Path(_data_path_env)

In [None]:
# Read the two garcia_rna reference objects (germ-cell and somatic files) from the atlas folder
atlas_root = DATA_PATH / 'atlas'
ref_germcell = sc.read_h5ad(atlas_root / 'processed_files/E-MTAB-10551/human_germcells_reduced.h5ad')
ref_somatic = sc.read_h5ad(atlas_root / 'processed_files/E-MTAB-10551/human_somatic_reduced.h5ad')



In [None]:
# Number of observations per `celltype` label in the somatic reference
ref_somatic.obs["celltype"].value_counts()

In [None]:
# Number of observations per `celltype` label in the germ-cell reference
ref_germcell.obs["celltype"].value_counts()


In [None]:
# Combined reference object, plus the table of cell-type predictions for garcia_ATAC
predictions_csv = DATA_PATH / 'garcia_ATAC/celltype_predictions/all_celltypes.csv'
ref_combined = sc.read_h5ad(DATA_PATH / 'atlas' / 'processed_files/E-MTAB-10551/human_combined.h5ad')
new_types = pd.read_csv(predictions_csv, index_col=0)
# Rows selected by the `is_germcell` column
germcell_types = new_types.loc[new_types['is_germcell']]

# Comparison of cell-type proportions

In [None]:
# Fraction of observations per `celltype` label in the combined reference
ref_combined.obs["celltype"].value_counts(normalize=True)

In [None]:
# Fraction of rows per `celltype` label in the prediction table
new_types['celltype'].value_counts(normalize=True)

# Among germ cells only

In [None]:
# Proportions within the combined reference, restricted to the cell types that
# appear in `germcell_types`; labels with a zero share are dropped from the display.
(
    ref_combined.obs
    .loc[lambda obs: obs["celltype"].isin(germcell_types['celltype']), "celltype"]
    .value_counts(normalize=True)
    .loc[lambda share: share > 0]
)

In [None]:
# Fraction of rows per `celltype` label among the rows flagged by `is_germcell`
germcell_types['celltype'].value_counts(normalize=True)

In [None]:
# Share of all prediction rows that are flagged by `is_germcell`
len(germcell_types) / len(new_types)

# Merge all samples and create a UMAP


In [None]:
# Load every per-sample 10x matrix found in the garcia_ATAC folder, attach the
# predicted cell types, and concatenate the samples into one AnnData object.
garcia_path = DATA_PATH / 'garcia_ATAC'
matrix_suffix = 'matrix.mtx.gz'
# Sorted so the concatenation order (and therefore the row order of the merged
# object) does not depend on the filesystem's listing order.
matrix_files = sorted(garcia_path.glob(f'*{matrix_suffix}'))
if not matrix_files:
    raise FileNotFoundError(f"No '*{matrix_suffix}' files found in {garcia_path}")

all_samples = []
for matrix_file in matrix_files:
    # Everything in the file name before 'matrix.mtx.gz'. Any trailing separator
    # (e.g. '_') is kept, because the same string is passed to read_10x_mtx as
    # the file-name prefix.
    sample_name = matrix_file.name[:-len(matrix_suffix)]
    # Samples whose name contains "and" are skipped.
    if "and" in sample_name.lower():
        print(f"Skipping {matrix_file.name} as it contains 'and' in the name")
        continue

    print(f"Processing sample {sample_name}...")

    sample = sc.read_10x_mtx(garcia_path, prefix=sample_name)
    sample.obs["sample"] = sample_name
    # The assignment aligns on the index, so observations without a matching row
    # in new_types get NaN.
    # NOTE(review): this requires new_types["sample"] to hold exactly the same
    # string as the file-name prefix (including any trailing separator) - confirm.
    new_types_sample = new_types[new_types["sample"] == sample_name]
    sample.obs["celltype"] = new_types_sample["celltype"]
    if sample.obs["celltype"].isna().all():
        print(f"Warning: no cell type labels matched sample {sample_name!r}")
    all_samples.append(sample)

if not all_samples:
    raise ValueError(f"Every matrix file in {garcia_path} was skipped; nothing to concatenate")

# join="inner" keeps only the variables shared by all samples.
all_samples_adata = anndata.concat(all_samples, join="inner")

In [None]:
# Keep only observations whose predicted cell type is one of the germ-cell types.
# .copy() materialises the subset so that later in-place steps act on a real
# AnnData object instead of a view onto the unfiltered data.
all_samples_adata = all_samples_adata[all_samples_adata.obs["celltype"].isin(germcell_types["celltype"])].copy()

In [None]:
# Display the summary of the filtered, merged object
all_samples_adata

In [None]:
# Largest value in the data matrix, inspected before the normalization in the next cell
all_samples_adata.X.max()

In [None]:
# Normalize total counts per observation and log1p-transform, in place.
# Both steps rewrite .X, so re-running this cell would transform the data twice.
# sc.pp.log1p records itself under .uns['log1p']; use that marker to make the
# cell safe to re-run.
if "log1p" not in all_samples_adata.uns:
    sc.pp.normalize_total(all_samples_adata)
    sc.pp.log1p(all_samples_adata)



In [None]:
# Display the summary of the germ-cell reference object
ref_germcell

In [None]:
# UMAP scatter of the germ-cell reference, coloured by `celltype`.
# NOTE(review): no UMAP has been computed in this notebook yet, so this presumably
# relies on an embedding stored in the loaded file - confirm.
sc.pl.umap(ref_germcell, color = "celltype")

In [None]:
# Fit a UMAP model on the reference and project the new samples through it
from umap import UMAP
from matplotlib import pyplot as plt

# Fit on the reference's scVI latent representation. The seed is fixed so the
# embedding (and everything projected through it) is reproducible across runs.
umap_model = UMAP(random_state=0)
umap_model.fit(ref_germcell.obsm['X_scVI'])
# Keep the freshly fitted reference coordinates under their own key: they are
# the coordinate system the new data is projected into, and storing them
# separately leaves any existing 'X_umap' untouched.
ref_germcell.obsm['X_umap_refit'] = umap_model.embedding_

# NOTE(review): the model is fitted on scVI latent dimensions but the new data is
# projected from its own PCA. The two spaces only share their dimensionality, not
# their axes, so the projected positions are not directly comparable with the
# reference. A faithful projection would need all_samples_adata embedded with the
# same scVI model - treat the right-hand panel as exploratory.
sc.pp.pca(all_samples_adata, n_comps=ref_germcell.obsm['X_scVI'].shape[1])

# Project onto the reference UMAP
projected_umap = umap_model.transform(all_samples_adata.obsm['X_pca'])
all_samples_adata.obsm['X_umap'] = projected_umap

# Side-by-side comparison, both panels in the coordinates of `umap_model`
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
sc.pl.embedding(ref_germcell, basis='X_umap_refit', color="celltype", ax=ax1, show=False, title='Reference')
sc.pl.umap(all_samples_adata, color="celltype", ax=ax2, show=False, title='Projected')
plt.tight_layout()
plt.show()

In [None]:
# Joint embedding of the new samples and the reference atlas on their shared genes.
# Index.intersection gives a deterministic gene order; a set intersection does not.
common_genes = list(all_samples_adata.var_names.intersection(ref_germcell.var_names))

# all_samples_adata was already normalized and log1p-transformed in an earlier
# cell, so normalizing the concatenated object would transform it a second time.
# Instead, bring each dataset to the log-normalized state at most once before
# concatenating. sc.pp.log1p leaves a marker in .uns['log1p'], which is used here
# to detect data that has already been processed.
# NOTE(review): a dataset that was log-normalized by other means would not carry
# the marker - confirm that ref_germcell.X holds raw counts when it is unmarked.
# Each dataset is scaled with its own normalize_total target.
datasets = []
for source in (all_samples_adata, ref_germcell):
    subset = source[:, common_genes].copy()
    if "log1p" not in subset.uns:
        sc.pp.normalize_total(subset)
        sc.pp.log1p(subset)
    datasets.append(subset)

combined = anndata.concat(
    datasets,
    join='inner',
    label='dataset',  # This will create a new column in .obs called 'dataset'
    keys=['new', 'atlas']
)

# Embed the combined data
sc.pp.pca(combined)
sc.pp.neighbors(combined)
sc.tl.umap(combined)

# Plot: left coloured by dataset of origin, right by cell type
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
sc.pl.umap(combined, color="dataset", ax=ax1, show=False, title='Datasets')
sc.pl.umap(combined, color="celltype", ax=ax2, show=False, title='Cell Types')
plt.tight_layout()
plt.show()

In [None]:
# Preprocess the germ-cell reference in place and compute PCA, the neighbor
# graph, and a UMAP on it (sc.tl.umap replaces any existing 'X_umap').
# normalize_total/log1p rewrite .X, so skip them when the object already carries
# the .uns['log1p'] marker that sc.pp.log1p leaves behind; this keeps the cell
# safe to re-run.
if "log1p" not in ref_germcell.uns:
    sc.pp.normalize_total(ref_germcell)
    sc.pp.log1p(ref_germcell)
sc.pp.pca(ref_germcell)
sc.pp.neighbors(ref_germcell)
sc.tl.umap(ref_germcell)

In [None]:
from matplotlib import pyplot as plt

# Restrict both objects to their shared genes, in the same deterministic order.
# .copy() turns the subsets into real objects so the in-place steps below do not
# operate on views.
gene_intersection = list(all_samples_adata.var_names.intersection(ref_germcell.var_names))
all_samples_adata = all_samples_adata[:, gene_intersection].copy()
ref_germcell = ref_germcell[:, gene_intersection].copy()

# ingest projects the new data with the reference's PCA loadings and maps it
# through the reference's neighbor graph and UMAP. These must therefore be fitted
# on exactly the genes both objects share; a fit made before the gene subsetting
# would leave truncated loadings that no longer match the reference embedding.
sc.pp.pca(ref_germcell)
sc.pp.neighbors(ref_germcell)
sc.tl.umap(ref_germcell)

# ingest(obs='celltype') overwrites all_samples_adata.obs['celltype'] with labels
# transferred from the reference; keep the original labels in a separate column
# (only on the first run, so a re-run does not replace them with ingested labels).
if "celltype_predicted" not in all_samples_adata.obs:
    all_samples_adata.obs["celltype_predicted"] = all_samples_adata.obs["celltype"]

# Ingest the new data into the reference
sc.tl.ingest(all_samples_adata, ref_germcell, obs='celltype')

# Plot
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
sc.pl.umap(ref_germcell, color="celltype", ax=ax1, show=False, title='Reference')
sc.pl.umap(all_samples_adata, color="celltype", ax=ax2, show=False, title='Ingested')
plt.tight_layout()
plt.show()