-
Notifications
You must be signed in to change notification settings - Fork 0
Open
Labels
enhancementNew feature or requestNew feature or request
Description
Description of feature
implement steps for QC metrics plot using this code from @SaraTerzol
# Quality control
def qc_from_h5ad(
h5ad_path,
sample_id,
plots_dir="/home/jovyan/work/results/QC_plots",
output_dir="/home/jovyan/work/results/object/H5AD",
min_counts=100,
min_genes=50,
max_mt=20,
novelty_thresh=0.8,
min_cells=3,
pdf_pages=None # optional PdfPages object to save all plots in a single PDF
):
"""
Perform full QC starting from a 016um H5AD file, including:
- In-tissue filtering
- Genes/UMI per spot
- Mitochondrial, ribosomal, hemoglobin gene QC
- Combined QC plots
- Local outlier detection
- Save filtered AnnData
Parameters
----------
h5ad_path : str
Path to the 016um H5AD file.
sample_id : str
Sample identifier.
plots_dir : str
Directory where QC plots will be saved.
output_dir : str
Directory where the filtered AnnData will be saved.
pdf_pages : PdfPages, optional
Pass an open PdfPages object to save all plots in a single PDF.
Returns
-------
dict
Summary statistics: total spots, in-tissue spots, genes/UMI per spot.
"""
# ------------------------
# Create directories if needed
# ------------------------
os.makedirs(plots_dir, exist_ok=True)
os.makedirs(output_dir, exist_ok=True)
sns.set(style="whitegrid")
# ------------------------
# Start message
# ------------------------
print(f"\n=== Processing sample: {sample_id} ===")
# ------------------------
# Load 016um AnnData
# ------------------------
adata = sc.read_h5ad(h5ad_path)
adata.obs['sample'] = sample_id
# ------------------------
# Count spots
# ------------------------
total_spots = len(adata.obs['in_tissue'])
n_in_tissue = (adata.obs['in_tissue'] == 1).sum()
n_out_tissue = (adata.obs['in_tissue'] == 0).sum()
print(f"Total spots: {total_spots}, In tissue: {n_in_tissue}, Outside tissue: {n_out_tissue}")
# ------------------------
# Filter in-tissue spots
# ------------------------
adata.obs["in_tissue_str"] = ["In tissue" if x==1 else "Outside tissue" for x in adata.obs["in_tissue"]]
adata = adata[adata.obs["in_tissue_str"]=="In tissue"].copy()
if not sp.issparse(adata.X):
adata.X = sp.csr_matrix(adata.X)
# ------------------------
# Compute genes and UMI per spot
# ------------------------
genes_per_spot = np.array((adata.X > 0).sum(axis=1)).flatten()
umi_per_spot = np.array(adata.X.sum(axis=1)).flatten()
mean_genes_per_spot = genes_per_spot.mean()
mean_umi_per_spot = umi_per_spot.mean()
print(f"Mean genes per spot: {mean_genes_per_spot:.2f}")
print(f"Mean UMI per spot: {mean_umi_per_spot:.2f}")
# ------------------------
# Flag genes for QC
# ------------------------
adata.var["mt"] = adata.var_names.str.startswith("MT-")
adata.var["ribo"] = adata.var_names.str.startswith(('RPS','RPL'))
adata.var["hb"] = adata.var_names.str.startswith("HB")
# ------------------------
# Calculate QC metrics
# ------------------------
sc.pp.calculate_qc_metrics(
adata,
qc_vars=['mt','ribo','hb'],
inplace=True,
percent_top=[20],
log1p=True
)
print(adata.obs)
# ------------------------
# Compute novelty (complexity) score
# ------------------------
# Novelty score
novelty_score = genes_per_spot / umi_per_spot
adata.obs["novelty_score"] = novelty_score
adata.obs["low_complexity"] = novelty_score < novelty_thresh
num_low_complexity = adata.obs["low_complexity"].sum()
print(f"Num_low_complexity: {num_low_complexity}")
# QC flags for global outliers
adata.obs["qc_lib_size"] = adata.obs["total_counts"] < min_counts
adata.obs["qc_detected"] = adata.obs["n_genes_by_counts"] < min_genes
if "pct_counts_mt" in adata.obs.columns:
adata.obs["qc_mito"] = adata.obs["pct_counts_mt"] > max_mt
else:
adata.obs["qc_mito"] = False
# ------------------------
# MT table
# ------------------------
if "pct_counts_mt" in adata.obs.columns:
mt_table = pd.DataFrame({
"Sample": [sample_id],
"Total spots": [total_spots],
"Mean % MT": [adata.obs["pct_counts_mt"].mean()],
"Median % MT": [adata.obs["pct_counts_mt"].median()],
"Max % MT": [adata.obs["pct_counts_mt"].max()],
"Min % MT": [adata.obs["pct_counts_mt"].min()]
})
print("\n--- Mitochondrial counts summary ---")
print(mt_table)
# ------------------------
# QC plots before filtering
# ------------------------
fig, axs = plt.subplots(1, 3, figsize=(18, 5))
sns.histplot(adata.obs["total_counts"], kde=True, color="skyblue", ax=axs[0])
axs[0].set_title(f"{sample_id} - Total Counts per Spot")
axs[0].set_xlabel("Total counts")
axs[0].set_ylabel("Number of spots")
sns.histplot(adata.obs["n_genes_by_counts"], kde=True, bins=60, color="lightgreen", ax=axs[1])
axs[1].set_title(f"{sample_id} - Genes Detected per Spot")
axs[1].set_xlabel("Number of genes")
axs[1].set_ylabel("Number of spots")
if "pct_counts_mt" in adata.obs.columns:
sns.histplot(adata.obs["pct_counts_mt"], kde=True, bins=60, color="salmon", ax=axs[2])
axs[2].set_title(f"{sample_id} - Percent Mitochondrial Counts")
axs[2].set_xlabel("% mt counts")
axs[2].set_ylabel("Number of spots")
plt.tight_layout()
plt.savefig(os.path.join(plots_dir, f"{sample_id}_qc_before_filter.png"))
if pdf_pages:
pdf_pages.savefig(fig)
plt.close(fig)
# ------------------------
# Global outlier detection
# ------------------------
# Combine global outliers
adata.obs["global_outliers"] = (
adata.obs["qc_lib_size"] |
adata.obs["qc_detected"] |
adata.obs["qc_mito"]
#adata.obs["low_complexity"]
)
print("Number of global outliers:")
print(adata.obs["global_outliers"].value_counts())
# ------------------------
# Local outlier detection
# ------------------------
for metric in ["total_counts", "n_genes_by_counts","pct_counts_mt"]:
lo.local_outliers(adata, metric=metric, sample_key="region",n_neighbors=36)
outlier_col = f"{metric}_outliers"
outliers_count = adata.obs[outlier_col].sum()
print(f"Detected {outliers_count} local outliers for {metric}.")
# Combine local outliers
adata.obs["local_outliers"] = (
adata.obs["total_counts_outliers"] |
adata.obs["n_genes_by_counts_outliers"] |
adata .obs["pct_counts_mt_outliers"]
)
print("Number of local outliers:")
print(adata.obs["local_outliers"].value_counts())
# ------------------------
# Spatial QC plot
# ------------------------
plt.rcParams["figure.figsize"] = (8, 8)
sc.pl.spatial(
adata,
img_key="hires",
color=["total_counts", "n_genes_by_counts","pct_counts_mt","global_outliers","total_counts_outliers","n_genes_by_counts_outliers","pct_counts_mt_outliers","local_outliers"],
spot_size=16,
cmap='viridis',
alpha=0.8,
show=False
)
plt.tight_layout()
plt.savefig(os.path.join(plots_dir, f"{sample_id}_spatial_qc.png"))
plt.close()
# ------------------------
# Collect QC summary
# ------------------------
summary_dict = {
"Sample": sample_id,
"Total spots": total_spots,
"In-tissue spots": n_in_tissue,
"Outside-tissue spots": n_out_tissue,
"Mean genes per spot": mean_genes_per_spot,
"Mean UMI per spot": mean_umi_per_spot,
"Global outliers (True)": int(adata.obs["global_outliers"].sum()),
"Global outliers (False)": int((~adata.obs["global_outliers"]).sum()),
"Local outliers (True)": int(adata.obs["local_outliers"].sum()),
"Local outliers (False)": int((~adata.obs["local_outliers"]).sum()),
}
# Add mitochondrial summary if available
if "pct_counts_mt" in adata.obs.columns:
summary_dict.update({
"Mean % MT": adata.obs["pct_counts_mt"].mean(),
"Median % MT": adata.obs["pct_counts_mt"].median()
})
# ------------------------
# Save summary as CSV
# ------------------------
summary_df = pd.DataFrame([summary_dict])
csv_path = os.path.join(plots_dir, f"{sample_id}_QC_summary.csv")
summary_df.to_csv(csv_path, index=False)
print(f"\nQC summary saved to: {csv_path}")
# ------------------------
# Save filtered AnnData
# ------------------------
output_h5ad = os.path.join(output_dir, f"{sample_id}_016um_qc.h5ad")
adata.write(output_h5ad)
print(f"Filtered AnnData saved to: {output_h5ad}")
# ------------------------
# Return summary statistics
# ------------------------
return {
"sample_id": sample_id,
"total_spots": total_spots,
"spots_in_tissue": n_in_tissue,
"spots_out_tissue": n_out_tissue,
"mean_genes_per_spot": mean_genes_per_spot,
"mean_umi_per_spot": mean_umi_per_spot
}
Call functions
Load the AnnData object and initialize the SpatialData object with image and scale factors.
samples = [{"path": "/home/jovyan/data/2_Analysis/0_spaceranger/NFG012_018_CyLIB", "id": "SW03"}]
processed_samples = {}
for sample in samples:
sample_path = sample["path"]
sample_id = sample["id"]
zarr_path, h5ad_path = load_and_save(
NFG_path=sample_path,
sample_id=sample_id,
output_dir="/home/jovyan/work/results/object/"
)
processed_samples[sample_id] = {"zarr": zarr_path, "h5ad": h5ad_path}
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
enhancementNew feature or requestNew feature or request