In [None]:
import pybedtools
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from pathlib import Path
from dotenv import load_dotenv
import scanpy as sc

load_dotenv()

# OUTPUT_PATH must be provided by the environment (or by the .env file loaded
# above). Fail with an explicit message instead of the opaque TypeError that
# Path(None) raises when the variable is missing.
_output_root = os.getenv("OUTPUT_PATH")
if _output_root is None:
    raise RuntimeError(
        "Environment variable OUTPUT_PATH is not set; define it in the "
        "environment or in a .env file before running this notebook."
    )
DATA_PATH = Path(_output_root)/'garcia_ATAC'


In [None]:
# Consensus peak BED for the meiotic-cell ATAC data (absolute path on the local mount).
meiotic_cells_consensus_peak_paths = '/mnt/windows/extradata/meiotic_cells/atac_preprocessing/consensus_regions.bed'

# The file is read without a header row; assigning the labels below requires
# the table to have exactly five columns.
bed5_columns = ['chrom', 'start', 'end', 'name', 'score']
peaks = pd.read_csv(
    meiotic_cells_consensus_peak_paths,
    header=None,
    sep='\t',
)
peaks.columns = bed5_columns
meiotic_cells_consensus_peaks_bed = pybedtools.BedTool.from_dataframe(peaks)


In [None]:
# Summary of the meiotic consensus peaks: number of intervals, then the sum of
# every interval's `.length`.
meiotic_peak_count = len(meiotic_cells_consensus_peaks_bed)
print(f'total_peaks: {meiotic_peak_count}')
meiotic_peak_bases = sum(interval.length for interval in meiotic_cells_consensus_peaks_bed)
print(f'total_coverage: {meiotic_peak_bases}')

In [None]:
# Consensus peak BED for the Garcia ATAC data, located under DATA_PATH.
garcia_ATAC_consensus_peak_paths = DATA_PATH / 'atac_preprocessing/consensus_peak_calling/consensus_regions.bed'

# The file is read without a header row; assigning the labels below requires
# the table to have exactly five columns.
bed5_columns = ['chrom', 'start', 'end', 'name', 'score']
peaks = pd.read_csv(
    garcia_ATAC_consensus_peak_paths,
    header=None,
    sep='\t',
)
peaks.columns = bed5_columns
garcia_ATAC_consensus_peaks_bed = pybedtools.BedTool.from_dataframe(peaks)


In [None]:
# Summary of the Garcia consensus peaks: number of intervals, then the sum of
# every interval's `.length`.
garcia_peak_count = len(garcia_ATAC_consensus_peaks_bed)
print(f'total_peaks: {garcia_peak_count}')
garcia_peak_bases = sum(interval.length for interval in garcia_ATAC_consensus_peaks_bed)
print(f'total_coverage: {garcia_peak_bases}')

In [None]:
# Intersect the meiotic peaks (A) with the Garcia peaks (B). `wo=True` is passed
# through to bedtools; the statistics cell below reads the last field of each
# record as the number of overlapping bases.
overlaps = meiotic_cells_consensus_peaks_bed.intersect(garcia_ATAC_consensus_peaks_bed, wo=True)
# Preview the first records of the intersection.
overlaps.head()

In [None]:
# Base-pair totals for each peak set.
total_meiotic_cells_peak_bases = sum(peak.length for peak in meiotic_cells_consensus_peaks_bed)
total_garcia_ATAC_bases = sum(peak.length for peak in garcia_ATAC_consensus_peaks_bed)
# The last field of every intersect record is read as the overlap size in bases.
# Summing over an empty result already yields 0, so no separate emptiness test
# (which would cost an additional pass over the intersection) is needed.
bases_overlapping = sum(int(o.fields[-1]) for o in overlaps)

# Guard the ratios so an empty peak set reports "nan%" instead of raising
# ZeroDivisionError in the middle of the report.
meiotic_overlap_fraction = (
    bases_overlapping / total_meiotic_cells_peak_bases
    if total_meiotic_cells_peak_bases else float('nan')
)
garcia_overlap_fraction = (
    bases_overlapping / total_garcia_ATAC_bases
    if total_garcia_ATAC_bases else float('nan')
)

print(f"""Peak Base Coverage Statistics:
    
Total bases in meiotic cells peaks:     {total_meiotic_cells_peak_bases:,}
Total bases in Garcia ATAC peaks:       {total_garcia_ATAC_bases:,} 
Overlapping bases between datasets:     {bases_overlapping:,}

Overlap percentages:
- {meiotic_overlap_fraction:.1%} of meiotic cells peaks overlap
- {garcia_overlap_fraction:.1%} of Garcia ATAC peaks overlap
""")

# Clustering comparison

In [None]:
# Load cisTopic objects
import pickle

# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
# The context manager closes the file handle, which the bare open() call leaked.
with open('/mnt/windows/extradata/meiotic_cells/atac_preprocessing/cistopic_obj.pkl', "rb") as pickle_file:
    cistopic_obj_meiotic = pickle.load(pickle_file)

In [None]:
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
# The context manager closes the file handle, which the bare open() call leaked.
with open(DATA_PATH / 'atac_preprocessing/cistopic_obj.pkl', "rb") as pickle_file:
    cistopic_obj_garcia = pickle.load(pickle_file)

In [None]:
# Number of region names stored on the meiotic cisTopic object.
len(cistopic_obj_meiotic.region_names)

In [None]:
# Number of region names stored on the Garcia cisTopic object.
len(cistopic_obj_garcia.region_names)

In [None]:
# First, create BED files with cisTopic region names
def create_bed_from_region_names(region_names):
    """Convert cisTopic region names to a 4-column BedTool.

    Parameters
    ----------
    region_names : iterable of str
        Region names of the form ``"chrom:start-end"``.

    Returns
    -------
    pybedtools.BedTool
        One interval per input name with the fields ``chrom, start, end, name``;
        the original string is kept as the name field.

    Raises
    ------
    ValueError
        If a name is not of the form ``chrom:start-end`` or its coordinates
        are not integers.
    """
    regions = []
    for name in region_names:
        # Split on the LAST ':' so that contig names which themselves contain
        # ':' keep their full chromosome part.
        chrom, colon, coords = name.rpartition(':')
        start, dash, end = coords.partition('-')
        if not (chrom and colon and dash):
            raise ValueError(
                f"Region name {name!r} is not of the form 'chrom:start-end'"
            )
        regions.append([chrom, int(start), int(end), name])
    # Hand over a concrete list, exactly as the original call did.
    return pybedtools.BedTool(regions)


In [None]:

# Build one named BED per dataset from the region names stored on each
# cisTopic object (meiotic first, then Garcia).
meiotic_regions_bed, garcia_regions_bed = (
    create_bed_from_region_names(cistopic_obj.region_names)
    for cistopic_obj in (cistopic_obj_meiotic, cistopic_obj_garcia)
)


In [None]:

# Merge them to create unified regions
merged_regions = meiotic_regions_bed.cat(garcia_regions_bed, postmerge=False)
merged_regions = merged_regions.sort()
# merge() is called without any column operations, so the per-region names are
# not carried through; they are rebuilt from the merged coordinates below.
merged_regions = merged_regions.merge()


# Convert to dataframe and rebuild "chrom:start-end" names with vectorised
# string concatenation (same result as a row-wise apply, without the Python loop).
merged_df = merged_regions.to_dataframe()
merged_df['name'] = (
    merged_df['chrom'].astype(str)
    + ':' + merged_df['start'].astype(str)
    + '-' + merged_df['end'].astype(str)
)

# Convert back to BedTool so the name column travels with each interval
merged_regions = pybedtools.BedTool.from_dataframe(merged_df)


In [None]:
# Print statistics
# merged_regions was built from the cisTopic region BEDs (meiotic_regions_bed /
# garcia_regions_bed), so count those inputs rather than the consensus-peak
# BEDs loaded earlier; otherwise the three numbers are not comparable.
n_regions1 = len(meiotic_regions_bed)
n_regions2 = len(garcia_regions_bed)
n_merged = len(merged_regions)

print(f"Number of regions in first file: {n_regions1}")
print(f"Number of regions in second file: {n_regions2}")
print(f"Number of regions after merging: {n_merged}")

In [None]:
from scipy import sparse

def calculate_overlap_weights_vectorized(original_regions_bed, unified_regions_bed):
    """Build a sparse (original region x unified region) overlap-weight matrix.

    Each stored weight is ``overlap_bases / (end - start)`` of the original
    region, i.e. the share of the original region covered by a unified region.

    Parameters
    ----------
    original_regions_bed : pybedtools.BedTool
        BED whose first four fields are ``chrom, start, end, name``.
    unified_regions_bed : pybedtools.BedTool
        BED with four fields whose name field equals ``"chrom:start-end"``
        (the intersect output is read positionally, so the field counts matter).

    Returns
    -------
    weight_matrix : scipy.sparse.csr_matrix
        Shape ``(n_original_regions, n_unified_regions)``.
    orig_regions : numpy.ndarray
        Names of ALL original regions, in the order of ``original_regions_bed``;
        row ``i`` of ``weight_matrix`` belongs to ``orig_regions[i]``.
    all_unified_regions : list of str
        ``"chrom:start-end"`` for every unified region, in BED order; column
        ``j`` of ``weight_matrix`` belongs to ``all_unified_regions[j]``.

    Raises
    ------
    KeyError
        If an overlap record names a region absent from either input BED.
    """
    # Index rows by the input BED itself instead of by the regions that happen
    # to appear in the overlap output. Deriving the index from the overlaps
    # silently drops regions without any overlap, which shifts every later row
    # relative to the count matrix this weight matrix is multiplied with.
    orig_regions = pd.Series(
        [interval.name for interval in original_regions_bed], dtype=object
    ).to_numpy()

    # Get ALL unified regions (not just those with overlaps)
    all_unified_regions = [f"{r.chrom}:{r.start}-{r.end}" for r in unified_regions_bed]

    orig_to_idx = {region: idx for idx, region in enumerate(orig_regions)}
    unified_to_idx = {region: idx for idx, region in enumerate(all_unified_regions)}
    shape = (len(orig_regions), len(all_unified_regions))

    # Get intersections as a dataframe
    overlaps_df = original_regions_bed.intersect(unified_regions_bed, wo=True).to_dataframe()
    if overlaps_df.empty:
        # No overlaps at all: an all-zero matrix of the right shape.
        return sparse.csr_matrix(shape, dtype=float), orig_regions, all_unified_regions

    # Positional layout of the intersect output: fields 0-3 are the original
    # region, field 7 is the unified region's name, the last field is the
    # overlap size.
    overlap_bases = overlaps_df.iloc[:, -1].astype(float)
    original_length = overlaps_df.iloc[:, 2] - overlaps_df.iloc[:, 1]
    weights = (overlap_bases / original_length).to_numpy()

    rows = [orig_to_idx[name] for name in overlaps_df.iloc[:, 3].astype(str)]
    cols = [unified_to_idx[name] for name in overlaps_df.iloc[:, 7].astype(str)]

    # Create sparse matrix of weights
    weight_matrix = sparse.csr_matrix((weights, (rows, cols)), shape=shape)

    return weight_matrix, orig_regions, all_unified_regions

In [None]:
# Calculate weights using the new BED files
# Both calls use the same merged_regions, so the unified-region list from the
# first call is kept and the one from the second call is discarded (`_`).
meiotic_weight_matrix, meiotic_orig_regions, unified_regions = calculate_overlap_weights_vectorized(meiotic_regions_bed, merged_regions)
garcia_weight_matrix, garcia_orig_regions, _  = calculate_overlap_weights_vectorized(garcia_regions_bed, merged_regions)

In [None]:
def redistribute_counts_vectorized(matrix, weight_matrix, orig_regions, unified_regions, chunk_size=1000):
    """Project a count matrix onto unified regions, chunk by chunk.

    Computes ``matrix.T @ weight_matrix``, processing ``chunk_size`` columns of
    ``matrix`` at a time.

    Parameters
    ----------
    matrix : array-like or scipy sparse matrix, shape (n_original, n_columns)
        Counts with one row per original region.
    weight_matrix : array-like or scipy sparse matrix, shape (n_original, n_unified)
        Weights mapping original regions to unified regions.
    orig_regions, unified_regions : sized
        Only used for the diagnostic printout.
    chunk_size : int, default 1000
        Number of columns of ``matrix`` handled per chunk.

    Returns
    -------
    scipy sparse matrix, shape (n_columns, n_unified)
        Not a DataFrame; callers wrap it themselves if needed.

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1 or the two matrices do not have
        the same number of rows.
    """
    print(f"Input matrix shape: {matrix.shape}")
    print(f"Weight matrix shape: {weight_matrix.shape}")
    print(f"Number of original regions: {len(orig_regions)}")
    print(f"Number of unified regions: {len(unified_regions)}")

    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")

    # Ensure matrix is in CSR format for efficient operations
    matrix = sparse.csr_matrix(matrix)
    weight_matrix = sparse.csr_matrix(weight_matrix)

    # Fail early with a readable message instead of a bare
    # "dimension mismatch" raised from inside the first chunk.
    if matrix.shape[0] != weight_matrix.shape[0]:
        raise ValueError(
            f"matrix has {matrix.shape[0]} rows but weight_matrix has "
            f"{weight_matrix.shape[0]}; both must be indexed by the same original regions"
        )

    n_columns = matrix.shape[1]
    if n_columns == 0:
        # sparse.vstack([]) raises, so return the empty result directly.
        return sparse.csr_matrix((0, weight_matrix.shape[1]))

    # Process in chunks to bound the size of each intermediate product
    n_chunks = (n_columns + chunk_size - 1) // chunk_size
    result_chunks = []

    for i in range(n_chunks):
        start_idx = i * chunk_size
        end_idx = min((i + 1) * chunk_size, n_columns)
        print(f"Processing chunk {i+1}/{n_chunks}")

        # Transpose the column slice so the chunk's rows line up with
        # weight_matrix's rows in the product.
        chunk = matrix[:, start_idx:end_idx].T
        result_chunks.append(chunk.dot(weight_matrix))

    # Vertically stack the per-chunk results into one sparse matrix
    return sparse.vstack(result_chunks)



In [None]:
# Redistribute counts using vectorized approach
# Projects the meiotic fragment matrix onto the unified regions; the region
# lists are passed only for the function's diagnostic printout.
meiotic_unified = redistribute_counts_vectorized(
    cistopic_obj_meiotic.fragment_matrix, 
    meiotic_weight_matrix, 
    meiotic_orig_regions, 
    unified_regions
)


In [None]:

# Clear some memory
# NOTE: after this `del`, re-running the cell raises NameError unless the
# weight-calculation cell is executed again first.
del meiotic_weight_matrix
import gc
gc.collect()

# Same projection as for the meiotic data, now for the Garcia fragment matrix.
print("\nProcessing Garcia ATAC data...")
garcia_unified = redistribute_counts_vectorized(
    cistopic_obj_garcia.fragment_matrix, 
    garcia_weight_matrix, 
    garcia_orig_regions, 
    unified_regions
)

In [None]:
# Sanity check: the unified-region name list and the merged BED should agree.
print("Length of unified_regions:", len(unified_regions))
print("Number of regions in merged_regions:", len(merged_regions))
print("First few unified_regions:", list(unified_regions)[:3])
print("First few merged_regions:")
# BedTool.head() prints the lines itself and returns None, so wrapping it in
# print() emitted a stray "None" after the preview.
merged_regions.head()

In [None]:
import anndata as ad

# Create AnnData objects: rows follow each cisTopic object's cell_names,
# columns follow the shared unified_regions list.
adata_meiotic = ad.AnnData(X=meiotic_unified,
            obs=pd.DataFrame(index=cistopic_obj_meiotic.cell_names),
            var=pd.DataFrame(index=unified_regions))

adata_garcia = ad.AnnData(X=garcia_unified,
            obs=pd.DataFrame(index=cistopic_obj_garcia.cell_names),
            var=pd.DataFrame(index=unified_regions))

# Add some metadata
adata_meiotic.obs['dataset'] = 'meiotic'
adata_garcia.obs['dataset'] = 'garcia'
# Series assignment aligns on the obs index.
# NOTE(review): assumes cell_data is indexed by the same cell names — confirm.
adata_garcia.obs['celltype'] = cistopic_obj_garcia.cell_data.celltype
# (the meiotic assignment was duplicated in the original cell; once is enough)
adata_meiotic.obs['celltype'] = cistopic_obj_meiotic.cell_data.celltype

# Keep only cells that have a cell-type annotation
adata_garcia = adata_garcia[~adata_garcia.obs.celltype.isna()].copy()
adata_meiotic = adata_meiotic[~adata_meiotic.obs.celltype.isna()].copy()

print(adata_meiotic.shape, adata_garcia.shape)

In [None]:
# Stack both datasets along the cell axis; join='inner' keeps only the
# variables present in both objects.
adata_combined = ad.concat([adata_meiotic, adata_garcia], join='inner')

In [None]:
# Display the combined object's summary.
adata_combined

In [None]:
#adata_combined.X.toarray().max()

In [None]:
import scanpy as sc

# Run the scanpy steps on the combined object, in place and in this order.
for pipeline_step in (
    sc.pp.normalize_total,
    sc.pp.log1p,
    sc.pp.pca,
    sc.pp.neighbors,
    sc.tl.umap,
):
    pipeline_step(adata_combined)

# Two side-by-side UMAP panels: coloured by dataset (left) and by cell type (right).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
for panel_ax, obs_key, panel_title in (
    (ax1, "dataset", 'Datasets'),
    (ax2, "celltype", 'Cell Types'),
):
    sc.pl.umap(adata_combined, color=obs_key, ax=panel_ax, show=False, title=panel_title)
plt.tight_layout()
plt.savefig('/mnt/storage/outputs/garcia_ATAC/outputs/atac_joint_UMAP.svg', format='svg')
plt.show()

In [None]:
# Also do ingest, like in celltype_inget.ipynb to get them nicely aligned.

In [None]:
# Run the scanpy steps on the Garcia object, in place and in this order.
for pipeline_step in (
    sc.pp.normalize_total,
    sc.pp.log1p,
    sc.pp.pca,
    sc.pp.neighbors,
    sc.tl.umap,
):
    pipeline_step(adata_garcia)

In [None]:
# Run the scanpy steps on the meiotic object, in place and in this order.
for pipeline_step in (
    sc.pp.normalize_total,
    sc.pp.log1p,
    sc.pp.pca,
    sc.pp.neighbors,
    sc.tl.umap,
):
    pipeline_step(adata_meiotic)

In [None]:
# Persist both processed objects under DATA_PATH so the following cells can
# start from disk.
adata_meiotic.write_h5ad(DATA_PATH / "adata_meiotic_processed.h5ad")
adata_garcia.write_h5ad(DATA_PATH / "adata_garcia_processed.h5ad")


In [None]:
# Reload the processed objects from the files written in the previous cell.
adata_meiotic = sc.read_h5ad(DATA_PATH / "adata_meiotic_processed.h5ad")
adata_garcia = sc.read_h5ad(DATA_PATH / "adata_garcia_processed.h5ad")

In [None]:
# Show the shapes of both reloaded objects.
adata_meiotic.shape, adata_garcia.shape

In [None]:
# Ingest the new data into the reference
# Argument order is (query, reference): adata_meiotic is mapped onto adata_garcia.
# NOTE(review): per the scanpy documentation, obs="celltype" writes the labels
# inferred from the reference into adata_meiotic.obs["celltype"], replacing the
# values assigned when the object was built — confirm this is intended.
sc.tl.ingest(adata_meiotic, adata_garcia, obs="celltype")

In [None]:
from matplotlib import pyplot as plt

# Two side-by-side UMAP panels coloured by cell type: Garcia (left) and
# meiotic (right).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
for panel_ax, panel_adata, panel_title in (
    (ax1, adata_garcia, 'Garcia'),
    (ax2, adata_meiotic, 'Meiotic'),
):
    sc.pl.umap(panel_adata, ax=panel_ax, color="celltype", show=False, title=panel_title)
plt.tight_layout()
plt.savefig('/mnt/storage/outputs/garcia_ATAC/outputs/atac_joint_ingest.svg', format='svg')
plt.show()