In [None]:
import os
import warnings
from pathlib import Path

import anndata
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
from dotenv import load_dotenv
# Load variables from a local .env file (if any) into the process environment;
# the atlas-loading cell below reads the DATA_PATH variable from there.
load_dotenv()

# Root directory of the meiotic-cell inputs read by the loading cells below.
DATA_PATH = Path("/mnt/windows/extradata/")

In [None]:
# Load the reference germ-cell atlas and log-normalise it.
# The dataset root is taken from the DATA_PATH environment variable
# (presumably supplied by the .env file read by load_dotenv() above).
_data_root = os.getenv("DATA_PATH")
if _data_root is None:
    # Path(None) would fail with an opaque TypeError; give an actionable message instead.
    raise RuntimeError(
        "Environment variable DATA_PATH is not set; export it or define it in the .env file."
    )
DATA_PATH_2 = Path(_data_root) / "garcia_ATAC"
# Alternative (currently unused) reference atlas, kept for provenance:
#ref_germcell = sc.read_h5ad(DATA_PATH / "atlas/processed_files/E-MTAB-10551/human_germcells_reduced.h5ad")
ref_germcell = sc.read_h5ad(DATA_PATH_2 / 'combined_samples.h5ad')
# Scale each cell to a total of 1e4, then apply log1p; both calls modify ref_germcell in place.
sc.pp.normalize_total(ref_germcell, target_sum=1e4)
sc.pp.log1p(ref_germcell)

In [None]:
# Read our merged, preprocessed meiotic-cell dataset from disk.
meiotic_cells = sc.read_h5ad(
    DATA_PATH / 'meiotic_cells/meiotic_merged/preprocessed/merged_preprocessed_data.h5ad'
)


In [None]:
# Per-cell metadata table for the meiotic dataset; the first CSV column becomes the row index.
meiotic_cell_types = pd.read_csv(
    DATA_PATH / 'meiotic_cells/meiotic_merged/preprocessed/cell_data.csv',
    index_col=0,
)

In [None]:
# Attach the cell-type annotation from the metadata table to the AnnData object.
# The assignment aligns on the index, so any cell whose name is absent from the
# table silently receives NaN. Make the alignment explicit and report such cells,
# because they would otherwise pass the later `!= "pre_oocyte"` filter unnoticed.
_celltype_by_cell = meiotic_cell_types['celltype'].reindex(meiotic_cells.obs_names)
_n_unannotated = int(_celltype_by_cell.isna().sum())
if _n_unannotated:
    warnings.warn(
        f"{_n_unannotated} of {meiotic_cells.n_obs} cells have no 'celltype' value "
        "after aligning the metadata table to meiotic_cells.obs_names.",
        stacklevel=2,
    )
meiotic_cells.obs['celltype'] = _celltype_by_cell

In [None]:
# Remove the "pre_oocyte" population from both datasets.
# Each subset is rebound to the original name, so re-running this cell is harmless.
_meiotic_keep = meiotic_cells.obs['celltype'].ne("pre_oocyte")
meiotic_cells = meiotic_cells[_meiotic_keep]

_atlas_keep = ref_germcell.obs['celltype'].ne("pre_oocyte")
ref_germcell = ref_germcell[_atlas_keep]

In [None]:
# Build one pseudobulk profile per cell type for each dataset.
# Aggregation is a per-group sum (open question from the author: use the median instead?).
# NOTE(review): ref_germcell was normalised and log1p-transformed in its loading cell,
# whereas no normalisation of meiotic_cells is visible in this notebook — confirm that
# both inputs are on a comparable scale before summing.
aggregate_func = "sum"
atlas_pseudobulk, meiotic_pseudobulk = (
    sc.get.aggregate(adata, by=["celltype"], func=aggregate_func)
    for adata in (ref_germcell, meiotic_cells)
)

In [None]:
# Inspect the row labels of the aggregated atlas (the aggregation was grouped by "celltype").
atlas_pseudobulk.obs.index

In [None]:
# Turn each aggregated AnnData into a DataFrame so the profiles can be correlated with pandas.
def _pseudobulk_frame(pseudobulk, suffix):
    """Return the aggregated layer of `pseudobulk` as a DataFrame.

    Rows are the aggregated groups (their labels get `suffix` appended so the
    two studies stay distinguishable after concatenation); columns are the
    variable names of `pseudobulk`.
    """
    return pd.DataFrame(
        pseudobulk.layers[aggregate_func],
        index=pseudobulk.obs.index + suffix,
        columns=pseudobulk.var_names,
    )


atlas_df = _pseudobulk_frame(atlas_pseudobulk, "_atlas")
meiotic_df = _pseudobulk_frame(meiotic_pseudobulk, "_our_meiotic_cells")


In [None]:
# Stack our pseudobulk profiles underneath the atlas profiles.
# join="inner" keeps only the genes (columns) present in BOTH tables. The default
# outer join would fill genes missing from one study with NaN, and a pairwise-complete
# correlation would then use a different gene set for each pair of profiles.
combined_data = pd.concat([atlas_df, meiotic_df], join="inner")
if combined_data.shape[1] == 0:
    raise ValueError(
        "atlas_df and meiotic_df share no gene names; check that both datasets use the same gene identifiers."
    )

In [None]:
# Preview the concatenated table (atlas profiles first, then our meiotic-cell profiles).
combined_data

In [None]:
# Pairwise Spearman correlation between pseudobulk profiles.
# DataFrame.corr correlates columns, so the table is transposed first to put
# one profile per column (genes become the rows).
correlation_matrix_combined = combined_data.transpose().corr(method='spearman')

In [None]:
# Show the profile-by-profile Spearman correlation matrix computed above.
correlation_matrix_combined

In [None]:
# Plot the combined correlation matrix as an annotated heatmap and save it as SVG.
output_svg = Path('/mnt/storage/outputs/garcia_ATAC/outputs/rna_cor.svg')
# savefig does not create missing directories, so make sure the target folder exists.
output_svg.parent.mkdir(parents=True, exist_ok=True)

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
	correlation_matrix_combined,
	annot=True,
	fmt=".2f",
	cmap="coolwarm",
	# Colour scale is fixed to [0, 1]; any negative correlation is drawn with the colour for 0.
	vmin=0,
	vmax=1,
	cbar_kws={'label': 'Spearman Correlation'},
	xticklabels=correlation_matrix_combined.columns,
	yticklabels=correlation_matrix_combined.index,
	ax=ax,
)
ax.set_title("Spearman Correlation Among Cell Types (Both Studies)")
ax.set_xlabel("Cell Types")
ax.set_ylabel("Cell Types")
fig.tight_layout()
fig.savefig(output_svg, format='svg')
plt.show()
