In [None]:
import os
from pathlib import Path
import pandas as pd
from dotenv import load_dotenv

# Lift every pandas display limit (rows, columns, total width, per-cell width)
# so frames are rendered without truncation.
pd.set_option(
    'display.max_rows', None,
    'display.max_columns', None,
    'display.width', None,
    'display.max_colwidth', None,
)

# Pull variables from a .env file (if any) into the process environment.
load_dotenv()

# Directory that holds the files read by this notebook: $DATA_PATH/garcia_ATAC.
# Note: this only builds a path; the working directory is not changed.
# Fail early with a readable message when DATA_PATH is missing, rather than
# letting Path(None) raise an opaque TypeError.
if os.getenv('DATA_PATH') is None:
    raise RuntimeError(
        "Environment variable DATA_PATH is not set; "
        "export it or define it in a .env file before running this notebook."
    )

DATA_PATH = Path(os.environ['DATA_PATH']) / 'garcia_ATAC'


In [None]:
# Read the tab-separated SDRF sample sheet (E-MTAB-11708) into a DataFrame
sdrf_file = DATA_PATH.joinpath('E-MTAB-11708.sdrf.txt')
sdrf = pd.read_table(sdrf_file)

sdrf



In [None]:
# All SDRF rows whose 'Source Name' is FCA_GND10287600
sdrf[sdrf['Source Name'].eq('FCA_GND10287600')]




In [None]:
# All SDRF rows whose 'Source Name' is FCA_GND10288176
sdrf[sdrf['Source Name'].eq('FCA_GND10288176')]


In [None]:
# Distinct values of the 'Source Name' column
all_samples = sdrf.loc[:, "Source Name"].unique()
all_samples

In [None]:
# Distinct values of the 'Characteristics[paired library]' column
pair_samples = sdrf.loc[:, "Characteristics[paired library]"].unique()
pair_samples

In [None]:
# Identifiers that appear in exactly one of the two lists (source names vs. paired libraries)
set(all_samples) ^ set(pair_samples)

In [None]:
# Distinct source names among rows whose protocol is "rna"
rna_samples = sdrf["Source Name"][sdrf["Characteristics[protocol]"].eq("rna")].unique()
rna_samples

In [None]:
# Distinct source names among rows whose protocol is "atac"
atac_samples = sdrf["Source Name"][sdrf["Characteristics[protocol]"].eq("atac")].unique()
atac_samples

In [None]:
# True when the rna and atac source names together make up exactly the full sample list
(set(rna_samples) | set(atac_samples)) == set(all_samples)

In [None]:
# Source names listed under both the rna and the atac protocol
set(rna_samples) & set(atac_samples)

In [None]:
# Distinct paired-library values recorded on rows that belong to an rna sample
pairs_of_rna_samples = sdrf["Characteristics[paired library]"][
    sdrf["Source Name"].isin(rna_samples)
].unique()
pairs_of_rna_samples


In [None]:
# True when the libraries paired with rna samples are exactly the atac samples
set(atac_samples) == set(pairs_of_rna_samples)

In [None]:
# One row per source name, keeping the first non-null paired library seen for it
dict_of_matches = (
    sdrf[["Source Name", "Characteristics[paired library]"]]
    .groupby("Source Name")
    .first()
)

In [None]:
# Mapping: source name -> its paired library
dic = dict_of_matches["Characteristics[paired library]"].to_dict()
dic

In [None]:
# Inspect a single sample to learn how its files are laid out.

# Fragments table for the paired sample: tab-separated, no header row, five named columns
fragments_file = DATA_PATH.joinpath("FCA_GND10288176_and_FCA_GND10287600_atac_fragments.tsv.gz")
fragments = pd.read_table(
    fragments_file,
    header=None,
    names=["chrom", "start", "end", "barcode", "count"],
)
fragments.head()


In [None]:
# Distinct barcodes present in the fragments table
frag_barcodes = fragments["barcode"].unique()

In [None]:
set(frag_barcodes).issuperset(set(adata.obs_names))

In [None]:
per_barcode_metrics = pd.read_csv(DATA_PATH / "FCA_GND10288176_and_FCA_GND10287600_per_barcode_metrics.csv")
per_barcode_metrics.head()

In [None]:
import scanpy as sc
# Load matrix files using scanpy
adata = sc.read_10x_mtx(
    DATA_PATH,
    prefix="FCA_GND10287600_"
)
adata


In [None]:
# Show the observation names of `adata` (presumably cell barcodes — confirm against the 10x files)
adata.obs_names

In [None]:
# Read the 10x matrix files carrying the paired-sample prefix into a second object
adata_atac = sc.read_10x_mtx(DATA_PATH, prefix="FCA_GND10288176_and_FCA_GND10287600_")
adata_atac

In [None]:
# Smallest and largest entry of the matrix. The reductions run on the sparse matrix
# itself, so no dense copy is materialised (the previous version built two of them).
adata_atac.X.min(), adata_atac.X.max()

In [None]:
# Distinct observation names of the paired-sample object
adata_atac.obs_names.unique()


In [None]:
# True when every observation name of `adata` is also an observation name of `adata_atac`
set(adata.obs_names) <= set(adata_atac.obs_names)

In [None]:
# (rows, columns) of the metrics table restricted to rows where is_cell equals 1
per_barcode_metrics[per_barcode_metrics["is_cell"] == 1].shape

In [None]:
# True when every observation name of `adata` occurs in the metrics table's gex_barcode column
set(per_barcode_metrics["gex_barcode"].unique()) >= set(adata.obs_names)

In [None]:
# Load the h5ad file of the previously processed atlas (E-MTAB-10551, human_germcells).
# The location can be overridden with the GARCIA_ATLAS_H5AD environment variable so the
# notebook is not tied to one machine; the original absolute path is kept as the fallback.
original_garcia_atlas = sc.read_h5ad(
    os.getenv(
        "GARCIA_ATLAS_H5AD",
        "/Users/bogdan/ovelle/data/atlas/processed_files/E-MTAB-10551/human_germcells.h5ad",
    )
)
original_garcia_atlas


In [None]:
# Preview the first rows of the atlas' per-observation table (`.obs`)
original_garcia_atlas.obs.head()