## Misc

In [1]:
!python3 --version

Python 3.10.4


In [2]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

pd.set_option("display.max_rows", 4)


In [3]:
d_root = "/home/fpavlov/projects/2022-08-tl/"
d_data = f"{d_root}/data/"

# input
d_chipatlas = f"{d_data}/chip-atlas"
f_filelist = f"{d_chipatlas}/fileList.tab"
f_experimentlist = f"{d_chipatlas}/experimentList.tab"

## Parse fileList.tab

In [4]:
filelist_df = (
    pd.read_table(
        f_filelist,
        header=None,
        names=[
            "file_name",
            "genome_assembly",
            "antigen_class",
            "antigen",
            "cell_type_class",
            "cell_type",
            "threshold",
            "experimental_ids",
        ],
    )
    .assign(experimental_ids_list=lambda x: x["experimental_ids"].str.split(","))
    .replace("-", np.nan)
)

print(filelist_df.antigen_class.unique())

filelist_df


['Input control' 'Histone' 'TFs and others' 'ATAC-Seq' 'RNA polymerase'
 'No description' 'DNase-seq' 'Unclassified' 'Bisulfite-Seq']


Unnamed: 0,file_name,genome_assembly,antigen_class,antigen,cell_type_class,cell_type,threshold,experimental_ids,experimental_ids_list
0,InP.ALL.50.AllAg.AllCell,ce10,Input control,,All cell types,,50,"SRX494986,SRX495098,SRX027097,ERX1485064,SRX33...","[SRX494986, SRX495098, SRX027097, ERX1485064, ..."
1,His.ALL.10.AllAg.AllCell,ce10,Histone,,All cell types,,10,"SRX4012478,SRX5402732,SRX5985672,SRX5985668,SR...","[SRX4012478, SRX5402732, SRX5985672, SRX598566..."
...,...,...,...,...,...,...,...,...,...
138342,Pol.YSt.05.AllAg.HQY1660,sacCer3,RNA polymerase,,Yeast strain,HQY1660,05,"SRX3671190,SRX3671191","[SRX3671190, SRX3671191]"
138343,InP.YSt.50.AllAg.Y5795,sacCer3,Input control,,Yeast strain,Y5795,50,,


## Parse experimentList.tab

In [5]:
experimentlist_df = (
    pd.read_table(
        f_experimentlist,
        header=None,
        usecols=range(8),
        names=[
            "experimental_id",
            "genome_assembly",
            "antigen_class",
            "antigen",
            "cell_type_class",
            "cell_type",
            "cell_type_description",
            "processing_logs",
        ],
    )
    .assign(
        no_peaks=lambda x: x["processing_logs"]
        .str.split(",", expand=True)[3]
        .astype(int)
    )
    .drop(columns=["cell_type_description", "processing_logs"])
)

experimentlist_df


Unnamed: 0,experimental_id,genome_assembly,antigen_class,antigen,cell_type_class,cell_type,no_peaks
0,ERX1270342,ce10,Unclassified,Unclassified,Unclassified,Unclassified,0
1,ERX1270343,ce10,Unclassified,Unclassified,Unclassified,Unclassified,818
...,...,...,...,...,...,...,...
437749,SRX957118,sacCer3,Bisulfite-Seq,Bisulfite-Seq,Yeast strain,W303,1925
437750,SRX957119,sacCer3,Bisulfite-Seq,Bisulfite-Seq,Yeast strain,W303,1830


## Get histones

In [6]:
t_genome_assemblies = ["ce11", "mm10", "hg38", "dm6", "sacCer3"]
t_antigen_class = "Histone"
t_no_peaks_thr = 50_000

with pd.option_context("display.max_rows", 10):
    df_filt = (
        # set the initial query with all of the necessary data;
        experimentlist_df.query(
            " and ".join(
                [
                    f"genome_assembly in {t_genome_assemblies}",
                    f'antigen_class == "{t_antigen_class}"',
                    f"{t_no_peaks_thr*1.25} >= no_peaks >= {t_no_peaks_thr*.5}",
                    f"cell_type_class not in  ['Others', 'Unclassified']",
                ]
            )
        )
        .assign(
            no_assemblies_by_cell_type=lambda x: x.groupby(
                ["antigen_class", "antigen", "cell_type_class", "cell_type"]
            )["genome_assembly"].transform("nunique")
        )
        .query("no_assemblies_by_cell_type > 1")
        .sort_values('no_peaks', ascending=False)
        .groupby(["cell_type_class", "cell_type", "antigen", "genome_assembly"]).head(1)
        .sort_values(['antigen', 'cell_type_class', 'cell_type', 'genome_assembly', 'no_peaks'])
        .reset_index(drop=True)
    )

    print(
        f'Unique assemblies: {", ".join(sorted(df_filt.genome_assembly.unique()))}',
        end=".\n",
    )
    print(f'Unique antigens: {", ".join(sorted(df_filt.antigen.unique()))}', end=".\n")
    print(
        f'Unique cell type classes: {", ".join(sorted(df_filt.cell_type_class.unique()))}',
        end=".\n",
    )

    display(df_filt)


Unique assemblies: hg38, mm10.
Unique antigens: H3K27ac, H3K4me1, H3K4me2, H3K4me3, H3K9me3.
Unique cell type classes: Blood, Breast, Cardiovascular, Liver, Lung, Neural, Pancreas, Pluripotent stem cell, Prostate.


Unnamed: 0,experimental_id,genome_assembly,antigen_class,antigen,cell_type_class,cell_type,no_peaks,no_assemblies_by_cell_type
0,SRX2339190,hg38,Histone,H3K27ac,Blood,B cells,28565,2
1,SRX753157,mm10,Histone,H3K27ac,Blood,B cells,48873,2
2,SRX212429,hg38,Histone,H3K27ac,Blood,CD4+ T cells,40418,2
3,SRX3067412,mm10,Histone,H3K27ac,Blood,CD4+ T cells,47268,2
4,SRX183902,hg38,Histone,H3K27ac,Blood,CD8+ T cells,26830,2
...,...,...,...,...,...,...,...,...
103,SRX283742,mm10,Histone,H3K4me3,Prostate,Prostate,32213,2
104,SRX5763011,hg38,Histone,H3K9me3,Pluripotent stem cell,ES cells,26527,2
105,DRX013342,mm10,Histone,H3K9me3,Pluripotent stem cell,ES cells,58019,2
106,SRX3095325,hg38,Histone,H3K9me3,Pluripotent stem cell,iPS cells,31184,2


In [None]:
# import tempfile

# cmd_list = ''
# for _,row in df_filt.iterrows():
#     cmd = f"-P ../data/raw/{row.antigen_class[:3]}.{row.antigen}.{row.cell_type_class.replace(' ', '_')}.{row.cell_type.replace(' ', '_')}.{row.genome_assembly} http://dbarchive.biosciencedbc.jp/kyushu-u/{row.genome_assembly}/eachData/bed05/{row.experimental_id}.05.bed"
#     cmd_list += cmd + '\n'

# with tempfile.TemporaryDirectory() as tmpdir:
#     with open(f"{tmpdir}/cmd_list.tmp", 'w') as f_out:
#         f_out.write(cmd_list)
#     !cat {tmpdir}/cmd_list.tmp | xargs -n 3 -P 8 wget -c -q

## Get TFs

In [14]:
t_genome_assemblies = ["ce11", "mm10", "hg38", "dm6", "sacCer3"]
t_antigen_class = "TFs and others"
t_no_peaks_thr = 50_000

with pd.option_context("display.max_rows", 10):
    df_filt = (
        experimentlist_df.query(
            " and ".join(
                [
                    f"genome_assembly in {t_genome_assemblies}",
                    f'antigen_class == "{t_antigen_class}"',
                    f"{t_no_peaks_thr*1.25} >= no_peaks >= {t_no_peaks_thr*.5}",
                    f"cell_type_class not in  ['Others', 'Unclassified']",
                ]
            )
        )
        .assign(
            antigen= lambda x: x.antigen.str.upper(),
            no_assemblies_by_cell_type=lambda x: x.groupby(
                ["antigen_class", "antigen", "cell_type_class", "cell_type"]
            )["genome_assembly"].transform("nunique")
        )
        .query("no_assemblies_by_cell_type > 1")
        .sort_values('no_peaks', ascending=False)
        .groupby(["cell_type_class", "cell_type", "antigen", "genome_assembly"]).head(1)
        .sort_values(['antigen', 'cell_type_class', 'cell_type', 'genome_assembly', 'no_peaks'])
        .reset_index(drop=True)
    )

    print(
        f'Unique assemblies: {", ".join(sorted(df_filt.genome_assembly.unique()))}',
        end=".\n",
    )
    print(f'Unique antigens: {", ".join(sorted(df_filt.antigen.unique()))}', end=".\n")
    print(
        f'Unique cell type classes: {", ".join(sorted(df_filt.cell_type_class.unique()))}',
        end=".\n",
    )

    display(df_filt)


Unique assemblies: hg38, mm10.
Unique antigens: CTCF, RAD21, SPI1, TBX21.
Unique cell type classes: Blood, Kidney, Lung, Neural, Pluripotent stem cell.


Unnamed: 0,experimental_id,genome_assembly,antigen_class,antigen,cell_type_class,cell_type,no_peaks,no_assemblies_by_cell_type
0,SRX4284008,hg38,TFs and others,CTCF,Blood,B cells,49829,2
1,SRX3753608,mm10,TFs and others,CTCF,Blood,B cells,60907,2
2,SRX5299873,hg38,TFs and others,CTCF,Blood,Erythroid Cells,56097,2
3,SRX2741819,mm10,TFs and others,CTCF,Blood,Erythroid Cells,35385,2
4,SRX188963,hg38,TFs and others,CTCF,Kidney,Kidney,36808,2
...,...,...,...,...,...,...,...,...
19,SRX197253,mm10,TFs and others,SPI1,Blood,Macrophages,62298,2
20,SRX5574348,hg38,TFs and others,SPI1,Blood,Neutrophils,39479,2
21,SRX3723675,mm10,TFs and others,SPI1,Blood,Neutrophils,56599,2
22,SRX1799591,hg38,TFs and others,TBX21,Blood,Th1 Cells,54426,2


## Get peak count for each histone mark

In [77]:
(
    filelist_df.query(
        f'genome_assembly in ["ce11", "mm10", "hg38", "dm6", "sacSer3"]'
        f'and antigen_class=="Histone"'
        f'and threshold=="05"'
    )
    .dropna(subset=['experimental_ids_list'])
    .drop(columns=['file_name', 'threshold', 'experimental_ids'])
    .explode('experimental_ids_list')
    .rename(columns={"experimental_ids_list": "experimental_id"})
    # .merge(
    #     experimentlist_df,
    #     on=["genome_assembly", "antigen_class", "antigen", "experimental_id", ],
    # )
)


Unnamed: 0,genome_assembly,antigen_class,antigen,cell_type_class,cell_type,experimental_id
2726,ce11,Histone,,Embryo,,SRX466530
2726,ce11,Histone,,Embryo,,SRX466529
...,...,...,...,...,...,...
86825,hg38,Histone,H3K4me2,All cell types,,SRX2085879
86825,hg38,Histone,H3K4me2,All cell types,,SRX4080216


In [72]:
total_peaks_by_histone = (
    filelist_df.dropna(subset=["antigen", "experimental_ids"], how="any")
    .assign(no_experiments=lambda x: x["experimental_ids_list"].apply(len))
    .query(
        '\
        genome_assembly.isin(["hg38", "mm10", "ce11", "dm6", "sacSer3"]) \
        and antigen_class=="Histone" \
        and cell_type_class=="All cell types" \
        and threshold=="05" \
        '
    )
    .explode("experimental_ids_list")
    .rename(columns={"experimental_ids_list": "experimental_id"})
    .merge(
        experimentlist_df,
        on=["experimental_id", "genome_assembly", "antigen_class", "antigen",],
    )
    .assign(
        no_peaks_total=lambda x: x.groupby(
            [
                "file_name",
                "genome_assembly",
                "antigen_class",
                "antigen",
                "experimental_ids"
            ]
        )["no_peaks"].transform("sum")
    )
    .drop_duplicates(['file_name', 'genome_assembly', 'antigen_class', 'antigen', 'experimental_ids', 'no_peaks_total'])
    .pivot(
        index=["antigen"], columns=["genome_assembly"], values=["no_peaks_total"]
    )
    .fillna(0)
    .astype(int)
    .assign(
        total_assemblies=lambda x: x.apply(np.count_nonzero, axis=1),
        total_peaks=lambda x: x.apply(sum, axis=1),
    )
    # .sort_values(["total_assemblies", "total_peaks"], ascending=[False, False])
    .sort_index()
)
total_peaks_by_histone

Unnamed: 0_level_0,no_peaks_total,no_peaks_total,no_peaks_total,no_peaks_total,total_assemblies,total_peaks
genome_assembly,ce11,dm6,hg38,mm10,Unnamed: 5_level_1,Unnamed: 6_level_1
antigen,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
CENPA,0,0,157518,0,1,157519
H1,0,30201,0,1690,2,31893
...,...,...,...,...,...,...
macroH2A1.2,0,0,6511,142449,2,148962
macroH2A2,0,0,18494,36898,2,55394


## Get list of rare histone marks

In [8]:
total_experiments_by_histone = (
    filelist_df.dropna(subset=["antigen", "experimental_ids"], how="any")
    .assign(no_experiments=lambda x: x["experimental_ids_list"].apply(len))
    .query(
        '\
        genome_assembly.isin(["hg38", "mm10", "ce11", "dm6", "sacSer3"]) \
        and antigen_class=="Histone" \
        and cell_type_class=="All cell types" \
        and threshold=="05" \
        '
    )
    .pivot(
        index=["antigen"], columns=["genome_assembly"], values=["no_experiments"]
    )
    .fillna(0)
    .astype(int)
    .assign(
        total_assemblies=lambda x: x.apply(np.count_nonzero, axis=1),
        total_experiments=lambda x: x.apply(sum, axis=1),
    )
    # .sort_values(["total_assemblies", "total_experiments"], ascending=[False, False])
    .sort_index()
)
total_experiments_by_histone

Unnamed: 0_level_0,no_experiments,no_experiments,no_experiments,no_experiments,total_assemblies,total_experiments
genome_assembly,ce11,dm6,hg38,mm10,Unnamed: 5_level_1,Unnamed: 6_level_1
antigen,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
CENPA,0,0,17,0,1,18
H1,0,15,0,4,2,21
...,...,...,...,...,...,...
macroH2A1.2,0,0,6,2,2,10
macroH2A2,0,0,2,10,2,14


In [45]:
with pd.option_context('display.max_rows', 10):
    display(
        pd.concat([total_experiments_by_histone, total_peaks_by_histone], axis=1)
    )

Unnamed: 0_level_0,no_experiments,no_experiments,no_experiments,no_experiments,total_assemblies,total_experiments,no_peaks_total,no_peaks_total,no_peaks_total,no_peaks_total,total_assemblies,total_peaks
genome_assembly,ce11,dm6,hg38,mm10,Unnamed: 5_level_1,Unnamed: 6_level_1,ce11,dm6,hg38,mm10,Unnamed: 11_level_1,Unnamed: 12_level_1
antigen,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
CENPA,0,0,17,0,1,18,0,0,157518,0,1,157519
H1,0,15,0,4,2,21,0,30201,0,1690,2,31893
H1.0,0,0,1,0,1,2,0,0,47849,0,1,47850
H1.2,0,0,11,0,1,12,0,0,31523,0,1,31524
H1.4,0,0,1,0,1,2,0,0,903,0,1,904
...,...,...,...,...,...,...,...,...,...,...,...,...
htz-1,9,0,0,0,1,10,69958,0,0,0,1,69959
macroH2A,0,0,0,1,1,2,0,0,0,726,1,727
macroH2A1,0,0,21,14,2,37,0,0,62286,282239,2,344527
macroH2A1.2,0,0,6,2,2,10,0,0,6511,142449,2,148962


## Get peaks by cell type

In [44]:
with pd.option_context("display.max_rows", 10, "display.max.columns", 10):
    display(
        filelist_df.dropna(subset=["antigen", "experimental_ids"], how="any")
        .assign(no_experiments=lambda x: x["experimental_ids_list"].apply(len))
        .query(
            '\
            genome_assembly.isin(["hg38", "mm10", "ce11", "dm6", "sacSer3"]) \
            and antigen_class=="Histone" \
            and cell_type_class=="All cell types" \
            and threshold=="05" \
            '
        )
        .explode("experimental_ids_list")
        .rename(columns={"experimental_ids_list": "experimental_id"})
        .merge(
            experimentlist_df,
            on=["experimental_id", "genome_assembly", "antigen_class", "antigen"],
        )
        .assign(
            no_peaks_total=lambda x: x.groupby(
                [
                    "file_name",
                    "genome_assembly",
                    "antigen_class",
                    "antigen",
                    "experimental_ids",
                    "cell_type_class_y",
                ]
            )["no_peaks"].transform("sum")
        )
        .drop_duplicates(
            [
                "file_name",
                "genome_assembly",
                "antigen_class",
                "antigen",
                "experimental_ids",
                "cell_type_class_y",
                "no_peaks_total",
            ]
        )
        .pivot_table(
            index=["antigen"], columns=["cell_type_class_y", "genome_assembly"], values=["no_peaks_total"]
        )
        .fillna(0)
        .astype(int)
        # .assign(
        #     total_assemblies=lambda x: x.apply(np.count_nonzero, axis=1),
        #     total_peaks=lambda x: x.apply(sum, axis=1),
        # )
        .sort_index()
    )


Unnamed: 0_level_0,no_peaks_total,no_peaks_total,no_peaks_total,no_peaks_total,no_peaks_total,no_peaks_total,no_peaks_total,no_peaks_total,no_peaks_total,no_peaks_total,no_peaks_total
cell_type_class_y,Adipocyte,Adipocyte,Adult,Adult,Blood,...,Unclassified,Unclassified,Unclassified,Uterus,Uterus
genome_assembly,hg38,mm10,ce11,dm6,hg38,...,dm6,hg38,mm10,hg38,mm10
antigen,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3
CENPA,0,0,0,0,22380,...,0,0,0,120535,0
H1,0,0,0,4785,0,...,0,0,0,0,0
H1.0,0,0,0,0,0,...,0,0,0,0,0
H1.2,0,0,0,0,20647,...,0,0,0,0,0
H1.4,0,0,0,0,0,...,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
htz-1,0,0,0,0,0,...,0,0,0,0,0
macroH2A,0,0,0,0,0,...,0,0,0,0,0
macroH2A1,0,0,0,0,11193,...,0,0,0,3587,0
macroH2A1.2,0,0,0,0,6511,...,0,0,0,0,0


## Get list of rare TFs

In [10]:
rare_tfs_df = (
    filelist_df.dropna(subset=["antigen", "experimental_ids"], how="any")
    .assign(no_experiments=lambda x: x["experimental_ids_list"].apply(len))
    .query(
        '\
        genome_assembly.isin(["hg38", "mm10", "ce11", "dm6", "sacSer3"]) \
        and antigen_class=="TFs and others" \
        and cell_type_class=="All cell types" \
        and threshold=="05" \
        '
    )
    .pivot(index=["antigen"], columns=["genome_assembly"], values=["no_experiments"])
    .fillna(0)
    .astype(int)
    .droplevel(0, axis=1)
    .rename_axis(None, axis=1)
    .reset_index()
    .assign(antigen=lambda x: x["antigen"].str.upper())
    .groupby(["antigen"])
    .sum()
    .assign(
        total_assemblies=lambda x: x.apply(np.count_nonzero, axis=1),
        total_experiments=lambda x: x.apply(sum, axis=1),
    )
    .sort_values(["total_assemblies", "total_experiments"], ascending=[False, False])
)

with pd.option_context("display.max_rows", 4):
    display(rare_tfs_df)


Unnamed: 0_level_0,ce11,dm6,hg38,mm10,total_assemblies,total_experiments
antigen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
EPITOPE TAGS,9,78,751,969,4,1811
GFP,89,66,280,136,4,575
...,...,...,...,...,...,...
ZNF92,0,0,1,0,1,2
ZSCAN29,0,0,1,0,1,2


In [11]:
(
    filelist_df.dropna(subset=["antigen", "experimental_ids"], how="any")
    .assign(no_experiments=lambda x: x["experimental_ids_list"].apply(len))
    .query(
        '\
            genome_assembly.isin(["hg38", "mm10", "ce11", "dm6", "sacSer3"]) \
            and antigen_class=="TFs and others" \
            and cell_type_class=="All cell types" \
            and threshold=="05" \
            '
    )
    .pivot(index=["antigen"], columns=["genome_assembly"], values=["no_experiments"])
    .fillna(0)
    .astype(int)
    .assign(
        total_assemblies=lambda x: x.apply(np.count_nonzero, axis=1),
        total_experiments=lambda x: x.apply(sum, axis=1),
    )
    .sort_values(["total_assemblies", "total_experiments"], ascending=[False, False])
)


Unnamed: 0_level_0,no_experiments,no_experiments,no_experiments,no_experiments,total_assemblies,total_experiments
genome_assembly,ce11,dm6,hg38,mm10,Unnamed: 5_level_1,Unnamed: 6_level_1
antigen,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Epitope tags,9,78,751,969,4,1811
GFP,89,66,280,136,4,575
...,...,...,...,...,...,...
trem,0,1,0,0,1,2
woc,0,1,0,0,1,2


## Histone marks by cell type

In [12]:
with pd.option_context("display.max_rows", 10, "display.max_columns", 10):
    display(
        filelist_df.dropna(subset=["antigen", "experimental_ids"], how="any")
        .assign(no_experiments=lambda x: x["experimental_ids_list"].apply(len))
        .query(
            '\
            genome_assembly.isin(["hg38", "mm10", "ce11", "dm6", "sacSer3"]) \
            and antigen_class=="Histone" \
            and cell_type_class != "All cell types" \
            and threshold=="05" \
        '
        )
        .groupby(["antigen", "cell_type_class"])["genome_assembly"]
        .nunique()
        .to_frame()
        .reset_index()
        # .groupby(['antigen', 'cell_type_class'])['no_experiments'].sum().to_frame().reset_index()
        .pivot(
            index=["antigen"], columns=["cell_type_class"], values=["genome_assembly"]
        )
        .droplevel(0, axis=1)
        .rename_axis(None, axis=1)
        .assign(total=lambda x: x.sum(axis=1))
        .fillna('')
        .sort_values('total', ascending=False)
    )


Unnamed: 0_level_0,Adipocyte,Adult,Blood,Bone,Breast,...,Pupae,Spleen,Unclassified,Uterus,total
antigen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
H3K27me3,2.0,2.0,2.0,2.0,2.0,...,1.0,1.0,3.0,2.0,52.0
H3K4me1,2.0,2.0,2.0,2.0,2.0,...,1.0,1.0,3.0,2.0,52.0
H3K4me3,2.0,2.0,2.0,2.0,2.0,...,1.0,1.0,3.0,2.0,52.0
H3K27ac,2.0,2.0,2.0,2.0,2.0,...,1.0,1.0,3.0,2.0,52.0
H3K36me3,2.0,2.0,2.0,2.0,2.0,...,,1.0,3.0,1.0,47.0
...,...,...,...,...,...,...,...,...,...,...,...
H3K23me2,,1.0,,,,...,,,,,1.0
H3K23me3,,1.0,,,,...,,,,,1.0
H3Kac,,,1.0,,,...,,,,,1.0
H2A.Bbd1,,,,,,...,,,,,1.0


In [13]:
cell_type_class_list = [
    "Adipocyte",
    "Adult",
    "Blood",
    "Bone",
    "Breast",
    "Cardiovascular",
    "Cell line",
    "Digestive tract",
    "Embryo",
    "Embryonic fibroblast",
    "Epidermis",
    "Gonad",
    "Kidney",
    "Larvae",
    "Liver",
    "Lung",
    "Muscle",
    "Neural",
    "No description",
    "Others",
    "Pancreas",
    "Placenta",
    "Pluripotent stem cell",
    "Prostate",
    "Pupae",
    "Spleen",
    "Unclassified",
    "Uterus",
]


with pd.option_context("display.max_rows", None):
    for cell_type_class in cell_type_class_list:
        display(
            filelist_df.dropna(subset=["experimental_ids_list"])
            .assign(no_experiments=lambda x: x.experimental_ids_list.apply(len))
            .query(
                'genome_assembly.isin(["mm10", "hg38", "dm6", "ce11"]) \
                and cell_type_class==@cell_type_class \
                and antigen_class=="Histone" \
                and antigen=="H3K27me3" \
                and threshold=="05"'
            )
            .loc[
                :,
                [
                    "file_name",
                    "genome_assembly",
                    "antigen",
                    "cell_type_class",
                    "experimental_ids_list",
                    "no_experiments",
                ],
            ]
        )


Unnamed: 0,file_name,genome_assembly,antigen,cell_type_class,experimental_ids_list,no_experiments
75802,His.Adp.05.H3K27me3.AllCell,hg38,H3K27me3,Adipocyte,"[SRX019510, SRX019496, SRX019503, SRX019517]",4
104409,His.Adp.05.H3K27me3.AllCell,mm10,H3K27me3,Adipocyte,"[SRX5029579, SRX5029576, SRX5029578, SRX502958...",24


Unnamed: 0,file_name,genome_assembly,antigen,cell_type_class,experimental_ids_list,no_experiments
4929,His.Adl.05.H3K27me3.AllCell,ce11,H3K27me3,Adult,"[SRX7971795, SRX7971791, SRX076079, SRX076078,...",18
10685,His.Adl.05.H3K27me3.AllCell,dm6,H3K27me3,Adult,"[SRX699107, SRX5343144, SRX5343152, SRX5343146...",54


Unnamed: 0,file_name,genome_assembly,antigen,cell_type_class,experimental_ids_list,no_experiments
64332,His.Bld.05.H3K27me3.AllCell,hg38,H3K27me3,Blood,"[SRX749790, SRX6608388, SRX764389, SRX2612175,...",558
97234,His.Bld.05.H3K27me3.AllCell,mm10,H3K27me3,Blood,"[SRX2315739, SRX2315748, SRX3384701, SRX338470...",533


Unnamed: 0,file_name,genome_assembly,antigen,cell_type_class,experimental_ids_list,no_experiments
74477,His.Bon.05.H3K27me3.AllCell,hg38,H3K27me3,Bone,"[SRX3791922, SRX2256182, SRX096367, SRX5805565...",32
94392,His.Bon.05.H3K27me3.AllCell,mm10,H3K27me3,Bone,"[SRX1035111, SRX3791886, SRX3791887, DRX111901...",9


Unnamed: 0,file_name,genome_assembly,antigen,cell_type_class,experimental_ids_list,no_experiments
59009,His.Brs.05.H3K27me3.AllCell,hg38,H3K27me3,Breast,"[SRX387611, SRX1023529, SRX539651, SRX387604, ...",194
96910,His.Brs.05.H3K27me3.AllCell,mm10,H3K27me3,Breast,"[SRX2356738, SRX2356741, SRX2356743, SRX235673...",68


Unnamed: 0,file_name,genome_assembly,antigen,cell_type_class,experimental_ids_list,no_experiments
63400,His.CDV.05.H3K27me3.AllCell,hg38,H3K27me3,Cardiovascular,"[SRX8030138, SRX4947701, SRX4947702, SRX860893...",44
104266,His.CDV.05.H3K27me3.AllCell,mm10,H3K27me3,Cardiovascular,"[SRX3855842, SRX373602, SRX305920, SRX305918, ...",73


Unnamed: 0,file_name,genome_assembly,antigen,cell_type_class,experimental_ids_list,no_experiments
15403,His.CeL.05.H3K27me3.AllCell,dm6,H3K27me3,Cell line,"[SRX3170975, SRX2399660, SRX193339, SRX2399661...",40


Unnamed: 0,file_name,genome_assembly,antigen,cell_type_class,experimental_ids_list,no_experiments
60220,His.Dig.05.H3K27me3.AllCell,hg38,H3K27me3,Digestive tract,"[SRX1568634, SRX610762, SRX1568635, SRX610770,...",69
98487,His.Dig.05.H3K27me3.AllCell,mm10,H3K27me3,Digestive tract,"[SRX4047592, SRX4047635, SRX4047603, SRX404759...",54


Unnamed: 0,file_name,genome_assembly,antigen,cell_type_class,experimental_ids_list,no_experiments
3900,His.Emb.05.H3K27me3.AllCell,ce11,H3K27me3,Embryo,"[SRX466538, SRX4082371, SRX466537, SRX4082370]",4
10626,His.Emb.05.H3K27me3.AllCell,dm6,H3K27me3,Embryo,"[SRX110776, SRX110778, SRX110777, SRX110779, S...",75
54445,His.Emb.05.H3K27me3.AllCell,hg38,H3K27me3,Embryo,"[SRX5066992, SRX5066991]",2
105763,His.Emb.05.H3K27me3.AllCell,mm10,H3K27me3,Embryo,"[SRX149175, SRX2320707, SRX2320731, SRX1162722...",303


Unnamed: 0,file_name,genome_assembly,antigen,cell_type_class,experimental_ids_list,no_experiments
90160,His.EmF.05.H3K27me3.AllCell,mm10,H3K27me3,Embryonic fibroblast,"[DRX012094, DRX012092, DRX012091, DRX012093, D...",193


Unnamed: 0,file_name,genome_assembly,antigen,cell_type_class,experimental_ids_list,no_experiments
76835,His.Epd.05.H3K27me3.AllCell,hg38,H3K27me3,Epidermis,"[SRX971592, SRX1184136, SRX1184124, SRX1295725...",96
102474,His.Epd.05.H3K27me3.AllCell,mm10,H3K27me3,Epidermis,"[SRX5443903, SRX5443906, SRX5443909, SRX544390...",41


Unnamed: 0,file_name,genome_assembly,antigen,cell_type_class,experimental_ids_list,no_experiments
72915,His.Gon.05.H3K27me3.AllCell,hg38,H3K27me3,Gonad,"[SRX7775613, SRX7775609, SRX7775611, SRX452982...",23
100497,His.Gon.05.H3K27me3.AllCell,mm10,H3K27me3,Gonad,"[SRX332346, SRX1060572, SRX332348, SRX207548, ...",132


Unnamed: 0,file_name,genome_assembly,antigen,cell_type_class,experimental_ids_list,no_experiments
81897,His.Kid.05.H3K27me3.AllCell,hg38,H3K27me3,Kidney,"[SRX6369406, SRX189943, SRX1458752, SRX1458753...",38
102323,His.Kid.05.H3K27me3.AllCell,mm10,H3K27me3,Kidney,"[SRX6446217, SRX6446218, SRX3127902, SRX312789...",18


Unnamed: 0,file_name,genome_assembly,antigen,cell_type_class,experimental_ids_list,no_experiments
4094,His.Lar.05.H3K27me3.AllCell,ce11,H3K27me3,Larvae,"[SRX5402704, SRX5402703, SRX4082376, SRX466482...",16
12509,His.Lar.05.H3K27me3.AllCell,dm6,H3K27me3,Larvae,"[SRX104968, SRX331379, SRX2564137, SRX104967, ...",32


Unnamed: 0,file_name,genome_assembly,antigen,cell_type_class,experimental_ids_list,no_experiments
69168,His.Liv.05.H3K27me3.AllCell,hg38,H3K27me3,Liver,"[SRX3178422, SRX3178423, SRX3178427, SRX317842...",47
89610,His.Liv.05.H3K27me3.AllCell,mm10,H3K27me3,Liver,"[SRX4616078, SRX4616081, SRX6922691, SRX692269...",97


Unnamed: 0,file_name,genome_assembly,antigen,cell_type_class,experimental_ids_list,no_experiments
79134,His.Lng.05.H3K27me3.AllCell,hg38,H3K27me3,Lung,"[SRX1293676, SRX1293675, SRX1756907, SRX317855...",137
94767,His.Lng.05.H3K27me3.AllCell,mm10,H3K27me3,Lung,"[SRX2531392, SRX5646074, SRX4343050, SRX434306...",24


Unnamed: 0,file_name,genome_assembly,antigen,cell_type_class,experimental_ids_list,no_experiments
81071,His.Myo.05.H3K27me3.AllCell,hg38,H3K27me3,Muscle,"[SRX1878895, SRX7070025, SRX7070024, SRX199840...",17
101512,His.Myo.05.H3K27me3.AllCell,mm10,H3K27me3,Muscle,"[SRX101697, SRX286493, SRX286497, SRX5874017, ...",40


Unnamed: 0,file_name,genome_assembly,antigen,cell_type_class,experimental_ids_list,no_experiments
76355,His.Neu.05.H3K27me3.AllCell,hg38,H3K27me3,Neural,"[SRX1694161, SRX4221961, SRX2166903, SRX422196...",181
101285,His.Neu.05.H3K27me3.AllCell,mm10,H3K27me3,Neural,"[SRX252184, SRX4999179, SRX4999180, SRX5491864...",233


Unnamed: 0,file_name,genome_assembly,antigen,cell_type_class,experimental_ids_list,no_experiments


Unnamed: 0,file_name,genome_assembly,antigen,cell_type_class,experimental_ids_list,no_experiments
66732,His.Oth.05.H3K27me3.AllCell,hg38,H3K27me3,Others,"[SRX2931083, SRX152827, SRX152824, SRX152826, ...",37
108543,His.Oth.05.H3K27me3.AllCell,mm10,H3K27me3,Others,"[SRX228664, SRX4107507, SRX1471731, SRX1471732...",49


Unnamed: 0,file_name,genome_assembly,antigen,cell_type_class,experimental_ids_list,no_experiments
63517,His.Pan.05.H3K27me3.AllCell,hg38,H3K27me3,Pancreas,"[ERX2078817, ERX2078805, ERX2078839, SRX170757...",41
101070,His.Pan.05.H3K27me3.AllCell,mm10,H3K27me3,Pancreas,"[SRX3710135, SRX3710136, SRX3710129, SRX581736...",31


Unnamed: 0,file_name,genome_assembly,antigen,cell_type_class,experimental_ids_list,no_experiments
78712,His.Plc.05.H3K27me3.AllCell,hg38,H3K27me3,Placenta,"[SRX5443236, SRX5443234, SRX5443235, SRX5443233]",4
106634,His.Plc.05.H3K27me3.AllCell,mm10,H3K27me3,Placenta,"[SRX160407, SRX4546334, SRX4546332, SRX4546338...",35


Unnamed: 0,file_name,genome_assembly,antigen,cell_type_class,experimental_ids_list,no_experiments
78698,His.PSC.05.H3K27me3.AllCell,hg38,H3K27me3,Pluripotent stem cell,"[SRX1053925, SRX1053930, SRX3511930, SRX105399...",227
100608,His.PSC.05.H3K27me3.AllCell,mm10,H3K27me3,Pluripotent stem cell,"[SRX4733384, SRX186067, SRX4338257, SRX4338259...",962


Unnamed: 0,file_name,genome_assembly,antigen,cell_type_class,experimental_ids_list,no_experiments
66000,His.Prs.05.H3K27me3.AllCell,hg38,H3K27me3,Prostate,"[SRX8326235, SRX276787, SRX160732, SRX539663, ...",155


Unnamed: 0,file_name,genome_assembly,antigen,cell_type_class,experimental_ids_list,no_experiments
14170,His.Pup.05.H3K27me3.AllCell,dm6,H3K27me3,Pupae,"[SRX041396, SRX3511942, SRX3511948, SRX013085]",4


Unnamed: 0,file_name,genome_assembly,antigen,cell_type_class,experimental_ids_list,no_experiments
89802,His.Spl.05.H3K27me3.AllCell,mm10,H3K27me3,Spleen,"[SRX5006207, SRX273302, ERX1805535, ERX1805526...",12


Unnamed: 0,file_name,genome_assembly,antigen,cell_type_class,experimental_ids_list,no_experiments
13663,His.Unc.05.H3K27me3.AllCell,dm6,H3K27me3,Unclassified,"[SRX110781, SRX110782, ERX011419, ERX011420, S...",47
83816,His.Unc.05.H3K27me3.AllCell,hg38,H3K27me3,Unclassified,"[SRX4221916, SRX4221889, SRX4221907, SRX422188...",90
103600,His.Unc.05.H3K27me3.AllCell,mm10,H3K27me3,Unclassified,"[SRX333551, SRX206135, SRX206137, SRX3979850, ...",83


Unnamed: 0,file_name,genome_assembly,antigen,cell_type_class,experimental_ids_list,no_experiments
85164,His.Utr.05.H3K27me3.AllCell,hg38,H3K27me3,Uterus,"[SRX2158965, SRX2158974, SRX378015, SRX524979,...",39
109318,His.Utr.05.H3K27me3.AllCell,mm10,H3K27me3,Uterus,"[SRX3306701, SRX3306715, SRX3306713, SRX330670...",22
