In [1]:
import pandas as pd
from glob import glob
import os
# import pyBigWig
import numpy as np
from scipy import stats
from collections import defaultdict
from collections import Counter
from operator import itemgetter

In [2]:
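# NOTE: data_base_dir is currently commented out and unused; the cells below read from
# hard-coded absolute paths, so those paths must be edited to run on another filesystem.
# change this to point at the plant multidap data directory on your filesystem
# data_base_dir = '/clusterfs/jgi/groups/gentech/seqtech/plant_multidap_data'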

In [3]:
# Load the 4-species OrthoFinder orthology table (tab-separated).
ortho_table = pd.read_csv(
    '/clusterfs/jgi/groups/gentech/seqtech/plant_multidap_data/orthofinder/plant_4sp_orthofinder/ath-bol-crub-alyr-4species_orthology_table.tsv',
    sep='\t',
)

display(ortho_table.head(5))

# One sub-table per species, keyed by the value of the 'species' column.
ortho_table_by_species = {
    species_name: species_rows
    for species_name, species_rows in ortho_table.groupby('species')
}

# Number of non-null 'gene' entries for every (species, orthogroup) pair.
orthogroup_sizes = ortho_table.groupby(['species', 'orthogroup'])['gene'].count().sort_index()
display(orthogroup_sizes)

# Number of distinct species represented in each orthogroup.
n_species_by_orthogroup = ortho_table.groupby('orthogroup')['species'].nunique()
display(n_species_by_orthogroup)

# Orthogroup IDs flagged as organellar, read without a header row into one 'orthogroup' column.
# NOTE(review): this file is named '10sp_...' and lives under an 'N10' directory, while the
# orthology table above is the 4-species run -- confirm both use the same orthogroup IDs.
organelle_orthogroup_file = '/clusterfs/jgi/groups/gentech/homes/amoralescruz/multidap/N10/peak_annotations/organelle_OGs/10sp_organelle_orthogroups.txt'
organelle_orthogroups = pd.read_csv(
    organelle_orthogroup_file, header=None, names=['orthogroup']
)['orthogroup']

Unnamed: 0,species,orthogroup,protein_id,single_copy_ortho,gene
0,Brassica_oleracea_TO1000DH3,OG0000000,XP_013583255.1,False,LOC106292183
1,Brassica_oleracea_TO1000DH3,OG0000000,XP_013583261.1,False,LOC106292191
2,Brassica_oleracea_TO1000DH3,OG0000000,XP_013583306.1,False,LOC106292247
3,Brassica_oleracea_TO1000DH3,OG0000000,XP_013583330.1,False,LOC106292274
4,Brassica_oleracea_TO1000DH3,OG0000000,XP_013583428.1,False,LOC106292382


species                         orthogroup
Arabidopsis_lyrata_MN47         OG0000003     2
                                OG0000004     1
                                OG0000005     1
                                OG0000007     1
                                OG0000009     1
                                             ..
Capsella_rubella_Monte_Gargano  OG0022947     2
                                OG0022948     2
                                OG0022949     2
                                OG0022950     2
                                OG0022951     2
Name: gene, Length: 82230, dtype: int64

orthogroup
OG0000000    1
OG0000001    1
OG0000002    2
OG0000003    2
OG0000004    2
            ..
OG0022947    1
OG0022948    1
OG0022949    1
OG0022950    1
OG0022951    1
Name: species, Length: 22952, dtype: int64

In [4]:
# Distinct species labels present in the orthology table.
ortho_table['species'].unique()

array(['Brassica_oleracea_TO1000DH3', 'Arabidopsis_thaliana_Col-0',
       'Arabidopsis_lyrata_MN47', 'Capsella_rubella_Monte_Gargano'],
      dtype=object)

In [5]:
##### peak multiN10 dataset
# NOTE(review): the header above says "multiN10", but the directory below is '2_AtAlCrBo'
# (four species) -- confirm which dataset this cell is meant to describe.
annt_peaks = '/clusterfs/jgi/groups/gentech/homes/amoralescruz/multidap/2_AtAlCrBo/gene_assignment/annotated_tsvs/'
annotated_suffix = '_assigned_genes_mRNA_annotated.tsv'

# File names abbreviate the species. The key is the three '_'-separated tokens that follow
# the TF id; the value is the species label used in the orthology table.
species_by_filename_token = {
    'A_lyrata_MN47': 'Arabidopsis_lyrata_MN47',
    'A_thaliana_Col-0': 'Arabidopsis_thaliana_Col-0',
    'B_oleracea_TO1000DH3': 'Brassica_oleracea_TO1000DH3',
    # 'Monte_Gargano' itself contains '_', so only 'Monte' falls inside the three tokens.
    'C_rubella_Monte': 'Capsella_rubella_Monte_Gargano',
}

peak_files_annotated = []

# glob() returns paths in arbitrary order; sorting keeps the row order (and everything
# downstream that iterates with sort=False) reproducible between runs and filesystems.
for f in sorted(glob(os.path.join(annt_peaks, '*' + annotated_suffix))):
    file_name = os.path.basename(f)
    # <tf>_<species token 1>_<species token 2>_<species token 3>_<rest of library name>
    f_split = file_name.split('_', maxsplit=4)
    tf = f_split[0]
    library_name = file_name.replace(annotated_suffix, '')
    species_token = '_'.join(f_split[1:4])
    # Unknown abbreviations are passed through unchanged.
    species = species_by_filename_token.get(species_token, species_token)

    peak_files_annotated.append({
        'species': species,
        'tf': tf,
        'library_name': library_name,
        'annotated_peak_file': f
    })

# Fix the columns explicitly so the frame has the expected schema even if no file matched.
peak_files_annotated_df = pd.DataFrame(
    peak_files_annotated,
    columns=['species', 'tf', 'library_name', 'annotated_peak_file'],
)

# tfs_passed_all_filters = pd.read_csv('/clusterfs/jgi/groups/gentech/homes/amoralescruz/multidap/tfs_passed_all_filters.txt', header=None, names=['tf'])
#tfs_passed_all_filters = pd.read_csv('/clusterfs/jgi/groups/gentech/homes/amoralescruz/multidap/N10/peak_annotations/N10_T1_tfs.txt', header=None, names=['tf'])

#peak_files_annotated_df = peak_files_annotated_df[peak_files_annotated_df['tf'].isin(tfs_passed_all_filters['tf'])]

peak_files_annotated_df.head(20)

Unnamed: 0,species,tf,library_name,annotated_peak_file
0,Arabidopsis_lyrata_MN47,AT1G01060,AT1G01060_A_lyrata_MN47_DAPi709D10-ORGi5038-96...,/clusterfs/jgi/groups/gentech/homes/amoralescr...
1,Arabidopsis_thaliana_Col-0,AT1G01060,AT1G01060_A_thaliana_Col-0_DAPi709D10-ORGi5005...,/clusterfs/jgi/groups/gentech/homes/amoralescr...
2,Brassica_oleracea_TO1000DH3,AT1G01060,AT1G01060_B_oleracea_TO1000DH3_DAPi709D10-ORGi...,/clusterfs/jgi/groups/gentech/homes/amoralescr...
3,Capsella_rubella_Monte_Gargano,AT1G01060,AT1G01060_C_rubella_Monte_Gargano_DAPi709D10-O...,/clusterfs/jgi/groups/gentech/homes/amoralescr...
4,Arabidopsis_lyrata_MN47,AT1G01250,AT1G01250_A_lyrata_MN47_DAPi710B07-ORGi5038-96...,/clusterfs/jgi/groups/gentech/homes/amoralescr...
5,Arabidopsis_thaliana_Col-0,AT1G01250,AT1G01250_A_thaliana_Col-0_DAPi710B07-ORGi5005...,/clusterfs/jgi/groups/gentech/homes/amoralescr...
6,Brassica_oleracea_TO1000DH3,AT1G01250,AT1G01250_B_oleracea_TO1000DH3_DAPi710B07-ORGi...,/clusterfs/jgi/groups/gentech/homes/amoralescr...
7,Capsella_rubella_Monte_Gargano,AT1G01250,AT1G01250_C_rubella_Monte_Gargano_DAPi710B07-O...,/clusterfs/jgi/groups/gentech/homes/amoralescr...
8,Arabidopsis_lyrata_MN47,AT1G01720,AT1G01720_A_lyrata_MN47_DAPi709F09-ORGi5038-96...,/clusterfs/jgi/groups/gentech/homes/amoralescr...
9,Arabidopsis_thaliana_Col-0,AT1G01720,AT1G01720_A_thaliana_Col-0_DAPi709F09-ORGi5005...,/clusterfs/jgi/groups/gentech/homes/amoralescr...


## Selecting only libraries from the allowlist (libraries that passed the FRiP, coverage, and other filters)

In [6]:
# Keep only the peak files whose library name appears in the library allowlist.
allowlist = pd.read_csv(
    '/clusterfs/jgi/groups/gentech/seqtech/plant_multidap_data/library_filtering/library_allowlist.tsv',
    sep='\t',
)
allowlist_libnames = allowlist['library_name'].tolist()

is_allowlisted = peak_files_annotated_df['library_name'].isin(allowlist_libnames)
peak_files_annotated_df = peak_files_annotated_df[is_allowlisted]
display(peak_files_annotated_df)

Unnamed: 0,species,tf,library_name,annotated_peak_file
0,Arabidopsis_lyrata_MN47,AT1G01060,AT1G01060_A_lyrata_MN47_DAPi709D10-ORGi5038-96...,/clusterfs/jgi/groups/gentech/homes/amoralescr...
1,Arabidopsis_thaliana_Col-0,AT1G01060,AT1G01060_A_thaliana_Col-0_DAPi709D10-ORGi5005...,/clusterfs/jgi/groups/gentech/homes/amoralescr...
2,Brassica_oleracea_TO1000DH3,AT1G01060,AT1G01060_B_oleracea_TO1000DH3_DAPi709D10-ORGi...,/clusterfs/jgi/groups/gentech/homes/amoralescr...
3,Capsella_rubella_Monte_Gargano,AT1G01060,AT1G01060_C_rubella_Monte_Gargano_DAPi709D10-O...,/clusterfs/jgi/groups/gentech/homes/amoralescr...
4,Arabidopsis_lyrata_MN47,AT1G01250,AT1G01250_A_lyrata_MN47_DAPi710B07-ORGi5038-96...,/clusterfs/jgi/groups/gentech/homes/amoralescr...
...,...,...,...,...
1466,Capsella_rubella_Monte_Gargano,AT5G67300,AT5G67300_C_rubella_Monte_Gargano_DAPi711B01-O...,/clusterfs/jgi/groups/gentech/homes/amoralescr...
1467,Arabidopsis_lyrata_MN47,AT5G67580,AT5G67580_A_lyrata_MN47_DAPi711D05-ORGi5038-96...,/clusterfs/jgi/groups/gentech/homes/amoralescr...
1468,Arabidopsis_thaliana_Col-0,AT5G67580,AT5G67580_A_thaliana_Col-0_DAPi711D05-ORGi5005...,/clusterfs/jgi/groups/gentech/homes/amoralescr...
1469,Brassica_oleracea_TO1000DH3,AT5G67580,AT5G67580_B_oleracea_TO1000DH3_DAPi711D05-ORGi...,/clusterfs/jgi/groups/gentech/homes/amoralescr...


In [7]:
# Number of distinct TFs that still have at least one library after the allowlist filter.
len(peak_files_annotated_df['tf'].unique())

244

In [8]:
# Species labels that remain after the allowlist filter.
list(peak_files_annotated_df['species'].unique())

['Arabidopsis_lyrata_MN47',
 'Arabidopsis_thaliana_Col-0',
 'Brassica_oleracea_TO1000DH3',
 'Capsella_rubella_Monte_Gargano']

In [9]:
def FilterAssigned(assigned, ALLOWED_REGIONS, MIN_PEAK_FOLDCH, MIN_PEAK_FOLDCH_FRAC_OF_BEST,
                   FILTER_DIST_START, MIN_DIST_START, MAX_DIST_START,
                   FILTER_DIST_END, MIN_DIST_END, MAX_DIST_END,
                   MAX_GENE_TARGETS, ONE_PEAK_PER_TARGET):
    """Filter a table of peak-to-gene assignments and return it sorted by fold change.

    Filters are applied in this order:
      1. keep rows whose 'annotated_peak_region' is in ALLOWED_REGIONS;
      2. keep rows with 'peak_foldch' >= MIN_PEAK_FOLDCH;
      3. keep rows with 'peak_foldch' >= MIN_PEAK_FOLDCH_FRAC_OF_BEST * (largest
         'peak_foldch' among the rows that survived steps 1-2);
      4. if FILTER_DIST_START, keep MIN_DIST_START <= 'summit_to_cds_start' <= MAX_DIST_START;
      5. if FILTER_DIST_END, keep MIN_DIST_END <= 'summit_to_cds_end' <= MAX_DIST_END;
      6. if ONE_PEAK_PER_TARGET, keep only the highest-'peak_foldch' row per 'feature_desc'.

    Returns the surviving rows sorted by 'peak_foldch' (descending), truncated to the
    first MAX_GENE_TARGETS rows. The input frame is not modified.
    """
    in_allowed_region = assigned['annotated_peak_region'].isin(ALLOWED_REGIONS)
    kept = assigned[in_allowed_region]
    kept = kept[kept['peak_foldch'] >= MIN_PEAK_FOLDCH]

    # The relative threshold is based on the best peak that passed the absolute filters.
    relative_floor = MIN_PEAK_FOLDCH_FRAC_OF_BEST * kept['peak_foldch'].max()
    kept = kept[kept['peak_foldch'] >= relative_floor]

    # (enabled?, distance column, inclusive lower bound, inclusive upper bound)
    distance_windows = (
        (FILTER_DIST_START, 'summit_to_cds_start', MIN_DIST_START, MAX_DIST_START),
        (FILTER_DIST_END, 'summit_to_cds_end', MIN_DIST_END, MAX_DIST_END),
    )
    for enabled, column, lower, upper in distance_windows:
        if enabled:
            kept = kept[kept[column] >= lower]
            kept = kept[kept[column] <= upper]

    if ONE_PEAK_PER_TARGET:
        # Sorted descending, so the first row per feature is its strongest peak.
        kept = (
            kept
            .sort_values(by='peak_foldch', ascending=False)
            .drop_duplicates(subset=['feature_desc'], keep='first')
        )

    return kept.sort_values(by='peak_foldch', ascending=False).head(MAX_GENE_TARGETS)

def GetGffDescValue(gene_desc, key):
    """Return the value stored under ``key`` in a ';'-separated ``key=value`` string.

    Parameters
    ----------
    gene_desc : str
        Attribute string such as ``'ID=x;Name=y;Parent=z'``.
    key : str
        Attribute name to look up (compared exactly, case-sensitive).

    Returns
    -------
    str or None
        The text after the first '=' of the first tag whose name equals ``key``,
        or None when no such tag exists.
    """
    for tag in gene_desc.strip().split(';'):
        # partition() never raises: tags without '=' (for example the empty tag left
        # by a trailing ';') come back with an empty separator and are skipped.
        k, sep, v = tag.partition('=')
        if sep and k == key:
            return v
    return None
        
def ExtractTargetIds(peaks, key):
    """Return ``peaks`` with a 'target_id' column parsed from 'feature_desc'.

    For every row, 'target_id' is the value of attribute ``key`` in that row's
    'feature_desc' string, as returned by GetGffDescValue.

    A new DataFrame is returned and the input is left untouched: ``peaks`` is normally a
    filtered slice of a larger table, and assigning a column onto such a slice in place
    mutates the caller's object and can raise pandas' SettingWithCopyWarning.
    """
    target_ids = peaks['feature_desc'].apply(lambda desc: GetGffDescValue(desc, key))
    return peaks.assign(target_id=target_ids)

def ReadFilterPeakFiles(peakfiles_group, species_order, peak_filter_params):
    """Read and filter the annotated peak file of each species in one group of libraries.

    ``peakfiles_group`` is indexed by its 'species' column and reordered to follow
    ``species_order``; rows containing any missing value after reindexing are dropped,
    which removes species that have no file in this group. Each remaining
    'annotated_peak_file' is read as a tab-separated table and passed through
    FilterAssigned with ``peak_filter_params`` as keyword arguments.

    Returns a dict mapping species -> filtered peaks DataFrame, in ``species_order`` order.
    """
    files_by_species = (
        peakfiles_group
        .set_index('species', drop=True)
        .reindex(species_order)
        .dropna()
    )

    return {
        sp: FilterAssigned(
            pd.read_csv(files_by_species.loc[sp, 'annotated_peak_file'], sep='\t'),
            **peak_filter_params
        )
        for sp in files_by_species.index
    }

def AssignOrthogroupsToPeaksBySpecies(peaks_by_species, ortho_table_by_species, single_copy_only, organelle_orthogroups):
    """Apply AssignOrthogroups to every species' peak table.

    Returns a dict with the same keys (and order) as ``peaks_by_species``, whose values are
    the corresponding peak tables annotated with orthogroup information.
    """
    return {
        sp: AssignOrthogroups(sp_peaks, ortho_table_by_species, sp, single_copy_only, organelle_orthogroups)
        for sp, sp_peaks in peaks_by_species.items()
    }
    
def AssignOrthogroups(peaks, ortho_table_by_species, species, single_copy_only, organelle_orthogroups):
    """Join one species' peaks to that species' orthology table.

    The target id is parsed from each peak's 'feature_desc' and matched against the
    orthology table with an inner join, so peaks without a matching entry are dropped.
    For 'Brassica_oleracea_TO1000DH3' the 'gene' attribute is matched against the table's
    'gene' column; for every other species the 'Name' attribute is matched against
    'protein_id'.

    Orthology rows are restricted to single-copy orthologs when ``single_copy_only`` is
    true, and rows whose orthogroup is listed in ``organelle_orthogroups`` are always
    excluded. Returns the merged table with a fresh 0..n-1 index.
    """
    # Which GFF attribute holds the id, and which orthology-table column it matches.
    if species == 'Brassica_oleracea_TO1000DH3':
        feature_desc_key, ortho_table_key = 'gene', 'gene'
    else:
        feature_desc_key, ortho_table_key = 'Name', 'protein_id'

    peaks_with_ids = ExtractTargetIds(peaks, feature_desc_key)

    species_orthologs = ortho_table_by_species[species]
    if single_copy_only:
        species_orthologs = species_orthologs[species_orthologs['single_copy_ortho'] == True]

    # Remove orthogroups flagged as organellar.
    is_organelle = species_orthologs['orthogroup'].isin(organelle_orthogroups)
    species_orthologs = species_orthologs[~is_organelle]

    merged = pd.merge(
        peaks_with_ids,
        species_orthologs,
        left_on='target_id',
        right_on=ortho_table_key,
        how='inner',
    )
    return merged.reset_index(drop=True)

def FormatSpeciesName(sp):
    """Abbreviate an underscore-separated species label, e.g. 'Genus_epithet_x' -> 'G. epithet'.

    Only the first two '_'-separated tokens are used; anything after them is ignored.
    """
    tokens = sp.split('_')
    genus_initial = tokens[0][0]
    epithet = tokens[1]
    return '{}. {}'.format(genus_initial, epithet)

In [10]:
# Keyword arguments passed to FilterAssigned for every peak file.
peak_filter_params = dict(
    ALLOWED_REGIONS=['upstream', 'utr5prime', 'cds', 'intron'],
    MIN_PEAK_FOLDCH=5,
    MIN_PEAK_FOLDCH_FRAC_OF_BEST=0,   # 0 disables the relative-to-best threshold
    FILTER_DIST_START=True,
    MIN_DIST_START=-2000,
    MAX_DIST_START=500,
    FILTER_DIST_END=False,            # end-distance bounds below are ignored while False
    MIN_DIST_END=0,
    MAX_DIST_END=0,
    MAX_GENE_TARGETS=10000000,
    ONE_PEAK_PER_TARGET=False,
)

# Order in which species are processed and in which the cons_<species> columns are created.
species_order = [
    'Arabidopsis_thaliana_Col-0',
    'Arabidopsis_lyrata_MN47',
    'Capsella_rubella_Monte_Gargano',
    'Brassica_oleracea_TO1000DH3',
]

# TFs without a peak file for this species are skipped.
primary_species = species_order[0]

single_copy_only = False
one_peak_per_orthogroup = False

In [11]:
# Collect one orthogroup-annotated peak table per (tf, species), then stack them.
per_species_tables = []

for tf, group in peak_files_annotated_df.groupby('tf', sort=False):

    has_primary_species = primary_species in group['species'].values
    if not has_primary_species:
        print(f'no peaks file for {primary_species} and tf {tf}. skipped.')
        continue

    peaks_by_species = ReadFilterPeakFiles(group, species_order, peak_filter_params)
    peaks_orthogroups_by_species = AssignOrthogroupsToPeaksBySpecies(
        peaks_by_species, ortho_table_by_species, single_copy_only, organelle_orthogroups
    )

    for sp_peak_orthogroups in peaks_orthogroups_by_species.values():
        sp_peak_orthogroups['tf'] = tf
        per_species_tables.append(sp_peak_orthogroups)

filtered_assigned = pd.concat(per_species_tables)

if one_peak_per_orthogroup:
    ### only keep tallest peak if more than one gene in same species+orthogroup targeted
    # Ascending sort puts the tallest peak last within each (species, tf, orthogroup).
    filtered_assigned = (
        filtered_assigned
        .sort_values(by=['species', 'tf', 'orthogroup', 'peak_foldch'])
        .drop_duplicates(subset=['species', 'tf', 'orthogroup'], keep='last')
    )

display(filtered_assigned)

Unnamed: 0,peak_chr,peak_start,peak_end,peak_name,peak_foldch,peak_pscore,peak_qscore,peak_summit,feature_chr,feature_source,...,relative_cds_end,summit_to_cds_start,summit_to_cds_end,target_id,species,orthogroup,protein_id,single_copy_ortho,gene,tf
0,Chr3,15645684,15646225,AT1G01060_A_thaliana_Col-0_DAPi709D10-ORGi5005...,121.60200,1156.4600,1152.01000,268,Chr3,phytozomev12,...,0,-508,-1614,AT3G43750.1,Arabidopsis_thaliana_Col-0,OG0000207,AT3G43750.1,False,AT3G43750,AT1G01060
1,Chr1,1637949,1638542,AT1G01060_A_thaliana_Col-0_DAPi709D10-ORGi5005...,115.54900,1085.1000,1080.68000,228,Chr1,phytozomev12,...,-172,-1171,-2530,AT1G05540.1,Arabidopsis_thaliana_Col-0,OG0004918,AT1G05540.1,False,AT1G05540,AT1G01060
2,Chr1,1637949,1638542,AT1G01060_A_thaliana_Col-0_DAPi709D10-ORGi5005...,115.54900,1085.1000,1080.68000,228,Chr1,phytozomev12,...,-247,-314,-1681,AT1G05530.1,Arabidopsis_thaliana_Col-0,OG0002634,AT1G05530.1,False,AT1G05530,AT1G01060
3,Chr1,18204823,18205351,AT1G01060_A_thaliana_Col-0_DAPi709D10-ORGi5005...,115.54900,1085.1000,1080.68000,268,Chr1,phytozomev12,...,0,-855,-1610,AT1G49220.1,Arabidopsis_thaliana_Col-0,OG0001402,AT1G49220.1,False,AT1G49220,AT1G01060
4,Chr1,17178696,17179222,AT1G01060_A_thaliana_Col-0_DAPi709D10-ORGi5005...,110.41400,1025.1500,1020.76000,268,Chr1,phytozomev12,...,-367,-389,-1475,AT1G45474.2,Arabidopsis_thaliana_Col-0,OG0014575,AT1G45474.2,True,AT1G45474,AT1G01060
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3722,NC_027756.1,27624733,27624923,AT5G67580_B_oleracea_TO1000DH3_DAPi711D05-ORGi...,5.01485,11.8384,9.76091,81,NC_027756.1,Gnomon,...,-137,-1401,-3302,LOC106318833,Brassica_oleracea_TO1000DH3,OG0007506,XP_013612425.1,False,LOC106318833,AT5G67580
3723,NC_027751.1,18968692,18968880,AT5G67580_B_oleracea_TO1000DH3_DAPi711D05-ORGi...,5.01485,11.8384,9.76091,96,NC_027751.1,Gnomon,...,0,-982,-4900,LOC106338305,Brassica_oleracea_TO1000DH3,OG0000023,XP_013632773.1,False,LOC106338305,AT5G67580
3724,NC_027749.1,36499261,36499408,AT5G67580_B_oleracea_TO1000DH3_DAPi711D05-ORGi...,5.01485,11.8384,9.76091,97,NC_027749.1,Gnomon,...,0,-1764,-2659,LOC106324123,Brassica_oleracea_TO1000DH3,OG0000025,XP_013617599.1,False,LOC106324123,AT5G67580
3725,NC_027749.1,48858022,48858179,AT5G67580_B_oleracea_TO1000DH3_DAPi711D05-ORGi...,5.00487,12.8623,10.77080,68,NC_027749.1,Gnomon,...,0,-1876,-2932,LOC106324487,Brassica_oleracea_TO1000DH3,OG0000151,XP_013617903.1,False,LOC106324487,AT5G67580


In [12]:
# Number of distinct species present in the combined, filtered assignments.
print(len(filtered_assigned['species'].unique()))

4


In [13]:
# Number of distinct targeted genes per (species, tf, orthogroup).
cons_gene_counts_by_og = filtered_assigned.groupby(['species', 'tf', 'orthogroup'])['gene'].nunique()
display(cons_gene_counts_by_og.sort_index())

# Fraction of each species' orthogroup members that are targeted by each TF. The division
# aligns the two Series on their shared 'species' and 'orthogroup' index levels;
# combinations that do not occur in both come out as NaN and are dropped.
# NOTE(review): the numerator counts distinct genes, whereas orthogroup_sizes counts rows
# of the orthology table -- confirm the table has exactly one row per gene.
cons_gene_frac_by_og = (cons_gene_counts_by_og / orthogroup_sizes).dropna()
display(cons_gene_frac_by_og.sort_index())

species                         tf         orthogroup
Arabidopsis_lyrata_MN47         AT1G01060  OG0000003     1
                                           OG0000004     1
                                           OG0000007     1
                                           OG0000009     1
                                           OG0000014     1
                                                        ..
Capsella_rubella_Monte_Gargano  AT5G67580  OG0022871     1
                                           OG0022934     2
                                           OG0022939     1
                                           OG0022944     1
                                           OG0022949     1
Name: gene, Length: 4826609, dtype: int64

species                         orthogroup  tf       
Arabidopsis_lyrata_MN47         OG0000003   AT1G01060    0.5
                                            AT1G02230    0.5
                                            AT1G02250    0.5
                                            AT1G06280    0.5
                                            AT1G06850    0.5
                                                        ... 
Capsella_rubella_Monte_Gargano  OG0022951   AT5G64750    0.5
                                            AT5G65310    0.5
                                            AT5G65410    1.0
                                            AT5G66700    0.5
                                            AT5G66940    0.5
Name: gene, Length: 4826609, dtype: float64

In [14]:
# Turn the Series into a flat table: the index levels become columns and the values are
# stored in a column named 'og_targeted_frac'.
cons_gene_frac_by_og = cons_gene_frac_by_og.to_frame(name='og_targeted_frac').reset_index()
display(cons_gene_frac_by_og)

Unnamed: 0,species,orthogroup,tf,og_targeted_frac
0,Arabidopsis_lyrata_MN47,OG0000003,AT1G01060,0.5
1,Arabidopsis_lyrata_MN47,OG0000003,AT1G02230,0.5
2,Arabidopsis_lyrata_MN47,OG0000003,AT1G02250,0.5
3,Arabidopsis_lyrata_MN47,OG0000003,AT1G06280,0.5
4,Arabidopsis_lyrata_MN47,OG0000003,AT1G06850,0.5
...,...,...,...,...
4826604,Capsella_rubella_Monte_Gargano,OG0022951,AT5G64750,0.5
4826605,Capsella_rubella_Monte_Gargano,OG0022951,AT5G65310,0.5
4826606,Capsella_rubella_Monte_Gargano,OG0022951,AT5G65410,1.0
4826607,Capsella_rubella_Monte_Gargano,OG0022951,AT5G66700,0.5


In [15]:
%%time

# Attach to every assignment row the targeted fraction of its own (species, tf, orthogroup).
data = pd.merge(left=filtered_assigned, right=cons_gene_frac_by_og, on=['species', 'tf', 'orthogroup'], how='left')

dfs_to_concat = []
for tf, tf_group in data.groupby('tf'):
    # Work on an explicit copy: the group is derived from `data`, and adding columns to it
    # directly can raise SettingWithCopyWarning and leaves it unclear which object is modified.
    tf_group = tf_group.copy()
    for sp in species_order:
        # orthogroup -> targeted fraction for this TF in species `sp`
        sp_targets = tf_group[tf_group['species'] == sp]
        sp_targeted_frac_by_og = sp_targets.set_index('orthogroup')['og_targeted_frac'].to_dict()
        # Every row of the TF (whatever its species) gets the fraction observed in `sp` for
        # the same orthogroup; orthogroups not targeted in `sp` get 0.
        tf_group[f'cons_{sp}'] = tf_group['orthogroup'].map(sp_targeted_frac_by_og).fillna(0)
    dfs_to_concat.append(tf_group)

marked_cons = pd.concat(dfs_to_concat)


cons_col_names = [f'cons_{sp}' for sp in species_order]
for cons_frac_thresh in [0]:
    counted_cons_sp_col_name = f'n_cons_species_minfrac{cons_frac_thresh}'

    # A threshold of 0 means "targeted at all", so it must be strict; any other threshold
    # is inclusive.
    if cons_frac_thresh == 0:
        cons_above_thresh = marked_cons[cons_col_names] > cons_frac_thresh
    else:
        cons_above_thresh = marked_cons[cons_col_names] >= cons_frac_thresh

    # Number of species whose targeted fraction passes the threshold for this row.
    marked_cons[counted_cons_sp_col_name] = cons_above_thresh.sum(axis='columns')

# Number of distinct species that have any gene in the row's orthogroup.
marked_cons['n_species_in_orthogroup'] = marked_cons['orthogroup'].map(n_species_by_orthogroup)

display(marked_cons)

Unnamed: 0,peak_chr,peak_start,peak_end,peak_name,peak_foldch,peak_pscore,peak_qscore,peak_summit,feature_chr,feature_source,...,single_copy_ortho,gene,tf,og_targeted_frac,cons_Arabidopsis_thaliana_Col-0,cons_Arabidopsis_lyrata_MN47,cons_Capsella_rubella_Monte_Gargano,cons_Brassica_oleracea_TO1000DH3,n_cons_species_minfrac0,n_species_in_orthogroup
0,Chr3,15645684,15646225,AT1G01060_A_thaliana_Col-0_DAPi709D10-ORGi5005...,121.60200,1156.4600,1152.01000,268,Chr3,phytozomev12,...,False,AT3G43750,AT1G01060,0.857143,0.857143,0.285714,0.4,0.000000,3,4
1,Chr1,1637949,1638542,AT1G01060_A_thaliana_Col-0_DAPi709D10-ORGi5005...,115.54900,1085.1000,1080.68000,228,Chr1,phytozomev12,...,False,AT1G05540,AT1G01060,1.000000,1.000000,0.000000,0.5,0.000000,2,4
2,Chr1,1637949,1638542,AT1G01060_A_thaliana_Col-0_DAPi709D10-ORGi5005...,115.54900,1085.1000,1080.68000,228,Chr1,phytozomev12,...,False,AT1G05530,AT1G01060,0.500000,0.500000,0.000000,0.0,0.500000,2,4
3,Chr1,18204823,18205351,AT1G01060_A_thaliana_Col-0_DAPi709D10-ORGi5005...,115.54900,1085.1000,1080.68000,268,Chr1,phytozomev12,...,False,AT1G49220,AT1G01060,0.666667,0.666667,0.666667,1.0,0.333333,4,4
4,Chr1,17178696,17179222,AT1G01060_A_thaliana_Col-0_DAPi709D10-ORGi5005...,110.41400,1025.1500,1020.76000,268,Chr1,phytozomev12,...,True,AT1G45474,AT1G01060,1.000000,1.000000,0.000000,1.0,1.000000,3,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6911388,NC_027756.1,27624733,27624923,AT5G67580_B_oleracea_TO1000DH3_DAPi711D05-ORGi...,5.01485,11.8384,9.76091,81,NC_027756.1,Gnomon,...,False,LOC106318833,AT5G67580,0.500000,1.000000,1.000000,0.0,0.500000,3,4
6911389,NC_027751.1,18968692,18968880,AT5G67580_B_oleracea_TO1000DH3_DAPi711D05-ORGi...,5.01485,11.8384,9.76091,96,NC_027751.1,Gnomon,...,False,LOC106338305,AT5G67580,0.062500,0.000000,0.000000,0.0,0.062500,1,1
6911390,NC_027749.1,36499261,36499408,AT5G67580_B_oleracea_TO1000DH3_DAPi711D05-ORGi...,5.01485,11.8384,9.76091,97,NC_027749.1,Gnomon,...,False,LOC106324123,AT5G67580,0.156250,0.000000,0.000000,0.0,0.156250,1,1
6911391,NC_027749.1,48858022,48858179,AT5G67580_B_oleracea_TO1000DH3_DAPi711D05-ORGi...,5.00487,12.8623,10.77080,68,NC_027749.1,Gnomon,...,False,LOC106324487,AT5G67580,0.083333,0.000000,0.000000,0.0,0.083333,1,1


CPU times: user 19.4 s, sys: 1.21 s, total: 20.7 s
Wall time: 20.7 s


In [17]:
# Number of distinct TFs in the final table.
len(marked_cons['tf'].unique())

244

In [18]:
# Per species, the mean over rows of how many species have a non-zero targeted fraction
# for the row's TF and orthogroup.
marked_cons.groupby('species')['n_cons_species_minfrac0'].mean()


species
Arabidopsis_lyrata_MN47           2.538890
Arabidopsis_thaliana_Col-0        2.638799
Brassica_oleracea_TO1000DH3       2.164445
Capsella_rubella_Monte_Gargano    2.537400
Name: n_cons_species_minfrac0, dtype: float64

In [19]:
# Write the final table as a tab-separated file (without the index) one directory up.
outdir = '../'
outfile_name = 'N4_filtered-annotated-peaks_minfoldch5_minus-2000bp-to-plus-500bp_111623.tsv'
outfile_path = os.path.join(outdir, outfile_name)

print(outfile_name)
marked_cons.to_csv(outfile_path, sep='\t', index=False)

N4_filtered-annotated-peaks_minfoldch5_minus-2000bp-to-plus-500bp_111623.tsv
