### Goal

Test for overrepresentation of protein domains in genes associated with high number of CNEs (defined by associating each CNE to its closest gene).

### Input

- closest_gene_counts_by_node.tsv: number of CNEs associated with (closest to) each gene, separated by node of origin (generated with find_closest_gene.py)
- gene_IPR_dict.pickle: dictionary of protein domains for each gene (generated with find_closest_gene.py)

### Output

- gene_IPR.tsv: dataframe of gene-Interpro domain links
- all_species_overrep_domains.tsv: Results of Fisher test for all species, all protein domains
- summary_by_sp.tsv: short summary of results
- sp_counts.tsv: overrepresented domains ranked by number of species
- summary_by_node.tsv: short summary of results for analysis run separately for CNEs of each node
- overrep_domains_node_count.tsv: overrepresented domains ranked by number of nodes
- homeo_pvalues.tsv: pvalues for homeodomain only, for downstream analyses.

In [1]:
import sys
import scipy.stats as stats
import pandas as pd
import glob
import statistics
import collections
import pickle

#### Read table of CNE counts per gene

In [2]:
closest_genes = "../../find_closest_gene/new_parse_gff/closest_gene_counts_by_node.tsv"
closest_genes_df = pd.read_csv(closest_genes, sep="\t")
closest_genes_df.columns.values[1] = 'gene_id'
# Exclude cnes without genes on same scaffold
closest_genes_df = closest_genes_df[closest_genes_df['gene_id'] != 'no_gene']
closest_genes_df

Unnamed: 0,species,gene_id,closest_cne_count,cne_node
0,spis,gene-LOC111326177,2,hexacorallia
1,spis,gene-LOC111327133,1,hexacorallia
2,spis,gene-LOC111328643,1,hexacorallia
3,spis,gene-LOC111331958,1,hexacorallia
4,spis,gene-LOC111325339,1,hexacorallia
...,...,...,...,...
138947,nvec,gene-LOC5521942,1,ambiguous
138948,nvec,gene-LOC5521953,1,ambiguous
138949,nvec,gene-LOC5522147,2,ambiguous
138950,nvec,gene-LOC116604813,1,ambiguous


In [7]:
closest_genes_df.sort_values('closest_cne_count', ascending=False)

Unnamed: 0,species,gene_id,closest_cne_count,cne_node
27791,spis,gene-LOC111323368,89,pocilloporidae
129102,pdam,gene-LOC113673894,87,pocilloporidae
34597,spis,gene-LOC111335778,74,pocilloporidae
31787,spis,gene-LOC111330647,65,pocilloporidae
127491,pdam,gene-LOC113671074,62,pocilloporidae
...,...,...,...,...
61377,ofav,gene-LOC110058478,1,cnidaria
61375,ofav,gene-LOC110058338,1,cnidaria
61374,ofav,gene-LOC110058297,1,cnidaria
61373,ofav,gene-LOC110058202,1,cnidaria


#### Read protein domain information

In [3]:
gene_IPR_file = "../../find_closest_gene/new_parse_gff/gene_IPR_dict.pickle"

In [4]:
with open(gene_IPR_file, "rb") as infile:
    gene_IPR_dict = pickle.load(infile)

In [5]:
for species, gene_dict in gene_IPR_dict.items():
    print(species, len(gene_dict))

spis 24846
hsym 22022
aaur 30167
dgig 22045
chem 45872
ofav 25929
aten 19980
mvir 24278
hvul 20058
adig 26073
epal 22509
pdam 19935
nvec 23845


### Retrieve IPR descriptions
 If IPR description file does not exist

In [11]:
ipr_desc = {}
all_IPRS_desc = pd.DataFrame()
for file in interpro_files:
    print(file)
    species = file.split("/")[-1].split("_")[0]
    interpro_results = pd.read_table(file, names = ('gene_id', 'identifier', 'length', 'software', 
                                                         'software_id', 'software_prediction', 'start', 'end',
                                                         'score', 'status', 'date', 'IPR_id', 'description'))
    for idx, row in interpro_results.iterrows():
        IPR_id = row['IPR_id']
        description = row['description']
        ipr_desc[IPR_id] = description
all_IPRS_desc = pd.DataFrame(ipr_desc.items(), columns=['IPR_id','description'])
all_IPRS_desc = all_IPRS_desc.append(ipr_desc_df)
all_IPRS_desc = all_IPRS_desc.drop_duplicates('IPR_id')
all_IPRS_desc 

In [12]:
all_IPRS_desc

In [13]:
all_IPRS_desc.to_csv("all_IPR_desc.tsv", sep="\t", index=False)

#### Read file of IPR descriptions if it already exists

In [6]:
all_IPRS_desc = pd.read_csv("../avg/with_conv_filt/all_IPR_desc.tsv", sep="\t")
all_IPRS_desc

Unnamed: 0,IPR_id,description
0,,
1,IPR002181,"Fibrinogen, alpha/beta/gamma chain, C-terminal..."
2,IPR036056,"Fibrinogen-like, C-terminal"
3,IPR000885,"Fibrillar collagen, C-terminal"
4,IPR002110,Ankyrin repeat
...,...,...
14329,IPR003566,T-cell surface glycoprotein CD5
14330,IPR037004,"Exonuclease VII, small subunit superfamily"
14331,IPR004124,"Glycoside hydrolase, family 33, N-terminal"
14332,IPR012480,Heparinase II/III-like


### Make dataframe of gene_IPRs (one IPR per row)

In [10]:
gene_IPR_df = pd.DataFrame(columns=['species', 'gene_id', 'IPR_id'])
for species, gene_dict in gene_IPR_dict.items():
    print(species)
    sp_df = pd.DataFrame(columns=['gene_id', 'IPR_id'])
    for gene_id, IPR_list in gene_dict.items():
        if len(IPR_list)>0:
            for IPR in IPR_list:
                row = [gene_id, IPR]
                sp_df.loc[len(sp_df)] = row
        else:
            row = [gene_id, 'no_IPR']
            sp_df.loc[len(sp_df)] = row
    sp_df['species'] = species
    gene_IPR_df = pd.concat([gene_IPR_df, sp_df], axis=0)

spis
hsym
aaur
dgig
chem
ofav
aten
mvir
hvul
adig
epal
pdam
nvec


In [11]:
#gene_IPR_df.to_csv("gene_IPR.tsv", sep="\t", index=False)

In [7]:
# Read if file already exists
gene_IPR_df = pd.read_csv("gene_IPR.tsv", sep="\t")

In [9]:
gene_IPR_df[gene_IPR_df['species']=='chem']

Unnamed: 0,species,gene_id,IPR_id
348572,chem,XLOC_000078,IPR016082
348573,chem,XLOC_000078,IPR035808
348574,chem,XLOC_000078,IPR039699
348575,chem,XLOC_000078,IPR018038
348576,chem,XLOC_000078,IPR005998
...,...,...,...
427911,chem,XLOC_045868,no_IPR
427912,chem,XLOC_045870,no_IPR
427913,chem,XLOC_045869,no_IPR
427914,chem,XLOC_045871,no_IPR


### Run Fisher tests

In [12]:
combined_output_df = pd.DataFrame()
summary_by_sp = pd.DataFrame(columns=['species', 'cne_count', 'gene_count', 'cne_threshold', 
                                      'num_IPR_tested', 'num_sig_IPRs', 'num_homeo'])
sp_list = list(set(closest_genes_df['species']))
print(sp_list)
for species in sp_list:
    print(species)
    sp_df = closest_genes_df[closest_genes_df['species'] == species]
    sp_df = sp_df.groupby('gene_id').sum().reset_index() 
    merged_sp_df = gene_IPR_df[gene_IPR_df['species'] == species].merge(sp_df, how='left')
    merged_sp_df['closest_cne_count'] = merged_sp_df['closest_cne_count'].fillna(value=0)
    total_genes = len(gene_IPR_df[gene_IPR_df['species'] == species]['gene_id'].unique())
    total_cnes = sum(sp_df['closest_cne_count'])
    cne_threshold = total_cnes/total_genes # mean cne count per gene in genome
    with_CNE_df = merged_sp_df[merged_sp_df['closest_cne_count'] > cne_threshold]
    without_CNE_df = merged_sp_df[merged_sp_df['closest_cne_count'] <= cne_threshold]
    gene_count_wCNE = len(with_CNE_df['gene_id'].unique())
    gene_count_noCNE = len(without_CNE_df['gene_id'].unique())
    test_IPRs = list(with_CNE_df['IPR_id'].unique())
    print("Testing: ", len(test_IPRs), "distinct IPRs")
    print("cne threshold:", cne_threshold)
    pvalues = []
    for test_IPR in test_IPRs:
        #print(IPR_id)
        # genes with high CNEs and containing protein domain
        gene_count_wCNE_wIPR = len(with_CNE_df[with_CNE_df['IPR_id'] == test_IPR]['gene_id'].unique())
        # genes with high CNEs not containing protein domain
        gene_count_wCNE_noIPR = gene_count_wCNE - gene_count_wCNE_wIPR
        # genes with low CNEs and containing protein domain
        gene_count_noCNE_wIPR = len(without_CNE_df[without_CNE_df['IPR_id'] == test_IPR]['gene_id'].unique())
        # genes with low CNEs not containing protein domain
        gene_count_noCNE_noIPR = gene_count_noCNE - gene_count_noCNE_wIPR
        # Make contingency table for Fisher test
        cont_matrix = [[gene_count_wCNE_wIPR, gene_count_wCNE_noIPR], [gene_count_noCNE_wIPR, gene_count_noCNE_noIPR]]
        # Run test
        oddsratio, pvalue = stats.fisher_exact(cont_matrix, alternative = 'greater')
        pvalues.append(pvalue)
    # Bonferroni pvalue corrrection
    padj = [pval * len(pvalues) for pval in pvalues]
    table = {'IPR_id':test_IPRs, 'pvalue':pvalues, 'corrected pvalue':padj}
    pval_df = pd.DataFrame(table)
    pval_df = pval_df.sort_values(by = ['corrected pvalue'])
    pval_df = pval_df.reset_index(drop = True)
    pval_df = pval_df[pval_df['corrected pvalue'] <= 0.05]
    print("Number of significant IPRs: ", len(pval_df))
    pval_df = pval_df.merge(all_IPRS_desc).sort_values('corrected pvalue')
    pval_df['species'] = species
    num_homeo = pval_df["description"].str.contains('Homeo').value_counts().get(True, 0)
    summary_row = [species, total_cnes, total_genes, cne_threshold, len(test_IPRs), len(pval_df), num_homeo]
    summary_by_sp.loc[len(summary_by_sp)] = summary_row
    combined_output_df = combined_output_df.append(pval_df)
combined_output_df = combined_output_df.merge(all_IPRS_desc).sort_values(['species', 'corrected pvalue'])
overrep_out_file = "all_species_overrep_domains.tsv"
summary_out_file = 'summary_by_sp.tsv'
print("Writing output files")
combined_output_df.to_csv(overrep_out_file, sep="\t", index=False)
summary_by_sp.to_csv(summary_out_file, sep="\t", index=False)

['hsym', 'hvul', 'pdam', 'ofav', 'epal', 'mvir', 'spis', 'chem', 'dgig', 'nvec', 'aaur', 'adig', 'aten']
hsym
Testing:  4375 distinct IPRs
cne threshold: 0.29347924802470254
Number of significant IPRs:  21
hvul


  combined_output_df = combined_output_df.append(pval_df)


Testing:  2064 distinct IPRs
cne threshold: 0.10115664572739057
Number of significant IPRs:  0
pdam
Testing:  4895 distinct IPRs
cne threshold: 4.083872585904189


  combined_output_df = combined_output_df.append(pval_df)


Number of significant IPRs:  76
ofav


  combined_output_df = combined_output_df.append(pval_df)


Testing:  5081 distinct IPRs
cne threshold: 1.6517027266766942
Number of significant IPRs:  22
epal
Testing:  2859 distinct IPRs
cne threshold: 0.22328846239282066


  combined_output_df = combined_output_df.append(pval_df)


Number of significant IPRs:  13
mvir
Testing:  2480 distinct IPRs
cne threshold: 0.10931707718922481


  combined_output_df = combined_output_df.append(pval_df)


Number of significant IPRs:  6
spis


  combined_output_df = combined_output_df.append(pval_df)


Testing:  5348 distinct IPRs
cne threshold: 4.288295902761008
Number of significant IPRs:  71
chem


  combined_output_df = combined_output_df.append(pval_df)


Testing:  1505 distinct IPRs
cne threshold: 0.04989972096267876
Number of significant IPRs:  1
dgig
Testing:  1509 distinct IPRs
cne threshold: 0.16157858924926288


  combined_output_df = combined_output_df.append(pval_df)


Number of significant IPRs:  22
nvec


  combined_output_df = combined_output_df.append(pval_df)


Testing:  2590 distinct IPRs
cne threshold: 0.14556510798909625
Number of significant IPRs:  4
aaur
Testing:  1601 distinct IPRs
cne threshold: 0.06659594921603076


  combined_output_df = combined_output_df.append(pval_df)


Number of significant IPRs:  1
adig


  combined_output_df = combined_output_df.append(pval_df)


Testing:  4224 distinct IPRs
cne threshold: 1.9154681087715262
Number of significant IPRs:  56
aten
Testing:  3539 distinct IPRs
cne threshold: 0.26306306306306304


  combined_output_df = combined_output_df.append(pval_df)


Number of significant IPRs:  22
Writing output files


  combined_output_df = combined_output_df.append(pval_df)


In [13]:
### Count number of species for each significant IPR domain
sp_count = combined_output_df[['IPR_id', 'species']].groupby('IPR_id').count().reset_index()
sp_count = sp_count.sort_values('species', ascending=False)
sp_count = sp_count.merge(all_IPRS_desc)
sp_count

Unnamed: 0,IPR_id,species,description
0,IPR013783,7,Immunoglobulin-like fold
1,IPR000742,6,EGF-like domain
2,IPR012337,6,Ribonuclease H-like superfamily
3,IPR013032,6,"EGF-like, conserved site"
4,IPR009057,5,Homeobox-like domain superfamily
...,...,...,...
143,IPR011029,1,Death-like domain superfamily
144,IPR009017,1,Green fluorescent protein
145,IPR008984,1,SMAD/FHA domain superfamily
146,IPR008936,1,Rho GTPase activation protein


In [16]:
sp_count.to_csv('sp_counts_avg.tsv', sep="\t", index=False)

### Use mean + std_dev as threshold

In [17]:
import os

In [18]:
out_dir = "avg_stdev/"
os.mkdir(out_dir)
combined_output_df = pd.DataFrame()
summary_by_sp = pd.DataFrame(columns=['species', 'cne_count', 'gene_count', 'cne_threshold', 
                                      'num_IPR_tested', 'num_sig_IPRs', 'num_homeo'])
sp_list = list(set(closest_genes_df['species']))
print(sp_list)
for species in sp_list:
    print(species)
    sp_df = closest_genes_df[closest_genes_df['species'] == species]
    sp_df = sp_df.groupby('gene_id').sum().reset_index() 
    merged_sp_df = gene_IPR_df[gene_IPR_df['species'] == species].merge(sp_df, how='left')
    merged_sp_df['closest_cne_count'] = merged_sp_df['closest_cne_count'].fillna(value=0)
    #total_genes = len(gene_IPR_df[gene_IPR_df['species'] == species]['gene_id'].unique())
    #total_cnes = sum(sp_df['closest_cne_count'])
    mean_cne = statistics.mean(list(sp_df['closest_cne_count']))
    stdev_cne =  statistics.stdev(list(sp_df['closest_cne_count']))
    cne_threshold = mean_cne + stdev_cne
    #cne_threshold = total_cnes/total_genes # mean cne count per gene in genome
    with_CNE_df = merged_sp_df[merged_sp_df['closest_cne_count'] > cne_threshold]
    # genes with low CNEs not containing protein domain
    without_CNE_df = merged_sp_df[merged_sp_df['closest_cne_count'] <= cne_threshold]
    gene_count_wCNE = len(with_CNE_df['gene_id'].unique())
    gene_count_noCNE = len(without_CNE_df['gene_id'].unique())
    test_IPRs = list(with_CNE_df['IPR_id'].unique())
    print("Testing: ", len(test_IPRs), "distinct IPRs")
    print("cne threshold:", cne_threshold)
    pvalues = []
    for test_IPR in test_IPRs:
        #print(IPR_id)
        # genes with high CNEs and containing protein domain
        gene_count_wCNE_wIPR = len(with_CNE_df[with_CNE_df['IPR_id'] == test_IPR]['gene_id'].unique())
        # genes with high CNEs not containing protein domain
        gene_count_wCNE_noIPR = gene_count_wCNE - gene_count_wCNE_wIPR
        # genes with low CNEs and containing protein domain
        gene_count_noCNE_wIPR = len(without_CNE_df[without_CNE_df['IPR_id'] == test_IPR]['gene_id'].unique())
        gene_count_noCNE_noIPR = gene_count_noCNE - gene_count_noCNE_wIPR
        # Make contingency table for Fisher test
        cont_matrix = [[gene_count_wCNE_wIPR, gene_count_wCNE_noIPR], [gene_count_noCNE_wIPR, gene_count_noCNE_noIPR]]
        # Run test
        oddsratio, pvalue = stats.fisher_exact(cont_matrix, alternative = 'greater')
        pvalues.append(pvalue)
    # Bonferroni pvalue corrrection
    padj = [pval * len(pvalues) for pval in pvalues]
    table = {'IPR_id':test_IPRs, 'pvalue':pvalues, 'corrected pvalue':padj}
    pval_df = pd.DataFrame(table)
    pval_df = pval_df.sort_values(by = ['corrected pvalue'])
    pval_df = pval_df.reset_index(drop = True)
    pval_df = pval_df[pval_df['corrected pvalue'] <= 0.05]
    ##fisher_results_df = pd.DataFrame(fisher_results.items(), columns=['IPR_id', 'p-value'])
    print("Number of significant IPRs: ", len(pval_df))
    pval_df = pval_df.merge(all_IPRS_desc).sort_values('corrected pvalue')
    pval_df['species'] = species
    num_homeo = pval_df["description"].str.contains('Homeo').value_counts().get(True, 0)
    summary_row = [species, total_cnes, total_genes, cne_threshold, len(test_IPRs), len(pval_df), num_homeo]
    summary_by_sp.loc[len(summary_by_sp)] = summary_row
    combined_output_df = combined_output_df.append(pval_df)
combined_output_df = combined_output_df.merge(all_IPRS_desc).sort_values(['species', 'corrected pvalue'])
### Count number of species for each significant IPR domain
sp_count = combined_output_df[['IPR_id', 'species']].groupby('IPR_id').count().reset_index()
sp_count = sp_count.sort_values('species', ascending=False)
sp_count = sp_count.merge(all_IPRS_desc)
overrep_out_file = out_dir + "all_species_overrep_domains.tsv"
summary_out_file = out_dir + 'summary_by_sp.tsv'
sp_count_out_file = out_dir + 'sp_counts.tsv' 
print("Writing output files")
combined_output_df.to_csv(overrep_out_file, sep="\t", index=False)
summary_by_sp.to_csv(summary_out_file, sep="\t", index=False)
sp_count.to_csv(sp_count_out_file, sep="\t", index=False)

['hsym', 'hvul', 'pdam', 'ofav', 'epal', 'mvir', 'spis', 'chem', 'dgig', 'nvec', 'aaur', 'adig', 'aten']
hsym
Testing:  1054 distinct IPRs
cne threshold: 2.647269734280448
Number of significant IPRs:  2
hvul
Testing:  452 distinct IPRs
cne threshold: 1.648064144702686


  combined_output_df = combined_output_df.append(pval_df)


Number of significant IPRs:  2
pdam
Testing:  1843 distinct IPRs
cne threshold: 11.065956549206742


  combined_output_df = combined_output_df.append(pval_df)


Number of significant IPRs:  42
ofav
Testing:  1306 distinct IPRs
cne threshold: 7.577158735625564


  combined_output_df = combined_output_df.append(pval_df)


Number of significant IPRs:  10
epal
Testing:  530 distinct IPRs
cne threshold: 2.7894647174250355


  combined_output_df = combined_output_df.append(pval_df)


Number of significant IPRs:  9
mvir
Testing:  249 distinct IPRs
cne threshold: 2.2494988865182712


  combined_output_df = combined_output_df.append(pval_df)


Number of significant IPRs:  1
spis
Testing:  2095 distinct IPRs
cne threshold: 11.265647422038942


  combined_output_df = combined_output_df.append(pval_df)


Number of significant IPRs:  63
chem
Testing:  421 distinct IPRs
cne threshold: 1.8311935437612146


  combined_output_df = combined_output_df.append(pval_df)


Number of significant IPRs:  3
dgig
Testing:  342 distinct IPRs
cne threshold: 2.7892938989639564


  combined_output_df = combined_output_df.append(pval_df)


Number of significant IPRs:  7
nvec
Testing:  383 distinct IPRs
cne threshold: 2.838551148889957


  combined_output_df = combined_output_df.append(pval_df)


Number of significant IPRs:  2
aaur
Testing:  401 distinct IPRs
cne threshold: 1.6163622406187954


  combined_output_df = combined_output_df.append(pval_df)


Number of significant IPRs:  1
adig
Testing:  1053 distinct IPRs
cne threshold: 7.976278949160756


  combined_output_df = combined_output_df.append(pval_df)


Number of significant IPRs:  32
aten
Testing:  646 distinct IPRs
cne threshold: 2.5386674499418076


  combined_output_df = combined_output_df.append(pval_df)


Number of significant IPRs:  3
Writing output files


  combined_output_df = combined_output_df.append(pval_df)


In [19]:
sp_count

Unnamed: 0,IPR_id,species,description
0,IPR012337,5,Ribonuclease H-like superfamily
1,IPR009057,4,Homeobox-like domain superfamily
2,IPR001584,4,"Integrase, catalytic core"
3,IPR013783,3,Immunoglobulin-like fold
4,IPR001356,3,Homeobox domain
...,...,...,...
106,IPR019080,1,YqaJ viral recombinase
107,IPR002219,1,"Protein kinase C-like, phorbol ester/diacylgly..."
108,IPR018161,1,"Wnt protein, conserved site"
109,IPR018159,1,Spectrin/alpha-actinin


### Run tests separately for CNEs of each node of origin

In [16]:
out_dir = "by_node_avg/"
os.mkdir(out_dir)
summary_by_node = pd.DataFrame(columns=['node', 'species', 'cne_count', 'gene_count', 'cne_threshold', 
                                      'num_IPR_tested', 'num_sig_IPRs', 'num_homeo'])
all_nodes_output_df = pd.DataFrame()
for node in list(set(closest_genes_df['cne_node'])):
    print(node)
    node_out_df = pd.DataFrame()
    node_cne_count_df = closest_genes_df[closest_genes_df['cne_node'] == node]
    node_sp_list = list(set(node_cne_count_df['species']))
    print(node_sp_list)
    for species in node_sp_list:
        print(species)
        node_sp_df = node_cne_count_df[node_cne_count_df['species'] == species]
        node_sp_df = node_sp_df.groupby('gene_id').sum().reset_index() 
        merged_sp_df = gene_IPR_df[gene_IPR_df['species'] == species].merge(node_sp_df, how='left')
        merged_sp_df['closest_cne_count'] = merged_sp_df['closest_cne_count'].fillna(value=0)
        total_genes = len(gene_IPR_df[gene_IPR_df['species'] == species]['gene_id'].unique())
        total_cnes = sum(node_sp_df['closest_cne_count'])
        mean_cne = statistics.mean(list(node_sp_df['closest_cne_count']))
        #stdev_cne =  statistics.stdev(list(node_sp_df['closest_cne_count']))
        cne_threshold = mean_cne #+ stdev_cne
        with_CNE_df = merged_sp_df[merged_sp_df['closest_cne_count'] > cne_threshold]
        without_CNE_df = merged_sp_df[merged_sp_df['closest_cne_count'] <= cne_threshold]
        gene_count_wCNE = len(with_CNE_df['gene_id'].unique())
        gene_count_noCNE = len(without_CNE_df['gene_id'].unique())
        test_IPRs = list(with_CNE_df['IPR_id'].unique())
        print("Testing: ", len(test_IPRs), "distinct IPRs")
        print("cne threshold:", cne_threshold)
        pvalues = []
        for test_IPR in test_IPRs:
            gene_count_wCNE_wIPR = len(with_CNE_df[with_CNE_df['IPR_id'] == test_IPR]['gene_id'].unique())
            gene_count_wCNE_noIPR = gene_count_wCNE - gene_count_wCNE_wIPR
            gene_count_noCNE_wIPR = len(without_CNE_df[without_CNE_df['IPR_id'] == test_IPR]['gene_id'].unique())
            gene_count_noCNE_noIPR = gene_count_noCNE - gene_count_noCNE_wIPR
            # Make contingency table for Fisher test
            cont_matrix = [[gene_count_wCNE_wIPR, gene_count_wCNE_noIPR], [gene_count_noCNE_wIPR, gene_count_noCNE_noIPR]]
            # Run test
            oddsratio, pvalue = stats.fisher_exact(cont_matrix, alternative = 'greater')
            pvalues.append(pvalue)
        padj = [pval * len(pvalues) for pval in pvalues]
        table = {'IPR_id':test_IPRs, 'pvalue':pvalues, 'corrected pvalue':padj}
        pval_df = pd.DataFrame(table)
        pval_df = pval_df.sort_values(by = ['corrected pvalue'])
        pval_df = pval_df.reset_index(drop = True)
        pval_df = pval_df[pval_df['corrected pvalue'] <= 0.05]
        pval_df['node'] = node
        pval_df['species'] = species
        pval_df = pval_df.merge(all_IPRS_desc)
        node_out_df = node_out_df.append(pval_df)
        summary_stats = [total_cnes, total_genes, cne_threshold, len(test_IPRs), len(pval_df)]
        num_homeo = pval_df["description"].str.contains('Homeo').value_counts().get(True, 0)
        summary_row = [node, species] + summary_stats + [num_homeo]
        summary_by_node.loc[len(summary_by_node)] = summary_row
    if len(node_out_df)>0:
        all_nodes_output_df = all_nodes_output_df.append(node_out_df)
        out_file = out_dir + node + "_overrep_domains.tsv"
        print("Writing file: ", out_file)
        node_out_df.to_csv(out_file, sep="\t", index=False)
    else:
        print("This node has no significant IPRs")
summary_by_node.to_csv(out_dir + 'summary_by_node.tsv', sep="\t", index=False)   

NameError: name 'os' is not defined

In [21]:
all_nodes_output_df

Unnamed: 0,IPR_id,pvalue,corrected pvalue,node,species,description
0,IPR018972,8.627736e-04,0.035374,medusozoa,hsym,Sas10 C-terminal domain
1,IPR011274,8.627736e-04,0.035374,medusozoa,hsym,"Malate dehydrogenase, NAD-dependent, cytosolic"
0,IPR039039,9.971084e-05,0.000100,medusozoa,hvul,RAI1-like family
0,IPR004114,5.354642e-04,0.019277,medusozoa,mvir,THUMP domain
1,IPR032157,5.354642e-04,0.019277,medusozoa,mvir,Proteasome assembly chaperone 4
...,...,...,...,...,...,...
7,IPR009017,6.778604e-08,0.000140,scleractinia,pdam,Green fluorescent protein
8,IPR011584,8.675076e-08,0.000179,scleractinia,pdam,Green fluorescent protein-related
9,IPR000152,8.925305e-08,0.000184,scleractinia,pdam,EGF-type aspartate/asparagine hydroxylation site
10,IPR020479,1.854090e-06,0.003819,scleractinia,pdam,"Homeobox domain, metazoa"


In [22]:
out_dir

'by_node_avg/'

In [23]:
all_nodes_output_df.to_csv(out_dir + "all_nodes_output_df.tsv", sep="\t", index=False)

#### Aggregate results by number of nodes

In [24]:
node_count = all_nodes_output_df.drop_duplicates(['node', 'IPR_id'])[['IPR_id', 'node']].\
    groupby('IPR_id').count().reset_index().sort_values('node', ascending=False).rename({'node':'node_count'}, axis=1)
node_count = node_count.merge(all_IPRS_desc)
node_count

Unnamed: 0,IPR_id,node_count,description
0,IPR012337,6,Ribonuclease H-like superfamily
1,IPR007110,6,Immunoglobulin-like domain
2,IPR036179,6,Immunoglobulin-like domain superfamily
3,IPR003598,6,Immunoglobulin subtype 2
4,IPR036397,5,Ribonuclease H superfamily
...,...,...,...
225,IPR016035,1,Acyl transferase/acyl hydrolase/lysophospholipase
226,IPR016072,1,"SKP1 component, dimerisation"
227,IPR016255,1,eIF-2-alpha kinase Gcn2
228,IPR016897,1,S-phase kinase-associated protein 1


In [25]:
node_count.to_csv("overrep_domains_node_count.tsv", sep="\t", index=False)

### Run test with homeodomain only (for plotting pvalue of homeodomain vs number of CNEs)

In [2]:
homeo_IPRids = ['IPR009057', 'IPR017970', 'IPR001356', 'IPR020479', 'IPR008422', 'IPR032967',
               'IPR032453', 'IPR000747' ] 

In [24]:
combined_output_df = pd.DataFrame()
summary_by_sp = pd.DataFrame(columns=['species', 'cne_count', 'gene_count', 'cne_threshold', 
                                      'num_IPR_tested', 'num_sig_IPRs', 'num_homeo'])
sp_list = list(set(closest_genes_df['species']))
print(sp_list)
for species in sp_list:
    print(species)
    sp_df = closest_genes_df[closest_genes_df['species'] == species]
    sp_df = sp_df.groupby('gene_id').sum().reset_index() 
    merged_sp_df = gene_IPR_df[gene_IPR_df['species'] == species].merge(sp_df, how='left')
    merged_sp_df['closest_cne_count'] = merged_sp_df['closest_cne_count'].fillna(value=0)
    total_genes = len(gene_IPR_df[gene_IPR_df['species'] == species]['gene_id'].unique())
    total_cnes = sum(sp_df['closest_cne_count'])
    cne_threshold = total_cnes/total_genes # mean cne count per gene in genome
    with_CNE_df = merged_sp_df[merged_sp_df['closest_cne_count'] > cne_threshold]
    without_CNE_df = merged_sp_df[merged_sp_df['closest_cne_count'] <= cne_threshold]
    gene_count_wCNE = len(with_CNE_df['gene_id'].unique())
    gene_count_noCNE = len(without_CNE_df['gene_id'].unique())
    # Only test homeodomain IDs
    test_IPRs = homeo_IPRids
    print("Testing: ", len(test_IPRs), "distinct IPRs")
    print("cne threshold:", cne_threshold)
    pvalues = []
    for test_IPR in test_IPRs:
        gene_count_wCNE_wIPR = len(with_CNE_df[with_CNE_df['IPR_id'] == test_IPR]['gene_id'].unique())
        gene_count_wCNE_noIPR = gene_count_wCNE - gene_count_wCNE_wIPR
        gene_count_noCNE_wIPR = len(without_CNE_df[without_CNE_df['IPR_id'] == test_IPR]['gene_id'].unique())
        gene_count_noCNE_noIPR = gene_count_noCNE - gene_count_noCNE_wIPR
        # Make contingency table for Fisher test
        cont_matrix = [[gene_count_wCNE_wIPR, gene_count_wCNE_noIPR], [gene_count_noCNE_wIPR, gene_count_noCNE_noIPR]]
        # Run test
        oddsratio, pvalue = stats.fisher_exact(cont_matrix, alternative = 'greater')
        pvalues.append(pvalue)
    padj = [pval * len(pvalues) for pval in pvalues]
    table = {'IPR_id':test_IPRs, 'pvalue':pvalues, 'corrected pvalue':padj}
    pval_df = pd.DataFrame(table)
    pval_df = pval_df.sort_values(by = ['corrected pvalue'])
    pval_df = pval_df.reset_index(drop = True)
    print("Number of significant IPRs: ", len(pval_df))
    pval_df = pval_df.merge(all_IPRS_desc).sort_values('corrected pvalue')
    pval_df['species'] = species
    num_homeo = pval_df["description"].str.contains('Homeo').value_counts().get(True, 0)
    summary_row = [species, total_cnes, total_genes, cne_threshold, len(test_IPRs), len(pval_df), num_homeo]
    summary_by_sp.loc[len(summary_by_sp)] = summary_row
    combined_output_df = combined_output_df.append(pval_df)
combined_output_df = combined_output_df.merge(all_IPRS_desc).sort_values(['species', 'corrected pvalue'])

['pdam', 'aten', 'spis', 'dgig', 'mvir', 'hsym', 'ofav', 'nvec', 'epal', 'chem', 'hvul', 'adig', 'aaur']
pdam
Testing:  8 distinct IPRs
cne threshold: 4.083872585904189
Number of significant IPRs:  8
aten
Testing:  8 distinct IPRs
cne threshold: 0.26306306306306304
Number of significant IPRs:  8
spis
Testing:  8 distinct IPRs
cne threshold: 4.288295902761008
Number of significant IPRs:  8
dgig
Testing:  8 distinct IPRs
cne threshold: 0.16157858924926288
Number of significant IPRs:  8
mvir
Testing:  8 distinct IPRs
cne threshold: 0.10931707718922481
Number of significant IPRs:  8
hsym
Testing:  8 distinct IPRs
cne threshold: 0.29347924802470254
Number of significant IPRs:  8
ofav
Testing:  8 distinct IPRs
cne threshold: 1.6517027266766942
Number of significant IPRs:  8
nvec
Testing:  8 distinct IPRs
cne threshold: 0.14556510798909625
Number of significant IPRs:  8
epal
Testing:  8 distinct IPRs
cne threshold: 0.22328846239282066
Number of significant IPRs:  8
chem
Testing:  8 distinct I

In [25]:
combined_output_df

Unnamed: 0,IPR_id,pvalue,corrected pvalue,description,species
38,IPR009057,1.038193e-02,8.305545e-02,Homeobox-like domain superfamily,aaur
25,IPR001356,4.027438e-02,3.221950e-01,Homeobox domain,aaur
12,IPR017970,7.713947e-02,6.171157e-01,"Homeobox, conserved site",aaur
51,IPR020479,5.628509e-01,4.502807e+00,"Homeobox domain, metazoa",aaur
64,IPR008422,1.000000e+00,8.000000e+00,Homeobox KN domain,aaur
...,...,...,...,...,...
28,IPR009057,4.607072e-21,3.685658e-20,Homeobox-like domain superfamily,spis
41,IPR020479,1.075390e-16,8.603118e-16,"Homeobox domain, metazoa",spis
54,IPR008422,2.992573e-01,2.394059e+00,Homeobox KN domain,spis
67,IPR032967,3.229494e-01,2.583595e+00,Retinal homeobox protein Rx-like,spis


In [26]:
combined_output_df.to_csv('homeo_pvalues.tsv', sep="\t", index=False)