### Goal

Intersect results of protein domain overrepresentation between different phyla and plot corresponding Venn diagrams.

### Input

- all_IPRs_curated.txt: list of protein domain IDs and descriptions, manually annotated with class info (homeodomain, other transcription factor, TE-related, etc.)
- sp_counts.tsv: overrepresented domains ranked by number of species (generated with overrep_domains_closest_gene.py)


### Output

- IPR_venn_avg.pdf:  Venn diagram plot
- inter_domains_avg.tsv: List of domains overrepresented in all three phyla
- inter_cnid_moll_avg.tsv: List of domains overrepresented in both Cnidaria and Mollusca
- inter_arth_moll_avg.tsv: List of domains overrepresented in both Arthropoda and Mollusca
- inter_arth_cnid_avg.tsv: List of domains overrepresented in both Arthropoda and Cnidaria
- `<phylum>_only_avg.tsv` (one file each for arthropoda, cnidaria and mollusca): List of domains overrepresented only in that phylum

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib_venn import venn3

In [49]:
# Load the curated IPR table (tab-separated) into IPR_annot_df and display it.
# IPR_annot_df is merged into the result tables built further down.
curated_IPRs = "../../../combine_overrep_analyses/all_IPRs_curated.txt"
IPR_annot_df = pd.read_csv(curated_IPRs, sep="\t")
IPR_annot_df

Unnamed: 0,IPR_id,description,class
0,IPR029034,Cystine-knot cytokine,other
1,IPR001303,Class II aldolase/adducin N-terminal,other
2,IPR033929,"Tensin, phosphotyrosine-binding domain",other
3,IPR011641,Tyrosine-protein kinase ephrin type A/B recept...,other
4,IPR036300,Mir domain superfamily,other
...,...,...,...
621,IPR010442,PET domain,TF
622,IPR038096,TEA/ATTS domain superfamily,TF
623,IPR008967,"p53-like transcription factor, DNA-binding",TF
624,IPR011598,"Myc-type, basic helix-loop-helix (bHLH) domain",TF


#### Only keep IPRs overrepresented in at least 2 species within a phylum

In [3]:
def retrieve_IPR_2sp(domain_file, min_species=2):
    """Return the IPR ids that are overrepresented in enough species.

    Parameters
    ----------
    domain_file : str, path or file-like
        Tab-separated table with at least an ``IPR_id`` and a ``species``
        column; handed directly to ``pandas.read_csv``.
    min_species : int, optional
        Smallest ``species`` value a row must have to be kept. Defaults to
        2, the threshold this function originally hard-coded.

    Returns
    -------
    set
        Distinct ``IPR_id`` values of the kept rows; rows whose ``IPR_id``
        is the string "-" are always dropped.
    """
    phylum_df = pd.read_csv(domain_file, sep="\t")
    # One combined mask instead of two successive filtered copies.
    keep = (phylum_df['species'] >= min_species) & (phylum_df['IPR_id'] != "-")
    return set(phylum_df.loc[keep, 'IPR_id'])

#### Function that retrieves intersection between domain sets

In [55]:
def venn_IPR(arthropoda_file, cnidaria_file, mollusca_file):
    """Intersect the per-phylum IPR sets and draw a three-set Venn diagram.

    NOTE(review): the three parameters are accepted but never read. The
    input paths come from the notebook-level variables
    ``arthropoda_domains``, ``cnidaria_domains`` and ``mollusca_domains``.
    The calls in this notebook pass their arguments positionally in the
    order (cnidaria, mollusca, arthropoda), which does not match this
    signature; switching the body over to the parameters therefore has to
    be done together with those calls, otherwise the pairwise tables would
    be mislabelled.

    Also reads the notebook-level data frames ``all_IPRS_desc`` and
    ``IPR_annot_df``, which are left-merged onto every result table.

    Returns
    -------
    tuple
        ``(plot, inter_domains_df, inter_cnid_moll_df, inter_arth_moll_df,
        inter_arth_cnid_df)``: the object returned by ``venn3``, the table
        of IPRs shared by all three phyla, and the three tables of IPRs
        shared by exactly two phyla.
    """
    def _annotate(ipr_ids):
        # Build the one-column table in sorted order: set iteration order
        # differs between interpreter sessions, which made the row order of
        # the exported tables irreproducible. key=str keeps the sort
        # well-defined even if a non-string id is present.
        ids_df = pd.DataFrame(sorted(ipr_ids, key=str), columns=['IPR_id'])
        return ids_df.merge(all_IPRS_desc, how='left')\
            .merge(IPR_annot_df, how='left')

    # Retrieve IPRs present in at least two species
    arthropoda_IPRs = retrieve_IPR_2sp(arthropoda_domains)
    cnidaria_IPRs = retrieve_IPR_2sp(cnidaria_domains)
    mollusca_IPRs = retrieve_IPR_2sp(mollusca_domains)

    # IPRs shared by all three phyla
    inter_domains = set.intersection(cnidaria_IPRs, mollusca_IPRs, arthropoda_IPRs)

    # IPRs shared by exactly two phyla (the three-way core is removed)
    inter_cnid_moll = set.intersection(cnidaria_IPRs, mollusca_IPRs) - inter_domains
    inter_arth_moll = set.intersection(arthropoda_IPRs, mollusca_IPRs) - inter_domains
    inter_arth_cnid = set.intersection(arthropoda_IPRs, cnidaria_IPRs) - inter_domains

    # Plot Venn diagram
    plot = venn3([cnidaria_IPRs, mollusca_IPRs, arthropoda_IPRs], ('Cnidaria', 'Mollusca', 'Arthropoda'))
    return (
        plot,
        _annotate(inter_domains),
        _annotate(inter_cnid_moll),
        _annotate(inter_arth_moll),
        _annotate(inter_arth_cnid),
    )

### avg

In [56]:
# Suffix inserted into the names of the TSV files written for this section.
out_suffix = "avg"

In [57]:
# Per-phylum sp_counts.tsv inputs for the "avg" analysis. These notebook-level
# names are also what venn_IPR reads internally.
cnidaria_domains = "../avg/sp_counts.tsv"
mollusca_domains = "../../../../mollusca_06_22/overrep_domains/new_parse_gff/avg/sp_counts.tsv"
arthropoda_domains = "../../../../arthropoda_06_22/overrep_domains/new_parse_gff/avg/sp_counts.tsv"

In [58]:
# venn_IPR is declared as (arthropoda_file, cnidaria_file, mollusca_file).
# Passing the paths by keyword keeps every path bound to its own phylum
# instead of relying on positional order.
plot, inter_domains_df, inter_cnid_moll_df, inter_arth_moll_df, inter_arth_cnid_df = venn_IPR(
    arthropoda_file=arthropoda_domains,
    cnidaria_file=cnidaria_domains,
    mollusca_file=mollusca_domains,
)
# Figure export is currently disabled.
#plt.savefig('IPR_venn_avg.pdf')

In [59]:
# Export the four intersection tables as tab-separated files named
# '<prefix><out_suffix>.tsv' in the working directory, without the index.
for _prefix, _table in (
    ('inter_domains_', inter_domains_df),
    ('inter_cnid_moll_', inter_cnid_moll_df),
    ('inter_arth_moll_', inter_arth_moll_df),
    ('inter_arth_cnid_', inter_arth_cnid_df),
):
    _table.to_csv(_prefix + out_suffix + ".tsv", sep="\t", index=False)

### avg + stdev

In [60]:
# Per-phylum sp_counts.tsv inputs for the "avg_stdev" analysis; rebinding
# these notebook-level names changes what venn_IPR reads on its next call.
cnidaria_domains = "../avg_stdev/sp_counts.tsv"
mollusca_domains = "../../../../mollusca_06_22/overrep_domains/new_parse_gff/avg_stdev/sp_counts.tsv"
arthropoda_domains = "../../../../arthropoda_06_22/overrep_domains/new_parse_gff/avg_stdev/sp_counts.tsv"

In [61]:
# Suffix inserted into the names of the TSV files written for this section.
out_suffix = "avg_stdev"

In [62]:
# venn_IPR is declared as (arthropoda_file, cnidaria_file, mollusca_file).
# Passing the paths by keyword keeps every path bound to its own phylum
# instead of relying on positional order.
plot, inter_domains_df, inter_cnid_moll_df, inter_arth_moll_df, inter_arth_cnid_df = venn_IPR(
    arthropoda_file=arthropoda_domains,
    cnidaria_file=cnidaria_domains,
    mollusca_file=mollusca_domains,
)
# Figure export is currently disabled.
#plt.savefig('IPR_venn_' + out_suffix + '.pdf')

In [63]:
# Export the four intersection tables as tab-separated files named
# '<prefix><out_suffix>.tsv' in the working directory, without the index.
for _prefix, _table in (
    ('inter_domains_', inter_domains_df),
    ('inter_cnid_moll_', inter_cnid_moll_df),
    ('inter_arth_moll_', inter_arth_moll_df),
    ('inter_arth_cnid_', inter_arth_cnid_df),
):
    _table.to_csv(_prefix + out_suffix + ".tsv", sep="\t", index=False)

### Phylum-specific domains

In [78]:
# NOTE(review): out_suffix is set here, but the phylum-specific exports below
# use hard-coded file names, so it appears unused in this section.
out_suffix = "avg"

# Per-phylum sp_counts.tsv inputs for the "avg" analysis.
cnidaria_domains = "../avg/sp_counts.tsv"
mollusca_domains = "../../../../mollusca_06_22/overrep_domains/new_parse_gff/avg/sp_counts.tsv"
arthropoda_domains = "../../../../arthropoda_06_22/overrep_domains/new_parse_gff/avg/sp_counts.tsv"

In [79]:
# Load the Cnidaria species-count table and display it for inspection.
phylum_df = pd.read_csv(cnidaria_domains, sep="\t")
phylum_df

Unnamed: 0,IPR_id,species,description
0,IPR013783,7,Immunoglobulin-like fold
1,IPR000742,6,EGF-like domain
2,IPR012337,6,Ribonuclease H-like superfamily
3,IPR013032,6,"EGF-like, conserved site"
4,IPR009057,5,Homeobox-like domain superfamily
...,...,...,...
143,IPR011029,1,Death-like domain superfamily
144,IPR009017,1,Green fluorescent protein
145,IPR008984,1,SMAD/FHA domain superfamily
146,IPR008936,1,Rho GTPase activation protein


In [80]:
def phylum_specific(arthropoda_file, cnidaria_file, mollusca_file):
    """Return, for each phylum, the IPRs overrepresented only in that phylum.

    Parameters
    ----------
    arthropoda_file, cnidaria_file, mollusca_file : str, path or file-like
        Per-phylum inputs, each passed to ``retrieve_IPR_2sp``. Earlier the
        body ignored these and read the notebook-level ``*_domains``
        variables instead; it now uses the arguments it is given.

    Returns
    -------
    tuple
        ``(arthropoda_only_df, cnidaria_only_df, mollusca_only_df)``. Each
        is a data frame with an ``IPR_id`` column, left-merged with the
        notebook-level ``all_IPRS_desc`` and ``IPR_annot_df`` tables.
    """
    def _annotate(ipr_ids):
        # Build the one-column table in sorted order: set iteration order
        # differs between interpreter sessions, which made the row order of
        # the exported tables irreproducible. key=str keeps the sort
        # well-defined even if a non-string id is present.
        ids_df = pd.DataFrame(sorted(ipr_ids, key=str), columns=['IPR_id'])
        return ids_df.merge(all_IPRS_desc, how='left')\
            .merge(IPR_annot_df, how='left')

    # Retrieve IPRs present in at least two species
    arthropoda_IPRs = retrieve_IPR_2sp(arthropoda_file)
    cnidaria_IPRs = retrieve_IPR_2sp(cnidaria_file)
    mollusca_IPRs = retrieve_IPR_2sp(mollusca_file)

    # Remove everything seen in either of the other two phyla
    arthropoda_only = arthropoda_IPRs - mollusca_IPRs - cnidaria_IPRs
    cnidaria_only = cnidaria_IPRs - arthropoda_IPRs - mollusca_IPRs
    mollusca_only = mollusca_IPRs - cnidaria_IPRs - arthropoda_IPRs

    return (
        _annotate(arthropoda_only),
        _annotate(cnidaria_only),
        _annotate(mollusca_only),
    )

In [81]:
# Arguments are given in the order of the phylum_specific signature
# (arthropoda, cnidaria, mollusca).
arthropoda_only_df, cnidaria_only_df, mollusca_only_df = phylum_specific(arthropoda_domains, cnidaria_domains, mollusca_domains)

In [82]:
# Display the Arthropoda-only table ("avg" inputs).
arthropoda_only_df

Unnamed: 0,IPR_id,description,class
0,IPR001523,Paired domain,TF
1,IPR043182,Paired DNA-binding domain,TF
2,IPR026630,EPM2A-interacting protein 1,other
3,IPR013162,"CD80-like, immunoglobulin C2-set",immunoglobulin
4,IPR013106,Immunoglobulin V-set domain,immunoglobulin


In [83]:
# Write the Arthropoda-only table as a tab-separated file, without the index.
arthropoda_only_df.to_csv('arthropoda_only_avg.tsv', sep="\t", index=False)

In [84]:
# Write the Cnidaria-only table as a tab-separated file, without the index.
cnidaria_only_df.to_csv('cnidaria_only_avg.tsv', sep="\t", index=False)

In [85]:
# Write the Mollusca-only table as a tab-separated file, without the index.
mollusca_only_df.to_csv('mollusca_only_avg.tsv', sep="\t", index=False)

### avg + stdev

In [69]:
# Per-phylum sp_counts.tsv inputs for the "avg_stdev" analysis.
cnidaria_domains = "../avg_stdev/sp_counts.tsv"
mollusca_domains = "../../../../mollusca_06_22/overrep_domains/new_parse_gff/avg_stdev/sp_counts.tsv"
arthropoda_domains = "../../../../arthropoda_06_22/overrep_domains/new_parse_gff/avg_stdev/sp_counts.tsv"

In [70]:
# Arguments are given in the order of the phylum_specific signature
# (arthropoda, cnidaria, mollusca).
arthropoda_only_df, cnidaria_only_df, mollusca_only_df = phylum_specific(arthropoda_domains, cnidaria_domains, mollusca_domains)

In [71]:
# Display the Arthropoda-only table ("avg_stdev" inputs).
arthropoda_only_df

Unnamed: 0,IPR_id,description,class
0,IPR035979,RNA-binding domain superfamily,post-transcriptional


In [72]:
# File name previously ended in 'avg_dtsev' (typo); it now uses the same
# 'avg_stdev' suffix as the other outputs of this analysis.
arthropoda_only_df.to_csv('arthropoda_only_avg_stdev.tsv', sep="\t", index=False)

In [73]:
# Display the Cnidaria-only table ("avg_stdev" inputs).
cnidaria_only_df

Unnamed: 0,IPR_id,description,class
0,IPR012337,Ribonuclease H-like superfamily,TE
1,IPR010285,DNA helicase Pif1-like,
2,IPR038717,"Tc1-like transposase, DDE domain",TE
3,IPR043128,Reverse transcriptase/Diguanylate cyclase domain,TE
4,IPR000152,EGF-type aspartate/asparagine hydroxylation site,
5,IPR011335,Restriction endonuclease type II-like,
6,IPR036910,High mobility group box domain superfamily,TF
7,IPR001881,EGF-like calcium-binding domain,
8,IPR036397,Ribonuclease H superfamily,TE
9,IPR013032,"EGF-like, conserved site",other


In [74]:
# File name previously ended in 'avg_dtsev' (typo); it now uses the same
# 'avg_stdev' suffix as the other outputs of this analysis.
cnidaria_only_df.to_csv('cnidaria_only_avg_stdev.tsv', sep="\t", index=False)

In [75]:
# Display the Mollusca-only table ("avg_stdev" inputs).
mollusca_only_df

Unnamed: 0,IPR_id,description,class
0,IPR020067,Frizzled domain,dev_signaling
1,IPR001849,Pleckstrin homology domain,other
2,IPR001245,"Serine-threonine/tyrosine-protein kinase, cata...",other
3,IPR035892,C2 domain superfamily,other
4,IPR001478,PDZ domain,other
5,IPR001781,"Zinc finger, LIM-type",TF
6,IPR017995,"Homeobox protein, antennapedia type",homeobox
7,IPR003961,Fibronectin type III,other
8,IPR004088,"K Homology domain, type 1",other
9,IPR011598,"Myc-type, basic helix-loop-helix (bHLH) domain",TF


In [76]:
# File name previously ended in 'avg_dtsev' (typo); it now uses the same
# 'avg_stdev' suffix as the other outputs of this analysis.
mollusca_only_df.to_csv('mollusca_only_avg_stdev.tsv', sep="\t", index=False)

In [None]:
### Read curated IPRs

In [46]:
# Re-read the curated IPR table (same path as the cell near the top of the
# notebook) and display it.
curated_IPRs = "../../../combine_overrep_analyses/all_IPRs_curated.txt"
IPR_annot_df = pd.read_csv(curated_IPRs, sep="\t")
IPR_annot_df

Unnamed: 0,IPR_id,description,class
0,IPR029034,Cystine-knot cytokine,other
1,IPR001303,Class II aldolase/adducin N-terminal,other
2,IPR033929,"Tensin, phosphotyrosine-binding domain",other
3,IPR011641,Tyrosine-protein kinase ephrin type A/B recept...,other
4,IPR036300,Mir domain superfamily,other
...,...,...,...
621,IPR010442,PET domain,TF
622,IPR038096,TEA/ATTS domain superfamily,TF
623,IPR008967,"p53-like transcription factor, DNA-binding",TF
624,IPR011598,"Myc-type, basic helix-loop-helix (bHLH) domain",TF


In [48]:
# Default (inner) merge on the columns both tables share; the result is only
# displayed, not stored.
mollusca_only_df.merge(IPR_annot_df)

Unnamed: 0,IPR_id,description,class
0,IPR020067,Frizzled domain,dev_signaling
1,IPR001849,Pleckstrin homology domain,other
2,IPR001245,"Serine-threonine/tyrosine-protein kinase, cata...",other
3,IPR035892,C2 domain superfamily,other
4,IPR001478,PDZ domain,other
5,IPR001781,"Zinc finger, LIM-type",TF
6,IPR017995,"Homeobox protein, antennapedia type",homeobox
7,IPR003961,Fibronectin type III,other
8,IPR004088,"K Homology domain, type 1",other
9,IPR011598,"Myc-type, basic helix-loop-helix (bHLH) domain",TF
