### Goal

- Retrieve the closest gene of each CNE.
- Calculate proportion of CNEs closest to homeobox genes.

### Input

- CNE coordinates
- Directory of gff files converted to gffutils database
- Interproscan annotations 
- CNE clusters and LCA info

### Output

- closest_gene_counts_by_node.tsv: number of CNEs associated with (closest to) each gene, separated by node of origin.
- all_cne_closest_genes.pickle : dictionary of closest gene of every CNE, including distance
- gene_protein_dict.pickle : dictionary of gene-protein IDs
- gene_IPR_dict.pickle : dictionary of protein domains for each gene
- CNE-IPR.tsv: dataframe linking CNEs with protein domains of closest gene
- homeo_cne_counts_by_sp.tsv : proportion of CNEs closest to homeobox genes

In [170]:
import pandas as pd
import glob
import json
import sys
import pickle
from collections import defaultdict
import csv
import gffutils

#### Import genome annotations (GFF files)
Converted to database with gffutils

In [2]:
gff_dir = "../../distance_GOI_to_CNE/gff_db_files/"

#### Import CNE coordinates

In [3]:
coord_dir = '../../retrieve_original_coords/filtered_coords/'

In [5]:
coord_files = [f for f in glob.glob(coord_dir + '*.tsv' )]
print("Found ", len(coord_files), "files in ", coord_dir)
print(coord_files)

Found  13 files in  ../../retrieve_original_coords/filtered_coords/
['../../retrieve_original_coords/filtered_coords/spis_orig_coords.tsv', '../../retrieve_original_coords/filtered_coords/hsym_orig_coords.tsv', '../../retrieve_original_coords/filtered_coords/aaur_orig_coords.tsv', '../../retrieve_original_coords/filtered_coords/dgig_orig_coords.tsv', '../../retrieve_original_coords/filtered_coords/chem_orig_coords.tsv', '../../retrieve_original_coords/filtered_coords/ofav_orig_coords.tsv', '../../retrieve_original_coords/filtered_coords/aten_orig_coords.tsv', '../../retrieve_original_coords/filtered_coords/mvir_orig_coords.tsv', '../../retrieve_original_coords/filtered_coords/hvul_orig_coords.tsv', '../../retrieve_original_coords/filtered_coords/adig_orig_coords.tsv', '../../retrieve_original_coords/filtered_coords/epal_orig_coords.tsv', '../../retrieve_original_coords/filtered_coords/pdam_orig_coords.tsv', '../../retrieve_original_coords/filtered_coords/nvec_orig_coords.tsv']


#### Import clustering data

In [4]:
cluster_file = "../../post_parsimony_filtering/filtered_clusters.csv"
cluster_lcas_file = "../../parsimony_analysis/cluster_lcas.pickle"
excluded_cluster_lca_path = "../../remove_select_clusters/excluded_cluster_nodes.tsv"

#### Import Interproscan annotations

In [6]:
interpro_dir = "../../proteome_annotations/interpro_files/"
interpro_files = glob.glob(interpro_dir + '*combined.tsv' )

In [7]:
species_list = ['spis',
 'hsym',
 'aaur',
 'dgig',
 'ofav',
 'aten',
 'mvir',
 'hvul',
 'adig',
 'epal',
 'pdam',
 'nvec']

In [None]:
custom_species = ['chem'] ### annotations parsed differently

### Function to retrieve coordinates of genes

In [11]:
def parse_cds(species):
    gff_file = gff_dir + species + "_gff.db"
    gene_db = gffutils.FeatureDB(gff_file,keep_order=True)
    scaff_genes_dict = defaultdict(list)
    gene_start_dict = {}
    gene_end_dict = {}
    gene_protein_dict = defaultdict(set)
    for gene in gene_db.features_of_type(featuretype="gene"):
        if 'gene_biotype' not in gene.attributes or 'gene_biotype' in gene.attributes and \
            gene.attributes['gene_biotype'][0] == 'protein_coding' :
            gene_id = str(gene.id)
            scaff_genes_dict[gene.chrom].append(gene_id)
            for CDS in gene_db.children(gene.id,featuretype='CDS'):
                if species in ['hsym']:
                    CDS_id = gene_id.split("g")[0]
                elif species in ['aaur', 'mvir']:
                    CDS_id = CDS.id.split(".cds")[0]
                else:
                    CDS_id = "_".join(CDS.id.split("-")[-1].split("_")[:2])
                gene_protein_dict[gene_id].add(CDS_id)
                CDS_start = CDS.start
                CDS_end = CDS.end
                if gene_id not in gene_start_dict:
                    gene_start_dict[gene_id] = CDS_start
                    gene_end_dict[gene_id] = CDS_end
                else:
                    gene_start_dict[gene_id] = min(CDS_start, gene_start_dict[gene_id])
                    gene_end_dict[gene_id] = max(CDS_end, gene_end_dict[gene_id])
    return(scaff_genes_dict, gene_start_dict,gene_end_dict, gene_protein_dict)

#### Specifically for Clytia

In [9]:
def parse_cds_chem(species):
    gff_file = gff_dir + species + "_gff.db"
    gene_db = gffutils.FeatureDB(gff_file,keep_order=True)
    scaff_genes_dict = defaultdict(list)
    gene_start_dict = {}
    gene_end_dict = {}
    gene_protein_dict = defaultdict(set)
    for gene in gene_db.features_of_type(featuretype="gene"):
        gene_id = str(gene.id)
        scaff_genes_dict[gene.chrom].append(gene_id)
        for transcript in gene_db.children(gene.id, featuretype='transcript'):
            #print(transcript.id)
            gene_protein_dict[gene_id].add(transcript.id)
            CDS_start = transcript.start
            CDS_end = transcript.end
            if gene_id not in gene_start_dict:
                gene_start_dict[gene_id] = CDS_start
                gene_end_dict[gene_id] = CDS_end
            else:
                gene_start_dict[gene_id] = min(CDS_start, gene_start_dict[gene_id])
                gene_end_dict[gene_id] = max(CDS_end, gene_end_dict[gene_id])
    return(scaff_genes_dict, gene_start_dict,gene_end_dict, gene_protein_dict)

### Function to find closest gene of each CNE

In [10]:
def retrieve_closest_genes(cne_coords, scaff_genes_dict, gene_start_dict, gene_end_dict):
    cne_closest_genes = {}
    for indx, row in cne_coords.iterrows():
        cne_id = row['cne_id']
        scaffold = row['scaffold']
        cne_start = row['orig_start']
        cne_end = row['orig_end']
        closest_dist = 0
        closest_gene = 'no_gene'
        # Retrieve scaffold corresponding to CNE
        if scaffold in scaff_genes_dict.keys():
            genes_on_scaff = scaff_genes_dict[scaffold]
            # Identify genes within serch distance on scaffold
            for gene in genes_on_scaff:
                gene_start = gene_start_dict[gene]
                gene_end = gene_end_dict[gene]
                dist = min(abs(cne_start - gene_start), abs(cne_end - gene_start), 
                           abs(cne_start - gene_end), abs(cne_end - gene_end))
                if closest_dist == 0 or dist < closest_dist:
                    closest_dist = dist
                    closest_gene = gene
        cne_closest_genes[cne_id] = (closest_gene, closest_dist)
    return(cne_closest_genes)

### Function to parse interpro annotations

In [12]:
def format_interpro_res(interpro_file):
    interpro_results = pd.read_table(interpro_file, names = ('gene', 'identifier', 'length', 'software', 
                                                             'software_id', 'software_prediction', 'start', 'end',
                                                             'score', 'status', 'date', 'IPR_id', 'description'))
    interpro_results = interpro_results.dropna(subset=['IPR_id'])
    interpro_results = interpro_results.reset_index(drop = True)
    interpro_results = interpro_results[['gene', 'software','IPR_id', 'description']]
    interpro_results = interpro_results.drop_duplicates(subset=['gene', 'IPR_id'])
    return interpro_results

### Read cluster info

In [13]:
cne_cluster_dict = {}
with open(cluster_file) as f:
    for line in f: # each line is a comma-separated list of cnes: e.g. aaur_cne_2,aaur_cne_3,mvir_cne_2
        #print(line)
        line=line.strip()
        row = line.split(",") # each CNE in cluster is separated by comma
        cluster_id = row[0]
        for cne in row[1:]:
            cne_cluster_dict[cne] = cluster_id
cne_cluster_df = pd.DataFrame(cne_cluster_dict.items(), columns=['cne_id', 'cluster_id'])
cne_cluster_df

Unnamed: 0,cne_id,cluster_id
0,adig_cne_21,cluster_1
1,adig_cne_26279,cluster_1
2,adig_cne_57,cluster_1
3,aten_cne_3077,cluster_1
4,aten_cne_4904,cluster_1
...,...,...
325459,hsym_cne_10645,cluster_81729
325460,hsym_cne_4294,cluster_81729
325461,hsym_cne_6438,cluster_81729
325462,hsym_cne_8106,cluster_81729


#### Retrieve cluster LCAs

In [14]:
with open(cluster_lcas_file, "rb") as input_file:
    cluster_lcas = pickle.load(input_file)
cluster_lca_df = pd.DataFrame(cluster_lcas.items(), columns=['cluster_id', 'node'])
cluster_lca_df

Unnamed: 0,cluster_id,node
0,cluster_1,cnidaria
1,cluster_10,scleractinia
2,cluster_100,scleractinia
3,cluster_1000,scleractinia
4,cluster_10000,scleractinia
...,...,...
18135,cluster_9995,scleractinia
18136,cluster_9996,scleractinia
18137,cluster_9997,scleractinia
18138,cluster_9998,scleractinia


In [15]:
excluded_cluster_lcas = pd.read_csv(excluded_cluster_lca_path, sep="\t")
excluded_cluster_lcas

Unnamed: 0,cluster_id,node
0,cluster_30687,pocilloporidae
1,cluster_30688,pocilloporidae
2,cluster_30689,pocilloporidae
3,cluster_30690,pocilloporidae
4,cluster_30691,pocilloporidae
...,...,...
51535,cluster_5060,leptothecata
51536,cluster_5061,leptothecata
51537,cluster_5062,leptothecata
51538,cluster_5063,leptothecata


In [16]:
cluster_lca_df = cluster_lca_df.append(excluded_cluster_lcas)
cluster_lca_df

Unnamed: 0,cluster_id,node
0,cluster_1,cnidaria
1,cluster_10,scleractinia
2,cluster_100,scleractinia
3,cluster_1000,scleractinia
4,cluster_10000,scleractinia
...,...,...
51535,cluster_5060,leptothecata
51536,cluster_5061,leptothecata
51537,cluster_5062,leptothecata
51538,cluster_5063,leptothecata


In [17]:
cne_cluster_df = cne_cluster_df.merge(cluster_lca_df, how='left').fillna('ambiguous')
cne_cluster_df

Unnamed: 0,cne_id,cluster_id,node
0,adig_cne_21,cluster_1,cnidaria
1,adig_cne_26279,cluster_1,cnidaria
2,adig_cne_57,cluster_1,cnidaria
3,aten_cne_3077,cluster_1,cnidaria
4,aten_cne_4904,cluster_1,cnidaria
...,...,...,...
325459,hsym_cne_10645,cluster_81729,hydrozoa
325460,hsym_cne_4294,cluster_81729,hydrozoa
325461,hsym_cne_6438,cluster_81729,hydrozoa
325462,hsym_cne_8106,cluster_81729,hydrozoa


In [18]:
set(cne_cluster_df['node'])

{'acraspeda',
 'actiniaria',
 'ambiguous',
 'anthozoa',
 'cnidaria',
 'enthemonae',
 'hexacorallia',
 'hydrozoa',
 'leptothecata',
 'medusozoa',
 'pocilloporidae',
 'robusta',
 'scleractinia'}

#### Set up dictionary of closest gene for each CNE, separated by node of origin

In [149]:
all_cne_closest_genes = {}
all_nodes = set(cne_cluster_df['node'])
for node in all_nodes:
    all_cne_closest_genes[node] = {}
    for species in species_list:
        all_cne_closest_genes[node][species] = {}
all_cne_closest_genes

{'acraspeda': {'spis': {},
  'hsym': {},
  'aaur': {},
  'dgig': {},
  'ofav': {},
  'aten': {},
  'mvir': {},
  'hvul': {},
  'adig': {},
  'epal': {},
  'pdam': {},
  'nvec': {}},
 'hexacorallia': {'spis': {},
  'hsym': {},
  'aaur': {},
  'dgig': {},
  'ofav': {},
  'aten': {},
  'mvir': {},
  'hvul': {},
  'adig': {},
  'epal': {},
  'pdam': {},
  'nvec': {}},
 'robusta': {'spis': {},
  'hsym': {},
  'aaur': {},
  'dgig': {},
  'ofav': {},
  'aten': {},
  'mvir': {},
  'hvul': {},
  'adig': {},
  'epal': {},
  'pdam': {},
  'nvec': {}},
 'cnidaria': {'spis': {},
  'hsym': {},
  'aaur': {},
  'dgig': {},
  'ofav': {},
  'aten': {},
  'mvir': {},
  'hvul': {},
  'adig': {},
  'epal': {},
  'pdam': {},
  'nvec': {}},
 'scleractinia': {'spis': {},
  'hsym': {},
  'aaur': {},
  'dgig': {},
  'ofav': {},
  'aten': {},
  'mvir': {},
  'hvul': {},
  'adig': {},
  'epal': {},
  'pdam': {},
  'nvec': {}},
 'leptothecata': {'spis': {},
  'hsym': {},
  'aaur': {},
  'dgig': {},
  'ofav': {},
 

#### Create dataframe of closest CNE count for each gene and dictionary of gene-protein IDs

In [154]:
output_df = pd.DataFrame(columns=['species', 'gene', 'closest_cne_count', 'cne_node'])
gene_protein_dict_all_sp = {} # Relationship between gene_IDs and protein IDS (to link genes with interpro results)
for coord_file in coord_files:
    species_prefix = coord_file.split("/")[-1].split("_orig")[0]
    print(coord_file)
    print("read coord file and merge with cluster/node info")
    cne_coords = pd.read_csv(coord_file, sep="\t")
    cne_coords = cne_coords.merge(cne_cluster_df)
    print('Read interpro_file')
    interpro_file = interpro_dir + species_prefix +  '_combined.tsv'
    interpro_df = format_interpro_res(interpro_file)
    print('retrieve gene coordinates')
    if species_prefix == 'chem':
        scaff_genes_dict, gene_start_dict, gene_end_dict, gene_protein_dict = parse_cds_chem(species_prefix)
    else:
        scaff_genes_dict, gene_start_dict, gene_end_dict, gene_protein_dict = parse_cds(species_prefix)
    gene_protein_dict_all_sp[species_prefix] = gene_protein_dict
    # Create dictionary of closest_genes
    for node in set(cne_coords['node']):
        print(node)
        cne_coords_node = cne_coords[cne_coords['node']==node]
        cne_closest_genes = retrieve_closest_genes(cne_coords_node, scaff_genes_dict, gene_start_dict, gene_end_dict)
        all_cne_closest_genes[node][species_prefix] = cne_closest_genes 
        cnes_per_gene = defaultdict(int)
        for cne, gene_dist in cne_closest_genes.items():
            cnes_per_gene[gene_dist[0]] += 1
        species_df = pd.DataFrame(cnes_per_gene.items(), columns=['gene', 'closest_cne_count'])
        species_df['cne_node'] = node
        species_df['species'] = species_prefix
        output_df = output_df.append(species_df)
output_df = output_df[output_df['gene'] != 'no_gene']

../../retrieve_original_coords/filtered_coords/spis_orig_coords.tsv
read coord file and merge with cluster/node info
Read interpro_file
retrieve gene coordinates
hexacorallia


  output_df = output_df.append(species_df)


robusta


  output_df = output_df.append(species_df)


cnidaria


  output_df = output_df.append(species_df)


scleractinia


  output_df = output_df.append(species_df)


anthozoa


  output_df = output_df.append(species_df)
  output_df = output_df.append(species_df)


ambiguous
pocilloporidae


  output_df = output_df.append(species_df)


../../retrieve_original_coords/filtered_coords/hsym_orig_coords.tsv
read coord file and merge with cluster/node info
Read interpro_file
retrieve gene coordinates
cnidaria


  output_df = output_df.append(species_df)


leptothecata


  output_df = output_df.append(species_df)
  output_df = output_df.append(species_df)
  output_df = output_df.append(species_df)
  output_df = output_df.append(species_df)


ambiguous
hydrozoa
medusozoa
../../retrieve_original_coords/filtered_coords/aaur_orig_coords.tsv
read coord file and merge with cluster/node info
Read interpro_file
retrieve gene coordinates
medusozoa
acraspeda
ambiguous
cnidaria
../../retrieve_original_coords/filtered_coords/dgig_orig_coords.tsv
read coord file and merge with cluster/node info


  output_df = output_df.append(species_df)
  output_df = output_df.append(species_df)
  output_df = output_df.append(species_df)
  output_df = output_df.append(species_df)


Read interpro_file
retrieve gene coordinates
anthozoa


  output_df = output_df.append(species_df)
  output_df = output_df.append(species_df)
  output_df = output_df.append(species_df)


ambiguous
cnidaria
../../retrieve_original_coords/filtered_coords/chem_orig_coords.tsv
read coord file and merge with cluster/node info
Read interpro_file
retrieve gene coordinates
cnidaria
leptothecata
ambiguous
hydrozoa
medusozoa


  output_df = output_df.append(species_df)
  output_df = output_df.append(species_df)
  output_df = output_df.append(species_df)
  output_df = output_df.append(species_df)
  output_df = output_df.append(species_df)


../../retrieve_original_coords/filtered_coords/ofav_orig_coords.tsv
read coord file and merge with cluster/node info
Read interpro_file
retrieve gene coordinates
hexacorallia


  output_df = output_df.append(species_df)


robusta


  output_df = output_df.append(species_df)


cnidaria


  output_df = output_df.append(species_df)


scleractinia


  output_df = output_df.append(species_df)
  output_df = output_df.append(species_df)


anthozoa
ambiguous


  output_df = output_df.append(species_df)


../../retrieve_original_coords/filtered_coords/aten_orig_coords.tsv
read coord file and merge with cluster/node info
Read interpro_file
retrieve gene coordinates
hexacorallia
cnidaria
actiniaria
anthozoa


  output_df = output_df.append(species_df)
  output_df = output_df.append(species_df)
  output_df = output_df.append(species_df)
  output_df = output_df.append(species_df)
  output_df = output_df.append(species_df)


ambiguous
enthemonae
../../retrieve_original_coords/filtered_coords/mvir_orig_coords.tsv
read coord file and merge with cluster/node info


  output_df = output_df.append(species_df)


Read interpro_file
retrieve gene coordinates
medusozoa
acraspeda
ambiguous
cnidaria


  output_df = output_df.append(species_df)
  output_df = output_df.append(species_df)
  output_df = output_df.append(species_df)
  output_df = output_df.append(species_df)


../../retrieve_original_coords/filtered_coords/hvul_orig_coords.tsv
read coord file and merge with cluster/node info
Read interpro_file
retrieve gene coordinates
medusozoa
ambiguous
hydrozoa
cnidaria
../../retrieve_original_coords/filtered_coords/adig_orig_coords.tsv
read coord file and merge with cluster/node info


  output_df = output_df.append(species_df)
  output_df = output_df.append(species_df)
  output_df = output_df.append(species_df)
  output_df = output_df.append(species_df)


Read interpro_file
retrieve gene coordinates
hexacorallia


  output_df = output_df.append(species_df)


cnidaria


  output_df = output_df.append(species_df)


scleractinia


  output_df = output_df.append(species_df)


anthozoa
ambiguous


  output_df = output_df.append(species_df)
  output_df = output_df.append(species_df)


../../retrieve_original_coords/filtered_coords/epal_orig_coords.tsv
read coord file and merge with cluster/node info
Read interpro_file
retrieve gene coordinates
hexacorallia
cnidaria
actiniaria


  output_df = output_df.append(species_df)
  output_df = output_df.append(species_df)
  output_df = output_df.append(species_df)


anthozoa
ambiguous
enthemonae


  output_df = output_df.append(species_df)
  output_df = output_df.append(species_df)
  output_df = output_df.append(species_df)


../../retrieve_original_coords/filtered_coords/pdam_orig_coords.tsv
read coord file and merge with cluster/node info
Read interpro_file
retrieve gene coordinates
hexacorallia


  output_df = output_df.append(species_df)


robusta


  output_df = output_df.append(species_df)


cnidaria


  output_df = output_df.append(species_df)


scleractinia


  output_df = output_df.append(species_df)
  output_df = output_df.append(species_df)


anthozoa
ambiguous


  output_df = output_df.append(species_df)


pocilloporidae


  output_df = output_df.append(species_df)


../../retrieve_original_coords/filtered_coords/nvec_orig_coords.tsv
read coord file and merge with cluster/node info
Read interpro_file
retrieve gene coordinates
hexacorallia
cnidaria
actiniaria


  output_df = output_df.append(species_df)
  output_df = output_df.append(species_df)
  output_df = output_df.append(species_df)


anthozoa
ambiguous


  output_df = output_df.append(species_df)
  output_df = output_df.append(species_df)


In [155]:
gene_protein_dict_all_sp

{'spis': defaultdict(set,
             {'gene-LOC111326177': {'XP_022785885.1',
               'XP_022785962.1',
               'XP_022786037.1'},
              'gene-LOC111327133': {'XP_022787010.1',
               'XP_022787082.1',
               'XP_022787155.1'},
              'gene-LOC111327047': {'XP_022786908.1'},
              'gene-LOC111328643': {'XP_022788889.1'},
              'gene-LOC111328547': {'XP_022788781.1'},
              'gene-LOC111347478': {'XP_022810485.1'},
              'gene-LOC111325747': {'XP_022785391.1',
               'XP_022785472.1',
               'XP_022785533.1',
               'XP_022785606.1',
               'XP_022785679.1'},
              'gene-LOC111326079': {'XP_022785778.1'},
              'gene-LOC111321864': {'XP_022780618.1'},
              'gene-LOC111319340': {'XP_022777899.1'},
              'gene-LOC111320611': {'XP_022779001.1'},
              'gene-LOC111331661': {'XP_022792588.1'},
              'gene-LOC111321260': {'XP_022780190.

#### Write output file by node

In [156]:
#output_df.to_csv("closest_gene_counts_by_node.tsv", sep="\t", index=False)

#### Save dictionary of closest gene of each CNE

In [157]:
#with open("all_cne_closest_genes.pickle", "wb") as output_file:
#    pickle.dump(all_cne_closest_genes, output_file)

#### Save dictionary of gene-protein IDs

In [158]:
#with open("gene_protein_dict.pickle", "wb") as output_file:
#    pickle.dump(gene_protein_dict_all_sp, output_file)

#### Load data if previous steps have already been run

In [19]:
with open("all_cne_closest_genes.pickle", "rb") as infile:
    all_cne_closest_genes = pickle.load(infile)

In [20]:
with open("gene_protein_dict.pickle", "rb") as infile:
    gene_protein_dict_all_sp = pickle.load(infile)

In [21]:
all_cne_closest_genes

{'acraspeda': {'aaur': {'aaur_cne_1': ('scaffold1.g12', 2277),
   'aaur_cne_2': ('scaffold1.g13', 951),
   'aaur_cne_4': ('scaffold1.g51', 2798),
   'aaur_cne_6': ('scaffold1.g77', 8059),
   'aaur_cne_18': ('scaffold2.g24', 60),
   'aaur_cne_23': ('scaffold2.g88', 3214),
   'aaur_cne_26': ('scaffold3.g30', 1210),
   'aaur_cne_27': ('scaffold3.g31', 3452),
   'aaur_cne_28': ('scaffold3.g37', 387),
   'aaur_cne_30': ('scaffold3.g50', 18169),
   'aaur_cne_37': ('scaffold4.g58', 585),
   'aaur_cne_39': ('scaffold4.g63', 2516),
   'aaur_cne_43': ('scaffold5.g11', 7013),
   'aaur_cne_46': ('scaffold5.g37', 697),
   'aaur_cne_50': ('scaffold5.g57', 1748),
   'aaur_cne_53': ('scaffold5.g76', 7347),
   'aaur_cne_54': ('scaffold6.g10', 963),
   'aaur_cne_56': ('scaffold6.g18', 2059),
   'aaur_cne_61': ('scaffold7.g2', 4063),
   'aaur_cne_63': ('scaffold7.g15', 16332),
   'aaur_cne_65': ('scaffold7.g22', 7709),
   'aaur_cne_66': ('scaffold7.g22', 8119),
   'aaur_cne_69': ('scaffold7.g25', 13172),

#### Remove cnes with no nearby gene (when no gene on same scaffold)

In [22]:
delete = []
for node, species_dict in all_cne_closest_genes.items():
    for species, cne_dict in species_dict.items():
        for cne_id, gene_tuple in cne_dict.items():
            if gene_tuple[0] == 'no_gene':
                delete.append(cne_id)

In [23]:
nodes = list(all_cne_closest_genes.keys())
for node in nodes:
    for cne in delete:
        if cne in all_cne_closest_genes[node]:
            print('delete cne')
            del all_cne_closest_genes[node][cne]

In [24]:
all_cne_closest_genes

{'acraspeda': {'aaur': {'aaur_cne_1': ('scaffold1.g12', 2277),
   'aaur_cne_2': ('scaffold1.g13', 951),
   'aaur_cne_4': ('scaffold1.g51', 2798),
   'aaur_cne_6': ('scaffold1.g77', 8059),
   'aaur_cne_18': ('scaffold2.g24', 60),
   'aaur_cne_23': ('scaffold2.g88', 3214),
   'aaur_cne_26': ('scaffold3.g30', 1210),
   'aaur_cne_27': ('scaffold3.g31', 3452),
   'aaur_cne_28': ('scaffold3.g37', 387),
   'aaur_cne_30': ('scaffold3.g50', 18169),
   'aaur_cne_37': ('scaffold4.g58', 585),
   'aaur_cne_39': ('scaffold4.g63', 2516),
   'aaur_cne_43': ('scaffold5.g11', 7013),
   'aaur_cne_46': ('scaffold5.g37', 697),
   'aaur_cne_50': ('scaffold5.g57', 1748),
   'aaur_cne_53': ('scaffold5.g76', 7347),
   'aaur_cne_54': ('scaffold6.g10', 963),
   'aaur_cne_56': ('scaffold6.g18', 2059),
   'aaur_cne_61': ('scaffold7.g2', 4063),
   'aaur_cne_63': ('scaffold7.g15', 16332),
   'aaur_cne_65': ('scaffold7.g22', 7709),
   'aaur_cne_66': ('scaffold7.g22', 8119),
   'aaur_cne_69': ('scaffold7.g25', 13172),

### Retrieve gene:IPR info for all genes in all genomes


In [163]:
gene_protein_dict_all_sp

{'spis': defaultdict(set,
             {'gene-LOC111326177': {'XP_022785885.1',
               'XP_022785962.1',
               'XP_022786037.1'},
              'gene-LOC111327133': {'XP_022787010.1',
               'XP_022787082.1',
               'XP_022787155.1'},
              'gene-LOC111327047': {'XP_022786908.1'},
              'gene-LOC111328643': {'XP_022788889.1'},
              'gene-LOC111328547': {'XP_022788781.1'},
              'gene-LOC111347478': {'XP_022810485.1'},
              'gene-LOC111325747': {'XP_022785391.1',
               'XP_022785472.1',
               'XP_022785533.1',
               'XP_022785606.1',
               'XP_022785679.1'},
              'gene-LOC111326079': {'XP_022785778.1'},
              'gene-LOC111321864': {'XP_022780618.1'},
              'gene-LOC111319340': {'XP_022777899.1'},
              'gene-LOC111320611': {'XP_022779001.1'},
              'gene-LOC111331661': {'XP_022792588.1'},
              'gene-LOC111321260': {'XP_022780190.

In [164]:
gene_IPR_dict = {}
for species, gene_prot_dict in gene_protein_dict_all_sp.items():
    gene_IPR_dict[species] = {}
    interpro_file = interpro_dir + species +  '_combined.tsv'
    interpro_df = format_interpro_res(interpro_file)
    for gene, prot_set in gene_prot_dict.items():
        gene_IPRs = list(interpro_df[interpro_df['gene'].isin(prot_set)]['IPR_id'])
        #print(gene_IPRs)
        gene_IPR_dict[species][gene] = gene_IPRs

In [165]:
gene_IPR_dict

{'spis': {'gene-LOC111326177': ['IPR027725',
   'IPR000232',
   'IPR036390',
   'IPR036388',
   'IPR027725',
   'IPR000232',
   'IPR036388',
   'IPR036390',
   'IPR000232',
   'IPR036390',
   'IPR027725',
   'IPR036388'],
  'gene-LOC111327133': ['IPR034584', 'IPR034584', 'IPR034584'],
  'gene-LOC111327047': ['IPR011679',
   'IPR017937',
   'IPR036249',
   'IPR013766',
   'IPR036356',
   'IPR005788'],
  'gene-LOC111328643': ['IPR015943',
   'IPR001680',
   'IPR012953',
   'IPR017986',
   'IPR028598',
   'IPR036322'],
  'gene-LOC111328547': ['IPR011989', 'IPR016024', 'IPR030791', 'IPR029249'],
  'gene-LOC111347478': ['IPR004192',
   'IPR005805',
   'IPR037008',
   'IPR014349',
   'IPR017941',
   'IPR036922',
   'IPR006317',
   'IPR015248'],
  'gene-LOC111325747': [],
  'gene-LOC111326079': ['IPR008271', 'IPR000719', 'IPR011009', 'IPR017441'],
  'gene-LOC111321864': ['IPR027805', 'IPR006612', 'IPR038441'],
  'gene-LOC111319340': ['IPR001650', 'IPR014001', 'IPR027417'],
  'gene-LOC11132061

#### Save dictionary of gene-IPR info for later

In [166]:
#with open("gene_IPR_dict.pickle", "wb") as output_file:
#    pickle.dump(gene_IPR_dict, output_file)

#### Load gene-IPR info if it already exists

In [25]:
with open("gene_IPR_dict.pickle", "rb") as infile:
    gene_IPR_dict = pickle.load(infile)

### Link CNE and IPR info

Time consuming

In [26]:
all_cne_closest_genes['acraspeda']

{'aaur': {'aaur_cne_1': ('scaffold1.g12', 2277),
  'aaur_cne_2': ('scaffold1.g13', 951),
  'aaur_cne_4': ('scaffold1.g51', 2798),
  'aaur_cne_6': ('scaffold1.g77', 8059),
  'aaur_cne_18': ('scaffold2.g24', 60),
  'aaur_cne_23': ('scaffold2.g88', 3214),
  'aaur_cne_26': ('scaffold3.g30', 1210),
  'aaur_cne_27': ('scaffold3.g31', 3452),
  'aaur_cne_28': ('scaffold3.g37', 387),
  'aaur_cne_30': ('scaffold3.g50', 18169),
  'aaur_cne_37': ('scaffold4.g58', 585),
  'aaur_cne_39': ('scaffold4.g63', 2516),
  'aaur_cne_43': ('scaffold5.g11', 7013),
  'aaur_cne_46': ('scaffold5.g37', 697),
  'aaur_cne_50': ('scaffold5.g57', 1748),
  'aaur_cne_53': ('scaffold5.g76', 7347),
  'aaur_cne_54': ('scaffold6.g10', 963),
  'aaur_cne_56': ('scaffold6.g18', 2059),
  'aaur_cne_61': ('scaffold7.g2', 4063),
  'aaur_cne_63': ('scaffold7.g15', 16332),
  'aaur_cne_65': ('scaffold7.g22', 7709),
  'aaur_cne_66': ('scaffold7.g22', 8119),
  'aaur_cne_69': ('scaffold7.g25', 13172),
  'aaur_cne_72': ('scaffold7.g43', 

In [27]:
cne_IPR_dict = {}
cne_IPR_df = pd.DataFrame(columns=['node', 'species', 'cne', 'IPR'])
for node, species_dict in all_cne_closest_genes.items():
    print(node)
    cne_IPR_dict[node] = {}
    for species, cne_dict in species_dict.items():
        cne_IPR_dict[node][species] = {}
        for cne_id, gene_tuple in cne_dict.items():
            gene_id = gene_tuple[0]
            if gene_id != 'no_gene':
                IPR_list = gene_IPR_dict[species][gene_id]
                cne_IPR_dict[node][species][cne_id] = IPR_list
                for IPR in IPR_list:
                    row = [node, species, cne_id, IPR]
                    cne_IPR_df.loc[len(cne_IPR_df)] = row

acraspeda
hexacorallia
robusta
cnidaria
scleractinia
leptothecata
actiniaria
anthozoa
ambiguous
enthemonae
pocilloporidae
hydrozoa
medusozoa


#### Write dataframe of CNE-IPR info

In [28]:
#cne_IPR_df.to_csv("cne_IPR.tsv", sep="\t", index=False)

#### Load dataframe of CNE-IPR info if already exists

In [175]:
cne_IPR_df = pd.read_csv("cne_IPR.tsv", sep="\t")

#### Get IPR descriptions 

In [176]:
all_sp_interpro = pd.DataFrame()
for species in species_list:
    interpro_file = interpro_dir + species +  '_combined.tsv'
    interpro_df = format_interpro_res(interpro_file)
    interpro_df = interpro_df.filter(['IPR_id', 'description'])
    all_sp_interpro = all_sp_interpro.append(interpro_df)
all_sp_interpro = all_sp_interpro.drop_duplicates()
all_sp_interpro = all_sp_interpro.rename({'IPR_id': 'IPR'}, axis = 1)

  all_sp_interpro = all_sp_interpro.append(interpro_df)
  all_sp_interpro = all_sp_interpro.append(interpro_df)
  all_sp_interpro = all_sp_interpro.append(interpro_df)
  all_sp_interpro = all_sp_interpro.append(interpro_df)
  all_sp_interpro = all_sp_interpro.append(interpro_df)
  all_sp_interpro = all_sp_interpro.append(interpro_df)
  all_sp_interpro = all_sp_interpro.append(interpro_df)
  all_sp_interpro = all_sp_interpro.append(interpro_df)
  all_sp_interpro = all_sp_interpro.append(interpro_df)
  all_sp_interpro = all_sp_interpro.append(interpro_df)
  all_sp_interpro = all_sp_interpro.append(interpro_df)
  all_sp_interpro = all_sp_interpro.append(interpro_df)


#### Calculate percent of CNEs linked to each IPR

In [204]:
cne_IPR_df = cne_IPR_df.drop_duplicates(['cne', 'IPR'])

In [205]:
IPR_counts = cne_IPR_df.filter(['node','cne', 'IPR']).groupby(['node', 'IPR']).count().reset_index()
IPR_counts

Unnamed: 0,node,IPR,cne
0,acraspeda,IPR000008,8
1,acraspeda,IPR000011,2
2,acraspeda,IPR000014,4
3,acraspeda,IPR000033,4
4,acraspeda,IPR000034,5
...,...,...,...
57318,scleractinia,IPR043203,26
57319,scleractinia,IPR043204,14
57320,scleractinia,IPR043205,2
57321,scleractinia,IPR043211,422


In [206]:
IPR_counts.sort_values("cne", ascending=False)

Unnamed: 0,node,IPR,cne
39438,pocilloporidae,IPR027417,6025
37674,pocilloporidae,IPR017452,5909
33662,pocilloporidae,IPR000276,5885
36995,pocilloporidae,IPR013783,5265
36441,pocilloporidae,IPR011009,4613
...,...,...,...
28641,hydrozoa,IPR000244,1
5896,ambiguous,IPR020934,1
5898,ambiguous,IPR020993,1
28646,hydrozoa,IPR000301,1


In [207]:
cluster_lca_dict = cluster_lca_df.set_index('cluster_id').to_dict()['node']

#### Retrieve total number of CNEs for each node

In [208]:
node_cne_counts = defaultdict(int)
filtered_clusters = "../../post_parsimony_filtering/filtered_clusters.csv"
with open(filtered_clusters, "r") as f:
    reader = csv.reader(f)
    for line in reader:
        cluster_id = line[0]
        cne_count = len(line) - 1
        if cluster_id in cluster_lca_dict:
            cluster_node = cluster_lca_dict[cluster_id]
        else:
            cluster_node = 'ambiguous' # clusters with 1 at some node, but also pops up somewhere else
        node_cne_counts[cluster_node] += cne_count
        #print(cluster_id, cne_count)
        #print(line)
node_cne_counts

defaultdict(int,
            {'cnidaria': 50905,
             'scleractinia': 76111,
             'anthozoa': 13684,
             'hexacorallia': 22783,
             'ambiguous': 8047,
             'medusozoa': 965,
             'leptothecata': 2476,
             'hydrozoa': 1940,
             'enthemonae': 3366,
             'actiniaria': 2124,
             'robusta': 20777,
             'pocilloporidae': 119950,
             'acraspeda': 2336})

In [209]:
node_cne_counts_df = pd.DataFrame(node_cne_counts.items(), columns=['node', 'node_cne_count'])
node_cne_counts_df

Unnamed: 0,node,node_cne_count
0,cnidaria,50905
1,scleractinia,76111
2,anthozoa,13684
3,hexacorallia,22783
4,ambiguous,8047
5,medusozoa,965
6,leptothecata,2476
7,hydrozoa,1940
8,enthemonae,3366
9,actiniaria,2124


In [211]:
IPR_counts[IPR_counts['node'] == 'robusta']

Unnamed: 0,node,IPR,cne
43157,robusta,IPR000001,29
43158,robusta,IPR000003,1
43159,robusta,IPR000007,1
43160,robusta,IPR000008,63
43161,robusta,IPR000010,4
...,...,...,...
48776,robusta,IPR043203,37
48777,robusta,IPR043204,1
48778,robusta,IPR043205,1
48779,robusta,IPR043211,119


#### Calculate % of CNEs linked to each interproscan protein domain, per node

In [212]:
pct_counts = IPR_counts.merge(node_cne_counts_df)
pct_counts['pct_cnes'] = 100 * pct_counts['cne'] / pct_counts['node_cne_count']
pct_counts = pct_counts.merge(all_sp_interpro)
pct_counts = pct_counts.sort_values(['node', 'pct_cnes'], ascending = [True, False])
pct_counts

Unnamed: 0,node,IPR,cne,node_cne_count,pct_cnes,description
594,acraspeda,IPR000477,137,2336,5.864726,Reverse transcriptase domain
11660,acraspeda,IPR027417,79,2336,3.381849,P-loop containing nucleoside triphosphate hydr...
14491,acraspeda,IPR036691,57,2336,2.440068,Endonuclease/exonuclease/phosphatase superfamily
7445,acraspeda,IPR012337,45,2336,1.926370,Ribonuclease H-like superfamily
8827,acraspeda,IPR015943,44,2336,1.883562,WD40/YVTN repeat-like-containing domain superf...
...,...,...,...,...,...,...
57310,scleractinia,IPR042165,1,76111,0.001314,Phosphatidylglycerophosphatase and protein-tyr...
57311,scleractinia,IPR042289,1,76111,0.001314,Cytochrome c oxidase assembly factor 6 homolog
57313,scleractinia,IPR042357,1,76111,0.001314,WAP four-disulfide core domain protein 1
57314,scleractinia,IPR042450,1,76111,0.001314,Eukaryotic translation elongation factor 1 eps...


#### Write file with %CNE per IPR for each node

In [213]:
for node in set(pct_counts['node']):
    node_pct_counts = pct_counts[pct_counts['node'] == node]
    node_pct_counts.to_csv(node + "_pct_counts.tsv", sep="\t", index=False)

#### Retrieve top 20 IPRs for each node and write to file

In [214]:
top_IPRs = pct_counts.groupby('node').head(20)
top_IPRs

Unnamed: 0,node,IPR,cne,node_cne_count,pct_cnes,description
594,acraspeda,IPR000477,137,2336,5.864726,Reverse transcriptase domain
11660,acraspeda,IPR027417,79,2336,3.381849,P-loop containing nucleoside triphosphate hydr...
14491,acraspeda,IPR036691,57,2336,2.440068,Endonuclease/exonuclease/phosphatase superfamily
7445,acraspeda,IPR012337,45,2336,1.926370,Ribonuclease H-like superfamily
8827,acraspeda,IPR015943,44,2336,1.883562,WD40/YVTN repeat-like-containing domain superf...
...,...,...,...,...,...,...
2580,scleractinia,IPR001881,1507,76111,1.980003,EGF-like calcium-binding domain
7710,scleractinia,IPR013083,1479,76111,1.943215,"Zinc finger, RING/FYVE/PHD-type"
7383,scleractinia,IPR011990,1398,76111,1.836791,Tetratricopeptide-like helical domain superfamily
142,scleractinia,IPR000152,1318,76111,1.731681,EGF-type aspartate/asparagine hydroxylation site


In [215]:
top_IPRs.to_csv('top_IPRs.tsv', sep="\t", index=False)

#### Retrieve all IPRs representing > 0.5% of CNEs

In [216]:
pct_thresh = 0.5
pct_counts_pct_thresh = pct_counts[pct_counts['pct_cnes'] > pct_thresh].sort_values(['node', 'pct_cnes'], ascending = [True, False])

In [217]:
pct_counts_pct_thresh.to_csv('top_IPRs_0.5pct.tsv', sep="\t", index=False)

#### Retrieve percentage of cnes mapping to homeodomain

In [218]:
homeo_IPRids = ['IPR009057', 'IPR017970', 'IPR001356', 'IPR020479', 'IPR008422', 'IPR032967',
               'IPR032453', 'IPR000747' ] 

In [233]:
pct_homeo = pct_counts[pct_counts['IPR'] == 'IPR009057'] # homeodomain
pct_homeo['pct_cnes'] = round(pct_homeo['pct_cnes'], 2)
pct_homeo = node_cne_counts_df.merge(pct_homeo, on=['node', 'node_cne_count'], how='left')
pct_homeo.cne = pct_homeo.cne.fillna(0)
pct_homeo.pct_cnes = pct_homeo.pct_cnes.fillna(0)
pct_homeo.cne = pct_homeo.cne.astype(int)
pct_homeo.IPR = pct_homeo.IPR.fillna('IPR009057')
pct_homeo.description = pct_homeo.description.fillna('Homeobox-like domain superfamily')
pct_homeo

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pct_homeo['pct_cnes'] = round(pct_homeo['pct_cnes'], 2)


Unnamed: 0,node,node_cne_count,IPR,cne,pct_cnes,description
0,cnidaria,50905,IPR009057,372,0.73,Homeobox-like domain superfamily
1,scleractinia,76111,IPR009057,960,1.26,Homeobox-like domain superfamily
2,anthozoa,13684,IPR009057,94,0.69,Homeobox-like domain superfamily
3,hexacorallia,22783,IPR009057,255,1.12,Homeobox-like domain superfamily
4,ambiguous,8047,IPR009057,66,0.82,Homeobox-like domain superfamily
5,medusozoa,965,IPR009057,3,0.31,Homeobox-like domain superfamily
6,leptothecata,2476,IPR009057,15,0.61,Homeobox-like domain superfamily
7,hydrozoa,1940,IPR009057,19,0.98,Homeobox-like domain superfamily
8,enthemonae,3366,IPR009057,141,4.19,Homeobox-like domain superfamily
9,actiniaria,2124,IPR009057,50,2.35,Homeobox-like domain superfamily


In [220]:
pct_all_homeo = pct_counts[pct_counts['IPR'].isin(homeo_IPRids)] # Not that helpful
pct_all_homeo['pct_cnes'] = round(pct_all_homeo['pct_cnes'], 2)
pct_all_homeo

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pct_all_homeo['pct_cnes'] = round(pct_all_homeo['pct_cnes'], 2)


Unnamed: 0,node,IPR,cne,node_cne_count,pct_cnes,description
6534,acraspeda,IPR009057,27,2336,1.16,Homeobox-like domain superfamily
1813,acraspeda,IPR001356,17,2336,0.73,Homeobox domain
9378,acraspeda,IPR017970,14,2336,0.60,"Homeobox, conserved site"
10291,acraspeda,IPR020479,4,2336,0.17,"Homeobox domain, metazoa"
6535,actiniaria,IPR009057,50,2124,2.35,Homeobox-like domain superfamily
...,...,...,...,...,...,...
9390,scleractinia,IPR017970,530,76111,0.70,"Homeobox, conserved site"
10302,scleractinia,IPR020479,251,76111,0.33,"Homeobox domain, metazoa"
18908,scleractinia,IPR008422,69,76111,0.09,Homeobox KN domain
22076,scleractinia,IPR032453,64,76111,0.08,"Homeobox protein PKNOX/Meis, N-terminal"


In [234]:
pct_all_homeo.to_csv('pct_all_homeo.tsv', sep="\t", index=False)
pct_homeo.to_csv('pct_homeo.tsv', sep="\t", index=False)

### Calculate proportion per species (not by node)

In [222]:
#### Count CNEs per species
sp_cne_counts = defaultdict(int)
with open(filtered_clusters, "r") as f:
    reader = csv.reader(f)
    for line in reader:
        cluster_id = line[0]
        for cne in line[1:]:
            sp = cne.split("_")[0]
            sp_cne_counts[sp]+=1
sp_cne_counts

defaultdict(int,
            {'adig': 50811,
             'aten': 5576,
             'dgig': 3644,
             'epal': 5313,
             'ofav': 43954,
             'pdam': 85198,
             'spis': 110941,
             'aaur': 2049,
             'chem': 2352,
             'hsym': 6532,
             'hvul': 2540,
             'mvir': 2730,
             'nvec': 3824})

In [223]:
sp_cne_counts_df = pd.DataFrame(sp_cne_counts.items(), columns=['species', 'total_cne_count'])
sp_cne_counts_df

Unnamed: 0,species,total_cne_count
0,adig,50811
1,aten,5576
2,dgig,3644
3,epal,5313
4,ofav,43954
5,pdam,85198
6,spis,110941
7,aaur,2049
8,chem,2352
9,hsym,6532


In [224]:
cnes_with_IPR = cne_IPR_df.drop_duplicates('cne').groupby('species')['cne'].count().reset_index().\
    rename({'cne':'cnes_with_IPR_info'}, axis=1)
cnes_with_IPR

Unnamed: 0,species,cnes_with_IPR_info
0,aaur,1216
1,adig,39097
2,aten,4432
3,chem,1075
4,dgig,2807
5,epal,4114
6,hsym,5184
7,hvul,1664
8,mvir,2012
9,nvec,2819


In [225]:
homeo_cne_counts = cne_IPR_df[cne_IPR_df['IPR'].isin(homeo_IPRids)].drop_duplicates('cne')[['species', 'IPR']]\
        .groupby('species').count().rename({'IPR':'homeo_cne_count'}, axis=1).reset_index()\
        .merge(sp_cne_counts_df).merge(cnes_with_IPR)
homeo_cne_counts['homeo_pct_cnes'] = round(100*homeo_cne_counts['homeo_cne_count']/\
                                           homeo_cne_counts['total_cne_count'],2)
homeo_cne_counts['homeo_pct_cnes_with_info'] = round(100*homeo_cne_counts['homeo_cne_count']/\
                                            homeo_cne_counts['cnes_with_IPR_info'],2)
homeo_cne_counts

Unnamed: 0,species,homeo_cne_count,total_cne_count,cnes_with_IPR_info,homeo_pct_cnes,homeo_pct_cnes_with_info
0,aaur,15,2049,1216,0.73,1.23
1,adig,491,50811,39097,0.97,1.26
2,aten,135,5576,4432,2.42,3.05
3,chem,8,2352,1075,0.34,0.74
4,dgig,24,3644,2807,0.66,0.86
5,epal,131,5313,4114,2.47,3.18
6,hsym,42,6532,5184,0.64,0.81
7,hvul,23,2540,1664,0.91,1.38
8,mvir,30,2730,2012,1.1,1.49
9,nvec,50,3824,2819,1.31,1.77


#### ADD % of homeo genes in each genome for context

In [226]:
homeo_counts = defaultdict(int)
for species, IPR_dict in gene_IPR_dict.items():
    for gene, IPR_list in IPR_dict.items():
        if any(x in homeo_IPRids for x in IPR_list):
            homeo_counts[species] +=1
homeo_counts

defaultdict(int,
            {'spis': 193,
             'hsym': 148,
             'aaur': 111,
             'dgig': 196,
             'chem': 128,
             'ofav': 204,
             'aten': 189,
             'mvir': 217,
             'hvul': 253,
             'adig': 197,
             'epal': 199,
             'pdam': 184,
             'nvec': 227})

In [227]:
genes_with_ipr_counts = {}
for species, ipr_dict in gene_IPR_dict.items():
    genes_with_ipr_counts[species] = len(ipr_dict)
genes_with_IPR_df = pd.DataFrame(genes_with_ipr_counts.items(), columns=['species', 'genes_with_IPR'])
genes_with_IPR_df

Unnamed: 0,species,genes_with_IPR
0,spis,24846
1,hsym,22022
2,aaur,30167
3,dgig,22045
4,chem,45872
5,ofav,25929
6,aten,19980
7,mvir,24278
8,hvul,20058
9,adig,26073


In [228]:
homeo_counts_df = pd.DataFrame(homeo_counts.items(), columns=['species', 'num_homeo_genes'])
homeo_cne_counts = homeo_cne_counts.merge(homeo_counts_df).merge(genes_with_IPR_df)
homeo_cne_counts['pct_homeo_genes_w_IPR'] = round(100*homeo_cne_counts['num_homeo_genes']/\
                                                homeo_cne_counts['genes_with_IPR'], 2)
homeo_cne_counts

Unnamed: 0,species,homeo_cne_count,total_cne_count,cnes_with_IPR_info,homeo_pct_cnes,homeo_pct_cnes_with_info,num_homeo_genes,genes_with_IPR,pct_homeo_genes_w_IPR
0,aaur,15,2049,1216,0.73,1.23,111,30167,0.37
1,adig,491,50811,39097,0.97,1.26,197,26073,0.76
2,aten,135,5576,4432,2.42,3.05,189,19980,0.95
3,chem,8,2352,1075,0.34,0.74,128,45872,0.28
4,dgig,24,3644,2807,0.66,0.86,196,22045,0.89
5,epal,131,5313,4114,2.47,3.18,199,22509,0.88
6,hsym,42,6532,5184,0.64,0.81,148,22022,0.67
7,hvul,23,2540,1664,0.91,1.38,253,20058,1.26
8,mvir,30,2730,2012,1.1,1.49,217,24278,0.89
9,nvec,50,3824,2819,1.31,1.77,227,23845,0.95


In [229]:
gene_count_dict = {}
for species, gene_dict in gene_protein_dict_all_sp.items():
    gene_count = len(gene_dict)
    gene_count_dict[species] = gene_count
total_gene_counts = pd.DataFrame(gene_count_dict.items(), columns=['species', 'total_gene_count'])
total_gene_counts

Unnamed: 0,species,total_gene_count
0,spis,24846
1,hsym,22022
2,aaur,30167
3,dgig,22045
4,chem,45872
5,ofav,25929
6,aten,19980
7,mvir,24278
8,hvul,20058
9,adig,26073


In [230]:
### Add total number of genes
homeo_cne_counts = homeo_cne_counts.merge(total_gene_counts)
homeo_cne_counts['pct_homeo_genes_total'] = round(100*(homeo_cne_counts['num_homeo_genes']/\
                                                      homeo_cne_counts['total_gene_count']),2)
homeo_cne_counts

Unnamed: 0,species,homeo_cne_count,total_cne_count,cnes_with_IPR_info,homeo_pct_cnes,homeo_pct_cnes_with_info,num_homeo_genes,genes_with_IPR,pct_homeo_genes_w_IPR,total_gene_count,pct_homeo_genes_total
0,aaur,15,2049,1216,0.73,1.23,111,30167,0.37,30167,0.37
1,adig,491,50811,39097,0.97,1.26,197,26073,0.76,26073,0.76
2,aten,135,5576,4432,2.42,3.05,189,19980,0.95,19980,0.95
3,chem,8,2352,1075,0.34,0.74,128,45872,0.28,45872,0.28
4,dgig,24,3644,2807,0.66,0.86,196,22045,0.89,22045,0.89
5,epal,131,5313,4114,2.47,3.18,199,22509,0.88,22509,0.88
6,hsym,42,6532,5184,0.64,0.81,148,22022,0.67,22022,0.67
7,hvul,23,2540,1664,0.91,1.38,253,20058,1.26,20058,1.26
8,mvir,30,2730,2012,1.1,1.49,217,24278,0.89,24278,0.89
9,nvec,50,3824,2819,1.31,1.77,227,23845,0.95,23845,0.95


In [231]:
homeo_cne_counts

Unnamed: 0,species,homeo_cne_count,total_cne_count,cnes_with_IPR_info,homeo_pct_cnes,homeo_pct_cnes_with_info,num_homeo_genes,genes_with_IPR,pct_homeo_genes_w_IPR,total_gene_count,pct_homeo_genes_total
0,aaur,15,2049,1216,0.73,1.23,111,30167,0.37,30167,0.37
1,adig,491,50811,39097,0.97,1.26,197,26073,0.76,26073,0.76
2,aten,135,5576,4432,2.42,3.05,189,19980,0.95,19980,0.95
3,chem,8,2352,1075,0.34,0.74,128,45872,0.28,45872,0.28
4,dgig,24,3644,2807,0.66,0.86,196,22045,0.89,22045,0.89
5,epal,131,5313,4114,2.47,3.18,199,22509,0.88,22509,0.88
6,hsym,42,6532,5184,0.64,0.81,148,22022,0.67,22022,0.67
7,hvul,23,2540,1664,0.91,1.38,253,20058,1.26,20058,1.26
8,mvir,30,2730,2012,1.1,1.49,217,24278,0.89,24278,0.89
9,nvec,50,3824,2819,1.31,1.77,227,23845,0.95,23845,0.95


#### Write homeodomain-related stats to file

In [232]:
homeo_cne_counts.to_csv('homeo_cne_counts_by_sp.tsv', sep="\t", index=False)