### Goal

Output a dataframe of gene-to-closest-CNE distances for homeobox genes vs. all other genes, for plotting in R.

### Input

- Directory of pickled dictionaries of distance between every gene and its closest CNE (generated with distance_to_CNE.py)
- InterProScan protein domain annotations (IPR IDs) for all genes, stored as a pickled dictionary keyed by species

### Output

species + '_homeo_dists_for_plot.tsv' ; one tab-separated dataframe per species (columns: gene, dist_to_CNE, gene_set), ready for plotting in R

In [7]:
import pandas as pd
import glob
import sys
import gffutils
import pickle5 as pickle
from collections import defaultdict

#### Import distance data

In [3]:
### Directory holding the per-species pickles of gene -> closest-CNE distance
distance_pickle_dir = "distance_pickles/"

In [5]:
### Single species used to inspect one distance pickle before the full run
species = 'spis'
dist_file = f"{distance_pickle_dir}{species}_closest_CNE_dists.pickle"

In [8]:
### Load the gene -> distance-to-closest-CNE dictionary for the chosen species
### NOTE: pickle executes arbitrary code on load; only open files you generated yourself
with open(dist_file, "rb") as input_file:
    dist_dict = pickle.load(input_file)

In [9]:
### Inspect the loaded mapping (gene ID -> distance to its closest CNE)
dist_dict

{'gene-LOC111326177': 0,
 'gene-LOC111327133': 286,
 'gene-LOC111327047': 210,
 'gene-LOC111328643': 106,
 'gene-LOC111328547': 0,
 'gene-LOC111347478': 2946,
 'gene-LOC111325747': 0,
 'gene-LOC111326079': 325,
 'gene-LOC111321864': 2720,
 'gene-LOC111319340': 520,
 'gene-LOC111320611': 402,
 'gene-LOC111331661': 2082,
 'gene-LOC111321260': 62,
 'gene-LOC111339961': 482,
 'gene-LOC111344587': 189,
 'gene-LOC111331958': 197,
 'gene-LOC111345918': 2503,
 'gene-LOC111342729': 643,
 'gene-LOC111332051': 472,
 'gene-LOC111328189': 1049,
 'gene-LOC111328095': 47,
 'gene-LOC111328282': 4120,
 'gene-LOC111328459': 0,
 'gene-LOC111343985': 0,
 'gene-LOC111346822': 59,
 'gene-LOC111343739': 507,
 'gene-LOC111332645': 539,
 'gene-LOC111327642': 327,
 'gene-LOC111327736': 16,
 'gene-LOC111327487': 0,
 'gene-LOC111325339': 152,
 'gene-LOC111325259': 22,
 'gene-LOC111339837': 500,
 'gene-LOC111333068': 241,
 'gene-LOC111332979': 168,
 'gene-LOC111320584': 1509,
 'gene-LOC111346018': 199,
 'gene-LOC1

#### Import IPR info for all genes

In [11]:
### Pickled dictionary of IPR IDs per gene; indexed by species code below
gene_IPR_file = "../../find_closest_gene/new_parse_gff/gene_IPR_dict.pickle"

In [12]:
### Load the IPR annotations: gene_IPR_dict[species][gene] is a list of IPR IDs
### (the list may contain repeated IDs or be empty, as the cell output below shows)
with open(gene_IPR_file, "rb") as input_file:
    gene_IPR_dict = pickle.load(input_file)

In [14]:
### Inspect the IPR annotations of one species (gene ID -> list of IPR IDs)
gene_IPR_dict['spis']

{'gene-LOC111326177': ['IPR027725',
  'IPR000232',
  'IPR036390',
  'IPR036388',
  'IPR027725',
  'IPR000232',
  'IPR036388',
  'IPR036390',
  'IPR000232',
  'IPR036390',
  'IPR027725',
  'IPR036388'],
 'gene-LOC111327133': ['IPR034584', 'IPR034584', 'IPR034584'],
 'gene-LOC111327047': ['IPR011679',
  'IPR017937',
  'IPR036249',
  'IPR013766',
  'IPR036356',
  'IPR005788'],
 'gene-LOC111328643': ['IPR015943',
  'IPR001680',
  'IPR012953',
  'IPR017986',
  'IPR028598',
  'IPR036322'],
 'gene-LOC111328547': ['IPR011989', 'IPR016024', 'IPR030791', 'IPR029249'],
 'gene-LOC111347478': ['IPR004192',
  'IPR005805',
  'IPR037008',
  'IPR014349',
  'IPR017941',
  'IPR036922',
  'IPR006317',
  'IPR015248'],
 'gene-LOC111325747': [],
 'gene-LOC111326079': ['IPR008271', 'IPR000719', 'IPR011009', 'IPR017441'],
 'gene-LOC111321864': ['IPR027805', 'IPR006612', 'IPR038441'],
 'gene-LOC111319340': ['IPR001650', 'IPR014001', 'IPR027417'],
 'gene-LOC111320611': ['IPR008160'],
 'gene-LOC111331661': ['IPR0

### Retrieve distances to the closest CNE for homeobox genes VS all other genes

In [32]:
### Species codes to process; each one needs a distance pickle and an entry in gene_IPR_dict
species_list = [
    'spis',
    'hsym',
    'aaur',
    'dgig',
    'chem',
    'ofav',
    'aten',
    'mvir',
    'hvul',
    'adig',
    'epal',
    'pdam',
    'nvec',
]

In [17]:
### IPR IDs that flag a gene as a homeobox gene (a gene matches if it carries any of them)
homeo_IPRids = [
    'IPR009057',
    'IPR017970',
    'IPR001356',
    'IPR020479',
    'IPR008422',
    'IPR032967',
    'IPR032453',
    'IPR000747',
]

In [34]:
### For every species, write a TSV of gene -> distance to closest CNE, labelling each
### gene as 'homeo' (carries at least one homeobox IPR ID) or 'others'.

### Placeholder distance stored upstream for genes without a closest CNE.
### NOTE(review): the original note attributes this "arbitrary high value" to
### distance_to_gene.py, while the notebook header names distance_to_CNE.py -- confirm which.
NO_CNE_SENTINEL = 1000000000000

### Built once: set membership is O(1) while scanning each gene's IPR list
homeo_IPR_set = set(homeo_IPRids)

for species in species_list:
    dist_file = distance_pickle_dir + species + '_closest_CNE_dists.pickle'
    with open(dist_file, "rb") as input_file:
        closest_CNE_dists = pickle.load(input_file)

    GOIs = [] ### Genes of interest (in this case, homeobox genes)
    others = []
    for gene, IPR_list in gene_IPR_dict[species].items():
        if any(x in homeo_IPR_set for x in IPR_list):
            GOIs.append(gene)
        else:
            others.append(gene)

    ### Without any homeobox gene there is nothing to compare: no file is written
    if not GOIs:
        continue

    ### Homeobox rows first, then the others (same row order as before).
    ### A gene missing from the distance pickle still raises KeyError, so
    ### mismatched inputs are not silently dropped.
    rows = [(gene, closest_CNE_dists[gene], 'homeo') for gene in GOIs]
    rows += [(gene, closest_CNE_dists[gene], 'others') for gene in others]

    ### Build the table in one step instead of concatenating onto an empty
    ### placeholder frame (pandas >= 2.1 warns about concatenating empty entries)
    output_df = pd.DataFrame(rows, columns=['gene', 'dist_to_CNE', 'gene_set'])

    ### Remove genes without closest CNE
    output_df = output_df[output_df['dist_to_CNE'] != NO_CNE_SENTINEL]
    output_df.to_csv(species + '_homeo_dists_for_plot.tsv', sep="\t", index=False)