#### Prep genelists ###
- Prepare genelists, converting IDs to the version used (r6.28)

In [None]:
#Imports
import sys
import os
import pandas as pd

sys.path.append('../scripts')

from annotation_utilities import *
from plot_helpers import *

%load_ext autoreload
%autoreload 2

In [None]:
## Flybase ID problem explanation
# Display the explanatory figure inline (the Image object is the cell's last
# expression, so the notebook renders it).
from IPython.display import Image
Image('../../resources/flybase_files/FlybaseID_problem.png')

In [None]:
# Output directory for the gene lists written by this notebook.
outdir = '../Figures/genesets'
# exist_ok=True: re-running the cell does not fail if the directory is already there
os.makedirs(outdir, exist_ok=True)

In [None]:
# Collect every gene ID that may appear in the dataset: the index (first column)
# of the per-gene abundance summary, de-duplicated.
# NOTE(review): results_dir is not defined in this notebook; presumably it comes
# from one of the star imports above -- confirm.
result_file = os.path.join(results_dir, 'gene_quantification/summary_abundance_by_gene.csv')
quantified_index = pd.read_csv(result_file, index_col=0).index
result_genes = list(set(quantified_index))

In [None]:
# Create the ID mapping table between the 6.28 and 6.32 annotation releases.
# The index is the old ID and the columns contain the new ID (column 'new_ID').
# The rest of the notebook treats the index as the 6.28 IDs and 'new_ID' as the 6.32 IDs.
id_dir = '../../resources/id_conversion/'
dmel628_file = os.path.join(id_dir, 'fbgn_annotation_ID_fb_2019_03.tsv')
dmel632_file = os.path.join(id_dir, 'fbgn_annotation_ID_fb_2020_01.tsv')
# Earlier call forms, kept for reference (they restricted the table to result_genes):
# df = update_ids(result_genes, dmel628_file, dmel632_file)
# df = update_ids(dmel632_file, dmel628_file, genes=result_genes)
df = update_ids(dmel632_file, dmel628_file)

# Optional diagnostic: print the IDs with no new ID, which could be due to a withdrawn gene model
# print(df[pd.isnull(df['new_ID'])])

In [None]:
# Create the ID mapping table for 6.28->6.40. This is needed for the neurite enriched genes,
# whose fly IDs are expected to be in the 6.40 version.
# Reuses id_dir and dmel628_file from the 6.28/6.32 mapping cell.
dmel640_file = os.path.join(id_dir, 'fbgn_annotation_ID_fb_2021_03.tsv')
df_dmel640 = update_ids(dmel640_file, dmel628_file)

In [None]:
# Mark which genes are TFs, using the gene table from dmel6.32
res_dir = '../../resources/genelists/'
tf_file = os.path.join(res_dir, 'TFs_dmel632.txt')
# The TF list has no header row; the IDs are in the first tab-separated column
tf_ids = set(pd.read_csv(tf_file, sep='\t', header=None)[0].tolist())
# Get TF genelist membership by using the 6.32 IDs (the 'new_ID' column)
df['TF'] = df['new_ID'].isin(tf_ids)
# Keep the converted IDs (the table index, i.e. the old IDs) as a Series and
# then write them out. Previously this name was bound to the return value of
# to_csv(path), which is always None.
TF_IDs_converted = df.query('TF').reset_index()['index']
TF_IDs_converted.to_csv(os.path.join(outdir, 'all_TFs.csv'), header=False, index=False)

In [None]:
# Sanity check: size of the input TF list vs. number of rows flagged as TF
# after ID conversion.
print('original number of TFs', len(tf_ids))
print('number of converted TFs', len(df.loc[df['TF']]))

In [None]:
# First step of converting the neurite genes (DIOPT v9 output) to 6.28 IDs.
# Look at overlap with the Kugelgen neurite enrichment review:
# export the ENSMUS gene IDs so they can be submitted to the DIOPT tool.
# The input table is Extended table 2:
# https://bimsbstatic.mdc-berlin.de/chekulaeva/Supplementary_online_tables.html
# NOTE(review): nmj_dir is an absolute path on one machine; this cell will not
# run elsewhere without editing it.
nmj_dir = '/Users/mk/Desktop/Davislab_old/3.4_NMJ_4Tu_4sU/3.4e_pipeline_dev/nmj_figures/resources/'
neur_file = os.path.join(nmj_dir, 'neural_loc/Kugelgen_enriched.csv')
neur_outfile = os.path.join(nmj_dir, 'neural_loc/Kugelgen_enriched_ensmus.csv')
neur_df = pd.read_csv(neur_file)
# Keep genes enriched in at least 3 datasets and write only their IDs,
# one per line, without header or index.
neur_df.loc[
    neur_df['Datasets with significant neurite enrichment (p<0.1)'].ge(3),
    'gene_id',
].to_csv(neur_outfile, index=False, header=False)


DIOPT v9.0 web app: https://www.flyrnai.org/cgi-bin/DRSC_orthologs_v09.pl
The Kugelgen_enriched_ensmus genes were then submitted to the DIOPT v9.0 web app and the output was saved as Kugelgen_enriched_fly.csv.
DIOPT reported: "Your 2712 query symbols mapped to 2695 genes. 250 of them had no orthologs." The IDs in the saved output file should be in the 6.40 version.



In [None]:
# Load the DIOPT v9 ortholog table and keep the fly gene IDs of the
# high-confidence orthologs (DIOPT Score of 8 or more).
neur_df_fly = pd.read_csv(os.path.join(res_dir, 'diopt_9_neurite_enriched_fromENSM.csv'))
neurite_fly_640 = set(
    neur_df_fly.loc[neur_df_fly['DIOPT Score'] >= 8, 'Fly Species Gene ID']
)

# Flag neurite membership via the new IDs of each mapping table
# (6.40 IDs in df_dmel640, 6.32 IDs in df) ...
for mapping_table in (df_dmel640, df):
    mapping_table['neurite'] = mapping_table['new_ID'].isin(neurite_fly_640)
# ... and, for comparison, via the old (6.28) IDs held in the index of df.
df['neurite_628mapped'] = df.index.isin(neurite_fly_640)

# Write the genes matched through the 6.32 IDs out as their 6.28 IDs (the index).
df.loc[df['neurite']].reset_index()['index'].to_csv(
    os.path.join(outdir, 'neurite_localised_628.csv'),
    header=False,
    index=False,
)
# Alternative using the 6.40 mapping (not used):
# df_dmel640.query('neurite').reset_index()['index'].to_csv(os.path.join(outdir, 'neurite_localised.csv'), header=False, index=False)
# More of the genes map from 6.32 -> 6.28 than from 6.40 -> 6.28

In [None]:
# Find the annotation release that best matches the neurite gene IDs.
# Legacy input, kept for reference:
# all_df = pd.concat([pd.read_csv(prog_file), pd.read_csv(glia_file), pd.read_csv(neuron_file)])
# dillon_genes = all_df['gene']

# One FlyBase ID annotation table per release. id_dir, dmel628_file and
# dmel632_file are re-declared here with the same values as earlier in the notebook.
id_dir = '../../resources/id_conversion/'
dmel628_file = os.path.join(id_dir, 'fbgn_annotation_ID_fb_2019_03.tsv')
dmel630_file = os.path.join(id_dir, 'fbgn_annotation_ID_fb_2019_05.tsv')
dmel631_file = os.path.join(id_dir, 'fbgn_annotation_ID_fb_2019_06.tsv')
dmel632_file = os.path.join(id_dir, 'fbgn_annotation_ID_fb_2020_01.tsv')
dmel633_file = os.path.join(id_dir, 'fbgn_annotation_ID_fb_2020_02.tsv')
dmel634_file = os.path.join(id_dir, 'fbgn_annotation_ID_fb_2020_03.tsv')
dmel635_file = os.path.join(id_dir, 'fbgn_annotation_ID_fb_2020_04.tsv')
dmel636_file = os.path.join(id_dir, 'fbgn_annotation_ID_fb_2020_05.tsv')

# Candidate releases as (label, annotation file) pairs; 6.28 is not among them.
versions = [
    ('dmel630', dmel630_file),
    ('dmel631', dmel631_file),
    ('dmel632', dmel632_file),
    ('dmel633', dmel633_file),
    ('dmel634', dmel634_file),
    ('dmel635', dmel635_file),
    ('dmel636', dmel636_file),
]

# NOTE(review): find_best_version is a project helper; the meaning of
# count_dict and notfound is not visible here -- confirm against its definition.
best_version, count_dict, notfound = find_best_version(
    neurite_fly_640,
    version_files=versions,
    id_type='FB',
)
print('best version', best_version)

In [None]:
# Neurite gene IDs that are not among the 6.32 IDs ('new_ID') of the rows
# flagged as neurite, i.e. genes not found in my dataset when mapping via 6.32.
neurite_fly_640 - set(df.loc[df['neurite'], 'new_ID'])
# I checked and this gene is in 632, but it doesn't exist in 628, which is why it doesn't end up in the list of genes

In [None]:
# Genes which are not found in my dataset, 628 mapped.
# 'neurite_628mapped' was computed from df.index (the 6.28 IDs), so the matched
# IDs to subtract are the index values of those rows, not their 'new_ID'
# (6.32 IDs). Using 'new_ID' would report a gene as missing whenever its ID
# changed between the two releases, even though it was matched.
neurite_fly_640.difference(df.query('neurite_628mapped').index)

In [None]:
# Neurite gene IDs that are not among the 6.40 IDs ('new_ID') of the rows
# flagged as neurite in the 6.28->6.40 table, i.e. genes not found in my
# dataset when mapping via 6.40.
neurite_fly_640 - set(df_dmel640.loc[df_dmel640['neurite'], 'new_ID'])
# These first two are the current ID for v6.32, so that's what it makes sense to use

In [None]:
# Unique 6.40 IDs of the neurite genes that are present in the 6.28->6.40 mapping table
neurite_indataset = set(df_dmel640.loc[df_dmel640['neurite'], 'new_ID'])

In [None]:
# Number of unique neurite gene IDs found via the 6.40 mapping
len(neurite_indataset)

In [None]:
# Total number of neurite gene IDs from DIOPT (score >= 8), for comparison
len(neurite_fly_640)

In [None]:
# Number of mapping-table rows flagged as neurite (can exceed the unique-ID
# count above if several rows share one new ID)
df_dmel640.loc[df_dmel640['neurite']].shape[0]