#### CTS genes Dillon
- Extract the genes which are enriched in specific cell types using the Dillon et al., 2022 dataset
- Use the dataset as processed by Jeff: all of the subclustering groups combined, and then FindAllMarkers() rerun

In [None]:
#Imports
import sys
import os
import pandas as pd
import seaborn as sns
import numpy as np
import math
import gffutils
import scipy.stats as stats
from collections import defaultdict
import warnings

sys.path.append('../scripts')
from plot_helpers import *
from plotting_fxns import PrettyBox
from utilities import load_dataset
from annotation_utilities import *

%load_ext autoreload
%autoreload 2

In [None]:
# Lookup tables from the cluster number used in the spreadsheets to the cluster
# name used in the publication. These were entered by hand.

# Names taken from Fig 2C (same as Fig 1, except that Fig 2 is missing the low quality category)
prog_dict = {
    0: 'immature neurons',
    1: 'GMCs',
    2: 'Type II NBs',
    3: 'INPs',
    4: 'immature neurons',
    5: 'immature neurons',
    6: 'immature neurons',
    7: 'low quality',
    8: 'new-born neurons',
    9: 'Type I NBs',
    10: 'immature neurons',
    11: 'immature neurons',
    12: 'Quiescent NBs',
}

glia_dict = {
    0: 'cortex/chiasm glia',
    1: 'perineural glia',
    2: 'astrocytes/neuropil glia',
    3: 'subperineural glia',
}

# Names taken from Fig 7C, which has the mature neurons
neuron_dict = {
    0: 'cholinergic',
    1: 'unannotated',
    2: 'GABAergic',
    3: 'Glutamatergic',
    4: 'undifferentiated',
    5: 'undifferentiated',
    6: 'motor neurons',
    7: 'kenyon cells gamma',
    8: 'monoaminergic',
    9: 'peptidergic',
    10: 'unannotated',
    11: 'unannotated',
    12: 'octopaminergic',
    13: 'neurosecretory cells',
}

In [None]:
# Input: supplementary tables of the Dillon study (progenitor, glia and neuron files)
dillon_dir = '../../resources/other_studies/Dillon_Supplementary/'
prog_file, glia_file, neuron_file = (
    os.path.join(dillon_dir, table_name)
    for table_name in (
        '13064_2022_163_MOESM2_ESM.csv',
        '13064_2022_163_MOESM3_ESM.csv',
        '13064_2022_163_MOESM16_ESM.csv',
    )
)

# Output: folder for the CTS results, created up front if it does not exist yet
outdir = '../Figures/CTS'
os.makedirs(outdir, exist_ok=True)

In [None]:
# Find the best genome version to use for these:
# pool the rows of all three supplementary tables and take their gene column
all_df = pd.concat([pd.read_csv(table_path) for table_path in (prog_file, glia_file, neuron_file)])
dillon_genes = all_df['gene']

id_dir = '../../resources/id_conversion/'


def _id_table(filename):
    """Return the path of *filename* inside the id-conversion directory."""
    return os.path.join(id_dir, filename)


# One annotation ID table per annotation version
dmel628_file = _id_table('fbgn_annotation_ID_fb_2019_03.tsv')
dmel630_file = _id_table('fbgn_annotation_ID_fb_2019_05.tsv')
dmel631_file = _id_table('fbgn_annotation_ID_fb_2019_06.tsv')
dmel632_file = _id_table('fbgn_annotation_ID_fb_2020_01.tsv')
dmel633_file = _id_table('fbgn_annotation_ID_fb_2020_02.tsv')
dmel634_file = _id_table('fbgn_annotation_ID_fb_2020_03.tsv')
dmel635_file = _id_table('fbgn_annotation_ID_fb_2020_04.tsv')
dmel636_file = _id_table('fbgn_annotation_ID_fb_2020_05.tsv')

# Candidate (version name, ID table) pairs; dmel628_file is not among the candidates
versions = [
    ('dmel630', dmel630_file),
    ('dmel631', dmel631_file),
    ('dmel632', dmel632_file),
    ('dmel633', dmel633_file),
    ('dmel634', dmel634_file),
    ('dmel635', dmel635_file),
    ('dmel636', dmel636_file),
]

# NOTE(review): find_best_version comes from the project helpers; presumably it picks
# the version matching the most gene symbols - confirm in annotation_utilities
best_version, count_dict, notfound = find_best_version(dillon_genes, version_files=versions)
print('best version', best_version)

In [None]:
# Conversion table from 6.28 (Thompson) to 6.32 (Dillon).
# The index is moved into a regular column, and the column named 'index' becomes 'old_ID'.
convert_df = (
    update_ids(dmel632_file, from_version=dmel628_file)
    .reset_index()
    .rename(columns={'index': 'old_ID'})
)

In [None]:
# Dillon supplement only contains gene name and no FBgn number at all!
# Here the marker tables are only filtered and labelled by cluster name;
# the markers are still identified by gene symbol after this step.
fc_co = 1
pval_co = 0.05


def _load_cts_markers(path, cluster_names):
    """Load one marker table, keep the enriched markers and label their clusters.

    Rows are kept when ``p_val_adj < pval_co`` and ``avg_log2FC >= fc_co``. A
    ``clustername`` column is added by mapping the ``cluster`` column through
    *cluster_names*.

    Args:
        path: CSV file with ``p_val_adj``, ``avg_log2FC`` and ``cluster`` columns.
        cluster_names: dict mapping cluster number -> cluster name.

    Returns:
        The filtered DataFrame with the extra ``clustername`` column.

    Raises:
        ValueError: if a kept row has a cluster number missing from *cluster_names*.
    """
    table = pd.read_csv(path)
    passes_cutoffs = (table['p_val_adj'] < pval_co) & (table['avg_log2FC'] >= fc_co)
    # Copy so that adding a column below never touches (or warns about) the full table
    markers = table.loc[passes_cutoffs].copy()
    markers['clustername'] = markers['cluster'].map(cluster_names)

    # Fail loudly, naming the offending cluster numbers, instead of a bare assert
    unmapped = markers.loc[markers['clustername'].isnull(), 'cluster'].unique().tolist()
    if unmapped:
        raise ValueError(f'{path}: no cluster name for cluster number(s) {unmapped}')
    return markers


prog_df = _load_cts_markers(prog_file, prog_dict)
glia_df = _load_cts_markers(glia_file, glia_dict)
neuron_df = _load_cts_markers(neuron_file, neuron_dict)
# This gives the number of CTS genes per cell type cluster
# prog_df.query('avg_log2FC >= 1').groupby('cluster').count()

In [None]:
# Genes do not have to be unique between different clusters:
# that is the case whenever the row count is larger than the distinct gene count
prog_genes = prog_df["gene"]
n_genes = len(prog_genes)
n_unique_genes = len(prog_genes.unique())
print(f'num genes {n_genes}')
print(f'num unique genes {n_unique_genes}')

In [None]:
# Write the genelists in 6.28 IDs: one CSV per atlas, indexed by the old ID,
# with a single 'celltype' column holding the cluster name
atlas_dict = {'progenitor': prog_df, 'neuron': neuron_df, 'glia': glia_df}
for atlas_name, markers in atlas_dict.items():
    # Left merge on the gene symbol keeps every marker row;
    # symbols without a match in the conversion table get a missing old_ID
    with_ids = pd.merge(
        markers[['gene', 'clustername']],
        convert_df[['new_sym', 'old_ID']],
        left_on='gene',
        right_on='new_sym',
        how='left',
    )
    # Drop any genes that were not present in the version that they are converted to.
    # This is done explicitly on the column, before indexing, rather than by
    # dropping a NaN index label with errors='ignore', which would silently keep
    # the rows if the label lookup ever failed to match.
    celltypes = (
        with_ids.dropna(subset=['old_ID'])
        .set_index('old_ID')[['clustername']]
        .rename(columns={'clustername': 'celltype'})
    )
    celltypes.index.name = ''
    celltypes.to_csv(os.path.join(outdir, f'dillon_{atlas_name}.csv'))

In [None]:
# Toy inputs to test that a left merge works for split/merge cases
# Left-hand side: the gene list to annotate ('c' is listed twice)
df1 = pd.DataFrame(dict(gene=['a', 'b', 'c', 'c']))

# Split case: same gene now has two ids
split_genes = ['a', 'a', 'c', 'd']
split_ids = [1, 2, 3, 4]
df2 = pd.DataFrame(dict(gene=split_genes, id=split_ids))

# Merge case: two genes now have the same id
merged_genes = ['a', 'b', 'c', 'd']
merged_ids = [1, 1, 3, 4]
df3 = pd.DataFrame(dict(gene=merged_genes, id=merged_ids))

In [None]:
# Left-merge the toy gene list against the split-case table (df2) ...
merge1 = df1.merge(df2, left_on='gene', right_on='gene', how='left')
# ... and against the merge-case table (df3)
merge2 = df1.merge(df3, left_on='gene', right_on='gene', how='left')