#### CTS genes
- Extract the genes which are enriched in specific cell types using the Cocanougher et al., 2022 dataset
- Issues:
- I originally tried using the annotations from v6.13 to convert the IDs, but some genes, such as FBgn0287720, do not even show up in 6.32. I therefore suspect that they used a fairly recent, unspecified version of the annotations.
- Many of the genes have no FBgn IDs, only symbols. It is unclear how these were annotated.
- To get around this, I manually submitted the symbol-only genes to the FlyBase ID validator (version 6.48).

In [None]:
#Imports
import sys
import os
import pandas as pd
import seaborn as sns
import numpy as np
import math
import gffutils
import scipy.stats as stats
from collections import defaultdict
import warnings

sys.path.append('../scripts')
from plot_helpers import *
from plotting_fxns import PrettyBox
from utilities import load_dataset
from annotation_utilities import *

%load_ext autoreload
%autoreload 2

In [None]:
# Paths to the FlyBase ID tables that the ID mapping tables are built from.
# The annotation release used by the source study is unknown, so conversion is
# attempted against recent releases (named 6.48 and 6.28 by the variables below).
id_dir = '../../resources/id_conversion/'
dmel648_file, dmel628_file = (
    os.path.join(id_dir, table_name)
    for table_name in (
        'fbgn_annotation_ID_fb_2022_05.tsv',
        'fbgn_annotation_ID_fb_2019_03.tsv',
    )
)

In [None]:
# Get all the genes reported by Corrales et al., 2022 so that their IDs can be
# converted in one pass: every marker sheet is loaded and stacked into big_df.
outdir = '../Figures/CTS'
os.makedirs(outdir, exist_ok=True)
# Load the genes enriched from single-cell sequencing in L3 brains
sc_file = '../../resources/other_studies/Corrales_Supplementary/Supplementary_spreadsheet_2_Ncells_and_gene_markers_per_cluster.xlsx'
# Open the workbook once and reuse the handle for every sheet instead of
# re-reading the file from disk per sheet; the context manager closes it.
with pd.ExcelFile(sc_file) as xl:
    sheets = xl.sheet_names
    # 'Ncells-cluster' is skipped: it is not loaded as a marker table
    sheets.remove('Ncells-cluster')
    dfs = [pd.read_excel(xl, sheet_name=sheet) for sheet in sheets]
# Rows are stacked in sheet order; each sheet keeps its own row labels
big_df = pd.concat(dfs)


In [None]:
# Partition the reported genes by whether the spreadsheet provides an FBgn ID:
# FBgs holds the IDs that are present, lncs holds the symbols of the rows
# whose ID is missing (rows lacking a symbol as well are dropped).
FBgs = big_df.loc[big_df['FBgn_ID'].notna(), 'FBgn_ID']
lncs = big_df.loc[big_df['FBgn_ID'].isna(), 'gene'].dropna()

In [None]:
# Create the ID mapping table for the genes that were reported with an FBgn ID.
# As the table is used further down, its index is matched against the IDs from
# the spreadsheet and its 'new_ID' column is read as the converted ID.
id_dir = '../../resources/id_conversion/'
# I don't know the annotation version used, so try to convert using the current
# one (the fb_2022_05 table, referred to as 6.48 in this notebook)
dmel648_file = os.path.join(id_dir, 'fbgn_annotation_ID_fb_2022_05.tsv')
# NOTE(review): update_ids is not defined in this notebook; presumably it comes
# from the star import of annotation_utilities - confirm its exact contract there.
corrales_gene_df = update_ids(dmel648_file, genes=FBgs)
# Per the original note, the call above prints the IDs with no new ID, which
# could be due to a withdrawn gene model


In [None]:
# Get the mapping from the 6.28 annotation to the 6.48 annotation.
# Further down, the 'new_ID' column of this table is compared with the 6.48
# marker IDs and its index is used as the 6.28 gene ID.
# NOTE(review): update_ids is a project helper not shown here - confirm that the
# second argument is the "old" annotation table.
thompson_gene_df = update_ids(dmel648_file, dmel628_file)

In [None]:
# Convert the genes with symbols only to 6.48
# Try to resolve the ones with symbols using current annotation versions
# corrales_lnc_df = update_ids(dmel648_file, dmel648_file, id_type='symbol', genes=lncs)
# This doesn't work because 104 genes cannot be mapped this way.
# In total there are 193 genes which have only a symbol specified and no FBg

# ID resolution using the FlyBase ID validator:
# 1) Export the unique symbol-only genes, one per line, for upload
pd.DataFrame(lncs.unique()).to_csv(os.path.join(outdir, 'corrales_symbol_only_genes.csv'), header=False, index=False)
# 2) Used the FlyBase ID validator from 2022_05 to convert the symbols from the
#    unknown version to 2022_05; this step was done manually on the web and the
#    result is read back here
val_df = pd.read_csv(os.path.join(outdir, 'corrales_id_validation_table.txt'), sep='\t')
# Drop IDs not corresponding to genes, i.e. balancers, etc.
# The vectorised string test with na=False treats a missing validated_id as
# "not a gene" instead of raising AttributeError on a non-string value.
val_df = val_df[val_df['validated_id'].str.startswith('FBg', na=False)]
# Index by the submitted symbol but keep it as a column as well, since
# resolve_splits is given the column name
val_df = val_df.set_index('#submitted_item', drop=False)
val_df.index.name = 'index'
# NOTE(review): resolve_splits is a project helper not shown here; presumably it
# settles symbols that were validated to more than one current gene - confirm.
lnc_df = resolve_splits(val_df, old_sym='#submitted_item', new_sym='current_symbol', new_ID='validated_id')
# Bring the table into the same column layout as corrales_gene_df before merging
lnc_df = lnc_df[['validated_id', 'current_symbol']].copy()
lnc_df.columns = ['new_ID', 'new_sym']
corrales_gene_df = pd.concat([corrales_gene_df, lnc_df])
# Now for each gene group, we could map from index to new_ID which would give us the ID in 6.48 space
# Then later we can convert 6.28 to 6.48 using the thompson_gene_df

In [None]:
# Flag, for every cluster sheet, which genes of the 6.28 table are markers of
# that cluster. One boolean column per cluster is added to thompson_gene_df and
# the cluster names are collected in ct_names.
outdir = '../Figures/CTS'
os.makedirs(outdir, exist_ok=True)

# Marker thresholds: adjusted p-value must be below pval_co and the
# enrichment must be at least fc_co
pval_co = 0.05
# fc_co is in log2
fc_co = 1

# Load the genes enriched from single-cell sequencing in L3 brains
sc_file = '../../resources/other_studies/Corrales_Supplementary/Supplementary_spreadsheet_2_Ncells_and_gene_markers_per_cluster.xlsx'
# Replace some names because of typos in the spreadsheet
celltype_replace = {'34 Immat': '34 Immat N'}
# Artificial transgenes to remove; defined once instead of on every iteration
transgenes = ['nls-tdTomato', 'myr-GFP-p10']

# Open the workbook once and reuse the handle for every sheet instead of
# re-reading the file from disk per sheet; the context manager closes it.
with pd.ExcelFile(sc_file) as xl:
    sheets = xl.sheet_names
    sheets.remove('Ncells-cluster')
    sheet_dfs = {sheet: pd.read_excel(xl, sheet_name=sheet) for sheet in sheets}

ct_names = []
for sheet, df in sheet_dfs.items():
    celltype = sheet.strip()
    celltype = celltype_replace.get(celltype, celltype)

    genes = df.loc[(df['p_val_adj'] < pval_co) & (df['avg_log2FC'] >= fc_co)]
    genes = genes[~genes['gene'].isin(transgenes)].copy()
    # Genes with an FBgn ID are looked up by ID, the remaining ones by symbol
    has_fbgn = genes['FBgn_ID'].notna()
    normal_genes = genes.loc[has_fbgn, 'FBgn_ID'].tolist()
    lnc_genes = genes.loc[~has_fbgn, 'gene'].tolist()
    source_ids = normal_genes + lnc_genes
    # Get the gene IDs in 6.48 equivalent
    marker_genes = corrales_gene_df.loc[corrales_gene_df.index.isin(source_ids), 'new_ID'].values
    # Add to 6.28 table
    thompson_gene_df[celltype] = thompson_gene_df['new_ID'].isin(marker_genes)
    len_original = len(source_ids)
    len_converted = int(thompson_gene_df[celltype].sum())
    # Explicit comparison instead of assert/except: the check still runs under
    # `python -O` and unrelated errors are no longer swallowed.
    if len_original != len_converted:
        # NOTE(review): this compares the identifiers as given in the
        # spreadsheet with the converted IDs, so symbol-only genes and genes
        # whose ID changed presumably always show up here - confirm when
        # reading the output.
        old_set = set(source_ids)
        new_set = set(thompson_gene_df.loc[thompson_gene_df[celltype], 'new_ID'].tolist())
        print(old_set.difference(new_set))
        # This one gene, CG function unknown, was not present in the 6.28 annotations, so don't use
        # thompson_gene_df.query('new_ID == "FBgn0287218"')
    ct_names.append(celltype)

In [None]:
# Combine cell types of the same name to form a general cell type, e.g. Immat N
# Alter this now so that it accepts a dataframe as the input
def is_int(string):
    """Return ``string`` converted to an int, or None if it is not convertible.

    Despite the name, the result is not a bool. Compare it against None
    rather than testing its truthiness, because a successful conversion of
    ``'0'`` yields the falsy value 0.
    """
    try:
        return int(string)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "not an integer"; anything else
        # (e.g. KeyboardInterrupt) must not be swallowed.
        return None

def clean_name(string):
    """Return ``string`` without its first space-separated token.

    Everything up to and including the first space is removed; a string
    without any space yields an empty string.
    """
    _head, _sep, remainder = string.partition(' ')
    return remainder

# Keep only the cluster names that are not a bare integer
clean_names = [name for name in ct_names if is_int(name) is None]

# Group the cluster names by their label with the first token removed
celltypes = defaultdict(set)
for name in clean_names:
    celltypes[clean_name(name)].add(name)

# Combine redundant categories, e.g. hemocytes will be labelled as hemos
ct_combine = {'Hemocytes':'Hemos', 'KCs N':'KCs'}
celltypes2 = {}
for label, members in celltypes.items():
    new_label = ct_combine.get(label, label)
    # Merge into a set owned by celltypes2 so that the sets stored in
    # `celltypes` are never mutated through a shared reference
    celltypes2.setdefault(new_label, set()).update(members)

# Get the CTS genes, converted to 6.28 FBgs: a gene belongs to a cell type
# when it is flagged in at least one of the clusters merged into that type
marker_dict = defaultdict(set)
for label, members in celltypes2.items():
    marker_dict[label] = thompson_gene_df.loc[thompson_gene_df[list(members)].any(axis=1)].index

ct_names2 = list(marker_dict.keys())
clean_names2 = [name for name in ct_names2 if is_int(name) is None]

# In order to allow for genes to overlap cell types, we need to duplicate the gene index
d = defaultdict(list)
for cat, cat_genes in marker_dict.items():
    for gene in cat_genes:
        d[gene].append(cat)

# Need to do this instead of melt in order to allow genes to be a member of more than one category
# https://stackoverflow.com/questions/42869544/dictionary-of-lists-to-dataframe
cts_df = pd.DataFrame(pd.concat({k: pd.Series(v) for k, v in d.items()}).droplevel(1), columns=['celltype'])
# Write CTS outfile for later use; the file name carries the log2 fold-change
# cutoff with two decimals (e.g. cts_celltypes_log1.00.csv)
cts_df.to_csv(os.path.join(outdir, f'cts_celltypes_log{fc_co:1.2f}.csv'))