#### CTS genes Corrales
- Extract the genes which are enriched in specific cell types using the Corrales et al., 2022 dataset
- Issues:
- I originally tried using the annotations from v6.13 to convert the IDs, but some genes, like FBgn0287720, don't even show up in 6.32. I therefore suspect that they used a more recent annotation release, which is not specified.
- In this notebook I empirically determined that the best match is 6.34, and I will use the 6.34 annotations to convert the symbols to the 6.28 IDs.

In [None]:
#Imports
import sys
import os
import pandas as pd
import seaborn as sns
import numpy as np
import math
import gffutils
import scipy.stats as stats
from collections import defaultdict
import warnings

sys.path.append('../scripts')
from plot_helpers import *
from plotting_fxns import PrettyBox
from utilities import load_dataset
from annotation_utilities import *

%load_ext autoreload
%autoreload 2

In [None]:
# Get all the genes reported by Corrales et al., 2022 and convert the IDs
# This cell stacks every marker sheet of the supplementary workbook into one
# table (big_df); no p-value / fold-change filtering is applied here.
outdir = '../Figures/CTS'
os.makedirs(outdir, exist_ok=True)
# Load the genes enriched from single-cell sequencing in L3 brains
sc_file = '../../resources/other_studies/Corrales_Supplementary/Supplementary_spreadsheet_2_Ncells_and_gene_markers_per_cluster.xlsx'
xl = pd.ExcelFile(sc_file)
sheets = xl.sheet_names
# The 'Ncells-cluster' sheet is skipped; every other sheet is read as a table
sheets.remove('Ncells-cluster')
# Not used in this cell; kept so the notebook namespace is unchanged
celltype_replace = {'34 Immat': '34 Immat N'}
# Read through the already-open workbook instead of re-opening the file once
# per sheet
dfs = [pd.read_excel(xl, sheet_name=sheet) for sheet in sheets]
big_df = pd.concat(dfs)


In [None]:
# Find the best genome version to use for these:
# every candidate annotation-ID table is handed to find_best_version together
# with the gene column of the combined Corrales table.
corrales_genes = big_df['gene']
id_dir = '../../resources/id_conversion/'


def _id_table(filename):
    """Return the path of ``filename`` inside the ID-conversion directory."""
    return os.path.join(id_dir, filename)


# One annotation-ID table per release
dmel628_file = _id_table('fbgn_annotation_ID_fb_2019_03.tsv')
dmel630_file = _id_table('fbgn_annotation_ID_fb_2019_05.tsv')
dmel631_file = _id_table('fbgn_annotation_ID_fb_2019_06.tsv')
dmel632_file = _id_table('fbgn_annotation_ID_fb_2020_01.tsv')
dmel633_file = _id_table('fbgn_annotation_ID_fb_2020_02.tsv')
dmel634_file = _id_table('fbgn_annotation_ID_fb_2020_03.tsv')
dmel635_file = _id_table('fbgn_annotation_ID_fb_2020_04.tsv')
dmel636_file = _id_table('fbgn_annotation_ID_fb_2020_05.tsv')

# Candidates passed to the search; dmel628_file is defined above but is not
# part of this list
versions = [
    ('dmel630', dmel630_file),
    ('dmel631', dmel631_file),
    ('dmel632', dmel632_file),
    ('dmel633', dmel633_file),
    ('dmel634', dmel634_file),
    ('dmel635', dmel635_file),
    ('dmel636', dmel636_file),
]

best_version, count_dict, notfound = find_best_version(corrales_genes, version_files=versions)
print('best version', best_version)

In [None]:
# Get table converting 6.28 (Thompson) to 6.34 (Corrales)
# NOTE(review): the rename only takes effect if update_ids returns a frame
# with an unnamed index, so that reset_index() yields a column called 'index';
# presumably that index holds the 6.28 IDs - confirm against update_ids.
convert_df = (
    update_ids(dmel634_file, from_version=dmel628_file)
    .reset_index()
    .rename(columns={'index': 'old_ID'})
)

In [None]:
# Get all the genes in each cell type
# For every sheet, keep the rows passing both cutoffs, remove the transgene
# entries, tag the rows with the sheet's cell-type label, and stack the
# results into cts_df. ct_names records the label of every sheet in order.
outdir = '../Figures/CTS'
os.makedirs(outdir, exist_ok=True)

# Rows are kept when p_val_adj is strictly below this cutoff
pval_co = 0.05
# fc_co is in log2
fc_co = 1
# Gene entries removed from every sheet; defined once instead of per iteration
transgenes = ['nls-tdTomato', 'myr-GFP-p10']

# Load the genes enriched from single-cell sequencing in L3 brains
sc_file = '../../resources/other_studies/Corrales_Supplementary/Supplementary_spreadsheet_2_Ncells_and_gene_markers_per_cluster.xlsx'
xl = pd.ExcelFile(sc_file)
sheets = xl.sheet_names
sheets.remove('Ncells-cluster')
ct_names = []
dfs = []
# Replace some names because of typos in the spreadsheet
celltype_replace = {'34 Immat': '34 Immat N'}
marker_dict = {}
for sheet in sheets:
    # Read through the already-open workbook rather than re-opening the file
    df = pd.read_excel(xl, sheet_name=sheet)
    celltype = sheet.strip()
    celltype = celltype_replace.get(celltype, celltype)

    keep = (
        (df['p_val_adj'] < pval_co)
        & (df['avg_log2FC'] >= fc_co)
        & ~df['gene'].isin(transgenes)
    )
    # assign() returns a new frame, so no explicit copy of the slice is needed
    this_df = df[keep].assign(celltype=celltype)
    dfs.append(this_df)
    ct_names.append(celltype)
cts_df = pd.concat(dfs)

In [None]:
# Combine cell types of the same name to form a general cell type, e.g. Immat N
# Alter this now so that it accepts a dataframe as the input
def is_int(string):
    """Try to convert ``string`` to an integer.

    Returns ``int(string)`` when the conversion succeeds and ``None`` when
    ``int`` rejects the value. Test the result with ``is None``: a successful
    conversion of ``'0'`` returns ``0``, which is falsy.
    """
    try:
        return int(string)
    # Only conversion failures mean "not an integer". A bare ``except`` would
    # also swallow KeyboardInterrupt, SystemExit and other BaseExceptions.
    except (TypeError, ValueError, OverflowError):
        return None

def clean_name(string):
    """Return the part of ``string`` that follows its first space.

    For example ``'34 Immat N'`` becomes ``'Immat N'``. A string that
    contains no space yields the empty string.
    """
    _, _, remainder = string.partition(' ')
    return remainder

# Keep only the labels that int() cannot parse (is_int returns None for them),
# i.e. drop labels that consist of nothing but an integer
clean_names = [label for label in ct_names if is_int(label) is None]

# Group the full labels under the name left after removing their first token
celltypes = defaultdict(set)
for label in clean_names:
    celltypes[clean_name(label)].add(label)

# Combine redundant categories, e.g. hemocytes will be labelled as hemos
ct_combine = {'Hemocytes':'Hemos', 'KCs N':'KCs'}
celltypes2 = {}
for name, labels in celltypes.items():
    # setdefault + update fills a fresh set per merged category, so merging
    # never mutates the sets stored in `celltypes`
    celltypes2.setdefault(ct_combine.get(name, name), set()).update(labels)

# Flat lookup: full label -> general cell type
celltypes_map = {
    label: general
    for general, labels in celltypes2.items()
    for label in labels
}

In [None]:
# Reduce to unique genes by celltype and write output file
# https://stackoverflow.com/questions/36106490/how-to-get-unique-values-from-multiple-columns-in-a-pandas-groupby
# Labels missing from celltypes_map become NaN here and are left out by the
# groupby below
cts_df['celltype_general'] = cts_df['celltype'].map(celltypes_map)
# One row per (general cell type, gene) pair
g = cts_df.groupby('celltype_general')['gene'].unique().explode().reset_index()
# Left-join on the gene symbol to attach the old ID from the conversion table
df2 = pd.merge(g[['gene', 'celltype_general']], convert_df[['new_sym', 'old_ID']], left_on='gene', right_on='new_sym', how='left')
# Discard genes that found no old ID. dropna is a no-op when every gene
# matched, whereas drop(labels=[np.nan]) raises KeyError in that case.
df3 = df2.dropna(subset=['old_ID']).set_index('old_ID')
# Blank index name, so the ID column is written with an empty header
df3.index.name = ''
df3.rename(columns={'celltype_general':'celltype'})[['celltype']].to_csv(os.path.join(outdir, 'corrales_celltypes.csv'))