### Devreg php
- Show overlap with the Pherson php RNAi knockdown
- The data used is re-analyzed data from Pherson et al., 2017

In [None]:
# Imports
import sys
import os
import pandas as pd
import seaborn as sns
import numpy as np
import math
import scipy.stats as stats
from collections import defaultdict
import pickle
from itertools import chain, combinations
import gffutils
import re
from copy import copy

sys.path.append('../scripts')
from plot_helpers import *
from annotation_utilities import *
from plotting_fxns import PrettyBox, get_boxtop, add_stars, plot_scatter, compare_experiments
from utilities import load_dataset 
# Open the gffutils annotation database used for feature lookups.
# NOTE(review): `gffutils_db` is not defined in this notebook; it is presumably
# exported by one of the star imports above -- confirm which module provides it.
db = gffutils.FeatureDB(gffutils_db)

%load_ext autoreload
%autoreload 2

In [None]:
# Every figure written by this notebook is saved below this directory.
outdir = '../Figures/Devreg'
# exist_ok=True keeps re-running the cell harmless when the directory already exists.
os.makedirs(name=outdir, exist_ok=True)

In [None]:
# Parse the gene-level quantification results.
# The _filtered.csv file already has the spike-ins and rRNA genes removed
res_file = os.path.join(results_dir_pherson, 'gene_quantification', 'summary_abundance_by_gene_filtered.csv')
df = load_dataset(res_file, '../Figures/summary_files/BG3_passed.csv')

# Average all rows that share the same gene / condition / RNA type.
# numeric_only=True is passed explicitly: pandas >= 2.0 no longer drops
# non-numeric columns silently and raises a TypeError instead, while older
# versions dropped them by default -- so this keeps the historical result on
# every pandas version.
mean_df = df.groupby(['gene', 'condition', 'RNAtype']).mean(numeric_only=True)

In [None]:
# Load the published PcG-domain gene table (tab separated, no header row).
PCG_domain_file = os.path.join(pherson_dir, '1700944_FileS8.txt')
pcg_genes = pd.read_csv(PCG_domain_file, sep='\t', header=None)

# Output of the FlyBase web ID converter for those genes (release 6.32 IDs).
# A copy of this table was previously read from os.path.join(outdir, 'pcg_fb632_IDs.txt').
pcg_conv = pd.read_csv('../../resources/genelists/pcg_fb632_IDs.txt', sep='\t')

# Submitted IDs the converter could not resolve; these were looked up by hand
# (matching on genomic position as well) and are listed in unk_ids.
unk_genes = pcg_conv.loc[pcg_conv['current_id'] == 'unknown ID', '# submitted_id'].values
unk_ids = {'AP-2': 'FBgn0261953', 'msa': 'FBgn0264857'}
print('unknown genes:')
print(pcg_genes.loc[pcg_genes[3].isin(unk_genes)])

# A backslash in current_symbol marks a gene from another species; flag those
# rows, then drop them together with the unresolved IDs.
m = re.compile(r'\\')
pcg_conv['nonDmel'] = pcg_conv['current_symbol'].apply(lambda x: bool(m.search(x)))
keep_rows = ~pcg_conv['nonDmel'] & (pcg_conv['current_id'] != 'unknown ID')
pcg_conv = pcg_conv.loc[keep_rows].copy()

# Index by the submitted ID (kept as a column too) before resolving split genes.
pcg_conv = pcg_conv.set_index('# submitted_id', drop=False).rename_axis('index')
pcg_conv2 = resolve_splits(
    pcg_conv,
    old_sym='# submitted_id',
    new_sym='current_symbol',
    new_ID='converted_id',
)
# Report whether resolve_splits changed anything
print(f'Genes after resolve split are equal to input genes: {pcg_conv.equals(pcg_conv2)}')

# Append the manually curated IDs for the genes the converter did not know.
unk_df = pd.DataFrame.from_dict(unk_ids, orient='index', columns=['converted_id'])
pcg_conv2 = pd.concat([pcg_conv2, unk_df])

In [None]:
# Build the ID mapping table: the index holds the old ID, the columns hold the new ID.
id_dir = '../../resources/id_conversion/'
# The annotation version of the published list is unknown, so the conversion
# is attempted with the current FlyBase annotation-ID tables.
dmel632_file, dmel628_file = (
    os.path.join(id_dir, table_name)
    for table_name in ('fbgn_annotation_ID_fb_2020_01.tsv', 'fbgn_annotation_ID_fb_2019_03.tsv')
)
# Mapping between the 6.28 and 6.32 gene IDs
thompson_gene_df = update_ids(dmel632_file, dmel628_file)

In [None]:
# Translate the PcG gene list from v6.32 IDs to v6.28 IDs:
# flag every row of the mapping table whose new ID is in the converted PcG list ...
thompson_gene_df['PCG'] = thompson_gene_df['new_ID'].isin(pcg_conv2['converted_id'])
# ... and collect the index values (old IDs) of the flagged rows.
# Note that this rebinds pcg_genes from the raw table read earlier to a set of IDs.
pcg_genes = set(thompson_gene_df.loc[thompson_gene_df['PCG']].index.tolist())

In [None]:
# Scatterplot comparisons of the experiments in BG3 cells
val_col = 'summed_tpm_recalc'
experiments_total = [{'condition': 'mock', 'RNAtype': 'input'}, {'condition': 'ph', 'RNAtype': 'input'}]
experiments_nas = [{'condition': 'mock', 'RNAtype': 'pd'}, {'condition': 'ph', 'RNAtype': 'pd'}]
# Kept for any other cell that uses them. The plots below create their own
# locators because matplotlib Locator instances must not be shared between axes.
loc2 = plticker.MultipleLocator(base=2.0)
loc5 = plticker.MultipleLocator(base=5.0)

# Columns of the compare_experiments output: the first experiment (mock) ends
# in _x and the second (ph RNAi) ends in _y.
mock_name = f'{val_col}_x'
mutant_name = f'{val_col}_y'

# mathtext fragments are raw strings so that '\i' and '\ ' are not parsed as
# (invalid) Python escape sequences; the rendered labels are unchanged.
ph_rnai_label = r'$\it{ph\ RNAi}$'
log10_label = r'$_{10}$'
delta_label = r'$\Delta$'

# Plot the change in nascent RNA after php knockdown
nas_cdf = compare_experiments(mean_df.reset_index(), experiments=experiments_nas, id_col='gene',
                              val_col=val_col, pseudo='min', log=True)
fig = plt.figure(figsize=(dfig, dfig), constrained_layout=True)
ax = fig.add_subplot(111)
ax = plot_scatter(nas_cdf, experiments=[mock_name, mutant_name], id_col='gene', genegroup=pcg_genes,
                  xlabel='mock nascent RNA\n (log' + log10_label + ' TPM)',
                  ylabel=ph_rnai_label + ' nascent RNA\n (log' + log10_label + ' TPM)',
                  rsquare=True, loc=plticker.MultipleLocator(base=2.0), diagonal=True, ax=ax)
ax.text(0.1, 0.7, 'PcG domain\ngenes', color=color_dict['purple'], fontsize=7, transform=ax.transAxes)
ax.set_aspect('equal')
ax.set_ylabel(ax.get_ylabel(), loc='bottom', ma='center')
plt.savefig('%s.%s' % (os.path.join(outdir, 'ph_nas_scat'), out_fmt), dpi=out_dpi)

# Plot the change in total RNA after php knockdown
tot_cdf = compare_experiments(mean_df.reset_index(), experiments=experiments_total, id_col='gene',
                              val_col=val_col, pseudo='min', log=True)
fig = plt.figure(figsize=(dfig, dfig), constrained_layout=True)
ax = fig.add_subplot(111)
ax = plot_scatter(tot_cdf, experiments=[mock_name, mutant_name], id_col='gene', genegroup=pcg_genes,
                  xlabel='mock total RNA\n (log' + log10_label + ' TPM)',
                  ylabel=ph_rnai_label + ' total RNA\n (log' + log10_label + ' TPM)',
                  rsquare=True, loc=plticker.MultipleLocator(base=2.0), diagonal=True, ax=ax)
ax.set_aspect('equal')
ax.set_ylabel(ax.get_ylabel(), loc='bottom', ma='center')
plt.savefig('%s.%s' % (os.path.join(outdir, 'ph_tot_scat'), out_fmt), dpi=out_dpi)

# Get the change in total and the change in nascent (knockdown minus mock, in log space)
tot_cdf['log_change'] = tot_cdf[mutant_name] - tot_cdf[mock_name]
nas_cdf['log_change'] = nas_cdf[mutant_name] - nas_cdf[mock_name]
comp_df = pd.merge(nas_cdf[['gene', 'log_change']], tot_cdf[['gene', 'log_change']],
                   on='gene', suffixes=('_nas', '_tot'))
fig = plt.figure(figsize=(dfig, dfig), constrained_layout=True)
ax = fig.add_subplot(111)
ax = plot_scatter(comp_df, experiments=['log_change_nas', 'log_change_tot'], id_col='gene', genegroup=pcg_genes,
                  xlabel=delta_label + 'nascent\n(log' + log10_label + ' ' + ph_rnai_label + ' / mock)',
                  ylabel=delta_label + 'total\n(log' + log10_label + ' ' + ph_rnai_label + ' / mock)',
                  rsquare=True, loc=plticker.MultipleLocator(base=2.0), diagonal=True, ax=ax)
plt.savefig('%s.%s' % (os.path.join(outdir, 'ph_nas_tot_scat'), out_fmt), dpi=out_dpi)

In [None]:
# Calculate change in total vs. change in nascent for polycomb targets and make boxplot
comp_df['deltaTN'] = comp_df['log_change_tot'] - comp_df['log_change_nas']

# 'group' flags the PcG-domain genes. No cell in this notebook creates the
# column explicitly (it presumably appears as a side effect of plot_scatter --
# TODO confirm), so build it here when it is missing rather than depending on
# that side effect.
if 'group' not in comp_df.columns:
    comp_df['group'] = comp_df['gene'].isin(pcg_genes)

# NOTE(review): the scatter-plot axis labels describe these values as log10,
# whereas the label below says log2 and the medians printed at the end are
# back-transformed with 2**. Confirm which base compare_experiments(log=True)
# uses and make the label and the back-transformation agree with it.
fig = plt.figure(figsize=(dfig, dfig), constrained_layout=True)
ax = fig.add_subplot(111)
ax = PrettyBox(data=comp_df, x='group', y='deltaTN', fliersize=0, ax=ax)
ax.set_ylim(-3, 3)
ax.set_ylabel(r'$\Delta$total / $\Delta$nascent' + '\n' + r'(log$_2$ $\it{ph\ RNAi}$ / mock)')
gene_counts = comp_df['group'].value_counts()
ax.set(xticklabels=['other\nn = %s' % gene_counts[False], 'PcG domain\nn = %s' % gene_counts[True]])
ax.set_xlabel('')
ax.annotate('polycomb' + '\n' + 'status:', xy=(0, 0), xycoords=ax.get_xaxis_transform(),
            xytext=(-24, -20), textcoords='offset points', ha='right')

# Compare the PcG-domain genes against all other genes.
# .loc[mask, column] selects in one step instead of chained indexing.
sub_x = comp_df.loc[comp_df['group'], 'deltaTN'].values
bg_x = comp_df.loc[~comp_df['group'], 'deltaTN'].values

# The sidedness is spelled out because the default of mannwhitneyu differs
# between SciPy versions; 'two-sided' is the default of current releases.
_, p = stats.mannwhitneyu(sub_x, bg_x, alternative='two-sided')

# Place the significance marker just above the taller of the two boxes.
h = max(get_boxtop(comp_df, col1='group', val1=True, val_col='deltaTN'),
        get_boxtop(comp_df, col1='group', val1=False, val_col='deltaTN'))

add_stars(0, 1, h + 0.2, 0.1, p, ax)
print('pvalue', p)
print('median T/N pcg', 2 ** comp_df.loc[comp_df['group'], 'deltaTN'].median())
print('median T/N other', 2 ** comp_df.loc[~comp_df['group'], 'deltaTN'].median())

plt.savefig('%s.%s' % (os.path.join(outdir, 'ph_nas_tot_box'), out_fmt), dpi=out_dpi)