### Devreg php
- Show overlap with the Pherson php RNAi knockdown
- The data used here are re-analyzed from Pherson et al., 2017

In [None]:
#Imports
import sys
import os
import pandas as pd
import seaborn as sns
import numpy as np
import math
import scipy.stats as stats
from collections import defaultdict
import pickle
from itertools import chain, combinations
import gffutils
import re
from copy import copy

# Make the project helper modules in ../scripts importable from this notebook
sys.path.append('../scripts')
from plot_helpers import *
from annotation_utilities import *
from plotting_fxns import PrettyBox, get_boxtop, add_stars
from utilities import load_dataset 
# NOTE(review): gffutils_db is not defined in this notebook; presumably it is
# provided by one of the star imports above -- confirm
db = gffutils.FeatureDB(gffutils_db)

# Enable IPython's autoreload extension (mode 2) for this session
%load_ext autoreload
%autoreload 2

In [None]:
# Output directory for the figures saved by this notebook; created if it does not exist yet
outdir = '../Figures/Devreg'
os.makedirs(outdir, exist_ok=True)

In [None]:
# Load the per-gene quantification table.
# The _filtered.csv file already has the spike-ins and rRNA genes removed.
res_file = os.path.join(
    results_dir_pherson,
    'gene_quantification',
    'summary_abundance_by_gene_filtered.csv',
)
df = load_dataset(res_file, '../Figures/summary_files/BG3_passed.csv')

# Average within every gene / condition / RNAtype combination
mean_df = df.groupby(['gene', 'condition', 'RNAtype']).mean()

# np.unique returns its values sorted, so the first strictly positive entry is
# the smallest non-zero summed TPM; it is used as the pseudocount below
a = np.unique(mean_df['summed_tpm_recalc'])
lowest_nonzero = a[a > 0][0]

# Shift each TPM column by the pseudocount to avoid -inf after log transformation
for tpm_col in ('summed_tpm_recalc', 'exonic_tpm_recalc', 'intronic_tpm_recalc'):
    mean_df[tpm_col] += lowest_nonzero

In [None]:
# Load the gene groups used to highlight H3K27me3 targets in the php RNAi plots.
# NOTE(review): absolute, machine-specific path -- it will not resolve on another machine
pherson_dir = '/Users/mkthompson/Desktop/Davislab/C3.1_stability_pathway_analysis/polycomb_targets/pherson_dros_cells/'

# Gene target groups defined by ChIP.
# Only 1700944_FileS8.txt is read here; 1700944_FileS5.txt (hiPh_loMe) and
# 1700944_FileS7.txt (hiPh_hiMe) were loaded the same way before and are
# currently disabled.
PCG_domain_file = os.path.join(pherson_dir, '1700944_FileS8.txt')

# The file has no header row; columns are named as a 6-field BED-like record
bed_header = ['chrom', 'start', 'end', 'gene', 'frame', 'strand']
pcg_domain_bed = pd.read_csv(PCG_domain_file, sep='\t', header=None, names=bed_header)
PCG_domain_targets = set(pcg_domain_bed['gene'].tolist())

In [None]:
# Read in the converted PCG gene names and convert to FB numbers
# NOTE: STILL NEED TO CONVERT GENELIST TO THE FBG 6.28 VERSION
# The path gets its own name so that `pcg_genes` only ever refers to the final ID set.
pcg_genes_file = '../../../resources/curated_genelists/pherson_pcg.txt'
pcg_df = pd.read_csv(pcg_genes_file, sep='\t')

# Header names and every cell carry padding whitespace; strip both
pcg_df.columns = pcg_df.columns.str.strip()
pcg_df.drop(columns='Unnamed: 0', inplace=True)
for col in pcg_df.columns:
    pcg_df[col] = pcg_df[col].str.strip()

# Keep gene records only and remove the trailing ' !' markers from the two ID columns
pcg_df = pcg_df[pcg_df['Data Class'] == 'Gene'].copy()
for col in ('Submitted Item', 'Validated ID'):
    pcg_df[col] = pcg_df[col].str.rstrip(' !')

#Get the number of hits per gene
# .size() counts rows per group directly (same result as apply(len))
num_hits = pcg_df.groupby('Submitted Item').size()
unique_hits = num_hits[num_hits == 1].index

# found_one: this row's related record has exactly the submitted name
# genegroup_found: at least one row for the same submitted name is such an exact hit.
# The string alias 'max' is used because passing the builtin `max` relies on an
# alias mapping that newer pandas versions deprecate.
pcg_df['found_one'] = pcg_df['Submitted Item'] == pcg_df['Related record']
pcg_df['genegroup_found'] = pcg_df.groupby('Submitted Item')['found_one'].transform('max')

#Match on the unique hits, n=108
match1 = pcg_df[pcg_df['Submitted Item'].isin(unique_hits)].copy()
pcg_df.drop(match1.index, inplace=True)

#Match on ones where there are multiple hits, but select the same name for Related record
match2 = pcg_df[pcg_df['Submitted Item'] == pcg_df['Related record']].copy()
pcg_df.drop(match2.index, inplace=True)
# Discard the other candidate rows of submitted names that were just resolved by match2
pcg_df.drop(pcg_df[pcg_df['genegroup_found']].index, inplace=True)

#For the remaining two, I manually decided by looking at flybase
#AP-2 => TfAP-2 #only one with AP-2 actually listed in the synonyms
#bxd => lncRNA:bxd. I do not know why the UbxPRE is listed as a gene from conversion tool.
#On Flybase it is listed as a regulatory region
#bxd is also alias for Ubx, but this appears separately in their list
match3 = pcg_df[pcg_df['Related record'].isin(['lncRNA:bxd', 'TfAP-2'])].copy()

matches = pd.concat([match1, match2, match3])
pcg_genes = set(matches['Validated ID'].tolist())
print('%s genes converted to FBg' % len(pcg_genes))

In [None]:
def compare_exps(xname='mock', yname='ph', rnatype='input', valuename='summed_tpm_recalc', data=None):
    '''
    Build a per-gene table comparing two conditions for one RNA type.

    xname, yname: values of the 'condition' index level to compare; they are
        also used as the column suffixes of the returned table.
    rnatype: value of the 'RNAtype' index level to select.
    valuename: abundance column to compare, e.g. 'summed_tpm_recalc',
        'exonic_tpm_recalc' or 'intronic_tpm_recalc'.
    data: table indexed by (gene, condition, RNAtype); defaults to the
        notebook-level mean_df.

    Returns a DataFrame indexed by gene with, for each of the two conditions,
    the columns condition, RNAtype, <valuename>, log10_tpm and log2_tpm, each
    suffixed with _<xname> / _<yname>. Only genes present in both conditions
    are kept (inner join). The input table is left untouched.
    '''
    print('valuename', valuename)
    source = mean_df if data is None else data

    # Work on a private copy: writing the log columns and dropping rows on the
    # shared table would make later calls depend on earlier ones.
    values = source[[valuename]].copy()
    values['log10_tpm'] = np.log10(values[valuename])
    values['log2_tpm'] = np.log2(values[valuename])
    # Logs are NaN for missing or negative inputs; those rows cannot be compared
    values.dropna(subset=['log10_tpm', 'log2_tpm'], how='any', inplace=True)

    def _select(condition):
        # All genes for one condition / RNA type, leaving gene as the only index level
        rows = pd.IndexSlice[:, condition, rnatype]
        return values.loc[rows, :].reset_index(['condition', 'RNAtype'])

    comp_df = pd.merge(_select(xname), _select(yname), left_index=True, right_index=True,
                       suffixes=(f'_{xname}', f'_{yname}'))
    # Column names follow the requested condition names (previously hard-coded
    # to '_mock' / '_ph', which failed for any other xname / yname)
    comp_df.dropna(subset=[f'{valuename}_{xname}', f'{valuename}_{yname}'], how='any', inplace=True)
    return comp_df

def plot_scatter(df, xval, yval, genegroup=None, grouplabel=None, figname=None):
    '''
    Plot scatter plot and highlight the Polycomb targets

    df: table whose index holds the gene identifiers.
    xval, yval: names of the columns plotted on the x and y axes.
    genegroup: optional collection of index values to overplot in purple.
        When given, a boolean 'group' column is written onto df itself; this
        is intentional, a later cell reads that column.
    grouplabel: legend label attached to the highlighted points.
    figname: currently unused.

    Returns the matplotlib Axes, annotated with the squared Pearson
    correlation of the two columns.
    '''
    fig = plt.figure(figsize=(dfig, dfig), constrained_layout=True)
    ax = fig.add_subplot(111)
    x = df[xval]
    y = df[yval]
    ax.scatter(x, y, s=5, color='k', alpha=0.3, ec='none')

    if genegroup is not None:
        df['group'] = df.index.isin(genegroup)
        in_group = df['group']
        ax.scatter(df.loc[in_group, xval], df.loc[in_group, yval], s=5,
                   color=color_dict['purple'], ec='none', label=grouplabel)

    rval, _ = stats.pearsonr(x, y)
    r2_val_av = rval**2
    # One locator per axis: a locator instance is bound to a single Axis, so
    # sharing it would make the x ticks follow the y axis
    ax.xaxis.set_major_locator(plticker.MultipleLocator(base=5.0))
    ax.yaxis.set_major_locator(plticker.MultipleLocator(base=5.0))
    ax.text(0.1, 0.9, 'r$^2$ = %1.2f' % r2_val_av, fontsize = 8, transform=ax.transAxes)
    return ax

In [None]:
# Per-gene mock vs. ph tables for the two RNA types
# NOTE(review): 'input' is treated as total RNA and 'pd' as nascent RNA, going
# by the axis labels below -- confirm
tot_comp_df = compare_exps(valuename='summed_tpm_recalc')
nas_comp_df = compare_exps(rnatype='pd', valuename='summed_tpm_recalc')

mutant_name = 'log2_tpm_ph'
mock_name = 'log2_tpm_mock'

# log2 fold change (ph RNAi / mock) within each RNA type
tot_comp_df['log_change'] = tot_comp_df[mutant_name] - tot_comp_df[mock_name]
nas_comp_df['log_change'] = nas_comp_df[mutant_name] - nas_comp_df[mock_name]

comp_df = pd.merge(nas_comp_df[['log_change']], tot_comp_df[['log_change']], left_index=True, right_index=True,
                   suffixes=('_nas', '_tot'))

# Common axis range for the two abundance scatter plots, padded by 0.1 and
# rounded outwards to whole log10 units
xval = 'log10_tpm_mock'
yval = 'log10_tpm_ph'
min_val = math.floor(min(nas_comp_df[[xval, yval]].min().min(), tot_comp_df[[xval, yval]].min().min())-0.1)
max_val = math.ceil(max(nas_comp_df[[xval, yval]].max().max(), tot_comp_df[[xval, yval]].max().max())+0.1)


def _finish_scatter(ax, xlabel, ylabel, tick_base, lims=None):
    '''Apply the labels, optional shared limits, corner-to-corner guide line and ticks used by all three scatter plots.'''
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    if lims is not None:
        ax.set_xlim(*lims)
        ax.set_ylim(*lims)
    # Drawn in axes coordinates, so it equals y = x only when both axes share limits
    ax.plot([0, 1], [0, 1], transform=ax.transAxes, color = color_dict['grey'], alpha = 0.8, zorder=0)
    # A locator is bound to one Axis; create a fresh one for every axis
    ax.xaxis.set_major_locator(plticker.MultipleLocator(base=tick_base))
    ax.yaxis.set_major_locator(plticker.MultipleLocator(base=tick_base))


#Plot change in total RNA after php knockdown
ax = plot_scatter(tot_comp_df, xval=xval, yval=yval, genegroup=pcg_genes, grouplabel='polycomb\ntargets')
_finish_scatter(ax, 'mock total RNA\n (log$_{10}$ TPM)', 'ph RNAi total RNA\n (log$_{10}$ TPM)',
                tick_base=2.0, lims=(min_val, max_val))
# In-plot text is used in place of a legend
ax.text(0.1, 0.7, 'polycomb\ntargets', color=color_dict['purple'], fontsize = 7, transform=ax.transAxes)
plt.savefig('%s.%s' % (os.path.join(outdir, 'ph_tot_scat'), out_fmt), dpi = out_dpi)

#Plot change in nascent RNA after php knockdown
ax = plot_scatter(nas_comp_df, xval=xval, yval=yval, genegroup=pcg_genes, grouplabel='polycomb targets')
_finish_scatter(ax, 'mock nascent RNA\n (log$_{10}$ TPM)', 'ph RNAi nascent RNA\n (log$_{10}$ TPM)',
                tick_base=2.0, lims=(min_val, max_val))
plt.savefig('%s.%s' % (os.path.join(outdir, 'ph_nas_scat'), out_fmt), dpi = out_dpi)

#Plot change in total vs. nascent RNA after php knockdown
# This call also writes the boolean 'group' column onto comp_df
ax = plot_scatter(comp_df, xval='log_change_nas', yval='log_change_tot', genegroup=pcg_genes, grouplabel='polycomb targets')
# Raw strings keep the backslash of \Delta from being read as an (invalid) escape sequence
# NOTE(review): the axis limits are autoscaled here, so the grey guide line is
# the axes diagonal and not necessarily y = x -- confirm this is intended
_finish_scatter(ax, r'$\Delta$' 'nascent RNA\n(log$_{2}$ ph / mock)',
                r'$\Delta$' 'total RNA\n(log$_{2}$ ph / mock)', tick_base=10.0)
plt.savefig('%s.%s' % (os.path.join(outdir, 'ph_nas_tot_scat'), out_fmt), dpi = out_dpi)
#The low TPM genes which don't agree well between datasets deflate the r2 value a fair bit
#The low TPM genes which don't agree well between datasets deflate the r2 value a fair bit

In [None]:
#Calculate change in total vs. change in nascent for polycomb targets and make boxplot
comp_df['deltaTN'] = comp_df['log_change_tot'] - comp_df['log_change_nas']
# Flag the polycomb targets here instead of relying on the 'group' column that
# plot_scatter leaves behind as a side effect (same definition)
comp_df['group'] = comp_df.index.isin(pcg_genes)

fig = plt.figure(figsize=(dfig, dfig), constrained_layout=True)
ax = fig.add_subplot(111)
ax = PrettyBox(data=comp_df, x='group', y='deltaTN', fliersize=0, ax=ax)
ax.set_ylim(-5, 5)
ax.set_ylabel(r'$\Delta$total / $\Delta$nascent'+ '\n' + r'(log$_2$ ph RNAi / mock)')
gene_counts = comp_df['group'].value_counts()
# Labels assume the boxes are drawn in the order False, True
ax.set(xticklabels=['non-targets\nn=%s' % gene_counts[False], 'targets\nn=%s' % gene_counts[True]])

ax.set_xlabel('')
ax.annotate('polycomb'+'\n'+'status:', xy=(0, 0), xycoords=ax.get_xaxis_transform(),
                   xytext=(-24,-20), textcoords='offset points', ha='right')

is_target = comp_df['group']
sub_x = comp_df.loc[is_target, 'deltaTN'].values
bg_x = comp_df.loc[~is_target, 'deltaTN'].values

# State the alternative explicitly: the default of mannwhitneyu differs between SciPy releases
_, p = stats.mannwhitneyu(sub_x, bg_x, alternative='two-sided')

# Put the significance bracket just above the taller of the two boxes
h = max(get_boxtop(comp_df, col1='group', val1=True, val_col='deltaTN'),
        get_boxtop(comp_df, col1='group', val1=False, val_col='deltaTN'))

add_stars(0, 1, h+0.2, 0.1, p, ax)
print('pvalue', p)
# deltaTN is a log2 ratio, so 2**median converts it back to a linear ratio
print('median T/N pcg', 2**comp_df[comp_df['group']]['deltaTN'].median())
print('median T/N other', 2**comp_df[~comp_df['group']]['deltaTN'].median())

plt.savefig('%s.%s' % (os.path.join(outdir, 'ph_nas_tot_box'), out_fmt), dpi = out_dpi)