### Motifs mir sites
- Look at the transcript targets identified by TargetScanFly v7.2

In [None]:
import pandas as pd
import gffutils
import os
import sys
sys.path.append('../scripts/')
from annotation_utilities import *
from plot_helpers import *
from scipy.stats import hypergeom
from statsmodels.stats.multitest import fdrcorrection
# from plotting_fxns import update_old_ids

# Output directory used by the cells below; created if it does not exist yet
outdir = '../Figures/Motifs'
os.makedirs(outdir, exist_ok=True)

%load_ext autoreload
%autoreload 2

#### Extract miRNA target predictions by gene
- Use the miRNAs and context scores and context score percentiles from the "Conserved_Site_Context_Scores" file
- Add the representative transcript and the PCT values from the "Conserved_Family_Info" file
- Convert the IDs from 6.19 -> 6.28

In [None]:
# TargetScan used FlyBase 6.19 (= 2017_06), so work from that annotation.
# Reuse the gffutils database if it has already been built; otherwise build
# it from the GTF file.
gtf_file = '../../resources/other_studies/TargetScan/dmel-all-r6.19.gtf'
db_out = '../../resources/other_studies/TargetScan/dmel_619.db'
if os.path.exists(db_out):
    db = gffutils.FeatureDB(db_out)
else:
    db = gffutils.create_db(
        gtf_file,
        db_out,
        disable_infer_genes=True,
        disable_infer_transcripts=True,
        force=True,
        merge_strategy='create_unique',
    )

In [None]:
# Load TargetScan 7.2 Drosophila predictions, using representative transcripts for 6.19
indir = '../../resources/other_studies/TargetScan/'
cons_context_score_file = os.path.join(indir, 'Conserved_Site_Context_Scores.txt')
cons_family_file = os.path.join(indir, 'Conserved_Family_Info.txt')

# context_score_df holds the context scores and their percentiles.
# Its 'UTR_start' column is renamed so it matches the 'UTR start' column name
# used when merging with the family table.
context_score_df = pd.read_csv(cons_context_score_file, sep='\t')
context_score_df = context_score_df.rename(columns={'UTR_start': 'UTR start'})

# family_df holds the representative transcripts and the PCT values.
# Only rows with Species ID 7227 are kept.
family_df = pd.read_csv(cons_family_file, sep='\t')
family_df = family_df.query('`Species ID` == 7227')

In [None]:
# Restrict the context scores to the representative transcripts, i.e. the
# transcript IDs that appear in the family table
rep_txts = set(family_df['Transcript ID'].to_numpy())
is_representative = context_score_df['Transcript ID'].isin(rep_txts)
context_score_df = context_score_df[is_representative].copy()

In [None]:
# All families predicted at the same UTR site share one PCT because the
# sequences are the same, so a single row per site is enough.
site_cols = ['Transcript ID', 'UTR start', 'UTR end']
family_df2 = family_df.drop_duplicates(subset=site_cols)

# Attach the PCT to every context-score row. The indicator column records
# whether a matching site was found in the family table.
df2 = pd.merge(
    context_score_df,
    family_df2[site_cols + ['PCT']],
    on=site_cols,
    how='left',
    indicator=True,
)
merge_status = df2['_merge']
context_score_only = int((merge_status == 'left_only').sum())
both = int((merge_status == 'both').sum())
print(f'% of context scores with no PCT {context_score_only*100/(context_score_only + both)}')

# Some sites have a context score but no PCT (presumably the PCT could not be
# calculated for them); keep only the sites that have both.
df2 = df2[merge_status == 'both'].copy()

In [None]:
# Build the ID mapping table: its index is the old gene ID and its 'new_ID'
# column holds the updated ID.
id_dir = '../../resources/id_conversion/'
# Annotation ID tables for the FB2017_06 and FB2019_03 releases (per the file names)
dmel619_file = os.path.join(id_dir, 'fbgn_annotation_ID_fb_2017_06.tsv')
dmel628_file = os.path.join(id_dir, 'fbgn_annotation_ID_fb_2019_03.tsv')
old_gene_ids = df2['Gene ID'].tolist()
cdf = update_ids(dmel628_file, dmel619_file, id_type='FB', genes=old_gene_ids)
cdf = cdf.rename(columns={'new_ID': 'gene_ID_628'})

# Inner join on the old ID: rows whose 'Gene ID' is absent from the mapping
# table are dropped.
df3 = df2.merge(cdf[['gene_ID_628']], left_on='Gene ID', right_index=True)

In [None]:
# Percentile rank (0-100) of each site's PCT among the sites of the same miRNA
pct_rank_within_mir = df3.groupby('miRNA')['PCT'].rank(pct=True)
df3['PCT_percentile'] = pct_rank_within_mir * 100

In [None]:
# Sanity-check the direction of both percentiles, one scatter plot each:
#   - lower context score => higher percentile => better predicted targeting
#   - higher PCT          => higher percentile => better predicted targeting
# Each x-axis label is the name of the plotted column.
for xcol, ycol in (
    ('weighted context++ score', 'weighted context++ score percentile'),
    ('PCT', 'PCT_percentile'),
):
    fig, ax = plt.subplots(figsize=(dfig, dfig))
    ax.scatter(df3[xcol], df3[ycol], color='k', alpha=0.1, s=1)
    ax.set_ylabel('percentile')
    ax.set_xlabel(xcol)


#### Find enrichment of miRNA sites in the TF RNAs
- For each miRNA family: count occurrences in the background (bg) group and in the subset, i.e. (x/bg) vs. (y/subset)
- Calculate enrichment with hypergeometric test
- perform BH correction

In [None]:
# Gene lists for the enrichment test, taken from the 'gene' column of each CSV
bg_genes_file = os.path.join(outdir, 'bg_genes.csv')
subset_genes_file = os.path.join(outdir, 'CTS_TF_genes.csv')
bg_genes = pd.read_csv(bg_genes_file)['gene']
subset_genes = pd.read_csv(subset_genes_file)['gene']

In [None]:
def score_mirs(df):
    '''
    Test each miRNA for enrichment of predicted targets among the subset genes.

    Targets are counted per gene: a gene with several predicted sites for the
    same miRNA contributes once. This keeps the counts in the same units
    (genes) as the population sizes given to the hypergeometric test.

    Reads the notebook-level `bg_genes` and `subset_genes` collections.
    NOTE(review): the test assumes every subset gene is also a background
    gene - confirm against how the two CSV files were produced.

    Parameters
    ----------
    df : pandas.DataFrame
        Predicted sites with at least the columns 'miRNA' and 'gene_ID_628'.

    Returns
    -------
    pandas.DataFrame
        Indexed by miRNA, with columns 'subset' and 'bg' (number of distinct
        target genes in each gene list), 'pval' (hypergeometric upper tail)
        and 'pval_bh' (value returned by fdrcorrection), sorted by ascending
        'pval_bh'.
    '''
    gene_col = 'gene_ID_628'
    bg_set = set(bg_genes)
    subset_set = set(subset_genes)

    mirs = df['miRNA'].unique()
    # One row per (miRNA, gene) so repeated sites in a gene are counted once
    mir_gene_pairs = df[['miRNA', gene_col]].drop_duplicates()

    def _genes_per_mir(gene_set):
        # Number of distinct genes from gene_set targeted by each miRNA
        hits = mir_gene_pairs[mir_gene_pairs[gene_col].isin(gene_set)]
        return hits.groupby('miRNA').size()

    mir_df = pd.DataFrame({
        'subset': _genes_per_mir(subset_set),
        'bg': _genes_per_mir(bg_set),
    })
    # miRNAs without any hit in a gene list get an explicit count of 0
    mir_df = mir_df.reindex(mirs).fillna(0).astype(int)

    if mir_df.empty:
        # Nothing to test: return the empty table with the usual columns
        return mir_df.assign(pval=pd.Series(dtype=float), pval_bh=pd.Series(dtype=float))

    N_bg = len(bg_set)
    N_subset = len(subset_set)
    # P(X >= subset): sf(k - 1) is the upper tail including k itself
    mir_df['pval'] = hypergeom.sf(
        mir_df['subset'].to_numpy() - 1, N_bg, mir_df['bg'].to_numpy(), N_subset
    )
    rejected, p_adj = fdrcorrection(mir_df['pval'])
    mir_df['pval_bh'] = p_adj
    mir_df.sort_values(by='pval_bh', ascending=True, inplace=True)
    return mir_df

In [None]:
# Limit the search to sites whose percentile is above 50 for both the PCT and
# the weighted context++ score
high_pct = df3['PCT_percentile'] > 50
high_context_score = df3['weighted context++ score percentile'] > 50
df_50 = df3[high_pct & high_context_score].copy()

In [None]:
# Per-miRNA enrichment table computed from the percentile-filtered predictions
mir_df_50 = score_mirs(df_50)

In [None]:
# Display the first 20 rows (score_mirs sorts by ascending 'pval_bh')
mir_df_50.head(n=20)

In [None]:
# Output a .gmt file to do GSEA analysis
# Need targets for each miRNA set
def write_mir_gmt(df, outname):
    '''
    Collect the target genes of each miRNA and write them as a .gmt file for GSEA.

    Only rows whose 'gene_ID_628' is in the notebook-level `bg_genes` are
    used. Each output line is '<miRNA>\\tna\\t<gene>\\t<gene>...'; a miRNA
    without any background target still gets a line with an empty gene list.

    Parameters
    ----------
    df : pandas.DataFrame
        Predicted sites with the columns 'miRNA', 'gene_ID_628' and 'Gene ID'.
    outname : str
        Output path without extension; '.gmt' is appended.

    Returns
    -------
    dict
        Maps each miRNA to the array of unique 'Gene ID' values written for it.
    '''
    bg_set = set(bg_genes)
    in_bg = df[df['gene_ID_628'].isin(bg_set)]

    # NOTE(review): rows are filtered on 'gene_ID_628' but the values written
    # come from the 'Gene ID' column - confirm this is the identifier the GSEA
    # input uses.
    mir_dict = {
        mir: in_bg.loc[in_bg['miRNA'] == mir, 'Gene ID'].unique()
        for mir in df['miRNA'].unique()
    }

    outfile = f'{outname}.gmt'
    with open(outfile, 'w') as g:
        for mir, genes in mir_dict.items():
            # Fields: set name, description placeholder ('na'), then the genes
            g.write(f'{mir}\tna\t' + '\t'.join(genes) + '\n')

    # Return the sets so a caller that binds the result gets the mapping
    return mir_dict

In [None]:
# Write the gene sets for the percentile-filtered predictions; write_mir_gmt
# appends '.gmt' to outname, so the file is <outdir>/miRNAs.gmt
outname = os.path.join(outdir, 'miRNAs')
mir_dict = write_mir_gmt(df_50, outname)