In [56]:
import pandas as pd
import os, sys
import requests
import pyBigWig
from Bio.Seq import reverse_complement
import numpy as np
import collections

In [95]:
# import prediction result
# Every regular, non-hidden file in PREDICTION_DIR is read as a tab-separated
# table; the part of the file name before the first '_' becomes the 'miR' label
# of all rows coming from that file.
PREDICTION_DIR = 'OUTPUT'

# Sort the listing so the row order (and therefore the later 'first'
# aggregation and the order of joined miR names) is reproducible, and skip
# hidden entries / sub-directories such as '.ipynb_checkpoints' that
# os.listdir also returns.
prediction_files = sorted(
    fn for fn in os.listdir(PREDICTION_DIR)
    if not fn.startswith('.') and os.path.isfile(os.path.join(PREDICTION_DIR, fn))
)
if not prediction_files:
    raise FileNotFoundError(f'no prediction files found in {PREDICTION_DIR!r}')

# Read everything first and concatenate once: calling pd.concat inside the
# loop re-copies all previously loaded rows on every iteration.
frames = [
    pd.read_csv(os.path.join(PREDICTION_DIR, fn), sep='\t').assign(miR=fn.split('_')[0])
    for fn in prediction_files
]
df = pd.concat(frames)

# keep only rows with a non-zero PUM_count and make sure phyloP_avg is numeric
df = df[df['PUM_count'] != 0]
df = df.astype({'phyloP_avg': float})

# convert dataframe to show miR and PUM sites for each gene
def join_miR(miRs):
    """Return the distinct values of ``miRs`` as one comma-separated string.

    ``miRs`` is a pandas Series of strings. Duplicates are removed and the
    remaining values keep the order in which they first appear.
    """
    distinct = miRs.unique()
    return ','.join(distinct)

# One row per gene: miR names are joined with join_miR, PUM_count keeps the
# first value seen for the gene, phyloP_avg is averaged over the gene's rows.
per_gene_rules = {
    'miR': join_miR,
    'PUM_count': 'first',
    'phyloP_avg': 'mean',
}
by_gene = df.groupby('gene')
df = by_gene.agg(per_gene_rules).reset_index()

# import and merge translation efficiency data with prediction
# sheet_name=3 selects the fourth sheet (zero-based position) and header=1
# takes the second row of that sheet as the column names.
TE = pd.read_excel('translation_analysis_mouse_dev.xls', sheet_name=3, header=1)

# Match gene symbols case-insensitively by joining on upper-cased copies of
# the key columns. Joining on Series rather than column names makes pandas
# add a helper column 'key_0', which is removed again right after the merge.
prediction_key = df['gene'].str.upper()
te_key = TE['Gene'].str.upper()
df = df.merge(TE, how='inner', left_on=prediction_key, right_on=te_key)
df = df.drop(columns='key_0')

# import and merge RIP-ChIP data with prediction
PUM = pd.read_csv('mouse_PUM_target.csv')

# Case-insensitive inner join on the gene symbol (merge defaults to
# how='inner'); the helper column 'key_0' produced by joining on Series
# is dropped afterwards.
prediction_key = df['gene'].str.upper()
pum_key = PUM['SYMBOL'].str.upper()
df = df.merge(PUM, left_on=prediction_key, right_on=pum_key)
df = df.drop(columns='key_0')

# Discard every row that has a missing value in ANY column.
# NOTE(review): this also drops genes whose only gap is in a column that is
# not used below -- confirm that this is intended.
df = df.dropna()

# filter dataframe by average phyloP score (>2) and fold change of translation efficiency (2 fold or more)
# NOTE(review): the TE thresholds of +/-1 presumably mean the column is a log2
# fold change; the comparisons are strict, so a value of exactly +/-1 is
# excluded -- confirm against "2 fold or more".
is_conserved = df['phyloP_avg'] > 2
te_up = df['TE'] > 1
te_down = df['TE'] < -1
df = df[is_conserved & (te_up | te_down)]

# save result (the row index is written as well, per the to_excel default)
result_path = 'PUM_miRNA_prediction_result.xlsx'
df.to_excel(result_path)