In [11]:
import pandas as pd
from scipy.cluster.hierarchy import fcluster, linkage
import scipy.cluster.hierarchy as sch
import os
import matplotlib.pyplot as plt
import pybedtools

In [2]:
directory1 = 'output/'
protein_file = 'protein_list.txt'
outfile_prefix = 'SPIDR_filtered_annotation_miRNA_adj'

In [3]:
annotations1 = []

for file in os.listdir(directory1):
    annotations1.append(file)

In [4]:
set1_annotations = []
for i in annotations1:
    annotations_file=directory1+i

    with open(annotations_file) as file:
        annotations_df = pd.read_csv(file, header=None, sep='\t')
        annotations_df.columns = ['chrom', 'chromStart', 'chromEnd', 'name', 'count','strand', 'ens', 'gene', 'annotation', 'other']
        set1_annotations.append(annotations_df)

In [5]:
updated = []
for i in set1_annotations:
    for ind in i.index:
        gene = i.at[int(ind), 'gene']
        annotation = i.at[int(ind), 'annotation']
        if 'MIR' in gene and 'miRNA' not in annotation:
            i.at[int(ind), 'annotation'] = 'miRNA_adjacent'
    
    updated.append(i)

In [None]:
for i in range(len(updated)):
    prefix = annotations1[i][:-4]
    filename = prefix + '.bed'
    pybedtools.BedTool.from_dataframe(updated[i]).saveas(filename)

In [31]:
set1_annotations = updated

In [32]:
# get absolute peak counts for each annotation
annotation_dict_list1 = []

for i in set1_annotations:
    annotation_dict = {}

    for ind in i.index:
        annotation = i['annotation'][ind]
        if annotation in annotation_dict.keys():
            annotation_dict[annotation] = annotation_dict[annotation] + 1
        else:
            annotation_dict[annotation] = 1
    
    annotation_dict_list1.append(annotation_dict)

In [39]:
# get all keys
all_keys = []
for i in annotation_dict_list1:
    for key in i.keys():
        if key not in all_keys:
            all_keys.append(key)

all_keys = sorted(all_keys)
all_keys

['3utr',
 '5utr',
 'CDS',
 'distintron500',
 'distnoncoding_intron500',
 'intergenic',
 'miRNA',
 'miRNA_adjacent',
 'noncoding_exon',
 'proxintron500',
 'proxnoncoding_intron500',
 'stop_codon']

In [40]:
# add values to each dictionary for each annotation
for i in annotation_dict_list1:
    for category in all_keys:
        if category not in i.keys():
            i[category] = 0
            
# convert dictionary
annotation_summary = pd.DataFrame(annotation_dict_list1)
annotation_summary = annotation_summary.transpose()
annotation_summary.columns = annotations1

In [41]:
annotation_summary

Unnamed: 0,HNRNPL_Bethyl_spidr_annotation.txt,PTBP1_spidr_annotation.txt,HNRNPC_spidr_annotation.txt,WDR43_spidr_annotation.txt,PCBP2_spidr_annotation.txt,HNRNPL_CST_spidr_annotation.txt,SSB_spidr_annotation.txt,FASTKD2_spidr_annotation.txt,IGF2BP1_spidr_annotation.txt,SLBP_spidr_annotation.txt,...,IGF2BP2_spidr_annotation.txt,HuR_spidr_annotation.txt,LSM11_spidr_annotation.txt,ILF3_spidr_annotation.txt,FUBP3_spidr_annotation.txt,SRSF9_spidr_annotation.txt,LIN28B_spidr_annotation.txt,ADAR1_spidr_annotation.txt,EWSR1_spidr_annotation.txt,RPS3_spidr_annotation.txt
noncoding_exon,122,852,213,11,38,76,7,15,117,16,...,0,105,2,286,3145,97,31,158,578,16
distnoncoding_intron500,622,8171,330,1,41,609,4,1,18,31,...,0,349,0,2022,15947,2,0,393,214,0
distintron500,1184,42145,6513,0,272,2413,3,6,1138,0,...,0,2373,61,13577,47845,2352,94,153,2269,1
intergenic,3800,7298,764,282,49,1993,72,2,270,234,...,0,771,41,1715,15605,12,40,140,688,16
proxintron500,143,5250,595,0,131,331,4,30,318,0,...,1,369,6,1255,3141,43,88,10,3897,1
CDS,24,977,104,0,162,130,5,63,386,45,...,3,131,0,163,4656,15,159,368,5648,20
3utr,63,2352,188,0,193,161,0,4,275,19,...,3,1062,2,835,2319,4,81,86,663,1
proxnoncoding_intron500,86,579,128,2,8,68,4,3,1,5,...,0,49,0,138,1022,1,5,13,129,0
5utr,6,412,98,0,56,20,0,15,33,1,...,1,87,0,80,314,7,9,9,474,0
stop_codon,1,6,0,0,2,0,0,0,1,0,...,0,1,0,2,10,0,1,2,3,0


In [42]:
# remove intergenic 
annotation_summary = annotation_summary.drop(['intergenic'])
annotation_filename = outfile_prefix + '_counts.csv'
annotation_summary.to_csv(annotation_filename)
# get proportion 

proportion_summary = annotation_summary

for i in annotations1:
    colsum = proportion_summary[i].sum()
    proportion_summary[i] = proportion_summary[i].div(colsum)
    proportion_summary[i] = proportion_summary[i].multiply(100).round(2)

In [43]:
proportion_summary

Unnamed: 0,HNRNPL_Bethyl_spidr_annotation.txt,PTBP1_spidr_annotation.txt,HNRNPC_spidr_annotation.txt,WDR43_spidr_annotation.txt,PCBP2_spidr_annotation.txt,HNRNPL_CST_spidr_annotation.txt,SSB_spidr_annotation.txt,FASTKD2_spidr_annotation.txt,IGF2BP1_spidr_annotation.txt,SLBP_spidr_annotation.txt,...,IGF2BP2_spidr_annotation.txt,HuR_spidr_annotation.txt,LSM11_spidr_annotation.txt,ILF3_spidr_annotation.txt,FUBP3_spidr_annotation.txt,SRSF9_spidr_annotation.txt,LIN28B_spidr_annotation.txt,ADAR1_spidr_annotation.txt,EWSR1_spidr_annotation.txt,RPS3_spidr_annotation.txt
noncoding_exon,5.42,1.4,2.61,78.57,4.15,1.99,25.0,10.95,5.1,13.68,...,0.0,2.32,2.82,1.56,3.94,3.85,6.55,13.26,4.15,41.03
distnoncoding_intron500,27.62,13.42,4.04,7.14,4.48,15.98,14.29,0.73,0.79,26.5,...,0.0,7.7,0.0,11.01,20.0,0.08,0.0,32.97,1.54,0.0
distintron500,52.58,69.22,79.72,0.0,29.69,63.32,10.71,4.38,49.65,0.0,...,0.0,52.33,85.92,73.9,60.01,93.26,19.87,12.84,16.29,2.56
proxintron500,6.35,8.62,7.28,0.0,14.3,8.69,14.29,21.9,13.87,0.0,...,11.11,8.14,8.45,6.83,3.94,1.7,18.6,0.84,27.98,2.56
CDS,1.07,1.6,1.27,0.0,17.69,3.41,17.86,45.99,16.84,38.46,...,33.33,2.89,0.0,0.89,5.84,0.59,33.62,30.87,40.55,51.28
3utr,2.8,3.86,2.3,0.0,21.07,4.22,0.0,2.92,12.0,16.24,...,33.33,23.42,2.82,4.55,2.91,0.16,17.12,7.21,4.76,2.56
proxnoncoding_intron500,3.82,0.95,1.57,14.29,0.87,1.78,14.29,2.19,0.04,4.27,...,0.0,1.08,0.0,0.75,1.28,0.04,1.06,1.09,0.93,0.0
5utr,0.27,0.68,1.2,0.0,6.11,0.52,0.0,10.95,1.44,0.85,...,11.11,1.92,0.0,0.44,0.39,0.28,1.9,0.76,3.4,0.0
stop_codon,0.04,0.01,0.0,0.0,0.22,0.0,0.0,0.0,0.04,0.0,...,0.0,0.02,0.0,0.01,0.01,0.0,0.21,0.17,0.02,0.0
miRNA_adjacent,0.04,0.21,0.01,0.0,1.31,0.08,3.57,0.0,0.09,0.0,...,11.11,0.11,0.0,0.04,1.66,0.0,0.0,0.0,0.11,0.0


In [44]:
proportion_filename = outfile_prefix + '_percentage.csv'
proportion_summary.to_csv(proportion_filename)