In [1]:
import os
import pandas as pd
from tqdm.notebook import tqdm
import git
import pdb
import sys
import copy

In [2]:
git_repo = git.Repo(".", search_parent_directories=True)
git_root = git_repo.git.rev_parse("--show-toplevel")
uncompressed = os.path.join(git_root, "annotated/uncompressed/")

# Column names based on annotator GitHub
columns = ["chromosome", "start", "stop", "name", "intensity", "strand", "gene_id", "gene_name", "genic_region_type", "all_overlapping_annotation"]

In [3]:
dirs = {
    'spidr': {
        # 'peaks_switched_strands_filtered': os.path.join(uncompressed, "spidr/peaks_switched_strands_filtered"),
        # 'miRNAadj': os.path.join(uncompressed, "spidr/spidr_annotated_bed_with_miRNAadj"),
        'miRNAadj': os.path.join(uncompressed, "spidr/spidr_annotated_bed_with_miRNAadj_updated")
    },
    'encode': {
        'miRNAadj': os.path.join(uncompressed, "encode/encode_annotated_bed_with_miRNAadj"),
        'downsampled_miRNAadj': os.path.join(uncompressed, "encode/downsampled_encode_annotated_bed_with_miRNAadj"),
        # 'peaks_filtered': os.path.join(uncompressed, "encode/peaks_filtered"),
        # 'downsampled_peaks_filtered': os.path.join(uncompressed, "encode/downsampled_peaks_filtered"),
    }
}
def apply_to_dict(dictionary):
    def traverse(dictionary):
        for key1 in dictionary.keys():
            for key2 in dictionary[key1].keys():
                yield key1, key2, dictionary[key1][key2]
    
    iterator = traverse(dictionary)

# TODO: Figure out how to use traverse here
files = {}
for data_type in dirs.keys():
    files[data_type] = {}
    for key in dirs[data_type].keys():
        dir = dirs[data_type][key]
        files[data_type][key] = [os.path.join(dir, file) for file in os.listdir(dir)]

In [4]:
def get_percentages(annotated_files, cols, percent_by='sum'):
    percents = []
    
    for file in tqdm(annotated_files, total=len(annotated_files)):
        # Check if file is empty
        try:
            with open(file, 'r') as f:
                if len(f.read()) == 0:
                    continue
        except UnicodeDecodeError:
            print(file)
            sys.exit(0)

        # Read in each file as a dataframe
        df = pd.read_csv(file, sep="\t", names=cols)
        basename = os.path.basename(file).replace('.txt', '')
        subset = df[["genic_region_type", "intensity"]]

        if percent_by == 'sum':
            intensities = subset.groupby(by=["genic_region_type"]).sum('intensity')
        elif percent_by == 'count':
            intensities = subset.groupby(by=["genic_region_type"]).count()

        intensities.columns = [f"{basename}"]
        total = intensities.sum().values.item()
        percent_col = (intensities / total) * 100
        percents.append(percent_col)


    percents_df = pd.concat(percents, axis=1, join='outer')
    percents_df.fillna(value=0, inplace=True)

    return percents_df

In [5]:
def rename_columns(df):
    tmp = df.copy()
    new_tmp_cols = []

    for col in tmp.columns:
        # Keep Bethyl and CST separately
        if "Bethyl" in col or "CST" in col:
            new_tmp_cols.append("_".join(col.split("_")[0:2]))
        else:
            new_col = col.split("_")[0]
            if "rep1" in new_col:
                new_col = new_col.replace("rep1", "")
            new_tmp_cols.append(new_col)

    tmp.columns = new_tmp_cols
    return new_tmp_cols, tmp

In [6]:
spidr_df = get_percentages(files['spidr']['miRNAadj'], cols=columns, percent_by='sum')
spidr_cols, spidr_df = rename_columns(spidr_df)

encode_df = get_percentages(files['encode']['miRNAadj'], cols=columns, percent_by='sum')
encode_cols, encode_df = rename_columns(encode_df)

encode_downsampled_df = get_percentages(files['encode']['downsampled_miRNAadj'], cols=columns, percent_by='sum')
encode_downsampled_cols, encode_downsampled_df = rename_columns(encode_downsampled_df)

  0%|          | 0/58 [00:00<?, ?it/s]

  0%|          | 0/43 [00:00<?, ?it/s]

  0%|          | 0/36 [00:00<?, ?it/s]

In [7]:
spidr_df

Unnamed: 0_level_0,RBM15,CPSF6,FASTKD2,TIAL1,LIN28B,DGCR8,SMNDC1,KHSRP,PCBP1,HNRNPC,...,EWSR1,DDX55,HNRNPA1,FUBP3,IGF2BP2,HNRNPM,TARDBP,DHX30,NOLC1,HNRNPL_Bethyl
genic_region_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3utr,4.980312,4.139759,2.122016,16.23396,11.259038,12.821163,0.0,5.290196,0.0,1.83872,...,3.592759,81.00159,6.899885,1.539345,30.754047,1.265819,5.896734,0.0,0.268836,1.424098
5utr,2.437142,1.228236,21.118139,1.739036,0.85609,1.140202,0.0,3.520656,0.0,1.791001,...,2.809911,0.0,0.234942,0.209902,12.554283,0.82303,2.584085,0.0,0.127989,0.087561
CDS,41.275619,1.915367,40.289737,5.67632,29.217822,5.773463,0.0,36.008345,0.0,1.53254,...,34.936647,18.99841,0.978643,3.823453,35.452033,0.407857,35.348811,22.022022,4.51703,0.372609
distintron500,11.223912,63.009099,3.546215,39.506271,8.949114,8.289334,100.0,10.802864,0.0,64.10746,...,11.192239,0.0,30.212687,36.171443,0.0,49.691686,20.797422,0.0,25.228511,30.407562
distnoncoding_intron500,0.838272,4.844098,0.652928,0.785986,0.0,0.0,0.0,1.387727,0.0,2.885748,...,1.010275,0.0,4.887751,13.154075,0.0,6.63138,1.023887,0.0,2.582579,7.427688
intergenic,5.644259,9.103233,1.548664,12.809932,4.010362,2.444326,0.0,3.856452,0.0,6.209472,...,3.823555,0.0,1.945308,11.224724,0.0,30.74016,3.365021,77.977978,35.970849,51.810421
miRNA,0.327172,0.03494,0.0,1.478909,0.888556,23.957833,0.0,0.464923,0.0,0.0,...,0.249854,0.0,0.0,0.026368,0.0,0.0,0.386409,0.0,0.23377,0.0
noncoding_exon,9.49123,3.49851,8.761477,11.422291,17.588303,5.209248,0.0,12.54874,100.0,13.189583,...,18.406382,0.0,50.305865,27.222737,0.0,6.180502,3.433562,0.0,29.388456,2.050546
proxintron500,21.208684,11.11563,19.677617,5.718972,26.825697,12.038689,0.0,23.463536,0.0,6.112301,...,22.508839,0.0,3.699594,1.701056,11.014607,4.090623,25.63216,0.0,0.793651,3.087233
proxnoncoding_intron500,2.539362,1.073331,2.283208,0.361352,0.276076,0.688287,0.0,0.930664,0.0,2.328179,...,1.290255,0.0,0.466311,0.909834,0.0,0.146326,1.441206,0.0,0.888328,1.205496


In [8]:
# spidr_miRNAadj_df = get_percentages(files['spidr']['miRNAadj'], cols=columns, percent_by='sum')
# spidr_miRNAadj_cols, spidr_miRNAadj_df = rename_columns(spidr_miRNAadj_df)

## Encode Percent Counts by Region Type from Original Paper

In [19]:
# Read the excel file, skipping the first row to ensure proper column names
encode_supp_path = os.path.join(git_root, "annotated/Summary_info_encode_Suppl_Data_4.xlsx")
encode_supp_data = pd.read_excel(encode_supp_path, skiprows=1)

# Filtering for only 'K562' cell lines
encode_supp_data = encode_supp_data[encode_supp_data['Cell Line'] == 'K562']

# Get the list of gene symbols
gene_symb = encode_supp_data[['Official Gene Symbol']]

# Get all columns corresponding to total counts and counts of subsets (e.g. CDS, miRNA, etc)
region_counts = encode_supp_data[encode_supp_data.columns[-17:].tolist()]

# Merge the dataframes by index
raw_counts = gene_symb.join(region_counts)
raw_counts.set_index('Official Gene Symbol', inplace=True)
raw_counts.to_csv(os.path.join(git_root, "output", "encode_supp_raw_counts_by_region.csv"))

# Divide subset counts by total
percent_counts = raw_counts[raw_counts.columns[1:]].div(raw_counts['IDR peak #'], axis=0) * 100

# Transpose the dataframe so it's in the same shape as encode and spidr along with corresponding columns
encode_supp = percent_counts.T

# Manually renaming things in supplementary data to match annotator output
supp_to_annot = {
    "distintron": "distintron500",
    "noncoding_distintron": "distnoncoding_intron500",
    "proxintron": "proxintron500",
    "noncoding_proxintron": "proxnoncoding_intron500",
    # "miRNA_proximal": "miRNA_adjacent"
}

new_index = {}

for idx in encode_supp.index:
    if idx in supp_to_annot:
        new_index[idx] = supp_to_annot[idx]
    else:
        new_index[idx] = idx

encode_tmp = encode_supp.copy()

encode_tmp = encode_tmp.rename(index=new_index)

In [20]:
dfs = {
    "spidr": spidr_df,
    # "spidr_miRNAadj": spidr_miRNAadj_df,
    "encode": encode_df,
    "encode_downsampled": encode_downsampled_df,
    "encode_supp": encode_tmp
}

final = {key: None for key in dfs.keys()}

In [21]:
def get_common(dataframe_dict):
    cols = []
    indices = []
    
    for df in dataframe_dict.values():
        cols.append(set(df.columns))
        indices.append(set(df.index))
    
    common_cols = list(cols[0].intersection(*cols[1:]))
    common_indices = list(indices[0].intersection(*indices[1:]))
    return common_cols, common_indices

In [22]:
common_cols, common_idx = get_common(dfs)
output_root = os.path.join(git_root, "output")

for key, df in dfs.items():
    # Keep common columns
    tmp = df.copy()
    tmp = tmp[common_cols]

    # Keep only region names that are common to all dataframes
    tmp = tmp.filter(items=common_idx, axis=0)
    
    # Remove duplicated columns
    tmp = tmp.loc[:, ~tmp.columns.duplicated()]

    # Save to disk
    final[key] = tmp
    final[key].to_csv(os.path.join(output_root, f"{key}_percent_by_region.csv"))

Annotator <--> Supplementary

- disintron500 <--> disintron 
- distnoncoding_intron500 <--> noncoding_disintron
- proxintron500 <--> proxintron
- proxnoncoding_intron500 <--> noncoding_proxintron

Fuse miRNA and miRNA_proximal