In [10]:
import os
import pandas as pd
from tqdm.notebook import tqdm
import git
import pdb

git_repo = git.Repo(".", search_parent_directories=True)
git_root = git_repo.git.rev_parse("--show-toplevel")

## Encode Percent Counts by Region Type from Annotator

In [19]:
# List of annotatated TSV files produced by annotator
spidr_dir = os.path.join(git_root, "annotated/uncompressed/spidr-annotated")
encode_dir = os.path.join(git_root, "annotated/uncompressed/encode-annotated")
spidr_outputs = [os.path.join(spidr_dir, file) for file in os.listdir(spidr_dir)]
encode_outputs = [os.path.join(encode_dir, file) for file in os.listdir(encode_dir)]
all_outputs = spidr_outputs + encode_outputs

# Column names based on annotator GitHub
columns = ["chromosome", "start", "stop", "name", "intensity", "strand", "gene_id", "gene_name", "genic_region_type", "all_overlapping_annotation"]

# List to store percent_by_type dataframes
pbt = []
raw = []

for file in tqdm(all_outputs, total=len(all_outputs)):
    # Read in each file as a dataframe
    df = pd.read_csv(file, sep="\t")
    df.columns = columns

    # Count by each region type and turn that into a percentage
    counts_by_type = df[["gene_id", "genic_region_type"]].groupby(by="genic_region_type").count()
    counts_by_type.columns = [f"{basename}"]
    raw.append(counts_by_type)
    
    
    total_count = counts_by_type.sum().values.item()
    percent_by_type = (counts_by_type / total_count) * 100
    
    # Change the column to the file name without file extension for easier
    basename = os.path.basename(file).replace('.txt', '')
    pbt.append(percent_by_type)

raw_counts = pd.concat(raw, axis=1, join='outer')
raw_counts.fillna(value=0, inplace=True)
percent_counts = pd.concat(pbt, axis=1, join='outer')
percent_counts.fillna(value=0, inplace=True)

  0%|          | 0/82 [00:00<?, ?it/s]

In [28]:
def get_filtered_cols(df):
    tmp = df.copy()
    new_tmp_cols = []
    for col in tmp.columns:
        if "Bethyl" in col or "CST" in col:
            new_tmp_cols.append("_".join(col.split("_")[0:2]))
        else:
            new_tmp_cols.append(col.split("_")[0])

    tmp.columns = new_tmp_cols
    return tmp.columns

### Percentage of Counts

In [26]:
spidr = percent_counts[sorted([col for col in percent_counts.columns.tolist() if "_spidr_" in col])]
encode = percent_counts[sorted([col for col in percent_counts.columns.tolist() if "_encode_" in col])]

new_spidr_cols = get_filtered_cols(spidr)
new_encode_cols = get_filtered_cols(encode)
common_cols = [col for col in new_encode_cols if col in new_spidr_cols]

percent_encode = encode[common_cols]
percent_spidr = spidr[common_cols]

AssertionError: 

### Raw Counts

In [23]:
spidr = raw_counts[sorted([col for col in raw_counts.columns.tolist() if "_spidr_" in col])]
encode = raw_counts[sorted([col for col in raw_counts.columns.tolist() if "_encode_" in col])]

new_spidr_cols = get_filtered_cols(spidr)
new_encode_cols = get_filtered_cols(encode)
common_cols = [col for col in new_encode_cols if col in new_spidr_cols]

raw_encode = encode[common_cols]
raw_spidr = spidr[common_cols]

raw_encode.to_csv(os.path.join(git_root, "output", "encode_raw_counts_by_region.csv"))
raw_spidr.to_csv(os.path.join(git_root, "output", "spidr_raw_counts_by_region.csv"))

## Encode Percent Counts by Region Type from Original Paper

In [6]:
# Read the excel file, skipping the first row to ensure proper column names
encode_supp_path = os.path.join(git_root, "annotated/Summary_info_encode_Suppl_Data_4.xlsx")
encode_supp_data = pd.read_excel(encode_supp_path, skiprows=1)

# Filtering for only 'K562' cell lines
encode_supp_data = encode_supp_data[encode_supp_data['Cell Line'] == 'K562']

# Get the list of gene symbols
gene_symb = encode_supp_data[['Official Gene Symbol']]

# Get all columns corresponding to total counts and counts of subsets (e.g. CDS, miRNA, etc)
region_counts = encode_supp_data[encode_supp_data.columns[-17:].tolist()]

# Merge the dataframes by index
raw_counts = gene_symb.join(region_counts)
raw_counts.set_index('Official Gene Symbol', inplace=True)
raw_counts.to_csv(os.path.join(git_root, "output", "encode_supp_raw_counts_by_region.csv"))

# Divide subset counts by total
percent_counts = raw_counts[raw_counts.columns[1:]].div(raw_counts['IDR peak #'], axis=0) * 100

# Transpose the dataframe so it's in the same shape as encode and spidr along with corresponding columns
encode_supp = percent_counts.T[common_cols]

In [8]:
# Get indices for all 3 dataframes
encode_supp_index = set(encode_supp.index.tolist())
encode_index = set(encode.index.tolist())
spidr_index = set(spidr.index.tolist())

# Find region names shared by all 3 dataframes
common_idx = encode_supp_index.intersection(encode_index).intersection(spidr_index)

# Keep only region names that are common to all dataframes
encode_supp = encode_supp.filter(items=common_idx, axis=0)
encode = encode.filter(items=common_idx, axis=0)
spidr = spidr.filter(items=common_idx, axis=0)

In [9]:
# Remove duplicated columns by name (as opposed to by matching values)
encode_supp = encode_supp.loc[:, ~encode_supp.columns.duplicated()]
encode = encode.loc[:, ~encode.columns.duplicated()]
spidr = spidr.loc[:, ~spidr.columns.duplicated()]

In [10]:
spidr

Unnamed: 0_level_0,BUD13,DGCR8,DROSHA,EWSR1,FASTKD2,FUS,HNRNPA1,HNRNPC,HNRNPK,HNRNPM,...,RBM15,RPS3,SAFB,SLBP,SSB,TAF15,TARDBP,TRA2A,U2AF1,UPF1
genic_region_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3utr,3.090004,9.638554,2.707581,4.536125,2.898551,4.597701,3.981623,2.104556,3.680982,1.371541,...,5.377629,1.851852,3.703704,5.428571,0.0,3.647652,5.247266,2.442635,3.432398,66.490623
noncoding_exon,3.359602,24.698795,22.563177,4.064039,10.869565,43.678161,18.836141,2.384417,9.202454,2.473671,...,6.692161,29.62963,31.17284,4.571429,7.070707,10.32208,2.804804,2.331606,5.064881,2.425092
CDS,48.257984,8.433735,7.761733,38.642583,44.927536,28.735632,1.531394,1.164223,5.521472,0.416361,...,42.184512,37.037037,2.469136,12.857143,5.050505,31.703531,36.469167,39.881569,44.746756,7.415391
5utr,3.442555,0.60241,1.444043,3.243021,10.869565,3.448276,0.612557,1.097056,4.294479,1.028655,...,2.581262,0.0,0.0,0.285714,0.0,2.405898,2.429041,1.931902,3.641691,0.635913
miRNA,0.414766,18.674699,7.039711,0.253147,0.0,0.0,0.0,0.0,0.0,0.0,...,0.310707,0.0,0.0,0.0,0.0,0.426853,0.355633,0.170244,0.376727,0.043113


In [11]:
encode

Unnamed: 0_level_0,BUD13,DGCR8,DROSHA,EWSR1,FASTKD2,FUS,HNRNPA1,HNRNPC,HNRNPK,HNRNPM,...,RBM15,RPS3,SAFB,SLBP,SSB,TAF15,TARDBP,TRA2A,U2AF1,UPF1
genic_region_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3utr,0.261438,0.4329,0.0,0.604824,0.0,0.0,0.0,2.439024,0.0,0.0,...,0.136426,0.0,0.0,0.0,0.0,0.432277,0.201207,0.184472,0.858034,1.443751
noncoding_exon,1.339869,6.277056,4.395604,2.480137,33.333333,20.430108,60.0,53.658537,1.754386,1.449275,...,8.049113,0.0,31.034483,2.325581,0.282486,11.67147,0.819201,1.405241,2.652106,4.088704
CDS,0.098039,0.865801,0.0,2.19383,0.0,0.0,0.0,0.0,0.0,0.724638,...,0.545703,0.0,0.0,0.0,0.0,0.0,0.0,0.059682,0.546022,0.508201
5utr,0.163399,0.0,0.0,0.422303,0.0,0.0,0.0,0.0,0.584795,0.0,...,0.136426,0.0,0.0,0.0,0.0,0.0,0.014372,0.1058,0.702028,0.01155
miRNA,0.03268,0.649351,0.0,0.03221,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.008138,0.234009,0.0


In [12]:
encode_supp

Official Gene Symbol,BUD13,DGCR8,DROSHA,EWSR1,FASTKD2,FUS,HNRNPA1,HNRNPC,HNRNPK,HNRNPM,...,RBM15,RPS3,SAFB,SLBP,SSB,TAF15,TARDBP,TRA2A,U2AF1,UPF1
3utr,1.15669,2.907916,2.565418,1.067818,8.099174,4.757484,8.433735,22.621185,9.440789,1.763971,...,9.626359,3.17063,0.988142,35.031847,0.613497,1.426025,10.715333,0.34662,6.666667,91.027357
noncoding_exon,2.039113,5.654281,4.797332,4.32807,10.743802,10.211186,9.036145,7.181329,5.855263,3.122073,...,4.404006,0.500626,23.12253,7.006369,51.533742,13.190731,4.544122,9.358752,5.609756,1.843592
CDS,67.505366,7.26979,12.057465,1.590367,7.438017,4.061267,2.409639,1.436266,13.980263,1.71714,...,64.2188,88.610763,15.217391,50.955414,7.361963,1.604278,2.154793,86.481802,59.105691,3.560809
5utr,3.195803,4.846527,11.775269,3.86232,10.578512,3.968438,3.614458,2.154399,4.243421,3.356229,...,9.809411,6.591573,0.988142,0.636943,3.067485,2.673797,1.436529,1.213172,3.98374,0.460898
miRNA,0.238493,28.109855,2.975885,0.068159,1.983471,0.116036,0.0,0.179533,0.0,0.01561,...,0.129213,0.041719,0.0,0.0,0.613497,0.178253,0.029317,0.05777,0.081301,0.044603


In [13]:
output_root = os.path.join(git_root, "output")

encode_supp.to_csv(os.path.join(output_root, "encode_supp_percent_by_region.csv"))
encode.to_csv(os.path.join(output_root, "encode_percent_by_region.csv"))
spidr.to_csv(os.path.join(output_root, "spidr_percent_by_region.csv"))