In [160]:
import os
import pandas as pd
from tqdm.notebook import tqdm
import git
import pdb

# Open the git repository that contains the current working directory,
# searching parent directories, so paths can be built relative to the repo.
git_repo = git.Repo(".", search_parent_directories=True)
# Top-level directory of the working tree, as reported by
# `git rev-parse --show-toplevel`.
git_root = git_repo.git.rev_parse("--show-toplevel")

## SPIDR and ENCODE Percent Counts by Region Type from Annotator

In [161]:
# Directories holding the annotated TSV files produced by the annotator
spidr_dir = os.path.join(git_root, "annotated/uncompressed/spidr-annotated")
encode_dir = os.path.join(git_root, "annotated/uncompressed/encode-annotated")


def _list_annotated_files(directory):
    """Return the paths of the regular, non-hidden files in `directory`, sorted by name.

    `os.listdir` yields entries in arbitrary order and also returns hidden
    entries (e.g. `.DS_Store`, `.ipynb_checkpoints`). Sorting keeps the
    processing order stable between runs; skipping hidden entries and
    sub-directories keeps `pd.read_csv` from being called on them.
    """
    paths = []
    for entry in sorted(os.listdir(directory)):
        path = os.path.join(directory, entry)
        if not entry.startswith(".") and os.path.isfile(path):
            paths.append(path)
    return paths


spidr_outputs = _list_annotated_files(spidr_dir)
encode_outputs = _list_annotated_files(encode_dir)
all_outputs = spidr_outputs + encode_outputs

# Column names based on annotator GitHub
columns = ["chromosome", "start", "stop", "name", "intensity", "strand", "gene_id", "gene_name", "genic_region_type", "all_overlapping_annotation"]

# One single-column dataframe per input file (indexed by genic_region_type,
# column named after the file); concatenated column-wise after the loop.
percent_by_count = []
percent_by_sum = []
raw_sum = []
raw_count = []

for file in tqdm(all_outputs, total=len(all_outputs)):
    # Read in each file as a dataframe and apply the annotator column names.
    # NOTE(review): read_csv consumes the first line of the file as a header
    # before the names are overwritten. If the annotator files have no header
    # row, the first record of every file is dropped and this should become
    # `pd.read_csv(file, sep="\t", header=None, names=columns)` - confirm
    # against the input files.
    df = pd.read_csv(file, sep="\t")
    df.columns = columns
    basename = os.path.basename(file).replace('.txt', '')
    subset = df[["genic_region_type", "intensity"]]
    by_region = subset.groupby(by=["genic_region_type"])

    # Sum of intensities per region type. `numeric_only` is the first
    # positional parameter of GroupBy.sum, so it is passed explicitly rather
    # than handing it a column name.
    sum_intensities = by_region.sum(numeric_only=True)
    sum_intensities.columns = [f"{basename}"]
    raw_sum.append(sum_intensities)

    # Number of rows with a non-null intensity per region type
    count_intensities = by_region.count()
    count_intensities.columns = [f"{basename}"]
    raw_count.append(count_intensities)

    # Percentage of rows falling in each region type
    total = count_intensities.sum().values.item()
    percent = (count_intensities / total) * 100
    percent_by_count.append(percent)

    # Percentage of summed intensity falling in each region type
    total = sum_intensities.sum().values.item()
    percent = (sum_intensities / total) * 100
    percent_by_sum.append(percent)

# Align all files on the union of region types; a region type that is absent
# from a file is reported as 0 for that file.
raw_count_df = pd.concat(raw_count, axis=1, join='outer').fillna(value=0)
raw_sum_df = pd.concat(raw_sum, axis=1, join='outer').fillna(value=0)
percent_by_count_df = pd.concat(percent_by_count, axis=1, join='outer').fillna(value=0)
percent_by_sum_df = pd.concat(percent_by_sum, axis=1, join='outer').fillna(value=0)

  0%|          | 0/82 [00:00<?, ?it/s]

In [162]:
# Keep Bethyl and CST separately
def get_filtered_cols(df):
    """Shorten the column labels of a copy of `df`.

    Every label is split on "_". A label containing "Bethyl" or "CST" keeps
    its first two fields (re-joined with "_"); any other label keeps only its
    first field. `df` itself is left untouched.

    Returns
    -------
    tuple
        `(new_labels, renamed)`: the list of shortened labels and the copy of
        `df` that carries those labels as its columns.
    """
    def shorten(label):
        # Two leading fields for Bethyl/CST labels, one otherwise.
        n_fields = 2 if ("Bethyl" in label or "CST" in label) else 1
        return "_".join(label.split("_")[:n_fields])

    renamed = df.copy()
    new_labels = [shorten(label) for label in renamed.columns]
    renamed.columns = new_labels
    return new_labels, renamed

### Percentages by Summing Intensities

In [163]:
# Split the intensity-sum percentages into SPIDR and ENCODE columns, each sorted by column name
spidr = percent_by_sum_df[sorted([col for col in percent_by_sum_df.columns.tolist() if "_spidr_" in col])]
encode = percent_by_sum_df[sorted([col for col in percent_by_sum_df.columns.tolist() if "_encode_" in col])]

# Replace the full file-derived column names with their shortened form
new_spidr_cols, spidr = get_filtered_cols(spidr)
new_encode_cols, encode = get_filtered_cols(encode)

# Keep only the shortened names present in both datasets, in ENCODE column order
common_cols = [col for col in new_encode_cols if col in new_spidr_cols]
encode_percentage_by_int_sum = encode[common_cols]
spidr_percentage_by_int_sum = spidr[common_cols]

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output directory exists before the first write into it.
os.makedirs(os.path.join(git_root, "output"), exist_ok=True)
spidr_percentage_by_int_sum.to_csv(os.path.join(git_root, "output", "spidr_percent_by_intensity_sum.csv"))
encode_percentage_by_int_sum.to_csv(os.path.join(git_root, "output", "encode_percent_by_intensity_sum.csv"))

### Percentage by Counting Rows

In [164]:
# Column labels of the row-count percentages, split by dataset and sorted by name
spidr_labels = sorted(label for label in percent_by_count_df.columns.tolist() if "_spidr_" in label)
encode_labels = sorted(label for label in percent_by_count_df.columns.tolist() if "_encode_" in label)

# Select each dataset's columns and shorten their names
new_spidr_cols, spidr = get_filtered_cols(percent_by_count_df[spidr_labels])
new_encode_cols, encode = get_filtered_cols(percent_by_count_df[encode_labels])

# Shortened names present in both datasets, in ENCODE column order
common_cols = [label for label in new_encode_cols if label in new_spidr_cols]
raw_encode_percentage_by_row_count = encode[common_cols]
raw_spidr_percentage_by_row_count = spidr[common_cols]

# Write one CSV per dataset
row_count_dir = os.path.join(git_root, "output")
raw_encode_percentage_by_row_count.to_csv(os.path.join(row_count_dir, "encode_percent_by_row_count.csv"))
raw_spidr_percentage_by_row_count.to_csv(os.path.join(row_count_dir, "spidr_percent_by_row_count.csv"))

## Encode Percent Counts by Region Type from Original Paper

In [165]:
# Load the ENCODE supplementary spreadsheet; the first row is skipped so the
# following row supplies the column names
encode_supp_path = os.path.join(git_root, "annotated/Summary_info_encode_Suppl_Data_4.xlsx")
encode_supp_data = pd.read_excel(encode_supp_path, skiprows=1)

# Restrict the table to the 'K562' cell line
is_k562 = encode_supp_data['Cell Line'] == 'K562'
encode_supp_data = encode_supp_data[is_k562]

# Gene symbols, kept as a one-column dataframe so they can be joined below
gene_symb = encode_supp_data[['Official Gene Symbol']]

# The trailing 17 columns hold the total count and the counts of subsets (e.g. CDS, miRNA, etc)
region_cols = encode_supp_data.columns[-17:].tolist()
region_counts = encode_supp_data[region_cols]

# Join on the shared row index, then index the result by gene symbol
raw_counts = gene_symb.join(region_counts).set_index('Official Gene Symbol')
raw_counts.to_csv(os.path.join(git_root, "output", "encode_supp_raw_counts_by_region.csv"))

# Express every column after the first as a percentage of 'IDR peak #'
# NOTE(review): this assumes 'IDR peak #' is the first of the 17 trailing columns - confirm
subset_cols = raw_counts.columns[1:]
percent_counts = raw_counts[subset_cols].div(raw_counts['IDR peak #'], axis=0) * 100

# Transpose so genes become columns, then select the columns listed in
# `common_cols` (as last assigned by an earlier cell)
encode_supp = percent_counts.T[common_cols]

In [166]:
# Work with the intensity-sum percentages from here on
spidr = spidr_percentage_by_int_sum
encode = encode_percentage_by_int_sum

# Row labels (region names) of each dataframe
encode_supp_index = set(encode_supp.index.tolist())
encode_index = set(encode.index.tolist())
spidr_index = set(spidr.index.tolist())

# Region names shared by the ENCODE and SPIDR dataframes
# (the three-way intersection with the supplementary table is disabled)
# common_idx = encode_supp_index.intersection(encode_index).intersection(spidr_index)
common_idx = encode_index.intersection(spidr_index)

# DataFrame.filter returns rows in the order of `items`, and a set of strings
# has no stable iteration order across interpreter sessions. Filtering with a
# list that follows ENCODE's own row order keeps the row order of the written
# CSVs reproducible between runs.
common_idx_ordered = [label for label in encode.index if label in common_idx]

# Keep only the shared region names, in the same order for both dataframes
# encode_supp = encode_supp.filter(items=common_idx, axis=0)
encode = encode.filter(items=common_idx_ordered, axis=0)
spidr = spidr.filter(items=common_idx_ordered, axis=0)

In [167]:
# Drop columns whose name repeats an earlier column's name, keeping the first
# occurrence (the comparison is on the label, not on the column's values)
# encode_supp = encode_supp.loc[:, ~encode_supp.columns.duplicated()]
encode_first_seen = ~encode.columns.duplicated()
spidr_first_seen = ~spidr.columns.duplicated()
encode = encode.loc[:, encode_first_seen]
spidr = spidr.loc[:, spidr_first_seen]

In [168]:
output_root = os.path.join(git_root, "output")

# Write the filtered, de-duplicated percentage tables, one CSV per dataset
# (the export of the supplementary-table percentages is currently disabled)
# encode_supp.to_csv(os.path.join(output_root, "encode_supp_percent_by_region.csv"))
for frame, csv_name in (
    (encode, "encode_percent_by_region.csv"),
    (spidr, "spidr_percent_by_region.csv"),
):
    frame.to_csv(os.path.join(output_root, csv_name))