In [1]:
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
import joblib

ModuleNotFoundError: No module named 'seaborn'

In [None]:
# Names of the original input sources; reused below to order the bar plots
# and to select the matching rows of the stats table.
orig_sources = ['baylor', 'giab', 'pacbio', 'ucsd1', 'ucsd2']
# Tab-separated per-source region statistics.
data = pd.read_csv("../data/region_stats.txt", delimiter='\t')
data.head(5)

In [None]:
# Per-source input statistics as a 2x2 figure:
# top row = raw input bed files, bottom row = the same inputs after merging.
m_fontsz = 10

# Regions kept per source = total minus removed. Computed up front (not in
# the middle of the plotting code) because later cells also read this column.
data['kept_count'] = data['total'] - data['removed']

# (column to plot, panel title, y-axis label), listed in row-major panel order.
panels = [
    ("count", "Input count", "Count"),
    ("tot_len", "Input span (bp)", "Total Span"),
    ("kept_count", "Input merged count", "Count"),
    ("span_kept", "Input merged span (bp)", "Total Span"),
]

fig, axs = plt.subplots(2, 2, dpi=180)
# Draw onto each Axes explicitly instead of switching the pyplot "current
# axes" with plt.sca() before every call.
for ax, (column, title, ylabel) in zip(axs.flat, panels):
    sb.barplot(data=data, x="source", y=column, order=orig_sources, ax=ax)
    ax.set_title(title, fontsize=m_fontsz)
    ax.set(ylabel=ylabel)

fig.tight_layout()

In [None]:
# Stats table restricted to the original input sources: raw count/span next
# to the post-merge count/span.
data.loc[
    data["source"].isin(orig_sources),
    ["source", 'count', 'tot_len', 'kept_count', 'span_kept'],
]

In [None]:
# Count and total span of the row whose source is "grand"
# (all input sources merged together).
data.loc[data["source"].eq("grand"), ["count", "tot_len"]]

In [None]:
# Count and total span of the "final" row, i.e. the grand unified bed after
# gap filtering.
data.loc[data['source'].eq('final'), ["count", "tot_len"]]

# TRF annotations

In [None]:
# Tandem-repeat regions, read as a headerless 3-column BED (chrom, start, end).
regions = pd.read_csv("../data/tr_regions.bed.gz", sep='\t', names=["chrom", "start", "end"])
# Annotation table, one row per annotation.
# NOTE(review): joblib.load unpickles the file, so it must only ever be
# pointed at a trusted, locally produced artifact.
annos = joblib.load("../data/tr_annotated.jl")
# "chrom:start-end" key built from the in_region_* columns (presumably the
# coordinates of the region containing each annotation - confirm).
# Zipping the three columns gives the same formatting as a row-wise
# DataFrame.apply(axis=1) without constructing a Series per row, and it also
# works when the table is empty.
annos['key'] = [
    f"{chrom}:{start}-{end}"
    for chrom, start, end in zip(
        annos['chrom'], annos['in_region_start'], annos['in_region_end']
    )
]

In [None]:
# Headline numbers: how many regions there are, how many distinct region keys
# carry at least one annotation, and how many annotation rows exist in total.
region_count = regions.shape[0]
anno_count = annos['key'].nunique(dropna=False)
print(
    f"We have a total of {region_count} regions",
    f"We annotated {anno_count} regions",
    f"With a total of {len(annos)} annotations...",
    sep="\n",
)
# Summed length (end - start) over all annotation rows.
total_anno_span = annos["end"].sub(annos["start"]).sum()
print(f"... spanning {total_anno_span}bp")
# Share of regions with at least one annotation, as a percentage.
pct_annotated = anno_count / region_count * 100
print("Annotation percent %.2f%%" % (pct_annotated))

In [None]:
# Number of annotations per region, summarized into labelled bins.

annos_per_region = annos.groupby('key').size()
labels = ["1", "[2,5)", "[5,10)", "10+"]
# Left-closed bins (right=False). The last edge is open-ended so the "10+"
# label is accurate: with a finite cap, pd.cut returns NaN for any region at
# or above the cap and those regions silently disappear from the count plot.
bin_edges = [1, 2, 5, 10, float("inf")]
bins = pd.cut(annos_per_region, bins=bin_edges, labels=labels, right=False)
# One row per region: its raw annotation count and the bin it falls in.
view = pd.concat([annos_per_region, bins], axis=1)
view.columns = ["Count", "Bin"]
p = sb.countplot(data=view, x="Bin")
hide = p.set(title="Number of annotations per-region", xlabel="Number of Annotations", ylabel="Region Count")

In [None]:
# Count the region keys that appear on more than one annotation row.
regions_with_gt1 = annos['key'].value_counts().gt(1).sum()
print(f"We have {regions_with_gt1} regions with more than one TRF entry")

# Source to regions summary
How many of each source's merged.bed regions overlap tr_regions.bed?

In [None]:
# Load the precomputed intersection table; the cells below read its
# 'intersection', 'source', 'ro' and 'count' columns.
# NOTE(review): joblib.load unpickles its input - only load trusted files.
inter = joblib.load("../data/intersection.jl")

In [None]:
# Sum the 'count' column over rows with a non-zero intersection, grouped by
# source and 'ro', then pivot so each 'ro' value becomes its own column.
summary_of_intersection = (
    inter.loc[inter['intersection'] != 0]
    .groupby(['source', 'ro'])['count']
    .sum()
    .unstack()
)

In [None]:
# Per-source number of kept input regions, indexed by source so it can be
# joined onto the intersection summary.
input_counts = data.set_index("source")[['kept_count']].rename(
    columns={'kept_count': 'input_count'}
)
summary_of_intersection = summary_of_intersection.join(input_counts, how='left')
# Positional rename: this relies on the summary holding exactly two 'ro'
# columns followed by the joined input_count column.
summary_of_intersection.columns = ['in_tr_regions', 'in_tr_regions (50%ro)', 'input_count']

# Despite the "pct" names these are fractions of the input count (not
# multiplied by 100).
summary_of_intersection['pct_in'] = summary_of_intersection['in_tr_regions'].div(
    summary_of_intersection['input_count']
)
summary_of_intersection['pct_in(50)'] = summary_of_intersection['in_tr_regions (50%ro)'].div(
    summary_of_intersection['input_count']
)
summary_of_intersection

In [None]:
# And I want to do this again but with the annotated hits.
# This is harder because merged vs unmerged...

In [None]:
# Separate out regions that don't have annotations
# I'd like to