In [40]:
import pandas as pd
import pathlib as pl
import io
import shutil as sh

# Directory that is searched further down for "*.errors.tsv" tables.
error_dir = pl.Path("/home/ebertp/work/projects/sig_chry/error_annotations")
# Directory that is searched further down for "*.tsv.gz" variant call tables
# (judging by the path, PAV calls of "freeze3" -- not verified here).
vc_dir = pl.Path("/home/ebertp/work/projects/sig_chry/variant_calls/PAV/freeze3/tsv")

def extract_tig_coords(row):
    """Parse the ``TIG_REGION`` field of a row into name and coordinates.

    The field is expected to look like ``<name>:<start>-<end>``. The name is
    separated at the *last* colon, so names that themselves contain colons
    are handled instead of raising.

    Args:
        row: Mapping-like object (e.g. a pandas row) with a "TIG_REGION" key
            holding a string.

    Returns:
        Tuple ``(tig_name, start, end + 1)`` where both coordinates are ints.

    Raises:
        ValueError: If the ":" or "-" separator is missing, if the coordinate
            part contains more than one "-", or if a coordinate is not an
            integer.
    """
    # Only the right-most ":" separates the name from the coordinate part.
    tig_name, coords = row["TIG_REGION"].rsplit(":", 1)
    start, end = coords.split("-")
    # The end is shifted by one; presumably this turns an inclusive end into
    # a half-open interval -- TODO confirm against the input table format.
    return tig_name, int(start), int(end) + 1


# Merge the per-table variant calls into one coordinate-sorted table.
# The tables are visited in sorted order: directory listing order depends on
# the filesystem, and a fixed input order removes that source of run-to-run
# variation in the merged output.
vc_tables = sorted(vc_dir.glob("*.tsv.gz"))
if not vc_tables:
    # pd.concat() on an empty list fails with a message that does not say
    # which input was missing, so fail early with the searched location.
    raise ValueError(f"no variant call tables (*.tsv.gz) found in {vc_dir}")

all_variants = []
for vc_table in vc_tables:
    df = pd.read_csv(vc_table, sep="\t", header=0)
    # Split the TIG_REGION string of each row into three separate columns.
    df[["tig_name", "tig_start", "tig_end"]] = df.apply(extract_tig_coords, result_type="expand", axis=1)
    # Keep only the coordinates plus the variant ID, type and length.
    df = df[["tig_name", "tig_start", "tig_end", "ID", "SVTYPE", "SVLEN"]]
    all_variants.append(df)

merged_variants = pd.concat(all_variants, axis=0, ignore_index=False)
merged_variants.sort_values(["tig_name", "tig_start", "tig_end"], ascending=True, inplace=True)

# Written next to the input directory, tab-separated, without header and index.
merged_variants.to_csv(vc_dir.parent.joinpath("all_variants.tsv"), header=False, index=False, sep="\t")

# Merge the per-sample error tables into one coordinate-sorted table.
# File name prefixes (the text before the first ".") that are left out.
skip_samples = {"SAMPLES", "NA24385", "HG03456"}

# Collected in a separate list so that ``all_errors`` only ever refers to the
# merged DataFrame. Files are visited in sorted order because directory
# listing order depends on the filesystem.
error_frames = []
for error_table in sorted(error_dir.glob("*.errors.tsv")):
    sample = error_table.name.split(".")[0]
    if sample in skip_samples:
        print("skipping ", sample)
        continue
    df = pd.read_csv(error_table, sep="\t", header=0)
    error_frames.append(df)

if not error_frames:
    # pd.concat() on an empty list fails with a message that does not say
    # which input was missing, so fail early with the searched location.
    raise ValueError(f"no error tables (*.errors.tsv) left to merge in {error_dir}")

all_errors = pd.concat(error_frames, axis=0, ignore_index=False)
all_errors.sort_values(["chrom", "start", "end"], ascending=True, inplace=True)

# Written next to the input directory, tab-separated, without header and index.
all_errors.to_csv(error_dir.parent.joinpath("all_errors.tsv"), header=False, index=False, sep="\t")

# Load the table at the hard-coded path below. It is read without a header
# row, so its columns are labelled by integer position.
isect_dist = pl.Path("/home/ebertp/work/projects/sig_chry/intersect_dist.tsv")
df = pd.read_csv(isect_dist, header=None, sep="\t")

# Show the first rows, then summary statistics of the column labelled 13
# (presumably a distance value, judging by the file name -- TODO confirm).
head_rows = df.head()
print(head_rows)
col13_stats = df[13].describe()
print(col13_stats)
