# Filter the initial COLO829BLT in-vitro variants: remove variants that are present in COLO829T or COLO829BL (Illumina and PacBio). COLO829BL-specific variants are saved to a separate VCF file.

In [2]:
from granite.lib import vcf_parser

def load_variants(vcf):
    """Collect the IDs of variants whose ALT is a single A/C/G/T base.

    Args:
        vcf: Value handed straight to ``vcf_parser.Vcf`` (the notebook
            passes VCF file paths).

    Returns:
        A set of ``"CHROM_POS_REF_ALT"`` strings, one per variant whose
        ALT field equals exactly ``"A"``, ``"C"``, ``"G"`` or ``"T"``.
        Only ALT is checked; REF is not filtered.
    """
    reader = vcf_parser.Vcf(vcf)
    single_base_alts = ("T", "C", "G", "A")
    return {
        f"{record.CHROM}_{record.POS}_{record.REF}_{record.ALT}"
        for record in reader.parse_variants()
        if record.ALT in single_base_alts
    }


In [3]:
import gzip

# For each of the five "way" call sets:
#   1. load the candidate in-vitro variants (single-base ALT only, see
#      load_variants) and the subset overlapping the negative control;
#   2. read the pileup ("PU") VCF and look at the alt-allele depth (last AD
#      entry) of every candidate in the COLO829BL and COLO829T bulk BAMs
#      (Illumina and PacBio);
#   3. write candidates seen in COLO829BL but not in COLO829T to the
#      BL-only VCF, and candidates seen in neither bulk sample that also
#      overlap the negative control to the final in-vitro VCF, tagged with
#      GCCSup=<way>.
ways = ["1", "2", "3", "4", "5"]

analysis_dir = "/n/data1/hms/dbmi/park/jiny/SMaHT/COLO829/invitro_variants/Utah_Steph_analysis"

# Sample identifiers used to look up genotype columns in the PU VCF. They do
# not depend on the way, so they are defined once outside the loop.
COLO829BL_illumina = "/n/data1/hms/dbmi/park/jiny/SMaHT/COLO829/TruthSet/input/merge_three_bams/COLO829BL_Ill_230X.bam"
COLO829BL_pacbio = "/n/data1/hms/dbmi/park/jiny/SMaHT/COLO829/truthset_updated/bams/COLO829BL_Hifi.bam"

COLO829T_illumina = "/n/data1/hms/dbmi/park/jiny/SMaHT/COLO829/TruthSet/input/merge_three_bams/COLO829T_Ill_200X.bam"
COLO829T_pacbio = "/n/data1/hms/dbmi/park/jiny/SMaHT/COLO829/truthset_updated/bams/COLO829T_Hifi.bam"


def _alt_depth(vnt, sample):
    """Return the last comma-separated AD value of `sample` as an int."""
    return int(vnt.get_genotype_value(sample, "AD").split(",")[-1])


# The outputs are named *.vcf.gz, so write them gzip-compressed (text mode)
# instead of as plain text behind a .gz extension.
# NOTE: this is plain gzip, not bgzip; re-compress with bgzip if a tabix
# index is needed.
with gzip.open(f"{analysis_dir}//BL_only/COLO829BLT.BLonly.vcf.gz", "wt") as blonly_vcf, \
        gzip.open(f"{analysis_dir}//in-vitro-final/COLO829BLT.finalInVitro.vcf.gz", "wt") as invitrofinal_vcf:
    for way in ways:
        input_vcf_path = f"{analysis_dir}/input/masked_blt50.2read_cutoff.way_{way}.vcf.gz"

        colo829_invitro_variants_vcf = vcf_parser.Vcf(input_vcf_path)
        colo829_invitro_validated = load_variants(input_vcf_path)
        negative_control_overlap = load_variants(f"{analysis_dir}/analysis/intersect_negativecontrol/after_update_Sep21/masked_blt50.2read_cutoff.way_{way}.intersectnc.vcf")

        pu_vcf = vcf_parser.Vcf(f"{analysis_dir}/PU_norm/masked_blt50.2read_cutoff.way_{way}.PU.norm.vcf")

        # The header is written once per output file, taken from the first way.
        if way == ways[0]:
            colo829_invitro_variants_vcf.write_header(blonly_vcf)
            colo829_invitro_variants_vcf.write_header(invitrofinal_vcf)

        # Sets give O(1) membership tests in the write loop below.
        pu_variants = set()  # candidates with alt reads in BL or T
        bl_only = set()      # candidates with alt reads in BL and none in T

        for vnt in pu_vcf.parse_variants():
            vnt_id = f"{vnt.CHROM}_{vnt.POS}_{vnt.REF}_{vnt.ALT}"
            if vnt_id not in colo829_invitro_validated:
                continue

            seen_in_bl = (
                _alt_depth(vnt, COLO829BL_pacbio) > 0
                or _alt_depth(vnt, COLO829BL_illumina) > 0
            )
            seen_in_t = (
                _alt_depth(vnt, COLO829T_illumina) > 0
                or _alt_depth(vnt, COLO829T_pacbio) > 0
            )

            if seen_in_bl or seen_in_t:
                pu_variants.add(vnt_id)
            if seen_in_bl and not seen_in_t:
                bl_only.add(vnt_id)

        # Keep candidates with no alt support in either bulk sample that are
        # also present in the negative-control intersection.
        final_invitro = (colo829_invitro_validated - pu_variants) & negative_control_overlap

        # The Vcf object created above has not been iterated yet, so it is
        # reused here rather than re-opening the same file.
        for vnt in colo829_invitro_variants_vcf.parse_variants():
            vnt_id = f"{vnt.CHROM}_{vnt.POS}_{vnt.REF}_{vnt.ALT}"
            if vnt_id in bl_only:
                colo829_invitro_variants_vcf.write_variant(blonly_vcf, vnt)
            if vnt_id in final_invitro:
                # Record which way supported this variant.
                vnt.add_tag_info(f"GCCSup={way}")
                colo829_invitro_variants_vcf.write_variant(invitrofinal_vcf, vnt)
