# core

> Helpers for counting and filtering variants in VCF files with bcftools

In [None]:
#| hide
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
#| default_exp core

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
import subprocess
import os

def count_variants(vcf_file):
    """Count the variant records (non-header lines) in a VCF file.

    Plain-text files are scanned directly in Python: every line that does
    not start with ``#`` is counted. Files whose name ends in ``.gz`` are
    streamed through ``bcftools view -H`` and the emitted lines are counted.

    No shell is involved, so paths containing spaces or shell
    metacharacters are handled safely.

    Args:
        vcf_file: Path to the VCF file (``str`` or ``os.PathLike``).

    Returns:
        The number of non-header lines as an ``int``.

    Raises:
        FileNotFoundError: If a plain-text file does not exist, or if the
            ``bcftools`` executable cannot be found for a ``.gz`` file.
        subprocess.CalledProcessError: If ``bcftools`` exits with a
            non-zero status.
    """
    path = os.fspath(vcf_file)

    if not path.endswith('.gz'):
        # Binary mode avoids any decoding errors; only the first byte of
        # each line matters for telling header lines from records.
        with open(path, 'rb') as handle:
            return sum(1 for line in handle if not line.startswith(b'#'))

    # Stream bcftools' output instead of buffering it, so large files do
    # not have to fit in memory just to be counted.
    with subprocess.Popen(
        ['bcftools', 'view', '-H', path], stdout=subprocess.PIPE
    ) as proc:
        count = sum(1 for _ in proc.stdout)

    # Leaving the ``with`` block waits for the process, so returncode is set.
    if proc.returncode:
        raise subprocess.CalledProcessError(proc.returncode, proc.args)
    return count

def filter_variants(
    input_vcf="../data/freebayes.annotated_pc1.vcf.gz",
    final_vcf="../data/filtered_final.vcf",
    min_qual=30,
    min_dp=30,
    max_dp=150,
):
    """Run a three-stage bcftools filtering pipeline and report counts.

    Stages:
        1. ``bcftools filter -e 'QUAL < min_qual'`` on ``input_vcf``.
        2. ``bcftools view -i 'FMT/DP >= min_dp & FMT/DP <= max_dp'``.
        3. ``bcftools view -v snps,indels``.

    After every stage the number of remaining records is obtained with
    ``count_variants`` and a progress line is printed. Intermediate files
    (``filtered_qual.vcf``, ``filtered_dp.vcf``, ``filtered_snp.vcf``) are
    written next to ``final_vcf``; the last one is then moved to
    ``final_vcf``. The first two intermediates are left in place.

    Calling the function with no arguments uses the original hard-coded
    paths and thresholds.

    Args:
        input_vcf: Path of the VCF to filter.
        final_vcf: Path of the final filtered VCF.
        min_qual: Records with ``QUAL`` below this value are excluded.
        min_dp: Lower bound used in the ``FMT/DP`` include expression.
        max_dp: Upper bound used in the ``FMT/DP`` include expression.

    Returns:
        The number of records left after the last stage.

    Raises:
        subprocess.CalledProcessError: If any bcftools invocation exits
            with a non-zero status.
    """
    out_dir = os.path.dirname(final_vcf)
    qual_vcf = os.path.join(out_dir, "filtered_qual.vcf")
    dp_vcf = os.path.join(out_dir, "filtered_dp.vcf")
    snp_vcf = os.path.join(out_dir, "filtered_snp.vcf")

    print("======================================")
    print("Starting Variant Filtering Process")
    print("======================================")

    # Count initial number of variants
    start_count = count_variants(input_vcf)
    print(f"Total variants before filtering: {start_count}")

    # Step 1: Filter out low-quality variants (QUAL < min_qual).
    # Argument lists (no shell) keep paths with spaces safe, and check=True
    # stops the pipeline instead of continuing with missing/stale files.
    subprocess.run(
        ["bcftools", "filter", "-e", f"QUAL < {min_qual}", input_vcf, "-o", qual_vcf],
        check=True,
    )
    qual_count = count_variants(qual_vcf)
    print(f"Stage 1: QUAL filtering: {start_count - qual_count} Variants removed and {qual_count} variants left")

    # Step 2: Keep variants by per-sample depth (min_dp <= FORMAT/DP <= max_dp)
    subprocess.run(
        [
            "bcftools", "view", "-i",
            f"FMT/DP >= {min_dp} & FMT/DP <= {max_dp}",
            qual_vcf, "-o", dp_vcf,
        ],
        check=True,
    )
    dp_count = count_variants(dp_vcf)
    print(f"Stage 2: FORMAT/DP filtering, DP >= {min_dp} & DP <= {max_dp}: {qual_count - dp_count} Variants removed and {dp_count} variants left")

    # Step 3: Retain SNPs and indels (Remove other variant types if any)
    subprocess.run(
        ["bcftools", "view", "-v", "snps,indels", dp_vcf, "-o", snp_vcf],
        check=True,
    )
    snp_count = count_variants(snp_vcf)
    print(f"Stage 3: After keeping SNPs and indels: {dp_count - snp_count} Variants removed and {snp_count} variants left")

    # Move the last stage's output to its final name. os.replace overwrites
    # an existing destination on every platform (os.rename fails on Windows).
    os.replace(snp_vcf, final_vcf)

    # The move does not change the file's content, so the stage-3 count is
    # the final count; no need to scan the file again.
    return snp_count


In [None]:
#| hide
import nbdev; nbdev.nbdev_export()