In [1]:
import os
import glob

# Folder that holds the per-replicate VCF files
vcf_directory = "/Users/asjaeger/Desktop/project_source/hpai_PA/seq/illumina-pipeline/for_SNPgenie/shared_vars_only_test/"

# Collect every *.vcf path directly inside that folder
vcf_pattern = os.path.join(vcf_directory, "*.vcf")
vcf_files = glob.glob(vcf_pattern)

# Bucket the paths by sample name: the file name with its last "_"-separated
# piece (replicate tag plus extension) cut off. A file name containing no "_"
# at all ends up under the empty-string key.
sample_groups = {}
for vcf in vcf_files:
    filename = os.path.basename(vcf)
    base_sample = filename.rpartition("_")[0]
    sample_groups.setdefault(base_sample, []).append(vcf)

# Function to parse VCF and extract variants
def get_variants(vcf_file):
    """Return the set of variant keys found in a VCF file.

    Each data line (any line that does not start with "#") is stripped of
    surrounding whitespace and reduced to its first five tab-separated
    columns -- CHROM, POS, ID, REF, ALT -- re-joined with tabs. That string
    is the key used to decide whether two files contain the same variant.

    Blank or whitespace-only lines are skipped; otherwise they would be
    recorded as an empty-string "variant" and could be reported as shared
    between files.

    Args:
        vcf_file: Path of the VCF file to read.

    Returns:
        A set of tab-joined key strings, one per distinct variant.
    """
    variants = set()
    with open(vcf_file, "r") as infile:
        for line in infile:
            if line.startswith("#"):  # Header / metadata line
                continue
            record = line.strip()
            if not record:  # Stray blank line, not a variant
                continue
            # Only the first five columns are needed, so stop splitting there.
            variants.add("\t".join(record.split("\t", 5)[:5]))
    return variants

# Compare replicates and write new filtered VCFs
for sample, replicates in sample_groups.items():
    if len(replicates) < 2:
        # A sample with a single file has nothing to be compared against.
        continue

    print(f"Processing replicates for sample: {sample}")

    # Variants present in every replicate: start from the first file's set
    # and intersect it with each remaining replicate.
    shared_variants = get_variants(replicates[0])
    for replicate in replicates[1:]:
        shared_variants &= get_variants(replicate)

    # Write one filtered copy per replicate, next to the original file.
    for vcf in replicates:
        # Swap only the trailing extension. str.replace(".vcf", ...) would
        # also rewrite any ".vcf" appearing in a directory name or earlier
        # in the file name, producing a wrong output path.
        output_file = os.path.splitext(vcf)[0] + "_shared.vcf"

        # Open the input first so an unreadable input does not leave an
        # empty output file behind.
        with open(vcf, "r") as infile, open(output_file, "w") as outfile:
            for line in infile:
                if line.startswith("#"):
                    outfile.write(line)  # Header lines are copied unchanged
                    continue
                # Same key as get_variants: CHROM, POS, ID, REF, ALT
                variant_info = "\t".join(line.strip().split("\t")[:5])
                if variant_info in shared_variants:
                    outfile.write(line)  # Keep only shared variants

        print(f"Created filtered VCF: {output_file}")

print("Filtering complete!")


Processing replicates for sample: be_w1
Created filtered VCF: /Users/asjaeger/Desktop/project_source/hpai_PA/seq/illumina-pipeline/for_SNPgenie/shared_vars_only_test/be_w1_replicate-1_shared.vcf
Created filtered VCF: /Users/asjaeger/Desktop/project_source/hpai_PA/seq/illumina-pipeline/for_SNPgenie/shared_vars_only_test/be_w1_replicate-2_shared.vcf
Filtering complete!
