In [None]:
# Calculate allele frequencies from VCF (Variant Call Format) files generated by different variant-calling tools.
# The script reads VCF files, extracts the fields needed for allele frequencies, computes the frequencies of the reference and alternate alleles, and stores the results in CSV files.
# It is built to handle VCF files from multiple tools, such as DeepVariant, Strelka, Octopus, and Freebayes.

import pandas as pd
from cyvcf2 import VCF

def _as_count_list(value):
    """Return an INFO value as a list of counts (``None`` -> empty list)."""
    if value is None:
        return []
    if isinstance(value, (tuple, list)):
        return list(value)
    # Any scalar (int or float) becomes a one-element list.
    return [value]


def calculate_allele_frequencies(vcf_path, ref_field, alt_field, total_depth_field=None) -> pd.DataFrame:
    """Compute per-variant reference and alternate allele frequencies.

    Counts are read from the INFO column of every record in the VCF at
    ``vcf_path``.  Per-sample FORMAT values are not consulted.
    NOTE(review): if a caller emits these counts only as FORMAT fields, every
    record is skipped and the result is empty -- confirm against the inputs.

    Args:
        vcf_path: Path of the VCF file to read.
        ref_field: INFO key holding the reference-allele count.
        alt_field: INFO key holding the alternate-allele count(s).  When it
            equals ``ref_field`` the value is treated as one combined vector
            with the reference count first and the alternate counts after it.
        total_depth_field: Optional INFO key used as the denominator.  When
            omitted (or absent from a record) the denominator is the sum of
            the reference and alternate counts.

    Returns:
        A DataFrame with one row per retained variant and the columns
        ``Position`` (``CHROM:POS``), ``Ref Freq`` and ``Alt Freq 1..n``.
        Records with none of the requested fields, or with a zero
        denominator, are skipped.
    """
    # A single field for both roles means a combined "ref, alt1, alt2, ..."
    # vector; using the whole vector as the reference count would be wrong.
    shared_field = ref_field == alt_field
    allele_frequencies = []

    vcf = VCF(vcf_path)
    try:
        for variant in vcf:
            info = variant.INFO
            total_depth = info.get(total_depth_field) if total_depth_field else None
            raw_ref = info.get(ref_field)
            raw_alt = raw_ref if shared_field else info.get(alt_field)

            if total_depth is None and raw_ref is None and raw_alt is None:
                continue

            ref_values = _as_count_list(raw_ref)
            ref_count = ref_values[0] if ref_values else None
            if shared_field:
                alt_counts = ref_values[1:]
            else:
                alt_counts = _as_count_list(raw_alt)

            if total_depth is not None:
                denominator = total_depth
            else:
                # A missing reference count contributes nothing to the total.
                denominator = (ref_count or 0) + sum(alt_counts)

            # Zero depth / zero observed alleles: no frequency can be defined.
            if not denominator:
                continue

            ref_freq = ref_count / denominator if ref_count is not None else 0
            alt_freqs = [count / denominator for count in alt_counts]

            allele_frequencies.append({
                'Position': f"{variant.CHROM}:{variant.POS}",
                'Ref Freq': ref_freq,
                **{f"Alt Freq {i+1}": freq for i, freq in enumerate(alt_freqs)}
            })
    finally:
        # Release the underlying file handle even if a record fails to parse.
        vcf.close()

    return pd.DataFrame(allele_frequencies)

# Example runs, one per variant caller. Each call names the INFO keys that
# hold the counts for that caller's VCF, then writes the table to CSV.

# DeepVariant and Strelka: AD supplies both ref and alt counts, DP the depth.
frequencies_df_dv = calculate_allele_frequencies(
    '/home/dingrongruo.yu/liz.9.11.19_GMVLE/results/variants/HG003_NA24149_Ashkenazim_father.trim.dv.vcf',
    'AD',
    'AD',
    'DP',
)
frequencies_df_dv.to_csv(
    '/home/dingrongruo.yu/liz.9.11.19_GMVLE/features/output_dv.csv',
    index=False,
)

# Octopus: AN is passed as the ref field and AC as the alt field; no depth field.
# NOTE(review): AN is conventionally the total allele number rather than a
# reference-allele count, so 'Ref Freq' here may not be a reference fraction
# -- confirm this field choice is intended.
frequencies_df_oc = calculate_allele_frequencies(
    '/home/dingrongruo.yu/liz.9.11.19_GMVLE/results/variants/HG003_NA24149_Ashkenazim_father.trim.oc.vcf',
    'AN',
    'AC',
)
frequencies_df_oc.to_csv(
    '/home/dingrongruo.yu/liz.9.11.19_GMVLE/features/output_oc.csv',
    index=False,
)

# Freebayes: RO for the ref count, AO for the alt counts, DP as the depth.
frequencies_df_fb = calculate_allele_frequencies(
    '/home/dingrongruo.yu/liz.9.11.19_GMVLE/results/variants/HG003_NA24149_Ashkenazim_father.trim.fb.vcf',
    'RO',
    'AO',
    'DP',
)
frequencies_df_fb.to_csv(
    '/home/dingrongruo.yu/liz.9.11.19_GMVLE/features/output_fb.csv',
    index=False,
)
