# In [None]:
import cyvcf2
import requests
import csv
import pickle

def parse_vcf_entry(entry):
    """
    Expand one VCF record into a list of per-allele dictionaries.

    Every alternate allele in ``entry.ALT`` yields one dictionary that holds
    the allele under ``"alt"`` followed by the record-level fields
    ``"chrom"``, ``"pos"``, ``"ref"`` and ``"depth"`` (the value returned by
    ``entry.INFO.get('DP')``). A record without alternate alleles yields an
    empty list.
    """
    # Record-level values are read once and shared by every allele row.
    shared_fields = (
        ("chrom", entry.CHROM),
        ("pos", entry.POS),
        ("ref", entry.REF),
        ("depth", entry.INFO.get('DP')),
    )

    per_allele_rows = []
    for allele in entry.ALT:
        row = {"alt": allele}
        row.update(shared_fields)
        per_allele_rows.append(row)
    return per_allele_rows

def query_ensembl(variant, timeout=30):
    """
    Queries the Ensembl GRCh37 VEP REST endpoint for a single variant.

    The variant is submitted as a VCF-style line
    ("CHROM POS ID REF ALT . . ."), which is the input format documented for
    ``POST vep/:species/region``. The VCF position is sent unchanged: VCF
    coordinates are already 1-based, so no offset must be applied (the
    previous ``pos + 1`` shifted every lookup onto the neighbouring base).

    Args:
        variant (dict): Must provide the keys ``chrom``, ``pos``, ``ref``
            and ``alt``.
        timeout (float): Seconds to wait for the server before giving up, so
            an unresponsive service cannot stall the whole run.

    Returns:
        The decoded JSON body when the server answers with HTTP 200,
        otherwise an empty dict. Failures are reported on stdout and never
        raised, so a caller can keep processing the remaining variants.
    """
    url = "http://grch37.rest.ensembl.org/vep/human/region"
    headers = {"Content-Type": "application/json", "Accept": "application/json"}

    # "." fills the ID, QUAL, FILTER and INFO columns, which VEP does not need.
    vcf_line = f"{variant['chrom']} {variant['pos']} . {variant['ref']} {variant['alt']} . . ."
    payload = {"variants": [vcf_line]}

    try:
        response = requests.post(url, json=payload, headers=headers, timeout=timeout)
    except requests.RequestException as e:
        print(f"An error occurred during the API call: {e}")
        return {}

    if response.status_code != 200:
        # Surface rejected requests instead of silently dropping them.
        print(f"Ensembl VEP request for '{vcf_line}' failed with HTTP status {response.status_code}")
        return {}

    try:
        return response.json()
    except ValueError as e:
        # A 200 response with a non-JSON body is treated like any other failed call.
        print(f"An error occurred during the API call: {e}")
        return {}

def write_to_csv(data, filename):
    """
    Dumps the processed rows into a CSV file with a fixed column layout.

    ``data`` is an iterable of dictionaries; keys that a row does not provide
    are left empty in the output. The file at ``filename`` is overwritten.
    """
    columns = [
        "chrom", "pos", "ref", "alt", "depth",
        "alt_reads", "percent_alt_reads", "percent_ref_reads",
        "gene", "variant_effect",
        "minor_allele", "minor_allele_frequency",
        "somatic", "id",
    ]
    # newline='' lets the csv module control line endings itself.
    with open(filename, 'w', newline='') as handle:
        csv_out = csv.DictWriter(handle, fieldnames=columns)
        csv_out.writeheader()
        csv_out.writerows(data)

def process_ensembl_data(ensembl_data_item, vcf_data):
    """
    Process a single item from Ensembl data to fit the CSV structure.

    Args:
        ensembl_data_item (dict): A single item from Ensembl response. Only
            the optional keys ``transcript_consequences`` and
            ``colocated_variants`` are read.
        vcf_data (dict): The VCF data for the corresponding variant. The
            optional keys ``depth`` and ``alt_reads`` are read; either may be
            missing, ``None`` or an empty string.

    Returns:
        dict: Processed Ensembl data for CSV output. Every field defaults to
        an empty string when the underlying information is unavailable. The
        read percentages are only filled in when both a positive depth and an
        alternate read count are known, so an unknown read count is never
        reported as "0.0" / "100.0".
    """
    processed_data = {
        "gene": '',
        "variant_effect": '',
        "minor_allele": '',
        "minor_allele_frequency": '',
        "somatic": '',
        "alt_reads": '',
        "percent_alt_reads": '',
        "percent_ref_reads": ''
    }

    # Gene and effect come from the first transcript consequence; an absent
    # or empty list simply leaves both fields blank.
    consequences = ensembl_data_item.get('transcript_consequences') or []
    if consequences:
        first_consequence = consequences[0]
        processed_data['gene'] = first_consequence.get('gene_symbol', '')
        processed_data['variant_effect'] = first_consequence.get('impact', '')

    # Minor allele information comes from the first co-located variant.
    colocated_variants = ensembl_data_item.get('colocated_variants') or []
    if colocated_variants:
        first_variant = colocated_variants[0]
        processed_data['minor_allele'] = first_variant.get('minor_allele', '')
        processed_data['minor_allele_frequency'] = first_variant.get('minor_allele_freq', '')

    # Flagged as somatic when any co-located variant carries a 'somatic' key.
    processed_data['somatic'] = '1' if any('somatic' in var for var in colocated_variants) else ''

    # Depth may legitimately be None (no DP in the VCF record); treat that
    # as "unknown" rather than passing it to int().
    raw_depth = vcf_data.get('depth')
    depth = int(raw_depth) if raw_depth not in (None, '') else 0

    if depth > 0:
        raw_alt_reads = vcf_data.get('alt_reads')
        if raw_alt_reads not in (None, ''):
            alt_reads = int(raw_alt_reads)
            processed_data['alt_reads'] = raw_alt_reads
            processed_data['percent_alt_reads'] = str(round(alt_reads / depth * 100, 2))
            processed_data['percent_ref_reads'] = str(round((depth - alt_reads) / depth * 100, 2))

    return processed_data


def main(vcf_file, output_csv, output_pkl, full=False):
    """
    Main function to process VCF file and write to CSV and pickle files.

    Every alternate allele of every processed record is sent to
    ``query_ensembl``. Each annotation item returned produces one output row
    that merges the VCF fields with ``process_ensembl_data``; when no
    annotation list comes back, the bare VCF fields are kept as a single row.

    Args:
        vcf_file: Path of the VCF file, opened with ``cyvcf2.VCF``.
        output_csv: Destination path handed to ``write_to_csv``.
        output_pkl: Destination path of the pickled list of row dictionaries.
        full: When true the whole file is processed; otherwise processing
            stops after the first 21 records (preview mode).
    """
    # NOTE(review): the original cut-off was "count > 20" evaluated after the
    # increment, i.e. 21 records; preserved as-is — confirm 21 (not 20) is intended.
    preview_record_limit = 21

    vcf_reader = cyvcf2.VCF(vcf_file)
    all_data = []

    try:
        for records_seen, record in enumerate(vcf_reader, start=1):
            for vcf_data in parse_vcf_entry(record):
                annotations = query_ensembl(vcf_data)

                if isinstance(annotations, list) and annotations:
                    all_data.extend(
                        {**vcf_data, **process_ensembl_data(annotation, vcf_data)}
                        for annotation in annotations
                    )
                else:
                    # Keep the variant even when it could not be annotated.
                    all_data.append({**vcf_data})

            if not full and records_seen >= preview_record_limit:
                break
    finally:
        # Release the underlying file handle even if processing fails midway.
        vcf_reader.close()

    with open(output_pkl, 'wb') as output_file:
        pickle.dump(all_data, output_file)

    write_to_csv(all_data, output_csv)


if __name__ == "__main__":
    # Guarded so that importing this module does not launch the pipeline
    # (file reads, network calls, output files) as a side effect. Running it
    # as a script or in a notebook cell still executes the call below.
    main("/qmd/VCF/test_vcf.txt", "/qmd/VCF/outputTest.csv", "/qmd/VCF/outputTest.pkl", full=False)
