In [62]:
import pandas as pd
import numpy as np
import io
import gdreg

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
def read_vcf(path):
    """Load a VCF text file into a DataFrame, ignoring '##' meta lines.

    The file is opened as plain text and every line that does not begin
    with '##' (so the '#CHROM' header row and all records) is parsed as
    tab-separated data.

    Parameters
    ----------
    path : str
        Location of the VCF file to read.

    Returns
    -------
    pandas.DataFrame
        One row per record. The '#CHROM' column is renamed to 'CHROM';
        POS is read as int and the other fixed columns listed below as str.
    """
    column_types = {
        '#CHROM': str, 'POS': int, 'ID': str, 'REF': str, 'ALT': str,
        'QUAL': str, 'FILTER': str, 'INFO': str,
    }
    buffer = io.StringIO()
    with open(path, 'r') as handle:
        for line in handle:
            # '##' lines are meta-information; the single-'#' header is kept.
            if line.startswith('##'):
                continue
            buffer.write(line)
    buffer.seek(0)
    table = pd.read_csv(buffer, sep='\t', dtype=column_types)
    return table.rename(columns={'#CHROM': 'CHROM'})

def get_info_ann(anno_line):
    """Collect the unique effect terms from the ANN entry of a VCF INFO string.

    The INFO string is split on ';' and the first 'ANN=' entry is used.
    That entry is split on ',' into per-annotation records; records whose
    text contains 'WARNING' or 'ERROR' are ignored. From each remaining
    record the second '|'-separated field is taken and split on '&' to
    obtain individual terms.

    Parameters
    ----------
    anno_line : str
        The INFO column value of one VCF record.

    Returns
    -------
    str
        Comma-joined unique terms in order of first appearance, or '' when
        there is no 'ANN=' entry or no usable term. The order is
        deterministic (the terms are no longer joined from a set, whose
        iteration order for strings can differ between interpreter runs).
    """
    for anno in anno_line.split(';'):
        # Match the key exactly so other keys that merely begin with "ANN"
        # are not mistaken for the annotation field.
        if not anno.startswith('ANN='):
            continue
        # dict keys give de-duplication while preserving first-seen order.
        terms = {}
        for record in anno.split(','):
            if 'WARNING' in record or 'ERROR' in record:
                continue
            fields = record.split('|')
            if len(fields) < 2:
                # Malformed record without an effect field: nothing to extract.
                continue
            for term in fields[1].split('&'):
                if term:
                    terms[term] = None
        return ','.join(terms)
    return ''

In [60]:
# Maps each output annotation name to the list of effect terms that belong to it.
dic_func = {
    # Categories made of exactly one term carrying the same name.
    **{
        term: [term]
        for term in (
            "intergenic_region",
            "intron_variant",
            "downstream_gene_variant",
            "upstream_gene_variant",
            "3_prime_UTR_variant",
            "5_prime_UTR_variant",
            "non_coding_transcript_exon_variant",
            "splice_region_variant",
        )
    },
    # Based on Impact
    "nonsynonymous_variant": [
        # LOF from Backman_Nature_2021
        "stop_gained",
        "stop_lost",
        "splice_donor_variant",
        "splice_acceptor_variant",
        "start_lost",
        "frameshift_variant",
        # missense_variant and the remaining terms grouped with it
        "missense_variant",
        "5_prime_UTR_premature_start_codon_gain_variant",
        "structural_interaction_variant",
        "disruptive_inframe_deletion",
        "disruptive_inframe_insertion",
        "conservative_inframe_deletion",
        "conservative_inframe_insertion",
    ],
    "synonymous_variant": ["synonymous_variant"],
}

In [65]:
# Read annotation
# '@' in both paths is a placeholder replaced by the chromosome number.
VCF_FILE = "/n/groups/price/martin/LDSPEC_data/UKBimp_337K_MAF001/baseline_annot/snfeff/ukb_imp_chr@_v3.vcf"
OUT_FILE = "/n/groups/price/martin/LDSPEC_data/UKBimp_337K_MAF001/baseline_annot/snfeff/ukb_imp_chr@_v3.annot.gz"

# ANN_list: every individual effect term referenced by any dic_func category
ANN_list = []
for temp_list in dic_func.values():
    ANN_list.extend(temp_list)

for CHR in range(1,23):
# for CHR in [21]:
    print("# CHR %d" % CHR)
    df_vcf = read_vcf(VCF_FILE.replace("@", "%d"%CHR))
    df_vcf.index = df_vcf['ID']
    # Keep only the columns used below, so a VCF without e.g. a FORMAT
    # column does not fail on a column that is never read.
    df_vcf = df_vcf[['CHROM', 'ID', 'POS', 'INFO']]
    
    # Get functional annotation
    df_vcf["INFO_ANN"] = [get_info_ann(x) for x in df_vcf['INFO']]
    # Split each distinct annotation string once, instead of re-splitting
    # every row's string for every term in ANN_list.
    dic_terms = {x: set(x.split(",")) for x in df_vcf["INFO_ANN"].unique()}
    for ANN in ANN_list:
        df_vcf[ANN] = [ANN in dic_terms[x] for x in df_vcf["INFO_ANN"]]
    # A category is True for a SNP when any of its member terms is present.
    for term in dic_func:
        df_vcf["AN:snpeff_%s" % term] = df_vcf[dic_func[term]].sum(axis=1).astype(bool)
        
    # Output: identifier columns followed by the 'AN:' category columns.
    snpeff_list = [x for x in df_vcf if x.startswith("AN:")]
    df_vcf = df_vcf[["CHROM", "ID", "POS"] + snpeff_list]
    df_vcf.columns = ['CHR', 'SNP', 'BP'] + snpeff_list
    # Report the number of SNPs carrying each annotation.
    for AN in snpeff_list:
        print("    %s %d" % (AN, df_vcf[AN].sum()))
    gdreg.util.write_annot(df_vcf, OUT_FILE.replace("@", "%d"%CHR))

# CHR 1
    AN:snpeff_intergenic_region 531593
    AN:snpeff_intron_variant 596629
    AN:snpeff_downstream_gene_variant 178411
    AN:snpeff_upstream_gene_variant 176916
    AN:snpeff_3_prime_UTR_variant 14813
    AN:snpeff_5_prime_UTR_variant 3612
    AN:snpeff_non_coding_transcript_exon_variant 24364
    AN:snpeff_splice_region_variant 1912
    AN:snpeff_nonsynonymous_variant 7545
    AN:snpeff_synonymous_variant 4470
'CM' missing from df_annot.columns, add 'CM' column with 0
# CHR 2
    AN:snpeff_intergenic_region 570569
    AN:snpeff_intron_variant 658041
    AN:snpeff_downstream_gene_variant 150410
    AN:snpeff_upstream_gene_variant 144003
    AN:snpeff_3_prime_UTR_variant 10795
    AN:snpeff_5_prime_UTR_variant 2536
    AN:snpeff_non_coding_transcript_exon_variant 20306
    AN:snpeff_splice_region_variant 1358
    AN:snpeff_nonsynonymous_variant 5036
    AN:snpeff_synonymous_variant 3198
'CM' missing from df_annot.columns, add 'CM' column with 0
# CHR 3
    AN:snpeff_intergenic

    AN:snpeff_intergenic_region 213028
    AN:snpeff_intron_variant 201167
    AN:snpeff_downstream_gene_variant 46469
    AN:snpeff_upstream_gene_variant 44913
    AN:snpeff_3_prime_UTR_variant 3021
    AN:snpeff_5_prime_UTR_variant 734
    AN:snpeff_non_coding_transcript_exon_variant 6098
    AN:snpeff_splice_region_variant 340
    AN:snpeff_nonsynonymous_variant 1200
    AN:snpeff_synonymous_variant 783
'CM' missing from df_annot.columns, add 'CM' column with 0
# CHR 19
    AN:snpeff_intergenic_region 115676
    AN:snpeff_intron_variant 197164
    AN:snpeff_downstream_gene_variant 112121
    AN:snpeff_upstream_gene_variant 108121
    AN:snpeff_3_prime_UTR_variant 9225
    AN:snpeff_5_prime_UTR_variant 2771
    AN:snpeff_non_coding_transcript_exon_variant 16342
    AN:snpeff_splice_region_variant 1369
    AN:snpeff_nonsynonymous_variant 5909
    AN:snpeff_synonymous_variant 3931
'CM' missing from df_annot.columns, add 'CM' column with 0
# CHR 20
    AN:snpeff_intergenic_region 159009