# Read in SNV vcfs and output dataframe 

August 14, 2020

I keep needing to read in these intersection VCF files and convert them to a dataframe, and I have been copying and pasting the same code over and over again. Instead, this notebook holds all of the code that builds the dataframe I want, so that I can simply import it into each of the other analysis notebooks that use this dataframe. 

In [1]:
import glob
import pandas as pd 
import numpy as np

# Read in metadata 

In [2]:
def read_in_clades_file(clades_file):
    """Read a tab-delimited clades file into a dictionary.

    Each record supplies the strain name in its first column and the clade in
    its second column; any further columns are ignored. Blank lines (such as
    a trailing empty line at the end of the file) are skipped.

    Args:
        clades_file: path to the tab-delimited clades file.

    Returns:
        dict mapping strain name -> clade. Surrounding whitespace is stripped
        from the clade only; the strain name is used exactly as written.

    Raises:
        IndexError: if a non-blank line has no second tab-separated column.
    """
    clades_dict = {}
    with open(clades_file, "r") as infile:
        for line in infile:
            # a blank line carries no record; indexing its fields would fail
            if not line.strip():
                continue

            fields = line.split("\t")
            clades_dict[fields[0]] = fields[1].strip()
    return clades_dict

In [3]:
def fix_strain_name(samplename):
    """Normalize a sample name into the ``USA/<name>/2020`` strain-name form.

    The text ``hCoV-19/`` is removed, ``USA/`` is prepended when "USA" does
    not occur anywhere in the name, and ``/2020`` is appended when it does not
    already occur in the (possibly prefixed) name.
    """
    cleaned = samplename.replace("hCoV-19/", "")

    prefix = "" if "USA" in cleaned else "USA/"
    cleaned = prefix + cleaned

    suffix = "" if "/2020" in cleaned else "/2020"
    return cleaned + suffix

In [4]:
def return_metadata_dict(metadata_file, clades_file):
    """Build a per-strain metadata dictionary from a tab-delimited metadata file.

    Every line that does not contain the text "Barcode" is treated as a
    record. Columns are read by position: 0 (sample name), 8 (location),
    22 and 23 (Ct values), and 33 (household). The clade is looked up by
    strain name in the clades file and is "unknown" when absent.

    Args:
        metadata_file: path to the tab-delimited metadata file.
        clades_file: path to the tab-delimited clades file.

    Returns:
        dict mapping strain name -> {"location", "Ct1", "Ct2", "household",
        "clade"}; all values are strings taken from the file as-is, except
        that the location is title-cased and relabelled as described below.
    """
    # locations that are relabelled when they match exactly (after title-casing)
    renamed_locations = {
        "Oregon": "Oregon WI",
        "Columbus": "Columbus WI",
        "Verona": "Verona WI",
        "Columbia County": "Columbia County WI",
    }

    clade_lookup = read_in_clades_file(clades_file)
    metadata = {}

    with open(metadata_file, "r") as infile:
        for line in infile:
            # any line mentioning "Barcode" is taken to be the header row
            if "Barcode" in line:
                continue

            fields = line.split("\t")
            strain_name = fix_strain_name(fields[0])

            location = fields[8].title()
            location = renamed_locations.get(location, location)
            # anything mentioning Dane collapses to a single label
            if "Dane" in location:
                location = "Dane County"

            metadata[strain_name] = {
                "location": location,
                "Ct1": fields[22],
                "Ct2": fields[23],
                "household": fields[33],
                "clade": clade_lookup.get(strain_name, "unknown"),
            }

    return metadata

# The following functions are all for reading in csv files and formatting them into pandas dataframes

In [5]:
def return_list_of_vcfs(vcf_directory):
    """Return the paths in ``vcf_directory`` whose names end in ``intersection.csv``.

    ``vcf_directory`` is concatenated directly with the glob pattern, so it
    must already end with a path separator.
    """
    pattern = vcf_directory + "*intersection.csv"
    return list(glob.iglob(pattern))

In [6]:
def read_in_intersection_snvs(vcf_list, vcf_directory):
    """Read every intersection file and stack the results into one dataframe.

    Each file is parsed as tab-delimited text and tagged with a ``sampleid``
    column derived from its path (the path with ``vcf_directory`` and the
    ``-intersection.csv`` suffix removed).

    Args:
        vcf_list: iterable of file paths to read.
        vcf_directory: directory prefix to strip from each path when deriving
            the sample id.

    Returns:
        tuple ``(within_host_df, samples_without_variants)`` where
        ``within_host_df`` holds the rows of all files (original per-file row
        indexes are kept) and ``samples_without_variants`` lists the sample
        ids whose file contained no rows. An empty ``vcf_list`` yields an
        empty dataframe.
    """
    frames = []
    samples_without_variants = []

    for path in vcf_list:
        # pull out sampleid
        sampleid = path.replace(vcf_directory, "").replace("-intersection.csv", "")

        table = pd.read_csv(path, sep="\t")
        table['sampleid'] = sampleid
        if len(table) == 0:
            samples_without_variants.append(sampleid)
        frames.append(table)

    # DataFrame.append no longer exists in pandas >= 2.0 (and copied the whole
    # frame on every iteration); concatenate once instead. pd.concat rejects
    # an empty list, so fall back to an empty frame in that case.
    within_host_df = pd.concat(frames) if frames else pd.DataFrame()

    return within_host_df, samples_without_variants

In [7]:
def read_strain_names_from_csv(strain_names_file):
    """Map tube numbers to strain names from a tab-delimited file.

    Lines containing "Sample identifier" are skipped as the header. For every
    other line, column 0 is the sample name (normalized with
    ``fix_strain_name``), column 1 the tube number and column 2 the hospital
    id.

    Returns:
        dict mapping cleaned tube number (a string) -> strain name.
    """
    tube_to_strain = {}

    with open(strain_names_file, "r") as infile:
        for line in infile:
            if "Sample identifier" in line:
                continue

            fields = line.split("\t")
            tube_number = fields[1]
            strain_name = fix_strain_name(fields[0])
            hospital_id = fields[2]

            # There are 2 sets of tube numbers, some with leading 0s and others
            # without; the ones with leading 0s are believed to be the ones we
            # want, and they are stored with the leading 0s dropped.
            if tube_number.isdigit() and tube_number.startswith("0"):
                key = str(int(tube_number))
            elif "Promega" in hospital_id:
                # Promega tube numbers are cleaned up and marked with a trailing "P"
                key = tube_number.replace(" (from State Lab via Promega)", "").replace("nCov-", "") + "P"
            else:
                key = tube_number

            tube_to_strain[key] = strain_name

    return tube_to_strain

In [18]:
def format_snp_column(df):
    """Clean the ``SNP`` column and split it into its underscore-separated fields.

    The first four underscore-separated fields of ``SNP`` become the columns
    ``gene``, ``nt_change``, ``aa_change`` and ``annotation``; a field that a
    row does not have is left missing. The ``%`` column is renamed to
    ``frequency``.

    Note that the cleaned ``SNP`` column and the four new columns are written
    onto the dataframe that was passed in; the returned dataframe is a new
    object that additionally carries the ``%`` -> ``frequency`` rename.

    Args:
        df: dataframe with string column ``SNP`` (and normally a ``%`` column).

    Returns:
        dataframe with the added columns and ``%`` renamed to ``frequency``.
    """
    # First, clean up the SNP column, which is weirdly formatted in intergenic
    # regions. The replacements are literal text: "_n." in particular must not
    # be read as a regular expression, where "." would match any character
    # (the pandas default for `regex` differs between versions).
    literal_replacements = (
        ("CHR_START", "CHRSTART"),
        ("CHR_END", "CHREND"),
        ("_n.", "_"),
        ("__", "_"),
    )
    for old, new in literal_replacements:
        df['SNP'] = df['SNP'].str.replace(old, new, regex=False)

    # Split once; reindex guarantees columns 0-3 exist even when no row has
    # that many fields, so a missing field becomes a missing value rather
    # than a KeyError.
    parts = df['SNP'].str.split("_", expand=True).reindex(columns=range(4))
    df['gene'] = parts[0]
    df['nt_change'] = parts[1]
    df['aa_change'] = parts[2]
    df['annotation'] = parts[3]

    # rename the % column to frequency
    df = df.rename(columns={'%': 'frequency'})

    return df

In [20]:
def separate_snvs(within_host_df):
    """Return the SNV rows of ``within_host_df`` with parsed change columns added.

    Rows whose ``type_of_variant`` is "snv" are selected and the following
    columns are derived:

    * ``wt_aa`` / ``aa_site`` / ``mut_aa``: first three characters, middle,
      and last three characters of ``aa_change``; for the text
      "intergenic-region" each of the three becomes "NA".
    * ``nt_ref`` / ``nt_mut``: the character just before ">" and the text
      after ">" in ``nt_change``.
    * ``nuc_muts``: ``nt_ref`` + integer ``POS_x`` + ``nt_mut``, the same
      format as the annotation on nextstrain.

    Args:
        within_host_df: dataframe with columns ``type_of_variant``,
            ``aa_change``, ``nt_change`` and ``POS_x``.

    Returns:
        a new dataframe holding only the SNV rows; ``within_host_df`` itself
        is not modified.
    """
    # Take an explicit copy: assigning new columns onto a filtered slice
    # triggers pandas' SettingWithCopyWarning.
    snvs_df = within_host_df[within_host_df['type_of_variant'] == "snv"].copy()

    # the trailing replaces fix up the slices of "intergenic-region"
    aa_change = snvs_df['aa_change']
    snvs_df['aa_site'] = aa_change.str[3:-3].str.replace("ergenic-reg", "NA", regex=False)
    snvs_df['wt_aa'] = aa_change.str[0:3].str.replace("int", "NA", regex=False)
    snvs_df['mut_aa'] = aa_change.str[-3:].str.replace("ion", "NA", regex=False)

    # add in columns for nucleotide changes
    nt_parts = snvs_df['nt_change'].str.split(">", expand=True)
    snvs_df['nt_ref'] = nt_parts[0].str[-1:]
    snvs_df['nt_mut'] = nt_parts[1]

    # nucleotide mutation in the same format as the annotation on nextstrain
    snvs_df['nuc_muts'] = snvs_df['nt_ref'] + snvs_df["POS_x"].astype(int).astype(str) + snvs_df['nt_mut']

    return snvs_df

In [5]:
def classify_variant_as_indel_or_snv(nt_change):
    """Return "indel" when ``nt_change`` contains "dup" or "del", else "snv"."""
    indel_markers = ("dup", "del")
    is_indel = any(marker in nt_change for marker in indel_markers)
    return "indel" if is_indel else "snv"

In [11]:
def add_variant_type_column(within_host_df):
    """Add a ``type_of_variant`` column ("indel" or "snv") based on ``nt_change``.

    The column is written onto the dataframe that was passed in, and that same
    dataframe is returned.
    """
    variant_types = within_host_df['nt_change'].apply(classify_variant_as_indel_or_snv)
    within_host_df['type_of_variant'] = variant_types
    return within_host_df

In [2]:
def fix_annotation_for_intergenic_regions(within_host_df):
    """Fill missing ``annotation`` values with "intergenic_region".

    The column is replaced on the dataframe that was passed in, and that same
    dataframe is returned.
    """
    # Assign the filled column back explicitly. Calling fillna(inplace=True)
    # on the column accessor is a chained in-place update, which does not
    # reach the dataframe when pandas Copy-on-Write is active.
    within_host_df['annotation'] = within_host_df['annotation'].fillna('intergenic_region')
    return within_host_df

In [13]:
def convert_number_to_strain(sampleid, strain_names_dict):
    """Look up the strain name for ``sampleid``; return "unknown" when it has none."""
    return strain_names_dict.get(sampleid, "unknown")

In [14]:
def add_in_strain_column(df, strain_names_dict):
    """Add a ``strain_name`` column by translating each ``sampleid``.

    Sample ids that are not keys of ``strain_names_dict`` get the strain name
    "unknown".
    """
    # NOTE(review): pd.DataFrame(df) wraps the input rather than deep-copying
    # it; whether the new column also shows up on `df` may depend on the
    # pandas version -- confirm before relying on either behaviour.
    temp_df = pd.DataFrame(df)
    temp_df["strain_name"] = temp_df['sampleid'].apply(
        lambda sampleid: convert_number_to_strain(sampleid, strain_names_dict)
    )
    return temp_df

## The following functions will look for consensus level variants and add in a column that correctly annotates the minor variant

In [1]:
def convert_high_freq_variants_to_minor_variants_snvs(row):
    """Work out which allele of a variant row is the minor one.

    When the variant's frequency is at least 0.5 the variant is the consensus
    and the reference becomes the minor allele, at frequency ``1 - frequency``;
    otherwise the reference is the consensus and the variant is the minor
    allele at its own frequency.

    Args:
        row: object exposing ``frequency``, ``nt_ref``, ``nt_mut``, ``wt_aa``
            and ``mut_aa`` attributes.

    Returns:
        tuple ``(minor_frequency, consensus_base, minor_base, consensus_aa,
        minor_aa)``.
    """
    frequency = row.frequency
    variant = (row.nt_mut, row.mut_aa)
    reference = (row.nt_ref, row.wt_aa)

    if frequency >= 0.5:
        minor_frequency = 1 - frequency
        consensus, minor = variant, reference
    else:
        minor_frequency = frequency
        consensus, minor = reference, variant

    consensus_base, consensus_aa = consensus
    minor_base, minor_aa = minor
    return (minor_frequency, consensus_base, minor_base, consensus_aa, minor_aa)

In [2]:
def add_minor_variant_column_snvs(df):
    """Add minor-variant columns to a dataframe of SNVs.

    Each row is classified with
    ``convert_high_freq_variants_to_minor_variants_snvs`` and the result is
    unpacked into ``minor_frequency``, ``consensus_base``, ``minor_base``,
    ``consensus_aa`` and ``minor_aa``. Two combined labels are then built:
    ``minor_nuc_muts`` (consensus base + integer ``POS_x`` + minor base) and
    ``minor_aa_muts`` (gene + "_" + consensus aa + ``aa_site`` + minor aa).

    A scratch column named "a" is used while unpacking and is dropped before
    returning.
    """
    temp_df = pd.DataFrame(df)

    classifier_inputs = ['gene', 'frequency', 'nt_ref', 'nt_mut', 'mut_aa', 'wt_aa']
    temp_df["a"] = temp_df[classifier_inputs].apply(
        convert_high_freq_variants_to_minor_variants_snvs, axis=1
    )

    # column names in the order the classifier returns its values
    unpacked_columns = ('minor_frequency', 'consensus_base', 'minor_base', 'consensus_aa', 'minor_aa')
    for position, column in enumerate(unpacked_columns):
        temp_df[column] = temp_df['a'].apply(lambda packed, position=position: packed[position])

    position_text = temp_df['POS_x'].astype(int).astype(str)
    temp_df['minor_nuc_muts'] = temp_df['consensus_base'] + position_text + temp_df['minor_base']
    temp_df['minor_aa_muts'] = (
        temp_df['gene'] + "_" + temp_df['consensus_aa'] + temp_df['aa_site'] + temp_df['minor_aa']
    )

    temp_df.drop("a", axis=1, inplace=True)

    return temp_df

## Use metadata to add in other columns for location and other data

In [17]:
def add_clade(strain_name, metadata):
    """Return the 'clade' recorded for ``strain_name``, or "unknown" if it has no metadata entry."""
    if strain_name not in metadata:
        return "unknown"
    return metadata[strain_name]['clade']

In [18]:
def add_location(strain_name, metadata):
    """Return the 'location' recorded for ``strain_name``, or "unknown" if it has no metadata entry."""
    if strain_name not in metadata:
        return "unknown"
    return metadata[strain_name]['location']

In [19]:
def add_metadata_columns(df, metadata):
    """Add ``location`` and ``clade`` columns looked up from ``metadata`` by ``strain_name``.

    Strains without a metadata entry get "unknown" in both columns.
    """
    temp_df = pd.DataFrame(df)
    strains = temp_df['strain_name']

    location = strains.apply(lambda strain: add_location(strain, metadata))
    clade = strains.apply(lambda strain: add_clade(strain, metadata))

    temp_df["location"] = location
    temp_df["clade"] = clade
    return temp_df

## Read in homopolymer annotation module

In [1]:
# Run the homopolymer notebook inside this notebook's namespace.
# NOTE(review): presumably this is what defines add_homopolymer_annotation, which
# return_dataframes calls but which is not defined in this notebook -- confirm.
%run homopolymer-module.ipynb

## Run all of the above together to output all of the necessary dataframes

In [3]:
def return_dataframes(metadata_file, clades_file, vcf_directory, to_ignore, fasta_file_path, homopolymer_length):
    """Run the whole pipeline and return the formatted variant dataframes.

    Args:
        metadata_file: tab-delimited metadata file; it is used both for the
            tube-number -> strain-name mapping and for the per-strain metadata.
        clades_file: tab-delimited file of strain name and clade.
        vcf_directory: directory holding the ``*intersection.csv`` files; it
            is concatenated directly with file patterns, so it must end with a
            path separator.
        to_ignore: collection of sample ids to exclude from the results.
        fasta_file_path: passed through to ``add_homopolymer_annotation``.
        homopolymer_length: passed through to ``add_homopolymer_annotation``.

    Returns:
        tuple ``(snvs_only, all_intersection_variants, metadata_dict,
        strain_names_dict)``.

    Side effects:
        prints one line for every non-ignored sample whose file had no variants.
    """
    strain_names_dict = read_strain_names_from_csv(metadata_file)
    metadata_dict = return_metadata_dict(metadata_file, clades_file)
    vcfs = return_list_of_vcfs(vcf_directory)

    # read in intersection snvs, format the columns, and return intersection snvs dataframe
    all_intersection_variants, samples_without_variants = read_in_intersection_snvs(vcfs, vcf_directory)
    all_intersection_variants = format_snp_column(all_intersection_variants)
    all_intersection_variants = add_in_strain_column(all_intersection_variants, strain_names_dict)

    # Copy after filtering: the later steps write columns onto this frame in
    # place, which on a bare filtered slice triggers SettingWithCopyWarning.
    keep = ~all_intersection_variants['sampleid'].isin(to_ignore)
    all_intersection_variants = all_intersection_variants[keep].copy()
    all_intersection_variants = fix_annotation_for_intergenic_regions(all_intersection_variants)

    # add in location and clade data
    all_intersection_variants = add_metadata_columns(all_intersection_variants, metadata_dict)

    # add in homopolymer annotation (add_homopolymer_annotation is not defined
    # in this notebook; it has to be in scope already when this runs)
    all_intersection_variants = add_homopolymer_annotation(fasta_file_path, all_intersection_variants, homopolymer_length)

    # separate out snvs and indels
    all_intersection_variants = add_variant_type_column(all_intersection_variants)
    snvs_only = separate_snvs(all_intersection_variants)

    # add in a column for the minor variants
    snvs_only = add_minor_variant_column_snvs(snvs_only)

    # print out samples that don't have variants
    for s in samples_without_variants:
        if s not in to_ignore:
            # a sample may have no strain-name entry; report it as "unknown",
            # as the strain_name column does, instead of raising KeyError
            strain = convert_number_to_strain(s, strain_names_dict)
            print("tube", s, "strain", strain, "does not have any variants")

    return (snvs_only, all_intersection_variants, metadata_dict, strain_names_dict)