# Read in SNV vcfs and output dataframe 

August 14, 2020

I keep needing to read in these intersection vcf files and convert them to a dataframe, and I have been copying and pasting the same code over and over again. Instead, I am going to try making this notebook, where I can do all the coding needed to return the dataframe I want, and then import it into all the other analysis notebooks that use this dataframe. 

In [28]:
import glob
import pandas as pd 
import numpy as np

# Read in metadata 

In [47]:
def read_in_clades_file(clades_file):
    """Parse a tab-delimited clades file into a strain -> clade dictionary.

    The first column of each line is used as the strain name and the second
    column (with surrounding whitespace stripped) as its clade; any further
    columns are ignored. Every non-blank line is treated as data, so a header
    row, if present, ends up as an ordinary entry.

    Args:
        clades_file: path to the tab-separated clades file.

    Returns:
        dict mapping strain name to clade. When a strain name occurs on
        several lines, the last occurrence wins.

    Raises:
        IndexError: if a non-blank line has no second (tab-separated) column.
    """
    clades_dict = {}
    with open(clades_file, "r") as infile:
        for line in infile:
            # blank lines (for example an empty line at the end of the file)
            # carry no data and used to crash the column lookup below
            if not line.strip():
                continue

            # split once rather than once per column
            fields = line.split("\t")
            clades_dict[fields[0]] = fields[1].strip()
    return clades_dict

In [48]:
def fix_strain_name(samplename):
    """Convert a sample name into the ``USA/<name>/2020`` strain-name form.

    Every "hCoV-19/" occurrence is removed, "USA/" is prepended when the name
    does not already contain "USA", and "/2020" is appended when the
    (possibly prefixed) name does not already contain "/2020".

    Args:
        samplename: sample name as it appears in the metadata file.

    Returns:
        The normalised strain name.
    """
    cleaned = samplename.replace("hCoV-19/", "")

    country_prefix = "" if "USA" in cleaned else "USA/"
    with_country = country_prefix + cleaned

    # the year check runs on the already-prefixed name
    year_suffix = "" if "/2020" in with_country else "/2020"
    return with_country + year_suffix

In [54]:
def return_metadata_dict(metadata_file, clades_file):
    """Build a per-strain metadata dictionary from the metadata and clades files.

    Lines of the tab-delimited metadata file that contain "Barcode" are
    skipped (this is how the header row is dropped). For every other line the
    strain name comes from column 0 (normalised with ``fix_strain_name``), the
    location from column 8 (title-cased), Ct1/Ct2 from columns 22/23 and the
    household from column 33.

    Args:
        metadata_file: path to the tab-separated metadata file.
        clades_file: path to the clades file read by ``read_in_clades_file``.

    Returns:
        dict mapping strain name to a dict with the keys "location", "Ct1",
        "Ct2", "household" and "clade". Strains missing from the clades file
        get the clade "unknown".
    """
    # locations that are rewritten with a " WI" suffix
    renamed_locations = {
        "Oregon": "Oregon WI",
        "Columbus": "Columbus WI",
        "Verona": "Verona WI",
    }

    clade_lookup = read_in_clades_file(clades_file)
    metadata = {}

    with open(metadata_file, "r") as infile:
        for line in infile:
            if "Barcode" in line:   # skip first line
                continue

            columns = line.split("\t")
            strain_name = fix_strain_name(columns[0])

            geo = columns[8].title()
            geo = renamed_locations.get(geo, geo)
            # anything mentioning Dane is collapsed into a single label
            if "Dane" in geo:
                geo = "Dane County"

            metadata[strain_name] = {
                "location": geo,
                "Ct1": columns[22],
                "Ct2": columns[23],
                "household": columns[33],
                "clade": clade_lookup.get(strain_name, "unknown"),
            }

    return metadata

# The following functions are all for reading in csv files and formatting them into pandas dataframes

In [31]:
def return_list_of_vcfs(vcf_directory):
    """List the intersection csv files found under ``vcf_directory``.

    The glob pattern is built by plain string concatenation, so
    ``vcf_directory`` has to end with a path separator for the pattern to
    match files inside that directory.

    Args:
        vcf_directory: directory prefix to search.

    Returns:
        list of paths matching ``<vcf_directory>*intersection.csv``.
    """
    pattern = vcf_directory + "*intersection.csv"
    return glob.glob(pattern)

In [32]:
def read_in_intersection_snvs(vcf_list, vcf_directory):
    """Read every intersection csv and stack them into one dataframe.

    Each file is read as a tab-separated table and tagged with a ``sampleid``
    column. The sample id is the file path with ``vcf_directory`` and the
    "-intersection.csv" suffix removed.

    Args:
        vcf_list: paths of the intersection csv files to read.
        vcf_directory: directory prefix that is stripped from each path to
            obtain the sample id.

    Returns:
        A single DataFrame holding the rows of all files, each file keeping
        its own row index, or an empty DataFrame when ``vcf_list`` is empty.
    """
    frames = []

    for v in vcf_list:
        # pull out sampleid
        sampleid = v.replace(vcf_directory, "").replace("-intersection.csv", "")

        d = pd.read_csv(v, sep="\t")
        d['sampleid'] = sampleid
        frames.append(d)

    # pd.concat raises on an empty list, so keep returning an empty frame
    if not frames:
        return pd.DataFrame()

    # DataFrame.append was removed in pandas 2.0; concatenating once at the
    # end also avoids re-copying the accumulated frame for every file
    return pd.concat(frames)

In [46]:
def read_strain_names_from_csv(strain_names_file):
    """Map tube numbers to strain names using the tab-delimited metadata file.

    Lines containing "Sample identifier" are skipped. For the remaining lines
    column 0 is the sample name (normalised with ``fix_strain_name``), column
    1 the tube number and column 2 the hospital id.

    Args:
        strain_names_file: path to the tab-separated file.

    Returns:
        dict mapping a tube-number key to the strain name. The key is:
          * the tube number without leading zeros, when it is all digits and
            starts with "0";
          * the tube number with " (from State Lab via Promega)" and "nCov-"
            removed and "P" appended, when the hospital id contains "Promega";
          * the tube number unchanged otherwise.
    """
    tube_to_strain = {}

    with open(strain_names_file, "r") as infile:
        for line in infile:
            if "Sample identifier" in line:
                continue

            columns = line.split("\t")
            tube_number = columns[1]
            strain_name = fix_strain_name(columns[0])
            hospital_id = columns[2]

            # there are 2 sets of tube numbers, some with leading 0s and others
            # without; the leading-0 ones are believed to be the wanted set.
            # NOTE(review): tube numbers without a leading 0 and non-numeric
            # ones are still stored by the branches below - confirm that is intended.
            if tube_number.isdigit() and tube_number.startswith("0"):
                key = str(int(tube_number))
            elif "Promega" in hospital_id:
                # promega tube numbers need cleaning up as well
                key = tube_number.replace(" (from State Lab via Promega)", "").replace("nCov-", "") + "P"
            else:
                key = tube_number

            tube_to_strain[key] = strain_name

    return tube_to_strain

In [34]:
def format_snp_column(df):
    """Split the ``SNP`` column into its parts and rename ``%`` to ``frequency``.

    The ``SNP`` value is split on "_" and its pieces are stored, in this
    order, as the new columns ``annotation`` (piece 3), ``gene`` (piece 0),
    ``nt_change`` (piece 1) and ``aa_change`` (piece 2). These columns are
    added to ``df`` itself; the rename is applied to the returned frame only.

    Args:
        df: dataframe with an ``SNP`` column (and normally a ``%`` column).

    Returns:
        A dataframe with the four new columns and ``%`` renamed to ``frequency``.
    """
    snp_pieces = df['SNP'].str.split("_", expand=True)

    # (new column, position of the piece within the SNP string)
    layout = (("annotation", 3), ("gene", 0), ("nt_change", 1), ("aa_change", 2))
    for column, position in layout:
        df[column] = snp_pieces[position]

    # the frequency column arrives named "%"
    renamed = df.rename(columns={'%': 'frequency'})
    return renamed

In [35]:
def separate_snvs(within_host_df):
    """Return the SNV rows of ``within_host_df`` with parsed change columns.

    Rows whose ``annotation`` is "missense", "synonymous" or "stop" are kept.
    The input frame is left untouched.

    Added columns:
      * ``wt_aa`` / ``mut_aa``: first / last three characters of ``aa_change``
      * ``aa_site``: what lies between them
      * ``nt_ref``: last character before ">" in ``nt_change``
      * ``nt_mut``: the part after ">"
      * ``nuc_muts``: ``nt_ref`` + integer ``POS_x`` + ``nt_mut``

    Args:
        within_host_df: dataframe with ``annotation``, ``aa_change``,
            ``nt_change`` and ``POS_x`` columns.

    Returns:
        A new dataframe holding only the SNV rows plus the columns above.
    """
    # first, separate out and format snvs_df. The explicit copy means the new
    # columns are written to an independent frame rather than to a filtered
    # slice of the input (which triggers pandas' SettingWithCopyWarning).
    snv_annotations = ["missense", "synonymous", "stop"]
    snvs_df = within_host_df[within_host_df['annotation'].isin(snv_annotations)].copy()

    snvs_df['aa_site'] = snvs_df['aa_change'].str[3:-3]
    snvs_df['wt_aa'] = snvs_df['aa_change'].str[0:3]
    snvs_df['mut_aa'] = snvs_df['aa_change'].str[-3:]

    # add in columns for nucleotide changes (split once, reuse both halves)
    nt_parts = snvs_df['nt_change'].str.split(">", expand=True)
    snvs_df['nt_ref'] = nt_parts[0].str[-1:]
    snvs_df['nt_mut'] = nt_parts[1]

    # add in a column for the nucleotide mutation so that it is in the same
    # format as the annotation on nextstrain
    snvs_df['nuc_muts'] = snvs_df['nt_ref'] + snvs_df["POS_x"].astype(int).astype(str) + snvs_df['nt_mut']

    return snvs_df

In [36]:
def format_indels(row):
    """Format one indel row as a (nuc_mut, nt_ref, nt_mut) triple.

    For a duplication ("dup" in ``nt_change``) the text after "dup" is the
    inserted sequence and the result is ``("-<pos><seq>", "-", "<seq>")``.
    For a deletion ("del" in ``nt_change``) the text after "del" is the
    deleted sequence and the result is ``("<pos><seq>-", "<seq>", "-")``.
    ``<pos>`` is ``POS_x`` converted to an integer. "dup" is checked first.

    Args:
        row: mapping or dataframe row with ``nt_change`` and ``POS_x`` entries.

    Returns:
        tuple ``(new_value, nt_ref, nt_mut)`` of strings.

    Raises:
        ValueError: if ``nt_change`` contains neither "dup" nor "del". The
            previous code fell through to an UnboundLocalError in this case.
    """
    nt_change = row['nt_change']

    if "dup" in nt_change:
        variant = nt_change.split("dup")[1]
        new_value = "-" + str(int(row["POS_x"])) + variant
        return (new_value, "-", variant)

    if "del" in nt_change:
        variant = nt_change.split("del")[1]
        new_value = str(int(row['POS_x'])) + variant + "-"
        return (new_value, variant, "-")

    raise ValueError(
        "cannot format indel: expected 'dup' or 'del' in nt_change, got %r" % (nt_change,)
    )

In [37]:
def return_indel_type(row):
    """Classify an indel row as an insertion or a deletion.

    Args:
        row: mapping or dataframe row with an ``nt_change`` entry.

    Returns:
        "insertion" when ``nt_change`` contains "dup" (checked first),
        "deletion" when it contains "del".

    Raises:
        ValueError: if ``nt_change`` contains neither "dup" nor "del". The
            previous code fell through to an UnboundLocalError in this case.
    """
    nt_change = row['nt_change']

    if "dup" in nt_change:
        return "insertion"

    if "del" in nt_change:
        return "deletion"

    raise ValueError(
        "cannot classify indel: expected 'dup' or 'del' in nt_change, got %r" % (nt_change,)
    )

In [38]:
def separate_indels(within_host_df):
    """Return the indel rows of ``within_host_df`` with formatted columns.

    Rows whose ``annotation`` is "frameshift" or "frameshift&stop" are kept.
    The input frame is left untouched.

    Added columns (the first three come from ``format_indels``):
      * ``nuc_muts``: formatted indel string
      * ``nt_ref`` / ``nt_mut``: reference / mutant sequence, matching the
        column names used for the snvs dataframe
      * ``type_mut``: result of ``return_indel_type``

    Args:
        within_host_df: dataframe with ``annotation``, ``nt_change`` and
            ``POS_x`` columns.

    Returns:
        A new dataframe holding only the indel rows plus the columns above.
        When no indel rows exist, the result is empty but still carries the
        added columns.
    """
    # now, separate out and format indels dataframe. The explicit copy means
    # the new columns are written to an independent frame rather than to a
    # filtered slice of the input (which triggers SettingWithCopyWarning).
    indel_annotations = ["frameshift", "frameshift&stop"]
    indels_df = within_host_df[within_host_df['annotation'].isin(indel_annotations)].copy()

    # Build the values row by row instead of DataFrame.apply(axis=1):
    # on an empty frame apply() hands back a DataFrame, which cannot be
    # stored in a single column; plain lists work for any number of rows.
    formatted = [format_indels(row) for _, row in indels_df.iterrows()]
    indel_types = [return_indel_type(row) for _, row in indels_df.iterrows()]

    # add a column with formatted indel that matches nextstrain
    # add in nt_ref and nt_mut columns so that they match snvs dataframe
    indels_df['nuc_muts'] = [triple[0] for triple in formatted]
    indels_df['nt_ref'] = [triple[1] for triple in formatted]
    indels_df['nt_mut'] = [triple[2] for triple in formatted]

    indels_df['type_mut'] = indel_types

    return indels_df

In [39]:
def convert_number_to_strain(sampleid, strain_names_dict):
    """Look up the strain name recorded for a sample id.

    Args:
        sampleid: key to look up.
        strain_names_dict: dict mapping sample ids to strain names.

    Returns:
        The mapped strain name, or "unknown" when the id has no entry.
    """
    return strain_names_dict.get(sampleid, "unknown")

In [40]:
def add_in_strain_column(df, strain_names_dict):
    """Add a ``strain_name`` column derived from the ``sampleid`` column.

    Each sample id is translated with ``convert_number_to_strain``, so ids
    without an entry in ``strain_names_dict`` become "unknown".

    Args:
        df: dataframe with a ``sampleid`` column.
        strain_names_dict: dict mapping sample ids to strain names.

    Returns:
        The dataframe built from ``df`` with the extra ``strain_name`` column.
    """
    # NOTE(review): pd.DataFrame(df) wraps df rather than deep-copying it;
    # whether the new column also shows up on the caller's frame may depend
    # on the pandas version - confirm before relying on either behaviour.
    annotated = pd.DataFrame(df)
    annotated["strain_name"] = annotated['sampleid'].apply(
        convert_number_to_strain, args=[strain_names_dict]
    )
    return annotated

## The following functions will look for consensus level variants and add in a column that correctly annotates the minor variant

In [41]:
def convert_high_freq_variants_to_minor_variants(row):
    """Express a variant call from the point of view of the minor allele.

    When the variant frequency is at least 0.5 the variant base is treated as
    the consensus, and the reference base becomes the minor base with
    frequency ``1 - frequency``. Otherwise the call is returned as is.

    Args:
        row: object exposing ``frequency``, ``nt_mut`` and ``nt_ref`` attributes.

    Returns:
        tuple ``(minor_frequency, consensus_base, minor_base)``.
    """
    freq = row.frequency
    alt_base = row.nt_mut
    ref_base = row.nt_ref

    # variant is at consensus level: swap the roles of the two bases
    if freq >= 0.5:
        return (1 - freq, alt_base, ref_base)

    return (freq, ref_base, alt_base)

In [42]:
def add_minor_variant_column(df):
    """Add minor-variant columns to a variants dataframe.

    Every row is passed through
    ``convert_high_freq_variants_to_minor_variants`` and the resulting triple
    is unpacked into ``minor_frequency``, ``consensus_base`` and
    ``minor_base``. ``minor_nuc_muts`` is then built as
    ``consensus_base`` + integer ``POS_x`` + ``minor_base``.

    Args:
        df: dataframe with ``frequency``, ``nt_ref``, ``nt_mut`` and
            ``POS_x`` columns.

    Returns:
        The dataframe built from ``df`` with the four extra columns.
    """
    frame = pd.DataFrame(df)

    # park the per-row triples in a scratch column named "a"
    frame["a"] = frame[['frequency', 'nt_ref', 'nt_mut']].apply(
        convert_high_freq_variants_to_minor_variants, axis=1
    )

    unpacked_columns = ("minor_frequency", "consensus_base", "minor_base")
    for position, column in enumerate(unpacked_columns):
        frame[column] = frame['a'].apply(lambda triple, i=position: triple[i])

    position_text = frame['POS_x'].astype(int).astype(str)
    frame['minor_nuc_muts'] = frame['consensus_base'] + position_text + frame['minor_base']

    # the scratch column is no longer needed
    frame.drop("a", axis=1, inplace=True)

    return frame

## Use metadata to add in other columns for location and other data

In [50]:
def add_clade(strain_name, metadata):
    """Return the clade stored for ``strain_name``.

    Args:
        strain_name: key to look up in ``metadata``.
        metadata: dict mapping strain name to a dict that has a "clade" key.

    Returns:
        The strain's clade, or "unknown" when the strain is not in ``metadata``.
    """
    if strain_name not in metadata:
        return "unknown"
    return metadata[strain_name]['clade']

In [51]:
def add_location(strain_name, metadata):
    """Return the location stored for ``strain_name``.

    Args:
        strain_name: key to look up in ``metadata``.
        metadata: dict mapping strain name to a dict that has a "location" key.

    Returns:
        The strain's location, or "unknown" when the strain is not in
        ``metadata``.
    """
    if strain_name not in metadata:
        return "unknown"
    return metadata[strain_name]['location']

In [52]:
def add_metadata_columns(df, metadata):
    """Add ``location`` and ``clade`` columns looked up by strain name.

    The values come from ``add_location`` and ``add_clade``, so strains that
    are missing from ``metadata`` get "unknown" in both columns.

    Args:
        df: dataframe with a ``strain_name`` column.
        metadata: per-strain metadata dict with "location" and "clade" keys.

    Returns:
        The dataframe built from ``df`` with the two extra columns.
    """
    annotated = pd.DataFrame(df)
    strain_names = annotated['strain_name']

    # compute both lookups before touching the frame
    locations = strain_names.apply(add_location, args=[metadata])
    clades = strain_names.apply(add_clade, args=[metadata])

    annotated["location"] = locations
    annotated["clade"] = clades
    return annotated

## Read in homopolymer annotation module

In [3]:
%run homopolymer-module.ipynb

## Run all of the above together to output all of the necessary dataframes

In [5]:
def return_dataframes(metadata_file, clades_file, vcf_directory, to_ignore, fasta_file_path):
    """Run the whole pipeline and return the variant dataframes.

    Steps, in order: read the tube-number/strain mapping and the per-strain
    metadata (both from ``metadata_file``), read every intersection csv under
    ``vcf_directory``, format the SNP column, attach strain names, drop the
    samples listed in ``to_ignore``, attach location/clade, add the
    homopolymer annotation, split into SNVs and indels, add the minor-variant
    columns, and drop rows whose minor frequency is below 0.01.

    Args:
        metadata_file: tab-separated metadata file.
        clades_file: tab-separated strain/clade file.
        vcf_directory: directory prefix holding the ``*intersection.csv`` files.
        to_ignore: sample ids to exclude.
        fasta_file_path: passed through to ``add_homopolymer_annotation``
            (defined in the homopolymer module, not in this notebook).

    Returns:
        tuple ``(snvs_only, indels_only, all_intersection_variants,
        metadata_dict)``; the third item is the unsplit variants frame before
        the minor-frequency filter.
    """
    strain_lookup = read_strain_names_from_csv(metadata_file)
    metadata_dict = return_metadata_dict(metadata_file, clades_file)
    vcf_paths = return_list_of_vcfs(vcf_directory)

    # read in intersection snvs, format the columns, attach strain names
    variants = read_in_intersection_snvs(vcf_paths, vcf_directory)
    variants = format_snp_column(variants)
    variants = add_in_strain_column(variants, strain_lookup)

    # leave out the samples the caller asked to ignore
    is_ignored = variants['sampleid'].isin(to_ignore)
    variants = variants[~is_ignored]

    # add in location and clade data, then the homopolymer annotation
    variants = add_metadata_columns(variants, metadata_dict)
    variants = add_homopolymer_annotation(fasta_file_path, variants)

    # separate out snvs and indels
    snvs_only = separate_snvs(variants)
    indels_only = separate_indels(variants)

    # add in a column for the minor variants
    snvs_only = add_minor_variant_column(snvs_only)
    indels_only = add_minor_variant_column(indels_only)

    # remove very low frequency variants
    snvs_only = snvs_only[snvs_only['minor_frequency'] >= 0.01]
    indels_only = indels_only[indels_only['minor_frequency'] >= 0.01]

    return (snvs_only, indels_only, variants, metadata_dict)