In [1]:
import pandas as pd
import os
import numpy as np
import re

In [2]:
def fasta_to_df(fasta_file):
    """Parse a FASTA file into a DataFrame with one row per record.

    Parameters
    ----------
    fasta_file : str or path-like
        Path of the FASTA file to read.

    Returns
    -------
    pandas.DataFrame
        Columns ``header`` (the full ``>`` line, stripped of surrounding
        whitespace, ``>`` included) and ``sequence`` (all sequence lines of
        the record joined together without line breaks). An empty file, or a
        file without any ``>`` line, yields an empty frame that still has
        both columns.
    """
    records = []
    header = None          # None until the first ">" line has been seen
    chunks = []            # sequence lines of the record being read

    with open(fasta_file) as f:
        for line in f:
            if line.startswith(">"):
                # A new header closes the previous record (if there was one).
                if header is not None:
                    records.append({"header": header, "sequence": "".join(chunks)})
                header = line.strip()
                chunks = []
            elif header is not None:
                # Text that appears before the first header belongs to no
                # record and is ignored.
                chunks.append(line.strip())

    # Flush the last record; skipped when the file held no header at all so
    # that no phantom row with an empty header is produced.
    if header is not None:
        records.append({"header": header, "sequence": "".join(chunks)})

    return pd.DataFrame(records, columns=["header", "sequence"])

In [3]:
def fasta_writer(path, filename, df):
    """Write the ``header`` and ``sequence`` columns of ``df`` as a FASTA file.

    Parameters
    ----------
    path : str
        Output directory. It is created, including missing parent
        directories, when it does not exist yet. An empty string means the
        current working directory.
    filename : str
        Name of the file to (over)write inside ``path``.
    df : pandas.DataFrame
        Must contain ``header`` and ``sequence`` columns; each row becomes a
        header line followed by a single sequence line.
    """
    # makedirs(exist_ok=True) handles nested paths and only tolerates
    # "already exists"; genuine failures (e.g. permissions) are raised
    # instead of being silently ignored.
    if path:
        os.makedirs(path, exist_ok=True)

    with open(os.path.join(path, filename), "w") as f:
        for header, sequence in zip(df["header"], df["sequence"]):
            f.write(f"{header}\n")
            f.write(f"{sequence}\n")

In [4]:
def fastaDeDupe(list_of_genes, input_path, standardize=True):
    """Remove duplicate strains from each gene's FASTA file.

    Different submitters sometimes upload the same strain, with one record
    carrying a less complete date than the other. For every gene this reads
    ``{input_path}h3nx_{gene}.fasta``, groups records by lower-cased strain
    name (first ``|``-separated header field) and keeps, per strain, the
    longest sequence together with the most complete date (fourth header
    field; the longest value once ``XX`` placeholders are removed). The result
    is written to ``./deduped/h3nx_{gene}.fasta``.

    Parameters
    ----------
    list_of_genes : iterable of str
        Gene names; files must be named ``h3nx_{gene}.fasta``.
    input_path : str
        Directory holding the input files. It is concatenated directly with
        the file name, so it must end with a path separator.
    standardize : bool, default True
        When True, dates are written without their ``XX`` placeholders and
        ``standardize_dates`` is then run from ``./deduped/`` into
        ``./consistent/`` so that every strain has the same, most complete
        date across all gene files. When False, the ``XX`` placeholders are
        restored here and ``standardize_dates`` is not called.
    """

    def _with_date(header, new_date):
        # Swap only the date field (4th "|"-separated field). Replacing the
        # old date text anywhere in the header would also rewrite e.g. a year
        # that is part of the strain name, and would mangle the whole header
        # when the old date is an empty string.
        parts = header.split("|")
        if len(parts) > 3:
            parts[3] = str(new_date)
        return "|".join(parts)

    for gene in list_of_genes:  # file names must be formatted as h3nx_{gene}.fasta

        df = fasta_to_df(f"{input_path}h3nx_{gene}.fasta")
        header_fields = df['header'].str.split("|")
        df['strain'] = header_fields.str[0].str.lower()
        df['date'] = header_fields.str[3]

        # Per strain: longest sequence and most complete date.
        temp_df = df.groupby(['strain']).agg({
            "sequence": lambda s: max(s, key=len),
            "date": lambda s: max(s.str.replace("XX", ""), key=len)
        })

        # Join back to recover the header of the record(s) owning the chosen
        # sequence; the record's original date ends up in "date_OLD".
        new_df = temp_df.merge(right=df, on=["strain", "sequence"], how="inner", suffixes=["", "_OLD"])

        # Don't add the XXs back in if dates are going to be made consistent
        # across files afterwards (standardize_dates restores them).
        if not standardize:
            new_df['date'] = new_df['date'].str.replace("--", "-XX-XX")
            new_df['date'] = new_df['date'].str.replace("-$", "-XX", regex =True)
            new_df['date'] = new_df['date'].str.replace(r"^(\d{4}-\d{2})$" , r"\1" + "-XX", regex =True)
            new_df['date'] = new_df['date'].str.replace(r"^(\d{4})$" , r"\1" + "-XX" + "-XX", regex =True)

        new_df["header"] = [
            _with_date(header, date)
            for header, date in zip(new_df["header"], new_df["date"])
        ]

        new_df = new_df.loc[:, ~new_df.columns.str.endswith('_OLD')]

        # Records that share both the chosen sequence and strain survive the
        # merge above; this guarantees exactly one record per strain.
        new_df = new_df.drop_duplicates(subset=['strain'], keep='first', ignore_index=True)

        fasta_writer("./deduped/", f"h3nx_{gene}.fasta", new_df)  # creates the folder if it does not exist

    if standardize:
        standardize_dates(list_of_genes, "./deduped/", "./consistent/")  # creates the folder if it does not exist

In [5]:
def standardize_dates(list_of_genes,input_path, output_path):
    """Give every strain the same, most complete date in all gene files.

    Reads ``{input_path}h3nx_{gene}.fasta`` for each gene, determines for
    every lower-cased strain name (first ``|``-separated header field) the
    longest date string (fourth header field) seen in any gene file, restores
    ``XX`` placeholders for missing month/day parts, writes that date back
    into the date field of every record of the strain and saves the result as
    ``{output_path}h3nx_{gene}.fasta``.

    Parameters
    ----------
    list_of_genes : iterable of str
        Gene names; files must be named ``h3nx_{gene}.fasta``.
    input_path : str
        Directory with the (deduplicated) input files; concatenated directly
        with the file name, so it must end with a path separator.
    output_path : str
        Directory the updated files are written to (handed to
        ``fasta_writer``).
    """

    def _load(gene):
        # One gene file as a frame with helper columns "strain" and "date".
        frame = fasta_to_df(f"{input_path}h3nx_{gene}.fasta")
        fields = frame['header'].str.split("|")
        frame['strain'] = fields.str[0].str.lower()
        frame['date'] = fields.str[3]
        return frame

    def _restore_placeholders(date):
        # Put the XX placeholders back: "2020--" / "2020" -> "2020-XX-XX",
        # "2020-05-" / "2020-05" -> "2020-05-XX".
        date = re.sub(r'--', '-XX-XX', date)
        date = re.sub(r'-$', '-XX', date)
        date = re.sub(r"^(\d{4}-\d{2})$" , r"\1" + "-XX", date)
        date = re.sub(r"^(\d{4})$" , r"\1" + "-XX" + "-XX", date)
        return date

    def _with_date(header, new_date):
        # Swap only the date field; replacing the old date text anywhere in
        # the header would also hit e.g. a year inside the strain name, and
        # would mangle the header when the old date is an empty string.
        parts = header.split("|")
        if len(parts) > 3:
            parts[3] = str(new_date)
        return "|".join(parts)

    genes = list(list_of_genes)

    # Pass 1: each file is read once (instead of once per strain) to collect
    # the most complete date of every strain. As before, only the first
    # record of a strain within a file is considered and a later gene wins
    # only when its date is strictly longer.
    most_complete_dates = {}
    for gene in genes:
        first_records = _load(gene).drop_duplicates(subset='strain', keep='first')
        for strain, date in zip(first_records['strain'], first_records['date']):
            if not isinstance(date, str):
                date = ""  # header without a date field
            if strain not in most_complete_dates or len(date) > len(most_complete_dates[strain]):
                most_complete_dates[strain] = date

    standardized = {
        strain: _restore_placeholders(date)
        for strain, date in most_complete_dates.items()
    }

    # Pass 2: write the standardized date into every record of each file.
    for gene in genes:
        df = _load(gene)
        df['new_date'] = df['strain'].map(standardized).fillna(df['date'])
        df["header"] = [
            _with_date(header, new_date)
            for header, new_date in zip(df["header"], df["new_date"])
        ]
        fasta_writer(f"{output_path}", f"h3nx_{gene}.fasta", df)


In [6]:
def speciesClean(df):
    """Normalise the ``Species`` column with the lookup table in ``species.csv``.

    ``species.csv`` is read from the working directory and must provide the
    columns ``annotated``, ``correction``, ``broad`` and ``order``. The frame
    passed in needs ``Species`` and ``Strain`` columns.

    Returns a new frame in which ``Species`` holds the lower-cased correction
    wherever the annotated name was found, extended by the columns
    ``Correction``, ``Broad`` and ``Order`` and reduced to one row per
    ``Strain``. The species that have no entry in the table are printed.
    """
    lookup = pd.read_csv('species.csv')
    scratch_columns = ['correction', 'broad', 'annotated', 'order', 'key_0']

    # Pass 1: annotated name -> corrected name (case-insensitive match).
    annotated_key = lookup["annotated"].str.lower()
    df = df.merge(lookup, left_on=df["Species"].str.lower(), right_on=annotated_key, how="left")
    df['correction'] = df['correction'].str.lower()
    has_correction = df['correction'].notnull()
    df.loc[has_correction, 'Species'] = df['correction']
    df = df.drop(columns=scratch_columns)

    # Pass 2: corrected name -> broad group and order.
    correction_key = lookup["correction"].str.lower()
    df = df.merge(lookup, left_on=df["Species"].str.lower(), right_on=correction_key, how="left")
    df['Correction'] = df['correction'].str.lower()
    df["Broad"] = df['broad']
    df["Order"] = df['order']

    # If a later step fails because it found floats instead of strings, add
    # the species printed here to species.csv.
    unmatched = df['correction'].isnull()
    print(df.Species.loc[unmatched].unique())

    df = df.drop(columns=scratch_columns)

    # Several table rows can share one correction, so pass 2 duplicates rows.
    df = df.drop_duplicates(subset=['Strain'], keep='first', ignore_index=True)

    return df

In [7]:
def NCBI_virusPrep(fasta_file, gene_segment_map):
    """Split the master NCBI FASTA file into one FASTA file per gene segment.

    Headers are expected to be ``|``-separated in the order strain, accession,
    subtype, date, host, country, segment. QC specific to NCBI is done here;
    region and species data are added later by ``NCBI_QC``.

    Parameters
    ----------
    fasta_file : str
        Path of the master FASTA file.
    gene_segment_map : dict
        Maps the segment value found in the last header field (a string) to
        the gene name used in the output file name. One file
        ``./parsed/h3nx_{gene}.fasta`` is written per entry.
    """
    df = fasta_to_df(fasta_file)

    df['header'] = df['header'].str.replace(' ', '_')
    header_fields = df['header'].str.split("|")  # split once, reuse for every column
    column_names = ['Strain', 'Accession', 'Subtype', 'Date', 'Host', 'Country', 'Segment']
    for position, column in enumerate(column_names):
        df[column] = header_fields.str[position]

    #QC steps specific to NCBI
    # .copy() so the column assignments below act on an independent frame
    # rather than on a slice of the unfiltered one.
    df = df[~df["Subtype"].str.contains("H3Nx|H3,mixed|mixed,H3|mixed,_H3|Mixed,H3|mixed.H3")].copy()

    # Drop the accession version suffix (".1", ".2", ...). Unlike cutting the
    # last two characters, this leaves accessions without a version intact
    # and also copes with multi-digit versions.
    df['Accession'] = df['Accession'].str.replace(r'\.\d+$', '', regex=True)

    # Reduce the strain field to ">" + the text inside the first parenthesis,
    # without the trailing "(subtype)" part.
    df['Strain'] = df['Strain'].str.replace('>Influenza_A_virus_', '', regex=False)
    df['Strain'] = df['Strain'].str.extract(r'(\(.*?\))', expand=False)
    df['Strain'] = df['Strain'].str.replace(r'^\(', '>', regex=True)
    df['Strain'] = df['Strain'].str.replace(r'\(\w+\)', '', regex=True)

    df['header'] = df[['Strain', 'Accession', 'Subtype', 'Date', 'Host', 'Country']].apply('|'.join, axis=1)

    for segment, gene in gene_segment_map.items():
        segment_df = df[df['Segment'] == segment]
        fasta_writer('./parsed/', f"h3nx_{gene}.fasta", segment_df)

In [8]:
# Segment number (as a string, the form in which NCBI_virusPrep compares it
# with the last header field) -> gene name used in the h3nx_{gene}.fasta file
# names. The genes are listed in segment order 1..8.
gene_segment_map = {
    str(segment_number): gene_name
    for segment_number, gene_name in enumerate(
        ("pb2", "pb1", "pa", "ha", "np", "na", "mp", "ns"), start=1
    )
}

In [15]:
# Split the master file all_sequences.fasta (read from the working directory)
# into one file per segment, using the segment -> gene mapping defined above.
NCBI_virusPrep("all_sequences.fasta", gene_segment_map) 

In [9]:
def NCBI_QC(list_of_genes, input_path, standardize = True):
    """Finish the QC of the per-gene NCBI files and deduplicate them.

    For every gene, ``{input_path}h3nx_{gene}.fasta`` is read, the header is
    split into its ``|``-separated fields (strain, accession, subtype, date,
    host, country), unwanted records are removed, region data from
    ``regions.csv`` and species data from ``species.csv`` (via
    ``speciesClean``) are attached, and the file is rewritten to
    ``./QC/h3nx_{gene}.fasta`` with the header
    ``Strain|Accession|Subtype|Date|Host|country|region|Correction|Broad|Order``.
    Afterwards ``fastaDeDupe`` is run on ``./QC/``.

    Parameters
    ----------
    list_of_genes : iterable of str
        Gene names; files must be named ``h3nx_{gene}.fasta``.
    input_path : str
        Directory with the parsed files; concatenated directly with the file
        name, so it must end with a path separator.
    standardize : bool, default True
        Passed on to ``fastaDeDupe``.
    """
    # NCBI reports Latin names as host while genbank had "canine" or "swine";
    # these lists are used to standardize that. They do not depend on the
    # gene, so they are built once instead of once per loop iteration.
    avian_list = ['Duck', 'Swan', 'Meleagris_gallopavo', 'Spatula_cyanoptera', 'Aythya_affinis', 'Anatidae', 'Mareca_penelope', 'Spatula_discors', 'Goose', 'Other_avian', 'Chicken', 'Anas_platyrhynchos', 
                'Anas_acuta', 'Turkey', 'Anas_discors', 'Anas_carolinensis', 
                'Anas_clypeata', 'Anas_sp.', 'Arenaria_interpres', 'Anas_americana',
                'Anas_rubripes', 'Anas_strepera', 'Anas_querquedula', 'Larus_atricilla',
                'Guineafowl', 'Anas_crecca', 'Larus_hyperboreus', 'Gallus_gallus', 
                'Calidris_canutus', 'Melanitta_nigra', 'Tadorna_feruginea', 'Tadorna_tadorna',
                'Anser_caerulescens', 'Aythya_collaris', 'Anser_albifrons', 'Somateria_fischeri',
                'Calidris_alpina', 'Anas_georgica', 'Chen_canagica', 'Larus_glaucescens',
                'Anas_cyanoptera', 'Calidris_alba', 'Chroicocephalus_ridibundus',
                'Leucophaeus_atricilla', 'Calidris_pusilla', 'Sandpiper', 'American_wigeon', 
                'Mallard', 'Baikal_teal', 'Eurasian_curlew', 'Blue-winged_teal', 
                'Anseriformes_sp.', 'Anas_platyrhynchos_var._domesticus', 
                'Anas_platyrhynchos_x_Anas_acuta', 'Emperor_goose', 'American_black_duck', 
                'Pink-eared_duck', 'Anser_cygnoides', 'Ruddy_turnstone', 'Common_teal',
                'Northern_pintail', 'Cinnamon_Teal', 'Mallard_duck', 'Wild_waterfowl',
                'Grey_teal', 'Bucephala_albeola', 'Wild_bird', 'Gull', 'Northern_shoveler', 
                'Corvus_frugilegus', 'Branta_leucopsis', 'Oxyura_jamaicensis', 'Aix_sponsa', 'Cygnus_cygnus', 
                'Coturnix', 'Larus_argentatus', 'Cairina_moschata', 'Pheasant', 'Greylag_goose',
                'Wild_birds', 'Green-winged_teal', 'Teal', 'Anser_fabalis', 'Cygnus_columbianus', 
                'Clangula_hyemalis', 'Netta_rufina']

    swine_list = ['Sus_scrofa_scrofa', 'Sus_scrofa', 'Sus_scrofa_domesticus', 'Pig']
    feline_list = ['Felis_catus']
    canine_list = ['Canis_lupus_familiaris']
    equine_list = ['Equus_caballus', 'Horse']

    # host name -> host group; setdefault keeps the first group should a name
    # ever appear in two lists (same precedence as assigning group by group).
    host_groups = {}
    for group, hosts in (('Avian', avian_list), ('Swine', swine_list), ('Equine', equine_list),
                         ('Feline', feline_list), ('Canine', canine_list)):
        for host in hosts:
            host_groups.setdefault(host, group)

    # Region lookup table; identical for every gene, so read it only once.
    regions = pd.read_csv('regions.csv')

    for gene in list_of_genes:

        df = fasta_to_df(f"{input_path}h3nx_{gene}.fasta")

        #this can be customized, change it based on how the headers are in your data
        df['header'] = df['header'].str.replace(' ', '_')
        header_fields = df['header'].str.split("|")
        df['Strain'] = header_fields.str[0]
        df['Accession'] = header_fields.str[1]
        df['Subtype'] = header_fields.str[2]
        df['Date'] = header_fields.str[3]
        df['Host'] = header_fields.str[4] #host is the latin name in NCBI
        df['Country'] = header_fields.str[5]
        df['Species'] = df['Strain'].str.split("/").str[1]

        #cleanup based on previous problems with data
        # Assign the result back instead of calling replace(inplace=True) on
        # an attribute-accessed column: that chained in-place form does not
        # modify the frame under pandas Copy-on-Write.
        df['Country'] = df['Country'].replace('Viet_Nam', 'Vietnam')
        df = df[df["Country"] != ""]
        df = df[~df["Subtype"].str.contains("H3Nx|H3,mixed|mixed,H3|mixed,_H3|Mixed,H3|mixed.H3|H3N-")].copy()
        df['Subtype'] = df['Subtype'].replace('H3N6,H3', 'H3N6')
        df = df[~df["Species"].str.contains("animal")]

        #adding region data
        df = df.merge(regions,left_on = df["Country"].str.lower(), right_on= regions["country"], how= "left")
        df = df.drop(columns=['key_0'])

        # Collapse known host names into their group; hosts that are in none
        # of the lists keep their original value.
        df['Host'] = df['Host'].map(host_groups).fillna(df['Host'])

        df = speciesClean(df)

        # NOTE: 'country' and 'region' are the regions.csv columns, not the
        # 'Country' header field. '|'.join raises TypeError when any of these
        # fields is missing (NaN), e.g. for a country or species that has no
        # entry in the lookup tables.
        df['header'] = df[['Strain', 'Accession', 'Subtype', 'Date', 'Host', 'country', 'region','Correction', 'Broad','Order']].apply('|'.join, axis=1)

        fasta_writer('./QC/', f"h3nx_{gene}.fasta", df)
    fastaDeDupe(list_of_genes, "./QC/", standardize)

In [47]:
# Run the NCBI QC on the files in ./parsed/ with date standardization enabled.
# NOTE: list_of_genes is assigned in a cell further down (In [12]); on a fresh
# kernel that cell has to be run before this one.
NCBI_QC(list_of_genes, "./parsed/", True)

[]
[]
[]
[]
[]
[]
[]
[]


In [10]:
def gisaidPrep(list_of_genes, metadata_path, input_path, output_path, dedupe = True, standardize = True):
    """Attach GISAID metadata to the per-gene FASTA files and rewrite their headers.

    For every gene, ``{input_path}h3nx_{gene}.fasta`` is read (header fields
    ``strain|accession|date``), joined with the metadata on the isolate name,
    cleaned up, extended with region data from ``regions.csv`` and species
    data from ``species.csv`` (via ``speciesClean``) and written to
    ``output_path`` with the header
    ``Strain|Accession|Subtype|Date|Host_Type|Country|region|Correction|Broad|Order``
    (same field order as the NCBI QC output, just named differently).

    Parameters
    ----------
    list_of_genes : iterable of str
        Gene names; files must be named ``h3nx_{gene}.fasta``.
    metadata_path : str
        CSV file with at least the columns ``Isolate_Name``, ``Location``,
        ``Subtype`` and ``Host``.
    input_path : str
        Directory with the input files; concatenated directly with the file
        name, so it must end with a path separator.
    output_path : str
        Directory the rewritten files are stored in.
    dedupe : bool, default True
        Run ``fastaDeDupe`` on ``output_path`` afterwards.
    standardize : bool, default True
        Passed on to ``fastaDeDupe``.
    """
    metadata = pd.read_csv(metadata_path)

    #replacing any spaces in the Isolate_Name column with underscores
    #adding the > character so that you can find matches in the .fasta file
    metadata['Isolate_Name'] = metadata['Isolate_Name'].str.replace(' ', '_')
    metadata['Isolate_Name'] = '>' + metadata['Isolate_Name'].astype(str)

    #extracting the country name as the second value in the location column
    #(location is formatted continent/country/state/county)
    #drops any sequences where location or country data is not available
    metadata = metadata.dropna(subset=['Location'])
    metadata = metadata.assign(Country=metadata['Location'].str.split('/').str[1].str.strip())
    metadata = metadata.dropna(subset=['Country'])
    metadata_fields = metadata[['Isolate_Name', 'Subtype', 'Country', 'Host']]

    # Country spellings used by GISAID -> spellings used in this project.
    country_fixes = {
        'United States': 'USA',
        'Korea, Republic of': 'South Korea',
        'Russian Federation': 'Russia',
        'Hong Kong (SAR)': 'Hong Kong',
        "Lao, People's Democratic Republic": "Laos",
    }

    # Host names that are collapsed into a host group to match the genbank
    # host field. Built once; they do not depend on the gene.
    avian_list = ['Duck', 'Swan', 'Goose', 'Other_avian', 'Chicken', 'Anas_platyrhynchos', 
                'Anas_acuta', 'Turkey', 'Anas_discors', 'Anas_carolinensis', 
                'Anas_clypeata', 'Anas_sp.', 'Arenaria_interpres', 'Anas_americana',
                'Anas_rubripes', 'Anas_strepera', 'Anas_querquedula', 'Larus_atricilla',
                'Guineafowl', 'Anas_crecca', 'Larus_hyperboreus', 'Gallus_gallus', 
                'Calidris_canutus', 'Melanitta_nigra', 'Tadorna_feruginea', 'Tadorna_tadorna',
                'Anser_caerulescens', 'Aythya_collaris', 'Anser_albifrons', 'Somateria_fischeri',
                'Calidris_alpina', 'Anas_georgica', 'Chen_canagica', 'Larus_glaucescens',
                'Anas_cyanoptera', 'Calidris_alba', 'Chroicocephalus_ridibundus',
                'Leucophaeus_atricilla', 'Calidris_pusilla', 'Sandpiper', 'American_wigeon', 
                'Mallard', 'Baikal_teal', 'Eurasian_curlew', 'Blue-winged_teal', 
                'Anseriformes_sp.', 'Anas_platyrhynchos_var._domesticus', 
                'Anas_platyrhynchos_x_Anas_acuta', 'Emperor_goose', 'American_black_duck', 
                'Pink-eared_duck', 'Anser_cygnoides', 'Ruddy_turnstone', 'Common_teal',
                'Northern_pintail', 'Cinnamon_Teal', 'Mallard_duck', 'Wild_waterfowl',
                'Grey_teal', 'Bucephala_albeola', 'Wild_bird', 'Gull', 'Northern_shoveler', 
                'Corvus_frugilegus', 'Branta_leucopsis', 'Oxyura_jamaicensis', 'Aix_sponsa', 'Cygnus_cygnus', 
                'Coturnix', 'Larus_argentatus', 'Cairina_moschata', 'Pheasant', 'Greylag_goose',
                'Wild_birds', 'Green-winged_teal', 'Teal', 'Anser_fabalis', 'Cygnus_columbianus', 
                'Clangula_hyemalis', 'Netta_rufina', "Penguin", "Pigeon"]

    swine_list = ['Sus_scrofa_scrofa', 'Sus_scrofa', 'Sus_scrofa_domesticus', 'Pig']
    feline_list = ['Felis_catus']
    canine_list = ['Canis_lupus_familiaris']
    equine_list = ['Equus_caballus', 'Horse']

    # host name -> host group; setdefault keeps the first group should a name
    # ever appear in two lists (same precedence as assigning group by group).
    host_groups = {}
    for group, hosts in (('Avian', avian_list), ('Swine', swine_list), ('Equine', equine_list),
                         ('Feline', feline_list), ('Canine', canine_list)):
        for host in hosts:
            host_groups.setdefault(host, group)

    # Region lookup table; identical for every gene, so read it only once.
    regions = pd.read_csv('regions.csv')

    for gene in list_of_genes:

        df = fasta_to_df(f"{input_path}h3nx_{gene}.fasta")

        #make sure this matches your data
        header_fields = df['header'].str.split("|")
        df['Strain'] = header_fields.str[0]
        df['Accession'] = header_fields.str[1]
        df['Date'] = header_fields.str[2]

        #merging metadata with df on Isolate_Name column, adding metadata columns youre interested in
        merged = pd.merge(df, metadata_fields, left_on='Strain', right_on='Isolate_Name')

        #country + host QC and replacing spaces
        # Assign the result back instead of calling replace(inplace=True) on
        # an attribute-accessed column: that chained in-place form does not
        # modify the frame under pandas Copy-on-Write.
        merged['Country'] = merged['Country'].replace(country_fixes)
        merged['Country'] = merged['Country'].str.replace(' ', '_')
        merged['Host'] = merged['Host'].str.replace(' ', '_')
        merged['Subtype'] = merged['Subtype'].str.replace('A / ', '', regex=False)
        merged['Species'] = merged['Strain'].str.split("/").str[1]
        merged = merged[~merged["Species"].str.contains("environment")].copy()

        #adding region data
        merged = merged.merge(regions,left_on = merged["Country"].str.lower(), right_on= regions["country"], how= "left")
        merged = merged.drop(columns=['key_0'])

        # Known hosts get their group name, every other host keeps the value
        # reported in the metadata.
        merged['Host_Type'] = merged['Host'].map(host_groups).fillna(merged['Host'])

        merged = speciesClean(merged)

        #the fields are in the same order as in the ncbi QC, just named differently
        # '|'.join raises TypeError when any of these fields is missing (NaN).
        merged['header'] = merged[['Strain', 'Accession', 'Subtype', 'Date', 'Host_Type', 'Country', 'region','Correction', 'Broad','Order']].apply('|'.join, axis=1)

        fasta_writer(f"{output_path}", f"h3nx_{gene}.fasta", merged) 

    if dedupe:
        fastaDeDupe(list_of_genes, output_path, standardize)

In [101]:
# Prepare the GISAID files found in the working directory and write them to
# ./gisaid/, with deduplication and date standardization enabled.
# NOTE: list_of_genes is assigned in a cell further down (In [12]); on a fresh
# kernel that cell has to be run before this one.
gisaidPrep(list_of_genes, "gisaid_metadata.csv", "./", "./gisaid/", True, True)

[]
[]
[]
[]
[]
[]
[]
[]


In [11]:
def merge(list_of_genes, gisaid_path, NCBI_path, merged_path, dedupe=True, standardize=True):
    """Append the GISAID records to the NCBI/genbank records, gene by gene.

    Assumes all QC has already been done on both inputs. For every gene the
    content of ``h3nx_{gene}.fasta`` from ``NCBI_path`` followed by the one
    from ``gisaid_path`` is appended to ``h3nx_{gene}.fasta`` in
    ``merged_path``. The output file is opened in append mode, so calling this
    again adds the records again to an existing file.

    Parameters
    ----------
    list_of_genes : iterable of str
        Gene names; files must be named ``h3nx_{gene}.fasta``.
    gisaid_path, NCBI_path : str
        Directories holding the two inputs.
    merged_path : str
        Output directory; created (including parents) when missing. Must end
        with a path separator when ``dedupe`` is True, because it is handed
        to ``fastaDeDupe`` as its input path.
    dedupe : bool, default True
        Run ``fastaDeDupe`` on ``merged_path`` afterwards.
    standardize : bool, default True
        Passed on to ``fastaDeDupe``.
    """

    def _read_records(fasta_path):
        # Return the file content, guaranteed to end with a newline so that
        # the first header of whatever is written next cannot end up glued
        # onto the last sequence line.
        with open(fasta_path, 'r') as handle:
            text = handle.read()
        if text and not text.endswith("\n"):
            text += "\n"
        return text

    if merged_path:
        os.makedirs(merged_path, exist_ok=True)

    for gene in list_of_genes:
        filename = f"h3nx_{gene}.fasta"
        # Both inputs are read before the output is opened, so a missing
        # input does not leave a new, empty output file behind.
        ncbi_records = _read_records(os.path.join(NCBI_path, filename))
        gisaid_records = _read_records(os.path.join(gisaid_path, filename))
        with open(os.path.join(merged_path, filename), 'a+') as merged_file:
            merged_file.write(ncbi_records)
            merged_file.write(gisaid_records)

    if dedupe:
        fastaDeDupe(list_of_genes, merged_path, standardize)

In [12]:
# Genes to process; every pipeline step expects one h3nx_{gene}.fasta file
# per entry.
list_of_genes = [
    "ha",
    "pb1",
    "pb2",
    "pa",
    "mp",
    "np",
    "na",
    "ns",
]

In [28]:
# Combine the date-standardized GISAID and NCBI files into ./merged/.
# dedupe and standardize are left at their defaults (True), so fastaDeDupe
# runs on ./merged/ afterwards.
merge(list_of_genes,
      "./june-17-2024-gisaid/consistent/", 
      "./june-6-2024-ncbi/consistent/",
      "./merged/")

In [14]:
def mergeWithMA(list_of_genes, new_sequence_path, current_sequences_path, merged_path, dedupe=True, standardize=True):
    """Append the current sequence set to a set of new sequences, gene by gene.

    For every gene the content of ``h3nx_{gene}.fasta`` from
    ``new_sequence_path`` followed by the one from ``current_sequences_path``
    is appended to ``h3nx_{gene}.fasta`` in ``merged_path``. The output file
    is opened in append mode, so calling this again adds the records again to
    an existing file.

    Parameters
    ----------
    list_of_genes : iterable of str
        Gene names; files must be named ``h3nx_{gene}.fasta``.
    new_sequence_path, current_sequences_path : str
        Directories holding the two inputs.
    merged_path : str
        Output directory; created (including parents) when missing. Must end
        with a path separator when ``dedupe`` is True, because it is handed
        to ``fastaDeDupe`` as its input path.
    dedupe : bool, default True
        Run ``fastaDeDupe`` on ``merged_path`` afterwards.
    standardize : bool, default True
        Passed on to ``fastaDeDupe``.
    """

    def _read_records(fasta_path):
        # Return the file content, guaranteed to end with a newline so that
        # the first header of whatever is written next cannot end up glued
        # onto the last sequence line.
        with open(fasta_path, 'r') as handle:
            text = handle.read()
        if text and not text.endswith("\n"):
            text += "\n"
        return text

    if merged_path:
        os.makedirs(merged_path, exist_ok=True)

    for gene in list_of_genes:
        filename = f"h3nx_{gene}.fasta"
        # Both inputs are read before the output is opened, so a missing
        # input does not leave a new, empty output file behind.
        new_records = _read_records(os.path.join(new_sequence_path, filename))
        current_records = _read_records(os.path.join(current_sequences_path, filename))
        with open(os.path.join(merged_path, filename), 'a+') as merged_file:
            merged_file.write(new_records)
            merged_file.write(current_records)

    if dedupe:
        fastaDeDupe(list_of_genes, merged_path, standardize)

In [16]:
# Combine the sequences from the first directory with ./june-2024/consistent/
# into ./merged/ (dedupe and standardize left at their defaults, True).
# NOTE(review): the first path is an absolute path on one specific machine;
# it has to be adapted before this cell can run anywhere else.
mergeWithMA(list_of_genes,
      "/Users/monclalab1/Documents/nonhuman_H3_project/non-human-h3/conditon_on_ha/sequences/", 
      "./june-2024/consistent/",
      "./merged/")