In [3]:
import pandas as pd


In [4]:
#this will only keep the duplicate with the longest sequence
def fastaDeDupeDF(df):
    """Return one row per strain, keeping the row with the longest sequence.

    The strain is the first ``|``-separated field of the ``header`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain string columns ``header`` and ``sequence``. Any index is
        accepted (it does not have to be the default 0..n-1 range).

    Returns
    -------
    pandas.DataFrame
        A new frame with an added ``strain`` column, one row per strain,
        ordered by ascending sequence length. Original index labels are kept,
        so ``.sort_index()`` restores the input order. The input frame is not
        modified; callers must use the return value.
    """
    # Work on a copy so the caller's frame does not silently gain a column.
    with_strain = df.assign(strain=df['header'].str.split("|").str[0])

    # To inspect which strains are duplicated:
    # with_strain[with_strain.duplicated(subset="strain")][['strain', 'sequence']]

    # Sort rows positionally by sequence length (ascending). A stable sort
    # makes ties deterministic (the later row wins), and missing sequences are
    # placed first so they can never be chosen as the "longest".
    by_length = with_strain.sort_values(
        "sequence",
        key=lambda seqs: seqs.str.len(),
        kind="mergesort",
        na_position="first",
    )

    # The last row of each strain group is now its longest sequence.
    return by_length.groupby("strain").tail(1)

In [None]:
def genbankPrep(fasta_file, fasta_output, regions_file='regions.csv'):
    """Clean a GenBank FASTA file and rewrite its headers with region data.

    Input headers are expected to be
    ``>Strain|Accession|Subtype|Date|Host|Country``. Output headers are
    ``Strain|Accession|Subtype|Date|Host|country|Species|region``, where
    ``country`` and ``region`` come from the regions table and ``Species`` is
    the second ``/``-separated part of the strain name.

    Parameters
    ----------
    fasta_file : str
        Path of the FASTA file to read.
    fasta_output : str
        Path of the FASTA file to write (one header line and one sequence line
        per record).
    regions_file : str, optional
        CSV with at least ``country`` and ``region`` columns; ``country`` is
        matched against the lower-cased header country.

    Notes
    -----
    Records are dropped when the country is empty or missing, the strain
    contains ``Equine_influenza_virus_H3N8``, or the subtype is mixed/H3Nx.
    Duplicate strains are reduced to the longest sequence.
    A country with no match in the regions table leaves ``country``/``region``
    missing, and building the header then raises ``TypeError``.
    """
    # Parse the FASTA into (header, sequence) records; sequence lines are
    # collected in a list and joined once per record.
    records = []
    header = None
    chunks = []
    with open(fasta_file, "r") as f:
        for line in f:
            if line.startswith(">"):
                if header is not None:
                    records.append({"header": header, "sequence": "".join(chunks)})
                header = line.strip()
                chunks = []
            else:
                chunks.append(line.strip())
    if header is not None:
        records.append({"header": header, "sequence": "".join(chunks)})

    # No records in the input: produce an empty output instead of failing on
    # a frame with no usable rows.
    if not records:
        with open(fasta_output, "w"):
            pass
        return

    df = pd.DataFrame(records)

    # One column per header field; this keeps the header layout flexible.
    df['header'] = df['header'].str.replace(' ', '_')
    fields = df['header'].str.split("|")
    for position, column in enumerate(['Strain', 'Accession', 'Subtype', 'Date', 'Host', 'Country']):
        df[column] = fields.str[position]

    # Assign the result back: an in-place replace on a column selection does
    # not reliably update the frame.
    df['Country'] = df['Country'].replace('Viet_Nam', 'Vietnam')

    # Headers with too few fields yield missing values; they are treated like
    # records without a country and dropped.
    keep = (
        df["Country"].notna()
        & (df["Country"] != "")
        & ~df["Strain"].str.contains("Equine_influenza_virus_H3N8", na=False)
        & ~df["Subtype"].str.contains("H3Nx|H3,mixed|mixed,H3|mixed,_H3|Mixed,H3|mixed.H3", na=False)
    )
    df = df[keep].copy()
    df['Subtype'] = df['Subtype'].replace('H3N6,H3', 'H3N6')

    df['Species'] = df['Strain'].str.split("/").str[1]

    # Adding region data (left join keeps records without a matching region).
    regions = pd.read_csv(regions_file)
    df = df.merge(regions, left_on=df["Country"].str.lower(), right_on=regions["country"], how="left")

    df['header'] = df[['Strain', 'Accession', 'Subtype', 'Date', 'Host', 'country', 'Species', 'region']].apply('|'.join, axis=1)

    # Keep only the longest sequence per strain. The helper returns a new
    # frame, so its result must be assigned; sort_index restores file order.
    df = fastaDeDupeDF(df).sort_index()

    with open(fasta_output, "w") as f:
        for new_header, sequence in zip(df['header'], df['sequence']):
            f.write(f"{new_header}\n")
            f.write(f"{sequence}\n")

In [None]:
def gisaidPrep(fasta_file, fasta_output,
               metadata_file='H3Nx-GISAID-all-metadata.tsv',
               regions_file='regions.csv'):
    """Clean a GISAID FASTA file and rewrite its headers using GISAID metadata.

    Input headers are expected to be ``>Strain|Accession|Segment|Date``.
    Output headers are
    ``Strain|Accession|Subtype|Date|Host_Type|country|Host|region``.

    Parameters
    ----------
    fasta_file : str
        Path of the FASTA file to read.
    fasta_output : str
        Path of the FASTA file to write.
    metadata_file : str, optional
        Tab-separated metadata table; the columns ``Isolate_Name``,
        ``Location``, ``Subtype`` and ``Host`` are used.
    regions_file : str, optional
        CSV with at least ``country`` and ``region`` columns; ``country`` is
        matched against the lower-cased, underscore-joined metadata country.

    Notes
    -----
    Only sequences whose strain matches a metadata ``Isolate_Name`` (spaces
    replaced by underscores, ``>`` prepended) are written. Duplicate strains
    are reduced to the longest sequence before the metadata join. The distinct
    ``Host_Type`` values are printed for inspection.
    """
    metadata = pd.read_csv(metadata_file, sep='\t')

    # Replace spaces with underscores and prepend '>' so isolate names can be
    # matched directly against the FASTA strain field.
    metadata['Isolate_Name'] = metadata['Isolate_Name'].str.replace(' ', '_')
    metadata['Isolate_Name'] = '>' + metadata['Isolate_Name'].astype(str)

    # Country is the second value of Location (continent/country/state/county);
    # rows without location or country data are dropped.
    metadata.dropna(subset=['Location'], inplace=True)
    metadata['Country'] = metadata['Location'].str.split('/').str[1].str.strip()
    metadata.dropna(subset=['Country'], inplace=True)

    # Parse the FASTA into (header, sequence) records; sequence lines are
    # collected in a list and joined once per record.
    records = []
    header = None
    chunks = []
    with open(fasta_file, "r") as f:
        for line in f:
            if line.startswith(">"):
                if header is not None:
                    records.append({"header": header, "sequence": "".join(chunks)})
                header = line.strip()
                chunks = []
            else:
                chunks.append(line.strip())
    if header is not None:
        records.append({"header": header, "sequence": "".join(chunks)})

    # No records in the input: produce an empty output instead of failing on
    # a frame with no usable rows.
    if not records:
        with open(fasta_output, "w"):
            pass
        return

    df = pd.DataFrame(records)

    # One column per header field; this keeps the header layout flexible.
    fields = df['header'].str.split("|")
    for position, column in enumerate(['Strain', 'Accession', 'Segment', 'Date']):
        df[column] = fields.str[position]

    # Keep only the longest sequence per strain. The helper returns a new
    # frame, so its result must be assigned; sort_index restores file order.
    df = fastaDeDupeDF(df).sort_index()

    # Inner join: attach the metadata columns of interest to matching strains.
    merged = pd.merge(df, metadata[['Isolate_Name', 'Subtype', 'Country', 'Host']], left_on='Strain', right_on='Isolate_Name')

    merged['Segment'] = merged['Subtype']
    merged['Accession'] = "EPI" + merged['Accession']

    # Country + host QC and replacing spaces. The result is assigned back
    # because an in-place replace on a column selection does not reliably
    # update the frame.
    country_fixes = {
        'United States': 'USA',
        'Korea, Republic of': 'South Korea',
        'Russian Federation': 'Russia',
        'Hong Kong (SAR)': 'Hong Kong',
        "Lao, People's Democratic Republic": "Laos",
    }
    merged['Country'] = merged['Country'].replace(country_fixes).str.replace(' ', '_')
    merged['Host'] = merged['Host'].str.replace(' ', '_')

    # Adding region data (left join keeps records without a matching region).
    regions = pd.read_csv(regions_file)
    merged = merged.merge(regions, left_on=merged["Country"].str.lower(), right_on=regions["country"], how="left")

    # Adding host data to match the genbank host field.
    avian_list = ['Duck', 'Swan', 'Goose', 'Other_avian', 'Chicken', 'Anas_platyrhynchos',
                  'Anas_acuta', 'Turkey', 'Anas_discors', 'Anas_carolinensis',
                  'Anas_clypeata', 'Anas_sp.', 'Arenaria_interpres', 'Anas_americana',
                  'Anas_rubripes', 'Anas_strepera', 'Anas_querquedula', 'Larus_atricilla',
                  'Guineafowl', 'Anas_crecca', 'Larus_hyperboreus', 'Gallus_gallus',
                  'Calidris_canutus', 'Melanitta_nigra', 'Tadorna_feruginea', 'Tadorna_tadorna',
                  'Anser_caerulescens', 'Aythya_collaris', 'Anser_albifrons', 'Somateria_fischeri',
                  'Calidris_alpina', 'Anas_georgica', 'Chen_canagica', 'Larus_glaucescens',
                  'Anas_cyanoptera', 'Calidris_alba', 'Chroicocephalus_ridibundus',
                  'Leucophaeus_atricilla', 'Calidris_pusilla', 'Sandpiper', 'American_wigeon',
                  'Mallard', 'Baikal_teal', 'Eurasian_curlew', 'Blue-winged_teal',
                  'Anseriformes_sp.', 'Anas_platyrhynchos_var._domesticus',
                  'Anas_platyrhynchos_x_Anas_acuta', 'Emperor_goose', 'American_black_duck',
                  'Pink-eared_duck', 'Anser_cygnoides', 'Ruddy_turnstone', 'Common_teal',
                  'Northern_pintail', 'Cinnamon_Teal', 'Mallard_duck', 'Wild_waterfowl',
                  'Grey_teal', 'Bucephala_albeola', 'Wild_bird', 'Gull', 'Northern_shoveler',
                  'Corvus_frugilegus', 'Branta_leucopsis', 'Oxyura_jamaicensis', 'Aix_sponsa', 'Cygnus_cygnus',
                  'Coturnix', 'Larus_argentatus', 'Cairina_moschata', 'Pheasant', 'Greylag_goose',
                  'Wild_birds', 'Green-winged_teal', 'Teal', 'Anser_fabalis', 'Cygnus_columbianus',
                  'Clangula_hyemalis', 'Netta_rufina']

    swine_list = ['Sus_scrofa_scrofa', 'Sus_scrofa', 'Sus_scrofa_domesticus', 'Pig']
    feline_list = ['Felis_catus']
    canine_list = ['Canis_lupus_familiaris']
    equine_list = ['Equus_caballus', 'Horse']

    # Build one host -> host type lookup. Hosts not in any list keep their
    # own name as the host type.
    host_types = {}
    for host_type, hosts in (('Avian', avian_list), ('Swine', swine_list),
                             ('Equine', equine_list), ('Feline', feline_list),
                             ('Canine', canine_list)):
        for host in hosts:
            host_types[host] = host_type
    merged['Host_Type'] = merged['Host'].map(host_types).fillna(merged['Host'])
    print(merged.Host_Type.unique())

    merged['header'] = merged[['Strain', 'Accession', 'Segment', 'Date', 'Host_Type', 'country', 'Host', 'region']].apply('|'.join, axis=1)

    # Write the new FASTA: header and sequence columns become alternating lines.
    with open(fasta_output, "w") as f:
        for new_header, sequence in zip(merged['header'], merged['sequence']):
            f.write(f"{new_header}\n")
            f.write(f"{sequence}\n")

In [None]:
def speciesClean(fasta_input, fasta_output, species_file='species.csv'):
    """Normalise the species field of a prepared FASTA file.

    Input headers are expected to be
    ``>Strain|Accession|Subtype|Date|Host|Country|Species|Region`` with no
    spaces (underscores only). Output headers are
    ``Strain|Accession|Subtype|Date|Host|Country|Region|Correction|Broad|Order``
    with ``Region`` and ``Correction`` lower-cased.

    Parameters
    ----------
    fasta_input : str
        Input path WITHOUT its extension; ``".fa"`` is appended when opening.
    fasta_output : str
        Full path of the FASTA file to write (used as given).
    species_file : str, optional
        CSV with the columns ``annotated``, ``correction``, ``broad`` and
        ``order``.

    Notes
    -----
    Records whose species contains ``animal`` or whose subtype is mixed/H3Nx
    are dropped. Species with no match in the species table are printed; if
    any exist, building the header raises ``TypeError`` (missing values are
    floats), so add them to the species table and rerun.
    """
    # Parse the FASTA into (header, sequence) records; sequence lines are
    # collected in a list and joined once per record.
    records = []
    header = None
    chunks = []
    with open(fasta_input + ".fa", "r") as f:
        for line in f:
            if line.startswith(">"):
                if header is not None:
                    records.append({"header": header, "sequence": "".join(chunks)})
                header = line.strip()
                chunks = []
            else:
                chunks.append(line.strip())
    if header is not None:
        records.append({"header": header, "sequence": "".join(chunks)})

    # No records in the input: produce an empty output instead of failing on
    # a frame with no usable rows.
    if not records:
        with open(fasta_output, "w"):
            pass
        return

    df = pd.DataFrame(records)

    # Split each header once; Species and Region are taken lower-cased.
    fields = df['header'].str.split("|")
    lowered_fields = df['header'].str.lower().str.split("|")
    for position, column in enumerate(['Strain', 'Accession', 'Subtype', 'Date', 'Host', 'Country']):
        df[column] = fields.str[position]
    df['Species'] = lowered_fields.str[6]
    df['Region'] = lowered_fields.str[7]

    species = pd.read_csv(species_file)

    # Some cleanup. The filtered frame is copied and the replace result is
    # assigned back, because an in-place replace on a column selection does
    # not reliably update the frame.
    keep = (
        ~df["Species"].str.contains("animal")
        & ~df["Subtype"].str.contains("H3Nx|H3,mixed|mixed,H3|mixed,_H3|Mixed,H3|mixed.H3")
    )
    df = df[keep].copy()
    df['Subtype'] = df['Subtype'].replace('H3N6,H3', 'H3N6')

    # Pass 1: map the annotated species name to its corrected name.
    df = df.merge(species, left_on=df["Species"].str.lower(), right_on=species["annotated"].str.lower(), how="left")
    df['correction'] = df['correction'].str.lower()
    df.loc[df['correction'].notnull(), 'Species'] = df['correction']
    df = df.drop(columns=['correction', 'broad', 'annotated', 'order', 'key_0'])

    # Pass 2: look up the (now corrected) species to fetch its broad/order
    # classification; names that were already correct match here as well.
    df = df.merge(species, left_on=df["Species"].str.lower(), right_on=species["correction"].str.lower(), how="left")

    df['Correction'] = df['correction'].str.lower()
    df["Broad"] = df['broad']
    df["Order"] = df['order']
    df = df.drop(columns=['key_0'])

    # If you get an error that there are floats instead of strings, add these
    # species to the species table.
    print(df.Species.loc[df['correction'].isnull()].unique())

    # Merging will create duplicates (several table rows can share a
    # correction), so keep the first row per strain.
    df = df.drop_duplicates(subset=['Strain'], keep='first', ignore_index=True)

    df['header'] = df[['Strain', 'Accession', 'Subtype', 'Date', 'Host', 'Country', 'Region', 'Correction', 'Broad', 'Order']].apply('|'.join, axis=1)

    with open(fasta_output, "w") as f:
        for new_header, sequence in zip(df['header'], df['sequence']):
            f.write(f"{new_header}\n")
            f.write(f"{sequence}\n")