In [1]:
import pandas as pd
import os 

In [1]:
#The treesort_prep function was needed before the condition_on_ha build existed: it
#ensures that strains are consistent across all segments.
#date_updater is still needed, however, because treesort does not accept XX-XX-XXXX dates.

In [2]:
#Convert a FASTA file into a DataFrame with one row per record

def fasta_to_df(fasta_file):
    """Parse a FASTA file into a DataFrame with one row per record.

    Parameters
    ----------
    fasta_file : str or path-like
        Path of the FASTA file to read.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``header`` (the whole header line, stripped, including
        the leading ">") and ``sequence`` (every sequence line of the record,
        stripped and concatenated). A file that contains no header line
        yields an empty DataFrame that still has both columns, instead of a
        single bogus row with an empty header.
    """
    records = []
    header = None      # None until the first ">" line has been seen
    chunks = []        # sequence lines of the record currently being read

    with open(fasta_file) as f:
        for line in f:
            if line.startswith(">"):
                if header is not None:
                    records.append({"header": header, "sequence": "".join(chunks)})
                header = line.strip()
                chunks = []
            elif header is not None:
                # Text that precedes the first header belongs to no record
                # and is ignored.
                chunks.append(line.strip())

    # Flush the last record; skipped when the file had no header at all.
    if header is not None:
        records.append({"header": header, "sequence": "".join(chunks)})

    # Explicit columns keep the schema stable even when there are no records.
    return pd.DataFrame(records, columns=["header", "sequence"])

In [3]:
def fasta_writer(path, filename, df):
    """Write the records of ``df`` to ``path``/``filename`` in FASTA format.

    Parameters
    ----------
    path : str
        Output directory. It is created, together with any missing parent
        directories, when it does not exist yet. A trailing separator is
        optional.
    filename : str
        Name of the file to create inside ``path``; an existing file is
        overwritten.
    df : pandas.DataFrame
        Must provide ``header`` and ``sequence`` columns. Each row is written
        as two lines: the header exactly as stored, then the sequence.

    Raises
    ------
    OSError
        If the directory cannot be created or the file cannot be opened.
    """
    # makedirs(exist_ok=True) tolerates an existing directory but still
    # surfaces real failures (permissions, a file in the way), which the
    # previous blanket "except OSError: pass" hid.
    if path:
        os.makedirs(path, exist_ok=True)

    # os.path.join gives the same result as plain concatenation when path
    # ends with a separator, and the intended result when it does not.
    with open(os.path.join(path, filename), "w") as f:
        for header, sequence in zip(df['header'], df['sequence']):
            f.write(f"{header}\n")
            f.write(f"{sequence}\n")

In [4]:
def deduper(list_of_genes, list_of_clades=None):
    """Drop duplicate strains from every clade/gene FASTA file.

    For each clade and gene, ``./{clade}/h3nx_{gene}.fa`` is read, records
    whose strain name (the text before the first "|" of the header, compared
    case-insensitively) has already been seen are dropped, and the remaining
    records are written to ``./{clade}/h3nx_{gene}.fasta``.

    Parameters
    ----------
    list_of_genes : list of str
        Gene/segment names used in the file names.
    list_of_clades : list of str, optional
        Clade folder names. When omitted, the notebook-level variable
        ``list_of_clades`` is used, which is what this function always relied
        on before the parameter existed.

    Raises
    ------
    NameError
        If ``list_of_clades`` is neither passed nor defined at notebook level.
    """
    if list_of_clades is None:
        # Backward compatibility with the one-argument call: fall back to the
        # global this function used to read implicitly.
        try:
            list_of_clades = globals()["list_of_clades"]
        except KeyError:
            raise NameError(
                "list_of_clades was not passed and is not defined at notebook level"
            ) from None

    for clade in list_of_clades:
        for gene in list_of_genes:

            df = fasta_to_df(f"./{clade}/h3nx_{gene}.fa")

            # Lower-casing makes the duplicate check case-insensitive; the
            # headers that get written keep their original casing.
            df['strain'] = df['header'].str.split("|").str[0].str.lower()

            # keeps the first record of every strain
            df = df.drop_duplicates(subset=['strain'])

            fasta_writer(f"./{clade}/", f"h3nx_{gene}.fasta", df)

In [25]:
#This restricts each of the 7 non-HA segments to the strains that are present in the
#HA clade, so that reassortment events can be captured effectively.
#Only strains found in every segment (the intersection) are kept.

def treesort_prep(list_of_clades, list_of_genes, output_root="."):
    """Build per-clade FASTA sets in which every segment holds the same strains.

    For each clade, the HA file ``./{clade}/h3nx_ha.fasta`` defines the
    candidate strains (the header text before the first "|"). Every other
    segment is taken from ``./sequences/h3nx_{gene}.fasta``, restricted to
    those strains, and finally all files (HA included) are cut down to the
    strains present in every segment. Results are written to
    ``{output_root}/{clade}_treesort/h3nx_{gene}.fasta``.

    Parameters
    ----------
    list_of_clades : list of str
        Clade folder names.
    list_of_genes : list of str
        Segment names. An "ha" entry is skipped, because HA always comes from
        the clade folder, so the list no longer has to be edited by hand
        before calling this function.
    output_root : str, optional
        Directory under which the ``{clade}_treesort`` folders are created.
        The default "." keeps the original output location.
        NOTE: date_updater reads from ``./treesort/{clade}_treesort/``, so
        either pass ``output_root="./treesort"`` or move the folders there
        before running it.
    """
    # HA is written from the clade file at the end of each iteration.
    genes = [gene for gene in list_of_genes if gene != "ha"]

    for clade in list_of_clades:

        df_ha = fasta_to_df(f"./{clade}/h3nx_ha.fasta")
        df_ha['Strain'] = df_ha['header'].str.split("|").str[0]
        ha_strains = set(df_ha['Strain'])

        # Start from the HA strains and narrow down segment by segment. Every
        # segment subset is already limited to HA strains, so this equals the
        # intersection of the per-segment sets, and it also works when no
        # segment is given (set.intersection() of nothing raises TypeError).
        shared_strains = set(ha_strains)

        # Parse each master file once per clade and keep only the small
        # clade-specific subset for the write step below.
        segment_subsets = []
        for gene in genes:
            df_master = fasta_to_df(f"./sequences/h3nx_{gene}.fasta")
            df_master['Strain'] = df_master['header'].str.split("|").str[0]

            #this makes it so each segment has the strains present in the HA clade
            subset = df_master[df_master['Strain'].isin(ha_strains)]
            segment_subsets.append((gene, subset))
            shared_strains &= set(subset['Strain'])

        out_dir = f"{output_root}/{clade}_treesort/"
        # Create the folder here so that a nested output_root works too.
        os.makedirs(out_dir, exist_ok=True)

        #updating each fasta file to have only the intersection
        for gene, subset in segment_subsets:
            shared_strains_df = subset[subset['Strain'].isin(shared_strains)]
            fasta_writer(out_dir, f"h3nx_{gene}.fasta", shared_strains_df)

        ha_updated = df_ha[df_ha['Strain'].isin(shared_strains)]
        fasta_writer(out_dir, "h3nx_ha.fasta", ha_updated)

In [6]:
def _treesort_header(header, source_file):
    """Rewrite one "|"-delimited FASTA header into the form date_updater emits."""
    fields = header.split("|")
    if len(fields) < 10:
        raise ValueError(
            f"{source_file}: header {header!r} has {len(fields)} '|'-separated "
            "fields, expected at least 10"
        )
    fields = fields[:10]                        # only the first ten fields are carried over
    fields[3] = fields[3].replace("XX", "01")   # fourth field is the date
    fields[0] = f"{fields[0]}|{fields[3]}"      # strain becomes "strain|date"
    return "^".join(fields)


def date_updater(list_of_clades, list_of_genes, treesort_root="./treesort"):
    """Rewrite FASTA headers so that dates contain no "XX" placeholders.

    For each clade and gene, ``{treesort_root}/{clade}_treesort/h3nx_{gene}.fasta``
    is read and every header is rebuilt from its first ten "|"-separated
    fields: "XX" in the fourth field (the date) is replaced by "01", the first
    field becomes ``strain|date``, and the ten fields are joined with "^".
    Sequences are left untouched. The result is written to
    ``{treesort_root}/{clade}_treesort/updated/h3nx_{clade}_{gene}.fasta``.

    Parameters
    ----------
    list_of_clades : list of str
        Clade names; each must have a ``{clade}_treesort`` folder.
    list_of_genes : list of str
        Segment names used in the file names.
    treesort_root : str, optional
        Directory that contains the ``{clade}_treesort`` folders. The default
        keeps the original ``./treesort`` location.

    Raises
    ------
    ValueError
        If a header has fewer than ten "|"-separated fields.
    """
    for clade in list_of_clades:
        clade_dir = f"{treesort_root}/{clade}_treesort"

        for gene in list_of_genes:
            source_file = f"{clade_dir}/h3nx_{gene}.fasta"
            df = fasta_to_df(source_file)

            # One split per header instead of ten passes over the column.
            df['header'] = [_treesort_header(header, source_file) for header in df['header']]

            fasta_writer(f"{clade_dir}/updated/", f"h3nx_{clade}_{gene}.fasta", df)
            

In [21]:
#HA clades to process; each name is also the folder the clade's files live in
list_of_clades = [
    "canineH3N2",
    "human",
    "euroSwine",
    "chinaVietSwine",
    "thaiSwine",
    "NA1995Swine",
]

#all eight segments -- remove "ha" from this list when you are running treesort_prep
list_of_genes = [
    "ha",
    "pb2",
    "pb1",
    "na",
    "np",
    "pa",
    "ns",
    "mp",
]


In [20]:
#de-duplicate strains (case-insensitively) in every ./<clade>/h3nx_<gene>.fa file;
#deduper takes the clades from the notebook-level list_of_clades, so that list must be defined first
deduper(list_of_genes)

In [27]:
#HA is read from each clade folder, so only the non-HA segments are passed here
#(this applies the "remove HA" note on list_of_genes without editing the list by hand)
treesort_prep(list_of_clades, [gene for gene in list_of_genes if gene != "ha"])

In [28]:
#replace the XX date placeholders with 01 and rewrite the headers of every file in ./treesort/<clade>_treesort/
#NOTE(review): treesort_prep writes to ./<clade>_treesort/ (no ./treesort/ prefix) -- confirm the
#folders are moved under ./treesort/ before this cell runs
date_updater(list_of_clades, list_of_genes)