In [None]:
import pandas as pd

In [None]:
#This function takes a FASTA file in which the strain is the first "|"-separated field of each header.
#When a strain appears more than once, only the record with the longest sequence is kept.
def fastaDeDupeFile(fasta_file):
    """De-duplicate a FASTA file in place, keeping the longest record per strain.

    The strain is the first ``|``-separated field of each header line
    (including the leading ``>``). When several records share a strain, only
    the one with the longest sequence is kept; if the longest length is tied,
    the record that appears later in the file wins.

    The file is rewritten with one header line and one sequence line per
    record (multi-line sequences are joined), ordered by ascending sequence
    length. A file that contains no header line is left untouched.

    Args:
        fasta_file: Path of the FASTA file to read and then overwrite.

    Returns:
        None.
    """
    # Parse the file into records with separate header and sequence fields.
    fasta_data = []
    header = None
    chunks = []
    with open(fasta_file, "r") as f:
        for line in f:
            if line.startswith(">"):
                if header is not None:
                    fasta_data.append(
                        {"header": header, "sequence": "".join(chunks)}
                    )
                header = line.strip()
                chunks = []
            elif header is not None:
                # Text before the first header belongs to no record; skip it.
                chunks.append(line.strip())
    if header is not None:
        fasta_data.append({"header": header, "sequence": "".join(chunks)})

    # Nothing to de-duplicate: do not overwrite the file with blank lines.
    if not fasta_data:
        return

    df = pd.DataFrame(fasta_data)

    # Strain is the first field of the header.
    df["strain"] = df["header"].str.split("|").str[0]

    # Sort ascending by sequence length, then take the last row of each strain
    # group, i.e. its longest sequence. A stable sort keeps equal-length
    # records in file order so the tie-break is deterministic.
    order = df["sequence"].str.len().sort_values(kind="mergesort").index
    df = df.loc[order].groupby("strain").tail(1)

    # Write the surviving records back as header/sequence line pairs.
    with open(fasta_file, "w") as f:
        for head, seq in zip(df["header"], df["sequence"]):
            f.write(f"{head}\n")
            f.write(f"{seq}\n")

In [None]:
#This function takes your GISAID and your GenBank data (file names without the ".fa" extension), assuming all QC
#has been done on both, writes the GenBank data followed by the GISAID data to the merged file, then calls the deDupe function.
def merge(fasta_file_gisaid, fasta_file_genbank, merged_file):
    """Concatenate the GenBank and GISAID FASTA files and de-duplicate the result.

    All three arguments are file names WITHOUT the ``.fa`` extension; the
    extension is added here. The GenBank records are written first, then the
    GISAID records. The merged file is opened in append mode, so any content
    it already holds is kept and takes part in the de-duplication.

    Args:
        fasta_file_gisaid: Base name of the GISAID FASTA file.
        fasta_file_genbank: Base name of the GenBank FASTA file.
        merged_file: Base name of the output FASTA file.

    Returns:
        None.
    """
    merged_path = merged_file + ".fa"
    with open(fasta_file_genbank + ".fa", "r") as genbank, \
            open(fasta_file_gisaid + ".fa", "r") as gisaid, \
            open(merged_path, "a") as merged:
        for source in (genbank, gisaid):
            content = source.read()
            merged.write(content)
            # Without a trailing newline the next file's first header would be
            # glued onto this file's last sequence line and lost as a record.
            if content and not content.endswith("\n"):
                merged.write("\n")

    fastaDeDupeFile(merged_path)