In [None]:
# SeqIO is a module for reading and writing sequence files such as FASTA. It is part of the larger Biopython package, which is imported as Bio.
from Bio import SeqIO

In [1]:
from Bio import SeqIO

import os  # used only to make sure the output directory exists before writing

# Purpose of this cell: read strain names and dates from the metadata TSV, then
# rewrite each segment's FASTA file so every header becomes "<strain>|<date>".

# File paths
InFileName = "data/metadata-with-megaclade.tsv"  # Metadata file with strain names and dates
path_to_fasta_files = "data/"  # Directory for FASTA files
SEGMENTS = ["ha", "na", "mp", "pb1", "pa", "pb2", "np", "ns"]

# Dictionary mapping strain name -> date string (with 'XX' placeholders replaced)
metadata_dict = {}

# `with` guarantees the metadata file is closed even if a lookup below raises
with open(InFileName, 'r') as InFile:
    # Read header and get indexes for strain and date columns.
    # Only the line ending is removed: a full strip() would also eat leading or
    # trailing tabs and silently shift/drop empty columns.
    Header = InFile.readline().rstrip('\r\n').split('\t')
    strain_index = Header.index("strain")  # Index of the strain name in metadata
    date_index = Header.index("date")  # Index of the date in metadata

    # Process the metadata and store the strain name and corresponding date
    for Line in InFile:
        if not Line.strip():
            continue  # skip blank lines instead of failing with IndexError

        ElementList = Line.rstrip('\r\n').split('\t')  # Split the line into columns

        strain_name = ElementList[strain_index]  # Get the strain name

        # Look the date up by header name rather than a hard-coded column number,
        # and replace 'XX' with '01' (presumably 'XX' marks an unknown month/day
        # -- confirm against the metadata). The fixed value is what gets stored,
        # so it is the one written into the FASTA headers.
        date = ElementList[date_index].replace('XX', '01')

        # Add to dictionary
        metadata_dict[strain_name] = date

# Make sure the output directory exists; open(..., 'w') does not create it
os.makedirs(path_to_fasta_files + "updated", exist_ok=True)

# Loop through each gene segment and process the FASTA files
for gene in SEGMENTS:
    # Define input and output FASTA file paths
    input_fasta = path_to_fasta_files + "h5nx_" + gene + ".fasta"
    output_fasta = path_to_fasta_files + "updated/h5nx_" + gene + ".fasta"  # Output file with updated strain names

    # `with` closes the output file even if parsing fails part-way through
    with open(output_fasta, 'w') as OutFile:
        # Parse each sequence in the input FASTA file
        for seq in SeqIO.parse(input_fasta, "fasta"):
            strain_name = seq.id  # Extract the strain name from the FASTA header

            # Check if strain name exists in the metadata dictionary
            if strain_name in metadata_dict:
                # Append the corresponding date to the strain name with a pipe separator
                updated_strain_name = f"{strain_name}|{metadata_dict[strain_name]}"
                seq.id = updated_strain_name  # Update the strain name in the sequence object
                # Set the description to the same text so the header is not
                # written as the new id followed by the old description
                seq.description = updated_strain_name
            else:
                # Unmatched records are reported and still written unchanged
                print(f"Strain not found in metadata: {strain_name}")

            # Write the (possibly updated) sequence to the output file
            SeqIO.write(seq, OutFile, "fasta")

    print(f"Updated FASTA file for gene {gene} has been saved as {output_fasta}.")


Updated FASTA file for gene ha has been saved as data/updated/h5nx_ha.fasta.
Updated FASTA file for gene na has been saved as data/updated/h5nx_na.fasta.
Updated FASTA file for gene mp has been saved as data/updated/h5nx_mp.fasta.
Updated FASTA file for gene pb1 has been saved as data/updated/h5nx_pb1.fasta.
Updated FASTA file for gene pa has been saved as data/updated/h5nx_pa.fasta.
Updated FASTA file for gene pb2 has been saved as data/updated/h5nx_pb2.fasta.
Updated FASTA file for gene np has been saved as data/updated/h5nx_np.fasta.
Updated FASTA file for gene ns has been saved as data/updated/h5nx_ns.fasta.


In [None]:
# #This script is meant to append the matching date from the metadata file to the end of each strain name, separated by a pipe (|)
# #Backbone written by Louise

# #Read the metadata file with updated dates
# #Write the date to the end of the strain name in each of the fasta files 


# #File paths
# InFileName = "metadata-with-megaclade-updateddates.tsv" #MEGACLADE column included in the metadata
# InFile = open(InFileName, 'r')

# SEGMENTS = ["ha","na","mp","pb1","pa","pb2","np","ns"]
# path_to_fasta_files = "data/"

# #at the end of this for loop I want to update/write into these fasta files the data appended to the strain name
# for gene in SEGMENTS: 
#     # reads in a fasta file, and iterates through every sequence in it
#     input_fasta = path_to_fasta_files + "h5nx_"+ gene + ".fasta"
#     OutFile = open(input_fasta, 'w') #opens all of the fasta files 

#     for Line in InFile: 
#     if LineNumber > 0:
#         Line = Line.strip('\n') #remove line ending characters (is this necessary?)
#         ElementList = Line.split('\t') #not necessary since tsv = tab separated value? (okay maybe necessary, keep it in)
#             # seq.id = the name, seq.seq = the sequence itself 
#             for seq in SeqIO.parse(input_fasta, "fasta"): #tells SeqIO.parse() to expect a fasta file
#                 sequence = str(seq.id)  #seq.id is an object - this converts it to a string
#                 sequence = ElementList[0]
#                 print(sequence[0])

# InFile.close() 

    
        # if sequence.endswith(gene): 
        #     InFile.write(seq.id + "\n")
        #     # print(seq.id, "gene name at end of sequence")
        #     count += 1
        # elif gene in sequence: 
        #     InFile.write(seq.id + "\n")
        #     # print(seq.id, "gene within sequence")
        #     count +=1
        # else: 
        #     pass
    # print(count)

