In [2]:
import re
import os
import pandas as pd
from Bio import SeqIO
from datetime import datetime

In [3]:
# This script is designed to filter a metadata file using just the headers in a FASTA

In [4]:
#Function: Read in the FASTA as a dataframe
def fasta_reader(path_to_fasta, output_name=None):
    """Parse a FASTA file into a DataFrame with one row per record.

    Parameters
    ----------
    path_to_fasta : str or path-like
        Location of the FASTA file to read.
    output_name : str, optional
        When given, the resulting DataFrame is also bound to this name in the
        module's global namespace (the historical way of getting the result
        out of this function). Omit it to rely on the return value only.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``header`` (the whole header line, whitespace-stripped
        and still carrying its leading ">") and ``sequence`` (every sequence
        line of the record, stripped and concatenated). Input without any
        header line yields an empty frame that still has both columns.
    """
    records = []
    header = None          # None until the first ">" line has been seen
    sequence_parts = []    # collected per record and joined once

    with open(path_to_fasta) as handle:
        for line in handle:
            if line.startswith(">"):
                # A new header closes the record that was being collected.
                if header is not None:
                    records.append({"header": header,
                                    "sequence": "".join(sequence_parts)})
                header = line.strip()
                sequence_parts = []
            elif header is not None:
                # Text before the first header belongs to no record; skip it.
                sequence_parts.append(line.strip())

    # Flush the final record, but only if the file actually contained one.
    if header is not None:
        records.append({"header": header, "sequence": "".join(sequence_parts)})

    # Explicit columns keep the schema stable even when nothing was parsed.
    fasta_df = pd.DataFrame(records, columns=["header", "sequence"])

    if output_name is not None:
        globals()[output_name] = fasta_df

    return fasta_df

In [5]:
# Inputs from earlier runs, kept for reference; only the last (uncommented)
# call below is executed.
# fasta_reader("./Nextstrain/11182024_NAm_aves-wild_no-bvbrc-SRA_contphylo_test/Nextstrain_results/aligned_ha.fasta", 
#              "ha_alignment")

# fasta_reader("./Nextstrain/12022024_NAm_aves-wild_contphylo/Nextstrain_results/aligned_ha.fasta", 
#              "ha_alignment")

# fasta_reader("./Nextstrain/01072025_NAm_aves-wild_division2022/Nextstrain_results/aligned_ha.fasta", 
#              "ha_alignment")

# fasta_reader("./BEAST/continuous_phylogeography/submission_files/2025-02-25_2022-2024_ha_Nextstrain-tree/aligned_ha.fasta",
#             "ha_alignment")

# fasta_reader binds the parsed frame to the global name passed as its second
# argument, so this call is what defines `ha_alignment` for the cells below.
# NOTE(review): absolute, machine-specific path; the notebook will not run
# elsewhere without editing it.
fasta_reader("/Users/claramal/Desktop/Moncla_Lab/BEAST_tests/thorney-tree-model_TEST/test_5/H5N1 Thorney Beast/2022-2024_test_contphyl/aligned_ha_timetree.fasta",
            "ha_alignment")

In [6]:
# Preview the first parsed FASTA records (columns: header, sequence)
ha_alignment.head()

Unnamed: 0,header,sequence
0,>a/americanwigeon/southcarolina/22001743011/20...,ATGGAGAACATAGTACTACTTCTTGCAATAGTTAGCCTTGTTAAAA...
1,>a/baldeagle/newyork/22013006001/2022|2022-04-...,ATGGAGAACATAGTACTACTTCTTGCAATAGTTAGCCTTGTTAAAA...
2,>a/baldeagle/northcarolina/22006530002/2022|20...,ATGGAGAACATAGTACTACTTCTTGCAATAGTTAGCCTTGTTAAAA...
3,>a/greenwingedteal/northcarolina/22004372001/2...,ATGGAGAACATAGTACTACTTCTTGCAATAGTTAGCCTTGTTAAAA...
4,>a/mallard/newjersey/22005110001/2022|2022-02-...,ATGGAGAACATAGTACTACTTCTTGCAATAGTTAGCCTTGTTAAAA...


In [7]:
# Function: strip the ">" marker from the FASTA header to get the strain name

def header_to_strain(df):
    """Add a ``strain`` column holding each FASTA header without its ">" marker.

    Only the single leading ">" is removed; any ">" occurring later in the
    header text is preserved so the strain name is not silently altered.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string column named ``header``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; the ``strain`` column is added
        (or overwritten) in place.
    """
    # Anchor to the start of the string so only the record marker is dropped.
    df["strain"] = df["header"].str.replace(r"^>", "", regex=True)

    return df

In [8]:
# Strip the ">" marker from each header to get the strain name.
# header_to_strain adds the "strain" column to the frame it is given and
# returns that same frame, so ha_alignment_strain is another name for
# ha_alignment rather than a copy.

ha_alignment_strain = header_to_strain(ha_alignment)

ha_alignment_strain.head()

Unnamed: 0,header,sequence,strain
0,>a/americanwigeon/southcarolina/22001743011/20...,ATGGAGAACATAGTACTACTTCTTGCAATAGTTAGCCTTGTTAAAA...,a/americanwigeon/southcarolina/22001743011/202...
1,>a/baldeagle/newyork/22013006001/2022|2022-04-...,ATGGAGAACATAGTACTACTTCTTGCAATAGTTAGCCTTGTTAAAA...,a/baldeagle/newyork/22013006001/2022|2022-04-2...
2,>a/baldeagle/northcarolina/22006530002/2022|20...,ATGGAGAACATAGTACTACTTCTTGCAATAGTTAGCCTTGTTAAAA...,a/baldeagle/northcarolina/22006530002/2022|202...
3,>a/greenwingedteal/northcarolina/22004372001/2...,ATGGAGAACATAGTACTACTTCTTGCAATAGTTAGCCTTGTTAAAA...,a/greenwingedteal/northcarolina/22004372001/20...
4,>a/mallard/newjersey/22005110001/2022|2022-02-...,ATGGAGAACATAGTACTACTTCTTGCAATAGTTAGCCTTGTTAAAA...,a/mallard/newjersey/22005110001/2022|2022-02-0...


In [9]:
# Number of records read from the FASTA (one row per header line)
print(len(ha_alignment))

4774


In [10]:
# Load the tab-separated metadata table that will be filtered down to the
# strains present in the FASTA. Commented-out FILE lines are inputs from
# earlier runs.

#FILE = "./metadata/sequence_metadata/NCBI+BVBRC+GIS_ha_latlong_subset.tsv"
# FILE = "./BEAST/continuous_phylogeography/submission_files/2025-02-25_2022-2024_ha_Nextstrain-tree/NCBI-BVBRC-GIS_ha_detectionsgeodata.tsv"
# Relative path: resolved against the kernel's current working directory.
FILE = "./metadata/sequence_metadata/old_download_2025-01-21/NCBI-BVBRC-GIS_ha_detectionsgeodata.tsv"

taxa_lat_long = pd.read_csv(FILE, sep = "\t")

In [11]:
def parse_metadata(metadata, fasta_df_to_parse):
    """Keep only the metadata rows whose strain occurs in the FASTA frame.

    Both frames are matched on their ``strain`` column. Row order, index
    labels and columns of ``metadata`` are carried over unchanged for the
    rows that are kept.
    """
    # Distinct strain names present in the FASTA-derived frame.
    wanted_strains = set(fasta_df_to_parse["strain"])

    # True for every metadata row whose 'strain' is one of the wanted names.
    keep_row = metadata["strain"].isin(wanted_strains)

    return metadata.loc[keep_row]

In [12]:
# Preview the metadata table before it is filtered
taxa_lat_long.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,strain_segment,Genome ID,SRA Accession,strain,source_merge2,bvbrc_assembled,strain_segment_updated,genbank_acc,...,year_only,month_only,county,city,county_lat-long,city_lat-long,lat_long_to_use,lat_long_source,lat,long
0,2,19,a/americanblackduck/connecticut/22005000001/20...,11320.455519,,a/americanblackduck/connecticut/22005000001/20...,ncbi-bvbrc,no,a/americanblackduck/connecticut/22005000001/20...,OQ957612,...,2022.0,2.0,new_london,unknown,"41.47, -72.1",unknown,"41.47,-72.1",county,41.47,-72.1
1,3,27,a/americanblackduck/connecticut/22005313006/20...,11320.455526,,a/americanblackduck/connecticut/22005313006/20...,ncbi-bvbrc,no,a/americanblackduck/connecticut/22005313006/20...,OQ957620,...,2022.0,2.0,new_haven,unknown,"41.35, -72.9",unknown,"41.35,-72.9",county,41.35,-72.9
2,4,35,a/americanblackduck/connecticut/22005313007/20...,11320.455535,,a/americanblackduck/connecticut/22005313007/20...,ncbi-bvbrc,no,a/americanblackduck/connecticut/22005313007/20...,OQ957628,...,2022.0,2.0,new_haven,unknown,"41.35, -72.9",unknown,"41.35,-72.9",county,41.35,-72.9
3,5,43,a/americanblackduck/connecticut/22005313008/20...,11320.455544,,a/americanblackduck/connecticut/22005313008/20...,ncbi-bvbrc,no,a/americanblackduck/connecticut/22005313008/20...,OQ957636,...,2022.0,2.0,new_haven,unknown,"41.35, -72.9",unknown,"41.35,-72.9",county,41.35,-72.9
4,6,83,a/americanblackduck/indiana/23034359024origina...,,,a/americanblackduck/indiana/23034359024/2023|2...,gisaid,no,a/americanblackduck/indiana/23034359024/2023_4,,...,2023.0,11.0,starke,unknown,"41.284478, -86.644636",unknown,"41.284478,-86.644636",county,41.284478,-86.644636


In [13]:
# Keep only the metadata rows whose "strain" value appears in the FASTA frame
parsed_taxa_lat_long = parse_metadata(taxa_lat_long, ha_alignment_strain)

In [14]:
# Row count after filtering; compare with len(ha_alignment) printed earlier.
# NOTE(review): equal counts do not by themselves prove a one-to-one match
# (duplicate or missing strains could offset each other) - confirm if needed.
print(len(parsed_taxa_lat_long))

4774


In [15]:
# Output locations from earlier runs, kept for reference; only the last
# (uncommented) call below is executed.
# parsed_taxa_lat_long.to_csv("./BEAST/continuous_phylogeography/submission_files/11262024_test/ha_taxa_lat-long.tsv", 
#                             sep = "\t")

# parsed_taxa_lat_long.to_csv("./BEAST/continuous_phylogeography/submission_files/12032024_weeklyskygrid//ha_taxa_lat-long.tsv", 
#                             sep = "\t")

# parsed_taxa_lat_long.to_csv("./BEAST/continuous_phylogeography/submission_files/01082025_2022_skygrid/2022_ha_lat-long.tsv", 
#                             sep = "\t")

# parsed_taxa_lat_long.to_csv("./BEAST/continuous_phylogeography/submission_files/2025-02-25_2022-2024_ha_Nextstrain-tree/2022-2024_ha_latlongs.tsv", 
#                             sep = "\t")

# Write the filtered metadata as a tab-separated file.
# NOTE(review): to_csv writes the row index as an unnamed first column by
# default; the "Unnamed: 0*" columns visible in the metadata preview look like
# the result of repeated write/read round-trips. Consider index=False once it
# is confirmed that nothing downstream relies on that column.
# NOTE(review): absolute, machine-specific output path.
parsed_taxa_lat_long.to_csv("/Users/claramal/Desktop/Moncla_Lab/BEAST_tests/thorney-tree-model_TEST/test_5/H5N1 Thorney Beast/2022-2024_test_contphyl/2022-2024_ha_latlongs.tsv", 
                            sep = "\t")