# Cleaning Downloaded Data

Author: Alexander Maksiaev

Purpose: Clean downloaded data from GISAID

In [None]:
# Housekeeping

import os
import glob 
import pandas as pd

# Root folder of the project on the author's machine (note the trailing "/").
home = "C:/Users/maksi/Documents/Statistics/Projects/Avian_Flu_Files/"
# Folder holding the GISAID downloads; it is walked with os.walk further down.
downloads = "C:/Users/maksi/Documents/Statistics/Projects/Avian_Flu_Files/H5N1-NorthAmerican-gisaid_epiflu_datasets_jan1-2024_to_mar18-2025_v2/"
# downloads = home + "/GISAID_Files"
# Prefix that the output FASTA paths are built from.
# NOTE(review): home already ends with "/", so this value contains "//" -- confirm that is intended.
fasta_files = home + "/Fasta_Files/"

# Make the downloads folder the current working directory.
os.chdir(downloads)

# Function to convert fasta file to dataframe 
def fasta_df(file_name):
    """Parse a FASTA file into a DataFrame with one row per record.

    Every header line (starting with ">") is split on "|" and its first five
    fields are stored as Isolate_Id, Isolate_Name, Subtype, Segment and
    Collection_Date. The full header (without the ">") and the sequence are
    stored alongside them.

    A sequence may be wrapped over several lines; the pieces are joined.
    Blank lines, text that appears before the first header, and headers that
    are not followed by any sequence are skipped. Records are never
    de-duplicated: the duplicate-header check in the earlier version of this
    function could not match (it compared a ">"-prefixed line with stored
    headers that had the ">" removed), so every record was kept, and that
    behaviour is preserved here.

    Args:
        file_name: Path of the FASTA file to read.

    Returns:
        pandas.DataFrame with the columns Header, Isolate_Id, Isolate_Name,
        Subtype, Segment, Collection_Date and Sequence, in file order.

    Raises:
        IndexError: If a header has fewer than five "|"-separated fields.
    """
    columns = [
        "Header",
        "Isolate_Id",
        "Isolate_Name",
        "Subtype",
        "Segment",
        "Collection_Date",
        "Sequence",
    ]

    rows = []
    header = None  # header of the record currently being read
    chunks = []    # sequence lines collected for that record

    with open(file_name) as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                continue
            if line.startswith(">"):
                # A new header closes the previous record, if it had a sequence.
                if header is not None and chunks:
                    rows.append(_fasta_record(header, chunks))
                header = line[1:].strip()  # Remove the ">"
                chunks = []
            elif header is not None:
                chunks.append(line)

    # The last record has no following header to close it.
    if header is not None and chunks:
        rows.append(_fasta_record(header, chunks))

    return pd.DataFrame(rows, columns=columns)


def _fasta_record(header, chunks):
    """Build one output row from a header (no ">") and its sequence lines."""
    # Per the original author: no samples were collected on January 1st, so
    # these dates mean "only the year is known" and are reduced to the year.
    year_only = {"2024-01-01": "2024", "2025-01-01": "2025"}

    fields = header.split("|")
    collection_date = fields[4]
    return (
        header,
        fields[0],
        fields[1],  # the host animal is extracted from this name later
        fields[2],
        fields[3],
        year_only.get(collection_date, collection_date),
        "".join(chunks),
    )

# Function to get each unique animal listed so we can sort them

def sort_animals(fasta):
    """Return the distinct animals named in the Isolate_Name column.

    The animal is the second "/"-separated field of each isolate name
    (for example "chicken" in "A/chicken/Iowa/1/2024").

    Args:
        fasta: DataFrame with an "Isolate_Name" column of strings.

    Returns:
        A list with each animal once. It is built from a set, so the order
        is not defined.
    """
    distinct_animals = {
        isolate_name.split("/")[1]
        for isolate_name in fasta["Isolate_Name"].values
    }

    # The list is handed back rather than written to disk here; assigning
    # the animals to host categories is still a manual step.
    return list(distinct_animals)

# Separate B3.13 and D1.1 in both the Excel file and the FASTA file

def separate_b313_d11_xls(metadata_raw, fasta):
    """Split the FASTA rows into the B3.13 and the D1.1 genotype.

    A FASTA row belongs to a genotype when its Isolate_Id appears in a
    metadata row whose Genotype column equals that genotype.

    Args:
        metadata_raw: DataFrame with "Genotype" and "Isolate_Id" columns.
        fasta: DataFrame with an "Isolate_Id" column.

    Returns:
        Tuple (b313_fasta, d11_fasta) of filtered views of ``fasta``.
    """
    def rows_for(genotype):
        # Isolate ids that the metadata sheet assigns to this genotype
        wanted_ids = metadata_raw[metadata_raw["Genotype"] == genotype]["Isolate_Id"]
        return fasta[fasta["Isolate_Id"].isin(wanted_ids)]

    return rows_for("B3.13"), rows_for("D1.1")

# Fix animals in host type

def fix_animals(fasta, animals_ref):
    """Add a "Host_Type" column to ``fasta``, in place.

    The animal of each row is the second "/"-separated field of its
    Isolate_Name. It is labelled with the name of the first reference column
    that lists it, checked in the order avian, cattle, feline, other_mammal.
    Animals listed in none of these columns are labelled "other".

    Args:
        fasta: DataFrame with an "Isolate_Name" column; it is modified.
        animals_ref: DataFrame with the columns "avian", "cattle", "feline"
            and "other_mammal", each listing animal names.

    Returns:
        None. The result is the new "Host_Type" column on ``fasta``.

    Raises:
        KeyError: If ``animals_ref`` lacks one of the four columns.
        IndexError: If an isolate name contains no "/".
    """
    # NOTE(review): the reference template written at the end of this notebook
    # also has a "human" column, which is not consulted here -- confirm that
    # labelling human hosts as "other" is intended.
    host_columns = ("avian", "cattle", "feline", "other_mammal")

    # Build the animal -> host type lookup once instead of scanning the
    # reference columns for every row. setdefault keeps the first column that
    # lists an animal, which matches the priority of the if/elif chain this
    # replaces.
    host_type_of = {}
    for column in host_columns:
        for animal in animals_ref[column].values:
            host_type_of.setdefault(animal, column)

    fasta["Host_Type"] = [
        host_type_of.get(name.split("/")[1], "other")
        for name in fasta["Isolate_Name"].values
    ]

# Separate FASTA files into 8 different files based on segment

def separate_fasta_by_seg(metadata, fasta, animals_df):
    """Split one FASTA DataFrame by genotype (B3.13, D1.1) and by segment.

    Host types are added to ``fasta`` first by calling fix_animals, which
    writes a "Host_Type" column onto ``fasta`` in place. Every returned frame
    also gets a "Genotype" column and a "New_Name" column holding the new
    FASTA header line:

        >Isolate_Name|Subtype|Collection_Date|Host_Type|Genotype

    followed by a newline.

    Args:
        metadata: DataFrame with "Genotype" and "Isolate_Id" columns.
        fasta: DataFrame with the columns Isolate_Id, Isolate_Name, Subtype,
            Segment and Collection_Date.
        animals_df: Host-type reference table, passed on to fix_animals.

    Returns:
        Tuple (segment_fastas, unique_segments). ``segment_fastas`` holds one
        DataFrame per genotype/segment pair: all segments for B3.13 first,
        then all segments for D1.1. A frame may be empty. ``unique_segments``
        lists the distinct segments of ``fasta`` in order of first appearance.
    """
    fix_animals(fasta, animals_df)

    # First-appearance order, so the result is the same on every run.
    unique_segments = fasta["Segment"].unique().tolist()
    genotypes = ["B3.13", "D1.1"]

    segment_fastas = []
    for genotype in genotypes:
        # The genotype filter does not depend on the segment: compute it once.
        genotype_ids = metadata[metadata["Genotype"] == genotype]["Isolate_Id"]
        genotype_rows = fasta[fasta["Isolate_Id"].isin(genotype_ids)]

        for seg in unique_segments:
            # Copy before adding columns so the assignments below act on an
            # independent frame rather than on a slice of ``fasta``.
            fasta_seg = genotype_rows[genotype_rows["Segment"] == seg].copy()

            fasta_seg["Genotype"] = genotype

            # Rename sequences
            fasta_seg["New_Name"] = (
                ">" + fasta_seg["Isolate_Name"]
                + "|" + fasta_seg["Subtype"]
                + "|" + fasta_seg["Collection_Date"]
                + "|" + fasta_seg["Host_Type"]
                + "|" + genotype + "\n"
            )

            segment_fastas.append(fasta_seg)

    return segment_fastas, unique_segments


            

FileNotFoundError: [WinError 2] The system cannot find the file specified: 'C:/Users/maksi/Documents/Statistics/Projects/H5N1-NorthAmerican-gisaid_epiflu_datasets_jan1-2024_to_mar18-2025_v2/'

In [None]:
# Run through all files: load every metadata sheet and every FASTA file found
# under `downloads`, then split each FASTA by genotype and segment.

all_metadata_files = []
all_fasta_files = []

for dirpath, dirs, files in os.walk(downloads):
    for file in files:
        file_name = os.path.join(dirpath, file)
        # Test the end of the file's own name, not a substring of the whole
        # path, so a directory whose name happens to contain ".xls" or
        # ".fasta" cannot pull in unrelated files.
        lowered = file.lower()
        if lowered.endswith(".xls"):
            metadata = pd.read_excel(file_name, engine="xlrd")
            all_metadata_files.append(metadata)
        elif lowered.endswith(".fasta"):
            fasta_file = fasta_df(file_name)
            all_fasta_files.append(fasta_file)

unique_animals_all = []

os.chdir(home)

# The host-type reference table is the same for every file, so read it once
# (it is looked up relative to `home`, the working directory set just above).
animals_ref = pd.read_csv("animals_ref1.csv")

segment_fastas = []
unique_segments = []
# NOTE(review): metadata sheets and FASTA files are paired by the order in
# which os.walk found them -- confirm every download folder holds exactly one
# of each, otherwise a FASTA can be matched with the wrong sheet.
for i, fasta in enumerate(all_fasta_files):
    metadata = all_metadata_files[i]

    unique_animals = sort_animals(fasta)
    unique_animals_all.append(unique_animals)

    # unique_segments is overwritten each time, so after the loop it holds
    # the segments of the last file processed.
    fastas, unique_segments = separate_fasta_by_seg(metadata, fasta, animals_ref)

    segment_fastas.append(fastas)

# Flatten unique_animals_all (an animal can appear once per file)
every_unique_animal = [
    animal
    for file_animals in unique_animals_all
    for animal in file_animals
]



FileNotFoundError: [Errno 2] No such file or directory: 'animals_ref1.csv'

In [None]:
# Number of batches in segment_fastas (one list is appended per FASTA file)
print(len(segment_fastas))

# All animal names gathered across the files; a name can repeat here because
# each file contributes its own list of unique animals
print(every_unique_animal)

7
['geoffroys_cat', 'pheasant', 'owl', 'hawk', 'cat', 'swine', 'guineafowl', 'lion', 'turkey', 'mountain_lion', 'environment', 'ibis', 'savannah_cat', 'fox', 'lynx', 'goose', 'sanderling', 'chicken', 'serval', 'common_grackle', 'goat', 'tiger', 'western_gull', 'bobcat', 'duck', 'dairy_cow', 'pigeon', 'bottlenose_dolphin', 'ermine', 'peafowl', 'mute_swan', 'cackling_goose', 'barn_owl', 'pheasant', 'hawk', 'wood_duck', 'crane', 'cat', 'black-crowned_night-heron', 'guineafowl', 'american_crow', 'flamingo', 'hooded_merganser', 'trumpeter_swan', 'turkey', 'canada_goose', 'great_horned_owl', 'snow_goose', 'crested_caracara', 'environment', 'cougar', 'quail', 'common_raven', 'merganser', 'green-winged_teal', 'sandhill_crane', 'ruddy_duck', 'black_vulture', 'red-breasted_merganser', 'tundra_swan', 'mallard', 'bald_eagle', 'goose', 'great_blue_heron', 'red-tailed_hawk', 'burrowing_owl', 'red-shouldered_hawk', 'chicken', 'harbor_seal', 'gull', 'falcon', 'serval', 'turkey_vulture', 'american_wige

In [None]:
# Merge all batches and write one FASTA file per genotype/segment pair
# (16 files are expected: 8 segments x 2 genotypes)

# Gather every non-empty frame from every batch and concatenate once, instead
# of growing huge_fasta with a new pd.concat call per frame.
frames = [f for fastas in segment_fastas for f in fastas if not f.empty]
huge_fasta = pd.concat(frames) if frames else pd.DataFrame()

# Now separate huge_fasta into one frame per genotype/segment pair
big_fastas = []

genotypes = ["B3.13", "D1.1"]
if not huge_fasta.empty:
    for gen in genotypes:
        big_fasta = huge_fasta[huge_fasta["Genotype"] == gen]
        # Take the segments from the merged rows of this genotype, so every
        # segment seen in any batch is covered and no subset is empty.
        for seg in big_fasta["Segment"].unique():
            seg_specific_fasta = big_fasta[big_fasta["Segment"] == seg]
            big_fastas.append(seg_specific_fasta)

# Now that we have the per-pair frames, write the files
for fasta in big_fastas:

    # Map header line -> sequence; a header that occurs more than once keeps
    # its last sequence. (This no longer rebinds the name fasta_df, which is
    # the FASTA parser function defined at the top of the notebook.)
    fasta_dict = dict(zip(fasta["New_Name"], fasta["Sequence"]))

    # Genotype and Segment are the same for every row of this frame
    output_path = fasta_files + fasta["Genotype"].values[0] + "_" + fasta["Segment"].values[0] + ".fasta"
    with open(output_path, "w") as output_file:
        for new_name, sequence in fasta_dict.items():
            output_file.write(new_name)  # already ends with a newline
            output_file.write(sequence + "\n")

In [None]:
# Rename host type: write a reference template with one column per host type

# Sorted so the rows of the CSV come out in the same order on every run
# (iterating a set of strings gives no stable order between sessions).
unique_animals_set = sorted(set(every_unique_animal))
animals_df = pd.DataFrame(columns=["avian", "cattle", "feline", "other_mammal", "human", "other"])
# Every animal starts in "other"; presumably they are then moved into the
# right column by hand.
animals_df["other"] = unique_animals_set

os.chdir(home)
animals_df.to_csv("animals_ref2.csv")