# Cleaning downloaded Data

Author: Alexander Maksiaev

Purpose: Clean downloaded data from GISAID

In [46]:
# Housekeeping

import os
import pandas as pd

# Note: Make sure that the files are in the same directory as this script 

metadata = pd.read_excel("H5N1-NorthAmerica-gisaid_epiflu_isolates_metadata_jan1_2024.xls")

fasta = pd.DataFrame()
headers = []
isolate_ids = []
isolate_names = []
subtypes = []
segments = []
collection_dates = []
sequences = []
with open("H5N1-NorthAmerica-gisaid_epiflu_sequences_jan1_2024.fasta") as f:
    lines = f.readlines()
    for num, line in enumerate(lines):
        if line[0] != ">": # If it's not a header
            if lines[num - 1].strip() not in headers: # And it's not a header we've seen before
                header = lines[num - 1][1:].strip()
                headers.append(header) # Remove the ">"
                isolate_ids.append(header.split("|")[0])
                isolate_names.append(header.split("|")[1])
                subtypes.append(header.split("|")[2])
                segments.append(header.split("|")[3])
                collection_dates.append(header.split("|")[4])
                sequences.append(line.strip())
    f.close()

fasta["Header"] = headers
fasta["Isolate_Id"] = isolate_ids
fasta["Isolate_Name"] = isolate_names
fasta["Subtype"] = subtypes
fasta["Segment"] = segments
fasta["Collection_Date"] = collection_dates
fasta["Sequence"] = sequences

# print(fasta.head())
print(fasta)

# Isolate_ID, Isolate_Name, Subtype, Segment, Collection_Date

                                                 Header        Isolate_Id  \
0     EPI_ISL_19689772|A/dairy_cow/USA/035992-001/20...  EPI_ISL_19689772   
1     EPI_ISL_19689772|A/dairy_cow/USA/035992-001/20...  EPI_ISL_19689772   
2     EPI_ISL_19689772|A/dairy_cow/USA/035992-001/20...  EPI_ISL_19689772   
3     EPI_ISL_19689772|A/dairy_cow/USA/035992-001/20...  EPI_ISL_19689772   
4     EPI_ISL_19689772|A/dairy_cow/USA/035992-001/20...  EPI_ISL_19689772   
...                                                 ...               ...   
9223  EPI_ISL_19289799|A/dairy_cow/USA/24_016568-001...  EPI_ISL_19289799   
9224  EPI_ISL_19289799|A/dairy_cow/USA/24_016568-001...  EPI_ISL_19289799   
9225  EPI_ISL_19289799|A/dairy_cow/USA/24_016568-001...  EPI_ISL_19289799   
9226  EPI_ISL_19289799|A/dairy_cow/USA/24_016568-001...  EPI_ISL_19289799   
9227  EPI_ISL_19289799|A/dairy_cow/USA/24_016568-001...  EPI_ISL_19289799   

                            Isolate_Name   Subtype Segment Collection_Date 

In [76]:
# Separate B3.13 and D1.1 in both the Excel file and the FASTA file

# Excel
b313_xls = metadata[metadata["Genotype"] == "B3.13"]
d11_xls = metadata[metadata["Genotype"] == "D1.1"]

# print(d11_xls)

# FASTA

b313_mask = fasta['Isolate_Id'].isin(b313_xls['Isolate_Id'])
d11_mask = fasta['Isolate_Id'].isin(d11_xls['Isolate_Id'])

b313_fasta = fasta[b313_mask]
d11_fasta = fasta[d11_mask]


In [79]:
# Separate FASTA files into 8 different files based on segment

unique_segments = list(set(fasta["Segment"]))
genotype_fastas = {"B3.13": b313_fasta, "D1.1": d11_fasta}
print(unique_segments)

# “>Isolate_name|subtype|collection_date|host_type|genotype”

segment_fastas = []
for fasta_gen in genotype_fastas.keys():
    for seg in unique_segments:
        fasta_seg = genotype_fastas[fasta_gen][genotype_fastas[fasta_gen]["Segment"] == seg]
        # Dummy variable for host_type for now
        fasta_seg["Host_Type"] = "other"
        # Rename sequences 
        new_name = ">" + fasta_seg["Isolate_Name"] + "|" + fasta_seg["Subtype"] + "|" + fasta_seg["Collection_Date"] + "|" + fasta_seg["Host_Type"] + "|" + fasta_gen + "\n"
        fasta_seg["New_Name"] = new_name
        # print(fasta_seg["New_Name"])

        # Create a dictionary to create a file
        fasta_df = fasta_seg[["New_Name", "Sequence"]]
        fasta_dict = fasta_df.to_dict()

        # print(fasta_dict[new_name.iloc[0]])
        # fasta_dict[new_name] = fasta_seg["Sequence"].iloc[0] + "\n"

        output_path = "Example_Files/" + fasta_gen + "_" + seg + ".txt"
        output_file = open(output_path, "w")
        for item in fasta_dict.keys():
            value = fasta_dict[item]
            print(value)
            output_file.write(item)
            output_file.write(value)
        output_file.close()

        

['PB2', 'PB1', 'NA', 'NP', 'NS', 'HA', 'PA', 'MP']
{6: '>A/dairy_cow/USA/035992-001/2024|A_/_H5N1|2024-01-01|other|B3.13\n', 15: '>A/chicken/USA/038428-002/2024|A_/_H5N1|2024-01-01|other|B3.13\n', 22: '>A/dairy_cow/USA/035208-001/2024|A_/_H5N1|2024-01-01|other|B3.13\n', 30: '>A/dairy_cow/USA/035991-001/2024|A_/_H5N1|2024-01-01|other|B3.13\n', 37: '>A/chicken/USA/038428-001/2024|A_/_H5N1|2024-01-01|other|B3.13\n', 46: '>A/dairy_cow/USA/035604-001/2024|A_/_H5N1|2024-01-01|other|B3.13\n', 54: '>A/dairy_cow/USA/038356-002/2024|A_/_H5N1|2024-01-01|other|B3.13\n', 62: '>A/chicken/USA/035503-001/2024|A_/_H5N1|2024-01-01|other|B3.13\n', 70: '>A/dairy_cow/USA/035990-001/2024|A_/_H5N1|2024-01-01|other|B3.13\n', 79: '>A/dairy_cow/USA/038355-001/2024|A_/_H5N1|2024-01-01|other|B3.13\n', 86: '>A/chicken/USA/035506-002/2024|A_/_H5N1|2024-01-01|other|B3.13\n', 94: '>A/dairy_cow/USA/035999-002/2024|A_/_H5N1|2024-01-01|other|B3.13\n', 102: '>A/chicken/USA/038444-003/2024|A_/_H5N1|2024-01-01|other|B3.13\

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fasta_seg["Host_Type"] = "other"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fasta_seg["New_Name"] = new_name


TypeError: write() argument must be str, not dict