# Cleaning downloaded Data from avian-flu GitHub

Author: Alexander Maksiaev

Purpose: Clean data from avian-flu GitHub

In [1]:
# Housekeeping

import os
import glob 
import pandas as pd
import requests 
from bs4 import BeautifulSoup
import importlib
import utils  
importlib.reload(utils)
from utils import * 

home = "C:/Users/maksi/Documents/Statistics/Projects/Avian_Flu_Files/"
downloads = "C:/Users/maksi/Documents/Statistics/Projects/Avian_Flu_Files/avian-influenza_Files/avian-influenza/"
# downloads = home + "/GISAID_Files"
temp_files = home + "avian-influenza_Temp_Files/"
complete_files = home + "avian-influenza_Complete_Files/"

os.chdir(downloads)

In [2]:
# Read metadata

metadata_folder = downloads + "metadata/"
os.chdir(metadata_folder)

metadata = pd.read_csv("SraRunTable_automated.csv")

# Naming convention
# A/[host]/[geo_loc_name]/[isolate]/[year]/A_/_H5N1/[year]/[host_type]/[genotype: B3.13 or D1.1]
# host_type is from manual animal reference
# In metadata, we have: host, geo_loc_name, isolate, year
# We need: host_type, genotype





In [3]:
# Read FASTA files

fasta_folder = downloads + "fasta/"

# Find only >= 2024 using run ID from metadata
metadata["Collection_Date"] = pd.to_numeric(metadata["Collection_Date"], errors='coerce')
metadata_new = metadata[metadata["Collection_Date"] >= 2024]
metadata_new["Collection_Date"] = metadata_new["Collection_Date"].astype(int) # Years are not floats

# print(metadata_new)


# Create dictionary so we can make fasta files with 8 segments
file_all_segs = {}


# print(file_all_segs)
     

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metadata_new["Collection_Date"] = metadata_new["Collection_Date"].astype(int) # Years are not floats


In [None]:
# Now, use run ID to find only those fasta files

for dirpath, dirs, files in os.walk(fasta_folder):
    for n, file in enumerate(files):
        file_name = os.path.join(dirpath, file)
        run = file_name.split("_")[-3].split("/")[-1]
        if run in metadata_new["Run"].values: # Find by Run
            # print(file_name)
            # break 

            # Make dataframe holding 8 segments per 1 run
            with open(file_name) as f: # Open fasta file
                lines = f.readlines()
                if run not in file_all_segs.keys(): # If we haven't seen this run yet
                    file_all_segs[run] = []
                file_all_segs[run].append(lines) # append lines
                # file_all_segs[run].append(file_name.split("_")[-2]) # append segment
                
                print(n, " ", file_name)

print(file_all_segs["SRR32654216"]) # Each value is a list of list of lines 

10520   C:/Users/maksi/Documents/Statistics/Projects/Avian_Flu_Files/avian-influenza_Files/avian-influenza/fasta/SRR28752446_HA_cns.fa
10521   C:/Users/maksi/Documents/Statistics/Projects/Avian_Flu_Files/avian-influenza_Files/avian-influenza/fasta/SRR28752446_MP_cns.fa
10522   C:/Users/maksi/Documents/Statistics/Projects/Avian_Flu_Files/avian-influenza_Files/avian-influenza/fasta/SRR28752446_NA_cns.fa
10523   C:/Users/maksi/Documents/Statistics/Projects/Avian_Flu_Files/avian-influenza_Files/avian-influenza/fasta/SRR28752446_NP_cns.fa
10524   C:/Users/maksi/Documents/Statistics/Projects/Avian_Flu_Files/avian-influenza_Files/avian-influenza/fasta/SRR28752446_NS_cns.fa
10525   C:/Users/maksi/Documents/Statistics/Projects/Avian_Flu_Files/avian-influenza_Files/avian-influenza/fasta/SRR28752446_PA_cns.fa
10526   C:/Users/maksi/Documents/Statistics/Projects/Avian_Flu_Files/avian-influenza_Files/avian-influenza/fasta/SRR28752446_PB1_cns.fa
10527   C:/Users/maksi/Documents/Statistics/Projects/A

In [13]:
# Create temporary files for GenoFlu
for run in file_all_segs.keys():

    output_path = temp_files + run + ".fasta" # Genotype and Segment should all be the same
    output_file = open(output_path, "w")

    for lines in file_all_segs[run]:
        # print(lines)
        for line in lines:

        # Remove newline and then add it back in again...
        # line = line[:-2]
            output_file.write(line)
    
    output_file.close()