In [1]:
import pandas as pd
import os
import re
import sys

module_dir = "/Users/monclalab1/Documents/scripts/"
sys.path.append(module_dir)

from fasta_editing import fasta_to_df, fasta_writer

In [5]:
# take in alignment files and change to be strain|date for treesort (using metadata)
# divergence trees other files will be made using snakemake

def aln_metadata_prep(list_of_genes, home_folder, align_path, 
                  metadata_path, output_dir):

    
    for path in [f"{output_dir}/alignments", f"{output_dir}/metadata"]:
            if not os.path.exists(path):
                os.makedirs(path)
            else:
                pass
            
    strains = pd.read_csv(f"{home_folder}/strains.tsv", "\t")
    strains_set = set(strains["strain"]) 
    
    print(strains_set)
    
    for gene in list_of_genes:
        
        metadata_file = f"{home_folder}/{metadata_path}/metadata_h3nx_{gene}.tsv"

        align_file = f"{home_folder}/{align_path}/h3nx_{gene}.fasta"
        output_align = f"{output_dir}/alignments/h3nx_{gene}.fasta"
        
        # metadata file
        metadata = pd.read_csv(metadata_file, "\t")
        metadata["date"] = metadata["date"].str.replace('XX', '01')
        
        # alignment file 
        align = fasta_to_df(align_file)
        align.header = align.header.str.replace(">", "")
        
        align = align[align["header"].isin(strains_set)]

        merged = pd.merge(align, metadata[['strain', 'date']], left_on='header', right_on='strain', how='left')
        
        merged["header"] = merged[['strain', 'date']].apply('|'.join, axis=1)
        merged["header"] = ">" + merged["header"]
        
        fasta_writer(f"{output_dir}/alignments/", f"h3nx_{gene}.fasta", merged)
                 
        metadata["strain"] = metadata[['strain', 'date']].apply('|'.join, axis=1)

        metadata.to_csv(f"{output_dir}/metadata/h3nx_{gene}.csv", index=False)

In [6]:
list_of_genes = ["ha", "pb2","pb1","na","np","pa","ns","mp"]
aln_metadata_prep(list_of_genes, "preprepped", "aln", "meta", ".")

  strains = pd.read_csv(f"{home_folder}/strains.tsv", "\t")
  metadata = pd.read_csv(metadata_file, "\t")
  metadata = pd.read_csv(metadata_file, "\t")


{'A/swine/Warendorf/6727/2007', 'A/swine/Indiana/21TOSU1301/2021', 'A/swine/Hong_Kong/NS2600/2014', 'A/swine/Minnesota/MT_12_07_518/2012', 'A/swine/Heilongjiang/1/05', 'A/swine/Iowa/A01432395/2013', 'A/swine/Thailand/PB483/2009', 'A/swine/Borken/21121/2015', 'A/swine/Indiana/18TOSU2186/2018', 'A/swine/Manitoba/01179/2006', 'A/swine/Chachoengsao/NIAH117865-035/2017', 'A/swine/Mexico/9783514/2013', 'A/swine/Italy/240849/2015', 'A/swine/North_Carolina/A02142700/2018', 'A/swine/Manitoba/D0255/2013', 'A/swine/Kagoshima/37-7194/2019', 'A/swine/Manitoba/G5/2014', 'A/swine/Louisiana/A02525266/2021', 'A/swine/Hanoi/8_745/2017', 'A/swine/North_Carolina/01216/2006', 'A/swine/Holtrup/6358/2007', 'A/swine/Lohne/17104/2013', 'A/swine/Ontario/G10/2014', 'A/swine/Minnesota/MT_13_01_S88/2013', 'A/swine/China/ZQ82/2018', 'A/swine/British_Columbia/28103/2005', 'A/swine/Manitoba/D0354/2014', 'A/swine/Stresow/7919a/2008', 'A/swine/Bad_Wuennenberg/7188/2008', 'A/swine/Manitoba/D0459/2016', 'A/swine/Manitoba

  metadata = pd.read_csv(metadata_file, "\t")
  metadata = pd.read_csv(metadata_file, "\t")
  metadata = pd.read_csv(metadata_file, "\t")
  metadata = pd.read_csv(metadata_file, "\t")
  metadata = pd.read_csv(metadata_file, "\t")
  metadata = pd.read_csv(metadata_file, "\t")
