In [1]:
import pandas as pd
import os
import re
import sys

module_dir = "/Users/monclalab1/Documents/scripts/"
sys.path.append(module_dir)

from fasta_editing import fasta_to_df, fasta_writer

In [2]:
# take in alignment files and change to be strain|date for treesort (using metadata)
# divergence trees other files will be made using snakemake

def aln_metadata_prep(list_of_genes, home_folder, align_path, 
                  metadata_path, output_dir):

    
    for path in [f"{output_dir}/alignments", f"{output_dir}/metadata"]:
            if not os.path.exists(path):
                os.makedirs(path)
            else:
                pass
    
    for gene in list_of_genes:
        
        metadata_file = f"{home_folder}/{metadata_path}/metadata_{gene}.tsv"
        

        align_file = f"{home_folder}/{align_path}/aligned_{gene}.fasta"
        output_align = f"{output_dir}/alignments/{gene}.fasta"
        
        # metadata file
        metadata = pd.read_csv(metadata_file, "\t")
        metadata["date"] = metadata["date"].str.replace('XX', '01')
        
        # alignment file 
        align = fasta_to_df(align_file)
        align.header = align.header.str.replace(">", "")

        merged = pd.merge(align, metadata[['strain', 'date']], left_on='header', right_on='strain', how='left')
        print(gene)
        
        merged["header"] = merged[['strain', 'date']].apply('|'.join, axis=1)
        print(merged.head())
        merged = merged[merged['strain'] != 'A/canine/China/18004/2019']
        merged["header"] = ">" + merged["header"]
        
        fasta_writer(f"{output_dir}/alignments/", f"h3nx_{gene}.fasta", merged)
                 
        metadata["strain"] = metadata[['strain', 'date']].apply('|'.join, axis=1)

        metadata.to_csv(f"{output_dir}/metadata/h3nx_{gene}.csv", index=False)

In [3]:
list_of_genes = ["ha", "pb2","pb1","na","np","pa","ns","mp"]
aln_metadata_prep(list_of_genes, "to-add-dates", "aln", "metadata", "all-data")

  metadata = pd.read_csv(metadata_file, "\t")
  metadata = pd.read_csv(metadata_file, "\t")
  metadata = pd.read_csv(metadata_file, "\t")
  metadata = pd.read_csv(metadata_file, "\t")
  metadata = pd.read_csv(metadata_file, "\t")
  metadata = pd.read_csv(metadata_file, "\t")


ha
                                  header  \
0   A/canine/Guangdong/1/2006|2006-06-01   
1   A/canine/Guangdong/1/2007|2007-04-18   
2   A/canine/Guangdong/2/2007|2007-10-18   
3   A/canine/Guangdong/2/2006|2006-08-01   
4  A/canine/Beijing/1028/2010|2010-02-02   

                                            sequence  \
0  ATGAAAACTGTTATTGCTTTAAGCTACATTTTCTGCCTGGCTTTTG...   
1  ATGAAAACTGTTATTGCTTTAAGCTACATTTTCTGCCTGGCTTTTG...   
2  ATGAAAACTGTTATTGCTTTAAGCTACATTTTCTGCCTGGCTTTTG...   
3  ATGAAAACTGTTATTGCTTTAAGCTACATTTTCTGCCTGGCTTTTG...   
4  ATGAAAACTGTTATTGCTTTAAGCTACATTTTCTGCCTGGCTTTTG...   

                       strain        date  
0   A/canine/Guangdong/1/2006  2006-06-01  
1   A/canine/Guangdong/1/2007  2007-04-18  
2   A/canine/Guangdong/2/2007  2007-10-18  
3   A/canine/Guangdong/2/2006  2006-08-01  
4  A/canine/Beijing/1028/2010  2010-02-02  
pb2
                                  header  \
0   A/canine/Guangdong/1/2006|2006-06-01   
1      A/canine/Korea/01/2007|2007-05-0

  metadata = pd.read_csv(metadata_file, "\t")
  metadata = pd.read_csv(metadata_file, "\t")


In [5]:
list_of_genes = ["ha", "pb2","pb1","na","np","pa","ns","mp"]

for gene in list_of_genes:
        
    metadata_file = f"metadata_annotated/results/metadata_annotated/h3nx_{gene}.csv"
    align_file = f"all-data/alignments/h3nx_{gene}.fasta"
    
    metadata = pd.read_csv(metadata_file)
    metadata["date"] = metadata["date"].str.replace('XX', '01')

    # alignment file 
    align = fasta_to_df(align_file)
    align.header = align.header.str.replace(">", "")
    
    
    merged = pd.merge(align, metadata[['strain', 'date', 'n_segments']], left_on='header', right_on='strain', how='left')
    
    merged = merged[merged['n_segments'] >= 8]
    
    merged["header"] = ">" + merged["header"]
    print(merged.head())

    fasta_writer(f"full_genome_data/alignments/", f"h3nx_{gene}.fasta", merged)

    metadata = metadata[metadata['n_segments'] >= 8]
    metadata.to_csv(f"full_genome_data/metadata/h3nx_{gene}.csv", index=False)
    

                                   header  \
0   >A/canine/Guangdong/1/2006|2006-06-01   
1   >A/canine/Guangdong/1/2007|2007-04-18   
2   >A/canine/Guangdong/2/2007|2007-10-18   
3   >A/canine/Guangdong/2/2006|2006-08-01   
4  >A/canine/Beijing/1028/2010|2010-02-02   

                                            sequence  \
0  ATGAAAACTGTTATTGCTTTAAGCTACATTTTCTGCCTGGCTTTTG...   
1  ATGAAAACTGTTATTGCTTTAAGCTACATTTTCTGCCTGGCTTTTG...   
2  ATGAAAACTGTTATTGCTTTAAGCTACATTTTCTGCCTGGCTTTTG...   
3  ATGAAAACTGTTATTGCTTTAAGCTACATTTTCTGCCTGGCTTTTG...   
4  ATGAAAACTGTTATTGCTTTAAGCTACATTTTCTGCCTGGCTTTTG...   

                                  strain        date  n_segments  
0   A/canine/Guangdong/1/2006|2006-06-01  2006-06-01           8  
1   A/canine/Guangdong/1/2007|2007-04-18  2007-04-18           8  
2   A/canine/Guangdong/2/2007|2007-10-18  2007-10-18           8  
3   A/canine/Guangdong/2/2006|2006-08-01  2006-08-01           8  
4  A/canine/Beijing/1028/2010|2010-02-02  2010-02-02     