In [27]:
import pandas as pd
import os

In [28]:
# Convert a FASTA file into a DataFrame with "header" and "sequence" columns.
def fasta_to_df(fasta_file):
    """Parse a FASTA file into a DataFrame with one row per record.

    Parameters
    ----------
    fasta_file : str or path-like
        Path of the FASTA file to read.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``header`` (the full header line, leading ``>``
        included, surrounding whitespace stripped) and ``sequence`` (all
        sequence lines of the record, each stripped and joined together).
        A file that contains no header line yields an empty frame that
        still has both columns.
    """
    records = []
    header = None  # stays None until the first ">" line is seen
    chunks = []    # stripped sequence lines of the record being read

    with open(fasta_file) as f:
        for line in f:
            if line.startswith(">"):
                # A new header closes the previous record, if there was one.
                if header is not None:
                    records.append({"header": header, "sequence": "".join(chunks)})
                header = line.strip()
                chunks = []
            elif header is not None:
                # Text before the first header belongs to no record and is ignored.
                chunks.append(line.strip())

    # Flush the last record; skipped when the file had no header at all,
    # so an empty file no longer produces a bogus empty-header row.
    if header is not None:
        records.append({"header": header, "sequence": "".join(chunks)})

    return pd.DataFrame(records, columns=["header", "sequence"])

In [29]:
def fasta_writer(path, filename, df):
    """Write the rows of ``df`` to ``filename`` inside directory ``path``.

    Parameters
    ----------
    path : str
        Output directory. It is created, together with any missing parent
        directories, when it does not exist. A trailing separator is
        optional. An empty string means the current working directory.
    filename : str
        Name of the file to create (overwritten if it already exists).
    df : pandas.DataFrame
        Must provide ``header`` and ``sequence`` columns. Each row is
        written as the header value on one line followed by the sequence
        value on the next; the header is written verbatim.

    Raises
    ------
    OSError
        If the directory cannot be created or the file cannot be opened.
    """
    # exist_ok=True tolerates an already-existing directory while still
    # surfacing real failures (e.g. permission errors) instead of hiding them.
    if path:
        os.makedirs(path, exist_ok=True)

    # os.path.join inserts the separator only when ``path`` lacks one.
    with open(os.path.join(path, filename), "w") as f:
        for header, sequence in zip(df["header"], df["sequence"]):
            f.write(f"{header}\n{sequence}\n")

In [35]:
# Gene identifiers; each one is substituted into the per-gene file names.
list_of_genes = [
    "ha",
    "pb1",
    "pb2",
    "pa",
    "mp",
    "np",
    "na",
    "ns",
]

# Host group identifiers; each one names a leaves sub-directory and an
# output directory.
hosts = [
    "allSwine", "swineEuro", "swineNA",
    "allAvian", "avianNA", "avianEurasia",
    "canineH3N8",
    "equine",
    "human",
    "canineH3N2",
]

In [33]:
def alignments(hosts: list, list_of_genes: list):
    """Write one filtered alignment per (host, gene) pair.

    For every gene, the records of
    ``./alignments/pruned_alignments_h3nx_{gene}.fasta`` are parsed with
    ``fasta_to_df``. For every host, the strains listed in
    ``./alignments/leaves/{host}/{host}_leaves_{gene}.tsv`` select the
    records whose header (leading ``>`` removed) matches, and the selection
    is written to ``./{host}/aligned_h3nx_{gene}.fasta`` via ``fasta_writer``.
    The host name and the shape of each selection are printed.

    Parameters
    ----------
    hosts : list
        Host identifiers, iterated in the outer loop.
    list_of_genes : list
        Gene identifiers, iterated in the inner loop.
    """
    # The alignment of a gene does not depend on the host, so parse it once
    # on first use and reuse it, instead of re-reading the FASTA file for
    # every host. Maps gene -> (alignment frame, headers without ">").
    parsed = {}

    for host in hosts:

        for gene in list_of_genes:

            if gene not in parsed:
                align = fasta_to_df(f"./alignments/pruned_alignments_h3nx_{gene}.fasta")
                parsed[gene] = (align, align['header'].str.lstrip('>'))
            align, names = parsed[gene]

            leaves_file = f"./alignments/leaves/{host}/{host}_leaves_{gene}.tsv"
            # NOTE(review): the file is named .tsv but is read with pandas'
            # default "," separator; presumably it holds one strain per
            # line with no commas -- confirm.
            leaves_df = pd.read_csv(leaves_file, header=None, names=["strain"])

            # Keep only records whose ">"-less header is a listed strain,
            # then rebuild the header with a single leading ">".
            keep = names.isin(leaves_df['strain'])
            align_filtered = align[keep].copy()
            align_filtered['header'] = '>' + names[keep]

            print(host, align_filtered.shape)

            fasta_writer(f"./{host}/", f"aligned_h3nx_{gene}.fasta", align_filtered)
                   

In [34]:
# Filter and write the alignment for every host/gene combination; prints one
# "<host> <shape>" line per pair (genes follow the order of list_of_genes).
alignments(hosts, list_of_genes)

allSwine (1902, 2)
allSwine (1940, 2)
allSwine (1931, 2)
allSwine (1925, 2)
allSwine (1931, 2)
allSwine (1933, 2)
allSwine (1912, 2)
allSwine (1938, 2)
swineEuro (349, 2)
swineEuro (363, 2)
swineEuro (361, 2)
swineEuro (361, 2)
swineEuro (379, 2)
swineEuro (362, 2)
swineEuro (350, 2)
swineEuro (362, 2)
swineNA (646, 2)
swineNA (840, 2)
swineNA (838, 2)
swineNA (841, 2)
swineNA (253, 2)
swineNA (841, 2)
swineNA (716, 2)
swineNA (838, 2)
allAvian (2310, 2)
allAvian (2317, 2)
allAvian (2317, 2)
allAvian (2317, 2)
allAvian (2318, 2)
allAvian (2318, 2)
allAvian (2318, 2)
allAvian (2316, 2)
avianNA (1205, 2)
avianNA (1340, 2)
avianNA (1341, 2)
avianNA (1336, 2)
avianNA (1345, 2)
avianNA (1273, 2)
avianNA (861, 2)
avianNA (505, 2)
avianEurasia (861, 2)
avianEurasia (854, 2)
avianEurasia (858, 2)
avianEurasia (849, 2)
avianEurasia (855, 2)
avianEurasia (863, 2)
avianEurasia (525, 2)
avianEurasia (639, 2)
canineH3N8 (46, 2)
canineH3N8 (46, 2)
canineH3N8 (46, 2)
canineH3N8 (46, 2)
canineH3N8 (45