In [None]:
## The goal of this script is to pull out the tip names for each subtree that has more than 2 tips (for MCLEOD, changed to 200+),
## and then to make alignment FASTA files of these sequences with metadata added to the headers.


In [None]:
import pandas as pd
import numpy as np
from Bio import SeqIO # for handling fastas
import glob  # To handle multiple files
import os

In [None]:
# Load the table exported from the baltic step (dataframe1).
# The file is tab-delimited even though it carries a .csv extension.
df = pd.read_csv('totalinfo_parsedon_mcleodstatus.csv', delimiter='\t')
df

In [None]:
# Keep only rows whose branchType is not 'node'; node rows are not needed for the alignments.
df = df.loc[df['branchType'].ne('node')]
df

In [None]:
# Summarize column dtypes and non-null counts of the table after the node rows were removed.
df.info()

In [None]:
# Drop every row whose mcleod_status is 'non-MN'; those subtrees are not of interest here.
df = df.loc[df['mcleod_status'].ne('non-MN')]

# Peek at the last rows to sanity-check the filter.
df.tail(15)

In [None]:
# Size cutoff: only subtrees with MORE than this many rows (tips) are kept.
# Lower it to keep more subtrees, raise it to keep fewer.
# NOTE(review): earlier notes in this notebook mention a "200+" cutoff, but the value
# actually applied has been 20 -- confirm which one is intended before changing it.
MIN_SUBTREE_TIPS = 20

# Map each sufficiently large subtree name to the list of tip names it contains.
subtree_dict_dt = (
    df.groupby('subtree_name')
    .filter(lambda group: len(group) > MIN_SUBTREE_TIPS)
    .groupby('subtree_name')['name']
    .apply(list)
    .to_dict()
)
subtree_dict_dt

In [None]:
# Split the single alignment file from the full build into one FASTA file per subtree.

# Input alignment and destination folder for the per-subtree files.
input_fasta = "aligned.fasta"
output_directory = "/Users/irelandrosegorecki/Desktop/mcleod/"

# Create the destination folder up front so a fresh run does not fail when opening output files.
os.makedirs(output_directory, exist_ok=True)

# Read all sequences from the FASTA file into a dictionary keyed by record ID for easy access.
sequences = SeqIO.to_dict(SeqIO.parse(input_fasta, "fasta"))

# Create a separate alignment file for each subtree.
for subtree_name, leaf_names in subtree_dict_dt.items():
    # Extract the sequences whose IDs match the tip names of the current subtree.
    subtree_sequences = [sequences[name] for name in leaf_names if name in sequences]

    # Report tips that have no sequence in the alignment instead of dropping them silently.
    missing_names = [name for name in leaf_names if name not in sequences]
    if missing_names:
        print(
            f"Warning: {len(missing_names)} of {len(leaf_names)} tips in {subtree_name} "
            f"were not found in {input_fasta} and were skipped"
        )

    # Define the output filename based on the subtree name.
    output_file = os.path.join(output_directory, f"{subtree_name}_alignment.fasta")

    # Write the extracted sequences to the output file.
    with open(output_file, "w") as output_handle:
        SeqIO.write(subtree_sequences, output_handle, "fasta")

    print(f"Alignment saved for {subtree_name}: {output_file}")

In [None]:
# List the column names of df, to check them against the metadata fields used for the FASTA headers.
df.columns

In [None]:
# Append metadata fields to the header of every sequence in each subtree alignment file.

# Dictionary for quick lookup of a row's metadata by its 'name' value (the FASTA record ID).
metadata_dict = df.set_index('name').to_dict(orient='index')

# Metadata columns, in the order they are appended to the sequence ID.
metadata_fields = [
    'num_date',
    'Urban_or_Rural_USDA_Classification',
    'adi_staternk',
    'age_bins',
    'vbt_vax_count',
]

# Placeholder used for records that have no row in the metadata table.
unknown_metadata = {field: 'unknown' for field in metadata_fields}

# Suffix that marks files produced by this cell.
meta_suffix = "_w_meta.fasta"

# Collect every FASTA file in the folder (adjust path as needed). The wildcard is required:
# without it the pattern only matches a file literally named ".fasta".
# Outputs of a previous run are skipped so re-running never yields "_w_meta_w_meta" files.
input_files = sorted(
    path
    for path in glob.glob('/Users/irelandrosegorecki/Desktop/mcleod/*.fasta')
    if not path.endswith(meta_suffix)
)

if not input_files:
    print("No input FASTA files found; nothing to do")

for fasta_file in input_files:
    # Swap only the trailing extension; str.replace would touch every ".fasta" in the path.
    output_file = os.path.splitext(fasta_file)[0] + meta_suffix

    with open(fasta_file, 'r') as input_handle, open(output_file, 'w') as output_handle:
        for record in SeqIO.parse(input_handle, 'fasta'):
            # Find the corresponding metadata (default to 'unknown' if not found).
            metadata = metadata_dict.get(record.id, unknown_metadata)

            # Update the header: "<id>|<field1>|...|<field5>|" (trailing pipe kept as before).
            record.id = f"{record.id}|" + "".join(f"{metadata[field]}|" for field in metadata_fields)
            record.description = ""  # Clear description to avoid duplicating info

            # Write the modified record to the output file.
            SeqIO.write(record, output_handle, 'fasta')

    print(f"Modified FASTA file saved to {output_file}")



In [None]:
# ##now subfolder with 50plus tips!
# import os
# import shutil

# # Define the source folder
# source_folder = "../../results_alignments/parsed_subtree_alignments/om/ba2/"

# # Define the list of prefixes
# prefixes = ["Wisconsin_131_", "Wisconsin_834_", "Wisconsin_225_", "Wisconsin_815_", "Wisconsin_314_"]

# # Define the new subfolder
# subfolder = os.path.join(source_folder, "50plustips")
# print(subfolder)
# os.makedirs(subfolder, exist_ok=True)  # Create the subfolder if it doesn't exist

# # Get all files in the source folder
# all_files = os.listdir(source_folder)

# # Filter and move files based on the prefixes
# for file_name in all_files:
#     # Check if the file starts with any of the prefixes
#     if any(file_name.startswith(prefix) for prefix in prefixes):
#         source_file = os.path.join(source_folder, file_name)
#         destination_file = os.path.join(subfolder, file_name)
        
#         # Move the file
#         shutil.move(source_file, destination_file)
#         print(f"Moved: {file_name}")

# print("File organization complete!")
