# To perform MACSE alignment

In [None]:
import os
import subprocess
from concurrent.futures import ThreadPoolExecutor

def run_macse(input_path, output_path, java_path, macse_jar_path):
    """Align one FASTA file with MACSE's ``alignSequences`` program.

    Args:
        input_path: FASTA file handed to MACSE through ``-seq``.
        output_path: Destination handed to MACSE through ``-out_NT``.
        java_path: Java executable used to launch the JAR.
        macse_jar_path: Path to the MACSE JAR file.

    Returns:
        None. Progress and errors are reported with ``print``. A non-zero
        exit status from MACSE is reported and not re-raised.
    """
    macse_cmd = [
        java_path,
        '-jar', macse_jar_path,
        '-prog', 'alignSequences',
        '-seq', input_path,
        '-out_NT', output_path,
    ]

    try:
        result = subprocess.run(
            macse_cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            check=True,
            text=True,
        )
    except subprocess.CalledProcessError as e:
        print(f"Error during alignment for {input_path}. Details:\n{e}")
        # str(e) only carries the command and exit status; the tool's own
        # diagnostics live in the captured streams, so surface them too.
        if e.stdout:
            print(e.stdout)
        if e.stderr:
            print(e.stderr)
    else:
        print(result.stdout)
        print(result.stderr)

        print(f"Alignment for {input_path} completed. Output saved to {output_path}")

def convert_to_phylip(input_path, output_path, java_path, macse_jar_path):
    """Convert an alignment to PHYLIP by invoking the MACSE JAR.

    Args:
        input_path: Alignment file handed to the converter through ``-in``.
        output_path: Destination handed through ``-out_PHYLIP``.
        java_path: Java executable used to launch the JAR.
        macse_jar_path: Path to the MACSE JAR file.

    Returns:
        None. Success or failure is reported with ``print``; a non-zero exit
        status is reported and not re-raised.

    NOTE(review): confirm that the MACSE release in use actually provides an
    ``AlignmentConverter`` program with an ``-out_PHYLIP`` option.
    """
    converter_cmd = [
        java_path,
        '-jar', macse_jar_path,
        '-prog', 'AlignmentConverter',
        '-in', input_path,
        '-out_PHYLIP', output_path,
    ]

    try:
        # check=True so a failed conversion is not reported as a success.
        subprocess.run(
            converter_cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            check=True,
            text=True,
        )
    except subprocess.CalledProcessError as e:
        print(f"Error during PHYLIP conversion for {input_path}. Details:\n{e}")
        if e.stdout:
            print(e.stdout)
        if e.stderr:
            print(e.stderr)
        return

    print(f"Conversion of {input_path} to PHYLIP format completed. Output saved to {output_path}")

def align_and_convert_parallel(orthogroup_folder, output_folder, num_threads, java_path, macse_jar_path):
    """Align every ``.fasta`` file in a folder, then convert the results to PHYLIP.

    All alignment tasks are submitted to a thread pool and awaited before any
    conversion task is submitted.

    Args:
        orthogroup_folder: Folder scanned for input files ending in ``.fasta``.
        output_folder: Folder receiving ``<name>_aligned.fasta`` and
            ``<name>_aligned.phylip``; created if it does not exist.
        num_threads: ``max_workers`` for the thread pool.
        java_path: Java executable forwarded to the worker functions.
        macse_jar_path: MACSE JAR path forwarded to the worker functions.
    """
    fasta_suffix = '.fasta'

    # Sorted so that submission order is reproducible between runs.
    input_files = sorted(f for f in os.listdir(orthogroup_folder) if f.endswith(fasta_suffix))

    # The output folder must exist before anything tries to write into it.
    os.makedirs(output_folder, exist_ok=True)

    # One (input, aligned output, phylip output) triple per orthogroup. Only
    # the trailing suffix is stripped, so a '.fasta' earlier in the file name
    # is left untouched.
    jobs = []
    for input_file in input_files:
        stem = input_file[:-len(fasta_suffix)]
        jobs.append((
            os.path.join(orthogroup_folder, input_file),
            os.path.join(output_folder, stem + '_aligned.fasta'),
            os.path.join(output_folder, stem + '_aligned.phylip'),
        ))

    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        # Submit tasks to align sequences
        align_futures = [
            executor.submit(run_macse, input_path, aligned_path, java_path, macse_jar_path)
            for input_path, aligned_path, _ in jobs
        ]

        # Wait for all alignment tasks to complete
        for future in align_futures:
            future.result()

        # Submit conversion tasks only for alignments that exist on disk;
        # converting a file that was never written can only fail.
        convert_futures = []
        for _, aligned_path, phylip_path in jobs:
            if not os.path.exists(aligned_path):
                print(f"Skipping PHYLIP conversion for {aligned_path}: alignment file was not produced")
                continue
            convert_futures.append(
                executor.submit(convert_to_phylip, aligned_path, phylip_path, java_path, macse_jar_path)
            )

        # Wait for all conversion tasks to complete
        for future in convert_futures:
            future.result()

# Specify the paths and parameters
orthogroup_folder = '/usr2/people/shollyt22/shollyt22/JGIsordariomycete/revised_OG_31_spp/written_OG_nucl/Filtered_OG_divisible_by_3/fasta_files'
output_folder = '/usr2/people/shollyt22/shollyt22/JGIsordariomycete/revised_OG_31_spp/written_OG_nucl/Filtered_OG_divisible_by_3/macse_realignment_files'
num_threads = 4  # Adjust the number of threads based on your system's capabilities

# Specify the full path to the MACSE JAR file
macse_jar_path = '/usr2/people/shollyt22/macse_v2.07.jar'

# Obtain the Java path
java_path_result = subprocess.run(['which', 'java'], capture_output=True, text=True)
java_path = java_path_result.stdout.strip()

# Stop here if no Java executable was found: an empty java_path would
# otherwise be used as the program name of every MACSE command.
if java_path_result.returncode != 0 or not java_path:
    raise FileNotFoundError("Could not locate a 'java' executable on PATH; it is required to run MACSE")

# Run the alignment script
align_and_convert_parallel(orthogroup_folder, output_folder, num_threads, java_path, macse_jar_path)


file : /usr2/people/shollyt22/shollyt22/JGIsordariomycete/revised_OG_31_spp/written_OG_nucl/Filtered_OG_divisible_by_3/fasta_files/OG0003060.fasta
31	sequences with genetic code	The_Standard_Code
compute initial pairwise distances
...............................
compute first alignment with guide tree
++++++++++++++++++++++++++++++
first alignment score : 1356060.1

start refining the alignment
.-......................................+...................
.-......................................-------------------.
The file '/usr2/people/shollyt22/shollyt22/JGIsordariomycete/revised_OG_31_spp/written_OG_nucl/Filtered_OG_divisible_by_3/macse_realignment_files/OG0003060_aligned.fasta' was created.
The file '/usr2/people/shollyt22/shollyt22/JGIsordariomycete/revised_OG_31_spp/written_OG_nucl/Filtered_OG_divisible_by_3/fasta_files/OG0003060_AA.fasta' was created.
PROGRAM HAS FINISHED SUCCESSFULLY


Alignment for /usr2/people/shollyt22/shollyt22/JGIsordariomycete/revised_OG_31_spp/written_OG

## To convert the MACSE alignment files to PHYLIP format

In [2]:
from Bio import AlignIO
import os

def convert_fasta_to_phylip(input_path, output_path):
    """Rewrite a FASTA alignment as a "phylip-relaxed" file.

    Args:
        input_path: FASTA alignment read with ``AlignIO.read``.
        output_path: Destination written with ``AlignIO.write``.
    """
    AlignIO.write(AlignIO.read(input_path, "fasta"), output_path, "phylip-relaxed")

def batch_convert_orthogroups(input_folder, output_folder):
    """Convert every ``*_aligned.fasta`` file in a folder to PHYLIP.

    Args:
        input_folder: Folder scanned for names ending in ``_aligned.fasta``.
        output_folder: Folder receiving the ``*_aligned.phylip`` files;
            created if it does not exist.
    """
    os.makedirs(output_folder, exist_ok=True)

    for entry in os.listdir(input_folder):
        # Anything that is not a MACSE alignment output is ignored.
        if not entry.endswith("_aligned.fasta"):
            continue

        phylip_name = entry.replace("_aligned.fasta", "_aligned.phylip")
        convert_fasta_to_phylip(
            os.path.join(input_folder, entry),
            os.path.join(output_folder, phylip_name),
        )

# Run the batch conversion: PHYLIP files go to a sub-folder of the MACSE
# realignment folder that is scanned for input.
phylip_conversion_folder = "/usr2/people/shollyt22/shollyt22/JGIsordariomycete/revised_OG_31_spp/written_OG_nucl/Filtered_OG_divisible_by_3/macse_realignment_files/phylip_conversion"
orthogroup_folder = "/usr2/people/shollyt22/shollyt22/JGIsordariomycete/revised_OG_31_spp/written_OG_nucl/Filtered_OG_divisible_by_3/macse_realignment_files"

batch_convert_orthogroups(
    input_folder=orthogroup_folder,
    output_folder=phylip_conversion_folder,
)


### To generate PHYLIP output in sequential rather than interleaved format

In [3]:
from Bio import AlignIO
import os

def convert_fasta_to_phylip(input_path, output_path):
    """Rewrite a FASTA alignment as a "phylip-sequential" file.

    Args:
        input_path: FASTA alignment read with ``AlignIO.read``.
        output_path: Destination written with ``AlignIO.write``.

    NOTE(review): Biopython's non-relaxed PHYLIP writers are believed to
    truncate sequence names to 10 characters - confirm the names stay unique.
    """
    fasta_alignment = AlignIO.read(input_path, "fasta")
    AlignIO.write(fasta_alignment, output_path, "phylip-sequential")

def batch_convert_orthogroups(input_folder, output_folder):
    """Convert every ``*_aligned.fasta`` file in a folder to sequential PHYLIP.

    Args:
        input_folder: Folder scanned for names ending in ``_aligned.fasta``.
        output_folder: Folder receiving the ``*_aligned.phylip`` files;
            created if it does not exist.
    """
    os.makedirs(output_folder, exist_ok=True)

    # Keep only the MACSE alignment outputs, in directory-listing order.
    aligned_names = [
        name for name in os.listdir(input_folder)
        if name.endswith("_aligned.fasta")
    ]

    for name in aligned_names:
        source = os.path.join(input_folder, name)
        target = os.path.join(
            output_folder,
            name.replace("_aligned.fasta", "_aligned.phylip"),
        )
        convert_fasta_to_phylip(source, target)

# Run the batch conversion: sequential PHYLIP files go to their own
# sub-folder of the MACSE realignment folder that is scanned for input.
phylip_conversion_folder = "/usr2/people/shollyt22/shollyt22/JGIsordariomycete/revised_OG_31_spp/written_OG_nucl/Filtered_OG_divisible_by_3/macse_realignment_files/phylip_conversion_sequential"
orthogroup_folder = "/usr2/people/shollyt22/shollyt22/JGIsordariomycete/revised_OG_31_spp/written_OG_nucl/Filtered_OG_divisible_by_3/macse_realignment_files"

batch_convert_orthogroups(
    input_folder=orthogroup_folder,
    output_folder=phylip_conversion_folder,
)
