# Let's Parse our Mummer Alignments! 

In [5]:
# Import some tools
import os
import csv
import glob
from Bio import SeqIO
from collections import defaultdict
# CLI stuff for MP
import argparse
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor, as_completed

## Define our main functions

In [None]:
def parse_genbank(path):
    """Load every record of a GenBank file into a mapping keyed by record ID.

    Args:
        path: Location of the GenBank file; handed straight to ``SeqIO.parse``
            with the ``'genbank'`` format.

    Returns:
        A ``defaultdict`` built without a default factory (so a missing key
        still raises ``KeyError``) mapping ``record.id`` to the parsed record.
        When an ID occurs more than once, the last record wins.
    """
    by_id = defaultdict()
    by_id.update((rec.id, rec) for rec in SeqIO.parse(path, 'genbank'))
    return by_id

def get_alignment_files(alignments_dir):
    """Collect every ``align_coords.tsv`` found anywhere beneath a directory.

    Args:
        alignments_dir: Root directory to search recursively.

    Returns:
        A list of matching paths, in whatever order ``glob`` yields them
        (empty when nothing matches).
    """
    pattern = f'{alignments_dir}/**/align_coords.tsv'
    return glob.glob(pattern, recursive=True)

def check_alignment(path):
    """Report whether an alignment table holds at least one alignment row.

    The first line of the file is treated as the column header. Files with no
    alignments are expected and common, so this is a cheap pre-check before
    parsing.

    Args:
        path: Path to a tab-separated alignment table.

    Returns:
        True when at least one non-blank line follows the header; False for a
        header-only file, a completely empty file, or a header followed only
        by blank lines.
    """
    # {{TO-DO: Add way to gather no-alns into a table}}
    with open(path, 'r') as infile:
        # Skip the header; the default keeps an empty file from raising.
        next(infile, None)
        # Stream the rest instead of loading it: one real row is enough.
        return any(line.strip() for line in infile)

def parse_alignment(path):
    """Parse a tab-separated alignment table into a list of row dictionaries.

    The first line supplies the column names; every following non-blank line
    becomes one dictionary mapping column name to the raw string value.

    Args:
        path: Path to the tab-separated alignment table.

    Returns:
        A list of ``{column: value}`` dictionaries, one per data line. Empty
        when the file is empty or holds only a header. If a data line has
        fewer fields than the header, the trailing columns are simply absent
        from that row's dictionary (``zip`` stops at the shorter side).
    """
    with open(path, 'r') as infile:
        lines = infile.readlines()
    if not lines:
        return []
    keys = lines[0].strip().split('\t')
    alignments = []
    for line in lines[1:]:
        stripped = line.strip()
        if not stripped:
            # A blank (e.g. trailing) line is not an alignment.
            continue
        alignments.append(dict(zip(keys, stripped.split('\t'))))
    return alignments

def is_within_range(feature, start, end):
    """Tell whether a feature lies entirely inside an aligned interval.

    The interval bounds may be given in either order: alignments can carry
    ``start > end`` (the coverage calculation in this file swaps them for the
    same reason), and without normalising no feature could ever satisfy both
    comparisons.

    Args:
        feature: Object exposing ``location.start`` and ``location.end``.
        start: One bound of the interval.
        end: The other bound of the interval.

    Returns:
        True when the feature's start is at or after the lower bound and its
        end is at or before the upper bound; False otherwise.
    """
    low, high = (start, end) if start <= end else (end, start)
    # NOTE(review): if feature locations are 0-based (as Biopython's are)
    # while the alignment coordinates are 1-based, the start comparison is off
    # by one for a feature beginning exactly at the interval start — confirm.
    # {{TO-DO: Implement partial gene hit identification}}
    return int(feature.location.start) >= low and int(feature.location.end) <= high

def get_features_from_range(record, start, end):
    """Pick the CDS features of a record that fall inside an interval.

    Args:
        record: Object exposing a ``features`` iterable.
        start: Interval start, forwarded to ``is_within_range``.
        end: Interval end, forwarded to ``is_within_range``.

    Returns:
        The whole feature objects (not just their IDs) whose type is ``'CDS'``
        and for which ``is_within_range`` is true, in record order.
    """
    selected = []
    for candidate in record.features:
        # Range test first, then the type test, as two separate guards.
        if not is_within_range(candidate, start, end):
            continue
        if candidate.type == 'CDS':
            selected.append(candidate)
    return selected

def simplify_genes_for_contig(features):
    """Reduce features to lightweight ``(locus_tag, product)`` tuples.

    Args:
        features: Iterable of objects whose ``qualifiers`` mapping holds list
            values under ``'locus_tag'`` and ``'product'``.

    Returns:
        A list with one ``(locus_tag, product)`` tuple per feature. The locus
        tag is the first ``'locus_tag'`` entry with surrounding quotes and
        square brackets stripped; the product is all ``'product'`` entries
        joined by single spaces.

    Raises:
        KeyError: If a feature lacks either qualifier.
    """
    def _summarise(feature):
        tag = feature.qualifiers['locus_tag'][0]
        # Strip one kind of character at a time, in this order, so leftovers
        # from a stringified list ("['X']") are peeled off layer by layer.
        for junk in ("'", '[', ']'):
            tag = tag.strip(junk)
        return (tag, ' '.join(feature.qualifiers['product']))

    return [_summarise(feature) for feature in features]

def get_coverage(length, coords):
    """Compute the percentage of a sequence covered by a set of intervals.

    Intervals are inclusive on both ends, may overlap, and may be given with
    ``start > end``; each is normalised before use. Overlapping positions are
    counted once. Intervals are sorted and merged rather than expanded into a
    set of individual positions, so memory use scales with the number of
    intervals instead of the number of bases covered.

    Args:
        length: Total sequence length used as the denominator.
        coords: Iterable of ``(start, end)`` pairs of integer coordinates.

    Returns:
        Covered positions divided by ``length``, times 100, as a float
        (``0.0`` for no intervals). Nothing clips the intervals to
        ``length``, so the value can exceed 100.

    Raises:
        ZeroDivisionError: If ``length`` is 0.
    """
    # Swapping is only needed for coverage arithmetic; it discards orientation.
    spans = sorted(
        (min(position[0], position[1]), max(position[0], position[1]))
        for position in coords
    )
    covered = 0
    current_start = current_end = None
    for start, end in spans:
        if current_end is None or start > current_end:
            # Disjoint from the running span: bank it and start a new one.
            if current_end is not None:
                covered += current_end - current_start + 1
            current_start, current_end = start, end
        elif end > current_end:
            # Overlapping: extend the running span.
            current_end = end
    if current_end is not None:
        covered += current_end - current_start + 1
    percent_coverage = (covered / length) * 100
    return percent_coverage

def get_homologies(alignments, ref_dict, asm_dict):
    """Group alignment rows by contig and reference, attaching gene hits.

    (Formerly ``get_genes_from_alignments()``.)

    Args:
        alignments: Iterable of row dictionaries with the keys ``QUERY_ID``,
            ``QUERY_NAME``, ``QUERY_START``, ``QUERY_END``, ``QUERY_LENGTH``,
            ``REF_ID``, ``REF_NAME``, ``REF_START``, ``REF_END``,
            ``REF_LENGTH`` and ``IDENTITY``.
        ref_dict: Mapping of reference name to a record with ``.seq`` and
            features; looked up by the row's ``QUERY_NAME``.
        asm_dict: Mapping of contig name to a record with ``.seq`` and
            features; looked up by the row's ``REF_NAME``.

    Returns:
        A ``defaultdict`` (no default factory) keyed by contig name. Each
        value is a ``defaultdict(dict)`` holding the integer
        ``'contig_length'`` plus, per reference name, a list of dictionaries
        with a ``'ref_aln'`` and an ``'asm_aln'`` section (coordinates,
        aligned length, a zero ``'percent_cov'`` placeholder and the
        ``(locus_tag, product)`` gene tuples found in the aligned range).
    """
    homologies = defaultdict()
    for row in alignments:
        # Mind the naming: mummer reports the reference as the QUERY and the
        # assembly as the REF, so the columns are read "crosswise" here.
        ref_id = row['QUERY_ID']  # read but not used further
        ref_name = row['QUERY_NAME']
        ref_start, ref_end = int(row['QUERY_START']), int(row['QUERY_END'])
        ref_aln_length = int(row['QUERY_LENGTH'])  # aligned length
        ref_record = ref_dict[ref_name]
        ref_length = len(ref_record.seq)

        assembly_id = row['REF_ID']  # read but not used further
        contig_name = row['REF_NAME']
        contig_start, contig_end = int(row['REF_START']), int(row['REF_END'])
        contig_aln_length = int(row['REF_LENGTH'])  # aligned length
        aln_identity = row['IDENTITY']
        contig_record = asm_dict[contig_name]
        contig_length = len(contig_record.seq)

        ref_features = get_features_from_range(ref_record, ref_start, ref_end)
        asm_features = get_features_from_range(contig_record, contig_start, contig_end)
        ref_genes = simplify_genes_for_contig(ref_features)
        asm_genes = simplify_genes_for_contig(asm_features)

        # Percent coverage is deliberately NOT computed per alignment here;
        # it is derived later from the pooled coordinates of each pair.

        per_contig = homologies.get(contig_name)
        if per_contig is None:
            # First sighting of this contig: create its entry and stash the
            # contig length under its own key next to the reference names.
            per_contig = homologies[contig_name] = defaultdict(dict)
            per_contig['contig_length'] = int(contig_length)

        # First sighting of this reference for the contig: start its list.
        per_contig.setdefault(ref_name, [])

        ref_section = {
            'ref_length': ref_length,
            'start': ref_start,
            'end': ref_end,
            'aln_length': ref_aln_length,
            'percent_cov': 0,  # placeholder, never filled in here
            'features': ref_genes,
        }
        asm_section = {
            'start': contig_start,
            'end': contig_end,
            'aln_length': contig_aln_length,
            'aln_identity': aln_identity,  # identity to the reference
            'percent_cov': 0,  # placeholder, never filled in here
            'features': asm_genes,
        }
        per_contig[ref_name].append({'ref_aln': ref_section, 'asm_aln': asm_section})
    return homologies

def make_table_for_asm(genes_dict, output_path):
    """Write a per-(contig, reference) coverage and gene-count table.

    One row is written for every reference a contig aligns to, preceded by a
    header row. Gene counts are per row: they cover only the alignments
    between that contig and that reference, rather than accumulating over the
    references already written for the same contig.

    Args:
        genes_dict: Mapping of contig name to a mapping that holds
            ``'contig_length'`` plus, per reference name, a list of alignment
            dictionaries with ``'ref_aln'`` and ``'asm_aln'`` sections (each
            providing ``'start'``, ``'end'`` and ``'features'``; ``'ref_aln'``
            also provides ``'ref_length'``).
        output_path: Destination of the tab-separated table.

    Returns:
        None. The table is written to ``output_path``.
    """
    header_row = 'contig_id\tcontig_len\tref\tref_len\tcontig_cov\treference_cov\tgenes_on_ref\tgenes_on_contig\n'
    lines = [header_row]
    for contig, contig_entry in genes_dict.items():
        contig_len = contig_entry['contig_length']
        for ref_name, ref_alns in contig_entry.items():
            if ref_name == 'contig_length':
                # Bookkeeping key stored alongside the references, not a ref.
                continue
            asm_coords = [(aln['asm_aln']['start'], aln['asm_aln']['end']) for aln in ref_alns]
            ref_coords = [(aln['ref_aln']['start'], aln['ref_aln']['end']) for aln in ref_alns]
            ref_len = ref_alns[0]['ref_aln']['ref_length']
            # A gene hit by several alignments of the same pair is counted
            # once per alignment (no de-duplication).
            genes_on_ref = sum(len(aln['ref_aln']['features']) for aln in ref_alns)
            genes_on_contig = sum(len(aln['asm_aln']['features']) for aln in ref_alns)
            contig_cov = get_coverage(contig_len, asm_coords)
            reference_cov = get_coverage(ref_len, ref_coords)
            lines.append(
                f'{contig}\t{contig_len}\t{ref_name}\t{ref_len}\t{contig_cov:.2f}\t'
                f'{reference_cov:.2f}\t{genes_on_ref}\t{genes_on_contig}\n'
            )
    with open(output_path, 'w') as outfile:
        outfile.writelines(lines)

def get_rows(assembly_id, alignments, genes_dict):
    """Format one summary row per contig, separate from the detailed table.

    Args:
        assembly_id: Identifier written in the first column of every row.
        alignments: Unused; kept so existing callers keep working.
        genes_dict: Mapping of contig name to a mapping that holds
            ``'contig_length'`` plus, per reference name, a list of alignment
            dictionaries whose ``'asm_aln'`` section provides ``'start'`` and
            ``'end'``.

    Returns:
        A list of rows (lists). The first is the header; each following row
        holds the assembly ID, contig name, contig length, the summed contig
        coverage formatted to two decimals, and then one ``'<ref>: <cov>%'``
        cell per reference the contig aligns to. The total is a plain sum of
        the per-reference coverages, so it can exceed 100 when references
        cover the same stretch of the contig.
    """
    header = ['assembly_id','contig','contig_len','total_contig_coverage','list_of_alignments(ref:contig_cov:location)']
    rows = [header]
    for contig, contig_entry in genes_dict.items():
        contig_len = contig_entry['contig_length']
        per_ref_cells = []
        total_contig_cov = 0
        for ref_name, ref_alns in contig_entry.items():
            if ref_name == 'contig_length':
                # Bookkeeping key stored alongside the references, not a ref.
                continue
            asm_coords = [(aln['asm_aln']['start'], aln['asm_aln']['end']) for aln in ref_alns]
            # Only contig-side coverage is reported; the reference-side
            # coverage used to be computed here too but was never used.
            contig_cov = get_coverage(contig_len, asm_coords)
            per_ref_cells.append(f'{ref_name}: {contig_cov:.2f}%')
            total_contig_cov += contig_cov
        rows.append([assembly_id, contig, contig_len, f'{total_contig_cov:.2f}', *per_ref_cells])
    return rows

def write_rows(rows, output_file):
    """Write rows to a tab-separated file.

    Args:
        rows: Iterable of row sequences; each becomes one line.
        output_file: Destination path; overwritten if it exists.

    Returns:
        None.
    """
    # newline='' lets the csv writer control line endings itself; without it
    # platforms that translate '\n' end up with an extra '\r' on every row.
    with open(output_file, 'w', newline='') as outfile:
        writer = csv.writer(outfile, delimiter='\t')
        writer.writerows(rows)

def parse_b31_alns(alignment, annotations_dir, ref_dict, output_dir):
    """Parse one sample's alignments against the reference and write tables.

    The sample ID is the name of the directory that contains the alignment
    file. Two tables are written: a detailed per-(contig, reference) table
    under ``<output_dir>/detailed_coverage`` and a per-contig summary under
    ``<output_dir>/simple_coverage``; both directories are created if needed.

    Args:
        alignment: Path to the sample's alignment table, of the form
            ``.../<sample_id>/<file>`` with ``/`` separators.
        annotations_dir: Directory holding ``<sample_id>/<sample_id>.gbff``.
        ref_dict: Parsed reference records, keyed by record ID.
        output_dir: Root directory for the two output tables.

    Returns:
        None.
    """
    sample_id = alignment.split('/')[-2]
    assembly = f'{annotations_dir}/{sample_id}/{sample_id}.gbff'
    asm_dict = parse_genbank(assembly)
    alignments = parse_alignment(alignment)
    homologies = get_homologies(alignments, ref_dict, asm_dict)

    output_file_detailed = f'{output_dir}/detailed_coverage/{sample_id}_B31_Synteny.tsv'
    os.makedirs(os.path.dirname(output_file_detailed), exist_ok=True)
    make_table_for_asm(homologies, output_file_detailed)

    rows = get_rows(sample_id, alignments, homologies)
    output_file_simple = f'{output_dir}/simple_coverage/{sample_id}_coverage.tsv'
    os.makedirs(os.path.dirname(output_file_simple), exist_ok=True)
    write_rows(rows, output_file_simple)

def run_all_b31_alns(alignments_dir, annotations_dir, reference_genbank, output_dir):
    """Parse every sample's alignment against the reference, one at a time.

    Args:
        alignments_dir: Directory whose immediate sub-directories each hold an
            ``align_coords.tsv``.
        annotations_dir: Directory holding the per-sample GenBank files.
        reference_genbank: Path to the reference GenBank file; parsed once and
            shared across all samples.
        output_dir: Root directory for the output tables.

    Returns:
        None.
    """
    ref_dict = parse_genbank(reference_genbank)
    alignment_files = glob.glob(f'{alignments_dir}/*/align_coords.tsv')
    for alignment in alignment_files:
        parse_b31_alns(alignment, annotations_dir, ref_dict, output_dir)

def parse_ids_from_filename(alignment):
    """Extract the two assembly IDs from an alignment file's parent directory.

    The parent directory is expected to be named ``<asm1>_vs_<asm2>``.

    Args:
        alignment: Path to an alignment file, using ``/`` separators.

    Returns:
        A ``(asm1_id, asm2_id)`` tuple: the text before the first ``_vs_`` and
        the text between the first and any second ``_vs_``.

    Raises:
        IndexError: If the directory name contains no ``_vs_``.
    """
    parent_name = os.path.dirname(alignment).split('/')[-1]
    pieces = os.path.basename(parent_name).split('_vs_')
    return pieces[0], pieces[1]

def parse_single_pair_aln(alignment, annotations_dir):
    """Parse the alignment of a single assembly pair into summary rows.

    No per-pair file is written; the rows are handed back so the caller can
    concatenate them into one big table covering every pair.

    Args:
        alignment: Path to the pair's alignment table, located in a directory
            named ``<asm1>_vs_<asm2>``.
        annotations_dir: Directory holding ``<asm_id>.gbff`` for both IDs.

    Returns:
        A 3-tuple ``(found, message, payload)``. When the table holds no
        alignments: ``(False, <no-homology message>, [asm1_id, asm2_id])``.
        Otherwise: ``(True, <completion message>, rows)`` with ``rows`` as
        produced by ``get_rows``.
    """
    # {{TO-DO: Not at this point but at some point I need to pull in the plasmid ID to name mapping dict.}}
    asm1_id, asm2_id = parse_ids_from_filename(alignment)

    # Bail out early for pairs without any alignment, returning just the IDs.
    has_hits = check_alignment(alignment)
    if not has_hits:
        return (False, f'NO HOMOLOGY BETWEEN {asm1_id} and {asm2_id}!', [asm1_id, asm2_id])

    # There is something to parse: read the table, then both GenBank files.
    alignments = parse_alignment(alignment)
    genbank_dicts = [
        parse_genbank(f'{annotations_dir}/{asm_id}.gbff')
        for asm_id in (asm1_id, asm2_id)
    ]

    # Attach genes to the alignments, then flatten them into rows.
    homologies = get_homologies(alignments, genbank_dicts[0], genbank_dicts[1])
    rows = get_rows(asm1_id, alignments, homologies)
    return (True, f'Parsed homology between {asm1_id} and {asm2_id}!', rows)

def parallel_parse(cpus, alignment_files, annotations_dir):
    """Parse many pairwise alignment files in parallel worker processes.

    Each file is handed to ``parse_single_pair_aln`` in a process pool. Rows
    from pairs with homology are concatenated into one flat list of rows,
    keeping the header row only once; pairs without homology contribute their
    two IDs as one row of a second list. A failure in one worker is reported
    and skipped so the remaining pairs are still processed.

    Args:
        cpus: Maximum number of worker processes.
        alignment_files: Iterable of alignment file paths.
        annotations_dir: Directory with the GenBank files, passed to workers.

    Returns:
        A tuple ``(all_rows, no_homology)``: the flat list of summary rows
        (header first, if any pair produced rows) and the list of
        ``[asm1_id, asm2_id]`` pairs that had no alignments.
    """
    all_rows = []
    no_homology = []
    with ProcessPoolExecutor(max_workers=cpus) as executor:
        futures = [
            executor.submit(parse_single_pair_aln, alignment, annotations_dir)
            for alignment in alignment_files
        ]
        with tqdm(total=len(futures)) as pbar:
            for future in as_completed(futures):
                try:
                    found, message, payload = future.result()
                except Exception as e:
                    # Best-effort batch: report the failed pair and carry on.
                    # Not followed by custom_write, so the line stays visible.
                    tqdm.write(f"Error: {e}")
                else:
                    if found is True:
                        # Every worker's rows start with the same header row;
                        # keep it from the first result only.
                        all_rows.extend(payload if not all_rows else payload[1:])
                    else:
                        no_homology.append(payload)
                    custom_write(message)
                pbar.update(1)
    return all_rows, no_homology

def custom_write(text):
    """Print a transient status line without disturbing the progress bar.

    Args:
        text: Message to show.

    Returns:
        None.
    """
    # Imported locally because the file's import block does not provide `sys`.
    import sys

    # Use the tqdm.write method to ensure that the progress bar does not get disrupted
    tqdm.write(text)
    # Move the cursor up one line and clear the line
    sys.stdout.write('\033[F\033[K')

def main():
    """Command-line entry point: parse all alignments and write two tables.

    Reads four positional arguments (alignments directory, annotations
    directory, output directory, CPU count), parses every alignment file found
    under the alignments directory in parallel, and writes
    ``ava_homo.tsv`` (pairs with homology) and ``ava_no_homo.tsv`` (pairs
    without) into the output directory, creating it if necessary.
    """
    # Create the parser
    parser = argparse.ArgumentParser(description="A script to run mauve on a directory of assemblies against the B31 reference genome.")
    # Add the arguments
    parser.add_argument('alignments_dir',  type=str, help='The directory containing the alignments to parse')
    parser.add_argument('annotations_dir', type=str, help='The directory containing the annotations (genbanks)')
    parser.add_argument('output_dir',      type=str, help='The directory for outputs')
    parser.add_argument('cpus',            type=int, help='How many cores we rippin')
    # Parse the arguments
    args = parser.parse_args()

    # Create the *output* directory (and any missing parents) if needed.
    os.makedirs(args.output_dir, exist_ok=True)

    alignment_files = get_alignment_files(args.alignments_dir)
    rows, no_homologies = parallel_parse(args.cpus, alignment_files, args.annotations_dir)
    output_alns = f'{args.output_dir}/ava_homo.tsv'
    output_no_aln = f'{args.output_dir}/ava_no_homo.tsv'
    write_rows(rows, output_alns)
    write_rows(no_homologies, output_no_aln)
    print('Finished!')
        

### Put excluded/bad functions here.

In [None]:

# This is a scuffed attempt and should not be implemented. 
#def get_position(length, start, end):
#    # okay so where are we on the contig?
#    if start == 0 and end <= length / 2:
#        position = "LE"
#    elif start <= 200 and end <= length/2:
#        position = "LS"
#    elif start >= 200 and end <= length/2:
#        position = "LS"
#    elif start >= length / 2 and end == length:
#        position = "RE"
#    elif start >= length / 2 and end >= length - 200:
#        position = "RS"
#    elif start >= length / 2 and end <= length - 200:
#        position = "RS"
#    elif start <= length / 2 and end >= length / 2:
#        position = "MID"
#    elif start == 0 and end == length:
#        position = "ENTIRE"
#    elif 200 >= start >= 0 and length-200 >= end >= length:
#        position = "NEARLY"
#    else:
#        position = f"out of bounds??{start}{end}"
#        print(start, end, " !!!!! ", length)
#    return position

#def check_location_of_alignment(length, coords):
#    # coordinate shenanigans ofc
#    positions = []
#    if len(coords) > 1:
#        gap_flag = '_*_'
#        for start, end in coords:
#            if start > end:
#                start, end = end, start
#                rev = '*'
#            else:
#                rev = ''
#            position = f'{rev}{get_position(length, start, end)}{rev}'
#            positions.append(position)
#        location = f'{gap_flag}'.join(positions)
#    else:
#        for start, end in coords:
#            if start > end:
#                start, end = end, start
#                rev = '*'
#            else:
#                rev = ''
#            position = get_position(length, start, end)
#            location = f'{rev}{position}{rev}'
#    return location

#def collapse_coords(coords):
#    # I do not think this is really required...
#    coords.sort(key=lambda x: x[0])
#    collapsed_ranges = []
#    cur_start, cur_end = coords[0]
#    for start, end in coords[1:]:
#        if start <= cur_end + 1:
#            cur_end = max(cur_end, end)
#        else:
#            collapsed_ranges.append((cur_start, cur_end))
#            cur_start, cur_end = start, end
#
#    collapsed_ranges.append((cur_start, cur_end))
#    return collapsed_ranges

## Let's parse our alignments

In [140]:
# Hard-coded, machine-specific locations used for the B31 reference run.
# Directory of mummer alignment outputs.
b31_alignments_dir = '/home/mf019/longread_pangenome/synteny/pgv_mummer_output_2'
# Directory of per-sample GenBank annotations.
annotations_dir = '/home/mf019/longread_pangenome/longread_analysis/paired_assemblies/paired_only/longread/annotation'
# GenBank file of the reference genome.
reference_genbank = '/home/mf019/longread_pangenome/synteny/renamed_GCF_000008685.2.gbff'
# Root directory for the parsed output tables.
output_dir = '/home/mf019/longread_pangenome/synteny/parsing_output'
# Disabled call. NOTE(review): no `run_all` is defined in the visible code;
# `run_all_b31_alns` takes these same four arguments — confirm before re-enabling.
#run_all(b31_alignments_dir, annotations_dir, reference_genbank, output_dir)

parsing alignments for URI87H!
parsing alignments for URI34H!
parsing alignments for URI88H!
parsing alignments for URI33H!
parsing alignments for UCT110H!
parsing alignments for URI39H!
parsing alignments for URI91H!
parsing alignments for UCT35H!
parsing alignments for UWI247P!
parsing alignments for URI120H!
parsing alignments for URI107H!
parsing alignments for UWI263P!
parsing alignments for URI89H!
parsing alignments for URI42H!
parsing alignments for URI44H!
parsing alignments for UCT109H!
parsing alignments for URI40H!
parsing alignments for URI117H!
parsing alignments for URI47H!
parsing alignments for URI86H!
parsing alignments for URI36H!
parsing alignments for UNY208P!
parsing alignments for ESI26H!
parsing alignments for UCT31H!
parsing alignments for URI56H!
parsing alignments for UCT30H!
parsing alignments for URI103H!
parsing alignments for UCT29H!
parsing alignments for URI112H!
parsing alignments for UWI248P!
parsing alignments for UNY203P!
parsing alignments for UCT9

## Let's parse the big all-vs-all alignments I did.

In [13]:
import pandas
import pickle

In [14]:
# Load the all-vs-all summary table and the lookup dictionary used further
# down to translate accession IDs into names.
ava_homo_simple_file = 'ava_wp_db/ava_homo_simple.tsv'
parsing_pkl = '/home/mf019/borrelia_plasmid_classifier_v3/parsing_tables/blast_parsing_dict.pkl'
# SECURITY NOTE: unpickling executes arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
with open(parsing_pkl, 'rb') as in_file:
    # Later cells index this as parsing_dict[<contig id>]['name'].
    parsing_dict = pickle.load(in_file)
ava_homo = pandas.read_csv(ava_homo_simple_file, delimiter='\t')
# Preview the first rows.
ava_homo.head()

Unnamed: 0,contig,contig_len,total_contig_coverage,list_of_alignments(ref:contig_cov:location)
0,CP031405.1,19997,78.63,CP002320.1: 78.63%
1,CP019851.1,16820,4.1,CP074057.1: 4.10%
2,CP001458.1,38893,4.4,CP002315.1: 4.40%
3,CP001458.1,38893,3.15,CP017212.1: 3.15%
4,CP094610.1,29936,1.57,CP002314.1: 1.57%


In [20]:
# Sort by contig ID so all rows of one contig sit together.
ava_sorted = ava_homo.sort_values(by='contig')

In [21]:
# Preview the sorted table.
ava_sorted.head()

Unnamed: 0,contig,contig_len,total_contig_coverage,list_of_alignments(ref:contig_cov:location)
15104,AE000783.1,910724,99.99,CP124092.1: 99.99%
4362,AE000783.1,910724,0.03,CP002325.1: 0.03%
8653,AE000783.1,910724,0.68,CP002306.1: 0.68%
13194,AE000783.1,910724,0.68,CP001273.1: 0.68%
4443,AE000783.1,910724,99.05,CP002228.1: 99.05%


In [22]:
# Translate each contig ID into its name via parsing_dict; an ID missing from
# parsing_dict raises KeyError.
ava_sorted['contig_name'] = ava_sorted['contig'].apply(lambda x: parsing_dict[x]['name'])

In [29]:
# Rewrite each '<ref id>: <coverage>%' cell as '<ref name> : <coverage>%'.
# The f-string is double-quoted so the single-quoted expressions inside it
# parse on every Python 3 version; reusing the enclosing quote character
# inside an f-string is only accepted from Python 3.12 onwards.
ava_sorted['aln_name'] = ava_sorted['list_of_alignments(ref:contig_cov:location)'].apply(
    lambda x: f"{parsing_dict[x.split(':')[0]]['name']} : {x.split(':')[1]}"
)
print(ava_sorted['aln_name'])

15104     chromosome :  99.99%
4362             lp38 :  0.03%
8653           lp28-1 :  0.68%
13194          lp28-1 :  0.68%
4443      chromosome :  99.05%
                 ...          
7854      chromosome :  99.98%
4816      chromosome :  99.98%
11806    chromosome :  100.03%
2223      chromosome :  99.96%
3550          lp28-11 :  0.11%
Name: aln_name, Length: 15209, dtype: object


In [31]:
# Collapse to one row per contig, gathering its alignment labels into a list.
pivot_table = ava_sorted.groupby('contig')['aln_name'].agg(list).reset_index()

In [33]:
# Re-attach the contig name; the groupby above kept only 'contig' and 'aln_name'.
pivot_table['contig_name'] = pivot_table['contig'].apply(lambda x: parsing_dict[x]['name'])

In [35]:
# Put the columns in reading order: ID, name, then the list of alignments.
new_order = ['contig', 'contig_name', 'aln_name']
pivot_table = pivot_table[new_order]

In [37]:
# Save the merged table as TSV.
# NOTE(review): no `index=` is passed, so the row index is presumably written
# as an unnamed first column (the input table shows such an 'Unnamed: 0'
# column) — confirm whether that is wanted.
pivot_table.to_csv('ava_wp_db/ava_homo_simple_merged.tsv', sep='\t')